block: add BDRV_REQ_REGISTERED_BUF request flag

Block drivers may optimize I/O requests accessing buffers previously
registered with bdrv_register_buf(). Checking whether all elements of a
request's QEMUIOVector are within previously registered buffers is
expensive, so we need a hint from the user to avoid costly checks.

Add a BDRV_REQ_REGISTERED_BUF request flag to indicate that all
QEMUIOVector elements in an I/O request are known to be within
previously registered buffers.

Always pass the flag through to driver read/write functions. There is
little harm in passing the flag to a driver that does not use it.
Passing the flag to drivers avoids changes across many block drivers.
Filter drivers would need to explicitly support the flag and pass
through to their children when the children support it. That's a lot of
code changes and it's hard to remember to do that everywhere, leading to
silent reduced performance when the flag is accidentally dropped.

The only problematic scenario with the approach in this patch is when a
driver passes the flag through to internal I/O requests that don't use
the same I/O buffer. In that case the hint may be set when it should
actually be clear. This is a rare case though so the risk is low.

Some drivers have assert(!flags), which no longer works when
BDRV_REQ_REGISTERED_BUF is passed in. These assertions aren't very
useful anyway since the functions are called almost exclusively by
bdrv_driver_preadv/pwritev() so if we get flags handling right there
then the assertion is not needed.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20221013185908.1297568-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2022-10-13 14:59:01 -04:00
parent 98b3ddc78b
commit e8b6535533
16 changed files with 69 additions and 37 deletions

14
block.c
View file

@ -1641,6 +1641,20 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
goto open_failed;
}
assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));
/*
* Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
* drivers that pass read/write requests through to a child the trouble of
* declaring support explicitly.
*
* Drivers must not propagate this flag accidentally when they initiate I/O
* to a bounce buffer. That case should be rare though.
*/
bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
ret = refresh_total_sectors(bs, bs->total_sectors);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");

View file

@ -235,8 +235,8 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
qemu_iovec_init(&raw_qiov, qiov->niov);
qemu_iovec_clone(&raw_qiov, qiov, buf);
ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags,
false);
ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov,
flags & ~BDRV_REQ_REGISTERED_BUF, false);
cmp_offset = qemu_iovec_compare(qiov, &raw_qiov);
if (cmp_offset != -1) {

View file

@ -410,7 +410,6 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block);
uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block);
assert(!flags);
assert(payload_offset < INT64_MAX);
assert(QEMU_IS_ALIGNED(offset, sector_size));
assert(QEMU_IS_ALIGNED(bytes, sector_size));
@ -473,7 +472,8 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block);
uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block);
assert(!(flags & ~BDRV_REQ_FUA));
flags &= ~BDRV_REQ_REGISTERED_BUF;
assert(payload_offset < INT64_MAX);
assert(QEMU_IS_ALIGNED(offset, sector_size));
assert(QEMU_IS_ALIGNED(bytes, sector_size));

View file

@ -2133,7 +2133,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
assert(flags == 0);
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}

View file

@ -1236,7 +1236,6 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
QEMUIOVector *qiov,
int flags)
{
assert(!flags);
return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
}

View file

@ -1130,8 +1130,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
int ret;
bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
assert(!(flags & ~BDRV_REQ_MASK));
assert(!(flags & BDRV_REQ_NO_FALLBACK));
assert(!(flags & ~bs->supported_read_flags));
if (!drv) {
return -ENOMEDIUM;
@ -1195,23 +1194,29 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
BdrvRequestFlags flags)
{
BlockDriver *drv = bs->drv;
bool emulate_fua = false;
int64_t sector_num;
unsigned int nb_sectors;
QEMUIOVector local_qiov;
int ret;
bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
assert(!(flags & ~BDRV_REQ_MASK));
assert(!(flags & BDRV_REQ_NO_FALLBACK));
if (!drv) {
return -ENOMEDIUM;
}
if ((flags & BDRV_REQ_FUA) &&
(~bs->supported_write_flags & BDRV_REQ_FUA)) {
flags &= ~BDRV_REQ_FUA;
emulate_fua = true;
}
flags &= bs->supported_write_flags;
if (drv->bdrv_co_pwritev_part) {
ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
flags);
goto emulate_flags;
}
@ -1221,9 +1226,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
}
if (drv->bdrv_co_pwritev) {
ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
goto emulate_flags;
}
@ -1233,10 +1236,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
.coroutine = qemu_coroutine_self(),
};
acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
flags & bs->supported_write_flags,
acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
bdrv_co_io_em_complete, &co);
flags &= ~bs->supported_write_flags;
if (acb == NULL) {
ret = -EIO;
} else {
@ -1254,12 +1255,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
assert(bytes <= BDRV_REQUEST_MAX_BYTES);
assert(drv->bdrv_co_writev);
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);
emulate_flags:
if (ret == 0 && (flags & BDRV_REQ_FUA)) {
if (ret == 0 && emulate_fua) {
ret = bdrv_co_flush(bs);
}
@ -1487,11 +1486,14 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
align);
/* TODO: We would need a per-BDS .supported_read_flags and
/*
* TODO: We would need a per-BDS .supported_read_flags and
* potential fallback support, if we ever implement any read flags
* to pass through to drivers. For now, there aren't any
* passthrough flags. */
assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
* passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
*/
assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
BDRV_REQ_REGISTERED_BUF)));
/* Handle Copy on Read and associated serialisation */
if (flags & BDRV_REQ_COPY_ON_READ) {
@ -1532,7 +1534,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
goto out;
}
assert(!(flags & ~bs->supported_read_flags));
assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));
max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
if (bytes <= max_bytes && bytes <= max_transfer) {
@ -1721,7 +1723,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
static int bdrv_pad_request(BlockDriverState *bs,
QEMUIOVector **qiov, size_t *qiov_offset,
int64_t *offset, int64_t *bytes,
BdrvRequestPadding *pad, bool *padded)
BdrvRequestPadding *pad, bool *padded,
BdrvRequestFlags *flags)
{
int ret;
@ -1749,6 +1752,10 @@ static int bdrv_pad_request(BlockDriverState *bs,
if (padded) {
*padded = true;
}
if (flags) {
/* Can't use optimization hint with bounce buffer */
*flags &= ~BDRV_REQ_REGISTERED_BUF;
}
return 0;
}
@ -1803,7 +1810,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
}
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
NULL);
NULL, &flags);
if (ret < 0) {
goto fail;
}
@ -1848,6 +1855,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
return -ENOTSUP;
}
/* By definition there is no user buffer so this flag doesn't make sense */
if (flags & BDRV_REQ_REGISTERED_BUF) {
return -EINVAL;
}
/* Invalidate the cached block-status data range if this write overlaps */
bdrv_bsc_invalidate_range(bs, offset, bytes);
@ -2133,6 +2145,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
bool padding;
BdrvRequestPadding pad;
/* This flag doesn't make sense for padding or zero writes */
flags &= ~BDRV_REQ_REGISTERED_BUF;
padding = bdrv_init_padding(bs, offset, bytes, &pad);
if (padding) {
assert(!(flags & BDRV_REQ_NO_WAIT));
@ -2250,7 +2265,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
* alignment only if there is no ZERO flag.
*/
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
&padded);
&padded, &flags);
if (ret < 0) {
return ret;
}

View file

@ -1486,6 +1486,8 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
qemu_iovec_init(&bounce_qiov, 1);
qemu_iovec_add(&bounce_qiov, bounce_buf, bytes);
qiov = &bounce_qiov;
flags &= ~BDRV_REQ_REGISTERED_BUF;
}
ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,

View file

@ -1222,7 +1222,6 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse
};
assert(bytes <= NBD_MAX_BUFFER_SIZE);
assert(!flags);
if (!bytes) {
return 0;

View file

@ -329,7 +329,6 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
QEMUIOVector hd_qiov;
int ret = 0;
assert(!flags);
qemu_iovec_init(&hd_qiov, qiov->niov);
while (nb_sectors > 0) {

View file

@ -628,7 +628,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset,
uint8_t *buf;
void *orig_buf;
assert(!flags);
if (qiov->niov > 1) {
buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
if (buf == NULL) {
@ -725,7 +724,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset,
uint8_t *buf;
void *orig_buf;
assert(!flags);
s->cluster_cache_offset = -1; /* disable compressed cache */
/* We must always copy the iov when encrypting, so we

View file

@ -1395,7 +1395,6 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov, int flags)
{
assert(!flags);
return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
}

View file

@ -258,6 +258,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
qemu_iovec_add(&local_qiov, buf, 512);
qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512);
qiov = &local_qiov;
flags &= ~BDRV_REQ_REGISTERED_BUF;
}
ret = raw_adjust_offset(bs, &offset, bytes, true);

View file

@ -261,7 +261,6 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
int ret;
int64_t n;
assert(!flags);
ret = replication_get_io_status(s);
if (ret < 0) {
goto out;

View file

@ -1196,7 +1196,6 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
BDRVSSHState *s = bs->opaque;
int ret;
assert(!flags);
qemu_co_mutex_lock(&s->lock);
ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE,
nb_sectors * BDRV_SECTOR_SIZE, qiov);

View file

@ -1342,7 +1342,6 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
uint64_t bat_prior_offset = 0;
bool bat_update = false;
assert(!flags);
qemu_iovec_init(&hd_qiov, qiov->niov);
qemu_co_mutex_lock(&s->lock);

View file

@ -80,6 +80,15 @@ typedef enum {
*/
BDRV_REQ_MAY_UNMAP = 0x4,
/*
* An optimization hint when all QEMUIOVector elements are within
* previously registered bdrv_register_buf() memory ranges.
*
* Code that replaces the user's QEMUIOVector elements with bounce buffers
* must take care to clear this flag.
*/
BDRV_REQ_REGISTERED_BUF = 0x8,
BDRV_REQ_FUA = 0x10,
BDRV_REQ_WRITE_COMPRESSED = 0x20,