Avoid emulating FUA when the driver supports it natively. This should
provide better performance than a full flush after the write.

Signed-off-by: Alberto Faria <afaria@redhat.com>
---
hw/scsi/scsi-disk.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 8cf50845ab..ce48e20ee6 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -43,6 +43,7 @@
#include "qemu/cutils.h"
#include "trace.h"
#include "qom/object.h"
+#include "block/block_int-common.h"

#ifdef __linux
#include <scsi/sg.h>
@@ -75,7 +76,7 @@ struct SCSIDiskClass {
*/
DMAIOFunc *dma_readv;
DMAIOFunc *dma_writev;
- bool (*need_fua_emulation)(SCSICommand *cmd);
+ bool (*need_fua)(SCSICommand *cmd);
void (*update_sense)(SCSIRequest *r);
};

@@ -86,6 +87,7 @@ typedef struct SCSIDiskReq {
uint32_t sector_count;
uint32_t buflen;
bool started;
+ bool need_fua;
bool need_fua_emulation;
struct iovec iov;
QEMUIOVector qiov;
@@ -553,7 +555,7 @@ static void scsi_read_data(SCSIRequest *req)

first = !r->started;
r->started = true;
- if (first && r->need_fua_emulation) {
+ if (first && r->need_fua) {
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
BLOCK_ACCT_FLUSH);
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
@@ -2384,7 +2386,9 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
return 0;
}
- r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
+ r->need_fua = sdc->need_fua(&r->req.cmd);
+ r->need_fua_emulation = r->need_fua &&
+ (blk_bs(s->qdev.conf.blk)->supported_write_flags & BDRV_REQ_FUA) == 0;
if (r->sector_count == 0) {
scsi_req_complete(&r->req, GOOD);
}
@@ -3134,7 +3138,8 @@ BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
{
SCSIDiskReq *r = opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+ int flags = r->need_fua && !r->need_fua_emulation ? BDRV_REQ_FUA : 0;
+ return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb, cb_opaque);
}

static char *scsi_property_get_loadparm(Object *obj, Error **errp)
@@ -3183,7 +3188,7 @@ static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data)
device_class_set_legacy_reset(dc, scsi_disk_reset);
sdc->dma_readv = scsi_dma_readv;
sdc->dma_writev = scsi_dma_writev;
- sdc->need_fua_emulation = scsi_is_cmd_fua;
+ sdc->need_fua = scsi_is_cmd_fua;
}

static const TypeInfo scsi_disk_base_info = {
@@ -3335,7 +3340,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
sdc->dma_readv = scsi_block_dma_readv;
sdc->dma_writev = scsi_block_dma_writev;
sdc->update_sense = scsi_block_update_sense;
- sdc->need_fua_emulation = scsi_block_no_fua;
+ sdc->need_fua = scsi_block_no_fua;
dc->desc = "SCSI block device passthrough";
device_class_set_props(dc, scsi_block_properties);
dc->vmsd = &vmstate_scsi_disk_state;
--
2.48.1

On 04.03.2025 at 16:52, Alberto Faria wrote:
> Avoid emulating FUA when the driver supports it natively. This should
> provide better performance than a full flush after the write.
>
> Signed-off-by: Alberto Faria <afaria@redhat.com>

Did you try out if you can see performance improvements in practice?
It's always nice to have numbers in the commit message for patches that
promise performance improvements.

> hw/scsi/scsi-disk.c | 17 +++++++++++------
> 1 file changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 8cf50845ab..ce48e20ee6 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -43,6 +43,7 @@
> #include "qemu/cutils.h"
> #include "trace.h"
> #include "qom/object.h"
> +#include "block/block_int-common.h"
>
> #ifdef __linux
> #include <scsi/sg.h>
> @@ -75,7 +76,7 @@ struct SCSIDiskClass {
> */
> DMAIOFunc *dma_readv;
> DMAIOFunc *dma_writev;
> - bool (*need_fua_emulation)(SCSICommand *cmd);
> + bool (*need_fua)(SCSICommand *cmd);
> void (*update_sense)(SCSIRequest *r);
> };
>
> @@ -86,6 +87,7 @@ typedef struct SCSIDiskReq {
> uint32_t sector_count;
> uint32_t buflen;
> bool started;
> + bool need_fua;
> bool need_fua_emulation;
> struct iovec iov;
> QEMUIOVector qiov;
> @@ -553,7 +555,7 @@ static void scsi_read_data(SCSIRequest *req)
>
> first = !r->started;
> r->started = true;
> - if (first && r->need_fua_emulation) {
> + if (first && r->need_fua) {
> block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
> BLOCK_ACCT_FLUSH);
> r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
> @@ -2384,7 +2386,9 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
> scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
> return 0;
> }
> - r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
> + r->need_fua = sdc->need_fua(&r->req.cmd);
> + r->need_fua_emulation = r->need_fua &&
> + (blk_bs(s->qdev.conf.blk)->supported_write_flags & BDRV_REQ_FUA) == 0;

You can just use BDRV_REQ_FUA unconditionally. If the driver doesn't
support it directly, the block layer already emulates it internally. We
don't have to duplicate this here. If scsi_write_data() does a flush
directly for VERIFY (like scsi_read_data() already does),
scsi_write_do_fua() can go away completely.

However, we can only apply this to write requests. We still need to know
that FUA needs to be emulated for reads. scsi_read_data() issues a flush
for FUA requests and your patch would break it if writes support
BDRV_REQ_FUA.

Kevin
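
For illustration, a minimal sketch of the write path Kevin describes,
assuming the per-request need_fua flag from the patch is kept, the
DMAIOFunc signature stays as shown in the hunk above, and the
surrounding types come from hw/scsi/scsi-disk.c (an illustrative sketch
of the suggestion, not the patch itself):

/*
 * Sketch: pass BDRV_REQ_FUA on writes unconditionally.  The block layer
 * either uses the driver's native FUA support or falls back to an
 * internal flush, so scsi-disk no longer has to track FUA emulation for
 * writes.  Reads have no such flag, so scsi_read_data() keeps its
 * explicit blk_aio_flush().
 */
static BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
                                   BlockCompletionFunc *cb, void *cb_opaque,
                                   void *opaque)
{
    SCSIDiskReq *r = opaque;
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
    BdrvRequestFlags flags = r->need_fua ? BDRV_REQ_FUA : 0;

    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb,
                           cb_opaque);
}

Whether scsi_write_do_fua() can then go away entirely depends on moving
the VERIFY flush into scsi_write_data(), as noted above.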

On 06.03.2025 at 11:33, Kevin Wolf wrote:
> On 04.03.2025 at 16:52, Alberto Faria wrote:
> > Avoid emulating FUA when the driver supports it natively. This should
> > provide better performance than a full flush after the write.
> >
> > Signed-off-by: Alberto Faria <afaria@redhat.com>
>
> Did you try out if you can see performance improvements in practice?
> It's always nice to have numbers in the commit message for patches that
> promise performance improvements.

I was curious enough to see how this and the recent series by Stefan
(virtio-scsi multiqueue) and myself (FUA on the backend + polling
improvements) play out with virtio-scsi, so I just ran some fio
benchmarks with sync=1 myself to compare:

iops bs=4k cache=none           |     virtio-scsi   |     virtio-blk    |
O_SYNC workload                 |  qd 1   |  qd 16  |  qd 1   |  qd 16  |
--------------------------------+---------+---------+---------+---------+
master                          |   21296 |  109747 |   25762 |  130576 |
+ virtio-scsi multiqueue        |   28798 |  121170 |       - |       - |
+ FUA in scsi-disk              |   51893 |  204199 |       - |       - |
--------------------------------+---------+---------+---------+---------+
Total change                    | +143.7% |  +86.1% |       - |       - |

(No new numbers for virtio-blk because virtio-scsi patches obviously
don't change anything about it. Also no numbers for FUA in file-posix
because it's unused with cache=none.)

iops bs=4k cache=directsync     |     virtio-scsi   |     virtio-blk    |
O_SYNC workload                 |  qd 1   |  qd 16  |  qd 1   |  qd 16  |
--------------------------------+---------+---------+---------+---------+
master                          |   32223 |  109748 |   45583 |  258416 |
+ FUA in file-posix + polling   |   32148 |  198665 |   58601 |  320190 |
+ virtio-scsi multiqueue        |   51739 |  225031 |       - |       - |
+ FUA in scsi-disk              |   56061 |  227535 |       - |       - |
--------------------------------+---------+---------+---------+---------+
Total change                    |  +74.0% | +107.3% |  +28.6% |  +23.9% |

Of course, the huge improvements on the virtio-scsi side only show how
bad it was before. In most numbers it is still behind virtio-blk even
after all three patch series (apart from cache=none where the
availability of FUA on the device side makes a big difference, and I
expect that virtio-blk will improve similarly once we implement it
there).

Also note that when testing the virtio-scsi multiqueue patches, this
was still a single iothread, i.e. I wasn't even making use of the new
feature per se. I assume much of this comes from enabling polling
because the series moved the event queue handling to the main loop,
which prevented polling for virtio-scsi before. The series also got rid
of an extra coroutine per request for the blk_is_available() call in
virtio_scsi_ctx_check(), which might play a role, too.

Anyway, I like these numbers for FUA in scsi-disk. It makes write back
cache modes almost catch up to write through with O_SYNC workloads. We
should definitely get this merged and do the same for virtio-blk.

Kevin

On Tue, Mar 25, 2025 at 01:48:35PM +0100, Kevin Wolf wrote:
[...]
> Anyway, I like these numbers for FUA in scsi-disk. It makes write back
> cache modes almost catch up to write through with O_SYNC workloads. We
> should definitely get this merged and do the same for virtio-blk.

Thanks for sharing! Nice IOPS improvements across the board.

Stefan