Enable the use of the mapped-ram migration feature with savevm/loadvm
snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
positioned I/O capabilities that don't modify the channel's position
pointer.
Signed-off-by: Marco Cavenati <Marco.Cavenati@eurecom.fr>
---
Hello,
Please note that this depends on my previous fix [0] (which has already
been reviewed) in order to work.
The code in this patch is inspired by commit
0478b030fa2530cbbfc4d6432e8e39a16d06865b that adds the same feature to
QIOChannelFile.
Thank you,
Regards
Marco
[0] [PATCH] migration: fix SEEK_CUR offset calculation in
qio_channel_block_seek https://lore.kernel.org/all/20250326162230.3323199-1-Marco.Cavenati@eurecom.fr/t/#u
---
migration/channel-block.c | 48 +++++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/migration/channel-block.c b/migration/channel-block.c
index fff8d87094..741cf6f31b 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -30,6 +30,7 @@ qio_channel_block_new(BlockDriverState *bs)
     QIOChannelBlock *ioc;
 
     ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK));
+    qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
 
     bdrv_ref(bs);
     ioc->bs = bs;
@@ -96,6 +97,49 @@ qio_channel_block_writev(QIOChannel *ioc,
     return qiov.size;
 }
 
+#ifdef CONFIG_PREADV
+static ssize_t
+qio_channel_block_preadv(QIOChannel *ioc,
+                         const struct iovec *iov,
+                         size_t niov,
+                         off_t offset,
+                         Error **errp)
+{
+    QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+    QEMUIOVector qiov;
+    int ret;
+
+    qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
+    ret = bdrv_readv_vmstate(bioc->bs, &qiov, offset);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "bdrv_readv_vmstate failed");
+        return -1;
+    }
+
+    return qiov.size;
+}
+
+static ssize_t
+qio_channel_block_pwritev(QIOChannel *ioc,
+                          const struct iovec *iov,
+                          size_t niov,
+                          off_t offset,
+                          Error **errp)
+{
+    QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+    QEMUIOVector qiov;
+    int ret;
+
+    qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
+    ret = bdrv_writev_vmstate(bioc->bs, &qiov, offset);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "bdrv_writev_vmstate failed");
+        return -1;
+    }
+
+    return qiov.size;
+}
+#endif /* CONFIG_PREADV */
 
 static int
 qio_channel_block_set_blocking(QIOChannel *ioc,
@@ -177,6 +221,10 @@ qio_channel_block_class_init(ObjectClass *klass,
     ioc_klass->io_writev = qio_channel_block_writev;
     ioc_klass->io_readv = qio_channel_block_readv;
     ioc_klass->io_set_blocking = qio_channel_block_set_blocking;
+#ifdef CONFIG_PREADV
+    ioc_klass->io_preadv = qio_channel_block_preadv;
+    ioc_klass->io_pwritev = qio_channel_block_pwritev;
+#endif
     ioc_klass->io_seek = qio_channel_block_seek;
     ioc_klass->io_close = qio_channel_block_close;
     ioc_klass->io_set_aio_fd_handler = qio_channel_block_set_aio_fd_handler;
--
2.43.0
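For context, here is a rough sketch of how a seekable consumer (such as the
mapped-ram code) could drive the new methods. It is only an illustration built
on the qio_channel_has_feature()/qio_channel_pwritev() calls quoted later in
this thread; the header path and the helper name are assumptions, not part of
the patch.

/* Illustrative only, not part of the patch.  Assumes the QIOChannelBlock
 * constructor is declared in migration/channel-block.h. */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "io/channel.h"
#include "migration/channel-block.h"

static ssize_t vmstate_pwritev(BlockDriverState *bs, const struct iovec *iov,
                               size_t niov, off_t offset, Error **errp)
{
    QIOChannel *ioc = QIO_CHANNEL(qio_channel_block_new(bs));
    ssize_t ret;

    /* Callers are expected to check the feature before positioned I/O. */
    if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
        error_setg(errp, "block channel is not seekable");
        object_unref(OBJECT(ioc));
        return -1;
    }

    /* Positioned write: the channel's position pointer is left untouched. */
    ret = qio_channel_pwritev(ioc, iov, niov, offset, errp);
    object_unref(OBJECT(ioc));
    return ret;
}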
Marco Cavenati <Marco.Cavenati@eurecom.fr> writes:
> Enable the use of the mapped-ram migration feature with savevm/loadvm
> snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
> QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
> positioned I/O capabilities that don't modify the channel's position
> pointer.
We'll need to add the infrastructure to reject multifd and direct-io
before this. The rest of the capabilities should not affect mapped-ram,
so it's fine (for now) if we don't honor them.
What about zero page handling? Mapped-ram doesn't send zero pages
because the file will always have zeroes in it and the migration
destination is guaranteed to not have been running previously. I believe
loading a snapshot in a VM that's already been running would leave stale
data in the guest's memory.
>
> Signed-off-by: Marco Cavenati <Marco.Cavenati@eurecom.fr>
> ---
> Hello,
> Please note that this depends on my previous fix [0] (which has already
> been reviewed) in order to work.
>
> The code in this patch is inspired by commit
> 0478b030fa2530cbbfc4d6432e8e39a16d06865b that adds the same feature to
> QIOChannelFile.
>
> Thank you,
> Regards
> Marco
>
> [0] [PATCH] migration: fix SEEK_CUR offset calculation in
> qio_channel_block_seek https://lore.kernel.org/all/20250326162230.3323199-1-Marco.Cavenati@eurecom.fr/t/#u
> ---
> migration/channel-block.c | 48 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 48 insertions(+)
>
> diff --git a/migration/channel-block.c b/migration/channel-block.c
> index fff8d87094..741cf6f31b 100644
> --- a/migration/channel-block.c
> +++ b/migration/channel-block.c
> @@ -30,6 +30,7 @@ qio_channel_block_new(BlockDriverState *bs)
> QIOChannelBlock *ioc;
>
> ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK));
> + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
>
> bdrv_ref(bs);
> ioc->bs = bs;
> @@ -96,6 +97,49 @@ qio_channel_block_writev(QIOChannel *ioc,
> return qiov.size;
> }
>
> +#ifdef CONFIG_PREADV
> +static ssize_t
> +qio_channel_block_preadv(QIOChannel *ioc,
> + const struct iovec *iov,
> + size_t niov,
> + off_t offset,
> + Error **errp)
> +{
> + QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
> + QEMUIOVector qiov;
> + int ret;
> +
> + qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
> + ret = bdrv_readv_vmstate(bioc->bs, &qiov, offset);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "bdrv_readv_vmstate failed");
> + return -1;
> + }
> +
> + return qiov.size;
> +}
> +
> +static ssize_t
> +qio_channel_block_pwritev(QIOChannel *ioc,
> + const struct iovec *iov,
> + size_t niov,
> + off_t offset,
> + Error **errp)
> +{
> + QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
> + QEMUIOVector qiov;
> + int ret;
> +
> + qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
> + ret = bdrv_writev_vmstate(bioc->bs, &qiov, offset);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "bdrv_writev_vmstate failed");
> + return -1;
> + }
> +
> + return qiov.size;
> +}
> +#endif /* CONFIG_PREADV */
>
> static int
> qio_channel_block_set_blocking(QIOChannel *ioc,
> @@ -177,6 +221,10 @@ qio_channel_block_class_init(ObjectClass *klass,
> ioc_klass->io_writev = qio_channel_block_writev;
> ioc_klass->io_readv = qio_channel_block_readv;
> ioc_klass->io_set_blocking = qio_channel_block_set_blocking;
> +#ifdef CONFIG_PREADV
> + ioc_klass->io_preadv = qio_channel_block_preadv;
> + ioc_klass->io_pwritev = qio_channel_block_pwritev;
> +#endif
> ioc_klass->io_seek = qio_channel_block_seek;
> ioc_klass->io_close = qio_channel_block_close;
> ioc_klass->io_set_aio_fd_handler = qio_channel_block_set_aio_fd_handler;
Hello Fabiano,
On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote:
> Marco Cavenati <Marco.Cavenati@eurecom.fr> writes:
>
> > Enable the use of the mapped-ram migration feature with savevm/loadvm
> > snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
> > QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
> > positioned I/O capabilities that don't modify the channel's position
> > pointer.
>
> We'll need to add the infrastructure to reject multifd and direct-io
> before this. The rest of the capabilities should not affect mapped-ram,
> so it's fine (for now) if we don't honor them.
Do you have any status update on this infrastructure you mentioned?
> What about zero page handling? Mapped-ram doesn't send zero pages
> because the file will always have zeroes in it and the migration
> destination is guaranteed to not have been running previously. I believe
> loading a snapshot in a VM that's already been running would leave stale
> data in the guest's memory.
About the zero handling I'd like to hear your opinion about this idea I
proposed a while back:
The scenarios where zeroing is not required (incoming migration and
-loadvm) share a common characteristic: the VM has not yet run in the
current QEMU process.
To avoid splitting read_ramblock_mapped_ram(), could we implement
a check to determine if the VM has ever run and decide whether to zero
the memory based on that? Maybe using RunState?
Then we can add something like this to read_ramblock_mapped_ram()
...
clear_bit_idx = 0;
for (...) {
    // Zero pages
    if (guest_has_ever_run()) {
        unread = TARGET_PAGE_SIZE * (set_bit_idx - clear_bit_idx);
        offset = clear_bit_idx << TARGET_PAGE_BITS;
        host = host_from_ram_block_offset(block, offset);
        if (!host) {...}
        ram_handle_zero(host, unread);
    }
    // Non-zero pages
    clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
    ...
(Plus trailing zero pages handling)
Thank you :)
Marco
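For illustration, here is a standalone model of the loop sketched above,
operating on plain buffers rather than RAMBlocks. guest_has_ever_run() and all
other names are hypothetical, and this is not the actual
read_ramblock_mapped_ram() code.

/* Standalone model of the idea above; not QEMU code.  Set bits mark pages
 * present in the "file", clear bits mark zero pages that only need to be
 * cleared when the guest may already have dirtied its RAM. */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096

static bool test_bit_(const unsigned char *bmap, size_t i)
{
    return bmap[i / 8] & (1u << (i % 8));
}

static size_t next_bit_with_val(const unsigned char *bmap, size_t num_pages,
                                size_t start, bool val)
{
    while (start < num_pages && test_bit_(bmap, start) != val) {
        start++;
    }
    return start;
}

/* Hypothetical: true when RAM may contain stale data (VM has run before). */
static bool guest_has_ever_run(void) { return true; }

static void load_block(unsigned char *ram, const unsigned char *file,
                       const unsigned char *bmap, size_t num_pages)
{
    size_t clear_idx = 0;

    while (clear_idx < num_pages) {
        /* [clear_idx, set_idx) is a run of zero pages. */
        size_t set_idx = next_bit_with_val(bmap, num_pages, clear_idx, true);
        if (guest_has_ever_run() && set_idx > clear_idx) {
            memset(ram + clear_idx * PAGE_SIZE, 0,
                   (set_idx - clear_idx) * PAGE_SIZE);
        }
        if (set_idx == num_pages) {
            break;      /* trailing zero pages handled above */
        }
        /* [set_idx, clear_idx) is a run of pages present in the file. */
        clear_idx = next_bit_with_val(bmap, num_pages, set_idx, false);
        memcpy(ram + set_idx * PAGE_SIZE, file + set_idx * PAGE_SIZE,
               (clear_idx - set_idx) * PAGE_SIZE);
    }
}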
"Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes:
> Hello Fabiano,
>
> On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote:
>
>> Marco Cavenati <Marco.Cavenati@eurecom.fr> writes:
>>
>> > Enable the use of the mapped-ram migration feature with savevm/loadvm
>> > snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
>> > QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
>> > positioned I/O capabilities that don't modify the channel's position
>> > pointer.
>>
>> We'll need to add the infrastructure to reject multifd and direct-io
>> before this. The rest of the capabilities should not affect mapped-ram,
>> so it's fine (for now) if we don't honor them.
>
> Do you have any status update on this infrastructure you mentioned?
>
I'm doing the work suggested by Daniel of passing migration
configuration options via the commands themselves. When that is ready we
can include savevm and have it only accept mapped-ram and clear all
other options.
But don't worry about that, propose your changes and I'll make sure to
have *something* ready before it merges. I don't see an issue with
merging this single patch, for instance:
https://lore.kernel.org/r/20250327143934.7935-2-farosas@suse.de
>> What about zero page handling? Mapped-ram doesn't send zero pages
>> because the file will always have zeroes in it and the migration
>> destination is guaranteed to not have been running previously. I believe
>> loading a snapshot in a VM that's already been running would leave stale
>> data in the guest's memory.
>
> About the zero handling I'd like to hear your opinion about this idea I
> proposed a while back:
> The scenarios where zeroing is not required (incoming migration and
> -loadvm) share a common characteristic: the VM has not yet run in the
> current QEMU process.
> To avoid splitting read_ramblock_mapped_ram(), could we implement
> a check to determine if the VM has ever run and decide whether to zero
> the memory based on that? Maybe using RunState?
>
We could just as well add some flag to load_snapshot() since we know
which invocations are guaranteed to happen with clean memory.
But if you can use existing code for that it would be great. Adding a
global guest_has_ever_run flag, not so much. What's the MachineInitPhase
before -loadvm?
> Then we can add something like this to read_ramblock_mapped_ram()
> ...
> clear_bit_idx = 0;
> for (...) {
> // Zero pages
> if (guest_has_ever_run()) {
> unread = TARGET_PAGE_SIZE * (set_bit_idx - clear_bit_idx);
> offset = clear_bit_idx << TARGET_PAGE_BITS;
> host = host_from_ram_block_offset(block, offset);
> if (!host) {...}
> ram_handle_zero(host, unread);
> }
> // Non-zero pages
> clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
> ...
> (Plus trailing zero pages handling)
>
The suggestion of splitting read_ramblock_mapped_ram() was to spare you
from having to touch this code. =) But since you already know how to do
it, then yes, let's add the change inline.
> Thank you :)
> Marco
On Friday, September 19, 2025 23:24 CEST, Fabiano Rosas <farosas@suse.de> wrote:

> "Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes:
>
> > Hello Fabiano,
> >
> > On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote:
> >
> >> Marco Cavenati <Marco.Cavenati@eurecom.fr> writes:
> >>
> >> > Enable the use of the mapped-ram migration feature with savevm/loadvm
> >> > snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
> >> > QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
> >> > positioned I/O capabilities that don't modify the channel's position
> >> > pointer.
> >>
> >> We'll need to add the infrastructure to reject multifd and direct-io
> >> before this. The rest of the capabilities should not affect mapped-ram,
> >> so it's fine (for now) if we don't honor them.
> >
> > Do you have any status update on this infrastructure you mentioned?
> >
>
> I'm doing the work suggested by Daniel of passing migration
> configuration options via the commands themselves. When that is ready we
> can include savevm and have it only accept mapped-ram and clear all
> other options.
>
> But don't worry about that, propose your changes and I'll make sure to
> have *something* ready before it merges. I don't see an issue with
> merging this single patch, for instance:
> https://lore.kernel.org/r/20250327143934.7935-2-farosas@suse.de

Perfect!

> >> What about zero page handling? Mapped-ram doesn't send zero pages
> >> because the file will always have zeroes in it and the migration
> >> destination is guaranteed to not have been running previously. I believe
> >> loading a snapshot in a VM that's already been running would leave stale
> >> data in the guest's memory.
> >
> > About the zero handling I'd like to hear your opinion about this idea I
> > proposed a while back:
> > The scenarios where zeroing is not required (incoming migration and
> > -loadvm) share a common characteristic: the VM has not yet run in the
> > current QEMU process.
> > To avoid splitting read_ramblock_mapped_ram(), could we implement
> > a check to determine if the VM has ever run and decide whether to zero
> > the memory based on that? Maybe using RunState?
> >
>
> We could just as well add some flag to load_snapshot() since we know
> which invocations are guaranteed to happen with clean memory.
>
> But if you can use existing code for that it would be great. Adding a
> global guest_has_ever_run flag, not so much. What's the MachineInitPhase
> before -loadvm?

MachineInitPhase is set to PHASE_MACHINE_READY during ram_load() for
both -loadvm and HMP loadvm, so unfortunately that isn’t an option.

RunState during ram_load() is
- RUN_STATE_INMIGRATE for -incoming,
- RUN_STATE_PRELAUNCH for -loadvm
- RUN_STATE_RESTORE_VM for HMP loadvm.
But I’m not sure how reliable (or unreliable) it would be to depend on this
to infer that RAM is zero.

As for using a flag, I don’t see an obvious way to pass one down through
load_snapshot -> qemu_loadvm_state -> ... -> read_ramblock_mapped_ram.
Do you already have something in mind?

Thank you
Marco
"Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes: > On Friday, September 19, 2025 23:24 CEST, Fabiano Rosas <farosas@suse.de> wrote: > >> "Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes: >> >> > Hello Fabiano, >> > >> > On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote: >> > >> >> Marco Cavenati <Marco.Cavenati@eurecom.fr> writes: >> >> >> >> > Enable the use of the mapped-ram migration feature with savevm/loadvm >> >> > snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to >> >> > QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide >> >> > positioned I/O capabilities that don't modify the channel's position >> >> > pointer. >> >> >> >> We'll need to add the infrastructure to reject multifd and direct-io >> >> before this. The rest of the capabilities should not affect mapped-ram, >> >> so it's fine (for now) if we don't honor them. >> > >> > Do you have any status update on this infrastructure you mentioned? >> > >> >> I'm doing the work suggested by Daniel of passing migration >> configuration options via the commands themselves. When that is ready we >> can include savevm and have it only accept mapped-ram and clear all >> other options. >> >> But don't worry about that, propose your changes and I'll make sure to >> have *something* ready before it merges. I don't see an issue with >> merging this single patch, for instance: >> https://lore.kernel.org/r/20250327143934.7935-2-farosas@suse.de > > Perfect! > >> >> What about zero page handling? Mapped-ram doesn't send zero pages >> >> because the file will always have zeroes in it and the migration >> >> destination is guaranteed to not have been running previously. I believe >> >> loading a snapshot in a VM that's already been running would leave stale >> >> data in the guest's memory. >> > >> > About the zero handling I'd like to hear your opinion about this idea I >> > proposed a while back: >> > The scenarios where zeroing is not required (incoming migration and >> > -loadvm) share a common characteristic: the VM has not yet run in the >> > current QEMU process. >> > To avoid splitting read_ramblock_mapped_ram(), could we implement >> > a check to determine if the VM has ever run and decide whether to zero >> > the memory based on that? Maybe using RunState? >> > >> >> We could just as well add some flag to load_snapshot() since we know >> which invocations are guaranteed to happen with clean memory. >> >> But if you can use existing code for that it would be great. Adding a >> global guest_has_ever_run flag, not so much. What's the MachineInitPhase >> before -loadvm? > > MachineInitPhase is set to PHASE_MACHINE_READY during ram_load() for > both -loadvm and HMP loadvm, so unfortunately that isn’t an option. > > RunState during ram_load() is > - RUN_STATE_INMIGRATE for -incoming, > - RUN_STATE_PRELAUNCH for -loadvm > - RUN_STATE_RESTORE_VM for HMP loadvm. > But I’m not sure how reliable (or unreliable) it would be to depend on this > to infer that RAM is zero. > I wouldn't mind using INMIGRATE and PRELAUNCH, seeing how INMIGRATE is used indiscriminately around the codebase. And PRELAUNCH is actually descriptive. Maybe it's best if you pick an easy method, such as this one, and we'll iterate on it once you send a patch. > As for using a flag, I don’t see an obvious way to pass one down through > load_snapshot -> qemu_loadvm_state -> ... -> read_ramblock_mapped_ram. > Do you already have something in mind? > MigrateIncomingState > Thank you > Marco
On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote:

> We'll need to add the infrastructure to reject multifd and direct-io
> before this. The rest of the capabilities should not affect mapped-ram,
> so it's fine (for now) if we don't honor them.

Ok, thanks for the update.

> What about zero page handling? Mapped-ram doesn't send zero pages
> because the file will always have zeroes in it and the migration
> destination is guaranteed to not have been running previously. I believe
> loading a snapshot in a VM that's already been running would leave stale
> data in the guest's memory.

Yes, you are correct.

About the `RAMBlock->file_bmap`, according to the code it is a:
`/* bitmap of pages present in the migration file */`
And, if a page is a zero page, it won't be in the migration file:
`/* zero pages are not transferred with mapped-ram */`
So, zero page implies bitmap 0.
Does the opposite hold?

If bitmap 0 implies zero page, we could call `ram_handle_zero`
in `read_ramblock_mapped_ram` for the clear bits.
Or do you fear this might be unnecessarily expensive for migration?

If bitmap 0 does not imply zero page, I feel like the
"is present in the migration file" and "is zero page" info should
be better separated.

Best,
Marco
"Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes: > On Thursday, April 10, 2025 21:52 CEST, Fabiano Rosas <farosas@suse.de> wrote: > >> We'll need to add the infrastructure to reject multifd and direct-io >> before this. The rest of the capabilities should not affect mapped-ram, >> so it's fine (for now) if we don't honor them. > > Ok, thanks for the update. > >> What about zero page handling? Mapped-ram doesn't send zero pages >> because the file will always have zeroes in it and the migration >> destination is guaranteed to not have been running previously. I believe >> loading a snapshot in a VM that's already been running would leave stale >> data in the guest's memory. > > Yes, you are correct. > > About the `RAMBlock->file_bmap`, according to the code it is a: > `/* bitmap of pages present in the migration file */` > And, if a pages is a zero page, it won't be in the migration file: > `/* zero pages are not transferred with mapped-ram */` > So, zero page implies bitmap 0. > Does the opposite hold? > It does. Mapped-ram takes up (sparse) disk space equal to the guest's ram size. > If bitmap 0 implies zero page, we could call `ram_handle_zero` > in `read_ramblock_mapped_ram` for the clear bits. > Or do you fear this might be unnecessary expensive for migration? > Yes, unfortunately the peformance difference is noticeable. But we could have a slightly different algorithm for savevm. At this point it might be easier to just duplicate read_ramblock_mapped_ram(), check for savevm in there and see what that the resulting code looks like. By the way, what's your overall goal with enabling the feature? Do you intent to enable further capabilities for snapshot? Specifically multifd. I belive the zero page skip is responsible for most of the performance gains for mapped-ram without direct-io and multifd. The benefit of bounded stream size doesn't apply to snapshots because they're not live. It would be interesting to gather some numbers for the perf difference between mapped-ram=on vs off. > If bitmap 0 does not imply zero page, I feel like the > "is present in the migration file" and "is zero page" info should > be better separated. > > Best, > Marco
On Friday, April 11, 2025 14:24 CEST, Fabiano Rosas <farosas@suse.de> wrote:

> > If bitmap 0 implies zero page, we could call `ram_handle_zero`
> > in `read_ramblock_mapped_ram` for the clear bits.
> > Or do you fear this might be unnecessarily expensive for migration?
>
> Yes, unfortunately the performance difference is noticeable. But we could
> have a slightly different algorithm for savevm. At this point it might
> be easier to just duplicate read_ramblock_mapped_ram(), check for savevm
> in there and see what the resulting code looks like.

I tried to get some numbers in a "bad case" scenario restoring a
clean, fully booted, idle Debian VM with 4GB of ram. The zero pages
are ~90%. I'm using a nvme ssd to store the snapshot and I repeated
the restore 10 times with and without zeroing (`ram_handle_zero`).
The restore takes on average +25% of time.
(It's not a broad nor deep investigation.)

So, I see your point on performance, but I'm not fully comfortable
with the difference in zero page handling between mapped-ram on
and mapped-ram off. In the former case zero pages are skipped, while
in the latter they are explicitly zeroed.
Enabling mapped-ram has the implicit effect to also skip zero-pages.
I think it is an optimization not really bound to mapped-ram and it
could be better to have this feature external to mapped-ram, enabled
when the destination ram is known to be already zeroed (also for
mapped-ram off ideally).

> By the way, what's your overall goal with enabling the feature? Do you
> intend to enable further capabilities for snapshot? Specifically
> multifd. I believe the zero page skip is responsible for most of the
> performance gains for mapped-ram without direct-io and multifd. The
> benefit of bounded stream size doesn't apply to snapshots because
> they're not live.

My overall goal is a hot-loadvm feature that currently lives in a fork
downstream and has a long way before getting in a mergeable state :)
In a nutshell, I'm using dirty page tracking to load from the snapshot
only the pages that have been dirtied between two loadvm;
mapped-ram is required to seek and read only the dirtied pages.
About the other capabilities, I still have to understand if they might
help in my use case.

> It would be interesting to gather some numbers for the perf difference
> between mapped-ram=on vs off.

Repeating the same experiment as above, without mapped-ram, I obtain
+48% in restore time compared to mapped-ram and, therefore, a +18%
wrt to the mapped-ram with zeroing.
(It should be noted that mapped-ram without zeroing leaves the
restored vm in an inconsistent state).
At the moment I don't have numbers regarding savevm.

Thanks!
Best,
Marco
"Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes: > On Friday, April 11, 2025 14:24 CEST, Fabiano Rosas <farosas@suse.de> wrote: > >> > If bitmap 0 implies zero page, we could call `ram_handle_zero` >> > in `read_ramblock_mapped_ram` for the clear bits. >> > Or do you fear this might be unnecessary expensive for migration? >> >> Yes, unfortunately the peformance difference is noticeable. But we could >> have a slightly different algorithm for savevm. At this point it might >> be easier to just duplicate read_ramblock_mapped_ram(), check for savevm >> in there and see what that the resulting code looks like. > > I tried to get some numbers in a "bad case" scenario restoring a > clean, fully booted, idle Debian VM with 4GB of ram. The zero pages > are ~90%. I'm using a nvme ssd to store the snapshot and I repeated > the restore 10 times with and without zeroing (`ram_handle_zero`). > The restore takes on average +25% of time. > (It's not a broad nor deep investigation.) > > So, I see your point on performance, but I'm not fully comfortable > with the difference in zero page handling between mapped-ram on > and mapped-ram off. In the former case zero pages are skipped, while > in the latter they are explicitly zeroed. > Enabling mapped-ram has the implicit effect to also skip zero-pages. > I think it is an optimization not really bound to mapped-ram and it > could be better to have this feature external to mapped-ram, enabled > when the destination ram is known to be already zeroed (also for > mapped-ram off ideally). > From a design perspective that makes sense. The only use case currently would be mapped-ram _migration_ (as opposed to snapshot) because non-mapped-ram migration is usually done live. The stream of pages will potentially have several versions of the same page, therefore we need to clear it when it's a zero page. We'd benefit from a separate "don't load zero pages" option only when the VM is guaranteed to be stopped and more importantly, not allowed to be unstopped. This is the tricky part. We have experimented with and dropped the idea of detecting a stopped-vm-migration for mapped-ram (the idea back then was not to do better zero page handling, but skip dirty tracking altogether). Since we're dealing with snapshots here, which are asynchronous, I'm wondering wheter it would make sense to extend the zero-page-detection option to the destination. Then we could bind the loadvm process to zero-page-detection=none because I don't think we can safely ignore them anyway. >> By the way, what's your overall goal with enabling the feature? Do you >> intent to enable further capabilities for snapshot? Specifically >> multifd. I belive the zero page skip is responsible for most of the >> performance gains for mapped-ram without direct-io and multifd. The >> benefit of bounded stream size doesn't apply to snapshots because >> they're not live. > > My overall goal is a hot-loadvm feature that currently lives in a fork > downstream and has a long way before getting in a mergeable state :) Cool. Don't hesitate to send stuff our way, the sooner and more often we discuss this, the better the chances of getting it merged upstream. Do you use libvirt at all? Mapped-ram support has been added to it in the latest version. The original use-case for mapped-ram was quick snapshot saving and loading after all. Libvirt does it in a way that does not use savevm, which might be helpful. 
> In a nutshell, I'm using dirty page tracking to load from the snapshot > only the pages that have been dirtied between two loadvm; > mapped-ram is required to seek and read only the dirtied pages. But then you'd have a bitmap of pages you could correlate with the file_bmap and force-load whatever pages you need. It doesn't seem like zero page handling would affect you too much in that case.
On Tuesday, April 15, 2025 15:50 CEST, Fabiano Rosas <farosas@suse.de> wrote:

> > So, I see your point on performance, but I'm not fully comfortable
> > with the difference in zero page handling between mapped-ram on
> > and mapped-ram off. In the former case zero pages are skipped, while
> > in the latter they are explicitly zeroed.
> > Enabling mapped-ram has the implicit effect to also skip zero-pages.
> > I think it is an optimization not really bound to mapped-ram and it
> > could be better to have this feature external to mapped-ram, enabled
> > when the destination ram is known to be already zeroed (also for
> > mapped-ram off ideally).
>
> From a design perspective that makes sense. The only use case currently
> would be mapped-ram _migration_ (as opposed to snapshot) because
> non-mapped-ram migration is usually done live. The stream of pages will
> potentially have several versions of the same page, therefore we need to
> clear it when it's a zero page.

It might be a niche use case (and maybe I'm the only one still using
plain old QEMU snapshots) but, also the cli option `-loadvm file` might
benefit from skipping 0 pages: snapshots are taken with the VM stopped
and the `-loadvm file` acts on a clean ram.

> We'd benefit from a separate "don't load zero pages" option only when
> the VM is guaranteed to be stopped and more importantly, not allowed to
> be unstopped. This is the tricky part. We have experimented with and
> dropped the idea of detecting a stopped-vm-migration for mapped-ram (the
> idea back then was not to do better zero page handling, but skip dirty
> tracking altogether).
>
> Since we're dealing with snapshots here, which are asynchronous, I'm
> wondering whether it would make sense to extend the zero-page-detection
> option to the destination. Then we could bind the loadvm process to
> zero-page-detection=none because I don't think we can safely ignore them
> anyway.

> > My overall goal is a hot-loadvm feature that currently lives in a fork
> > downstream and has a long way before getting in a mergeable state :)
>
> Cool. Don't hesitate to send stuff our way, the sooner and more often we
> discuss this, the better the chances of getting it merged upstream.
>
> Do you use libvirt at all? Mapped-ram support has been added to it in
> the latest version. The original use-case for mapped-ram was quick
> snapshot saving and loading after all. Libvirt does it in a way that
> does not use savevm, which might be helpful.

No, I don't use libvirt. Thanks for the hint, but in that case I guess
libvirt would spawn a new "QEMU -incoming" for each restore, and
that would be too expensive.

> > In a nutshell, I'm using dirty page tracking to load from the snapshot
> > only the pages that have been dirtied between two loadvm;
> > mapped-ram is required to seek and read only the dirtied pages.
>
> But then you'd have a bitmap of pages you could correlate with the
> file_bmap and force-load whatever pages you need. It doesn't seem like
> zero page handling would affect you too much in that case.

Perhaps I'm missing your point; if a page was zero in the snapshot
and then gets dirtied, I need to zero it out. But yeah, the code will be
different and for my specific use case I don't absolutely need
mapped-ram snapshots restore to fully work.

Thank you.
Best,
Marco
"Marco Cavenati" <Marco.Cavenati@eurecom.fr> writes: > On Tuesday, April 15, 2025 15:50 CEST, Fabiano Rosas <farosas@suse.de> wrote: > >> > So, I see your point on performance, but I'm not fully comfortable >> > with the difference in zero page handling between mapped-ram on >> > and mapped-ram off. In the former case zero pages are skipped, while >> > in the latter they are explicitly zeroed. >> > Enabling mapped-ram has the implicit effect to also skip zero-pages. >> > I think it is an optimization not really bound to mapped-ram and it >> > could be better to have this feature external to mapped-ram, enabled >> > when the destination ram is known to be already zeroed (also for >> > mapped-ram off ideally). >> >> From a design perspective that makes sense. The only use case currently >> would be mapped-ram _migration_ (as opposed to snapshot) because >> non-mapped-ram migration is usually done live. The stream of pages will >> potentially have several versions of the same page, therefore we need to >> clear it when it's a zero page. > > It might be a niche use case (and maybe I'm the only one still using > plain old QEMU snapshots) but, also the cli option `-loadvm file` might > benefit from skipping 0 pages: snapshots are taken with the VM stopped > and the `-loadvm file` acts on a clean ram. > Indeed, I had forgotten about that usage. So that's a case where mapped-ram does the right thing. Now I can forget about it again =) For enabling mapped-ram generically at this point I think we don't have much choice but to introduce a new version of read_ramblock_mapped_ram() that writes zero pages. It would need to discriminate the zero pages, (instead of doing a big copy of a bunch of pages) so we could avoid copying a page that's already zero. In fact, if we do that, it would already work without further changes. The performance of -loadvm would leave something to be improved, but we could tackle that later. What do you think? If we decide we need to explicitly select that new code, I don't think we need any new capability, because the user has no choice in it. We know that loadvm needs it, but -loadvm doesn't, any other sort of mapped-ram migration also doesn't need it. There is some discussion to be had on whether we want to bind it to the commands themselves, or do some form of detection of clean ram. I think it's best we postopone this part of the discussion until Peter is back (next week), so we can have more eyes on it. >> We'd benefit from a separate "don't load zero pages" option only when >> the VM is guaranteed to be stopped and more importantly, not allowed to >> be unstopped. This is the tricky part. We have experimented with and >> dropped the idea of detecting a stopped-vm-migration for mapped-ram (the >> idea back then was not to do better zero page handling, but skip dirty >> tracking altogether). >> >> Since we're dealing with snapshots here, which are asynchronous, I'm >> wondering wheter it would make sense to extend the zero-page-detection >> option to the destination. Then we could bind the loadvm process to >> zero-page-detection=none because I don't think we can safely ignore them >> anyway. > >> > My overall goal is a hot-loadvm feature that currently lives in a fork >> > downstream and has a long way before getting in a mergeable state :) >> >> Cool. Don't hesitate to send stuff our way, the sooner and more often we >> discuss this, the better the chances of getting it merged upstream. >> >> Do you use libvirt at all? 
Mapped-ram support has been added to it in >> the latest version. The original use-case for mapped-ram was quick >> snapshot saving and loading after all. Libvirt does it in a way that >> does not use savevm, which might be helpful. > > No, I don't use libvirt. Thanks for the hint, but in that case I guess > libvirt would spawn a new "QEMU -incoming" for each restore, and > that would be too expensive. > >> > In a nutshell, I'm using dirty page tracking to load from the snapshot >> > only the pages that have been dirtied between two loadvm; >> > mapped-ram is required to seek and read only the dirtied pages. >> >> But then you'd have a bitmap of pages you could correlate with the >> file_bmap and force-load whatever pages you need. It doesn't seem like >> zero page handling would affect you too much in that case. > > Perhaps I'm missing your point; if a page was zero in the snapshot > and then gets dirtied, I need to zero it out. I used "zero page handling" as "a policy on how to deal with zero pages", similarly to the zero-page-detection option. The page is being zeroed because it was dirty (on your dirty bitmap) not because some policy determined that it needs to be zeroed. IOW, it seems your solution doesn't need to have any knowledge of whether the whole memory is supposed to be zeroed (as discussed above), the presence of a dirty bitmap already selects which pages are restored and if there's a 1 for that page in the bitmap, then we don't really care what's in the file_bmap. If we implement generic snapshot + mapped-ram + zero page then you could probably simply OR your dirty bitmap on top of the file_bmap. Does that makes sense? I might be missing something, I'm looking at a bunch of threads before going on vacation. > But yeah, the code will be > different and for my specific use case I don't absolutely need > mapped-ram snapshots restore to fully work. > > Thank you. > Best, > Marco
On Thursday, April 17, 2025 17:12 CEST, Fabiano Rosas <farosas@suse.de> wrote:
> For enabling mapped-ram generically at this point I think we don't have
> much choice but to introduce a new version of read_ramblock_mapped_ram()
> that writes zero pages. It would need to discriminate the zero pages,
> (instead of doing a big copy of a bunch of pages) so we could avoid
> copying a page that's already zero. In fact, if we do that, it would
> already work without further changes. The performance of -loadvm would
> leave something to be improved, but we could tackle that later.
>
> What do you think?
>
> If we decide we need to explicitly select that new code, I don't think
> we need any new capability, because the user has no choice in it. We
> know that loadvm needs it, but -loadvm doesn't, any other sort of
> mapped-ram migration also doesn't need it. There is some discussion to
> be had on whether we want to bind it to the commands themselves, or do
> some form of detection of clean ram. I think it's best we postpone this
> part of the discussion until Peter is back (next week), so we can have
> more eyes on it.
The scenarios where zeroing is not required (incoming migration and
-loadvm) share a common characteristic: the VM has not yet run in the
current QEMU process.
To avoid splitting read_ramblock_mapped_ram(), could we implement
a check to determine if the VM has ever run and decide whether to zero
the memory based on that? Maybe using RunState?
Then we can add something like this to read_ramblock_mapped_ram()
...
clear_bit_idx = 0;
for (...) {
// Zero pages
if (guest_has_ever_run()) {
unread = TARGET_PAGE_SIZE * (set_bit_idx - clear_bit_idx);
offset = clear_bit_idx << TARGET_PAGE_BITS;
host = host_from_ram_block_offset(block, offset);
if (!host) {...}
ram_handle_zero(host, unread);
}
// Non-zero pages
clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
...
(Plus trailing zero pages handling)
> >> We'd benefit from a separate "don't load zero pages" option only when
> >> the VM is guaranteed to be stopped and more importantly, not allowed to
> >> be unstopped. This is the tricky part. We have experimented with and
> >> dropped the idea of detecting a stopped-vm-migration for mapped-ram (the
> >> idea back then was not to do better zero page handling, but skip dirty
> >> tracking altogether).
> >>
> >> Since we're dealing with snapshots here, which are asynchronous, I'm
> >> wondering whether it would make sense to extend the zero-page-detection
> >> option to the destination. Then we could bind the loadvm process to
> >> zero-page-detection=none because I don't think we can safely ignore them
> >> anyway.
> >
> >> > My overall goal is a hot-loadvm feature that currently lives in a fork
> >> > downstream and has a long way before getting in a mergeable state :)
> >>
> >> Cool. Don't hesitate to send stuff our way, the sooner and more often we
> >> discuss this, the better the chances of getting it merged upstream.
> >>
> >> Do you use libvirt at all? Mapped-ram support has been added to it in
> >> the latest version. The original use-case for mapped-ram was quick
> >> snapshot saving and loading after all. Libvirt does it in a way that
> >> does not use savevm, which might be helpful.
> >
> > No, I don't use libvirt. Thanks for the hint, but in that case I guess
> > libvirt would spawn a new "QEMU -incoming" for each restore, and
> > that would be too expensive.
> >
> >> > In a nutshell, I'm using dirty page tracking to load from the snapshot
> >> > only the pages that have been dirtied between two loadvm;
> >> > mapped-ram is required to seek and read only the dirtied pages.
> >>
> >> But then you'd have a bitmap of pages you could correlate with the
> >> file_bmap and force-load whatever pages you need. It doesn't seem like
> >> zero page handling would affect you too much in that case.
> >
> > Perhaps I'm missing your point; if a page was zero in the snapshot
> > and then gets dirtied, I need to zero it out.
>
> I used "zero page handling" as "a policy on how to deal with zero
> pages", similarly to the zero-page-detection option.
>
> The page is being zeroed because it was dirty (on your dirty bitmap) not
> because some policy determined that it needs to be zeroed. IOW, it seems
> your solution doesn't need to have any knowledge of whether the whole
> memory is supposed to be zeroed (as discussed above), the presence of a
> dirty bitmap already selects which pages are restored and if there's a 1
> for that page in the bitmap, then we don't really care what's in the
> file_bmap.
>
> If we implement generic snapshot + mapped-ram + zero page then you could
> probably simply OR your dirty bitmap on top of the file_bmap.
>
> Does that make sense? I might be missing something, I'm looking at a
> bunch of threads before going on vacation.
It makes sense, but special handling for zero pages instead of simply
ORing the two maps could improve performance.
If dirty = true and file_bmap = false, instead of reading the zero
page from the snapshot I can simply memset to 0 as above.
Enjoy your vacation :)
Thank you.
Best,
Marco
On Thu, 27 Mar 2025 at 20:03, Marco Cavenati <Marco.Cavenati@eurecom.fr> wrote:

> Enable the use of the mapped-ram migration feature with savevm/loadvm
> snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
> QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
> positioned I/O capabilities that don't modify the channel's position
> pointer.
>
> migration/channel-block.c | 48 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 48 insertions(+)
>
> +++ b/migration/channel-block.c
> @@ -30,6 +30,7 @@ qio_channel_block_new(BlockDriverState *bs)
> QIOChannelBlock *ioc;
>
> ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK));
> + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
>

* IIUC, when _FEATURE_SEEKABLE is set, the channel I/O sequence
eventually makes underlying preadv(2)/pwritev(2) calls, which use
lseek(2) to adjust the stream r/w pointer with the given offset,
before doing the r/w operation.

* QIOChannelBlock has '.io_seek' routine corresponding to the lseek(2)
call. It is not clear how setting QIO_CHANNEL_FEATURE_SEEKABLE helps
for QIOChannelBlock streams.

Thank you.
---
- Prasad
On Fri, Apr 04, 2025 at 01:49:44PM +0530, Prasad Pandit wrote:
> On Thu, 27 Mar 2025 at 20:03, Marco Cavenati <Marco.Cavenati@eurecom.fr> wrote:
> > Enable the use of the mapped-ram migration feature with savevm/loadvm
> > snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to
> > QIOChannelBlock. Implement io_preadv and io_pwritev methods to provide
> > positioned I/O capabilities that don't modify the channel's position
> > pointer.
> >
> > migration/channel-block.c | 48 +++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 48 insertions(+)
> >
> > +++ b/migration/channel-block.c
> > @@ -30,6 +30,7 @@ qio_channel_block_new(BlockDriverState *bs)
> > QIOChannelBlock *ioc;
> >
> > ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK));
> > + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
> >
>
> * IIUC, when _FEATURE_SEEKABLE is set, the channel I/O sequence
> eventually makes underlying preadv(2)/pwritev(2) calls, which use
> lseek(2) to adjust the stream r/w pointer with the given offset,
> before doing the r/w operation.
>
> * QIOChannelBlock has '.io_seek' routine corresponding to the lseek(2)
> call. It is not clear how setting QIO_CHANNEL_FEATURE_SEEKABLE helps
> for QIOChannelBlock streams.
In QIOChannel API specification, having QIO_CHANNEL_FEATURE_SEEKABLE
set is a pre-requisite for use of qio_channel_{pread,preadv,pwrite,pwritev}.
It is actually NOT really connected to lseek, and as such
QIO_CHANNEL_FEATURE_SEEKABLE is probably a bad choice of name
in retrospect.
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Hi,
On Tue, 15 Apr 2025 at 15:51, Daniel P. Berrangé <berrange@redhat.com> wrote:
> It is actually NOT really connected to lseek, and as such
* For file and fd channels _FEATURE_SEEKABLE is set when/if lseek(2) works.
-> https://gitlab.com/qemu-project/qemu/-/commit/401e311ff72e0a62c834bfe466de68a82cfd90cb
> QIO_CHANNEL_FEATURE_SEEKABLE is probably a bad choice of name
> in retrospect.
* That's plausible. Because while reading code, _SEEKABLE indicates
(or hints) that the underlying stream allows random access at a given
offset. Even pread(2)/pwrite(2) manuals say that -> file referenced by
fd should be capable of seeking. So correlation is that, since
QIO_CHANNEL is an abstraction layer, it supports different
streams/channels underneath, maybe some of those underneath streams
support seeking (random access) and some don't. Hence we set
_FEATURE_SEEKABLE when the underlying channel/stream supports it.
> > In QIOChannel API specification, having QIO_CHANNEL_FEATURE_SEEKABLE
> set is a pre-requisite for use of qio_channel_{pread,preadv,pwrite,pwritev}.
* If *_FEATURE_SEEKABLE is not connected to lseek(2) or seeking, what
is its right interpretation? Why is it a pre-requisite for use of
qio_channel_{pread,preadv,pwrite,pwritev} functions?
Thank you.
---
- Prasad
On Tue, Apr 15, 2025 at 04:14:08PM +0530, Prasad Pandit wrote:
> Hi,
>
> On Tue, 15 Apr 2025 at 15:51, Daniel P. Berrangé <berrange@redhat.com> wrote:
> > It is actually NOT really connected to lseek, and as such
>
> * For file and fd channels _FEATURE_SEEKABLE is set when/if lseek(2) works.
> -> https://gitlab.com/qemu-project/qemu/-/commit/401e311ff72e0a62c834bfe466de68a82cfd90cb
Yes I know, checking lseek is a proxy that determines whether
preadv will work or not. This is basically validating that
the file/fd is not a FIFO/pipe/character device. It must be
a block device or regular file.
> > QIO_CHANNEL_FEATURE_SEEKABLE is probably a bad choice of name
> > in retrospect.
>
> * That's plausible. Because while reading code, _SEEKABLE indicates
> (or hints) that the underlying stream allows random access at a given
> offset. Even pread(2)/pwrite(2) manuals say that -> file referenced by
> fd should be capable of seeking. So correlation is that, since
> QIO_CHANNEL is an abstraction layer, it supports different
> streams/channels underneath, maybe some of those underneath streams
> support seeking (random access) and some don't. Hence we set
> _FEATURE_SEEKABLE when the underlying channel/stream supports it.
>
> > > In QIOChannel API specification, having QIO_CHANNEL_FEATURE_SEEKABLE
> > set is a pre-requisite for use of qio_channel_{pread,preadv,pwrite,pwritev}.
>
> * If *_FEATURE_SEEKABLE is not connected to lseek(2) or seeking, what
> is its right interpretation? Why is it a pre-requisite for use of
> qio_channel_{pread,preadv,pwrite,pwritev} functions?
Because that's what the QEMU API specification declares
* Not all implementations will support this facility, so may report
* an error. To avoid errors, the caller may check for the feature
* flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
and what the QEMU API impl defines
ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
                            size_t niov, off_t offset, Error **errp)
{
    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
    if (!klass->io_pwritev) {
        error_setg(errp, "Channel does not support pwritev");
        return -1;
    }
    if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
        error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
        return -1;
    }
    return klass->io_pwritev(ioc, iov, niov, offset, errp);
}
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
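As a standalone illustration of the "lseek as a proxy" check described above
(plain POSIX, not the QEMU channel code): lseek(2) fails with ESPIPE on pipes,
FIFOs and sockets, which are exactly the descriptors on which
preadv(2)/pwritev(2) cannot work.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* A fd supports positioned I/O iff lseek() works on it. */
static int fd_is_seekable(int fd)
{
    return lseek(fd, 0, SEEK_CUR) != (off_t)-1;
}

int main(void)
{
    int pipefd[2];
    int filefd = open("/etc/hostname", O_RDONLY);  /* any regular file */

    if (filefd < 0 || pipe(pipefd) < 0) {
        perror("setup");
        return 1;
    }
    printf("regular file seekable: %d\n", fd_is_seekable(filefd));
    int pipe_ok = fd_is_seekable(pipefd[0]);
    printf("pipe seekable: %d (errno=%d, ESPIPE=%d)\n", pipe_ok, errno, ESPIPE);
    return 0;
}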
On Tue, 15 Apr 2025 at 16:33, Daniel P. Berrangé <berrange@redhat.com> wrote:
> Because that's what the QEMU API specification declares
> * Not all implementations will support this facility, so may report
> * an error. To avoid errors, the caller may check for the feature
> * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
>
> and what the QEMU API impl defines
>
> if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
> error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
> return -1;
> }
* ie. _FEATURE_SEEKABLE should be set iff the underlying
channel/stream supports seek (random access) functionality, right?
That is quite connected with the lseek(2) OR ->io_seek() and such
support, no?
Thank you.
---
- Prasad
On Tue, Apr 15, 2025 at 05:27:02PM +0530, Prasad Pandit wrote:
> On Tue, 15 Apr 2025 at 16:33, Daniel P. Berrangé <berrange@redhat.com> wrote:
> > Because that's what the QEMU API specification declares
> > * Not all implementations will support this facility, so may report
> > * an error. To avoid errors, the caller may check for the feature
> > * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
> >
> > and what the QEMU API impl defines
> >
> > if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
> > error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
> > return -1;
> > }
>
> * ie. _FEATURE_SEEKABLE should be set iff the underlying
> channel/stream supports seek (random access) functionality, right?
> That is quite connected with the lseek(2) OR ->io_seek() and such
> support, no?
The current semantics of QIO_CHANNEL_FEATURE_SEEKABLE is that it indicates
whether qio_channel_{preadv/pwritev} are usable or not.
For plain files that may also indicate that lseek is possible. For the
block driver based channel that doesn't map as there's no concept of
current file marker for that.
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Hello Prasad,

On Friday, April 04, 2025 10:19 CEST, Prasad Pandit <ppandit@redhat.com> wrote:

> * IIUC, when _FEATURE_SEEKABLE is set, the channel I/O sequence
> eventually makes underlying preadv(2)/pwritev(2) calls, which use
> lseek(2) to adjust the stream r/w pointer with the given offset,
> before doing the r/w operation.

Almost. Unlike lseek(2), pread(2) and co. do not have side effects on the
fd's offset. From the man page:
> The pread() and pwrite() system calls are especially useful in
> multithreaded applications. They allow multiple threads to
> perform I/O on the same file descriptor without being affected by
> changes to the file offset by other threads.

> * QIOChannelBlock has '.io_seek' routine corresponding to the lseek(2)
> call. It is not clear how setting QIO_CHANNEL_FEATURE_SEEKABLE helps
> for QIOChannelBlock streams.

It is a first step in making QIOChannelBlock compatible with
mapped-ram feature that requires it since mapped-ram uses
io_preadv and io_pwritev methods.

Best,
Marco
Hi,

On Fri, 4 Apr 2025 at 14:35, Marco Cavenati <Marco.Cavenati@eurecom.fr> wrote:
> Almost. Unlike lseek(2), pread(2) and co. do not have side effects on the fd's offset.
> From the man page:
> > The pread() and pwrite() system calls are especially useful in
> > multithreaded applications. They allow multiple threads to
> > perform I/O on the same file descriptor without being affected by
> > changes to the file offset by other threads.

* If the r/w pointer adjustment (lseek(2)) is not required, then why
set the '*_FEATURE_SEEKABLE' flag?

> It is a first step in making QIOChannelBlock compatible with
> mapped-ram feature that requires it since mapped-ram uses
> io_preadv and io_pwritev methods.

* The qio_channel_block_preadv/pwritev functions defined above, which
shall be called via '->io_preadv' and '->io_pwritev' methods, appear
to call bdrv_readv/writev_vmstate() functions. Do those functions need
to adjust (lseek(2)) the stream r/w pointers?

Thank you.
---
- Prasad
On Friday, April 04, 2025 12:14 CEST, Prasad Pandit <ppandit@redhat.com> wrote:

> * If the r/w pointer adjustment (lseek(2)) is not required, then why
> set the '*_FEATURE_SEEKABLE' flag?

The QIO_CHANNEL_FEATURE_SEEKABLE flag is set to indicate that
the channel supports seekable operations. This flag is more about
signaling capability rather than dictating the use of the specific
lseek(2) function.

> * The qio_channel_block_preadv/pwritev functions defined above, which
> shall be called via '->io_preadv' and '->io_pwritev' methods, appear
> to call bdrv_readv/writev_vmstate() functions. Do those functions need
> to adjust (lseek(2)) the stream r/w pointers?

In this case, I don't think any lseek(2) is involved, instead some flavor
of pread(2) is used, which, according to the man page, requires that
> The file referenced by fd must be capable of seeking.
because pread(2) internally manages seeking without modifying the
file descriptor's offset.

Let me split the question here:
* Do those functions need to seek into the channel?
Yes
* Do those functions lseek(2) the stream r/w pointers?
No, they do not use lseek(2). The seeking is managed internally by the
pread(2) and co. functions, which perform I/O operations at
specified offsets without altering the file descriptor's position.

I hope this clarifies :)

Best,
Marco
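As a small standalone illustration of the point above (plain POSIX, unrelated
to the QEMU channel code), pread(2) reads at an explicit offset and leaves the
descriptor's file position untouched:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[16];
    int fd = open("/etc/hostname", O_RDONLY);   /* any regular file */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    off_t before = lseek(fd, 0, SEEK_CUR);
    ssize_t n = pread(fd, buf, sizeof(buf), 4); /* read at offset 4 */
    off_t after = lseek(fd, 0, SEEK_CUR);

    /* prints e.g. "read 12 bytes at offset 4; position 0 -> 0" */
    printf("read %zd bytes at offset 4; position %lld -> %lld\n",
           n, (long long)before, (long long)after);
    return 0;
}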
Hi,
On Fri, 4 Apr 2025 at 17:35, Marco Cavenati <Marco.Cavenati@eurecom.fr> wrote:
> The QIO_CHANNEL_FEATURE_SEEKABLE flag is set to indicate that
> the channel supports seekable operations. This flag is more about
> signaling capability rather than dictating the use of the specific
> lseek(2) function.
* Yes.
> In this case, I don't think any lseek(2) is involved, instead some flavor
> of pread(2) is used, which, according to the man page, requires that
> > The file referenced by fd must be capable of seeking.
> because pread(2) internally manages seeking without modifying the
> file descriptor's offset.
* I think "...capable of seeking" is referring to lseek(2) here,
because the man page mentions:
===
...
ERRORS
pread() can fail and set errno to any error specified for read(2) or
lseek(2). pwrite() can fail and set errno to any error specified for
write(2) or lseek(2).
====
ie. pread/pwrite internally makes lseek(2) call.
> Let me split the question here:
> * Do those functions need to seek into the channel?
> Yes
> * Do those functions lseek(2) the stream r/w pointers?
> No, they do not use lseek(2). The seeking is managed internally by the
> pread(2) and co. functions, which perform I/O operations at
> specified offsets without altering the file descriptor's position.
* If seeking is managed internally by pread(2)/pwrite(2) and co.
functions, then that is independent of the
'QIO_CHANNEL_FEATURE_SEEKABLE' flag; This flag is QEMU specific, it is
not available outside of QEMU/io/ system. pread(2)/pwrite(2) are
standard C library functions.
* Another question is: will pread(2)/pwrite(2) functions work the same
if we don't set the 'QIO_CHANNEL_FEATURE_SEEKABLE' flag?
(what I'm trying to say is: it is not clear how setting
'*_FEATURE_SEEKABLE' flag helps in case of QIOChannelBlock class)
Thank you.
---
- Prasad
Hello,

On Monday, April 07, 2025 08:47 CEST, Prasad Pandit <ppandit@redhat.com> wrote:

> * If seeking is managed internally by pread(2)/pwrite(2) and co.
> functions, then that is independent of the
> 'QIO_CHANNEL_FEATURE_SEEKABLE' flag; This flag is QEMU specific, it is
> not available outside of QEMU/io/ system. pread(2)/pwrite(2) are
> standard C library functions.
> * Another question is: will pread(2)/pwrite(2) functions work the same
> if we don't set the 'QIO_CHANNEL_FEATURE_SEEKABLE' flag?
>
> (what I'm trying to say is: it is not clear how setting
> '*_FEATURE_SEEKABLE' flag helps in case of QIOChannelBlock class)

As you said the capability is used internally. Its goal is to signal to
other QEMU code that the QIOChannel is seekable.
'qio_channel_pwritev' and 'qio_channel_preadv' can be used only if
the QIOChannel has the 'QIO_CHANNEL_FEATURE_SEEKABLE'
capability.

The mapped-ram migration checks if the channel has this capability
because it uses the aforementioned functions. Without the capability
and the functions implemented in this patch, the mapped-ram migration
won't work with QIOChannelBlock.

You can have a look at the patch where those functions were
introduced here [0].

Best,
Marco

[0] https://lore.kernel.org/qemu-devel/20240229153017.2221-4-farosas@suse.de/
On Mon, 7 Apr 2025 at 14:31, Marco Cavenati <Marco.Cavenati@eurecom.fr> wrote:
> As you said the capability is used internally. Its goal is to signal to
> other QEMU code that the QIOChannel is seekable.
> 'qio_channel_pwritev' and 'qio_channel_preadv' can be used only if
> the QIOChannel has the 'QIO_CHANNEL_FEATURE_SEEKABLE'
> capability.
>
> The mapped-ram migration checks if the channel has this capability
> because it uses the aforementioned functions. Without the capability
> and the functions implemented in this patch, the mapped-ram migration
> won't work with QIOChannelBlock.
>
> You can have a look at the patch where those functions were
> introduced here [0].
* _channel_preadv/_writev functions are generic. They are independent
of whether the underlying channel is file or socket or memory or
something else. They are called if and when they are defined and they
in turn call channel specific preadv/pwritev functions.
if (!klass->io_pwritev) {
error_setg(errp, "Channel does not support pwritev");
return -1;
}
* io: add and implement QIO_CHANNEL_FEATURE_SEEKABLE for channel file
-> https://gitlab.com/qemu-project/qemu/-/commit/401e311ff72e0a62c834bfe466de68a82cfd90cb
This commit sets the *_FEATURE_SEEKABLE flag for the file channel
when the lseek(2) call succeeds.
* ie. 'file' OR 'fd' channel is seekable when lseek(2) call works.
Similarly Block channel would be seekable when ->io_seek() method is
defined and it works. And ->io_seek() method is also called if and
when it is defined
qio_channel_io_seek
if (!klass->io_seek) {
error_setg(errp, "Channel does not support random
access");
return -1;
}
Setting '*_FEATURE_SEEKABLE' for the block channel does not ensure
that ->io_seek() is defined and works. It seems redundant that way.
Maybe I'm missing something here, not sure. Thank you.
---
- Prasad
On Tuesday, April 08, 2025 07:25 CEST, Prasad Pandit <ppandit@redhat.com> wrote:
> * _channel_preadv/_writev functions are generic. They are independent
> of whether the underlying channel is file or socket or memory or
> something else. They are called if and when they are defined and they
> in turn call channel specific preadv/pwritev functions.
>
> if (!klass->io_pwritev) {
> error_setg(errp, "Channel does not support pwritev");
> return -1;
> }
They also require QIO_CHANNEL_FEATURE_SEEKABLE.
if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
return -1;
}
> * io: add and implement QIO_CHANNEL_FEATURE_SEEKABLE for channel file
> -> https://gitlab.com/qemu-project/qemu/-/commit/401e311ff72e0a62c834bfe466de68a82cfd90cb
>
> This commit sets the *_FEATURE_SEEKABLE flag for the file channel
> when the lseek(2) call succeeds.
The QIOChannelBlock io_seek (qio_channel_block_seek) function
cannot fail for SEEK_CUR, no need to check.
> * ie. 'file' OR 'fd' channel is seekable when lseek(2) call works.
> Similarly Block channel would be seekable when ->io_seek() method is
> defined and it works. And ->io_seek() method is also called if and
> when it is defined
>
> qio_channel_io_seek
> if (!klass->io_seek) {
> error_setg(errp, "Channel does not support random
> access");
> return -1;
> }
>
> Setting '*_FEATURE_SEEKABLE' for the block channel does not ensure
> that ->io_seek() is defined and works.
QIOChannelBlock io_seek is always implemented and works
(except for SEEK_END).
> It seems redundant that way.
I'm not sure I understand what you mean here.
TLDR: QIOChannelBlock is already always seekable, this patch just adds
the capability explicitly so that it will work with mapped-ram.
Best
Marco