Add the direct-io migration parameter that tells the migration code to
use O_DIRECT when opening the migration stream file whenever possible.
This is currently only used for the secondary channels of fixed-ram
migration, which can guarantee that writes are page aligned.
However the parameter could be made to affect other types of
file-based migrations in the future.
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
include/qemu/osdep.h | 2 ++
migration/file.c | 15 ++++++++++++---
migration/migration-hmp-cmds.c | 10 ++++++++++
migration/options.c | 30 ++++++++++++++++++++++++++++++
migration/options.h | 1 +
qapi/migration.json | 17 ++++++++++++++---
util/osdep.c | 9 +++++++++
7 files changed, 78 insertions(+), 6 deletions(-)
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 475a1c62ff..ea5d29ab9b 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -597,6 +597,8 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
bool qemu_has_ofd_lock(void);
#endif
+bool qemu_has_direct_io(void);
+
#if defined(__HAIKU__) && defined(__i386__)
#define FMT_pid "%ld"
#elif defined(WIN64)
diff --git a/migration/file.c b/migration/file.c
index ad75225f43..3d3c58ecad 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -11,9 +11,9 @@
#include "qemu/error-report.h"
#include "channel.h"
#include "file.h"
-#include "migration.h"
#include "io/channel-file.h"
#include "io/channel-util.h"
+#include "migration.h"
#include "options.h"
#include "trace.h"
@@ -77,9 +77,18 @@ void file_send_channel_create(QIOTaskFunc f, void *data)
QIOChannelFile *ioc;
QIOTask *task;
Error *errp = NULL;
+ int flags = outgoing_args.flags;
- ioc = qio_channel_file_new_path(outgoing_args.fname,
- outgoing_args.flags,
+ if (migrate_direct_io() && qemu_has_direct_io()) {
+ /*
+ * Enable O_DIRECT for the secondary channels. These are used
+ * for sending ram pages and writes should be guaranteed to be
+ * aligned to at least page size.
+ */
+ flags |= O_DIRECT;
+ }
+
+ ioc = qio_channel_file_new_path(outgoing_args.fname, flags,
outgoing_args.mode, &errp);
if (!ioc) {
file_migration_cancel(errp);
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index a82597f18e..eab5ac3588 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -387,6 +387,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "%s: %" PRIu64 " MB/s\n",
MigrationParameter_str(MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT),
params->vcpu_dirty_limit);
+
+ if (params->has_direct_io) {
+ monitor_printf(mon, "%s: %s\n",
+ MigrationParameter_str(MIGRATION_PARAMETER_DIRECT_IO),
+ params->direct_io ? "on" : "off");
+ }
}
qapi_free_MigrationParameters(params);
@@ -661,6 +667,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
p->has_vcpu_dirty_limit = true;
visit_type_size(v, param, &p->vcpu_dirty_limit, &err);
break;
+ case MIGRATION_PARAMETER_DIRECT_IO:
+ p->has_direct_io = true;
+ visit_type_bool(v, param, &p->direct_io, &err);
+ break;
default:
assert(0);
}
diff --git a/migration/options.c b/migration/options.c
index 2193d69e71..6d0e3c26ae 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -817,6 +817,22 @@ int migrate_decompress_threads(void)
return s->parameters.decompress_threads;
}
+bool migrate_direct_io(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ /* For now O_DIRECT is only supported with fixed-ram */
+ if (!s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) {
+ return false;
+ }
+
+ if (s->parameters.has_direct_io) {
+ return s->parameters.direct_io;
+ }
+
+ return false;
+}
+
uint64_t migrate_downtime_limit(void)
{
MigrationState *s = migrate_get_current();
@@ -1025,6 +1041,11 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
params->has_vcpu_dirty_limit = true;
params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit;
+ if (s->parameters.has_direct_io) {
+ params->has_direct_io = true;
+ params->direct_io = s->parameters.direct_io;
+ }
+
return params;
}
@@ -1059,6 +1080,7 @@ void migrate_params_init(MigrationParameters *params)
params->has_announce_step = true;
params->has_x_vcpu_dirty_limit_period = true;
params->has_vcpu_dirty_limit = true;
+ params->has_direct_io = qemu_has_direct_io();
}
/*
@@ -1356,6 +1378,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
if (params->has_vcpu_dirty_limit) {
dest->vcpu_dirty_limit = params->vcpu_dirty_limit;
}
+
+ if (params->has_direct_io) {
+ dest->direct_io = params->direct_io;
+ }
}
static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1486,6 +1512,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
if (params->has_vcpu_dirty_limit) {
s->parameters.vcpu_dirty_limit = params->vcpu_dirty_limit;
}
+
+ if (params->has_direct_io) {
+ s->parameters.direct_io = params->direct_io;
+ }
}
void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/migration/options.h b/migration/options.h
index 01bba5b928..280f86bed1 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -82,6 +82,7 @@ uint8_t migrate_cpu_throttle_increment(void);
uint8_t migrate_cpu_throttle_initial(void);
bool migrate_cpu_throttle_tailslow(void);
int migrate_decompress_threads(void);
+bool migrate_direct_io(void);
uint64_t migrate_downtime_limit(void);
uint8_t migrate_max_cpu_throttle(void);
uint64_t migrate_max_bandwidth(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index 1317dd32ab..3eb9e2c9b5 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -840,6 +840,9 @@
# @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
# Defaults to 1. (Since 8.1)
#
+# @direct-io: Open migration files with O_DIRECT when possible. Not
+# all migration transports support this. (since 8.1)
+#
# Features:
#
# @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
@@ -864,7 +867,7 @@
'multifd-zlib-level', 'multifd-zstd-level',
'block-bitmap-mapping',
{ 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
- 'vcpu-dirty-limit'] }
+ 'vcpu-dirty-limit', 'direct-io'] }
##
# @MigrateSetParameters:
@@ -1016,6 +1019,9 @@
# @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
# Defaults to 1. (Since 8.1)
#
+# @direct-io: Open migration files with O_DIRECT when possible. Not
+# all migration transports support this. (since 8.1)
+#
# Features:
#
# @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
@@ -1058,7 +1064,8 @@
'*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
'*x-vcpu-dirty-limit-period': { 'type': 'uint64',
'features': [ 'unstable' ] },
- '*vcpu-dirty-limit': 'uint64'} }
+ '*vcpu-dirty-limit': 'uint64',
+ '*direct-io': 'bool' } }
##
# @migrate-set-parameters:
@@ -1230,6 +1237,9 @@
# @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
# Defaults to 1. (Since 8.1)
#
+# @direct-io: Open migration files with O_DIRECT when possible. Not
+# all migration transports support this. (since 8.1)
+#
# Features:
#
# @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
@@ -1269,7 +1279,8 @@
'*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
'*x-vcpu-dirty-limit-period': { 'type': 'uint64',
'features': [ 'unstable' ] },
- '*vcpu-dirty-limit': 'uint64'} }
+ '*vcpu-dirty-limit': 'uint64',
+ '*direct-io': 'bool' } }
##
# @query-migrate-parameters:
diff --git a/util/osdep.c b/util/osdep.c
index e996c4744a..d0227a60ab 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -277,6 +277,15 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
}
#endif
+bool qemu_has_direct_io(void)
+{
+#ifdef O_DIRECT
+ return true;
+#else
+ return false;
+#endif
+}
+
static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
{
int ret;
--
2.35.3
On Mon, Oct 23, 2023 at 05:36:07PM -0300, Fabiano Rosas wrote:
> Add the direct-io migration parameter that tells the migration code to
> use O_DIRECT when opening the migration stream file whenever possible.
>
> This is currently only used for the secondary channels of fixed-ram
> migration, which can guarantee that writes are page aligned.

When you say "secondary channels", I presume you're meaning that
the bulk memory regions will be written with O_DIRECT, while
general vmstate will use normal I/O on the main channel ? If so,
could we explain that a little more directly.

Having a mixture of O_DIRECT and non-O_DIRECT I/O on the same
file is a little bit of an unusual situation. It will work for
us because we're writing to different regions of the file in
each case.

Still I wonder if it would be sane in the outgoing case to
include a fsync() on the file in the main channel, to guarantee
that the whole saved file is on-media at completion ? Or perhaps
suggest in QAPI that mgmts might consider doing a fsync
themselves ?

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:36:07PM -0300, Fabiano Rosas wrote:
>> Add the direct-io migration parameter that tells the migration code to
>> use O_DIRECT when opening the migration stream file whenever possible.
>>
>> This is currently only used for the secondary channels of fixed-ram
>> migration, which can guarantee that writes are page aligned.
>
> When you say "secondary channels", I presume you're meaning that
> the bulk memory regions will be written with O_DIRECT, while
> general vmstate will use normal I/O on the main channel ? If so,
> could we explain that a little more directly.

Yes, the main channel writes via QEMUFile, so no O_DIRECT. The channels
created via multifd_new_send_channel_create() have O_DIRECT enabled.

> Having a mixture of O_DIRECT and non-O_DIRECT I/O on the same
> file is a little bit of an unusual situation. It will work for
> us because we're writing to different regions of the file in
> each case.
>
> Still I wonder if it would be sane in the outgoing case to
> include a fsync() on the file in the main channel, to guarantee
> that the whole saved file is on-media at completion ? Or perhaps
> suggest in QAPI that mgmts might consider doing a fsync
> themselves ?

I think that should be present in QIOChannelFile in general. Not even
related to this series. I'll add it at qio_channel_file_close() unless
you object.
On Wed, Oct 25, 2023 at 11:48:08AM -0300, Fabiano Rosas wrote:
> Daniel P. Berrangé <berrange@redhat.com> writes:
>
> > On Mon, Oct 23, 2023 at 05:36:07PM -0300, Fabiano Rosas wrote:
> >> Add the direct-io migration parameter that tells the migration code to
> >> use O_DIRECT when opening the migration stream file whenever possible.
> >>
> >> This is currently only used for the secondary channels of fixed-ram
> >> migration, which can guarantee that writes are page aligned.
> >
> > When you say "secondary channels", I presume you're meaning that
> > the bulk memory regions will be written with O_DIRECT, while
> > general vmstate will use normal I/O on the main channel ? If so,
> > could we explain that a little more directly.
>
> Yes, the main channel writes via QEMUFile, so no O_DIRECT. The channels
> created via multifd_new_send_channel_create() have O_DIRECT enabled.
>
> > Having a mixture of O_DIRECT and non-O_DIRECT I/O on the same
> > file is a little bit of an unusual situation. It will work for
> > us because we're writing to different regions of the file in
> > each case.
> >
> > Still I wonder if it would be sane in the outgoing case to
> > include a fsync() on the file in the main channel, to guarantee
> > that the whole saved file is on-media at completion ? Or perhaps
> > suggest in QAPI that mgmts might consider doing a fsync
> > themselves ?
>
> I think that should be present in QIOChannelFile in general. Not even
> related to this series. I'll add it at qio_channel_file_close() unless
> you object.

Yes, looking at the places QIOChannelFile is used, I think they would
all benefit from fdatasync(). fsync() is probably too big of a hammer

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
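For reference, a minimal sketch of what that could look like, assuming the
sync lands in qio_channel_file_close() in io/channel-file.c and reuses the
existing qemu_fdatasync() helper (the placement and error wording here are
only illustrative, not the final implementation):

static int qio_channel_file_close(QIOChannel *ioc, Error **errp)
{
    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);

    /*
     * Flush written data out to storage before closing; fdatasync()
     * avoids the extra metadata-only flush that fsync() would do.
     */
    if (qemu_fdatasync(fioc->fd) < 0) {
        error_setg_errno(errp, errno,
                         "Unable to synchronize file data with storage device");
        return -1;
    }

    if (qemu_close(fioc->fd) < 0) {
        error_setg_errno(errp, errno, "Unable to close file");
        return -1;
    }
    fioc->fd = -1;
    return 0;
}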
On Mon, Oct 23, 2023 at 05:36:07PM -0300, Fabiano Rosas wrote:
> Add the direct-io migration parameter that tells the migration code to
> use O_DIRECT when opening the migration stream file whenever possible.
>
> This is currently only used for the secondary channels of fixed-ram
> migration, which can guarantee that writes are page aligned.
>
> However the parameter could be made to affect other types of
> file-based migrations in the future.
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> include/qemu/osdep.h | 2 ++
> migration/file.c | 15 ++++++++++++---
> migration/migration-hmp-cmds.c | 10 ++++++++++
> migration/options.c | 30 ++++++++++++++++++++++++++++++
> migration/options.h | 1 +
> qapi/migration.json | 17 ++++++++++++++---
> util/osdep.c | 9 +++++++++
> 7 files changed, 78 insertions(+), 6 deletions(-)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index 475a1c62ff..ea5d29ab9b 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -597,6 +597,8 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
> bool qemu_has_ofd_lock(void);
> #endif
>
> +bool qemu_has_direct_io(void);
> +
> #if defined(__HAIKU__) && defined(__i386__)
> #define FMT_pid "%ld"
> #elif defined(WIN64)
> diff --git a/migration/file.c b/migration/file.c
> index ad75225f43..3d3c58ecad 100644
> --- a/migration/file.c
> +++ b/migration/file.c
> @@ -11,9 +11,9 @@
> #include "qemu/error-report.h"
> #include "channel.h"
> #include "file.h"
> -#include "migration.h"
> #include "io/channel-file.h"
> #include "io/channel-util.h"
> +#include "migration.h"
> #include "options.h"
> #include "trace.h"
>
> @@ -77,9 +77,18 @@ void file_send_channel_create(QIOTaskFunc f, void *data)
> QIOChannelFile *ioc;
> QIOTask *task;
> Error *errp = NULL;
> + int flags = outgoing_args.flags;
>
> - ioc = qio_channel_file_new_path(outgoing_args.fname,
> - outgoing_args.flags,
> + if (migrate_direct_io() && qemu_has_direct_io()) {
> + /*
> + * Enable O_DIRECT for the secondary channels. These are used
> + * for sending ram pages and writes should be guaranteed to be
> + * aligned to at least page size.
> + */
> + flags |= O_DIRECT;
> + }
IMHO we should not be silently ignoring the user's request for
direct I/O if we've been compiled for a platform which can't
support it. We should fail the setting of the direct I/O
parameter.
Also this is referencing O_DIRECT without any #ifdef check.
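To make both points concrete, one possible shape for the hunk above, sketched
purely as an illustration (the qemu_has_direct_io() call becomes redundant
once the block is compiled out), would be:

#ifdef O_DIRECT
    if (migrate_direct_io()) {
        /*
         * Enable O_DIRECT for the secondary channels. These are used
         * for sending ram pages and writes should be guaranteed to be
         * aligned to at least page size.
         */
        flags |= O_DIRECT;
    }
#endif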
> +
> + ioc = qio_channel_file_new_path(outgoing_args.fname, flags,
> outgoing_args.mode, &errp);
> if (!ioc) {
> file_migration_cancel(errp);
> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
> index a82597f18e..eab5ac3588 100644
> --- a/migration/migration-hmp-cmds.c
> +++ b/migration/migration-hmp-cmds.c
> @@ -387,6 +387,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
> monitor_printf(mon, "%s: %" PRIu64 " MB/s\n",
> MigrationParameter_str(MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT),
> params->vcpu_dirty_limit);
> +
> + if (params->has_direct_io) {
> + monitor_printf(mon, "%s: %s\n",
> + MigrationParameter_str(MIGRATION_PARAMETER_DIRECT_IO),
> + params->direct_io ? "on" : "off");
> + }
> }
>
> qapi_free_MigrationParameters(params);
> @@ -661,6 +667,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
> p->has_vcpu_dirty_limit = true;
> visit_type_size(v, param, &p->vcpu_dirty_limit, &err);
> break;
> + case MIGRATION_PARAMETER_DIRECT_IO:
> + p->has_direct_io = true;
> + visit_type_bool(v, param, &p->direct_io, &err);
> + break;
> default:
> assert(0);
> }
> diff --git a/migration/options.c b/migration/options.c
> index 2193d69e71..6d0e3c26ae 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -817,6 +817,22 @@ int migrate_decompress_threads(void)
> return s->parameters.decompress_threads;
> }
>
> +bool migrate_direct_io(void)
> +{
> + MigrationState *s = migrate_get_current();
> +
> + /* For now O_DIRECT is only supported with fixed-ram */
> + if (!s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) {
> + return false;
> + }
> +
> + if (s->parameters.has_direct_io) {
> + return s->parameters.direct_io;
> + }
> +
> + return false;
> +}
> +
> uint64_t migrate_downtime_limit(void)
> {
> MigrationState *s = migrate_get_current();
> @@ -1025,6 +1041,11 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
> params->has_vcpu_dirty_limit = true;
> params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit;
>
> + if (s->parameters.has_direct_io) {
> + params->has_direct_io = true;
> + params->direct_io = s->parameters.direct_io;
> + }
> +
> return params;
> }
>
> @@ -1059,6 +1080,7 @@ void migrate_params_init(MigrationParameters *params)
> params->has_announce_step = true;
> params->has_x_vcpu_dirty_limit_period = true;
> params->has_vcpu_dirty_limit = true;
> + params->has_direct_io = qemu_has_direct_io();
> }
>
> /*
> @@ -1356,6 +1378,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
> if (params->has_vcpu_dirty_limit) {
> dest->vcpu_dirty_limit = params->vcpu_dirty_limit;
> }
> +
> + if (params->has_direct_io) {
> + dest->direct_io = params->direct_io;
> + }
> }
>
> static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
> @@ -1486,6 +1512,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
> if (params->has_vcpu_dirty_limit) {
> s->parameters.vcpu_dirty_limit = params->vcpu_dirty_limit;
> }
> +
> + if (params->has_direct_io) {
#ifndef O_DIRECT
error_setg(errp, "Direct I/O is not supported on this platform");
#endif
Should also be doing a check for the 'fixed-ram' capability being
set at this point.
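Putting those two requirements together, a rough sketch of how the check
could look here in migrate_params_apply() (the error wording is illustrative,
and it may fit better in migrate_params_check()):

    if (params->has_direct_io) {
#ifdef O_DIRECT
        if (!s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) {
            error_setg(errp, "direct-io requires the 'fixed-ram' capability");
            return;
        }
        s->parameters.direct_io = params->direct_io;
#else
        error_setg(errp, "Direct I/O is not supported on this platform");
#endif
    }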
> + s->parameters.direct_io = params->direct_io;
> + }
> }
>
> void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
> diff --git a/migration/options.h b/migration/options.h
> index 01bba5b928..280f86bed1 100644
> --- a/migration/options.h
> +++ b/migration/options.h
> @@ -82,6 +82,7 @@ uint8_t migrate_cpu_throttle_increment(void);
> uint8_t migrate_cpu_throttle_initial(void);
> bool migrate_cpu_throttle_tailslow(void);
> int migrate_decompress_threads(void);
> +bool migrate_direct_io(void);
> uint64_t migrate_downtime_limit(void);
> uint8_t migrate_max_cpu_throttle(void);
> uint64_t migrate_max_bandwidth(void);
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 1317dd32ab..3eb9e2c9b5 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -840,6 +840,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
"Not all migration transports support this" is too vague.
Lets say what the requirement is
"This requires that the 'fixed-ram' capability is enabled"
> +#
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -864,7 +867,7 @@
> 'multifd-zlib-level', 'multifd-zstd-level',
> 'block-bitmap-mapping',
> { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
> - 'vcpu-dirty-limit'] }
> + 'vcpu-dirty-limit', 'direct-io'] }
>
> ##
> # @MigrateSetParameters:
> @@ -1016,6 +1019,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
> +#
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -1058,7 +1064,8 @@
> '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
> '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> 'features': [ 'unstable' ] },
> - '*vcpu-dirty-limit': 'uint64'} }
> + '*vcpu-dirty-limit': 'uint64',
> + '*direct-io': 'bool' } }
>
> ##
> # @migrate-set-parameters:
> @@ -1230,6 +1237,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
> +#
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -1269,7 +1279,8 @@
> '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
> '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> 'features': [ 'unstable' ] },
> - '*vcpu-dirty-limit': 'uint64'} }
> + '*vcpu-dirty-limit': 'uint64',
> + '*direct-io': 'bool' } }
>
> ##
> # @query-migrate-parameters:
> diff --git a/util/osdep.c b/util/osdep.c
> index e996c4744a..d0227a60ab 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -277,6 +277,15 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
> }
> #endif
>
> +bool qemu_has_direct_io(void)
> +{
> +#ifdef O_DIRECT
> + return true;
> +#else
> + return false;
> +#endif
> +}
> +
> static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
> {
> int ret;
> --
> 2.35.3
>
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:
> On Mon, Oct 23, 2023 at 05:36:07PM -0300, Fabiano Rosas wrote:
>> Add the direct-io migration parameter that tells the migration code to
>> use O_DIRECT when opening the migration stream file whenever possible.
>>
>> This is currently only used for the secondary channels of fixed-ram
>> migration, which can guarantee that writes are page aligned.
>>
>> However the parameter could be made to affect other types of
>> file-based migrations in the future.
>>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>> include/qemu/osdep.h | 2 ++
>> migration/file.c | 15 ++++++++++++---
>> migration/migration-hmp-cmds.c | 10 ++++++++++
>> migration/options.c | 30 ++++++++++++++++++++++++++++++
>> migration/options.h | 1 +
>> qapi/migration.json | 17 ++++++++++++++---
>> util/osdep.c | 9 +++++++++
>> 7 files changed, 78 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> index 475a1c62ff..ea5d29ab9b 100644
>> --- a/include/qemu/osdep.h
>> +++ b/include/qemu/osdep.h
>> @@ -597,6 +597,8 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
>> bool qemu_has_ofd_lock(void);
>> #endif
>>
>> +bool qemu_has_direct_io(void);
>> +
>> #if defined(__HAIKU__) && defined(__i386__)
>> #define FMT_pid "%ld"
>> #elif defined(WIN64)
>> diff --git a/migration/file.c b/migration/file.c
>> index ad75225f43..3d3c58ecad 100644
>> --- a/migration/file.c
>> +++ b/migration/file.c
>> @@ -11,9 +11,9 @@
>> #include "qemu/error-report.h"
>> #include "channel.h"
>> #include "file.h"
>> -#include "migration.h"
>> #include "io/channel-file.h"
>> #include "io/channel-util.h"
>> +#include "migration.h"
>> #include "options.h"
>> #include "trace.h"
>>
>> @@ -77,9 +77,18 @@ void file_send_channel_create(QIOTaskFunc f, void *data)
>> QIOChannelFile *ioc;
>> QIOTask *task;
>> Error *errp = NULL;
>> + int flags = outgoing_args.flags;
>>
>> - ioc = qio_channel_file_new_path(outgoing_args.fname,
>> - outgoing_args.flags,
>> + if (migrate_direct_io() && qemu_has_direct_io()) {
>> + /*
>> + * Enable O_DIRECT for the secondary channels. These are used
>> + * for sending ram pages and writes should be guaranteed to be
>> + * aligned to at least page size.
>> + */
>> + flags |= O_DIRECT;
>> + }
>
> IMHO we should not be silently ignoring the user's request for
> direct I/O if we've been comiled for a platform which can't
> support it. We should fail the setting of the direct I/O
> parameter
>
Good point.
> Also this is referencing O_DIRECT without any #ifdef check.
>
Ack.
>> +
>> + ioc = qio_channel_file_new_path(outgoing_args.fname, flags,
>> outgoing_args.mode, &errp);
>> if (!ioc) {
>> file_migration_cancel(errp);
>> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
>> index a82597f18e..eab5ac3588 100644
>> --- a/migration/migration-hmp-cmds.c
>> +++ b/migration/migration-hmp-cmds.c
>> @@ -387,6 +387,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
>> monitor_printf(mon, "%s: %" PRIu64 " MB/s\n",
>> MigrationParameter_str(MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT),
>> params->vcpu_dirty_limit);
>> +
>> + if (params->has_direct_io) {
>> + monitor_printf(mon, "%s: %s\n",
>> + MigrationParameter_str(MIGRATION_PARAMETER_DIRECT_IO),
>> + params->direct_io ? "on" : "off");
>> + }
>> }
>>
>> qapi_free_MigrationParameters(params);
>> @@ -661,6 +667,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
>> p->has_vcpu_dirty_limit = true;
>> visit_type_size(v, param, &p->vcpu_dirty_limit, &err);
>> break;
>> + case MIGRATION_PARAMETER_DIRECT_IO:
>> + p->has_direct_io = true;
>> + visit_type_bool(v, param, &p->direct_io, &err);
>> + break;
>> default:
>> assert(0);
>> }
>> diff --git a/migration/options.c b/migration/options.c
>> index 2193d69e71..6d0e3c26ae 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -817,6 +817,22 @@ int migrate_decompress_threads(void)
>> return s->parameters.decompress_threads;
>> }
>>
>> +bool migrate_direct_io(void)
>> +{
>> + MigrationState *s = migrate_get_current();
>> +
>> + /* For now O_DIRECT is only supported with fixed-ram */
>> + if (!s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) {
>> + return false;
>> + }
>> +
>> + if (s->parameters.has_direct_io) {
>> + return s->parameters.direct_io;
>> + }
>> +
>> + return false;
>> +}
>> +
>> uint64_t migrate_downtime_limit(void)
>> {
>> MigrationState *s = migrate_get_current();
>> @@ -1025,6 +1041,11 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
>> params->has_vcpu_dirty_limit = true;
>> params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit;
>>
>> + if (s->parameters.has_direct_io) {
>> + params->has_direct_io = true;
>> + params->direct_io = s->parameters.direct_io;
>> + }
>> +
>> return params;
>> }
>>
>> @@ -1059,6 +1080,7 @@ void migrate_params_init(MigrationParameters *params)
>> params->has_announce_step = true;
>> params->has_x_vcpu_dirty_limit_period = true;
>> params->has_vcpu_dirty_limit = true;
>> + params->has_direct_io = qemu_has_direct_io();
>> }
>>
>> /*
>> @@ -1356,6 +1378,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
>> if (params->has_vcpu_dirty_limit) {
>> dest->vcpu_dirty_limit = params->vcpu_dirty_limit;
>> }
>> +
>> + if (params->has_direct_io) {
>> + dest->direct_io = params->direct_io;
>> + }
>> }
>>
>> static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
>> @@ -1486,6 +1512,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
>> if (params->has_vcpu_dirty_limit) {
>> s->parameters.vcpu_dirty_limit = params->vcpu_dirty_limit;
>> }
>> +
>> + if (params->has_direct_io) {
>
> #ifndef O_DIRECT
> error_setg(errp, "Direct I/O is not supported on this platform");
> #endif
>
> Should also be doing a check for the 'fixed-ram' capability being
> set at this point.
>
Ok.
Fabiano Rosas <farosas@suse.de> writes:
> Add the direct-io migration parameter that tells the migration code to
> use O_DIRECT when opening the migration stream file whenever possible.
>
> This is currently only used for the secondary channels of fixed-ram
> migration, which can guarantee that writes are page aligned.
>
> However the parameter could be made to affect other types of
> file-based migrations in the future.
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
When would you want to enable @direct-io, and when would you want to
leave it disabled?
What happens when you enable @direct-io with a migration that cannot use
O_DIRECT?
[...]
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 1317dd32ab..3eb9e2c9b5 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -840,6 +840,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
> +#
Please format like
# @direct-io: Open migration files with O_DIRECT when possible.  Not
#     all migration transports support this.  (since 8.1)
to blend in with recent commit a937b6aa739 (qapi: Reformat doc comments
to conform to current conventions).
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -864,7 +867,7 @@
> 'multifd-zlib-level', 'multifd-zstd-level',
> 'block-bitmap-mapping',
> { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
> - 'vcpu-dirty-limit'] }
> + 'vcpu-dirty-limit', 'direct-io'] }
>
> ##
> # @MigrateSetParameters:
> @@ -1016,6 +1019,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
> +#
Likewise.
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -1058,7 +1064,8 @@
> '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
> '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> 'features': [ 'unstable' ] },
> - '*vcpu-dirty-limit': 'uint64'} }
> + '*vcpu-dirty-limit': 'uint64',
> + '*direct-io': 'bool' } }
>
> ##
> # @migrate-set-parameters:
> @@ -1230,6 +1237,9 @@
> # @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live migration.
> # Defaults to 1. (Since 8.1)
> #
> +# @direct-io: Open migration files with O_DIRECT when possible. Not
> +# all migration transports support this. (since 8.1)
> +#
Likewise.
> # Features:
> #
> # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
> @@ -1269,7 +1279,8 @@
> '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
> '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> 'features': [ 'unstable' ] },
> - '*vcpu-dirty-limit': 'uint64'} }
> + '*vcpu-dirty-limit': 'uint64',
> + '*direct-io': 'bool' } }
>
> ##
> # @query-migrate-parameters:
This one is for the migration maintainers: we have MigrationCapability,
set with migrate-set-capabilities, and MigrationParameters, set with
migrate-set-parameters. For a boolean configuration setting, either
works. Which one should we use when?
[...]
Markus Armbruster <armbru@redhat.com> writes:

> Fabiano Rosas <farosas@suse.de> writes:
>
>> Add the direct-io migration parameter that tells the migration code to
>> use O_DIRECT when opening the migration stream file whenever possible.
>>
>> This is currently only used for the secondary channels of fixed-ram
>> migration, which can guarantee that writes are page aligned.
>>
>> However the parameter could be made to affect other types of
>> file-based migrations in the future.
>>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>
> When would you want to enable @direct-io, and when would you want to
> leave it disabled?

That depends on a performance analysis. You'd generally leave it
disabled unless there's some indication that the operating system is
having trouble draining the page cache.

However I don't think QEMU should attempt any kind of prescription in
that regard.

From the migration implementation perspective, we need to provide
alignment guarantees on the stream before allowing direct IO to be
enabled. In this series we're just enabling it for the secondary
multifd channels which do page-aligned reads/writes.

> What happens when you enable @direct-io with a migration that cannot use
> O_DIRECT?
>

In this version of the series Daniel suggested that we fail migration
in case there's no support for direct IO or the migration doesn't
support it.
On Tue, Oct 24, 2023 at 04:32:10PM -0300, Fabiano Rosas wrote:
> Markus Armbruster <armbru@redhat.com> writes:
>
> > Fabiano Rosas <farosas@suse.de> writes:
> >
> >> Add the direct-io migration parameter that tells the migration code to
> >> use O_DIRECT when opening the migration stream file whenever possible.
> >>
> >> This is currently only used for the secondary channels of fixed-ram
> >> migration, which can guarantee that writes are page aligned.
> >>
> >> However the parameter could be made to affect other types of
> >> file-based migrations in the future.
> >>
> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> >
> > When would you want to enable @direct-io, and when would you want to
> > leave it disabled?
>
> That depends on a performance analysis. You'd generally leave it
> disabled unless there's some indication that the operating system is
> having trouble draining the page cache.

That's not the usage model I would suggest.

The biggest value of the page cache comes when it holds data that
will be repeatedly accessed.

When you are saving/restoring a guest to file, that data is used
once only (assuming there's a large gap between save & restore).
By using the page cache to save a big guest we essentially purge
the page cache of most of its existing data that is likely to be
reaccessed, to fill it up with data never to be reaccessed.

I usually describe save/restore operations as trashing the page
cache.

IMHO, mgmt apps should request O_DIRECT always unless they expect
the save/restore operation to run in quick succession, or if they
know that the host has oodles of free RAM such that existing data
in the page cache won't be trashed, or if the host FS does not
support O_DIRECT of course.

> However I don't think QEMU should attempt any kind of prescription in
> that regard.

It shouldn't prescribe it, but I think our docs should encourage its
use where possible.

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Tue, Oct 24, 2023 at 04:32:10PM -0300, Fabiano Rosas wrote:
>> Markus Armbruster <armbru@redhat.com> writes:
>>
>> > Fabiano Rosas <farosas@suse.de> writes:
>> >
>> >> Add the direct-io migration parameter that tells the migration code to
>> >> use O_DIRECT when opening the migration stream file whenever possible.
>> >>
>> >> This is currently only used for the secondary channels of fixed-ram
>> >> migration, which can guarantee that writes are page aligned.
>> >>
>> >> However the parameter could be made to affect other types of
>> >> file-based migrations in the future.
>> >>
>> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> >
>> > When would you want to enable @direct-io, and when would you want to
>> > leave it disabled?
>>
>> That depends on a performance analysis. You'd generally leave it
>> disabled unless there's some indication that the operating system is
>> having trouble draining the page cache.
>
> That's not the usage model I would suggest.
>

Hehe I took a shot at answering but I really wanted to say "ask Daniel".

> The biggest value of the page cache comes when it holds data that
> will be repeatedly accessed.
>
> When you are saving/restoring a guest to file, that data is used
> once only (assuming there's a large gap between save & restore).
> By using the page cache to save a big guest we essentially purge
> the page cache of most of its existing data that is likely to be
> reaccessed, to fill it up with data never to be reaccessed.
>
> I usually describe save/restore operations as trashing the page
> cache.
>
> IMHO, mgmt apps should request O_DIRECT always unless they expect
> the save/restore operation to run in quick succession, or if they
> know that the host has oodles of free RAM such that existing data
> in the page cache won't be trashed, or

Thanks, I'll try to incorporate this to some kind of doc in the next
version.

> if the host FS does not support O_DIRECT of course.

Should we try to probe for this and inform the user?
On Wed, Oct 25, 2023 at 11:32:00AM -0300, Fabiano Rosas wrote:
> Daniel P. Berrangé <berrange@redhat.com> writes:
>
> > On Tue, Oct 24, 2023 at 04:32:10PM -0300, Fabiano Rosas wrote:
> >> Markus Armbruster <armbru@redhat.com> writes:
> >>
> >> > Fabiano Rosas <farosas@suse.de> writes:
> >> >
> >> >> Add the direct-io migration parameter that tells the migration code to
> >> >> use O_DIRECT when opening the migration stream file whenever possible.
> >> >>
> >> >> This is currently only used for the secondary channels of fixed-ram
> >> >> migration, which can guarantee that writes are page aligned.
> >> >>
> >> >> However the parameter could be made to affect other types of
> >> >> file-based migrations in the future.
> >> >>
> >> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> >> >
> >> > When would you want to enable @direct-io, and when would you want to
> >> > leave it disabled?
> >>
> >> That depends on a performance analysis. You'd generally leave it
> >> disabled unless there's some indication that the operating system is
> >> having trouble draining the page cache.
> >
> > That's not the usage model I would suggest.
> >
>
> Hehe I took a shot at answering but I really wanted to say "ask Daniel".
>
> > The biggest value of the page cache comes when it holds data that
> > will be repeatedly accessed.
> >
> > When you are saving/restoring a guest to file, that data is used
> > once only (assuming there's a large gap between save & restore).
> > By using the page cache to save a big guest we essentially purge
> > the page cache of most of its existing data that is likely to be
> > reaccessed, to fill it up with data never to be reaccessed.
> >
> > I usually describe save/restore operations as trashing the page
> > cache.
> >
> > IMHO, mgmt apps should request O_DIRECT always unless they expect
> > the save/restore operation to run in quick succession, or if they
> > know that the host has oodles of free RAM such that existing data
> > in the page cache won't be trashed, or
>
> Thanks, I'll try to incorporate this to some kind of doc in the next
> version.
>
> > if the host FS does not support O_DIRECT of course.
>
> Should we try to probe for this and inform the user?

qemu_open_internal will already do a nice error message. If it gets
EINVAL when using O_DIRECT, it'll retry without O_DIRECT and if that
works, it'll report "filesystem does not support O_DIRECT"

Having said that I see a problem with /dev/fdset handling, because
we're only validating O_ACCMODE and that excludes O_DIRECT.

If the mgmt app passes an FD with O_DIRECT already set, then it
won't work for VMstate saving which is unaligned.

If the mgmt app passes an FD without O_DIRECT set, then we are
not setting O_DIRECT for the multifd RAM threads.

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:
> On Wed, Oct 25, 2023 at 11:32:00AM -0300, Fabiano Rosas wrote:
>> Daniel P. Berrangé <berrange@redhat.com> writes:
>>
>> > On Tue, Oct 24, 2023 at 04:32:10PM -0300, Fabiano Rosas wrote:
>> >> Markus Armbruster <armbru@redhat.com> writes:
>> >>
>> >> > Fabiano Rosas <farosas@suse.de> writes:
>> >> >
>> >> >> Add the direct-io migration parameter that tells the migration code to
>> >> >> use O_DIRECT when opening the migration stream file whenever possible.
>> >> >>
>> >> >> This is currently only used for the secondary channels of fixed-ram
>> >> >> migration, which can guarantee that writes are page aligned.
>> >> >>
>> >> >> However the parameter could be made to affect other types of
>> >> >> file-based migrations in the future.
>> >> >>
>> >> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> >> >
>> >> > When would you want to enable @direct-io, and when would you want to
>> >> > leave it disabled?
>> >>
>> >> That depends on a performance analysis. You'd generally leave it
>> >> disabled unless there's some indication that the operating system is
>> >> having trouble draining the page cache.
>> >
>> > That's not the usage model I would suggest.
>> >
>>
>> Hehe I took a shot at answering but I really wanted to say "ask Daniel".
>>
>> > The biggest value of the page cache comes when it holds data that
>> > will be repeatedly accessed.
>> >
>> > When you are saving/restoring a guest to file, that data is used
>> > once only (assuming there's a large gap between save & restore).
>> > By using the page cache to save a big guest we essentially purge
>> > the page cache of most of its existing data that is likely to be
>> > reaccessed, to fill it up with data never to be reaccessed.
>> >
>> > I usually describe save/restore operations as trashing the page
>> > cache.
>> >
>> > IMHO, mgmt apps should request O_DIRECT always unless they expect
>> > the save/restore operation to run in quick succession, or if they
>> > know that the host has oodles of free RAM such that existing data
>> > in the page cache won't be trashed, or
>>
>> Thanks, I'll try to incorporate this to some kind of doc in the next
>> version.
>>
>> > if the host FS does not support O_DIRECT of course.
>>
>> Should we try to probe for this and inform the user?
>
> qemu_open_internall will already do a nice error message. If it gets
> EINVAL when using O_DIRECT, it'll retry without O_DIRECT and if that
> works, it'll reoprt "filesystem does not support O_DIRECT"
>
> Having said that I see a problem with /dev/fdset handling, because
> we're only validating O_ACCMODE and that excludes O_DIRECT.
>
> If the mgmt apps passes an FD with O_DIRECT already set, then it
> won't work for VMstate saving which is unaligned.
>
> If the mgmt app passes an FD without O_DIRECT set, then we are
> not setting O_DIRECT for the multifd RAM threads.
I could use some advice on how to solve this situation. The fdset code
at monitor/fds.c and the add-fd command don't seem to be usable outside
the original use-case of passing fds with different open flags.
There are several problems, the biggest one being that there's no way to
manipulate the set of file descriptors aside from asking for duplication
of an fd that matches a particular set of flags.
That doesn't work for us because the two fds we need (one for main
channel, other for secondary channels) will have the same open flags. So
the fdset code will always return the first one it finds in the set.
Another problem (or feature) of the fdset code is that it only removes
an fd when qmp_remove_fd() is called if the VM runstate is RUNNING,
which means that the migration code cannot remove the fds after use
reliably. We need to be able to remove them to make sure we use the
correct fds in a subsequent migration.
I see some paths forward:
1) Require the user to put the fds in separate fdsets.
This would be the easiest to handle in the migration code, but we
would have to come up with special file: URL syntax to accommodate more
than one fdset. Perhaps "file:/dev/fdsets/1,2" ?
2) Require the two fds in the same fdset and separate them ourselves.
This would require extending the fdset code to allow more ways of
manipulating the fdset. There's two options here:
a) Implement a pop() operation in the fdset code. We receive the
fdset, pop one fd from it and put it in a new fdset. I did some
experimentation with this by having an fd->present flag and just
skipping the fd during query-fdsets and
monitor_fdset_dup_fd_add(). It works, but it's convoluted.
b) Add support for removing the original fd when given the dup()ed
fd. The list of duplicated fds is currently by-fdset and not
by-original-fd, so this would require a larger code change.
3) Design a whole new URI.
Here, there are the usual benefits and drawbacks of doing something
from scratch. With the added drawback of dissociating from the file:
URI which is already well tested and easy to use when doing QEMU-only
migration.
With the three options above there's still the issue of removing the
fd. I think the original commit[1] might have been mistaken in adding
the runstate_is_running() check for *both* the "removed = true" clause
and the "fd was never duplicated" clause. But it's hard to tell since
this whole feature is a bit opaque to me.
1- ebe52b592d (monitor: Prevent removing fd from set during init)
https://gitlab.com/qemu-project/qemu/-/commit/ebe52b592d
All in all, I'm inclined to consider the first option, unless someone
has a better idea. Assuming we can figure out the removal issue, that
is.
Thoughts?
On Mon, Oct 30, 2023 at 07:51:34PM -0300, Fabiano Rosas wrote:
> I could use some advice on how to solve this situation. The fdset code
> at monitor/fds.c and the add-fd command don't seem to be usable outside
> the original use-case of passing fds with different open flags.
>
> There are several problems, the biggest one being that there's no way to
> manipulate the set of file descriptors aside from asking for duplication
> of an fd that matches a particular set of flags.
>
> That doesn't work for us because the two fds we need (one for main
> channel, other for secondary channels) will have the same open flags. So
> the fdset code will always return the first one it finds in the set.

QEMU may want multiple FDs *internally*, but IMHO that fact should
not be exposed to mgmt applications. It would be valid for a QEMU
impl to share the same FD across multiple threads, or have a different
FD for each thread. All threads are using pread/pwrite, so it is safe
for them to use the same FD if they desire. It is a private impl choice
for QEMU at any given point in time and could change over time.

Thus from the POV of the mgmt app, QEMU is writing to a single file, no
matter how many threads are involved & thus it should only need to supply
a single FD for that file. QEMU can then call 'dup()' on that FD as many
times as it desires, and use fcntl() to toggle O_DIRECT if and when
it needs to.

> Another problem (or feature) of the fdset code is that it only removes
> an fd when qmp_remove_fd() is called if the VM runstate is RUNNING,
> which means that the migration code cannot remove the fds after use
> reliably. We need to be able to remove them to make sure we use the
> correct fds in a subsequent migration.

The "easy" option is to just add a new API that does what you want.
Maybe during review someone will then point out why the original
API works the way it does.

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Mon, Oct 30, 2023 at 07:51:34PM -0300, Fabiano Rosas wrote:
>> I could use some advice on how to solve this situation. The fdset code
>> at monitor/fds.c and the add-fd command don't seem to be usable outside
>> the original use-case of passing fds with different open flags.
>>
>> There are several problems, the biggest one being that there's no way to
>> manipulate the set of file descriptors aside from asking for duplication
>> of an fd that matches a particular set of flags.
>>
>> That doesn't work for us because the two fds we need (one for main
>> channel, other for secondary channels) will have the same open flags. So
>> the fdset code will always return the first one it finds in the set.
>
> QEMU may want multiple FDs *internally*, but IMHO that fact should
> not be exposed to mgmt applications. It would be valid for a QEMU
> impl to share the same FD across multiple threads, or have a different
> FD for each thread. All threads are using pread/pwrite, so it is safe
> for them to use the same FD if they desire. It is a private impl choice
> for QEMU at any given point in time and could change over time.
>

Sure, I don't disagree. However up until last week we had a seemingly
usable "add-fd" command that allows the user to provide a *set of file
descriptors* to QEMU. It's just now that we're learning that interface
serves only a special use-case.

> Thus from the POV of the mgmt app, QEMU is writing to a single file, no
> matter how many threads are involved & thus it should only need to supply
> a single FD for that file. QEMU can then call 'dup()' on that FD as many
> times as it desires, and use fcntl() to toggle O_DIRECT if and when
> it needs to.

Ok, so I think the way to go here is for QEMU to receive a file + offset
instead of an FD. That way QEMU can have adequate control of the
resources for the migration. I don't remember why we went on the FD
tangent. Is it not acceptable for libvirt to provide the file name +
offset?

>> Another problem (or feature) of the fdset code is that it only removes
>> an fd when qmp_remove_fd() is called if the VM runstate is RUNNING,
>> which means that the migration code cannot remove the fds after use
>> reliably. We need to be able to remove them to make sure we use the
>> correct fds in a subsequent migration.
>
> The "easy" option is to just add a new API that does what you want.
> Maybe during review someone will then point out why the original
> API works the way it does.

Hehe so I'll add a qmp_actually_remove_fd() =)
On Tue, Oct 31, 2023 at 10:05:56AM -0300, Fabiano Rosas wrote:
> Daniel P. Berrangé <berrange@redhat.com> writes:
>
> > On Mon, Oct 30, 2023 at 07:51:34PM -0300, Fabiano Rosas wrote:
> >> I could use some advice on how to solve this situation. The fdset code
> >> at monitor/fds.c and the add-fd command don't seem to be usable outside
> >> the original use-case of passing fds with different open flags.
> >>
> >> There are several problems, the biggest one being that there's no way to
> >> manipulate the set of file descriptors aside from asking for duplication
> >> of an fd that matches a particular set of flags.
> >>
> >> That doesn't work for us because the two fds we need (one for main
> >> channel, other for secondary channels) will have the same open flags. So
> >> the fdset code will always return the first one it finds in the set.
> >
> > QEMU may want multiple FDs *internally*, but IMHO that fact should
> > not be exposed to mgmt applications. It would be valid for a QEMU
> > impl to share the same FD across multiple threads, or have a different
> > FD for each thread. All threads are using pread/pwrite, so it is safe
> > for them to use the same FD if they desire. It is a private impl choice
> > for QEMU at any given point in time and could change over time.
> >
>
> Sure, I don't disagree. However up until last week we had a seemingly
> usable "add-fd" command that allows the user to provide a *set of file
> descriptors* to QEMU. It's just now that we're learning that interface
> serves only a special use-case.

AFAICT though we don't need add-fd to support passing many files
for our needs. Saving only requires a single FD. All others can
be opened by dup(), so the limitation of add-fd is irrelevant
surely ?

>
> > Thus from the POV of the mgmt app, QEMU is writing to a single file, no
> > matter how many threads are involved & thus it should only need to supply
> > a single FD for that file. QEMU can then call 'dup()' on that FD as many
> > times as it desires, and use fcntl() to toggle O_DIRECT if and when
> > it needs to.
>
> Ok, so I think the way to go here is for QEMU to receive a file + offset
> instead of an FD. That way QEMU can have adequate control of the
> resources for the migration. I don't remember why we went on the FD
> tangent. Is it not acceptable for libvirt to provide the file name +
> offset?

FD passing means QEMU does not need privileges to open the file
which could be useful.

With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Tue, Oct 31, 2023 at 10:05:56AM -0300, Fabiano Rosas wrote:
>> Daniel P. Berrangé <berrange@redhat.com> writes:
>>
>> > On Mon, Oct 30, 2023 at 07:51:34PM -0300, Fabiano Rosas wrote:
>> >> I could use some advice on how to solve this situation. The fdset code
>> >> at monitor/fds.c and the add-fd command don't seem to be usable outside
>> >> the original use-case of passing fds with different open flags.
>> >>
>> >> There are several problems, the biggest one being that there's no way to
>> >> manipulate the set of file descriptors aside from asking for duplication
>> >> of an fd that matches a particular set of flags.
>> >>
>> >> That doesn't work for us because the two fds we need (one for main
>> >> channel, other for secondary channels) will have the same open flags. So
>> >> the fdset code will always return the first one it finds in the set.
>> >
>> > QEMU may want multiple FDs *internally*, but IMHO that fact should
>> > not be exposed to mgmt applications. It would be valid for a QEMU
>> > impl to share the same FD across multiple threads, or have a different
>> > FD for each thread. All threads are using pread/pwrite, so it is safe
>> > for them to use the same FD if they desire. It is a private impl choice
>> > for QEMU at any given point in time and could change over time.
>> >
>>
>> Sure, I don't disagree. However up until last week we had a seemingly
>> usable "add-fd" command that allows the user to provide a *set of file
>> descriptors* to QEMU. It's just now that we're learning that interface
>> serves only a special use-case.
>
> AFAICT though we don't need add-fd to support passing many files
> for our needs. Saving only requires a single FD. All others can
> be opened by dup(), so the limitation of add-fd is irrelevant
> surely ?

Only once we decide to use one FD. If we had a generic add-fd backend,
then that's already a user-facing API, so the "implementation detail"
argument becomes weaker.

With a single FD we'll need to be very careful about what code is
allowed to run while the multifd channels are doing IO. Since O_DIRECT
is not widely supported, now we have to also be careful about someone
using that QEMUFile handle to do unaligned writes and not even noticing
that it breaks direct IO.

None of this is unworkable, of course, I just find the design way
clearer with just the file name + offset.

>> > Thus from the POV of the mgmt app, QEMU is writing to a single file, no
>> > matter how many threads are involved & thus it should only need to supply
>> > a single FD for that file. QEMU can then call 'dup()' on that FD as many
>> > times as it desires, and use fcntl() to toggle O_DIRECT if and when
>> > it needs to.
>>
>> Ok, so I think the way to go here is for QEMU to receive a file + offset
>> instead of an FD. That way QEMU can have adequate control of the
>> resources for the migration. I don't remember why we went on the FD
>> tangent. Is it not acceptable for libvirt to provide the file name +
>> offset?
>
> FD passing means QEMU does not need privileges to open the file
> which could be useful.

Ok, let me give this a try. Let's see how bad it is to juggle the flag
around the main channel.
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Wed, Oct 25, 2023 at 11:32:00AM -0300, Fabiano Rosas wrote:
>> Daniel P. Berrangé <berrange@redhat.com> writes:
>>
>> > On Tue, Oct 24, 2023 at 04:32:10PM -0300, Fabiano Rosas wrote:
>> >> Markus Armbruster <armbru@redhat.com> writes:
>> >>
>> >> > Fabiano Rosas <farosas@suse.de> writes:
>> >> >
>> >> >> Add the direct-io migration parameter that tells the migration code to
>> >> >> use O_DIRECT when opening the migration stream file whenever possible.
>> >> >>
>> >> >> This is currently only used for the secondary channels of fixed-ram
>> >> >> migration, which can guarantee that writes are page aligned.
>> >> >>
>> >> >> However the parameter could be made to affect other types of
>> >> >> file-based migrations in the future.
>> >> >>
>> >> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> >> >
>> >> > When would you want to enable @direct-io, and when would you want to
>> >> > leave it disabled?
>> >>
>> >> That depends on a performance analysis. You'd generally leave it
>> >> disabled unless there's some indication that the operating system is
>> >> having trouble draining the page cache.
>> >
>> > That's not the usage model I would suggest.
>> >
>>
>> Hehe I took a shot at answering but I really wanted to say "ask Daniel".
>>
>> > The biggest value of the page cache comes when it holds data that
>> > will be repeatedly accessed.
>> >
>> > When you are saving/restoring a guest to file, that data is used
>> > once only (assuming there's a large gap between save & restore).
>> > By using the page cache to save a big guest we essentially purge
>> > the page cache of most of its existing data that is likely to be
>> > reaccessed, to fill it up with data never to be reaccessed.
>> >
>> > I usually describe save/restore operations as trashing the page
>> > cache.
>> >
>> > IMHO, mgmt apps should request O_DIRECT always unless they expect
>> > the save/restore operation to run in quick succession, or if they
>> > know that the host has oodles of free RAM such that existing data
>> > in the page cache won't be trashed, or
>>
>> Thanks, I'll try to incorporate this to some kind of doc in the next
>> version.
>>
>> > if the host FS does not support O_DIRECT of course.
>>
>> Should we try to probe for this and inform the user?
>
> qemu_open_internall will already do a nice error message. If it gets
> EINVAL when using O_DIRECT, it'll retry without O_DIRECT and if that
> works, it'll reoprt "filesystem does not support O_DIRECT"
>
> Having said that I see a problem with /dev/fdset handling, because
> we're only validating O_ACCMODE and that excludes O_DIRECT.
>
> If the mgmt apps passes an FD with O_DIRECT already set, then it
> won't work for VMstate saving which is unaligned.
>
> If the mgmt app passes an FD without O_DIRECT set, then we are
> not setting O_DIRECT for the multifd RAM threads.

Worse, the fds get dup'ed so even without O_DIRECT, if we enable it for
the secondary channels the main channel will break on unaligned writes.

For now I can only think of requiring two fds. One for the main channel
and a second one for the rest of the channels. And validating the fd
flags to make sure O_DIRECT is only allowed to be set in the second fd.
On Wed, Oct 25, 2023 at 02:30:01PM -0300, Fabiano Rosas wrote:

> [...]
>
> Worse, the fds get dup'ed, so even without O_DIRECT, if we enable it for the secondary channels the main channel will break on unaligned writes.
>
> For now I can only think of requiring two fds. One for the main channel and a second one for the rest of the channels. And validating the fd flags to make sure O_DIRECT is only allowed to be set in the second fd.

In this new model I think there's no reason for libvirt to set O_DIRECT for its own initial I/O. So we could just totally ignore O_DIRECT when initially opening the QIOChannelFile.

Then provide a method on QIOChannelFile to enable O_DIRECT on the fly which can be called for the multifd threads setup?

With regards,
Daniel
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Wed, Oct 25, 2023 at 02:30:01PM -0300, Fabiano Rosas wrote:
>> [...]
>>
>> For now I can only think of requiring two fds. One for the main channel and a second one for the rest of the channels. And validating the fd flags to make sure O_DIRECT is only allowed to be set in the second fd.
>
> In this new model I think there's no reason for libvirt to set O_DIRECT for its own initial I/O. So we could just totally ignore O_DIRECT when initially opening the QIOChannelFile.

Yes. I still have to disallow setting it on the main channel just to be safe.

> Then provide a method on QIOChannelFile to enable O_DIRECT on the fly which can be called for the multifd threads setup?

Sure, but there's not really an "on the fly" here; after file_send_channel_create() returns, the channel should be ready to use. It would go from:

    flags |= O_DIRECT;
    qio_channel_file_new_path(...);

to:

    qio_channel_file_new_path(...);
    qio_channel_file_set_direct_io();

Which could be cleaner, since the migration code doesn't have to check for O_DIRECT support.
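[Editorial note: the reason the main channel must never see O_DIRECT can be shown in isolation. Under O_DIRECT the buffer, length, and offset must meet the filesystem's alignment requirements, and an unaligned write typically fails with EINVAL on Linux. A standalone sketch, assuming the current directory is on a filesystem that supports O_DIRECT (tmpfs, for instance, does not):]

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Demonstration only: an aligned write on an O_DIRECT fd succeeds, while
 * an unaligned one typically fails with EINVAL.
 */
int main(void)
{
    int fd = open("odirect-demo.bin", O_CREAT | O_WRONLY | O_DIRECT, 0600);
    if (fd < 0) {
        perror("open");   /* e.g. EINVAL on a filesystem without O_DIRECT */
        return 1;
    }

    void *buf = NULL;
    if (posix_memalign(&buf, 4096, 4096) != 0) {
        close(fd);
        return 1;
    }
    memset(buf, 0, 4096);

    /* Page-sized, page-aligned write: what the multifd channels do. */
    if (write(fd, buf, 4096) < 0) {
        perror("aligned write");
    }

    /* Short, unaligned write: what the VMstate stream looks like. */
    if (write(fd, buf, 100) < 0) {
        perror("unaligned write");   /* expected: EINVAL */
    }

    free(buf);
    close(fd);
    return 0;
}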
Fabiano Rosas <farosas@suse.de> writes:

> Markus Armbruster <armbru@redhat.com> writes:
>
>> Fabiano Rosas <farosas@suse.de> writes:
>>
>>> Add the direct-io migration parameter that tells the migration code to use O_DIRECT when opening the migration stream file whenever possible.
>>>
>>> This is currently only used for the secondary channels of fixed-ram migration, which can guarantee that writes are page aligned.
>>>
>>> However the parameter could be made to affect other types of file-based migrations in the future.
>>>
>>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>>
>> When would you want to enable @direct-io, and when would you want to leave it disabled?
>
> That depends on a performance analysis. You'd generally leave it disabled unless there's some indication that the operating system is having trouble draining the page cache.
>
> However I don't think QEMU should attempt any kind of prescription in that regard.
>
> From the migration implementation perspective, we need to provide alignment guarantees on the stream before allowing direct IO to be enabled. In this series we're just enabling it for the secondary multifd channels which do page-aligned reads/writes.

I'm asking because QEMU provides too many configuration options with too little guidance on how to use them. "You'd generally leave it disabled unless there's some indication that the operating system is having trouble draining the page cache" is guidance. It'll be a lot more useful in documentation than in the mailing list archive ;)

>> What happens when you enable @direct-io with a migration that cannot use O_DIRECT?
>
> In this version of the series Daniel suggested that we fail migration in case there's no support for direct IO or the migration doesn't support it.

Makes sense.
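[Editorial note: the "fail migration when direct IO cannot be used" behaviour discussed above could look roughly like the sketch below. This is an illustration, not the patch: the helper names (has_direct_io_support(), check_direct_io()) are assumptions; only the idea of a build-time O_DIRECT probe plus an early parameter check follows the discussion.]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

/* Build-time probe: O_DIRECT is only defined on platforms that have it. */
static bool has_direct_io_support(void)
{
#ifdef O_DIRECT
    return true;
#else
    return false;
#endif
}

/* Hypothetical check run during parameter validation, before migration. */
static bool check_direct_io(bool direct_io_requested, bool fixed_ram_enabled)
{
    if (!direct_io_requested) {
        return true;
    }
    if (!has_direct_io_support()) {
        fprintf(stderr, "direct-io requested but not supported on this host\n");
        return false;
    }
    if (!fixed_ram_enabled) {
        /* Only the page-aligned fixed-ram stream can honour direct IO. */
        fprintf(stderr, "direct-io requires a migration that guarantees aligned writes\n");
        return false;
    }
    return true;
}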