Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
include/sysemu/sysemu.h | 1 +
qemu-options.hx | 15 ++++++++++++++
qmp.c | 5 +++++
vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index b213696..3feb94f 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -66,6 +66,7 @@ typedef enum WakeupReason {
QEMU_WAKEUP_REASON_OTHER,
} WakeupReason;
+void qemu_exit_preconfig_request(void);
void qemu_system_reset_request(ShutdownCause reason);
void qemu_system_suspend_request(void);
void qemu_register_suspend_notifier(Notifier *notifier);
diff --git a/qemu-options.hx b/qemu-options.hx
index 39225ae..bd44db8 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3498,6 +3498,21 @@ STEXI
Run the emulation in single step mode.
ETEXI
+DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
+ "-paused [state=]postconf|preconf\n"
+ " postconf: pause QEMU after machine is initialized\n"
+ " preconf: pause QEMU before machine is initialized\n",
+ QEMU_ARCH_ALL)
+STEXI
+@item -paused
+@findex -paused
+If set, enables interactive configuration stages before machine emulation starts.
+The 'postconf' option value mimics -S option behaviour where machine is created
+but emulation isn't started. The 'preconf' option value pauses QEMU before machine
+is created, which allows querying and configuring properties affecting machine
+initialization. Use monitor/QMP command 'cont' to exit paused state.
+ETEXI
+
DEF("S", 0, QEMU_OPTION_S, \
"-S freeze CPU at startup (use 'c' to start execution)\n",
QEMU_ARCH_ALL)
diff --git a/qmp.c b/qmp.c
index e8c3031..49e9a5c 100644
--- a/qmp.c
+++ b/qmp.c
@@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
BlockBackend *blk;
Error *local_err = NULL;
+ if (runstate_check(RUN_STATE_PRELAUNCH)) {
+ qemu_exit_preconfig_request();
+ return;
+ }
+
/* if there is a dump in background, we should wait until the dump
* finished */
if (dump_in_progress()) {
diff --git a/vl.c b/vl.c
index 3fed457..30631fd 100644
--- a/vl.c
+++ b/vl.c
@@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
},
};
+static QemuOptsList qemu_paused_opts = {
+ .name = "paused",
+ .implied_opt_name = "state",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
+ .desc = {
+ {
+ .name = "state",
+ .type = QEMU_OPT_STRING,
+ .help = "Pause state of QEMU on startup",
+ },
+ { /* end of list */ }
+ },
+};
+
/**
* Get machine options
*
@@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
static int powerdown_requested;
static int debug_requested;
static int suspend_requested;
+static enum {
+ PRECONFIG_CONT = 0,
+ PRECONFIG_PAUSE,
+ PRECONFIG_SKIP,
+} preconfig_requested;
static WakeupReason wakeup_reason;
static NotifierList powerdown_notifiers =
NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
@@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
return r;
}
+void qemu_exit_preconfig_request(void)
+{
+ preconfig_requested = PRECONFIG_CONT;
+}
+
/*
* Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
*/
@@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
RunState r;
ShutdownCause request;
+ if (runstate_check(RUN_STATE_PRELAUNCH)) {
+ if (preconfig_requested == PRECONFIG_CONT) {
+ preconfig_requested = PRECONFIG_SKIP;
+ return true;
+ }
+ }
if (qemu_debug_requested()) {
vm_stop(RUN_STATE_DEBUG);
}
@@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
qemu_add_opts(&qemu_icount_opts);
qemu_add_opts(&qemu_semihosting_config_opts);
qemu_add_opts(&qemu_fw_cfg_opts);
+ qemu_add_opts(&qemu_paused_opts);
module_call_init(MODULE_INIT_OPTS);
runstate_init();
@@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
exit(1);
}
break;
+ case QEMU_OPTION_paused:
+ {
+ const char *value;
+
+ opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
+ optarg, true);
+ if (opts == NULL) {
+ exit(1);
+ }
+ value = qemu_opt_get(opts, "state");
+ if (!strcmp(value, "postconf")) {
+ autostart = 0;
+ } else if (!strcmp(value, "preconf")) {
+ preconfig_requested = PRECONFIG_PAUSE;
+ } else {
+ error_report("incomplete '-paused' option\n");
+ exit(1);
+ }
+ break;
+ }
case QEMU_OPTION_enable_kvm:
olist = qemu_find_opts("machine");
qemu_opts_parse_noisily(olist, "accel=kvm", false);
@@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
current_machine->boot_order = boot_order;
current_machine->cpu_model = cpu_model;
-
/* parse features once if machine provides default cpu_type */
if (machine_class->default_cpu_type) {
current_machine->cpu_type = machine_class->default_cpu_type;
@@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
}
}
+ main_loop(); /* do monitor/qmp handling at preconfig state if requested */
+
machine_run_board_init(current_machine);
realtime_init();
--
2.7.4
On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
This really needs to have a commit message that provides justification
for why this option is needed when we already have -S that is used
to allow configuration before the guest starts.
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
> include/sysemu/sysemu.h | 1 +
> qemu-options.hx | 15 ++++++++++++++
> qmp.c | 5 +++++
> vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index b213696..3feb94f 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> QEMU_WAKEUP_REASON_OTHER,
> } WakeupReason;
>
> +void qemu_exit_preconfig_request(void);
> void qemu_system_reset_request(ShutdownCause reason);
> void qemu_system_suspend_request(void);
> void qemu_register_suspend_notifier(Notifier *notifier);
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 39225ae..bd44db8 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -3498,6 +3498,21 @@ STEXI
> Run the emulation in single step mode.
> ETEXI
>
> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> + "-paused [state=]postconf|preconf\n"
> + " postconf: pause QEMU after machine is initialized\n"
> + " preconf: pause QEMU before machine is initialized\n",
> + QEMU_ARCH_ALL)
> +STEXI
> +@item -paused
> +@findex -paused
> +if set enabled interactive configuration stages before machine emulation starts.
> +'postconf' option value mimics -S option behaviour where machine is created
> +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> +is created, which allows to query and configure properties affecting machine
> +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
> +ETEXI
To me it feels rather unpleasant to be exposing this kind of detailed knowledge
about the steps QEMU goes through when constructing the machine and expecting
the mgmt application to synchronize certain monitor actions against this.
> +
> DEF("S", 0, QEMU_OPTION_S, \
> "-S freeze CPU at startup (use 'c' to start execution)\n",
> QEMU_ARCH_ALL)
> diff --git a/qmp.c b/qmp.c
> index e8c3031..49e9a5c 100644
> --- a/qmp.c
> +++ b/qmp.c
> @@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
> BlockBackend *blk;
> Error *local_err = NULL;
>
> + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> + qemu_exit_preconfig_request();
> + return;
> + }
> +
> /* if there is a dump in background, we should wait until the dump
> * finished */
> if (dump_in_progress()) {
> diff --git a/vl.c b/vl.c
> index 3fed457..30631fd 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
> },
> };
>
> +static QemuOptsList qemu_paused_opts = {
> + .name = "paused",
> + .implied_opt_name = "state",
> + .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
> + .desc = {
> + {
> + .name = "state",
> + .type = QEMU_OPT_STRING,
> + .help = "Pause state of QEMU on startup",
> + },
> + { /* end of list */ }
> + },
> +};
> +
> /**
> * Get machine options
> *
> @@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
> static int powerdown_requested;
> static int debug_requested;
> static int suspend_requested;
> +static enum {
> + PRECONFIG_CONT = 0,
> + PRECONFIG_PAUSE,
> + PRECONFIG_SKIP,
> +} preconfig_requested;
> static WakeupReason wakeup_reason;
> static NotifierList powerdown_notifiers =
> NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
> @@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
> return r;
> }
>
> +void qemu_exit_preconfig_request(void)
> +{
> + preconfig_requested = PRECONFIG_CONT;
> +}
> +
> /*
> * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
> */
> @@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
> RunState r;
> ShutdownCause request;
>
> + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> + if (preconfig_requested == PRECONFIG_CONT) {
> + preconfig_requested = PRECONFIG_SKIP;
> + return true;
> + }
> + }
> if (qemu_debug_requested()) {
> vm_stop(RUN_STATE_DEBUG);
> }
> @@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
> qemu_add_opts(&qemu_icount_opts);
> qemu_add_opts(&qemu_semihosting_config_opts);
> qemu_add_opts(&qemu_fw_cfg_opts);
> + qemu_add_opts(&qemu_paused_opts);
> module_call_init(MODULE_INIT_OPTS);
>
> runstate_init();
> @@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
> exit(1);
> }
> break;
> + case QEMU_OPTION_paused:
> + {
> + const char *value;
> +
> + opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
> + optarg, true);
> + if (opts == NULL) {
> + exit(1);
> + }
> + value = qemu_opt_get(opts, "state");
> + if (!strcmp(value, "postconf")) {
> + autostart = 0;
> + } else if (!strcmp(value, "preconf")) {
> + preconfig_requested = PRECONFIG_PAUSE;
> + } else {
> + error_report("incomplete '-paused' option\n");
> + exit(1);
> + }
> + break;
> + }
> case QEMU_OPTION_enable_kvm:
> olist = qemu_find_opts("machine");
> qemu_opts_parse_noisily(olist, "accel=kvm", false);
> @@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
> current_machine->boot_order = boot_order;
> current_machine->cpu_model = cpu_model;
>
> -
> /* parse features once if machine provides default cpu_type */
> if (machine_class->default_cpu_type) {
> current_machine->cpu_type = machine_class->default_cpu_type;
> @@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
> }
> }
>
> + main_loop(); /* do monitor/qmp handling at preconfig state if requested */
> +
> machine_run_board_init(current_machine);
>
> realtime_init();
> --
> 2.7.4
>
>
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Mon, 16 Oct 2017 17:35:15 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:
> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
>
> This really needs to have a commit message that provides justification
> for why this option is needed when we already have -S that is used
> to allow configuration before the guest starts.
Sorry, I should have added here what I tried to describe in the cover letter.
-S pauses the machine too late, as the machine is already created by the time
it's paused, so trying to reconfigure it might require the machine to be recreated.
In case of NUMA options it might be possible to hack x86 target to
rebuild/override acpi/fw_cfg so it would reflect the new settings set
this late but I wouldn't expect that it would work in general.
The cleanest way to configure it is pausing and configuring the numa mapping
before the machine is built.
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > include/sysemu/sysemu.h | 1 +
> > qemu-options.hx | 15 ++++++++++++++
> > qmp.c | 5 +++++
> > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 74 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index b213696..3feb94f 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > QEMU_WAKEUP_REASON_OTHER,
> > } WakeupReason;
> >
> > +void qemu_exit_preconfig_request(void);
> > void qemu_system_reset_request(ShutdownCause reason);
> > void qemu_system_suspend_request(void);
> > void qemu_register_suspend_notifier(Notifier *notifier);
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 39225ae..bd44db8 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -3498,6 +3498,21 @@ STEXI
> > Run the emulation in single step mode.
> > ETEXI
> >
> > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > + "-paused [state=]postconf|preconf\n"
> > + " postconf: pause QEMU after machine is initialized\n"
> > + " preconf: pause QEMU before machine is initialized\n",
> > + QEMU_ARCH_ALL)
> > +STEXI
> > +@item -paused
> > +@findex -paused
> > +if set enabled interactive configuration stages before machine emulation starts.
> > +'postconf' option value mimics -S option behaviour where machine is created
> > +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> > +is created, which allows to query and configure properties affecting machine
> > +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
> > +ETEXI
>
> To me it feels rather unpleasant to be exposing this kind of detailed knowledge
> about the steps QEMU goes through when consttructing the machine and expecting
> the mgmt application to synchronize certain monitor actions against this.
well, so far the alternative seems to be unacceptable as well, i.e.
start qemu twice
#1 to get cpu layout from given '-M -smp' options
#2 add -numa options that would map cpus provided at #1 to numa nodes
> > +
> > DEF("S", 0, QEMU_OPTION_S, \
> > "-S freeze CPU at startup (use 'c' to start execution)\n",
> > QEMU_ARCH_ALL)
> > diff --git a/qmp.c b/qmp.c
> > index e8c3031..49e9a5c 100644
> > --- a/qmp.c
> > +++ b/qmp.c
> > @@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
> > BlockBackend *blk;
> > Error *local_err = NULL;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + qemu_exit_preconfig_request();
> > + return;
> > + }
> > +
> > /* if there is a dump in background, we should wait until the dump
> > * finished */
> > if (dump_in_progress()) {
> > diff --git a/vl.c b/vl.c
> > index 3fed457..30631fd 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
> > },
> > };
> >
> > +static QemuOptsList qemu_paused_opts = {
> > + .name = "paused",
> > + .implied_opt_name = "state",
> > + .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
> > + .desc = {
> > + {
> > + .name = "state",
> > + .type = QEMU_OPT_STRING,
> > + .help = "Pause state of QEMU on startup",
> > + },
> > + { /* end of list */ }
> > + },
> > +};
> > +
> > /**
> > * Get machine options
> > *
> > @@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
> > static int powerdown_requested;
> > static int debug_requested;
> > static int suspend_requested;
> > +static enum {
> > + PRECONFIG_CONT = 0,
> > + PRECONFIG_PAUSE,
> > + PRECONFIG_SKIP,
> > +} preconfig_requested;
> > static WakeupReason wakeup_reason;
> > static NotifierList powerdown_notifiers =
> > NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
> > @@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
> > return r;
> > }
> >
> > +void qemu_exit_preconfig_request(void)
> > +{
> > + preconfig_requested = PRECONFIG_CONT;
> > +}
> > +
> > /*
> > * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
> > */
> > @@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
> > RunState r;
> > ShutdownCause request;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + if (preconfig_requested == PRECONFIG_CONT) {
> > + preconfig_requested = PRECONFIG_SKIP;
> > + return true;
> > + }
> > + }
> > if (qemu_debug_requested()) {
> > vm_stop(RUN_STATE_DEBUG);
> > }
> > @@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
> > qemu_add_opts(&qemu_icount_opts);
> > qemu_add_opts(&qemu_semihosting_config_opts);
> > qemu_add_opts(&qemu_fw_cfg_opts);
> > + qemu_add_opts(&qemu_paused_opts);
> > module_call_init(MODULE_INIT_OPTS);
> >
> > runstate_init();
> > @@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
> > exit(1);
> > }
> > break;
> > + case QEMU_OPTION_paused:
> > + {
> > + const char *value;
> > +
> > + opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
> > + optarg, true);
> > + if (opts == NULL) {
> > + exit(1);
> > + }
> > + value = qemu_opt_get(opts, "state");
> > + if (!strcmp(value, "postconf")) {
> > + autostart = 0;
> > + } else if (!strcmp(value, "preconf")) {
> > + preconfig_requested = PRECONFIG_PAUSE;
> > + } else {
> > + error_report("incomplete '-paused' option\n");
> > + exit(1);
> > + }
> > + break;
> > + }
> > case QEMU_OPTION_enable_kvm:
> > olist = qemu_find_opts("machine");
> > qemu_opts_parse_noisily(olist, "accel=kvm", false);
> > @@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
> > current_machine->boot_order = boot_order;
> > current_machine->cpu_model = cpu_model;
> >
> > -
> > /* parse features once if machine provides default cpu_type */
> > if (machine_class->default_cpu_type) {
> > current_machine->cpu_type = machine_class->default_cpu_type;
> > @@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
> > }
> > }
> >
> > + main_loop(); /* do monitor/qmp handling at preconfig state if requested */
> > +
> > machine_run_board_init(current_machine);
> >
> > realtime_init();
> > --
> > 2.7.4
> >
> >
>
> Regards,
> Daniel
On 10/17/17 10:17, Igor Mammedov wrote: > On Mon, 16 Oct 2017 17:35:15 +0100 > "Daniel P. Berrange" <berrange@redhat.com> wrote: > >> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote: >> >> This really needs to have a commit message that provides justification >> for why this option is needed when we already have -S that is used >> to allow configuration before the guest starts. > Sorry, I've should have added here what I've tried to describe in cover letter. > > -S pauses machine too late as machine is already created by the time > it's paused so trying to reconfigure it might require machine to be recreated. > In case of NUMA options it might be possible to hack x86 target to > rebuild/override acpi/fw_cfg so it would reflect the new settings set > this late but I wouldn't expect that it would work in general. > > The cleanest way to configure it is pausing and configuring numa mapping > before machine is build. Asking from the sideline: if the NUMA mapping has to be configured so early, why can't it be done on the QEMU command line? (I asked myself the same question when I first saw your patches -- I couldn't find an explanation in the blurb --, so I assumed it was obvious and/or others would ask the same question.) Again, I'm just curious. Thanks! Laszlo
On Tue, Oct 17, 2017 at 12:56:28 +0200, Laszlo Ersek wrote: > On 10/17/17 10:17, Igor Mammedov wrote: > > On Mon, 16 Oct 2017 17:35:15 +0100 > > "Daniel P. Berrange" <berrange@redhat.com> wrote: > > > >> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote: > >> > >> This really needs to have a commit message that provides justification > >> for why this option is needed when we already have -S that is used > >> to allow configuration before the guest starts. > > Sorry, I've should have added here what I've tried to describe in cover letter. > > > > -S pauses machine too late as machine is already created by the time > > it's paused so trying to reconfigure it might require machine to be recreated. > > In case of NUMA options it might be possible to hack x86 target to > > rebuild/override acpi/fw_cfg so it would reflect the new settings set > > this late but I wouldn't expect that it would work in general. > > > > The cleanest way to configure it is pausing and configuring numa mapping > > before machine is build. > > Asking from the sideline: if the NUMA mapping has to be configured so > early, why can't it be done on the QEMU command line? > > (I asked myself the same question when I first saw your patches -- I > couldn't find an explanation in the blurb --, so I assumed it was > obvious and/or others would ask the same question.) Because libvirt needs to be able to query qemu before setting stuff up. As we already established, it's not okay to run a throwaway qemu process to do so, so we are getting into the chicken/egg problem zone.
On Mon, Oct 16, 2017 at 05:35:15PM +0100, Daniel P. Berrange wrote:
> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
>
> This really needs to have a commit message that provides justification
> for why this option is needed when we already have -S that is used
> to allow configuration before the guest starts.
>
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > include/sysemu/sysemu.h | 1 +
> > qemu-options.hx | 15 ++++++++++++++
> > qmp.c | 5 +++++
> > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 74 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index b213696..3feb94f 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > QEMU_WAKEUP_REASON_OTHER,
> > } WakeupReason;
> >
> > +void qemu_exit_preconfig_request(void);
> > void qemu_system_reset_request(ShutdownCause reason);
> > void qemu_system_suspend_request(void);
> > void qemu_register_suspend_notifier(Notifier *notifier);
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 39225ae..bd44db8 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -3498,6 +3498,21 @@ STEXI
> > Run the emulation in single step mode.
> > ETEXI
> >
> > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > + "-paused [state=]postconf|preconf\n"
> > + " postconf: pause QEMU after machine is initialized\n"
> > + " preconf: pause QEMU before machine is initialized\n",
> > + QEMU_ARCH_ALL)
> > +STEXI
> > +@item -paused
> > +@findex -paused
> > +if set enabled interactive configuration stages before machine emulation starts.
> > +'postconf' option value mimics -S option behaviour where machine is created
> > +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> > +is created, which allows to query and configure properties affecting machine
> > +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
> > +ETEXI
>
> To me it feels rather unpleasant to be exposing this kind of detailed knowledge
> about the steps QEMU goes through when consttructing the machine and expecting
> the mgmt application to synchronize certain monitor actions against this.
After discussing some ideas with David in this thread, I think
you have a really good point here: I don't see a reason why
set-numa-node should require anything except -S, except for the
way our machine initialization code works. In other words, why
should we generate the NUMA tables at
machine_run_board_init()-time and not at vm_start()-time?
--
Eduardo
On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
> include/sysemu/sysemu.h | 1 +
> qemu-options.hx | 15 ++++++++++++++
> qmp.c | 5 +++++
> vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index b213696..3feb94f 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> QEMU_WAKEUP_REASON_OTHER,
> } WakeupReason;
>
> +void qemu_exit_preconfig_request(void);
> void qemu_system_reset_request(ShutdownCause reason);
> void qemu_system_suspend_request(void);
> void qemu_register_suspend_notifier(Notifier *notifier);
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 39225ae..bd44db8 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -3498,6 +3498,21 @@ STEXI
> Run the emulation in single step mode.
> ETEXI
>
> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> + "-paused [state=]postconf|preconf\n"
> + " postconf: pause QEMU after machine is initialized\n"
> + " preconf: pause QEMU before machine is initialized\n",
> + QEMU_ARCH_ALL)
I would like to allow pausing before machine-type is selected, so
management could run query-machines before choosing a
machine-type. Would that need a third "-paused" mode, or will we
be able to change "preconf" to pause before select_machine() is
called?
The same probably applies to other things initialized before
machine_run_board_init() that could be configurable using QMP,
including but not limited to:
* Accelerator configuration
* Registering global properties
* RAM size
* SMP/CPU configuration
> +STEXI
> +@item -paused
> +@findex -paused
> +if set enabled interactive configuration stages before machine emulation starts.
> +'postconf' option value mimics -S option behaviour where machine is created
> +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> +is created, which allows to query and configure properties affecting machine
> +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
What if "-S" is used at the same time? Will "cont" only
initialize the machine and wait for another "cont" command to
start the VCPUs, or will it unpause everything?
> +ETEXI
> +
> DEF("S", 0, QEMU_OPTION_S, \
> "-S freeze CPU at startup (use 'c' to start execution)\n",
> QEMU_ARCH_ALL)
> diff --git a/qmp.c b/qmp.c
> index e8c3031..49e9a5c 100644
> --- a/qmp.c
> +++ b/qmp.c
> @@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
> BlockBackend *blk;
> Error *local_err = NULL;
>
> + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> + qemu_exit_preconfig_request();
> + return;
> + }
> +
> /* if there is a dump in background, we should wait until the dump
> * finished */
> if (dump_in_progress()) {
> diff --git a/vl.c b/vl.c
> index 3fed457..30631fd 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
> },
> };
>
> +static QemuOptsList qemu_paused_opts = {
> + .name = "paused",
> + .implied_opt_name = "state",
> + .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
> + .desc = {
> + {
> + .name = "state",
> + .type = QEMU_OPT_STRING,
> + .help = "Pause state of QEMU on startup",
> + },
> + { /* end of list */ }
> + },
> +};
> +
> /**
> * Get machine options
> *
> @@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
> static int powerdown_requested;
> static int debug_requested;
> static int suspend_requested;
> +static enum {
> + PRECONFIG_CONT = 0,
> + PRECONFIG_PAUSE,
> + PRECONFIG_SKIP,
> +} preconfig_requested;
> static WakeupReason wakeup_reason;
> static NotifierList powerdown_notifiers =
> NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
> @@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
> return r;
> }
>
> +void qemu_exit_preconfig_request(void)
> +{
> + preconfig_requested = PRECONFIG_CONT;
> +}
> +
> /*
> * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
> */
> @@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
> RunState r;
> ShutdownCause request;
>
> + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> + if (preconfig_requested == PRECONFIG_CONT) {
> + preconfig_requested = PRECONFIG_SKIP;
> + return true;
> + }
> + }
> if (qemu_debug_requested()) {
> vm_stop(RUN_STATE_DEBUG);
> }
> @@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
> qemu_add_opts(&qemu_icount_opts);
> qemu_add_opts(&qemu_semihosting_config_opts);
> qemu_add_opts(&qemu_fw_cfg_opts);
> + qemu_add_opts(&qemu_paused_opts);
> module_call_init(MODULE_INIT_OPTS);
>
> runstate_init();
> @@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
> exit(1);
> }
> break;
> + case QEMU_OPTION_paused:
> + {
> + const char *value;
> +
> + opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
> + optarg, true);
> + if (opts == NULL) {
> + exit(1);
> + }
> + value = qemu_opt_get(opts, "state");
> + if (!strcmp(value, "postconf")) {
> + autostart = 0;
> + } else if (!strcmp(value, "preconf")) {
> + preconfig_requested = PRECONFIG_PAUSE;
> + } else {
> + error_report("incomplete '-paused' option\n");
> + exit(1);
> + }
> + break;
> + }
> case QEMU_OPTION_enable_kvm:
> olist = qemu_find_opts("machine");
> qemu_opts_parse_noisily(olist, "accel=kvm", false);
> @@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
> current_machine->boot_order = boot_order;
> current_machine->cpu_model = cpu_model;
>
> -
> /* parse features once if machine provides default cpu_type */
> if (machine_class->default_cpu_type) {
> current_machine->cpu_type = machine_class->default_cpu_type;
> @@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
> }
> }
>
> + main_loop(); /* do monitor/qmp handling at preconfig state if requested */
> +
I'm impressed by the simplicity of the implementation. I thought
this would involve moving everything between this line and the
next main_loop() call outside main(), so they would be called by
qmp_cont().
Does any expert on GLib's Event Loop see any gotcha in this method?
I would like to do a careful review of main_loop_wait() and
main_loop_should_exit(), to ensure those functions don't depend
on anything that's initialized after this line. Probably a few
existing QMP commands can crash if machine is not initialized
yet?
The rules and expectations on initialization ordering are very
subtle, I suggest including test code for the new feature to
ensure nothing crashes or breaks in the future.
> machine_run_board_init(current_machine);
>
> realtime_init();
> --
> 2.7.4
>
--
Eduardo
On 16/10/2017 18:59, Eduardo Habkost wrote:
>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
>> + "-paused [state=]postconf|preconf\n"
>> + " postconf: pause QEMU after machine is initialized\n"
>> + " preconf: pause QEMU before machine is initialized\n",
>> + QEMU_ARCH_ALL)
> I would like to allow pausing before machine-type is selected, so
> management could run query-machines before choosing a
> machine-type. Would that need a third "-pause" mode, or will we
> be able to change "preconf" to pause before select_machine() is
> called?
>
> The same probably applies to other things initialized before
> machine_run_board_init() that could be configurable using QMP,
> including but not limited to:
> * Accelerator configuration
> * Registering global properties
> * RAM size
> * SMP/CPU configuration
Should (or could) "-M none" be changed in a backwards-compatible way to
allow such preconfiguration? For example
qemu -M none -monitor stdio
(qemu) machine-set-options pc,accel=kvm
(qemu) c
Paolo
On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> On 16/10/2017 18:59, Eduardo Habkost wrote:
> >> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> >> + "-paused [state=]postconf|preconf\n"
> >> + " postconf: pause QEMU after machine is initialized\n"
> >> + " preconf: pause QEMU before machine is initialized\n",
> >> + QEMU_ARCH_ALL)
> > I would like to allow pausing before machine-type is selected, so
> > management could run query-machines before choosing a
> > machine-type. Would that need a third "-pause" mode, or will we
> > be able to change "preconf" to pause before select_machine() is
> > called?
> >
> > The same probably applies to other things initialized before
> > machine_run_board_init() that could be configurable using QMP,
> > including but not limited to:
> > * Accelerator configuration
> > * Registering global properties
> > * RAM size
> > * SMP/CPU configuration
>
> Should (or could) "-M none" be changed in a backwards-compatible way to
> allow such preconfiguration? For example
>
> qemu -M none -monitor stdio
> (qemu) machine-set-options pc,accel=kvm
> (qemu) c
Sounds like an interesting idea. It would require ensuring it's
really safe to destroy current_machine/accel (and other global
state) and replace them with another object on the fly (which is
probably a nice goal by itself).
--
Eduardo
On 16/10/2017 19:17, Eduardo Habkost wrote:
>> Should (or could) "-M none" be changed in a backwards-compatible way to
>> allow such preconfiguration? For example
>>
>> qemu -M none -monitor stdio
>> (qemu) machine-set-options pc,accel=kvm
>> (qemu) c
> Sounds like an interesting idea. It would require ensuring it's
> really safe to destroy current_machine/accel (and other global
> state) and replace them with another object on the fly (which is
> probably a nice goal by itself).
It is but, alternatively, you could delay creating the "none" machine
until the last second. The important part, in my opinion, is having a
good command-line interface that we can freeze even if the
implementation below leaves something to be desired.
Paolo
On Tue, 17 Oct 2017 10:47:40 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 16/10/2017 19:17, Eduardo Habkost wrote:
> >> Should (or could) "-M none" be changed in a backwards-compatible way to
> >> allow such preconfiguration? For example
> >>
> >> qemu -M none -monitor stdio
> >> (qemu) machine-set-options pc,accel=kvm
> >> (qemu) c
> > Sounds like an interesting idea. It would require ensuring it's
> > really safe to destroy current_machine/accel (and other global
> > state) and replace them with another object on the fly (which is
> > probably a nice goal by itself).
>
> It is but, alternatively, you could delay creating the "none" machine
> until the last second. The important part, in my opinion, is having a
> good command-line interface that we can freeze even if the
> implementation below leaves something to be desired.
I sort of don't get how '-M none' could be used to build a usable
machine (at least currently).
Do we really need "-M none" for dynamic configuration?
I'd imagine doing the following instead:
qemu -monitor stdio -dynconfig
(qemu) query-machines
...
(qemu) set-option machine pc,accel=kvm
# machine object is created
(qemu) set-option smp 1,maxcpus
(qemu) info hotpluggable-cpus
...
(qemu) set-option numa node
(qemu) set-option numa cpu,node-id=0,socket=0
(qemu) set-option numa cpu,node-id=0,socket=1
(qemu) c
I'd start by making it work from 'info hotpluggable-cpus',
as it's close to my current project of making cpu-hotplug/numa
work nicely together, and we can expand the same interface to work
at earlier stages on top of that.
On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> On 16/10/2017 18:59, Eduardo Habkost wrote:
> >> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> >> + "-paused [state=]postconf|preconf\n"
> >> + " postconf: pause QEMU after machine is initialized\n"
> >> + " preconf: pause QEMU before machine is initialized\n",
> >> + QEMU_ARCH_ALL)
> > I would like to allow pausing before machine-type is selected, so
> > management could run query-machines before choosing a
> > machine-type. Would that need a third "-pause" mode, or will we
> > be able to change "preconf" to pause before select_machine() is
> > called?
> >
> > The same probably applies to other things initialized before
> > machine_run_board_init() that could be configurable using QMP,
> > including but not limited to:
> > * Accelerator configuration
> > * Registering global properties
> > * RAM size
> > * SMP/CPU configuration
>
> Should (or could) "-M none" be changed in a backwards-compatible way to
> allow such preconfiguration? For example
>
> qemu -M none -monitor stdio
> (qemu) machine-set-options pc,accel=kvm
> (qemu) c
Going down this route has pretty major implications for the way libvirt
manages QEMU, and support / debugging of it. When you look at the QEMU
command line libvirt uses it will be almost devoid of any useful info.
So it will be a more involved job to figure out just how QEMU is configured.
This also means it is difficult to replicate the config that libvirt has
used, outside of libvirt for sake of debugging.
I also think it will have pretty significant performance implications
for QEMU startup. To configure a guest via the monitor is going to
require a huge number of monitor commands to be executed to replicate
what we traditionally configured via ARGV. While each monitor command
is not massively slow, the round-trip time of each command will quickly
add up to several 100 milliseconds, perhaps even seconds in the
case of very large configs.
Maybe we ultimately have no choice and this is inevitable, but I am
pretty wary of going in the direction of launching bare QEMU and
configuring everything via a huge number of monitor calls.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On 10/17/17 16:48, Daniel P. Berrange wrote:
> On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
>> On 16/10/2017 18:59, Eduardo Habkost wrote:
>>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
>>>> + "-paused [state=]postconf|preconf\n"
>>>> + " postconf: pause QEMU after machine is initialized\n"
>>>> + " preconf: pause QEMU before machine is initialized\n",
>>>> + QEMU_ARCH_ALL)
>>> I would like to allow pausing before machine-type is selected, so
>>> management could run query-machines before choosing a
>>> machine-type. Would that need a third "-pause" mode, or will we
>>> be able to change "preconf" to pause before select_machine() is
>>> called?
>>>
>>> The same probably applies to other things initialized before
>>> machine_run_board_init() that could be configurable using QMP,
>>> including but not limited to:
>>> * Accelerator configuration
>>> * Registering global properties
>>> * RAM size
>>> * SMP/CPU configuration
>>
>> Should (or could) "-M none" be changed in a backwards-compatible way to
>> allow such preconfiguration? For example
>>
>> qemu -M none -monitor stdio
>> (qemu) machine-set-options pc,accel=kvm
>> (qemu) c
>
> Going down this route has pretty major implications for the way libvirt
> manages QEMU, and support / debugging of it. When you look at the QEMU
> command line libvirt uses it will be almost devoid of any useful info.
> So it will be more involved job to figure out just how QEMU is configured.
> This also means it is difficult to replicate the config that libvirt has
> used, outside of libvirt for sake of debugging.
>
> I also think it will have pretty significant performance implications
> for QEMU startup. To configure a guest via the monitor is going to
> require a huge number of monitor commands to be executed to replicate
> what we traditionally configured via ARGV. While each monitor command
> is not massively slow, the round-trip time of each command will quickly
> add up to several 100 milliseconds, perhaps even seconds in the the
> case of very large configs.
>
> Maybe we ultimately have no choice and this is inevitable, but I am
> pretty wary of going in the direction of launching bare QEMU and
> configuring everything via a huge number of monitor calls.
Where's the sweet spot between
- configuring everything dynamically, over QMP,
- and invoking QEMU separately, for querying capabilities etc?
Thanks,
Laszlo
On Tue, Oct 17, 2017 at 05:21:13PM +0200, Laszlo Ersek wrote:
> On 10/17/17 16:48, Daniel P. Berrange wrote:
> > On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> >> On 16/10/2017 18:59, Eduardo Habkost wrote:
> >>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> >>>> + "-paused [state=]postconf|preconf\n"
> >>>> + " postconf: pause QEMU after machine is initialized\n"
> >>>> + " preconf: pause QEMU before machine is initialized\n",
> >>>> + QEMU_ARCH_ALL)
> >>> I would like to allow pausing before machine-type is selected, so
> >>> management could run query-machines before choosing a
> >>> machine-type. Would that need a third "-pause" mode, or will we
> >>> be able to change "preconf" to pause before select_machine() is
> >>> called?
> >>>
> >>> The same probably applies to other things initialized before
> >>> machine_run_board_init() that could be configurable using QMP,
> >>> including but not limited to:
> >>> * Accelerator configuration
> >>> * Registering global properties
> >>> * RAM size
> >>> * SMP/CPU configuration
> >>
> >> Should (or could) "-M none" be changed in a backwards-compatible way to
> >> allow such preconfiguration? For example
> >>
> >> qemu -M none -monitor stdio
> >> (qemu) machine-set-options pc,accel=kvm
> >> (qemu) c
> >
> > Going down this route has pretty major implications for the way libvirt
> > manages QEMU, and support / debugging of it. When you look at the QEMU
> > command line libvirt uses it will be almost devoid of any useful info.
> > So it will be more involved job to figure out just how QEMU is configured.
> > This also means it is difficult to replicate the config that libvirt has
> > used, outside of libvirt for sake of debugging.
> >
> > I also think it will have pretty significant performance implications
> > for QEMU startup. To configure a guest via the monitor is going to
> > require a huge number of monitor commands to be executed to replicate
> > what we traditionally configured via ARGV. While each monitor command
> > is not massively slow, the round-trip time of each command will quickly
> > add up to several 100 milliseconds, perhaps even seconds in the the
> > case of very large configs.
> >
> > Maybe we ultimately have no choice and this is inevitable, but I am
> > pretty wary of going in the direction of launching bare QEMU and
> > configuring everything via a huge number of monitor calls.
>
> Where's the sweet spot between
> - configuring everything dynamically, over QMP,
> - and invoking QEMU separately, for querying capabilities etc?
The key with the way we currently invoke & query QEMU over QMP to detect
capabilities is that this is not tied to a specific VM launch process.
We can query capabilities and cache them until such time as we detect
a QEMU binary change. So this never impacts on the startup performance
of individual VMs. The caching is critical, because querying capabilities
is actually quite time intensive already, taking many seconds to query
capabilities on all the different target binaries we have.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On 10/17/17 17:35, Daniel P. Berrange wrote:
> On Tue, Oct 17, 2017 at 05:21:13PM +0200, Laszlo Ersek wrote:
>> On 10/17/17 16:48, Daniel P. Berrange wrote:
>>> On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
>>>> On 16/10/2017 18:59, Eduardo Habkost wrote:
>>>>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
>>>>>> + "-paused [state=]postconf|preconf\n"
>>>>>> + " postconf: pause QEMU after machine is initialized\n"
>>>>>> + " preconf: pause QEMU before machine is initialized\n",
>>>>>> + QEMU_ARCH_ALL)
>>>>> I would like to allow pausing before machine-type is selected, so
>>>>> management could run query-machines before choosing a
>>>>> machine-type. Would that need a third "-pause" mode, or will we
>>>>> be able to change "preconf" to pause before select_machine() is
>>>>> called?
>>>>>
>>>>> The same probably applies to other things initialized before
>>>>> machine_run_board_init() that could be configurable using QMP,
>>>>> including but not limited to:
>>>>> * Accelerator configuration
>>>>> * Registering global properties
>>>>> * RAM size
>>>>> * SMP/CPU configuration
>>>>
>>>> Should (or could) "-M none" be changed in a backwards-compatible way to
>>>> allow such preconfiguration? For example
>>>>
>>>> qemu -M none -monitor stdio
>>>> (qemu) machine-set-options pc,accel=kvm
>>>> (qemu) c
>>>
>>> Going down this route has pretty major implications for the way libvirt
>>> manages QEMU, and support / debugging of it. When you look at the QEMU
>>> command line libvirt uses it will be almost devoid of any useful info.
>>> So it will be more involved job to figure out just how QEMU is configured.
>>> This also means it is difficult to replicate the config that libvirt has
>>> used, outside of libvirt for sake of debugging.
>>>
>>> I also think it will have pretty significant performance implications
>>> for QEMU startup. To configure a guest via the monitor is going to
>>> require a huge number of monitor commands to be executed to replicate
>>> what we traditionally configured via ARGV. While each monitor command
>>> is not massively slow, the round-trip time of each command will quickly
>>> add up to several 100 milliseconds, perhaps even seconds in the the
>>> case of very large configs.
>>>
>>> Maybe we ultimately have no choice and this is inevitable, but I am
>>> pretty wary of going in the direction of launching bare QEMU and
>>> configuring everything via a huge number of monitor calls.
>>
>> Where's the sweet spot between
>> - configuring everything dynamically, over QMP,
>> - and invoking QEMU separately, for querying capabilities etc?
>
> The key with the way we currently invoke & query QEMU over QMP to detect
> capabilities is that this is not tied to a specific VM launch process.
> We can query capabilities and cache them until such time as we detect
> a QEMU binary change. So this never impacts on the startup performance
> of individual VMs. The caching is critical, because querying capabilities
> is actually quite time intensive already, taking many seconds to query
> capabilities on all the different target binaries we have.
(Sorry about hijacking the thread, but I can't stop asking :) )
This looks very smart -- for my own education, how does libvirtd detect
a QEMU binary change? Based on executable mtime, size, checksum? Are
perhaps the <emulator> elements of individual domains involved?
Thanks!
Laszlo
On Tue, Oct 17, 2017 at 05:42:19PM +0200, Laszlo Ersek wrote:
> On 10/17/17 17:35, Daniel P. Berrange wrote:
> > On Tue, Oct 17, 2017 at 05:21:13PM +0200, Laszlo Ersek wrote:
> >> On 10/17/17 16:48, Daniel P. Berrange wrote:
> >>> On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> >>>> On 16/10/2017 18:59, Eduardo Habkost wrote:
> >>>>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> >>>>>> + "-paused [state=]postconf|preconf\n"
> >>>>>> + " postconf: pause QEMU after machine is initialized\n"
> >>>>>> + " preconf: pause QEMU before machine is initialized\n",
> >>>>>> + QEMU_ARCH_ALL)
> >>>>> I would like to allow pausing before machine-type is selected, so
> >>>>> management could run query-machines before choosing a
> >>>>> machine-type. Would that need a third "-pause" mode, or will we
> >>>>> be able to change "preconf" to pause before select_machine() is
> >>>>> called?
> >>>>>
> >>>>> The same probably applies to other things initialized before
> >>>>> machine_run_board_init() that could be configurable using QMP,
> >>>>> including but not limited to:
> >>>>> * Accelerator configuration
> >>>>> * Registering global properties
> >>>>> * RAM size
> >>>>> * SMP/CPU configuration
> >>>>
> >>>> Should (or could) "-M none" be changed in a backwards-compatible way to
> >>>> allow such preconfiguration? For example
> >>>>
> >>>> qemu -M none -monitor stdio
> >>>> (qemu) machine-set-options pc,accel=kvm
> >>>> (qemu) c
> >>>
> >>> Going down this route has pretty major implications for the way libvirt
> >>> manages QEMU, and support / debugging of it. When you look at the QEMU
> >>> command line libvirt uses it will be almost devoid of any useful info.
> >>> So it will be more involved job to figure out just how QEMU is configured.
> >>> This also means it is difficult to replicate the config that libvirt has
> >>> used, outside of libvirt for sake of debugging.
> >>>
> >>> I also think it will have pretty significant performance implications
> >>> for QEMU startup. To configure a guest via the monitor is going to
> >>> require a huge number of monitor commands to be executed to replicate
> >>> what we traditionally configured via ARGV. While each monitor command
> >>> is not massively slow, the round-trip time of each command will quickly
> >>> add up to several 100 milliseconds, perhaps even seconds in the the
> >>> case of very large configs.
> >>>
> >>> Maybe we ultimately have no choice and this is inevitable, but I am
> >>> pretty wary of going in the direction of launching bare QEMU and
> >>> configuring everything via a huge number of monitor calls.
> >>
> >> Where's the sweet spot between
> >> - configuring everything dynamically, over QMP,
> >> - and invoking QEMU separately, for querying capabilities etc?
> >
> > The key with the way we currently invoke & query QEMU over QMP to detect
> > capabilities is that this is not tied to a specific VM launch process.
> > We can query capabilities and cache them until such time as we detect
> > a QEMU binary change. So this never impacts on the startup performance
> > of individual VMs. The caching is critical, because querying capabilities
> > is actually quite time intensive already, taking many seconds to query
> > capabilities on all the different target binaries we have.
>
> (Sorry about hijacking the thread, but I can't stop asking :) )
>
> This looks very smart -- for my own education, how does libvirtd detect
> a QEMU binary change? Based on executable mtime, size, checksum? Are
> perhaps the <emulator> elements of individual domains involved?
We store the capabilities info in an XML file in /var, and this contains
the ctime of libvirtd and/or qemu, as well as libvirt version number. If
any of those change, the cache is invalidated.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Tue, 17 Oct 2017 16:35:15 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:
> On Tue, Oct 17, 2017 at 05:21:13PM +0200, Laszlo Ersek wrote:
> > On 10/17/17 16:48, Daniel P. Berrange wrote:
> > > On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> > >> On 16/10/2017 18:59, Eduardo Habkost wrote:
> > >>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > >>>> + "-paused [state=]postconf|preconf\n"
> > >>>> + " postconf: pause QEMU after machine is initialized\n"
> > >>>> + " preconf: pause QEMU before machine is initialized\n",
> > >>>> + QEMU_ARCH_ALL)
> > >>> I would like to allow pausing before machine-type is selected, so
> > >>> management could run query-machines before choosing a
> > >>> machine-type. Would that need a third "-pause" mode, or will we
> > >>> be able to change "preconf" to pause before select_machine() is
> > >>> called?
> > >>>
> > >>> The same probably applies to other things initialized before
> > >>> machine_run_board_init() that could be configurable using QMP,
> > >>> including but not limited to:
> > >>> * Accelerator configuration
> > >>> * Registering global properties
> > >>> * RAM size
> > >>> * SMP/CPU configuration
> > >>
> > >> Should (or could) "-M none" be changed in a backwards-compatible way to
> > >> allow such preconfiguration? For example
> > >>
> > >> qemu -M none -monitor stdio
> > >> (qemu) machine-set-options pc,accel=kvm
> > >> (qemu) c
> > >
> > > Going down this route has pretty major implications for the way libvirt
> > > manages QEMU, and support / debugging of it. When you look at the QEMU
> > > command line libvirt uses it will be almost devoid of any useful info.
> > > So it will be more involved job to figure out just how QEMU is configured.
> > > This also means it is difficult to replicate the config that libvirt has
> > > used, outside of libvirt for sake of debugging.
> > >
> > > I also think it will have pretty significant performance implications
> > > for QEMU startup. To configure a guest via the monitor is going to
> > > require a huge number of monitor commands to be executed to replicate
> > > what we traditionally configured via ARGV. While each monitor command
> > > is not massively slow, the round-trip time of each command will quickly
> > > add up to several 100 milliseconds, perhaps even seconds in the the
> > > case of very large configs.
> > >
> > > Maybe we ultimately have no choice and this is inevitable, but I am
> > > pretty wary of going in the direction of launching bare QEMU and
> > > configuring everything via a huge number of monitor calls.
> >
> > Where's the sweet spot between
> > - configuring everything dynamically, over QMP,
> > - and invoking QEMU separately, for querying capabilities etc?
>
> The key with the way we currently invoke & query QEMU over QMP to detect
> capabilities is that this is not tied to a specific VM launch process.
> We can query capabilities and cache them until such time as we detect
> a QEMU binary change. So this never impacts on the startup performance
> of individual VMs. The caching is critical, because querying capabilities
> is actually quite time intensive already, taking many seconds to query
> capabilities on all the different target binaries we have.
Is there another alternative for the use case where the values of one option (-numa cpu)
depend on the values of other options (-M + -smp + -cpu)?
So far we have 2 options on the table:
1: do configuration at runtime like in this series
2: start qemu 2 times
1st to query cpu layout and
2nd to add -numa options using data from the 1st step
> Regards,
> Daniel
On Tue, Oct 17, 2017 at 05:47:03PM +0200, Igor Mammedov wrote:
> On Tue, 17 Oct 2017 16:35:15 +0100
> "Daniel P. Berrange" <berrange@redhat.com> wrote:
>
> > On Tue, Oct 17, 2017 at 05:21:13PM +0200, Laszlo Ersek wrote:
> > > On 10/17/17 16:48, Daniel P. Berrange wrote:
> > > > On Mon, Oct 16, 2017 at 07:01:01PM +0200, Paolo Bonzini wrote:
> > > >> On 16/10/2017 18:59, Eduardo Habkost wrote:
> > > >>>> +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > >>>> + "-paused [state=]postconf|preconf\n"
> > > >>>> + " postconf: pause QEMU after machine is initialized\n"
> > > >>>> + " preconf: pause QEMU before machine is initialized\n",
> > > >>>> + QEMU_ARCH_ALL)
> > > >>> I would like to allow pausing before machine-type is selected, so
> > > >>> management could run query-machines before choosing a
> > > >>> machine-type. Would that need a third "-pause" mode, or will we
> > > >>> be able to change "preconf" to pause before select_machine() is
> > > >>> called?
> > > >>>
> > > >>> The same probably applies to other things initialized before
> > > >>> machine_run_board_init() that could be configurable using QMP,
> > > >>> including but not limited to:
> > > >>> * Accelerator configuration
> > > >>> * Registering global properties
> > > >>> * RAM size
> > > >>> * SMP/CPU configuration
> > > >>
> > > >> Should (or could) "-M none" be changed in a backwards-compatible way to
> > > >> allow such preconfiguration? For example
> > > >>
> > > >> qemu -M none -monitor stdio
> > > >> (qemu) machine-set-options pc,accel=kvm
> > > >> (qemu) c
> > > >
> > > > Going down this route has pretty major implications for the way libvirt
> > > > manages QEMU, and support / debugging of it. When you look at the QEMU
> > > > command line libvirt uses it will be almost devoid of any useful info.
> > > > So it will be more involved job to figure out just how QEMU is configured.
> > > > This also means it is difficult to replicate the config that libvirt has
> > > > used, outside of libvirt for sake of debugging.
> > > >
> > > > I also think it will have pretty significant performance implications
> > > > for QEMU startup. To configure a guest via the monitor is going to
> > > > require a huge number of monitor commands to be executed to replicate
> > > > what we traditionally configured via ARGV. While each monitor command
> > > > is not massively slow, the round-trip time of each command will quickly
> > > > add up to several 100 milliseconds, perhaps even seconds in the the
> > > > case of very large configs.
> > > >
> > > > Maybe we ultimately have no choice and this is inevitable, but I am
> > > > pretty wary of going in the direction of launching bare QEMU and
> > > > configuring everything via a huge number of monitor calls.
> > >
> > > Where's the sweet spot between
> > > - configuring everything dynamically, over QMP,
> > > - and invoking QEMU separately, for querying capabilities etc?
> >
> > The key with the way we currently invoke & query QEMU over QMP to detect
> > capabilities is that this is not tied to a specific VM launch process.
> > We can query capabilities and cache them until such time as we detect
> > a QEMU binary change. So this never impacts on the startup performance
> > of individual VMs. The caching is critical, because querying capabilities
> > is actually quite time intensive already, taking many seconds to query
> > capabilities on all the different target binaries we have.
> is there another alternative for usecase where one option values depends (-numa cpu)
> on values of another option values (-M + -smp + -cpu)?
> so far we have 2 options on the table:
> 1: do configuration at runtime like in this series
> 2: start qemu 2 times
> 1st to query cpu layout and
> 2nd add -numa options using data from the 1st step
Conceptually the problem occurs in places where libvirt does not fully
specify the object being created, leaving QEMU to do some config internally.
The elephant in the room in this regard is '-machine', since the machine
baseboard implies creation of a variety of embedded devices. Libvirt has
embedded knowledge about what device buses are associated with each machine
type (ie ISA, PCI, PCI-X, etc). In theory this information could be
introspectable ahead of time because the info about what controllers are
associated with 'pc' or 'q35' is static. In practical terms though, the
QEMU code for populating machines is not structured in a way that would
allow such introspection without instantiating the machine type.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Mon, 16 Oct 2017 14:59:16 -0200
Eduardo Habkost <ehabkost@redhat.com> wrote:
> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > include/sysemu/sysemu.h | 1 +
> > qemu-options.hx | 15 ++++++++++++++
> > qmp.c | 5 +++++
> > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 74 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index b213696..3feb94f 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > QEMU_WAKEUP_REASON_OTHER,
> > } WakeupReason;
> >
> > +void qemu_exit_preconfig_request(void);
> > void qemu_system_reset_request(ShutdownCause reason);
> > void qemu_system_suspend_request(void);
> > void qemu_register_suspend_notifier(Notifier *notifier);
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 39225ae..bd44db8 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -3498,6 +3498,21 @@ STEXI
> > Run the emulation in single step mode.
> > ETEXI
> >
> > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > + "-paused [state=]postconf|preconf\n"
> > + " postconf: pause QEMU after machine is initialized\n"
> > + " preconf: pause QEMU before machine is initialized\n",
> > + QEMU_ARCH_ALL)
>
> I would like to allow pausing before machine-type is selected, so
> management could run query-machines before choosing a
> machine-type. Would that need a third "-pause" mode, or will we
> be able to change "preconf" to pause before select_machine() is
> called?
> The same probably applies to other things initialized before
> machine_run_board_init() that could be configurable using QMP,
> including but not limited to:
> * Accelerator configuration
> * Registering global properties
> * RAM size
> * SMP/CPU configuration
My goal is/was much narrower and reachable without rewriting the whole of
qemu again (well, I had to do a bit of necessary preparatory refactoring
for that to happen: default_cpu + generalizing cpu_model parsing).
This series is focused on allowing to query the cpu layout defined by f("-M foo -smp ...")
and configuring the numa mapping for the resulting layout. So it needs the machine
object to exist by the time it's paused, which means that the -M and -smp
options have to be parsed by that time.
Allowing a pause basically before the machine is created would, I'd guess, be
a lot of additional refactoring (beyond this series' scope). I can't
say for sure if a new pause mode is needed for it or if 'preconf' could be
moved to an earlier stage later.
I'd speculate that for generic handling we would need
- CLI options dependency tree
- add new QMP/HMP command (process-cli-option)
make it actionable, i.e. "process-cli-option -M foo" would create
machine and user would be allowed to use other options
that have machine dependency (for example -smp and after that -numa)
I wouldn't like to go down that bottomless pit right now,
but we could probably add "process-cli-option" now and allow only
the -numa option for the time being, so we would have the external interface in place
and could extend it later.
> > +STEXI
> > +@item -paused
> > +@findex -paused
> > +if set enabled interactive configuration stages before machine emulation starts.
> > +'postconf' option value mimics -S option behaviour where machine is created
> > +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> > +is created, which allows to query and configure properties affecting machine
> > +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
>
> What if "-S" is used at the same time"? Will "cont" only
> initialize the machine and wait for another "cont" command to
> start the VCPUs, or will it unpause everything?
In the current impl. the first 'cont' will exit the preconfig loop and continue to
work as it used to, i.e. -S will cause a second pause right before the
vcpus are started, and a second 'cont' will be needed to run the machine.
>
> > +ETEXI
> > +
> > DEF("S", 0, QEMU_OPTION_S, \
> > "-S freeze CPU at startup (use 'c' to start execution)\n",
> > QEMU_ARCH_ALL)
> > diff --git a/qmp.c b/qmp.c
> > index e8c3031..49e9a5c 100644
> > --- a/qmp.c
> > +++ b/qmp.c
> > @@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
> > BlockBackend *blk;
> > Error *local_err = NULL;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + qemu_exit_preconfig_request();
> > + return;
> > + }
> > +
> > /* if there is a dump in background, we should wait until the dump
> > * finished */
> > if (dump_in_progress()) {
> > diff --git a/vl.c b/vl.c
> > index 3fed457..30631fd 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
> > },
> > };
> >
> > +static QemuOptsList qemu_paused_opts = {
> > + .name = "paused",
> > + .implied_opt_name = "state",
> > + .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
> > + .desc = {
> > + {
> > + .name = "state",
> > + .type = QEMU_OPT_STRING,
> > + .help = "Pause state of QEMU on startup",
> > + },
> > + { /* end of list */ }
> > + },
> > +};
> > +
> > /**
> > * Get machine options
> > *
> > @@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
> > static int powerdown_requested;
> > static int debug_requested;
> > static int suspend_requested;
> > +static enum {
> > + PRECONFIG_CONT = 0,
> > + PRECONFIG_PAUSE,
> > + PRECONFIG_SKIP,
> > +} preconfig_requested;
> > static WakeupReason wakeup_reason;
> > static NotifierList powerdown_notifiers =
> > NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
> > @@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
> > return r;
> > }
> >
> > +void qemu_exit_preconfig_request(void)
> > +{
> > + preconfig_requested = PRECONFIG_CONT;
> > +}
> > +
> > /*
> > * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
> > */
> > @@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
> > RunState r;
> > ShutdownCause request;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + if (preconfig_requested == PRECONFIG_CONT) {
> > + preconfig_requested = PRECONFIG_SKIP;
> > + return true;
> > + }
> > + }
> > if (qemu_debug_requested()) {
> > vm_stop(RUN_STATE_DEBUG);
> > }
> > @@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
> > qemu_add_opts(&qemu_icount_opts);
> > qemu_add_opts(&qemu_semihosting_config_opts);
> > qemu_add_opts(&qemu_fw_cfg_opts);
> > + qemu_add_opts(&qemu_paused_opts);
> > module_call_init(MODULE_INIT_OPTS);
> >
> > runstate_init();
> > @@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
> > exit(1);
> > }
> > break;
> > + case QEMU_OPTION_paused:
> > + {
> > + const char *value;
> > +
> > + opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
> > + optarg, true);
> > + if (opts == NULL) {
> > + exit(1);
> > + }
> > + value = qemu_opt_get(opts, "state");
> > + if (!strcmp(value, "postconf")) {
> > + autostart = 0;
> > + } else if (!strcmp(value, "preconf")) {
> > + preconfig_requested = PRECONFIG_PAUSE;
> > + } else {
> > + error_report("incomplete '-paused' option\n");
> > + exit(1);
> > + }
> > + break;
> > + }
> > case QEMU_OPTION_enable_kvm:
> > olist = qemu_find_opts("machine");
> > qemu_opts_parse_noisily(olist, "accel=kvm", false);
> > @@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
> > current_machine->boot_order = boot_order;
> > current_machine->cpu_model = cpu_model;
> >
> > -
> > /* parse features once if machine provides default cpu_type */
> > if (machine_class->default_cpu_type) {
> > current_machine->cpu_type = machine_class->default_cpu_type;
> > @@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
> > }
> > }
> >
> > + main_loop(); /* do monitor/qmp handling at preconfig state if requested */
> > +
>
> I'm impressed by the simplicity of the implementation. I though
> this would involve moving everything between this line and the
> next main_loop() call outside main(), so they would be called by
> qmp_cont().
>
> Any expert on GLib's Event Loop sees any gotcha in this method?
>
> I would like to do a careful review of main_loop_wait() and
> main_loop_should_exit(), to ensure those functions don't depend
> on anything that's initialized after this line. Probably a few
> existing QMP commands can crash if machine is not initialized
> yet?
Some HMP/QMP commands will crash for sure; any ideas on how to
handle this issue (i.e. prevent disallowed commands from running) are welcome.
> The rules and expectations on initialization ordering are very
> subtle, I suggest including test code for the new feature to
> ensure nothing crashes or breaks in the future.
This is only an RFC; I've omitted the testing part because the approach
to be used isn't certain yet, but yes, I plan on adding tests
for the features that are expected to work with this.
>
> > machine_run_board_init(current_machine);
> >
> > realtime_init();
> > --
> > 2.7.4
> >
>
On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > include/sysemu/sysemu.h | 1 +
> > qemu-options.hx | 15 ++++++++++++++
> > qmp.c | 5 +++++
> > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 74 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index b213696..3feb94f 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > QEMU_WAKEUP_REASON_OTHER,
> > } WakeupReason;
> >
> > +void qemu_exit_preconfig_request(void);
> > void qemu_system_reset_request(ShutdownCause reason);
> > void qemu_system_suspend_request(void);
> > void qemu_register_suspend_notifier(Notifier *notifier);
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 39225ae..bd44db8 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -3498,6 +3498,21 @@ STEXI
> > Run the emulation in single step mode.
> > ETEXI
> >
> > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > + "-paused [state=]postconf|preconf\n"
> > + " postconf: pause QEMU after machine is initialized\n"
> > + " preconf: pause QEMU before machine is initialized\n",
> > + QEMU_ARCH_ALL)
>
> I would like to allow pausing before machine-type is selected, so
> management could run query-machines before choosing a
> machine-type. Would that need a third "-pause" mode, or will we
> be able to change "preconf" to pause before select_machine() is
> called?
>
> The same probably applies to other things initialized before
> machine_run_board_init() that could be configurable using QMP,
> including but not limited to:
> * Accelerator configuration
> * Registering global properties
> * RAM size
> * SMP/CPU configuration
Yeah.. having a bunch of different possible pause stages to select
doesn't sound great. Could we avoid this by instead changing -S to
pause at the earliest possible spot, but having any monitor commands
that require a later stage automatically "fast forwarding" to the
right phase?
>
>
> > +STEXI
> > +@item -paused
> > +@findex -paused
> > +if set enabled interactive configuration stages before machine emulation starts.
> > +'postconf' option value mimics -S option behaviour where machine is created
> > +but emulation isn't started. 'preconf' option value pauses QEMU before machine
> > +is created, which allows to query and configure properties affecting machine
> > +initialization. Use monitor/QMP command 'cont' to go to exit paused state.
>
> What if "-S" is used at the same time"? Will "cont" only
> initialize the machine and wait for another "cont" command to
> start the VCPUs, or will it unpause everything?
>
>
> > +ETEXI
> > +
> > DEF("S", 0, QEMU_OPTION_S, \
> > "-S freeze CPU at startup (use 'c' to start execution)\n",
> > QEMU_ARCH_ALL)
> > diff --git a/qmp.c b/qmp.c
> > index e8c3031..49e9a5c 100644
> > --- a/qmp.c
> > +++ b/qmp.c
> > @@ -167,6 +167,11 @@ void qmp_cont(Error **errp)
> > BlockBackend *blk;
> > Error *local_err = NULL;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + qemu_exit_preconfig_request();
> > + return;
> > + }
> > +
> > /* if there is a dump in background, we should wait until the dump
> > * finished */
> > if (dump_in_progress()) {
> > diff --git a/vl.c b/vl.c
> > index 3fed457..30631fd 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -555,6 +555,20 @@ static QemuOptsList qemu_fw_cfg_opts = {
> > },
> > };
> >
> > +static QemuOptsList qemu_paused_opts = {
> > + .name = "paused",
> > + .implied_opt_name = "state",
> > + .head = QTAILQ_HEAD_INITIALIZER(qemu_paused_opts.head),
> > + .desc = {
> > + {
> > + .name = "state",
> > + .type = QEMU_OPT_STRING,
> > + .help = "Pause state of QEMU on startup",
> > + },
> > + { /* end of list */ }
> > + },
> > +};
> > +
> > /**
> > * Get machine options
> > *
> > @@ -1689,6 +1703,11 @@ static pid_t shutdown_pid;
> > static int powerdown_requested;
> > static int debug_requested;
> > static int suspend_requested;
> > +static enum {
> > + PRECONFIG_CONT = 0,
> > + PRECONFIG_PAUSE,
> > + PRECONFIG_SKIP,
> > +} preconfig_requested;
> > static WakeupReason wakeup_reason;
> > static NotifierList powerdown_notifiers =
> > NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
> > @@ -1773,6 +1792,11 @@ static int qemu_debug_requested(void)
> > return r;
> > }
> >
> > +void qemu_exit_preconfig_request(void)
> > +{
> > + preconfig_requested = PRECONFIG_CONT;
> > +}
> > +
> > /*
> > * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
> > */
> > @@ -1939,6 +1963,12 @@ static bool main_loop_should_exit(void)
> > RunState r;
> > ShutdownCause request;
> >
> > + if (runstate_check(RUN_STATE_PRELAUNCH)) {
> > + if (preconfig_requested == PRECONFIG_CONT) {
> > + preconfig_requested = PRECONFIG_SKIP;
> > + return true;
> > + }
> > + }
> > if (qemu_debug_requested()) {
> > vm_stop(RUN_STATE_DEBUG);
> > }
> > @@ -3177,6 +3207,7 @@ int main(int argc, char **argv, char **envp)
> > qemu_add_opts(&qemu_icount_opts);
> > qemu_add_opts(&qemu_semihosting_config_opts);
> > qemu_add_opts(&qemu_fw_cfg_opts);
> > + qemu_add_opts(&qemu_paused_opts);
> > module_call_init(MODULE_INIT_OPTS);
> >
> > runstate_init();
> > @@ -3845,6 +3876,26 @@ int main(int argc, char **argv, char **envp)
> > exit(1);
> > }
> > break;
> > + case QEMU_OPTION_paused:
> > + {
> > + const char *value;
> > +
> > + opts = qemu_opts_parse_noisily(qemu_find_opts("paused"),
> > + optarg, true);
> > + if (opts == NULL) {
> > + exit(1);
> > + }
> > + value = qemu_opt_get(opts, "state");
> > + if (!strcmp(value, "postconf")) {
> > + autostart = 0;
> > + } else if (!strcmp(value, "preconf")) {
> > + preconfig_requested = PRECONFIG_PAUSE;
> > + } else {
> > + error_report("incomplete '-paused' option\n");
> > + exit(1);
> > + }
> > + break;
> > + }
> > case QEMU_OPTION_enable_kvm:
> > olist = qemu_find_opts("machine");
> > qemu_opts_parse_noisily(olist, "accel=kvm", false);
> > @@ -4731,7 +4782,6 @@ int main(int argc, char **argv, char **envp)
> > current_machine->boot_order = boot_order;
> > current_machine->cpu_model = cpu_model;
> >
> > -
> > /* parse features once if machine provides default cpu_type */
> > if (machine_class->default_cpu_type) {
> > current_machine->cpu_type = machine_class->default_cpu_type;
> > @@ -4741,6 +4791,8 @@ int main(int argc, char **argv, char **envp)
> > }
> > }
> >
> > + main_loop(); /* do monitor/qmp handling at preconfig state if requested */
> > +
>
> I'm impressed by the simplicity of the implementation. I though
> this would involve moving everything between this line and the
> next main_loop() call outside main(), so they would be called by
> qmp_cont().
>
> Any expert on GLib's Event Loop sees any gotcha in this method?
>
> I would like to do a careful review of main_loop_wait() and
> main_loop_should_exit(), to ensure those functions don't depend
> on anything that's initialized after this line. Probably a few
> existing QMP commands can crash if machine is not initialized
> yet?
>
> The rules and expectations on initialization ordering are very
> subtle, I suggest including test code for the new feature to
> ensure nothing crashes or breaks in the future.
>
>
> > machine_run_board_init(current_machine);
> >
> > realtime_init();
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > ---
> > > include/sysemu/sysemu.h | 1 +
> > > qemu-options.hx | 15 ++++++++++++++
> > > qmp.c | 5 +++++
> > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > index b213696..3feb94f 100644
> > > --- a/include/sysemu/sysemu.h
> > > +++ b/include/sysemu/sysemu.h
> > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > QEMU_WAKEUP_REASON_OTHER,
> > > } WakeupReason;
> > >
> > > +void qemu_exit_preconfig_request(void);
> > > void qemu_system_reset_request(ShutdownCause reason);
> > > void qemu_system_suspend_request(void);
> > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > index 39225ae..bd44db8 100644
> > > --- a/qemu-options.hx
> > > +++ b/qemu-options.hx
> > > @@ -3498,6 +3498,21 @@ STEXI
> > > Run the emulation in single step mode.
> > > ETEXI
> > >
> > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > + "-paused [state=]postconf|preconf\n"
> > > + " postconf: pause QEMU after machine is initialized\n"
> > > + " preconf: pause QEMU before machine is initialized\n",
> > > + QEMU_ARCH_ALL)
> >
> > I would like to allow pausing before machine-type is selected, so
> > management could run query-machines before choosing a
> > machine-type. Would that need a third "-pause" mode, or will we
> > be able to change "preconf" to pause before select_machine() is
> > called?
> >
> > The same probably applies to other things initialized before
> > machine_run_board_init() that could be configurable using QMP,
> > including but not limited to:
> > * Accelerator configuration
> > * Registering global properties
> > * RAM size
> > * SMP/CPU configuration
>
> Yeah.. having a bunch of different possible pause stages to select
> doesn't sound great.
I agree. The number of externally visible pause states should be
as small as possible.
> Could we avoid this by instead changing -S to
> pause at the earliest possible spot, but having any monitor commands
> that require a later stage automatically "fast forwarding" to the
> right phase?
That would hide the internal details from the outside. Sounds
nice, but adding new machine/device configuration QMP commands
while hiding the QEMU state from the outside sounds impossible.
For example, if we use -S today, this works:
$ qemu-system-x86_64 -S -qmp stdio
<- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
-> {"execute":"qmp_capabilities"}
<- {"return": {}}
-> {"execute":"query-cpus"}
<- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
This means "query-cpus" needs to fast-forward to the CPU creation
stage if we want to keep compatibility.
Now, assume we add a set-numa-node command like the one in this
series. e.g.:
$ qemu-system-x86_64 -S -qmp stdio
<- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
-> {"execute":"qmp_capabilities"}
<- {"return": {}}
-> {"execute":"set-numa-node" ... }
<- {"return": ...}
The command will work only if machine initialization didn't run
yet.
But now an innocent-looking query command would change QEMU state
in an unexpected way:
$ qemu-system-x86_64 -S -qmp stdio
<- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
-> {"execute":"qmp_capabilities"}
<- {"return": {}}
-> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
<- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
-> {"execute":"set-numa-node" ... }
<- {"error": ...} [the command will fail because the machine was already created]
This means we do have an externally visible "too late to use
set-numa-node" QEMU state, and query-cpus will have an externally
visible side effect. Every QMP command would need to document
how it affects QEMU state in an externally visible way.
If QEMU pause state is still going to be externally visible this
way, I would prefer to let the client explicitly say which
state they want QEMU to be in, instead of making QEMU change
state silently as a side effect of QMP commands.
>
[...]
--
Eduardo
On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > ---
> > > > include/sysemu/sysemu.h | 1 +
> > > > qemu-options.hx | 15 ++++++++++++++
> > > > qmp.c | 5 +++++
> > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > index b213696..3feb94f 100644
> > > > --- a/include/sysemu/sysemu.h
> > > > +++ b/include/sysemu/sysemu.h
> > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > QEMU_WAKEUP_REASON_OTHER,
> > > > } WakeupReason;
> > > >
> > > > +void qemu_exit_preconfig_request(void);
> > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > void qemu_system_suspend_request(void);
> > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > index 39225ae..bd44db8 100644
> > > > --- a/qemu-options.hx
> > > > +++ b/qemu-options.hx
> > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > Run the emulation in single step mode.
> > > > ETEXI
> > > >
> > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > + "-paused [state=]postconf|preconf\n"
> > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > + QEMU_ARCH_ALL)
> > >
> > > I would like to allow pausing before machine-type is selected, so
> > > management could run query-machines before choosing a
> > > machine-type. Would that need a third "-pause" mode, or will we
> > > be able to change "preconf" to pause before select_machine() is
> > > called?
> > >
> > > The same probably applies to other things initialized before
> > > machine_run_board_init() that could be configurable using QMP,
> > > including but not limited to:
> > > * Accelerator configuration
> > > * Registering global properties
> > > * RAM size
> > > * SMP/CPU configuration
> >
> > Yeah.. having a bunch of different possible pause stages to select
> > doesn't sound great.
>
> I agree. The number of externally visible pause states should be
> as small as possible.
>
>
> > Could we avoid this by instead changing -S to
> > pause at the earliest possible spot, but having any monitor commands
> > that require a later stage automatically "fast forwarding" to the
> > right phase?
>
> That would hide the internal details from the outside. Sounds
> nice, but adding new machine/device configuration QMP commands
> while hiding the QEMU state from the outside sounds impossible.
>
> For example, if we use -S today, this works:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"query-cpus"}
> <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
>
> This means "query-cpus" needs to fast-forward to the CPU creation
> stage if we want to keep compatibility.
>
> Now, assume we add a set-numa-node command like the one in this
> series. e.g.:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"set-numa-node" ... }
> <- {"return": ...}
>
> The command will work only if machine initialization didn't run
> yet.
>
> But now an innocent-looking query command would change QEMU state
> in an unexpected way:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> -> {"execute":"set-numa-node" ... }
> <- {"error": ...} [the command will fail because the machine was already created]
>
> This means we do have a externally visible "too late to use
> set-numa-node" QEMU state, and query-cpus will have a externally
> visible side effect. Every QMP command would need to document
> how it affects QEMU state in a externally visible way.
>
> If QEMU pause state is still going to be externally visible this
> way, I would prefer to let the client to explicitly tell what's
> the state they want QEMU to be, instead of making QEMU change
> state silently as a side effect of QMP commands.
Yeah, good point. My proposal would just have changed explicitly
exposed ugly internal state to subtly exposed ugly internal state,
which is probably worse :(.
Ok.. next possibly bad idea..
What about a "re-exec" monitor command; it would take what's
essentially a new command line, and basically restart qemu from the
beginning, reparsing this new command line, but without actually
Pro:
* Mitigates Daniel Berrange's concern about lots of qemu
configuration being buried in the qmp session - if libvirt logged
its last "re-exec" that would have what is generally needed.
* Lets libvirt do assorted investigation of options, then rewind to
choose what it actually wants
Con:
* Would require a bunch of auditing of structures/state to make sure
they can be re-initialized cleanly
* Would it be fast enough for libvirt to use? Do we know if the
slowness which makes multiple qemu invocations by libvirt
unattractive is from the kernel/libc/ldso overhead, or from qemu's
internal start up processing?
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > ---
> > > > > include/sysemu/sysemu.h | 1 +
> > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > qmp.c | 5 +++++
> > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > index b213696..3feb94f 100644
> > > > > --- a/include/sysemu/sysemu.h
> > > > > +++ b/include/sysemu/sysemu.h
> > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > } WakeupReason;
> > > > >
> > > > > +void qemu_exit_preconfig_request(void);
> > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > void qemu_system_suspend_request(void);
> > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > index 39225ae..bd44db8 100644
> > > > > --- a/qemu-options.hx
> > > > > +++ b/qemu-options.hx
> > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > Run the emulation in single step mode.
> > > > > ETEXI
> > > > >
> > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > + "-paused [state=]postconf|preconf\n"
> > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > + QEMU_ARCH_ALL)
> > > >
> > > > I would like to allow pausing before machine-type is selected, so
> > > > management could run query-machines before choosing a
> > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > be able to change "preconf" to pause before select_machine() is
> > > > called?
> > > >
> > > > The same probably applies to other things initialized before
> > > > machine_run_board_init() that could be configurable using QMP,
> > > > including but not limited to:
> > > > * Accelerator configuration
> > > > * Registering global properties
> > > > * RAM size
> > > > * SMP/CPU configuration
> > >
> > > Yeah.. having a bunch of different possible pause stages to select
> > > doesn't sound great.
> >
> > I agree. The number of externally visible pause states should be
> > as small as possible.
> >
> >
> > > Could we avoid this by instead changing -S to
> > > pause at the earliest possible spot, but having any monitor commands
> > > that require a later stage automatically "fast forwarding" to the
> > > right phase?
> >
> > That would hide the internal details from the outside. Sounds
> > nice, but adding new machine/device configuration QMP commands
> > while hiding the QEMU state from the outside sounds impossible.
> >
> > For example, if we use -S today, this works:
> >
> > $ qemu-system-x86_64 -S -qmp stdio
> > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > -> {"execute":"qmp_capabilities"}
> > <- {"return": {}}
> > -> {"execute":"query-cpus"}
> > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> >
> > This means "query-cpus" needs to fast-forward to the CPU creation
> > stage if we want to keep compatibility.
> >
> > Now, assume we add a set-numa-node command like the one in this
> > series. e.g.:
> >
> > $ qemu-system-x86_64 -S -qmp stdio
> > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > -> {"execute":"qmp_capabilities"}
> > <- {"return": {}}
> > -> {"execute":"set-numa-node" ... }
> > <- {"return": ...}
> >
> > The command will work only if machine initialization didn't run
> > yet.
> >
> > But now an innocent-looking query command would change QEMU state
> > in an unexpected way:
> >
> > $ qemu-system-x86_64 -S -qmp stdio
> > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > -> {"execute":"qmp_capabilities"}
> > <- {"return": {}}
> > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > -> {"execute":"set-numa-node" ... }
> > <- {"error": ...} [the command will fail because the machine was already created]
> >
> > This means we do have a externally visible "too late to use
> > set-numa-node" QEMU state, and query-cpus will have a externally
> > visible side effect. Every QMP command would need to document
> > how it affects QEMU state in a externally visible way.
> >
> > If QEMU pause state is still going to be externally visible this
> > way, I would prefer to let the client to explicitly tell what's
> > the state they want QEMU to be, instead of making QEMU change
> > state silently as a side effect of QMP commands.
>
> Yeah, good point. My proposal would just have changed explicitly
> exposed ugly internal state to subtly exposed ugly internal state,
> which is probably worse :(.
>
>
> Ok.. next possibly bad idea..
>
> What about a "re-exec" monitor command; it would take what's
> essentially a new command line, and basically restart qemu from the
> beginning, reparsing this new command line, but without actually
>
> Pro:
> * Mitigates Daniel Berrange's concern about lots of qemu
> configuration being buried in the qmp session - if libvirt logged
> its last "re-exec" that would have what is generally needed.
> * Lets libvirt do assorted investigation of options, then rewind to
> choose what it actually wants
Sounds like a superset of Paolo's "-machine none" proposal[1].
It would be a very simple interface, but I'm not sure it can easily be
implemented efficiently.
[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
>
> Con:
> * Would require a bunch of auditing of structures/state to make sure
> they can be re-initialized cleanly
This sounds like a big obstacle. QEMU still has too much global
state outside the machine/qdev tree.
> * Would it be fast enough for libvirt to use? Do we know if the
> slowness which makes multiple qemu invocations by libvirt
> unattractive is from the kernel/libc/ldso overhead, or from qemu's
> internal start up processing?
My gut feeling is that this could be too slow, if the scope of
"re-exec" is too big.
Now, let me try to go to the opposite extreme: I think you had a
good point in your previous proposal. Why should we need to
restart/re-execute anything at all just because some bit of
configuration is being changed by libvirt? Why should commands like
set-numa-node require QEMU to be in a state that is not
covered by -S? If the guest is not running yet, there should be
no reason to require clients to explicitly pause/continue/restart
anything.
(Translating this to my example above: why exactly have I assumed
above that keeping "query-cpus" working would necessarily make
set-numa-node stop working?)
--
Eduardo
On Fri, 20 Oct 2017 12:21:00 -0200
Eduardo Habkost <ehabkost@redhat.com> wrote:
> On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > ---
> > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > qmp.c | 5 +++++
> > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > index b213696..3feb94f 100644
> > > > > > --- a/include/sysemu/sysemu.h
> > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > } WakeupReason;
> > > > > >
> > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > void qemu_system_suspend_request(void);
> > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > index 39225ae..bd44db8 100644
> > > > > > --- a/qemu-options.hx
> > > > > > +++ b/qemu-options.hx
> > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > Run the emulation in single step mode.
> > > > > > ETEXI
> > > > > >
> > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > + QEMU_ARCH_ALL)
> > > > >
> > > > > I would like to allow pausing before machine-type is selected, so
> > > > > management could run query-machines before choosing a
> > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > be able to change "preconf" to pause before select_machine() is
> > > > > called?
> > > > >
> > > > > The same probably applies to other things initialized before
> > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > including but not limited to:
> > > > > * Accelerator configuration
> > > > > * Registering global properties
> > > > > * RAM size
> > > > > * SMP/CPU configuration
> > > >
> > > > Yeah.. having a bunch of different possible pause stages to select
> > > > doesn't sound great.
> > >
> > > I agree. The number of externally visible pause states should be
> > > as small as possible.
> > >
> > >
> > > > Could we avoid this by instead changing -S to
> > > > pause at the earliest possible spot, but having any monitor commands
> > > > that require a later stage automatically "fast forwarding" to the
> > > > right phase?
> > >
> > > That would hide the internal details from the outside. Sounds
> > > nice, but adding new machine/device configuration QMP commands
> > > while hiding the QEMU state from the outside sounds impossible.
> > >
> > > For example, if we use -S today, this works:
> > >
> > > $ qemu-system-x86_64 -S -qmp stdio
> > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > -> {"execute":"qmp_capabilities"}
> > > <- {"return": {}}
> > > -> {"execute":"query-cpus"}
> > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > >
> > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > stage if we want to keep compatibility.
> > >
> > > Now, assume we add a set-numa-node command like the one in this
> > > series. e.g.:
> > >
> > > $ qemu-system-x86_64 -S -qmp stdio
> > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > -> {"execute":"qmp_capabilities"}
> > > <- {"return": {}}
> > > -> {"execute":"set-numa-node" ... }
> > > <- {"return": ...}
> > >
> > > The command will work only if machine initialization didn't run
> > > yet.
> > >
> > > But now an innocent-looking query command would change QEMU state
> > > in an unexpected way:
> > >
> > > $ qemu-system-x86_64 -S -qmp stdio
> > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > -> {"execute":"qmp_capabilities"}
> > > <- {"return": {}}
> > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > -> {"execute":"set-numa-node" ... }
> > > <- {"error": ...} [the command will fail because the machine was already created]
> > >
> > > This means we do have a externally visible "too late to use
> > > set-numa-node" QEMU state, and query-cpus will have a externally
> > > visible side effect. Every QMP command would need to document
> > > how it affects QEMU state in a externally visible way.
> > >
> > > If QEMU pause state is still going to be externally visible this
> > > way, I would prefer to let the client to explicitly tell what's
> > > the state they want QEMU to be, instead of making QEMU change
> > > state silently as a side effect of QMP commands.
> >
> > Yeah, good point. My proposal would just have changed explicitly
> > exposed ugly internal state to subtly exposed ugly internal state,
> > which is probably worse :(.
> >
> >
> > Ok.. next possibly bad idea..
> >
> > What about a "re-exec" monitor command; it would take what's
> > essentially a new command line, and basically restart qemu from the
> > beginning, reparsing this new command line, but without actually
> >
> > Pro:
> > * Mitigates Daniel Berrange's concern about lots of qemu
> > configuration being buried in the qmp session - if libvirt logged
> > its last "re-exec" that would have what is generally needed.
> > * Lets libvirt do assorted investigation of options, then rewind to
> > choose what it actually wants
>
> Sounds like a superset of Paolo's "-machine none" proposal[1].
> It would be a very simple interface, not sure it can be easily
> implemented efficiently.
>
> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
>
> >
> > Con:
> > * Would require a bunch of auditing of structures/state to make sure
> > they can be re-initialized cleanly
>
> This sounds like a big obstacle. QEMU still have too much global
> state outside the machine/qdev tree.
>
>
> > * Would it be fast enough for libvirt to use? Do we know if the
> > slowness which makes multiple qemu invocations by libvirt
> > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > internal start up processing?
>
> My gut feeling is that this could be too slow, if the scope of
> "re-exec" is too big.
>
>
> Now, let me try to go to the opposite extreme: I think you had a
> good point in your previous proposal. Why should we need to
> restart/re-execute anything at all just because some bit of
> configuration is being changed by libvirt? Why commands like
> set-numa-node should require QEMU to be in a state that is not
> covered by -S? If the guest is not running yet, there should be
> no reason to require clients to explicitly pause/continue/restart
> anything.
It's probably doable to do numa config at '-S' time for x86 (arm),
since ACPI tables are regenerated on the first read (legacy fw_cfg
would be a little problematic but probably could be 'fixed' as well).
But I can't say outright if it's doable for other targets.
In general, the issue here is that '-S' pauses after machine_done is run,
and all the necessary wiring the board requires is finalized by then,
and no hooks run after unpause.
If there is a general consensus to go this route, I can invest
some time in making it work (then this series could be dropped)
Even so, postponing set-numa to '-S' won't address Daniel's concern,
i.e. configuration would take several command round trips to complete,
potentially over a slow network. But as was said, libvirt can cache
new CLI options for further reuse.
Whether it is slower/faster than starting qemu with '-M foo -smp ...' +
querying layout and then restarting it again with -numa options
would depend on network speed.
>
> (Translating this to my example above: why exactly have I assumed
> above that keeping "query-cpus" working would necessarily make
> set-numa-node stop working?)
>
On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> On Fri, 20 Oct 2017 12:21:00 -0200
> Eduardo Habkost <ehabkost@redhat.com> wrote:
>
> > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > ---
> > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > qmp.c | 5 +++++
> > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > index b213696..3feb94f 100644
> > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > } WakeupReason;
> > > > > > >
> > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > void qemu_system_suspend_request(void);
> > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > index 39225ae..bd44db8 100644
> > > > > > > --- a/qemu-options.hx
> > > > > > > +++ b/qemu-options.hx
> > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > Run the emulation in single step mode.
> > > > > > > ETEXI
> > > > > > >
> > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > + QEMU_ARCH_ALL)
> > > > > >
> > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > management could run query-machines before choosing a
> > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > called?
> > > > > >
> > > > > > The same probably applies to other things initialized before
> > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > including but not limited to:
> > > > > > * Accelerator configuration
> > > > > > * Registering global properties
> > > > > > * RAM size
> > > > > > * SMP/CPU configuration
> > > > >
> > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > doesn't sound great.
> > > >
> > > > I agree. The number of externally visible pause states should be
> > > > as small as possible.
> > > >
> > > >
> > > > > Could we avoid this by instead changing -S to
> > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > that require a later stage automatically "fast forwarding" to the
> > > > > right phase?
> > > >
> > > > That would hide the internal details from the outside. Sounds
> > > > nice, but adding new machine/device configuration QMP commands
> > > > while hiding the QEMU state from the outside sounds impossible.
> > > >
> > > > For example, if we use -S today, this works:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"query-cpus"}
> > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > >
> > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > stage if we want to keep compatibility.
> > > >
> > > > Now, assume we add a set-numa-node command like the one in this
> > > > series. e.g.:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"set-numa-node" ... }
> > > > <- {"return": ...}
> > > >
> > > > The command will work only if machine initialization didn't run
> > > > yet.
> > > >
> > > > But now an innocent-looking query command would change QEMU state
> > > > in an unexpected way:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > -> {"execute":"set-numa-node" ... }
> > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > >
> > > > This means we do have a externally visible "too late to use
> > > > set-numa-node" QEMU state, and query-cpus will have a externally
> > > > visible side effect. Every QMP command would need to document
> > > > how it affects QEMU state in a externally visible way.
> > > >
> > > > If QEMU pause state is still going to be externally visible this
> > > > way, I would prefer to let the client to explicitly tell what's
> > > > the state they want QEMU to be, instead of making QEMU change
> > > > state silently as a side effect of QMP commands.
> > >
> > > Yeah, good point. My proposal would just have changed explicitly
> > > exposed ugly internal state to subtly exposed ugly internal state,
> > > which is probably worse :(.
> > >
> > >
> > > Ok.. next possibly bad idea..
> > >
> > > What about a "re-exec" monitor command; it would take what's
> > > essentially a new command line, and basically restart qemu from the
> > > beginning, reparsing this new command line, but without actually
> > >
> > > Pro:
> > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > configuration being buried in the qmp session - if libvirt logged
> > > its last "re-exec" that would have what is generally needed.
> > > * Lets libvirt do assorted investigation of options, then rewind to
> > > choose what it actually wants
> >
> > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > It would be a very simple interface, not sure it can be easily
> > implemented efficiently.
> >
> > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> >
> > >
> > > Con:
> > > * Would require a bunch of auditing of structures/state to make sure
> > > they can be re-initialized cleanly
> >
> > This sounds like a big obstacle. QEMU still have too much global
> > state outside the machine/qdev tree.
> >
> >
> > > * Would it be fast enough for libvirt to use? Do we know if the
> > > slowness which makes multiple qemu invocations by libvirt
> > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > internal start up processing?
> >
> > My gut feeling is that this could be too slow, if the scope of
> > "re-exec" is too big.
> >
> >
> > Now, let me try to go to the opposite extreme: I think you had a
> > good point in your previous proposal. Why should we need to
> > restart/re-execute anything at all just because some bit of
> > configuration is being changed by libvirt? Why commands like
> > set-numa-node should require QEMU to be in a state that is not
> > covered by -S? If the guest is not running yet, there should be
> > no reason to require clients to explicitly pause/continue/restart
> > anything.
> It's probably doable to do numa config at '-S' time for x86 (arm),
> since ACPI tables are regenerated on the first read (legacy fw_cfg
> would be a little problematic but probably could be 'fixed' as well)
>
> But I can't say outright if it's doable for other targets,
> in general issue here is that '-S' pauses after machine_done is run
> and all necessary wiring board requires is finalized by then
> and no hooks run after unpause.
> If there is a general consensus to go this route, I can invest
> some time in making it work (then this series could be dropped)
>
> Even so, postponing set-numa to '-S' won't address Daniel's concern,
> i.e. configuration would take several round trips of command to complete
> potentially over a slow network. But as it was said libvirt can cache
> new CLI options for further reuse.
We can cache stuff from the generic "-m none" invocation, but we won't
cache stuff from the invocation of a specific VM instance, because we can't
have confidence that such data is independent of the VM config. So we
would likely just end up hardcoding the arch-specific data in libvirt if
that was all QEMU provided.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Mon, 23 Oct 2017 10:53:16 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:
> On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> > On Fri, 20 Oct 2017 12:21:00 -0200
> > Eduardo Habkost <ehabkost@redhat.com> wrote:
> >
> > > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > > ---
> > > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > > qmp.c | 5 +++++
> > > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > > index b213696..3feb94f 100644
> > > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > > } WakeupReason;
> > > > > > > >
> > > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > > void qemu_system_suspend_request(void);
> > > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > > index 39225ae..bd44db8 100644
> > > > > > > > --- a/qemu-options.hx
> > > > > > > > +++ b/qemu-options.hx
> > > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > > Run the emulation in single step mode.
> > > > > > > > ETEXI
> > > > > > > >
> > > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > > + QEMU_ARCH_ALL)
> > > > > > >
> > > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > > management could run query-machines before choosing a
> > > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > > called?
> > > > > > >
> > > > > > > The same probably applies to other things initialized before
> > > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > > including but not limited to:
> > > > > > > * Accelerator configuration
> > > > > > > * Registering global properties
> > > > > > > * RAM size
> > > > > > > * SMP/CPU configuration
> > > > > >
> > > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > > doesn't sound great.
> > > > >
> > > > > I agree. The number of externally visible pause states should be
> > > > > as small as possible.
> > > > >
> > > > >
> > > > > > Could we avoid this by instead changing -S to
> > > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > > that require a later stage automatically "fast forwarding" to the
> > > > > > right phase?
> > > > >
> > > > > That would hide the internal details from the outside. Sounds
> > > > > nice, but adding new machine/device configuration QMP commands
> > > > > while hiding the QEMU state from the outside sounds impossible.
> > > > >
> > > > > For example, if we use -S today, this works:
> > > > >
> > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > -> {"execute":"qmp_capabilities"}
> > > > > <- {"return": {}}
> > > > > -> {"execute":"query-cpus"}
> > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > >
> > > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > > stage if we want to keep compatibility.
> > > > >
> > > > > Now, assume we add a set-numa-node command like the one in this
> > > > > series. e.g.:
> > > > >
> > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > -> {"execute":"qmp_capabilities"}
> > > > > <- {"return": {}}
> > > > > -> {"execute":"set-numa-node" ... }
> > > > > <- {"return": ...}
> > > > >
> > > > > The command will work only if machine initialization didn't run
> > > > > yet.
> > > > >
> > > > > But now an innocent-looking query command would change QEMU state
> > > > > in an unexpected way:
> > > > >
> > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > -> {"execute":"qmp_capabilities"}
> > > > > <- {"return": {}}
> > > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > -> {"execute":"set-numa-node" ... }
> > > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > > >
> > > > > This means we do have a externally visible "too late to use
> > > > > set-numa-node" QEMU state, and query-cpus will have a externally
> > > > > visible side effect. Every QMP command would need to document
> > > > > how it affects QEMU state in a externally visible way.
> > > > >
> > > > > If QEMU pause state is still going to be externally visible this
> > > > > way, I would prefer to let the client to explicitly tell what's
> > > > > the state they want QEMU to be, instead of making QEMU change
> > > > > state silently as a side effect of QMP commands.
> > > >
> > > > Yeah, good point. My proposal would just have changed explicitly
> > > > exposed ugly internal state to subtly exposed ugly internal state,
> > > > which is probably worse :(.
> > > >
> > > >
> > > > Ok.. next possibly bad idea..
> > > >
> > > > What about a "re-exec" monitor command; it would take what's
> > > > essentially a new command line, and basically restart qemu from the
> > > > beginning, reparsing this new command line, but without actually
> > > >
> > > > Pro:
> > > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > > configuration being buried in the qmp session - if libvirt logged
> > > > its last "re-exec" that would have what is generally needed.
> > > > * Lets libvirt do assorted investigation of options, then rewind to
> > > > choose what it actually wants
> > >
> > > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > > It would be a very simple interface, not sure it can be easily
> > > implemented efficiently.
> > >
> > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> > >
> > > >
> > > > Con:
> > > > * Would require a bunch of auditing of structures/state to make sure
> > > > they can be re-initialized cleanly
> > >
> > > This sounds like a big obstacle. QEMU still have too much global
> > > state outside the machine/qdev tree.
> > >
> > >
> > > > * Would it be fast enough for libvirt to use? Do we know if the
> > > > slowness which makes multiple qemu invocations by libvirt
> > > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > > internal start up processing?
> > >
> > > My gut feeling is that this could be too slow, if the scope of
> > > "re-exec" is too big.
> > >
> > >
> > > Now, let me try to go to the opposite extreme: I think you had a
> > > good point in your previous proposal. Why should we need to
> > > restart/re-execute anything at all just because some bit of
> > > configuration is being changed by libvirt? Why commands like
> > > set-numa-node should require QEMU to be in a state that is not
> > > covered by -S? If the guest is not running yet, there should be
> > > no reason to require clients to explicitly pause/continue/restart
> > > anything.
> > It's probably doable to do numa config at '-S' time for x86 (arm),
> > since ACPI tables are regenerated on the first read (legacy fw_cfg
> > would be a little problematic but probably could be 'fixed' as well)
> >
> > But I can't say outright if it's doable for other targets,
> > in general issue here is that '-S' pauses after machine_done is run
> > and all necessary wiring board requires is finalized by then
> > and no hooks run after unpause.
> > If there is a general consensus to go this route, I can invest
> > some time in making it work (then this series could be dropped)
> >
> > Even so, postponing set-numa to '-S' won't address Daniel's concern,
> > i.e. configuration would take several round trips of command to complete
> > potentially over a slow network. But as it was said libvirt can cache
> > new CLI options for further reuse.
>
> We can cache stuff from the generic "-m none" invocation, but we won't
> cache stuff from invocation of a specific VM instance, because we can't
> have confidence that such data is independent of the VM config. So we
In the case of cpu layout we have a fixed set of options that influence it
(-M foo_vXX -smp ...), so from the QEMU side it should be possible to
promise it would stay stable.
But such caching would be useful in other use cases as well.
Is the issue invalidating the cached data in case some option(s) would
change it?
> would likely just end up hardcoding the arch specific data in libvirt if
> that was all QEMU provided.
Another insane idea is to make the algorithm introspectable, i.e.
publish per-machine code that would be used by both mgmt and qemu
to compute the layout, for example in python. It's probably not an issue
for libvirt, but qemu will have to embed python to make the shared
algorithm work. Not sure if it's acceptable from qemu's pov.
>
> Regards,
> Daniel
On Mon, Oct 23, 2017 at 12:36:20PM +0200, Igor Mammedov wrote:
> On Mon, 23 Oct 2017 10:53:16 +0100
> "Daniel P. Berrange" <berrange@redhat.com> wrote:
>
> > On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> > > On Fri, 20 Oct 2017 12:21:00 -0200
> > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > >
> > > > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > > > ---
> > > > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > > > qmp.c | 5 +++++
> > > > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > > > >
> > > > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > > > index b213696..3feb94f 100644
> > > > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > > > } WakeupReason;
> > > > > > > > >
> > > > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > > > void qemu_system_suspend_request(void);
> > > > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > > > index 39225ae..bd44db8 100644
> > > > > > > > > --- a/qemu-options.hx
> > > > > > > > > +++ b/qemu-options.hx
> > > > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > > > Run the emulation in single step mode.
> > > > > > > > > ETEXI
> > > > > > > > >
> > > > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > > > + QEMU_ARCH_ALL)
> > > > > > > >
> > > > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > > > management could run query-machines before choosing a
> > > > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > > > called?
> > > > > > > >
> > > > > > > > The same probably applies to other things initialized before
> > > > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > > > including but not limited to:
> > > > > > > > * Accelerator configuration
> > > > > > > > * Registering global properties
> > > > > > > > * RAM size
> > > > > > > > * SMP/CPU configuration
> > > > > > >
> > > > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > > > doesn't sound great.
> > > > > >
> > > > > > I agree. The number of externally visible pause states should be
> > > > > > as small as possible.
> > > > > >
> > > > > >
> > > > > > > Could we avoid this by instead changing -S to
> > > > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > > > that require a later stage automatically "fast forwarding" to the
> > > > > > > right phase?
> > > > > >
> > > > > > That would hide the internal details from the outside. Sounds
> > > > > > nice, but adding new machine/device configuration QMP commands
> > > > > > while hiding the QEMU state from the outside sounds impossible.
> > > > > >
> > > > > > For example, if we use -S today, this works:
> > > > > >
> > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > <- {"return": {}}
> > > > > > -> {"execute":"query-cpus"}
> > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > >
> > > > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > > > stage if we want to keep compatibility.
> > > > > >
> > > > > > Now, assume we add a set-numa-node command like the one in this
> > > > > > series. e.g.:
> > > > > >
> > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > <- {"return": {}}
> > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > <- {"return": ...}
> > > > > >
> > > > > > The command will work only if machine initialization didn't run
> > > > > > yet.
> > > > > >
> > > > > > But now an innocent-looking query command would change QEMU state
> > > > > > in an unexpected way:
> > > > > >
> > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > <- {"return": {}}
> > > > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > > > >
> > > > > > This means we do have an externally visible "too late to use
> > > > > > set-numa-node" QEMU state, and query-cpus will have an externally
> > > > > > visible side effect. Every QMP command would need to document
> > > > > > how it affects QEMU state in an externally visible way.
> > > > > >
> > > > > > If QEMU pause state is still going to be externally visible this
> > > > > > way, I would prefer to let the client to explicitly tell what's
> > > > > > the state they want QEMU to be, instead of making QEMU change
> > > > > > state silently as a side effect of QMP commands.
> > > > >
> > > > > Yeah, good point. My proposal would just have changed explicitly
> > > > > exposed ugly internal state to subtly exposed ugly internal state,
> > > > > which is probably worse :(.
> > > > >
> > > > >
> > > > > Ok.. next possibly bad idea..
> > > > >
> > > > > What about a "re-exec" monitor command; it would take what's
> > > > > essentially a new command line, and basically restart qemu from the
> > > > > beginning, reparsing this new command line, but without actually
> > > > >
> > > > > Pro:
> > > > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > > > configuration being buried in the qmp session - if libvirt logged
> > > > > its last "re-exec" that would have what is generally needed.
> > > > > * Lets libvirt do assorted investigation of options, then rewind to
> > > > > choose what it actually wants
> > > >
> > > > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > > > It would be a very simple interface, not sure it can be easily
> > > > implemented efficiently.
> > > >
> > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> > > >
> > > > >
> > > > > Con:
> > > > > * Would require a bunch of auditing of structures/state to make sure
> > > > > they can be re-initialized cleanly
> > > >
> > > > This sounds like a big obstacle. QEMU still have too much global
> > > > state outside the machine/qdev tree.
> > > >
> > > >
> > > > > * Would it be fast enough for libvirt to use? Do we know if the
> > > > > slowness which makes multiple qemu invocations by libvirt
> > > > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > > > internal start up processing?
> > > >
> > > > My gut feeling is that this could be too slow, if the scope of
> > > > "re-exec" is too big.
> > > >
> > > >
> > > > Now, let me try to go to the opposite extreme: I think you had a
> > > > good point in your previous proposal. Why should we need to
> > > > restart/re-execute anything at all just because some bit of
> > > > configuration is being changed by libvirt? Why commands like
> > > > set-numa-node should require QEMU to be in a state that is not
> > > > covered by -S? If the guest is not running yet, there should be
> > > > no reason to require clients to explicitly pause/continue/restart
> > > > anything.
> > > It's probably doable to do numa config at '-S' time for x86 (arm),
> > > since ACPI tables are regenerated on the first read (legacy fw_cfg
> > > would be a little problematic but probably could be 'fixed' as well)
> > >
> > > But I can't say outright if it's doable for other targets,
> > > in general issue here is that '-S' pauses after machine_done is run
> > > and all necessary wiring board requires is finalized by then
> > > and no hooks run after unpause.
> > > If there is a general consensus to go this route, I can invest
> > > some time in making it work (then this series could be dropped)
> > >
> > > Even so, postponing set-numa to '-S' won't address Daniel's concern,
> > > i.e. configuration would take several round trips of command to complete
> > > potentially over a slow network. But as it was said libvirt can cache
> > > new CLI options for further reuse.
> >
> > We can cache stuff from the generic "-m none" invocation, but we won't
> > cache stuff from invocation of a specific VM instance, because we can't
> > have confidence that such data is independent of the VM config. So we
> In case if cpu layout we have fixed set of options that influence it
> (-M foo_vXX -smp ...), so from QEMU side it should be possible to
> promise it would stay stable.
> But such caching would be useful in other use cases as well.
> Is the issue in invalidating cached data in case of option(s) would
> change cached data?
For the caching to be useful, we need to have a good cache hit rate.
If the cache depends on a lot of different CLI args, then you're going
to have to populate many caches each with low hit rate. The current
caching is done based on QEMU/libvirtd binary, so we have 1 cache miss
when QEMU or libvirt are upgraded, then 100% cache hit thereafter, so
the cache is very effective.
> > would likely just end up hardcoding the arch specific data in libvirt if
> > that was all QEMU provided.
> Another insane idea is to make algorithm introspectable, i.e.
> publish per machine code that would be used by both, mgmt and qemu
> to compute layout, for example in python. It's probably not issue
> for libvirt but qemu will have to embed python to make shared
> algorithm work. Not sure if it's acceptable from qemu pov.
That's not going to fly - we definitely cannot assume apps want or can
run python code. Libvirt has major users in many programming languages,
including Python, C, Go, Java, Vala and more.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Mon, 23 Oct 2017 11:49:44 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:
> On Mon, Oct 23, 2017 at 12:36:20PM +0200, Igor Mammedov wrote:
> > On Mon, 23 Oct 2017 10:53:16 +0100
> > "Daniel P. Berrange" <berrange@redhat.com> wrote:
> >
> > > On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> > > > On Fri, 20 Oct 2017 12:21:00 -0200
> > > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > >
> > > > > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > > > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > > > > qmp.c | 5 +++++
> > > > > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > > > > index b213696..3feb94f 100644
> > > > > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > > > > } WakeupReason;
> > > > > > > > > >
> > > > > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > > > > void qemu_system_suspend_request(void);
> > > > > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > > > > index 39225ae..bd44db8 100644
> > > > > > > > > > --- a/qemu-options.hx
> > > > > > > > > > +++ b/qemu-options.hx
> > > > > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > > > > Run the emulation in single step mode.
> > > > > > > > > > ETEXI
> > > > > > > > > >
> > > > > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > > > > + QEMU_ARCH_ALL)
> > > > > > > > >
> > > > > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > > > > management could run query-machines before choosing a
> > > > > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > > > > called?
> > > > > > > > >
> > > > > > > > > The same probably applies to other things initialized before
> > > > > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > > > > including but not limited to:
> > > > > > > > > * Accelerator configuration
> > > > > > > > > * Registering global properties
> > > > > > > > > * RAM size
> > > > > > > > > * SMP/CPU configuration
> > > > > > > >
> > > > > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > > > > doesn't sound great.
> > > > > > >
> > > > > > > I agree. The number of externally visible pause states should be
> > > > > > > as small as possible.
> > > > > > >
> > > > > > >
> > > > > > > > Could we avoid this by instead changing -S to
> > > > > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > > > > that require a later stage automatically "fast forwarding" to the
> > > > > > > > right phase?
> > > > > > >
> > > > > > > That would hide the internal details from the outside. Sounds
> > > > > > > nice, but adding new machine/device configuration QMP commands
> > > > > > > while hiding the QEMU state from the outside sounds impossible.
> > > > > > >
> > > > > > > For example, if we use -S today, this works:
> > > > > > >
> > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > <- {"return": {}}
> > > > > > > -> {"execute":"query-cpus"}
> > > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > > >
> > > > > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > > > > stage if we want to keep compatibility.
> > > > > > >
> > > > > > > Now, assume we add a set-numa-node command like the one in this
> > > > > > > series. e.g.:
> > > > > > >
> > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > <- {"return": {}}
> > > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > > <- {"return": ...}
> > > > > > >
> > > > > > > The command will work only if machine initialization didn't run
> > > > > > > yet.
> > > > > > >
> > > > > > > But now an innocent-looking query command would change QEMU state
> > > > > > > in an unexpected way:
> > > > > > >
> > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > <- {"return": {}}
> > > > > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > > > > >
> > > > > > > This means we do have an externally visible "too late to use
> > > > > > > set-numa-node" QEMU state, and query-cpus will have an externally
> > > > > > > visible side effect. Every QMP command would need to document
> > > > > > > how it affects QEMU state in an externally visible way.
> > > > > > >
> > > > > > > If QEMU pause state is still going to be externally visible this
> > > > > > > way, I would prefer to let the client to explicitly tell what's
> > > > > > > the state they want QEMU to be, instead of making QEMU change
> > > > > > > state silently as a side effect of QMP commands.
> > > > > >
> > > > > > Yeah, good point. My proposal would just have changed explicitly
> > > > > > exposed ugly internal state to subtly exposed ugly internal state,
> > > > > > which is probably worse :(.
> > > > > >
> > > > > >
> > > > > > Ok.. next possibly bad idea..
> > > > > >
> > > > > > What about a "re-exec" monitor command; it would take what's
> > > > > > essentially a new command line, and basically restart qemu from the
> > > > > > beginning, reparsing this new command line, but without actually
> > > > > >
> > > > > > Pro:
> > > > > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > > > > configuration being buried in the qmp session - if libvirt logged
> > > > > > its last "re-exec" that would have what is generally needed.
> > > > > > * Lets libvirt do assorted investigation of options, then rewind to
> > > > > > choose what it actually wants
> > > > >
> > > > > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > > > > It would be a very simple interface, not sure it can be easily
> > > > > implemented efficiently.
> > > > >
> > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> > > > >
> > > > > >
> > > > > > Con:
> > > > > > * Would require a bunch of auditing of structures/state to make sure
> > > > > > they can be re-initialized cleanly
> > > > >
> > > > > This sounds like a big obstacle. QEMU still have too much global
> > > > > state outside the machine/qdev tree.
> > > > >
> > > > >
> > > > > > * Would it be fast enough for libvirt to use? Do we know if the
> > > > > > slowness which makes multiple qemu invocations by libvirt
> > > > > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > > > > internal start up processing?
> > > > >
> > > > > My gut feeling is that this could be too slow, if the scope of
> > > > > "re-exec" is too big.
> > > > >
> > > > >
> > > > > Now, let me try to go to the opposite extreme: I think you had a
> > > > > good point in your previous proposal. Why should we need to
> > > > > restart/re-execute anything at all just because some bit of
> > > > > configuration is being changed by libvirt? Why commands like
> > > > > set-numa-node should require QEMU to be in a state that is not
> > > > > covered by -S? If the guest is not running yet, there should be
> > > > > no reason to require clients to explicitly pause/continue/restart
> > > > > anything.
> > > > It's probably doable to do numa config at '-S' time for x86 (arm),
> > > > since ACPI tables are regenerated on the first read (legacy fw_cfg
> > > > would be a little problematic but probably could be 'fixed' as well)
> > > >
> > > > But I can't say outright if it's doable for other targets,
> > > > in general issue here is that '-S' pauses after machine_done is run
> > > > and all necessary wiring board requires is finalized by then
> > > > and no hooks run after unpause.
> > > > If there is a general consensus to go this route, I can invest
> > > > some time in making it work (then this series could be dropped)
> > > >
> > > > Even so, postponing set-numa to '-S' won't address Daniel's concern,
> > > > i.e. configuration would take several round trips of command to complete
> > > > potentially over a slow network. But as it was said libvirt can cache
> > > > new CLI options for further reuse.
> > >
> > > We can cache stuff from the generic "-m none" invocation, but we won't
> > > cache stuff from invocation of a specific VM instance, because we can't
> > > have confidence that such data is independent of the VM config. So we
> > In case if cpu layout we have fixed set of options that influence it
> > (-M foo_vXX -smp ...), so from QEMU side it should be possible to
> > promise it would stay stable.
> > But such caching would be useful in other use cases as well.
> > Is the issue in invalidating cached data in case of option(s) would
> > change cached data?
>
> For the caching to be useful, we need to have a good cache hit rate.
> If the cache depends on a lot of different CLI args, then you're going
> to have to populate many caches each with low hit rate. The current
> caching is done based on QEMU/libvirtd binary, so we have 1 cache miss
> when QEMU or libvirt are upgraded, then 100% cache hit thereafter, so
> the cache is very effective.
With per domain cache one could also have about 100% hit rate every time
the domain is started in case a new option does not invalidate cache.
In case of cpu layout it will remove the need for query-hotpluggable-cpus
every time VM is started (when cpu hotplug is enabled) which libvirt
does now.
...
>
> Regards,
> Daniel
On Mon, Oct 23, 2017 at 01:18:30PM +0200, Igor Mammedov wrote:
> On Mon, 23 Oct 2017 11:49:44 +0100
> "Daniel P. Berrange" <berrange@redhat.com> wrote:
>
> > On Mon, Oct 23, 2017 at 12:36:20PM +0200, Igor Mammedov wrote:
> > > On Mon, 23 Oct 2017 10:53:16 +0100
> > > "Daniel P. Berrange" <berrange@redhat.com> wrote:
> > >
> > > > On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> > > > > On Fri, 20 Oct 2017 12:21:00 -0200
> > > > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > > >
> > > > > > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > > > > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > > > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > > > > > qmp.c | 5 +++++
> > > > > > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > > > > > index b213696..3feb94f 100644
> > > > > > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > > > > > } WakeupReason;
> > > > > > > > > > >
> > > > > > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > > > > > void qemu_system_suspend_request(void);
> > > > > > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > > > > > index 39225ae..bd44db8 100644
> > > > > > > > > > > --- a/qemu-options.hx
> > > > > > > > > > > +++ b/qemu-options.hx
> > > > > > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > > > > > Run the emulation in single step mode.
> > > > > > > > > > > ETEXI
> > > > > > > > > > >
> > > > > > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > > > > > + QEMU_ARCH_ALL)
> > > > > > > > > >
> > > > > > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > > > > > management could run query-machines before choosing a
> > > > > > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > > > > > called?
> > > > > > > > > >
> > > > > > > > > > The same probably applies to other things initialized before
> > > > > > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > > > > > including but not limited to:
> > > > > > > > > > * Accelerator configuration
> > > > > > > > > > * Registering global properties
> > > > > > > > > > * RAM size
> > > > > > > > > > * SMP/CPU configuration
> > > > > > > > >
> > > > > > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > > > > > doesn't sound great.
> > > > > > > >
> > > > > > > > I agree. The number of externally visible pause states should be
> > > > > > > > as small as possible.
> > > > > > > >
> > > > > > > >
> > > > > > > > > Could we avoid this by instead changing -S to
> > > > > > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > > > > > that require a later stage automatically "fast forwarding" to the
> > > > > > > > > right phase?
> > > > > > > >
> > > > > > > > That would hide the internal details from the outside. Sounds
> > > > > > > > nice, but adding new machine/device configuration QMP commands
> > > > > > > > while hiding the QEMU state from the outside sounds impossible.
> > > > > > > >
> > > > > > > > For example, if we use -S today, this works:
> > > > > > > >
> > > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > > <- {"return": {}}
> > > > > > > > -> {"execute":"query-cpus"}
> > > > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > > > >
> > > > > > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > > > > > stage if we want to keep compatibility.
> > > > > > > >
> > > > > > > > Now, assume we add a set-numa-node command like the one in this
> > > > > > > > series. e.g.:
> > > > > > > >
> > > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > > <- {"return": {}}
> > > > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > > > <- {"return": ...}
> > > > > > > >
> > > > > > > > The command will work only if machine initialization didn't run
> > > > > > > > yet.
> > > > > > > >
> > > > > > > > But now an innocent-looking query command would change QEMU state
> > > > > > > > in an unexpected way:
> > > > > > > >
> > > > > > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > > > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > > > > > -> {"execute":"qmp_capabilities"}
> > > > > > > > <- {"return": {}}
> > > > > > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > > > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > > > > > -> {"execute":"set-numa-node" ... }
> > > > > > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > > > > > >
> > > > > > > > This means we do have an externally visible "too late to use
> > > > > > > > set-numa-node" QEMU state, and query-cpus will have an externally
> > > > > > > > visible side effect. Every QMP command would need to document
> > > > > > > > how it affects QEMU state in an externally visible way.
> > > > > > > >
> > > > > > > > If QEMU pause state is still going to be externally visible this
> > > > > > > > way, I would prefer to let the client to explicitly tell what's
> > > > > > > > the state they want QEMU to be, instead of making QEMU change
> > > > > > > > state silently as a side effect of QMP commands.
> > > > > > >
> > > > > > > Yeah, good point. My proposal would just have changed explicitly
> > > > > > > exposed ugly internal state to subtly exposed ugly internal state,
> > > > > > > which is probably worse :(.
> > > > > > >
> > > > > > >
> > > > > > > Ok.. next possibly bad idea..
> > > > > > >
> > > > > > > What about a "re-exec" monitor command; it would take what's
> > > > > > > essentially a new command line, and basically restart qemu from the
> > > > > > > beginning, reparsing this new command line, but without actually
> > > > > > >
> > > > > > > Pro:
> > > > > > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > > > > > configuration being buried in the qmp session - if libvirt logged
> > > > > > > its last "re-exec" that would have what is generally needed.
> > > > > > > * Lets libvirt do assorted investigation of options, then rewind to
> > > > > > > choose what it actually wants
> > > > > >
> > > > > > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > > > > > It would be a very simple interface, not sure it can be easily
> > > > > > implemented efficiently.
> > > > > >
> > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> > > > > >
> > > > > > >
> > > > > > > Con:
> > > > > > > * Would require a bunch of auditing of structures/state to make sure
> > > > > > > they can be re-initialized cleanly
> > > > > >
> > > > > > This sounds like a big obstacle. QEMU still have too much global
> > > > > > state outside the machine/qdev tree.
> > > > > >
> > > > > >
> > > > > > > * Would it be fast enough for libvirt to use? Do we know if the
> > > > > > > slowness which makes multiple qemu invocations by libvirt
> > > > > > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > > > > > internal start up processing?
> > > > > >
> > > > > > My gut feeling is that this could be too slow, if the scope of
> > > > > > "re-exec" is too big.
> > > > > >
> > > > > >
> > > > > > Now, let me try to go to the opposite extreme: I think you had a
> > > > > > good point in your previous proposal. Why should we need to
> > > > > > restart/re-execute anything at all just because some bit of
> > > > > > configuration is being changed by libvirt? Why commands like
> > > > > > set-numa-node should require QEMU to be in a state that is not
> > > > > > covered by -S? If the guest is not running yet, there should be
> > > > > > no reason to require clients to explicitly pause/continue/restart
> > > > > > anything.
> > > > > It's probably doable to do numa config at '-S' time for x86 (arm),
> > > > > since ACPI tables are regenerated on the first read (legacy fw_cfg
> > > > > would be a little problematic but probably could be 'fixed' as well)
> > > > >
> > > > > But I can't say outright if it's doable for other targets,
> > > > > in general issue here is that '-S' pauses after machine_done is run
> > > > > and all necessary wiring board requires is finalized by then
> > > > > and no hooks run after unpause.
> > > > > If there is a general consensus to go this route, I can invest
> > > > > some time in making it work (then this series could be dropped)
> > > > >
> > > > > Even so, postponing set-numa to '-S' won't address Daniel's concern,
> > > > > i.e. configuration would take several round trips of command to complete
> > > > > potentially over a slow network. But as it was said libvirt can cache
> > > > > new CLI options for further reuse.
> > > >
> > > > We can cache stuff from the generic "-m none" invocation, but we won't
> > > > cache stuff from invocation of a specific VM instance, because we can't
> > > > have confidence that such data is independent of the VM config. So we
> > > In case if cpu layout we have fixed set of options that influence it
> > > (-M foo_vXX -smp ...), so from QEMU side it should be possible to
> > > promise it would stay stable.
> > > But such caching would be useful in other use cases as well.
> > > Is the issue in invalidating cached data in case of option(s) would
> > > change cached data?
> >
> > For the caching to be useful, we need to have a good cache hit rate.
> > If the cache depends on a lot of different CLI args, then you're going
> > to have to populate many caches each with low hit rate. The current
> > caching is done based on QEMU/libvirtd binary, so we have 1 cache miss
> > when QEMU or libvirt are upgraded, then 100% cache hit thereafter, so
> > the cache is very effective.
> With per domain cache one could also have about 100% hit rate every time
> the domain is started in case a new option does not invalidate cache.
Single-use VMs are a use case libvirt cares about, and in that
case the hit rate would be 0%.
...unless we specify more complex caching rules for
query-hotpluggable-cpus, which IMO would be more complex and
error-prone than simply allowing predictable
socket-index/core-index/thread-index values to identify CPU
slots.
(But, is the latency added by 2 or 3 QMP commands really an issue
here?)
>
> In case of cpu layout it will remove need for query-hotpluggable-cpus
> every time VM is started (when cpu hotplug is enabled) which libvirt
> does now.
>
> ...
> >
> > Regards,
> > Daniel
>
--
Eduardo
On Mon, Oct 23, 2017 at 11:49:13AM +0200, Igor Mammedov wrote:
> On Fri, 20 Oct 2017 12:21:00 -0200
> Eduardo Habkost <ehabkost@redhat.com> wrote:
>
> > On Fri, Oct 20, 2017 at 12:19:17PM +1100, David Gibson wrote:
> > > On Thu, Oct 19, 2017 at 10:15:48PM -0200, Eduardo Habkost wrote:
> > > > On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
> > > > > On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
> > > > > > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
> > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > ---
> > > > > > > include/sysemu/sysemu.h | 1 +
> > > > > > > qemu-options.hx | 15 ++++++++++++++
> > > > > > > qmp.c | 5 +++++
> > > > > > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > 4 files changed, 74 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > index b213696..3feb94f 100644
> > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
> > > > > > > QEMU_WAKEUP_REASON_OTHER,
> > > > > > > } WakeupReason;
> > > > > > >
> > > > > > > +void qemu_exit_preconfig_request(void);
> > > > > > > void qemu_system_reset_request(ShutdownCause reason);
> > > > > > > void qemu_system_suspend_request(void);
> > > > > > > void qemu_register_suspend_notifier(Notifier *notifier);
> > > > > > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > > > > > index 39225ae..bd44db8 100644
> > > > > > > --- a/qemu-options.hx
> > > > > > > +++ b/qemu-options.hx
> > > > > > > @@ -3498,6 +3498,21 @@ STEXI
> > > > > > > Run the emulation in single step mode.
> > > > > > > ETEXI
> > > > > > >
> > > > > > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
> > > > > > > + "-paused [state=]postconf|preconf\n"
> > > > > > > + " postconf: pause QEMU after machine is initialized\n"
> > > > > > > + " preconf: pause QEMU before machine is initialized\n",
> > > > > > > + QEMU_ARCH_ALL)
> > > > > >
> > > > > > I would like to allow pausing before machine-type is selected, so
> > > > > > management could run query-machines before choosing a
> > > > > > machine-type. Would that need a third "-pause" mode, or will we
> > > > > > be able to change "preconf" to pause before select_machine() is
> > > > > > called?
> > > > > >
> > > > > > The same probably applies to other things initialized before
> > > > > > machine_run_board_init() that could be configurable using QMP,
> > > > > > including but not limited to:
> > > > > > * Accelerator configuration
> > > > > > * Registering global properties
> > > > > > * RAM size
> > > > > > * SMP/CPU configuration
> > > > >
> > > > > Yeah.. having a bunch of different possible pause stages to select
> > > > > doesn't sound great.
> > > >
> > > > I agree. The number of externally visible pause states should be
> > > > as small as possible.
> > > >
> > > >
> > > > > Could we avoid this by instead changing -S to
> > > > > pause at the earliest possible spot, but having any monitor commands
> > > > > that require a later stage automatically "fast forwarding" to the
> > > > > right phase?
> > > >
> > > > That would hide the internal details from the outside. Sounds
> > > > nice, but adding new machine/device configuration QMP commands
> > > > while hiding the QEMU state from the outside sounds impossible.
> > > >
> > > > For example, if we use -S today, this works:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"query-cpus"}
> > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > >
> > > > This means "query-cpus" needs to fast-forward to the CPU creation
> > > > stage if we want to keep compatibility.
> > > >
> > > > Now, assume we add a set-numa-node command like the one in this
> > > > series. e.g.:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"set-numa-node" ... }
> > > > <- {"return": ...}
> > > >
> > > > The command will work only if machine initialization didn't run
> > > > yet.
> > > >
> > > > But now an innocent-looking query command would change QEMU state
> > > > in an unexpected way:
> > > >
> > > > $ qemu-system-x86_64 -S -qmp stdio
> > > > <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> > > > -> {"execute":"qmp_capabilities"}
> > > > <- {"return": {}}
> > > > -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> > > > <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> > > > -> {"execute":"set-numa-node" ... }
> > > > <- {"error": ...} [the command will fail because the machine was already created]
> > > >
> > > > This means we do have an externally visible "too late to use
> > > > set-numa-node" QEMU state, and query-cpus will have an externally
> > > > visible side effect. Every QMP command would need to document
> > > > how it affects QEMU state in an externally visible way.
> > > >
> > > > If QEMU pause state is still going to be externally visible this
> > > > way, I would prefer to let the client explicitly tell QEMU which
> > > > state they want it to be in, instead of making QEMU change
> > > > state silently as a side effect of QMP commands.
> > >
> > > Yeah, good point. My proposal would just have changed explicitly
> > > exposed ugly internal state to subtly exposed ugly internal state,
> > > which is probably worse :(.
> > >
> > >
> > > Ok.. next possibly bad idea..
> > >
> > > What about a "re-exec" monitor command; it would take what's
> > > essentially a new command line, and basically restart qemu from the
> > > beginning, reparsing this new command line, but without actually
> > >
> > > Pro:
> > > * Mitigates Daniel Berrange's concern about lots of qemu
> > > configuration being buried in the qmp session - if libvirt logged
> > > its last "re-exec" that would have what is generally needed.
> > > * Lets libvirt do assorted investigation of options, then rewind to
> > > choose what it actually wants
> >
> > Sounds like a superset of Paolo's "-machine none" proposal[1].
> > It would be a very simple interface, not sure it can be easily
> > implemented efficiently.
> >
> > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg488618.html
> >
> > >
> > > Con:
> > > * Would require a bunch of auditing of structures/state to make sure
> > > they can be re-initialized cleanly
> >
> > This sounds like a big obstacle. QEMU still has too much global
> > state outside the machine/qdev tree.
> >
> >
> > > * Would it be fast enough for libvirt to use? Do we know if the
> > > slowness which makes multiple qemu invocations by libvirt
> > > unattractive is from the kernel/libc/ldso overhead, or from qemu's
> > > internal start up processing?
> >
> > My gut feeling is that this could be too slow, if the scope of
> > "re-exec" is too big.
> >
> >
> > Now, let me try to go to the opposite extreme: I think you had a
> > good point in your previous proposal. Why should we need to
> > restart/re-execute anything at all just because some bit of
> > configuration is being changed by libvirt? Why commands like
> > set-numa-node should require QEMU to be in a state that is not
> > covered by -S? If the guest is not running yet, there should be
> > no reason to require clients to explicitly pause/continue/restart
> > anything.
> It's probably doable to do numa config at '-S' time for x86 (arm),
> since ACPI tables are regenerated on the first read (legacy fw_cfg
> would be a little problematic but probably could be 'fixed' as well)
>
> But I can't say outright whether it's doable for other targets.
> In general, the issue here is that '-S' pauses after machine_done is run;
> all the wiring the board requires is finalized by then,
> and no hooks run after unpause.
> If there is a general consensus to go this route, I can invest
> some time in making it work (then this series could be dropped)
My argument is that it must always be possible to change
configuration using -S (before issuing a 'cont' command), because
the guest is not running at all. If current QEMU code makes that
difficult, we should address it internally in QEMU.
>
> Even so, postponing set-numa to '-S' won't address Daniel's concern,
> i.e. configuration would take several command round trips to complete,
> potentially over a slow network. But as was said, libvirt can cache
> new CLI options for further reuse.
> Whether that is slower or faster than starting qemu with '-M foo -smp ...' +
> querying layout and then restarting it again with -numa options
> would depend on network speed.
True, my argument doesn't address that concern. But I expect QMP
configuration commands to be always done through a local socket,
so this is just about the added latency for local QMP round
trips.
--
Eduardo
Eduardo Habkost <ehabkost@redhat.com> writes:
> On Thu, Oct 19, 2017 at 09:42:18PM +1100, David Gibson wrote:
>> On Mon, Oct 16, 2017 at 02:59:16PM -0200, Eduardo Habkost wrote:
>> > On Mon, Oct 16, 2017 at 06:22:54PM +0200, Igor Mammedov wrote:
>> > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
>> > > ---
>> > > include/sysemu/sysemu.h | 1 +
>> > > qemu-options.hx | 15 ++++++++++++++
>> > > qmp.c | 5 +++++
>> > > vl.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-
>> > > 4 files changed, 74 insertions(+), 1 deletion(-)
>> > >
>> > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
>> > > index b213696..3feb94f 100644
>> > > --- a/include/sysemu/sysemu.h
>> > > +++ b/include/sysemu/sysemu.h
>> > > @@ -66,6 +66,7 @@ typedef enum WakeupReason {
>> > > QEMU_WAKEUP_REASON_OTHER,
>> > > } WakeupReason;
>> > >
>> > > +void qemu_exit_preconfig_request(void);
>> > > void qemu_system_reset_request(ShutdownCause reason);
>> > > void qemu_system_suspend_request(void);
>> > > void qemu_register_suspend_notifier(Notifier *notifier);
>> > > diff --git a/qemu-options.hx b/qemu-options.hx
>> > > index 39225ae..bd44db8 100644
>> > > --- a/qemu-options.hx
>> > > +++ b/qemu-options.hx
>> > > @@ -3498,6 +3498,21 @@ STEXI
>> > > Run the emulation in single step mode.
>> > > ETEXI
>> > >
>> > > +DEF("paused", HAS_ARG, QEMU_OPTION_paused, \
>> > > + "-paused [state=]postconf|preconf\n"
>> > > + " postconf: pause QEMU after machine is initialized\n"
>> > > + " preconf: pause QEMU before machine is initialized\n",
>> > > + QEMU_ARCH_ALL)
>> >
>> > I would like to allow pausing before machine-type is selected, so
>> > management could run query-machines before choosing a
>> > machine-type. Would that need a third "-pause" mode, or will we
>> > be able to change "preconf" to pause before select_machine() is
>> > called?
>> >
>> > The same probably applies to other things initialized before
>> > machine_run_board_init() that could be configurable using QMP,
>> > including but not limited to:
>> > * Accelerator configuration
>> > * Registering global properties
>> > * RAM size
>> > * SMP/CPU configuration
>>
>> Yeah.. having a bunch of different possible pause stages to select
>> doesn't sound great.
>
> I agree. The number of externally visible pause states should be
> as small as possible.
--pause isn't overly descriptive either. Maybe something like
--wait-for-dynamic-config, which is a mouthful but makes it clearer why
you would want this over -S.
>
>
>> Could we avoid this by instead changing -S to
>> pause at the earliest possible spot, but having any monitor commands
>> that require a later stage automatically "fast forwarding" to the
>> right phase?
>
> That would hide the internal details from the outside. Sounds
> nice, but adding new machine/device configuration QMP commands
> while hiding the QEMU state from the outside sounds impossible.
>
> For example, if we use -S today, this works:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"query-cpus"}
> <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
>
> This means "query-cpus" needs to fast-forward to the CPU creation
> stage if we want to keep compatibility.
>
> Now, assume we add a set-numa-node command like the one in this
> series. e.g.:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"set-numa-node" ... }
> <- {"return": ...}
>
> The command will work only if machine initialization didn't run
> yet.
>
> But now an innocent-looking query command would change QEMU state
> in an unexpected way:
>
> $ qemu-system-x86_64 -S -qmp stdio
> <- {"QMP": {"version": {"qemu": {"micro": 0, "minor": 10, "major": 2}, "package": " (v2.10.0-83-g9375da7831)"}, "capabilities": []}}
> -> {"execute":"qmp_capabilities"}
> <- {"return": {}}
> -> {"execute":"query-cpus"} [will silently fast-forward QEMU state]
> <- {"return": [{"arch": "x86", "current": true, "props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "CPU": 0, "qom_path": "/machine/unattached/device[0]", "pc": 4294967280, "halted": false, "thread_id": 4038}]}
> -> {"execute":"set-numa-node" ... }
> <- {"error": ...} [the command will fail because the machine was already created]
>
> This means we do have an externally visible "too late to use
> set-numa-node" QEMU state, and query-cpus will have an externally
> visible side effect. Every QMP command would need to document
> how it affects QEMU state in an externally visible way.
>
> If QEMU pause state is still going to be externally visible this
> way, I would prefer to let the client explicitly tell QEMU which
> state they want it to be in, instead of making QEMU change
> state silently as a side effect of QMP commands.
>
>>
> [...]
--
Alex Bennée
© 2016 - 2026 Red Hat, Inc.