Changing domain definition must be guarded with acquiring modify
job. The problem is if there is a thread executing say
qemuDomainSetMemoryStatsPeriod() which is an API that acquires
modify job and then possibly unlock the domain object and locks
it again. However, becasue virDomainObjAssignDef() does not take
a job (only object lock) it may have changed the domain
definition while the other thread unlocked the domain object in
order to talk on the monitor. For instance:
Thread1:
1) lookup domain object and lock it
2) acquire job
3) get pointers to live and persistent defs
4) unlock the domain object
5) talk to qemu on the monitor
Thread2 (Execute qemuDomainDefineXMLFlags):
1) lookup domain object and lock it
2) virDomainObjAssignDef() which overwrites persistent def and
frees the old one
3) unlock domain object
Thread1:
6) lock the domain object again
7) try to access the persistent def via pointer stored in 3)
Now this will crash because the pointer from step 3) points to a
memory that was freed.
However, if Thread2 waited and acquired modify job (just like
Thread1) then these two would serialize and no harm would be
done.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
src/qemu/qemu_domain.c | 3 ++-
src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++----
src/qemu/qemu_migration.c | 7 +++++++
3 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index c61d39b12b..fff2f1932e 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -8784,7 +8784,8 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver,
return;
}
- qemuDomainRemoveInactiveCommon(driver, vm);
+ if (vm->def)
+ qemuDomainRemoveInactiveCommon(driver, vm);
virDomainObjListRemove(driver->domains, vm);
}
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 8bbac339e0..77cb7e4b87 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -1706,9 +1706,16 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn,
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE)))
goto cleanup;
+ if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
+ qemuDomainRemoveInactive(driver, vm);
+ goto cleanup;
+ }
+
virDomainObjAssignDef(vm, def, true, NULL);
def = NULL;
+ qemuDomainObjEndJob(driver, vm);
+
if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_START,
flags) < 0) {
qemuDomainRemoveInactiveJob(driver, vm);
@@ -6976,6 +6983,11 @@ qemuDomainRestoreFlags(virConnectPtr conn,
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE)))
goto cleanup;
+ if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
+ qemuDomainRemoveInactive(driver, vm);
+ goto cleanup;
+ }
+
virDomainObjAssignDef(vm, def, true, NULL);
def = NULL;
@@ -6988,6 +7000,7 @@ qemuDomainRestoreFlags(virConnectPtr conn,
priv = vm->privateData;
priv->hookRun = true;
}
+ qemuDomainObjEndJob(driver, vm);
if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_RESTORE,
flags) < 0)
@@ -7651,6 +7664,11 @@ qemuDomainDefineXMLFlags(virConnectPtr conn,
driver->xmlopt, 0)))
goto cleanup;
+ if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
+ qemuDomainRemoveInactive(driver, vm);
+ goto cleanup;
+ }
+
virDomainObjAssignDef(vm, def, false, &oldDef);
def = NULL;
@@ -7673,7 +7691,7 @@ qemuDomainDefineXMLFlags(virConnectPtr conn,
vm->persistent = 0;
qemuDomainRemoveInactiveJob(driver, vm);
}
- goto cleanup;
+ goto endjob;
}
event = virDomainEventLifecycleNewFromObj(vm,
@@ -7685,6 +7703,9 @@ qemuDomainDefineXMLFlags(virConnectPtr conn,
VIR_INFO("Creating domain '%s'", vm->def->name);
dom = virGetDomain(conn, vm->def->name, vm->def->uuid, vm->def->id);
+ endjob:
+ qemuDomainObjEndJob(driver, vm);
+
cleanup:
virDomainDefFree(oldDef);
virDomainDefFree(def);
@@ -16889,14 +16910,14 @@ static virDomainPtr qemuDomainQemuAttach(virConnectPtr conn,
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE)))
goto cleanup;
- virDomainObjAssignDef(vm, def, true, NULL);
- def = NULL;
-
if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
qemuDomainRemoveInactive(driver, vm);
goto cleanup;
}
+ virDomainObjAssignDef(vm, def, true, NULL);
+ def = NULL;
+
if (qemuProcessAttach(conn, driver, vm, pid,
pidfile, monConfig, monJSON) < 0) {
qemuDomainRemoveInactive(driver, vm);
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 9e19c923ee..32325c9588 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -2408,9 +2408,16 @@ qemuMigrationDstPrepareAny(virQEMUDriverPtr driver,
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE)))
goto cleanup;
+ if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
+ qemuDomainRemoveInactive(driver, vm);
+ goto cleanup;
+ }
+
virDomainObjAssignDef(vm, *def, true, NULL);
*def = NULL;
+ qemuDomainObjEndJob(driver, vm);
+
priv = vm->privateData;
if (VIR_STRDUP(priv->origname, origname) < 0)
goto cleanup;
--
2.21.0
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On 30.05.2019 12:34, Michal Privoznik wrote: > Changing domain definition must be guarded with acquiring modify > job. The problem is if there is a thread executing say > qemuDomainSetMemoryStatsPeriod() which is an API that acquires > modify job and then possibly unlock the domain object and locks > it again. However, becasue virDomainObjAssignDef() does not take > a job (only object lock) it may have changed the domain > definition while the other thread unlocked the domain object in > order to talk on the monitor. For instance: > > Thread1: > 1) lookup domain object and lock it > 2) acquire job > 3) get pointers to live and persistent defs > 4) unlock the domain object > 5) talk to qemu on the monitor > > Thread2 (Execute qemuDomainDefineXMLFlags): > 1) lookup domain object and lock it > 2) virDomainObjAssignDef() which overwrites persistent def and > frees the old one > 3) unlock domain object > > Thread1: > 6) lock the domain object again > 7) try to access the persistent def via pointer stored in 3) > > Now this will crash because the pointer from step 3) points to a > memory that was freed. > > However, if Thread2 waited and acquired modify job (just like > Thread1) then these two would serialize and no harm would be > done. > > Signed-off-by: Michal Privoznik <mprivozn@redhat.com> > --- > src/qemu/qemu_domain.c | 3 ++- > src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++---- > src/qemu/qemu_migration.c | 7 +++++++ > 3 files changed, 34 insertions(+), 5 deletions(-) > > diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c > index c61d39b12b..fff2f1932e 100644 > --- a/src/qemu/qemu_domain.c > +++ b/src/qemu/qemu_domain.c > @@ -8784,7 +8784,8 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver, > return; > } > > - qemuDomainRemoveInactiveCommon(driver, vm); > + if (vm->def) > + qemuDomainRemoveInactiveCommon(driver, vm); > > virDomainObjListRemove(driver->domains, vm); Hmm, virDomainObjListRemoveLocked uses vm->def too. > } > diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c > index 8bbac339e0..77cb7e4b87 100644 > --- a/src/qemu/qemu_driver.c > +++ b/src/qemu/qemu_driver.c > @@ -1706,9 +1706,16 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn, > VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) > goto cleanup; > > + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { > + qemuDomainRemoveInactive(driver, vm); > + goto cleanup; > + } > + > virDomainObjAssignDef(vm, def, true, NULL); > def = NULL; > > + qemuDomainObjEndJob(driver, vm); > + Looks like we can begin VIR_DOMAIN_JOB_OPERATION_START from the start instead of acquiring job twice. > if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_START, > flags) < 0) { > qemuDomainRemoveInactiveJob(driver, vm); > @@ -6976,6 +6983,11 @@ qemuDomainRestoreFlags(virConnectPtr conn, > VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) > goto cleanup; > > + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { > + qemuDomainRemoveInactive(driver, vm); > + goto cleanup; > + } > + > virDomainObjAssignDef(vm, def, true, NULL); > def = NULL; > > @@ -6988,6 +7000,7 @@ qemuDomainRestoreFlags(virConnectPtr conn, > priv = vm->privateData; > priv->hookRun = true; > }> + qemuDomainObjEndJob(driver, vm); Same here > > if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_RESTORE, > flags) < 0) > @@ -7651,6 +7664,11 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, > driver->xmlopt, 0))) > goto cleanup; > > + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { > + qemuDomainRemoveInactive(driver, vm); > + goto cleanup; > + } > + > virDomainObjAssignDef(vm, def, false, &oldDef); > def = NULL; > > @@ -7673,7 +7691,7 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, > vm->persistent = 0; > qemuDomainRemoveInactiveJob(driver, vm); > } > - goto cleanup; > + goto endjob; > } > > event = virDomainEventLifecycleNewFromObj(vm, > @@ -7685,6 +7703,9 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, > VIR_INFO("Creating domain '%s'", vm->def->name); > dom = virGetDomain(conn, vm->def->name, vm->def->uuid, vm->def->id); > > + endjob: > + qemuDomainObjEndJob(driver, vm); > + > cleanup: > virDomainDefFree(oldDef); > virDomainDefFree(def); Additionally we need to replace qemuDomainRemoveInactiveJob with qemuDomainRemoveInactive as we have a job already. > @@ -16889,14 +16910,14 @@ static virDomainPtr qemuDomainQemuAttach(virConnectPtr conn, > VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) > goto cleanup; > > - virDomainObjAssignDef(vm, def, true, NULL); > - def = NULL; > - > if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { > qemuDomainRemoveInactive(driver, vm); > goto cleanup; > } > > + virDomainObjAssignDef(vm, def, true, NULL); > + def = NULL; > + > if (qemuProcessAttach(conn, driver, vm, pid, > pidfile, monConfig, monJSON) < 0) { > qemuDomainRemoveInactive(driver, vm); > diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c > index 9e19c923ee..32325c9588 100644 > --- a/src/qemu/qemu_migration.c > +++ b/src/qemu/qemu_migration.c > @@ -2408,9 +2408,16 @@ qemuMigrationDstPrepareAny(virQEMUDriverPtr driver, > VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) > goto cleanup; > > + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { > + qemuDomainRemoveInactive(driver, vm); > + goto cleanup; > + } > + > virDomainObjAssignDef(vm, *def, true, NULL); > *def = NULL; > > + qemuDomainObjEndJob(driver, vm); > + > priv = vm->privateData; > if (VIR_STRDUP(priv->origname, origname) < 0) > goto cleanup; > On migration we acquire job already too ... Nikolay -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On 6/4/19 11:07 AM, Nikolay Shirokovskiy wrote: > > > On 30.05.2019 12:34, Michal Privoznik wrote: >> Changing domain definition must be guarded with acquiring modify >> job. The problem is if there is a thread executing say >> qemuDomainSetMemoryStatsPeriod() which is an API that acquires >> modify job and then possibly unlock the domain object and locks >> it again. However, becasue virDomainObjAssignDef() does not take >> a job (only object lock) it may have changed the domain >> definition while the other thread unlocked the domain object in >> order to talk on the monitor. For instance: >> >> Thread1: >> 1) lookup domain object and lock it >> 2) acquire job >> 3) get pointers to live and persistent defs >> 4) unlock the domain object >> 5) talk to qemu on the monitor >> >> Thread2 (Execute qemuDomainDefineXMLFlags): >> 1) lookup domain object and lock it >> 2) virDomainObjAssignDef() which overwrites persistent def and >> frees the old one >> 3) unlock domain object >> >> Thread1: >> 6) lock the domain object again >> 7) try to access the persistent def via pointer stored in 3) >> >> Now this will crash because the pointer from step 3) points to a >> memory that was freed. >> >> However, if Thread2 waited and acquired modify job (just like >> Thread1) then these two would serialize and no harm would be >> done. >> >> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> >> --- >> src/qemu/qemu_domain.c | 3 ++- >> src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++---- >> src/qemu/qemu_migration.c | 7 +++++++ >> 3 files changed, 34 insertions(+), 5 deletions(-) >> >> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c >> index c61d39b12b..fff2f1932e 100644 >> --- a/src/qemu/qemu_domain.c >> +++ b/src/qemu/qemu_domain.c >> @@ -8784,7 +8784,8 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver, >> return; >> } >> >> - qemuDomainRemoveInactiveCommon(driver, vm); >> + if (vm->def) >> + qemuDomainRemoveInactiveCommon(driver, vm); >> >> virDomainObjListRemove(driver->domains, vm); > > Hmm, virDomainObjListRemoveLocked uses vm->def too. > >> } >> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c >> index 8bbac339e0..77cb7e4b87 100644 >> --- a/src/qemu/qemu_driver.c >> +++ b/src/qemu/qemu_driver.c >> @@ -1706,9 +1706,16 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn, >> VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) >> goto cleanup; >> >> + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { >> + qemuDomainRemoveInactive(driver, vm); >> + goto cleanup; >> + } >> + >> virDomainObjAssignDef(vm, def, true, NULL); >> def = NULL; >> >> + qemuDomainObjEndJob(driver, vm); >> + > > Looks like we can begin VIR_DOMAIN_JOB_OPERATION_START from the start instead > of acquiring job twice. That is an async job. We shouldn't rely on async job when modifying a domain. > >> if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_START, >> flags) < 0) { >> qemuDomainRemoveInactiveJob(driver, vm); >> @@ -6976,6 +6983,11 @@ qemuDomainRestoreFlags(virConnectPtr conn, >> VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) >> goto cleanup; >> >> + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { >> + qemuDomainRemoveInactive(driver, vm); >> + goto cleanup; >> + } >> + >> virDomainObjAssignDef(vm, def, true, NULL); >> def = NULL; >> >> @@ -6988,6 +7000,7 @@ qemuDomainRestoreFlags(virConnectPtr conn, >> priv = vm->privateData; >> priv->hookRun = true; >> }> + qemuDomainObjEndJob(driver, vm); > > Same here > >> >> if (qemuProcessBeginJob(driver, vm, VIR_DOMAIN_JOB_OPERATION_RESTORE, >> flags) < 0) >> @@ -7651,6 +7664,11 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, >> driver->xmlopt, 0))) >> goto cleanup; >> >> + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { >> + qemuDomainRemoveInactive(driver, vm); >> + goto cleanup; >> + } >> + >> virDomainObjAssignDef(vm, def, false, &oldDef); >> def = NULL; >> >> @@ -7673,7 +7691,7 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, >> vm->persistent = 0; >> qemuDomainRemoveInactiveJob(driver, vm); >> } >> - goto cleanup; >> + goto endjob; >> } >> >> event = virDomainEventLifecycleNewFromObj(vm, >> @@ -7685,6 +7703,9 @@ qemuDomainDefineXMLFlags(virConnectPtr conn, >> VIR_INFO("Creating domain '%s'", vm->def->name); >> dom = virGetDomain(conn, vm->def->name, vm->def->uuid, vm->def->id); >> >> + endjob: >> + qemuDomainObjEndJob(driver, vm); >> + >> cleanup: >> virDomainDefFree(oldDef); >> virDomainDefFree(def); > > Additionally we need to replace qemuDomainRemoveInactiveJob with qemuDomainRemoveInactive > as we have a job already. I'm planning on wrapping what you suggested in your reply to the cover letter into a single function. This way, the change won't be that intrusive and a lot of these changes won't be made. Michal -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
On 04.06.2019 17:23, Michal Privoznik wrote: > On 6/4/19 11:07 AM, Nikolay Shirokovskiy wrote: >> >> >> On 30.05.2019 12:34, Michal Privoznik wrote: >>> Changing domain definition must be guarded with acquiring modify >>> job. The problem is if there is a thread executing say >>> qemuDomainSetMemoryStatsPeriod() which is an API that acquires >>> modify job and then possibly unlock the domain object and locks >>> it again. However, becasue virDomainObjAssignDef() does not take >>> a job (only object lock) it may have changed the domain >>> definition while the other thread unlocked the domain object in >>> order to talk on the monitor. For instance: >>> >>> Thread1: >>> 1) lookup domain object and lock it >>> 2) acquire job >>> 3) get pointers to live and persistent defs >>> 4) unlock the domain object >>> 5) talk to qemu on the monitor >>> >>> Thread2 (Execute qemuDomainDefineXMLFlags): >>> 1) lookup domain object and lock it >>> 2) virDomainObjAssignDef() which overwrites persistent def and >>> frees the old one >>> 3) unlock domain object >>> >>> Thread1: >>> 6) lock the domain object again >>> 7) try to access the persistent def via pointer stored in 3) >>> >>> Now this will crash because the pointer from step 3) points to a >>> memory that was freed. >>> >>> However, if Thread2 waited and acquired modify job (just like >>> Thread1) then these two would serialize and no harm would be >>> done. >>> >>> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> >>> --- >>> src/qemu/qemu_domain.c | 3 ++- >>> src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++---- >>> src/qemu/qemu_migration.c | 7 +++++++ >>> 3 files changed, 34 insertions(+), 5 deletions(-) >>> >>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c >>> index c61d39b12b..fff2f1932e 100644 >>> --- a/src/qemu/qemu_domain.c >>> +++ b/src/qemu/qemu_domain.c >>> @@ -8784,7 +8784,8 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver, >>> return; >>> } >>> - qemuDomainRemoveInactiveCommon(driver, vm); >>> + if (vm->def) >>> + qemuDomainRemoveInactiveCommon(driver, vm); >>> virDomainObjListRemove(driver->domains, vm); >> >> Hmm, virDomainObjListRemoveLocked uses vm->def too. >> >>> } >>> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c >>> index 8bbac339e0..77cb7e4b87 100644 >>> --- a/src/qemu/qemu_driver.c >>> +++ b/src/qemu/qemu_driver.c >>> @@ -1706,9 +1706,16 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn, >>> VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE))) >>> goto cleanup; >>> + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) { >>> + qemuDomainRemoveInactive(driver, vm); >>> + goto cleanup; >>> + } >>> + >>> virDomainObjAssignDef(vm, def, true, NULL); >>> def = NULL; >>> + qemuDomainObjEndJob(driver, vm); >>> + >> >> Looks like we can begin VIR_DOMAIN_JOB_OPERATION_START from the start instead >> of acquiring job twice. > > That is an async job. We shouldn't rely on async job when modifying a domain. > To me it looks ok. We don't drop vm lock between begin job and assigning definition so other threads have no chance to see invalid state. Nikolay -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
© 2016 - 2024 Red Hat, Inc.