Introduce a new field, memory_attribute_manager, in RAMBlock to link to
an MemoryAttributeManager object. This change centralizes all
guest_memfd state information (like fd and shared_bitmap) within a
RAMBlock, making it easier to manage.
Use the realize()/unrealize() helpers to initialize/uninitialize the
MemoryAttributeManager object. Register/unregister the object in the
target RAMBlock's MemoryRegion when creating guest_memfd.
In the kvm_convert_memory() function, manage memory state changes by
using the shared_bitmap to call set_attribute() only on the specific
memory range. Additionally, use the
memory_attribute_manager_state_change() helper to notify the reigstered
RamDiscardListener of these changes.
Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
---
Changes in v3:
- Use ram_discard_manager_reply_populated/discarded() to set the
memory attribute and add the undo support if state_change()
failed.
- Didn't add Reviewed-by from Alexey due to the new changes in this
commit.
Changes in v2:
- Introduce a new field memory_attribute_manager in RAMBlock.
- Move the state_change() handling during page conversion in this patch.
- Undo what we did if it fails to set.
- Change the order of close(guest_memfd) and memory_attribute_manager cleanup.
---
accel/kvm/kvm-all.c | 50 +++++++++++++++++++++++++++++++++++++++--
include/exec/ramblock.h | 2 ++
system/physmem.c | 13 +++++++++++
3 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c1fea69d58..a89c5655e8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -48,6 +48,7 @@
#include "kvm-cpus.h"
#include "system/dirtylimit.h"
#include "qemu/range.h"
+#include "system/memory-attribute-manager.h"
#include "hw/boards.h"
#include "system/stats.h"
@@ -3018,6 +3019,25 @@ static void kvm_eat_signals(CPUState *cpu)
} while (sigismember(&chkset, SIG_IPI));
}
+typedef struct SetMemoryAttribute {
+ bool to_private;
+} SetMemoryAttribute;
+
+static int kvm_set_memory_attributes_cb(MemoryRegionSection *section,
+ void *opaque)
+{
+ hwaddr start = section->offset_within_address_space;
+ hwaddr size = section->size;
+ SetMemoryAttribute *args = opaque;
+ bool to_private = args->to_private;
+
+ if (to_private) {
+ return kvm_set_memory_attributes_private(start, size);
+ } else {
+ return kvm_set_memory_attributes_shared(start, size);
+ }
+}
+
int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
{
MemoryRegionSection section;
@@ -3026,6 +3046,7 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
RAMBlock *rb;
void *addr;
int ret = -EINVAL;
+ SetMemoryAttribute args = { .to_private = to_private };
trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
@@ -3077,9 +3098,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
}
if (to_private) {
- ret = kvm_set_memory_attributes_private(start, size);
+ ret = ram_discard_manager_replay_populated(mr->rdm, §ion,
+ kvm_set_memory_attributes_cb,
+ &args);
} else {
- ret = kvm_set_memory_attributes_shared(start, size);
+ ret = ram_discard_manager_replay_discarded(mr->rdm, §ion,
+ kvm_set_memory_attributes_cb,
+ &args);
}
if (ret) {
goto out_unref;
@@ -3088,6 +3113,27 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
rb = qemu_ram_block_from_host(addr, false, &offset);
+ ret = memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm),
+ offset, size, to_private);
+ if (ret) {
+ warn_report("Failed to notify the listener the state change of "
+ "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s",
+ start, size, to_private ? "private" : "shared");
+ args.to_private = !to_private;
+ if (to_private) {
+ ret = ram_discard_manager_replay_populated(mr->rdm, §ion,
+ kvm_set_memory_attributes_cb,
+ &args);
+ } else {
+ ret = ram_discard_manager_replay_discarded(mr->rdm, §ion,
+ kvm_set_memory_attributes_cb,
+ &args);
+ }
+ if (ret) {
+ goto out_unref;
+ }
+ }
+
if (to_private) {
if (rb->page_size != qemu_real_host_page_size()) {
/*
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 0babd105c0..06fd365326 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -23,6 +23,7 @@
#include "cpu-common.h"
#include "qemu/rcu.h"
#include "exec/ramlist.h"
+#include "system/memory-attribute-manager.h"
struct RAMBlock {
struct rcu_head rcu;
@@ -42,6 +43,7 @@ struct RAMBlock {
int fd;
uint64_t fd_offset;
int guest_memfd;
+ MemoryAttributeManager *memory_attribute_manager;
size_t page_size;
/* dirty bitmap used during migration */
unsigned long *bmap;
diff --git a/system/physmem.c b/system/physmem.c
index c76503aea8..0ed394c5d2 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -54,6 +54,7 @@
#include "system/hostmem.h"
#include "system/hw_accel.h"
#include "system/xen-mapcache.h"
+#include "system/memory-attribute-manager.h"
#include "trace.h"
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
@@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
qemu_mutex_unlock_ramlist();
goto out_free;
}
+
+ new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER));
+ if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) {
+ error_setg(errp, "Failed to realize memory attribute manager");
+ object_unref(OBJECT(new_block->memory_attribute_manager));
+ close(new_block->guest_memfd);
+ ram_block_discard_require(false);
+ qemu_mutex_unlock_ramlist();
+ goto out_free;
+ }
}
ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
@@ -2138,6 +2149,8 @@ static void reclaim_ramblock(RAMBlock *block)
}
if (block->guest_memfd >= 0) {
+ memory_attribute_manager_unrealize(block->memory_attribute_manager);
+ object_unref(OBJECT(block->memory_attribute_manager));
close(block->guest_memfd);
ram_block_discard_require(false);
}
--
2.43.5
Hi, On Mon, Mar 10, 2025 at 04:18:34PM +0800, Chenyi Qiang wrote: > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) > qemu_mutex_unlock_ramlist(); > goto out_free; > } > + > + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); > + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { > + error_setg(errp, "Failed to realize memory attribute manager"); > + object_unref(OBJECT(new_block->memory_attribute_manager)); > + close(new_block->guest_memfd); > + ram_block_discard_require(false); > + qemu_mutex_unlock_ramlist(); > + goto out_free; > + } > } > > ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; Might as well put the above into a separate memory manager init function to start with. It keeps the goto out_free error path unified, and makes things more future proof if the rest of ram_block_add() ever develops a need to check for errors. Regards, Tony
On 3/17/2025 2:18 PM, Tony Lindgren wrote: > Hi, > > On Mon, Mar 10, 2025 at 04:18:34PM +0800, Chenyi Qiang wrote: >> --- a/system/physmem.c >> +++ b/system/physmem.c >> @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) >> qemu_mutex_unlock_ramlist(); >> goto out_free; >> } >> + >> + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); >> + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { >> + error_setg(errp, "Failed to realize memory attribute manager"); >> + object_unref(OBJECT(new_block->memory_attribute_manager)); >> + close(new_block->guest_memfd); >> + ram_block_discard_require(false); >> + qemu_mutex_unlock_ramlist(); >> + goto out_free; >> + } >> } >> >> ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; > > Might as well put the above into a separate memory manager init function > to start with. It keeps the goto out_free error path unified, and makes > things more future proof if the rest of ram_block_add() ever develops a > need to check for errors. Which part to be defined in a separate function? The init function of object_new() + realize(), or the error handling operation (object_unref() + close() + ram_block_discard_require(false))? If need to check for errors in the rest of ram_block_add() in future, how about adding a new label before out_free and move the error handling there? > > Regards, > > Tony
On Mon, Mar 17, 2025 at 03:32:16PM +0800, Chenyi Qiang wrote: > > > On 3/17/2025 2:18 PM, Tony Lindgren wrote: > > Hi, > > > > On Mon, Mar 10, 2025 at 04:18:34PM +0800, Chenyi Qiang wrote: > >> --- a/system/physmem.c > >> +++ b/system/physmem.c > >> @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) > >> qemu_mutex_unlock_ramlist(); > >> goto out_free; > >> } > >> + > >> + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); > >> + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { > >> + error_setg(errp, "Failed to realize memory attribute manager"); > >> + object_unref(OBJECT(new_block->memory_attribute_manager)); > >> + close(new_block->guest_memfd); > >> + ram_block_discard_require(false); > >> + qemu_mutex_unlock_ramlist(); > >> + goto out_free; > >> + } > >> } > >> > >> ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; > > > > Might as well put the above into a separate memory manager init function > > to start with. It keeps the goto out_free error path unified, and makes > > things more future proof if the rest of ram_block_add() ever develops a > > need to check for errors. > > Which part to be defined in a separate function? The init function of > object_new() + realize(), or the error handling operation > (object_unref() + close() + ram_block_discard_require(false))? I was thinking the whole thing, including freeing :) But maybe there's something more to consider to keep calls paired. > If need to check for errors in the rest of ram_block_add() in future, > how about adding a new label before out_free and move the error handling > there? Yeah that would work too. Regards, Tony
On 3/17/2025 5:45 PM, Tony Lindgren wrote: > On Mon, Mar 17, 2025 at 03:32:16PM +0800, Chenyi Qiang wrote: >> >> >> On 3/17/2025 2:18 PM, Tony Lindgren wrote: >>> Hi, >>> >>> On Mon, Mar 10, 2025 at 04:18:34PM +0800, Chenyi Qiang wrote: >>>> --- a/system/physmem.c >>>> +++ b/system/physmem.c >>>> @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) >>>> qemu_mutex_unlock_ramlist(); >>>> goto out_free; >>>> } >>>> + >>>> + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); >>>> + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { >>>> + error_setg(errp, "Failed to realize memory attribute manager"); >>>> + object_unref(OBJECT(new_block->memory_attribute_manager)); >>>> + close(new_block->guest_memfd); >>>> + ram_block_discard_require(false); >>>> + qemu_mutex_unlock_ramlist(); >>>> + goto out_free; >>>> + } >>>> } >>>> >>>> ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; >>> >>> Might as well put the above into a separate memory manager init function >>> to start with. It keeps the goto out_free error path unified, and makes >>> things more future proof if the rest of ram_block_add() ever develops a >>> need to check for errors. >> >> Which part to be defined in a separate function? The init function of >> object_new() + realize(), or the error handling operation >> (object_unref() + close() + ram_block_discard_require(false))? > > I was thinking the whole thing, including freeing :) But maybe there's > something more to consider to keep calls paired. If putting the whole thing separately, I think the rest part to do error handling still needs to add the same operation. Or I misunderstand something? > >> If need to check for errors in the rest of ram_block_add() in future, >> how about adding a new label before out_free and move the error handling >> there? > > Yeah that would work too. I'm not sure if we should add such change directly, or we can wait for the real error check introduced in future. > > Regards, > > Tony
On Mon, Mar 17, 2025 at 06:21:13PM +0800, Chenyi Qiang wrote: > > > On 3/17/2025 5:45 PM, Tony Lindgren wrote: > > On Mon, Mar 17, 2025 at 03:32:16PM +0800, Chenyi Qiang wrote: > >> > >> > >> On 3/17/2025 2:18 PM, Tony Lindgren wrote: > >>> Hi, > >>> > >>> On Mon, Mar 10, 2025 at 04:18:34PM +0800, Chenyi Qiang wrote: > >>>> --- a/system/physmem.c > >>>> +++ b/system/physmem.c > >>>> @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) > >>>> qemu_mutex_unlock_ramlist(); > >>>> goto out_free; > >>>> } > >>>> + > >>>> + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); > >>>> + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { > >>>> + error_setg(errp, "Failed to realize memory attribute manager"); > >>>> + object_unref(OBJECT(new_block->memory_attribute_manager)); > >>>> + close(new_block->guest_memfd); > >>>> + ram_block_discard_require(false); > >>>> + qemu_mutex_unlock_ramlist(); > >>>> + goto out_free; > >>>> + } > >>>> } > >>>> > >>>> ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; > >>> > >>> Might as well put the above into a separate memory manager init function > >>> to start with. It keeps the goto out_free error path unified, and makes > >>> things more future proof if the rest of ram_block_add() ever develops a > >>> need to check for errors. > >> > >> Which part to be defined in a separate function? The init function of > >> object_new() + realize(), or the error handling operation > >> (object_unref() + close() + ram_block_discard_require(false))? > > > > I was thinking the whole thing, including freeing :) But maybe there's > > something more to consider to keep calls paired. > > If putting the whole thing separately, I think the rest part to do error > handling still needs to add the same operation. Or I misunderstand > something? So maybe you suggestion of just a separate clean-up function would work: new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { memory_attribute_manager_cleanup(...); goto out_free; } > >> If need to check for errors in the rest of ram_block_add() in future, > >> how about adding a new label before out_free and move the error handling > >> there? > > > > Yeah that would work too. > > I'm not sure if we should add such change directly, or we can wait for > the real error check introduced in future. Right, not sure either. Regards, Tony
Hi David & Alexey, To keep the bitmap aligned, I add the undo operation for set_memory_attributes() and use the bitmap + replay callback to do set_memory_attributes(). Does this change make sense? Alexey, I didn't add your Reivewed-by since this patch introduced some new changes. On 3/10/2025 4:18 PM, Chenyi Qiang wrote: > Introduce a new field, memory_attribute_manager, in RAMBlock to link to > an MemoryAttributeManager object. This change centralizes all > guest_memfd state information (like fd and shared_bitmap) within a > RAMBlock, making it easier to manage. > > Use the realize()/unrealize() helpers to initialize/uninitialize the > MemoryAttributeManager object. Register/unregister the object in the > target RAMBlock's MemoryRegion when creating guest_memfd. > > In the kvm_convert_memory() function, manage memory state changes by > using the shared_bitmap to call set_attribute() only on the specific > memory range. Additionally, use the > memory_attribute_manager_state_change() helper to notify the reigstered > RamDiscardListener of these changes. > > Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> > --- > Changes in v3: > - Use ram_discard_manager_reply_populated/discarded() to set the > memory attribute and add the undo support if state_change() > failed. > - Didn't add Reviewed-by from Alexey due to the new changes in this > commit. > > Changes in v2: > - Introduce a new field memory_attribute_manager in RAMBlock. > - Move the state_change() handling during page conversion in this patch. > - Undo what we did if it fails to set. > - Change the order of close(guest_memfd) and memory_attribute_manager cleanup. > --- > accel/kvm/kvm-all.c | 50 +++++++++++++++++++++++++++++++++++++++-- > include/exec/ramblock.h | 2 ++ > system/physmem.c | 13 +++++++++++ > 3 files changed, 63 insertions(+), 2 deletions(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index c1fea69d58..a89c5655e8 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -48,6 +48,7 @@ > #include "kvm-cpus.h" > #include "system/dirtylimit.h" > #include "qemu/range.h" > +#include "system/memory-attribute-manager.h" > > #include "hw/boards.h" > #include "system/stats.h" > @@ -3018,6 +3019,25 @@ static void kvm_eat_signals(CPUState *cpu) > } while (sigismember(&chkset, SIG_IPI)); > } > > +typedef struct SetMemoryAttribute { > + bool to_private; > +} SetMemoryAttribute; > + > +static int kvm_set_memory_attributes_cb(MemoryRegionSection *section, > + void *opaque) > +{ > + hwaddr start = section->offset_within_address_space; > + hwaddr size = section->size; > + SetMemoryAttribute *args = opaque; > + bool to_private = args->to_private; > + > + if (to_private) { > + return kvm_set_memory_attributes_private(start, size); > + } else { > + return kvm_set_memory_attributes_shared(start, size); > + } > +} > + > int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) > { > MemoryRegionSection section; > @@ -3026,6 +3046,7 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) > RAMBlock *rb; > void *addr; > int ret = -EINVAL; > + SetMemoryAttribute args = { .to_private = to_private }; > > trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared"); > > @@ -3077,9 +3098,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) > } > > if (to_private) { > - ret = kvm_set_memory_attributes_private(start, size); > + ret = ram_discard_manager_replay_populated(mr->rdm, §ion, > + kvm_set_memory_attributes_cb, > + &args); > } else { > - ret = kvm_set_memory_attributes_shared(start, size); > + ret = ram_discard_manager_replay_discarded(mr->rdm, §ion, > + kvm_set_memory_attributes_cb, > + &args); > } > if (ret) { > goto out_unref; > @@ -3088,6 +3113,27 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) > addr = memory_region_get_ram_ptr(mr) + section.offset_within_region; > rb = qemu_ram_block_from_host(addr, false, &offset); > > + ret = memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), > + offset, size, to_private); > + if (ret) { > + warn_report("Failed to notify the listener the state change of " > + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", > + start, size, to_private ? "private" : "shared"); > + args.to_private = !to_private; > + if (to_private) { > + ret = ram_discard_manager_replay_populated(mr->rdm, §ion, > + kvm_set_memory_attributes_cb, > + &args); > + } else { > + ret = ram_discard_manager_replay_discarded(mr->rdm, §ion, > + kvm_set_memory_attributes_cb, > + &args); > + } > + if (ret) { > + goto out_unref; > + } > + } > + > if (to_private) { > if (rb->page_size != qemu_real_host_page_size()) { > /* > diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h > index 0babd105c0..06fd365326 100644 > --- a/include/exec/ramblock.h > +++ b/include/exec/ramblock.h > @@ -23,6 +23,7 @@ > #include "cpu-common.h" > #include "qemu/rcu.h" > #include "exec/ramlist.h" > +#include "system/memory-attribute-manager.h" > > struct RAMBlock { > struct rcu_head rcu; > @@ -42,6 +43,7 @@ struct RAMBlock { > int fd; > uint64_t fd_offset; > int guest_memfd; > + MemoryAttributeManager *memory_attribute_manager; > size_t page_size; > /* dirty bitmap used during migration */ > unsigned long *bmap; > diff --git a/system/physmem.c b/system/physmem.c > index c76503aea8..0ed394c5d2 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -54,6 +54,7 @@ > #include "system/hostmem.h" > #include "system/hw_accel.h" > #include "system/xen-mapcache.h" > +#include "system/memory-attribute-manager.h" > #include "trace.h" > > #ifdef CONFIG_FALLOCATE_PUNCH_HOLE > @@ -1885,6 +1886,16 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) > qemu_mutex_unlock_ramlist(); > goto out_free; > } > + > + new_block->memory_attribute_manager = MEMORY_ATTRIBUTE_MANAGER(object_new(TYPE_MEMORY_ATTRIBUTE_MANAGER)); > + if (memory_attribute_manager_realize(new_block->memory_attribute_manager, new_block->mr)) { > + error_setg(errp, "Failed to realize memory attribute manager"); > + object_unref(OBJECT(new_block->memory_attribute_manager)); > + close(new_block->guest_memfd); > + ram_block_discard_require(false); > + qemu_mutex_unlock_ramlist(); > + goto out_free; > + } > } > > ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; > @@ -2138,6 +2149,8 @@ static void reclaim_ramblock(RAMBlock *block) > } > > if (block->guest_memfd >= 0) { > + memory_attribute_manager_unrealize(block->memory_attribute_manager); > + object_unref(OBJECT(block->memory_attribute_manager)); > close(block->guest_memfd); > ram_block_discard_require(false); > }
On 14.03.25 09:21, Chenyi Qiang wrote: > Hi David & Alexey, Hi, > > To keep the bitmap aligned, I add the undo operation for > set_memory_attributes() and use the bitmap + replay callback to do > set_memory_attributes(). Does this change make sense? I assume you mean this hunk: + ret = memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), + offset, size, to_private); + if (ret) { + warn_report("Failed to notify the listener the state change of " + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", + start, size, to_private ? "private" : "shared"); + args.to_private = !to_private; + if (to_private) { + ret = ram_discard_manager_replay_populated(mr->rdm, §ion, + kvm_set_memory_attributes_cb, + &args); + } else { + ret = ram_discard_manager_replay_discarded(mr->rdm, §ion, + kvm_set_memory_attributes_cb, + &args); + } + if (ret) { + goto out_unref; + } Why is that undo necessary? The bitmap + listeners should be held in sync inside of memory_attribute_manager_state_change(). Handling this in the caller looks wrong. I thought the current implementation properly handles that internally? In which scenario is that not the case? -- Cheers, David / dhildenb
On 3/14/2025 5:00 PM, David Hildenbrand wrote: > On 14.03.25 09:21, Chenyi Qiang wrote: >> Hi David & Alexey, > > Hi, > >> >> To keep the bitmap aligned, I add the undo operation for >> set_memory_attributes() and use the bitmap + replay callback to do >> set_memory_attributes(). Does this change make sense? > > I assume you mean this hunk: > > + ret = > memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), > + offset, size, to_private); > + if (ret) { > + warn_report("Failed to notify the listener the state change of " > + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", > + start, size, to_private ? "private" : "shared"); > + args.to_private = !to_private; > + if (to_private) { > + ret = ram_discard_manager_replay_populated(mr->rdm, §ion, > + > kvm_set_memory_attributes_cb, > + &args); > + } else { > + ret = ram_discard_manager_replay_discarded(mr->rdm, §ion, > + > kvm_set_memory_attributes_cb, > + &args); > + } > + if (ret) { > + goto out_unref; > + } > > > Why is that undo necessary? The bitmap + listeners should be held in > sync inside of > memory_attribute_manager_state_change(). Handling this in the caller > looks wrong. state_change() handles the listener, i.e. VFIO/IOMMU. And the caller handles the core mm side (guest_memfd set_attribute()) undo if state_change() failed. Just want to keep the attribute consistent with the bitmap on both side. Do we need this? If not, the bitmap can only represent the status of listeners. > > I thought the current implementation properly handles that internally? > In which scenario is that not the case? As mentioned above, e.g. if private_to_shared: 1. set_attribute_shared() convert to shared, but bitmap doesn't change at that stage and is still private. 2. state_change() failed, it undo the operation and bitmap status to keep unchanged. 3. core mm side status is inconsistent with bitmap. >
On 14.03.25 10:30, Chenyi Qiang wrote: > > > On 3/14/2025 5:00 PM, David Hildenbrand wrote: >> On 14.03.25 09:21, Chenyi Qiang wrote: >>> Hi David & Alexey, >> >> Hi, >> >>> >>> To keep the bitmap aligned, I add the undo operation for >>> set_memory_attributes() and use the bitmap + replay callback to do >>> set_memory_attributes(). Does this change make sense? >> >> I assume you mean this hunk: >> >> + ret = >> memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), >> + offset, size, to_private); >> + if (ret) { >> + warn_report("Failed to notify the listener the state change of " >> + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", >> + start, size, to_private ? "private" : "shared"); >> + args.to_private = !to_private; >> + if (to_private) { >> + ret = ram_discard_manager_replay_populated(mr->rdm, §ion, >> + >> kvm_set_memory_attributes_cb, >> + &args); >> + } else { >> + ret = ram_discard_manager_replay_discarded(mr->rdm, §ion, >> + >> kvm_set_memory_attributes_cb, >> + &args); >> + } >> + if (ret) { >> + goto out_unref; >> + } >> We should probably document that memory_attribute_state_change() cannot fail with "to_private", so you can simplify it to only handle the "to shared" case. >> >> Why is that undo necessary? The bitmap + listeners should be held in >> sync inside of >> memory_attribute_manager_state_change(). Handling this in the caller >> looks wrong. > > state_change() handles the listener, i.e. VFIO/IOMMU. And the caller > handles the core mm side (guest_memfd set_attribute()) undo if > state_change() failed. Just want to keep the attribute consistent with > the bitmap on both side. Do we need this? If not, the bitmap can only > represent the status of listeners. Ah, so you meant that you effectively want to undo the attribute change, because the state effectively cannot change, and we want to revert the attribute change. That makes sense when we are converting private->shared. BTW, I'm thinking if the orders should be the following (with in-place conversion in mind where we would mmap guest_memfd for the shared memory parts). On private -> shared conversion: (1) change_attribute() (2) state_change(): IOMMU pins shared memory (3) restore_attribute() if it failed On shared -> private conversion (1) state_change(): IOMMU unpins shared memory (2) change_attribute(): can convert in-place because there are not pins I'm wondering if the whole attribute change could actually be a listener, invoked by the memory_attribute_manager_state_change() call itself in the right order. We'd probably need listener priorities, and invoke them in reverse order on shared -> private conversion. Just an idea to get rid of the manual ram_discard_manager_replay_discarded() call in your code. -- Cheers, David / dhildenbh
On 3/14/2025 5:50 PM, David Hildenbrand wrote: > On 14.03.25 10:30, Chenyi Qiang wrote: >> >> >> On 3/14/2025 5:00 PM, David Hildenbrand wrote: >>> On 14.03.25 09:21, Chenyi Qiang wrote: >>>> Hi David & Alexey, >>> >>> Hi, >>> >>>> >>>> To keep the bitmap aligned, I add the undo operation for >>>> set_memory_attributes() and use the bitmap + replay callback to do >>>> set_memory_attributes(). Does this change make sense? >>> >>> I assume you mean this hunk: >>> >>> + ret = >>> memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), >>> + offset, size, >>> to_private); >>> + if (ret) { >>> + warn_report("Failed to notify the listener the state change >>> of " >>> + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", >>> + start, size, to_private ? "private" : "shared"); >>> + args.to_private = !to_private; >>> + if (to_private) { >>> + ret = ram_discard_manager_replay_populated(mr->rdm, >>> §ion, >>> + >>> kvm_set_memory_attributes_cb, >>> + &args); >>> + } else { >>> + ret = ram_discard_manager_replay_discarded(mr->rdm, >>> §ion, >>> + >>> kvm_set_memory_attributes_cb, >>> + &args); >>> + } >>> + if (ret) { >>> + goto out_unref; >>> + } >>> > > We should probably document that memory_attribute_state_change() cannot > fail with "to_private", so you can simplify it to only handle the "to > shared" case. Yes, "to_private" branch is unnecessary. > >>> >>> Why is that undo necessary? The bitmap + listeners should be held in >>> sync inside of >>> memory_attribute_manager_state_change(). Handling this in the caller >>> looks wrong. >> >> state_change() handles the listener, i.e. VFIO/IOMMU. And the caller >> handles the core mm side (guest_memfd set_attribute()) undo if >> state_change() failed. Just want to keep the attribute consistent with >> the bitmap on both side. Do we need this? If not, the bitmap can only >> represent the status of listeners. > > Ah, so you meant that you effectively want to undo the attribute change, > because the state effectively cannot change, and we want to revert the > attribute change. > > That makes sense when we are converting private->shared. > > > BTW, I'm thinking if the orders should be the following (with in-place > conversion in mind where we would mmap guest_memfd for the shared memory > parts). > > On private -> shared conversion: > > (1) change_attribute() > (2) state_change(): IOMMU pins shared memory > (3) restore_attribute() if it failed > > On shared -> private conversion > (1) state_change(): IOMMU unpins shared memory > (2) change_attribute(): can convert in-place because there are not pins > > I'm wondering if the whole attribute change could actually be a > listener, invoked by the memory_attribute_manager_state_change() call > itself in the right order. > > We'd probably need listener priorities, and invoke them in reverse order > on shared -> private conversion. Just an idea to get rid of the manual > ram_discard_manager_replay_discarded() call in your code. Good idea. I think listener priorities can make it more elegant with in-place conversion. And the change_attribute() listener can be given a highest or lowest priority. Maybe we can add this change in advance before in-place conversion available. >
On 3/14/2025 6:23 PM, Chenyi Qiang wrote: > > > On 3/14/2025 5:50 PM, David Hildenbrand wrote: >> On 14.03.25 10:30, Chenyi Qiang wrote: >>> >>> >>> On 3/14/2025 5:00 PM, David Hildenbrand wrote: >>>> On 14.03.25 09:21, Chenyi Qiang wrote: >>>>> Hi David & Alexey, >>>> >>>> Hi, >>>> >>>>> >>>>> To keep the bitmap aligned, I add the undo operation for >>>>> set_memory_attributes() and use the bitmap + replay callback to do >>>>> set_memory_attributes(). Does this change make sense? >>>> >>>> I assume you mean this hunk: >>>> >>>> + ret = >>>> memory_attribute_manager_state_change(MEMORY_ATTRIBUTE_MANAGER(mr->rdm), >>>> + offset, size, >>>> to_private); >>>> + if (ret) { >>>> + warn_report("Failed to notify the listener the state change >>>> of " >>>> + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", >>>> + start, size, to_private ? "private" : "shared"); >>>> + args.to_private = !to_private; >>>> + if (to_private) { >>>> + ret = ram_discard_manager_replay_populated(mr->rdm, >>>> §ion, >>>> + >>>> kvm_set_memory_attributes_cb, >>>> + &args); >>>> + } else { >>>> + ret = ram_discard_manager_replay_discarded(mr->rdm, >>>> §ion, >>>> + >>>> kvm_set_memory_attributes_cb, >>>> + &args); >>>> + } >>>> + if (ret) { >>>> + goto out_unref; >>>> + } >>>> >> >> We should probably document that memory_attribute_state_change() cannot >> fail with "to_private", so you can simplify it to only handle the "to >> shared" case. > > Yes, "to_private" branch is unnecessary. > >> >>>> >>>> Why is that undo necessary? The bitmap + listeners should be held in >>>> sync inside of >>>> memory_attribute_manager_state_change(). Handling this in the caller >>>> looks wrong. >>> >>> state_change() handles the listener, i.e. VFIO/IOMMU. And the caller >>> handles the core mm side (guest_memfd set_attribute()) undo if >>> state_change() failed. Just want to keep the attribute consistent with >>> the bitmap on both side. Do we need this? If not, the bitmap can only >>> represent the status of listeners. >> >> Ah, so you meant that you effectively want to undo the attribute change, >> because the state effectively cannot change, and we want to revert the >> attribute change. >> >> That makes sense when we are converting private->shared. >> >> >> BTW, I'm thinking if the orders should be the following (with in-place >> conversion in mind where we would mmap guest_memfd for the shared memory >> parts). >> >> On private -> shared conversion: >> >> (1) change_attribute() >> (2) state_change(): IOMMU pins shared memory >> (3) restore_attribute() if it failed >> >> On shared -> private conversion >> (1) state_change(): IOMMU unpins shared memory >> (2) change_attribute(): can convert in-place because there are not pins >> >> I'm wondering if the whole attribute change could actually be a >> listener, invoked by the memory_attribute_manager_state_change() call >> itself in the right order. >> >> We'd probably need listener priorities, and invoke them in reverse order >> on shared -> private conversion. Just an idea to get rid of the manual >> ram_discard_manager_replay_discarded() call in your code. > > Good idea. I think listener priorities can make it more elegant with > in-place conversion. And the change_attribute() listener can be given a > highest or lowest priority. Maybe we can add this change in advance > before in-place conversion available. Hi David, To add the change_attribute() listener priorities changes, I can think of several steps: 1) change the *NotifyRamDiscard() to return the result, because change attribute to private could return failure. 2) Add a list in confidential_guest_support structure (or some other place) to save the wrapped the listener like VFIORamDiscardListener. And add related listener_register/unregister() in kvm_region_add/del() 3) Add priority in listener and related handling to follow the listener expected sequence. Regarding the step 1), with change_attribute() listener, the to_private path could also fail. The tricky part is still the error handling. If we simply assume it couldn't fail, maybe add a g_assert() to avoid expected case. Or we also need to add the undo operation for to_private case. > >> >
© 2016 - 2025 Red Hat, Inc.