1 | Commit 852f0048f3 ("RAMBlock: make guest_memfd require uncoordinated | 1 | Commit 852f0048f3 ("RAMBlock: make guest_memfd require uncoordinated |
---|---|---|---|
2 | discard") effectively disables device assignment with guest_memfd. | 2 | discard") effectively disables device assignment when using guest_memfd. |
3 | guest_memfd is required for confidential guests, so device assignment to | 3 | This poses a significant challenge as guest_memfd is essential for |
4 | confidential guests is disabled. A supporting assumption for disabling | 4 | confidential guests, thereby blocking device assignment to these VMs. |
5 | device-assignment was that TEE I/O (SEV-TIO, TDX Connect, COVE-IO | 5 | The initial rationale for disabling device assignment was due to stale |
6 | etc...) solves the confidential-guest device-assignment problem [1]. | 6 | IOMMU mappings (see Problem section) and the assumption that TEE I/O |
7 | That turns out not to be the case because TEE I/O depends on being able | 7 | (SEV-TIO, TDX Connect, COVE-IO, etc.) would solve the device-assignment |
8 | to operate devices against "shared"/untrusted memory for device | 8 | problem for confidential guests [1]. However, this assumption has proven |
9 | initialization and error recovery scenarios. | 9 | to be incorrect. TEE I/O relies on the ability to operate devices against |
10 | 10 | "shared" or untrusted memory, which is crucial for device initialization | |
11 | This series utilizes an existing framework named RamDiscardManager to | 11 | and error recovery scenarios. As a result, the current implementation does |
12 | notify VFIO of page conversions. However, there's still one concern | 12 | not adequately support device assignment for confidential guests, necessitating |
13 | related to the semantics of RamDiscardManager which is used to manage | 13 | a reevaluation of the approach to ensure compatibility and functionality. |
14 | the memory plug/unplug state. This is a little different from the memory | 14 | |
15 | shared/private in our requirement. See the "Open" section below for more | 15 | This series enables shared device assignment by notifying VFIO of page |
16 | details. | 16 | conversions using an existing framework named RamDiscardListener. |
17 | Additionally, there is an ongoing patch set [2] that aims to add 1G page | ||
18 | support for guest_memfd. This patch set introduces in-place page conversion, | ||
19 | where private and shared memory share the same physical pages as the backend. | ||
20 | This development may impact our solution. | ||
21 | |||
22 | We presented our solution in the guest_memfd meeting to discuss its | ||
23 | compatibility with the new changes and potential future directions (see [3] | ||
24 | for more details). The conclusion was that, although our solution may not be | ||
25 | the most elegant (see the Limitation section), it is sufficient for now and | ||
26 | can be easily adapted to future changes. | ||
27 | |||
28 | We are re-posting the patch series with some cleanup and have removed the RFC | ||
29 | label for the main enabling patches (1-6). The newly-added patch 7 is still | ||
30 | marked as RFC as it tries to resolve some extension concerns related to | ||
31 | RamDiscardManager for future usage. | ||
32 | |||
33 | The overview of the patches: | ||
34 | - Patch 1: Export a helper to get intersection of a MemoryRegionSection | ||
35 | with a given range. | ||
36 | - Patch 2-6: Introduce a new object to manage the guest-memfd with | ||
37 | RamDiscardManager, and notify the shared/private state change during | ||
38 | conversion. | ||
39 | - Patch 7: Try to resolve a semantics concern related to RamDiscardManager | ||
40 | i.e. RamDiscardManager is used to manage memory plug/unplug state | ||
41 | instead of shared/private state. It would affect future users of | ||
42 | RamDiscardManager in confidential VMs. Attach it behind as an RFC patch[4]. | ||
43 | |||
44 | Changes since last version: | ||
45 | - Add a patch to export some generic helper functions from virtio-mem code. | ||
46 | - Change the bitmap in guest_memfd_manager from default shared to default | ||
47 | private. This keeps alignment with virtio-mem that 1-setting in bitmap | ||
48 | represents the populated state and may help to export more generic code | ||
49 | if necessary. | ||
50 | - Add the helpers to initialize/uninitialize the guest_memfd_manager instance | ||
51 | to make it more clear. | ||
52 | - Add a patch to distinguish between the shared/private state change and | ||
53 | the memory plug/unplug state change in RamDiscardManager. | ||
54 | - RFC: https://lore.kernel.org/qemu-devel/20240725072118.358923-1-chenyi.qiang@intel.com/ | ||
55 | |||
56 | --- | ||
17 | 57 | ||
18 | Background | 58 | Background |
19 | ========== | 59 | ========== |
20 | Confidential VMs have two classes of memory: shared and private memory. | 60 | Confidential VMs have two classes of memory: shared and private memory. |
21 | Shared memory is accessible from the host/VMM while private memory is | 61 | Shared memory is accessible from the host/VMM while private memory is |
... | ... | ||
51 | 91 | ||
52 | After step 3, VFIO is still pinning the page. However, DMA operations in | 92 | After step 3, VFIO is still pinning the page. However, DMA operations in |
53 | step 6 will hit the old mapping that was allocated in step 1, which | 93 | step 6 will hit the old mapping that was allocated in step 1, which |
54 | causes the device to access the invalid data. | 94 | causes the device to access the invalid data. |
55 | 95 | ||
56 | Currently, the commit 852f0048f3 ("RAMBlock: make guest_memfd require | ||
57 | uncoordinated discard") has blocked the device assignment with | ||
58 | guest_memfd to avoid this problem. | ||
59 | |||
60 | Solution | 96 | Solution |
61 | ======== | 97 | ======== |
62 | The key to enable shared device assignment is to solve the stale IOMMU | 98 | The key to enable shared device assignment is to update the IOMMU mappings |
63 | mappings problem. | 99 | on page conversion. |
64 | 100 | ||
65 | Given the constraints and assumptions, here is a solution that satisfies | 101 | Given the constraints and assumptions, here is a solution that satisfies |
66 | the use cases. RamDiscardManager, an existing interface currently | 102 | the use cases. RamDiscardManager, an existing interface currently |
67 | utilized by virtio-mem, offers a means to modify IOMMU mappings in | 103 | utilized by virtio-mem, offers a means to modify IOMMU mappings in |
68 | accordance with VM page assignment. Page conversion is similar to | 104 | accordance with VM page assignment. Page conversion is similar to |
69 | hot-removing a page in one mode and adding it back in the other. | 105 | hot-removing a page in one mode and adding it back in the other. |
70 | 106 | ||
71 | This series implements a RamDiscardManager for confidential VMs and | 107 | This series implements a RamDiscardManager for confidential VMs and |
72 | utilizes its infrastructure to notify VFIO of page conversions. | 108 | utilizes its infrastructure to notify VFIO of page conversions. |
73 | 109 | ||
74 | Another possible attempt [2] was to not discard shared pages in step 3 | 110 | Another possible attempt [5] was to not discard shared pages in step 3 |
75 | above. This was an incomplete band-aid because guests would consume | 111 | above. This was an incomplete band-aid because guests would consume |
76 | twice the memory since shared pages wouldn't be freed even after they | 112 | twice the memory since shared pages wouldn't be freed even after they |
77 | were converted to private. | 113 | were converted to private. |
78 | 114 | ||
79 | Open | 115 | w/ in-place page conversion |
80 | ==== | 116 | =========================== |
81 | Implementing a RamDiscardManager to notify VFIO of page conversions | 117 | To support 1G pages for guest_memfd, the current direction is to |
82 | causes changes in semantics: private memory is treated as discarded (or | 118 | allow mmap() of guest_memfd to userspace so that both private and shared |
83 | hot-removed) memory. This isn't aligned with the expectation of current | 119 | memory can use the same physical pages as the backend. This in-place page |
84 | RamDiscardManager users (e.g. VFIO or live migration) who really | 120 | conversion design eliminates the need to discard pages during shared/private |
85 | expect that discarded memory is hot-removed and thus can be skipped when | 121 | conversions. However, device assignment will still be blocked because the |
86 | the users are processing guest memory. Treating private memory as | 122 | in-place page conversion will reject the conversion when the page is pinned |
87 | discarded won't work in future if VFIO or live migration needs to handle | 123 | by VFIO. |
88 | private memory. e.g. VFIO may need to map private memory to support | 124 | |
89 | Trusted IO and live migration for confidential VMs need to migrate | 125 | To address this, the key difference lies in the sequence of VFIO map/unmap |
90 | private memory. | 126 | operations and the page conversion. This series can be adjusted to achieve |
91 | 127 | unmap-before-conversion-to-private and map-after-conversion-to-shared, | |
92 | There are two possible ways to mitigate the semantics changes. | 128 | ensuring compatibility with guest_memfd. |
93 | 1. Develop a new mechanism to notify the page conversions between | 129 | |
94 | private and shared. For example, utilize the notifier_list in QEMU. VFIO | 130 | Additionally, with in-place page conversion, the previously mentioned |
95 | registers its own handler and gets notified upon page conversions. This | 131 | solution to disable the discard of shared pages is not feasible because |
96 | is a clean approach which only touches the notifier workflow. A | 132 | shared and private memory share the same backend, and no discard operation |
97 | challenge is that for device hotplug, existing shared memory should be | 133 | is performed. Retaining the old mappings in the IOMMU would result in |
98 | mapped in IOMMU. This will need additional changes. | 134 | unsafe DMA access to protected memory. |
99 | 135 | ||
100 | 2. Extend the existing RamDiscardManager interface to manage not only | 136 | Limitation |
101 | the discarded/populated status of guest memory but also the | 137 | ========== |
102 | shared/private status. RamDiscardManager users like VFIO will be | 138 | |
103 | notified with one more argument indicating what change is happening and | 139 | One limitation (also discussed in the guest_memfd meeting) is that VFIO |
104 | can take action accordingly. It also has challenges e.g. QEMU allows | 140 | expects the DMA mapping for a specific IOVA to be mapped and unmapped with |
105 | only one RamDiscardManager, how to support virtio-mem for confidential | 141 | the same granularity. The guest may perform partial conversions, such as |
106 | VMs would be a problem. And some APIs like .is_populated() exposed by | 142 | converting a small region within a larger region. To prevent such invalid |
107 | RamDiscardManager are meaningless to shared/private memory. So they may | 143 | cases, all operations are performed with 4K granularity. The possible |
108 | need some adjustments. | 144 | solutions we can think of are either to enable VFIO to support partial unmap |
145 | or to implement an enlightened guest to avoid partial conversion. The former | ||
146 | requires complex changes in VFIO, while the latter requires the page | ||
147 | conversion to be a guest-enlightened behavior. It is still uncertain which | ||
148 | option is a preferred one. | ||
109 | 149 | ||
110 | Testing | 150 | Testing |
111 | ======= | 151 | ======= |
112 | This patch series is tested based on the internal TDX KVM/QEMU tree. | 152 | This patch series is tested with the KVM/QEMU branches: |
153 | KVM: https://github.com/intel/tdx/tree/tdx_kvm_dev-2024-11-20 | ||
154 | QEMU: https://github.com/intel-staging/qemu-tdx/tree/tdx-upstream-snapshot-2024-12-13 | ||
113 | 155 | ||
114 | To facilitate shared device assignment with the NIC, employ the legacy | 156 | To facilitate shared device assignment with the NIC, employ the legacy |
115 | type1 VFIO with the QEMU command: | 157 | type1 VFIO with the QEMU command: |
116 | 158 | ||
117 | qemu-system-x86_64 [...] | 159 | qemu-system-x86_64 [...] |
... | ... | ||
133 | visible, and iperf is able to successfully send and receive data. | 175 | visible, and iperf is able to successfully send and receive data. |
134 | 176 | ||
135 | Related link | 177 | Related link |
136 | ============ | 178 | ============ |
137 | [1] https://lore.kernel.org/all/d6acfbef-96a1-42bc-8866-c12a4de8c57c@redhat.com/ | 179 | [1] https://lore.kernel.org/all/d6acfbef-96a1-42bc-8866-c12a4de8c57c@redhat.com/ |
138 | [2] https://lore.kernel.org/all/20240320083945.991426-20-michael.roth@amd.com/ | 180 | [2] https://lore.kernel.org/lkml/cover.1726009989.git.ackerleytng@google.com/ |
139 | 181 | [3] https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAosPOk/edit?tab=t.0#heading=h.jr4csfgw1uql | |
140 | Chenyi Qiang (6): | 182 | [4] https://lore.kernel.org/qemu-devel/d299bbad-81bc-462e-91b5-a6d9c27ffe3a@redhat.com/ |
183 | [5] https://lore.kernel.org/all/20240320083945.991426-20-michael.roth@amd.com/ | ||
184 | |||
185 | Chenyi Qiang (7): | ||
186 | memory: Export a helper to get intersection of a MemoryRegionSection | ||
187 | with a given range | ||
141 | guest_memfd: Introduce an object to manage the guest-memfd with | 188 | guest_memfd: Introduce an object to manage the guest-memfd with |
142 | RamDiscardManager | 189 | RamDiscardManager |
143 | guest_memfd: Introduce a helper to notify the shared/private state | 190 | guest_memfd: Introduce a callback to notify the shared/private state |
144 | change | 191 | change |
145 | KVM: Notify the state change via RamDiscardManager helper during | 192 | KVM: Notify the state change event during shared/private conversion |
146 | shared/private conversion | ||
147 | memory: Register the RamDiscardManager instance upon guest_memfd | 193 | memory: Register the RamDiscardManager instance upon guest_memfd |
148 | creation | 194 | creation |
149 | guest-memfd: Default to discarded (private) in guest_memfd_manager | ||
150 | RAMBlock: make guest_memfd require coordinate discard | 195 | RAMBlock: make guest_memfd require coordinate discard |
151 | 196 | memory: Add a new argument to indicate the request attribute in | |
152 | accel/kvm/kvm-all.c | 7 + | 197 | RamDiscardManager helpers |
153 | include/sysemu/guest-memfd-manager.h | 49 +++ | 198 | |
154 | system/guest-memfd-manager.c | 425 +++++++++++++++++++++++++++ | 199 | accel/kvm/kvm-all.c | 4 + |
200 | hw/vfio/common.c | 22 +- | ||
201 | hw/virtio/virtio-mem.c | 55 ++-- | ||
202 | include/exec/memory.h | 36 ++- | ||
203 | include/sysemu/guest-memfd-manager.h | 91 ++++++ | ||
204 | migration/ram.c | 14 +- | ||
205 | system/guest-memfd-manager.c | 456 +++++++++++++++++++++++++++ | ||
206 | system/memory.c | 30 +- | ||
207 | system/memory_mapping.c | 4 +- | ||
155 | system/meson.build | 1 + | 208 | system/meson.build | 1 + |
156 | system/physmem.c | 11 +- | 209 | system/physmem.c | 9 +- |
157 | 5 files changed, 492 insertions(+), 1 deletion(-) | 210 | 11 files changed, 659 insertions(+), 63 deletions(-) |
158 | create mode 100644 include/sysemu/guest-memfd-manager.h | 211 | create mode 100644 include/sysemu/guest-memfd-manager.h |
159 | create mode 100644 system/guest-memfd-manager.c | 212 | create mode 100644 system/guest-memfd-manager.c |
160 | 213 | ||
161 | |||
162 | base-commit: 900536d3e97aed7fdd9cb4dadd3bf7023360e819 | ||
163 | -- | 214 | -- |
164 | 2.43.5 | 215 | 2.43.5 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Rename the helper to memory_region_section_intersect_range() to make it | ||
2 | more generic. | ||
1 | 3 | ||
4 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | ||
5 | --- | ||
6 | hw/virtio/virtio-mem.c | 32 +++++--------------------------- | ||
7 | include/exec/memory.h | 13 +++++++++++++ | ||
8 | system/memory.c | 17 +++++++++++++++++ | ||
9 | 3 files changed, 35 insertions(+), 27 deletions(-) | ||
10 | |||
11 | diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/hw/virtio/virtio-mem.c | ||
14 | +++ b/hw/virtio/virtio-mem.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg, | ||
16 | return ret; | ||
17 | } | ||
18 | |||
19 | -/* | ||
20 | - * Adjust the memory section to cover the intersection with the given range. | ||
21 | - * | ||
22 | - * Returns false if the intersection is empty, otherwise returns true. | ||
23 | - */ | ||
24 | -static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s, | ||
25 | - uint64_t offset, uint64_t size) | ||
26 | -{ | ||
27 | - uint64_t start = MAX(s->offset_within_region, offset); | ||
28 | - uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), | ||
29 | - offset + size); | ||
30 | - | ||
31 | - if (end <= start) { | ||
32 | - return false; | ||
33 | - } | ||
34 | - | ||
35 | - s->offset_within_address_space += start - s->offset_within_region; | ||
36 | - s->offset_within_region = start; | ||
37 | - s->size = int128_make64(end - start); | ||
38 | - return true; | ||
39 | -} | ||
40 | - | ||
41 | typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg); | ||
42 | |||
43 | static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, | ||
44 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, | ||
45 | first_bit + 1) - 1; | ||
46 | size = (last_bit - first_bit + 1) * vmem->block_size; | ||
47 | |||
48 | - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { | ||
49 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
50 | break; | ||
51 | } | ||
52 | ret = cb(&tmp, arg); | ||
53 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem, | ||
54 | first_bit + 1) - 1; | ||
55 | size = (last_bit - first_bit + 1) * vmem->block_size; | ||
56 | |||
57 | - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { | ||
58 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
59 | break; | ||
60 | } | ||
61 | ret = cb(&tmp, arg); | ||
62 | @@ -XXX,XX +XXX,XX @@ static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, | ||
63 | QLIST_FOREACH(rdl, &vmem->rdl_list, next) { | ||
64 | MemoryRegionSection tmp = *rdl->section; | ||
65 | |||
66 | - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { | ||
67 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
68 | continue; | ||
69 | } | ||
70 | rdl->notify_discard(rdl, &tmp); | ||
71 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, | ||
72 | QLIST_FOREACH(rdl, &vmem->rdl_list, next) { | ||
73 | MemoryRegionSection tmp = *rdl->section; | ||
74 | |||
75 | - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { | ||
76 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
77 | continue; | ||
78 | } | ||
79 | ret = rdl->notify_populate(rdl, &tmp); | ||
80 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, | ||
81 | if (rdl2 == rdl) { | ||
82 | break; | ||
83 | } | ||
84 | - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { | ||
85 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
86 | continue; | ||
87 | } | ||
88 | rdl2->notify_discard(rdl2, &tmp); | ||
89 | diff --git a/include/exec/memory.h b/include/exec/memory.h | ||
90 | index XXXXXXX..XXXXXXX 100644 | ||
91 | --- a/include/exec/memory.h | ||
92 | +++ b/include/exec/memory.h | ||
93 | @@ -XXX,XX +XXX,XX @@ MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); | ||
94 | */ | ||
95 | void memory_region_section_free_copy(MemoryRegionSection *s); | ||
96 | |||
97 | +/** | ||
98 | + * memory_region_section_intersect_range: Adjust the memory section to cover | ||
99 | + * the intersection with the given range. | ||
100 | + * | ||
101 | + * @s: the #MemoryRegionSection to be adjusted | ||
102 | + * @offset: the offset of the given range in the memory region | ||
103 | + * @size: the size of the given range | ||
104 | + * | ||
105 | + * Returns false if the intersection is empty, otherwise returns true. | ||
106 | + */ | ||
107 | +bool memory_region_section_intersect_range(MemoryRegionSection *s, | ||
108 | + uint64_t offset, uint64_t size); | ||
109 | + | ||
110 | /** | ||
111 | * memory_region_init: Initialize a memory region | ||
112 | * | ||
113 | diff --git a/system/memory.c b/system/memory.c | ||
114 | index XXXXXXX..XXXXXXX 100644 | ||
115 | --- a/system/memory.c | ||
116 | +++ b/system/memory.c | ||
117 | @@ -XXX,XX +XXX,XX @@ void memory_region_section_free_copy(MemoryRegionSection *s) | ||
118 | g_free(s); | ||
119 | } | ||
120 | |||
121 | +bool memory_region_section_intersect_range(MemoryRegionSection *s, | ||
122 | + uint64_t offset, uint64_t size) | ||
123 | +{ | ||
124 | + uint64_t start = MAX(s->offset_within_region, offset); | ||
125 | + uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), | ||
126 | + offset + size); | ||
127 | + | ||
128 | + if (end <= start) { | ||
129 | + return false; | ||
130 | + } | ||
131 | + | ||
132 | + s->offset_within_address_space += start - s->offset_within_region; | ||
133 | + s->offset_within_region = start; | ||
134 | + s->size = int128_make64(end - start); | ||
135 | + return true; | ||
136 | +} | ||
137 | + | ||
138 | bool memory_region_present(MemoryRegion *container, hwaddr addr) | ||
139 | { | ||
140 | MemoryRegion *mr; | ||
141 | -- | ||
142 | 2.43.5 | diff view generated by jsdifflib |
1 | As the commit 852f0048f3 ("RAMBlock: make guest_memfd require | 1 | As the commit 852f0048f3 ("RAMBlock: make guest_memfd require |
---|---|---|---|
2 | uncoordinated discard") highlighted, some subsystems like VFIO might | 2 | uncoordinated discard") highlighted, some subsystems like VFIO might |
3 | disable ram block discard. However, guest_memfd relies on the discard | 3 | disable ram block discard. However, guest_memfd relies on the discard |
4 | operation to perform page conversion between private and shared memory. | 4 | operation to perform page conversion between private and shared memory. |
5 | This can lead to stale IOMMU mapping issue when assigning a hardware | 5 | This can lead to stale IOMMU mapping issue when assigning a hardware |
6 | device to a confidential guest via shared memory (unprotected memory | 6 | device to a confidential VM via shared memory (unprotected memory |
7 | pages). Blocking shared page discard can solve this problem, but it | 7 | pages). Blocking shared page discard can solve this problem, but it |
8 | could cause guests to consume twice the memory with VFIO, which is not | 8 | could cause guests to consume twice the memory with VFIO, which is not |
9 | acceptable in some cases. An alternative solution is to convey other | 9 | acceptable in some cases. An alternative solution is to convey other |
10 | systems like VFIO to refresh its outdated IOMMU mappings. | 10 | systems like VFIO to refresh its outdated IOMMU mappings. |
11 | 11 | ||
12 | RamDiscardManager is an existing concept (used by virtio-mem) to adjust | 12 | RamDiscardManager is an existing concept (used by virtio-mem) to adjust |
13 | VFIO mappings in relation to VM page assignement. Effectively page | 13 | VFIO mappings in relation to VM page assignment. Effectively page |
14 | conversion is similar to hot-removing a page in one mode and adding it | 14 | conversion is similar to hot-removing a page in one mode and adding it |
15 | back in the other, so the similar work that needs to happen in response | 15 | back in the other, so the similar work that needs to happen in response |
16 | to virtio-mem changes needs to happen for page conversion events. | 16 | to virtio-mem changes needs to happen for page conversion events. |
17 | Introduce the RamDiscardManager to guest_memfd to achieve it. | 17 | Introduce the RamDiscardManager to guest_memfd to achieve it. |
18 | 18 | ||
19 | However, Implementing the RamDiscardManager interface poses a challenge | 19 | However, guest_memfd is not an object so it cannot directly implement |
20 | as guest_memfd is not an object, instead, it is contained within RamBlock | 20 | the RamDiscardManager interface. |
21 | and is indicated by a RAM_GUEST_MEMFD flag upon creation. | 21 | |
22 | 22 | One solution is to implement the interface in HostMemoryBackend. Any | |
23 | One option is to implement the interface in HostMemoryBackend. Any | ||
24 | guest_memfd-backed host memory backend can register itself in the target | 23 | guest_memfd-backed host memory backend can register itself in the target |
25 | MemoryRegion. However, this solution doesn't cover the scenario where a | 24 | MemoryRegion. However, this solution doesn't cover the scenario where a |
26 | guest_memfd MemoryRegion doesn't belong to the HostMemoryBackend, e.g. | 25 | guest_memfd MemoryRegion doesn't belong to the HostMemoryBackend, e.g. |
27 | the virtual BIOS MemoryRegion. | 26 | the virtual BIOS MemoryRegion. |
28 | 27 | ||
29 | Thus, implement the second option, which involves defining an object type | 28 | Thus, choose the second option, i.e. define an object type named |
30 | named guest_memfd_manager with the RamDiscardManager interface. Upon | 29 | guest_memfd_manager with RamDiscardManager interface. Upon creation of |
31 | creation of guest_memfd, a new guest_memfd_manager object can be | 30 | guest_memfd, a new guest_memfd_manager object can be instantiated and |
32 | instantiated and registered to the managed guest_memfd MemoryRegion to | 31 | registered to the managed guest_memfd MemoryRegion to handle the page |
33 | handle the page conversion events. | 32 | conversion events. |
34 | 33 | ||
35 | In the context of guest_memfd, the discarded state signifies that the | 34 | In the context of guest_memfd, the discarded state signifies that the |
36 | page is private, while the populated state indicated that the page is | 35 | page is private, while the populated state indicated that the page is |
37 | shared. The state of the memory is tracked at the granularity of the | 36 | shared. The state of the memory is tracked at the granularity of the |
38 | host page size (i.e. block_size), as the minimum conversion size can be | 37 | host page size (i.e. block_size), as the minimum conversion size can be |
39 | one page per request. In addition, VFIO expects the DMA mapping for a | 38 | one page per request. |
40 | specific iova to be mapped and unmapped with the same granularity. | 39 | |
41 | However, there's no guarantee that the confidential guest won't | 40 | In addition, VFIO expects the DMA mapping for a specific iova to be |
42 | partially convert the pages. For instance the confidential guest may | 41 | mapped and unmapped with the same granularity. However, the confidential |
43 | flip a 2M page from private to shared and later flip the first 4K | 42 | VMs may do partial conversion, e.g. conversion happens on a small region |
44 | sub-range from shared to private. To prevent such invalid cases, all | 43 | within a large region. To prevent such invalid cases and before any |
45 | operations are performed with a 4K granularity. | 44 | potential optimization comes out, all operations are performed with 4K |
45 | granularity. | ||
46 | 46 | ||
47 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | 47 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> |
48 | --- | 48 | --- |
49 | include/sysemu/guest-memfd-manager.h | 46 +++++ | 49 | include/sysemu/guest-memfd-manager.h | 46 +++++ |
50 | system/guest-memfd-manager.c | 283 +++++++++++++++++++++++++++ | 50 | system/guest-memfd-manager.c | 250 +++++++++++++++++++++++++++ |
51 | system/meson.build | 1 + | 51 | system/meson.build | 1 + |
52 | 3 files changed, 330 insertions(+) | 52 | 3 files changed, 297 insertions(+) |
53 | create mode 100644 include/sysemu/guest-memfd-manager.h | 53 | create mode 100644 include/sysemu/guest-memfd-manager.h |
54 | create mode 100644 system/guest-memfd-manager.c | 54 | create mode 100644 system/guest-memfd-manager.c |
55 | 55 | ||
56 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h | 56 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h |
57 | new file mode 100644 | 57 | new file mode 100644 |
... | ... | ||
85 | + Object parent; | 85 | + Object parent; |
86 | + | 86 | + |
87 | + /* Managed memory region. */ | 87 | + /* Managed memory region. */ |
88 | + MemoryRegion *mr; | 88 | + MemoryRegion *mr; |
89 | + | 89 | + |
90 | + /* bitmap used to track discard (private) memory */ | 90 | + /* |
91 | + int32_t discard_bitmap_size; | 91 | + * 1-setting of the bit represents the memory is populated (shared). |
92 | + unsigned long *discard_bitmap; | 92 | + */ |
93 | + int32_t bitmap_size; | ||
94 | + unsigned long *bitmap; | ||
93 | + | 95 | + |
94 | + /* block size and alignment */ | 96 | + /* block size and alignment */ |
95 | + uint64_t block_size; | 97 | + uint64_t block_size; |
96 | + | 98 | + |
97 | + /* listeners to notify on populate/discard activity. */ | 99 | + /* listeners to notify on populate/discard activity. */ |
98 | + QLIST_HEAD(, RamDiscardListener) rdl_list; | 100 | + QLIST_HEAD(, RamDiscardListener) rdl_list; |
99 | +}; | 101 | +}; |
100 | + | 102 | + |
101 | +struct GuestMemfdManagerClass { | 103 | +struct GuestMemfdManagerClass { |
102 | + ObjectClass parent_class; | 104 | + ObjectClass parent_class; |
103 | + | ||
104 | + void (*realize)(Object *gmm, MemoryRegion *mr, uint64_t region_size); | ||
105 | +}; | 105 | +}; |
106 | + | 106 | + |
107 | +#endif | 107 | +#endif |
108 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c | 108 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c |
109 | new file mode 100644 | 109 | new file mode 100644 |
... | ... | ||
141 | + const GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); | 141 | + const GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); |
142 | + uint64_t first_bit = section->offset_within_region / gmm->block_size; | 142 | + uint64_t first_bit = section->offset_within_region / gmm->block_size; |
143 | + uint64_t last_bit = first_bit + int128_get64(section->size) / gmm->block_size - 1; | 143 | + uint64_t last_bit = first_bit + int128_get64(section->size) / gmm->block_size - 1; |
144 | + unsigned long first_discard_bit; | 144 | + unsigned long first_discard_bit; |
145 | + | 145 | + |
146 | + first_discard_bit = find_next_bit(gmm->discard_bitmap, last_bit + 1, first_bit); | 146 | + first_discard_bit = find_next_zero_bit(gmm->bitmap, last_bit + 1, first_bit); |
147 | + return first_discard_bit > last_bit; | 147 | + return first_discard_bit > last_bit; |
148 | +} | 148 | +} |
149 | + | 149 | + |
150 | +static bool guest_memfd_rdm_intersect_memory_section(MemoryRegionSection *section, | ||
151 | + uint64_t offset, uint64_t size) | ||
152 | +{ | ||
153 | + uint64_t start = MAX(section->offset_within_region, offset); | ||
154 | + uint64_t end = MIN(section->offset_within_region + int128_get64(section->size), | ||
155 | + offset + size); | ||
156 | + if (end <= start) { | ||
157 | + return false; | ||
158 | + } | ||
159 | + | ||
160 | + section->offset_within_address_space += start - section->offset_within_region; | ||
161 | + section->offset_within_region = start; | ||
162 | + section->size = int128_make64(end - start); | ||
163 | + | ||
164 | + return true; | ||
165 | +} | ||
166 | + | ||
167 | +typedef int (*guest_memfd_section_cb)(MemoryRegionSection *s, void *arg); | 150 | +typedef int (*guest_memfd_section_cb)(MemoryRegionSection *s, void *arg); |
168 | + | 151 | + |
169 | +static int guest_memfd_notify_populate_cb(MemoryRegionSection *section, void *arg) | 152 | +static int guest_memfd_notify_populate_cb(MemoryRegionSection *section, void *arg) |
170 | +{ | 153 | +{ |
171 | + RamDiscardListener *rdl = arg; | 154 | + RamDiscardListener *rdl = arg; |
... | ... | ||
180 | + rdl->notify_discard(rdl, section); | 163 | + rdl->notify_discard(rdl, section); |
181 | + | 164 | + |
182 | + return 0; | 165 | + return 0; |
183 | +} | 166 | +} |
184 | + | 167 | + |
185 | +static int guest_memfd_for_each_populated_range(const GuestMemfdManager *gmm, | 168 | +static int guest_memfd_for_each_populated_section(const GuestMemfdManager *gmm, |
186 | + MemoryRegionSection *section, | 169 | + MemoryRegionSection *section, |
187 | + void *arg, | 170 | + void *arg, |
188 | + guest_memfd_section_cb cb) | 171 | + guest_memfd_section_cb cb) |
189 | +{ | 172 | +{ |
190 | + unsigned long first_zero_bit, last_zero_bit; | 173 | + unsigned long first_one_bit, last_one_bit; |
191 | + uint64_t offset, size; | 174 | + uint64_t offset, size; |
192 | + int ret = 0; | 175 | + int ret = 0; |
193 | + | 176 | + |
194 | + first_zero_bit = section->offset_within_region / gmm->block_size; | 177 | + first_one_bit = section->offset_within_region / gmm->block_size; |
195 | + first_zero_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 178 | + first_one_bit = find_next_bit(gmm->bitmap, gmm->bitmap_size, first_one_bit); |
196 | + first_zero_bit); | 179 | + |
197 | + | 180 | + while (first_one_bit < gmm->bitmap_size) { |
198 | + while (first_zero_bit < gmm->discard_bitmap_size) { | ||
199 | + MemoryRegionSection tmp = *section; | 181 | + MemoryRegionSection tmp = *section; |
200 | + | 182 | + |
201 | + offset = first_zero_bit * gmm->block_size; | 183 | + offset = first_one_bit * gmm->block_size; |
202 | + last_zero_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 184 | + last_one_bit = find_next_zero_bit(gmm->bitmap, gmm->bitmap_size, |
203 | + first_zero_bit + 1) - 1; | 185 | + first_one_bit + 1) - 1; |
204 | + size = (last_zero_bit - first_zero_bit + 1) * gmm->block_size; | 186 | + size = (last_one_bit - first_one_bit + 1) * gmm->block_size; |
205 | + | 187 | + |
206 | + if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { | 188 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { |
207 | + break; | 189 | + break; |
208 | + } | 190 | + } |
209 | + | 191 | + |
210 | + ret = cb(&tmp, arg); | 192 | + ret = cb(&tmp, arg); |
211 | + if (ret) { | 193 | + if (ret) { |
212 | + break; | 194 | + break; |
213 | + } | 195 | + } |
214 | + | 196 | + |
215 | + first_zero_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 197 | + first_one_bit = find_next_bit(gmm->bitmap, gmm->bitmap_size, |
216 | + last_zero_bit + 2); | 198 | + last_one_bit + 2); |
217 | + } | 199 | + } |
218 | + | 200 | + |
219 | + return ret; | 201 | + return ret; |
220 | +} | 202 | +} |
221 | + | 203 | + |
222 | +static int guest_memfd_for_each_discarded_range(const GuestMemfdManager *gmm, | 204 | +static int guest_memfd_for_each_discarded_section(const GuestMemfdManager *gmm, |
223 | + MemoryRegionSection *section, | 205 | + MemoryRegionSection *section, |
224 | + void *arg, | 206 | + void *arg, |
225 | + guest_memfd_section_cb cb) | 207 | + guest_memfd_section_cb cb) |
226 | +{ | 208 | +{ |
227 | + unsigned long first_one_bit, last_one_bit; | 209 | + unsigned long first_zero_bit, last_zero_bit; |
228 | + uint64_t offset, size; | 210 | + uint64_t offset, size; |
229 | + int ret = 0; | 211 | + int ret = 0; |
230 | + | 212 | + |
231 | + first_one_bit = section->offset_within_region / gmm->block_size; | 213 | + first_zero_bit = section->offset_within_region / gmm->block_size; |
232 | + first_one_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 214 | + first_zero_bit = find_next_zero_bit(gmm->bitmap, gmm->bitmap_size, |
233 | + first_one_bit); | 215 | + first_zero_bit); |
234 | + | 216 | + |
235 | + while (first_one_bit < gmm->discard_bitmap_size) { | 217 | + while (first_zero_bit < gmm->bitmap_size) { |
236 | + MemoryRegionSection tmp = *section; | 218 | + MemoryRegionSection tmp = *section; |
237 | + | 219 | + |
238 | + offset = first_one_bit * gmm->block_size; | 220 | + offset = first_zero_bit * gmm->block_size; |
239 | + last_one_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 221 | + last_zero_bit = find_next_bit(gmm->bitmap, gmm->bitmap_size, |
240 | + first_one_bit + 1) - 1; | 222 | + first_zero_bit + 1) - 1; |
241 | + size = (last_one_bit - first_one_bit + 1) * gmm->block_size; | 223 | + size = (last_zero_bit - first_zero_bit + 1) * gmm->block_size; |
242 | + | 224 | + |
243 | + if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { | 225 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { |
244 | + break; | 226 | + break; |
245 | + } | 227 | + } |
246 | + | 228 | + |
247 | + ret = cb(&tmp, arg); | 229 | + ret = cb(&tmp, arg); |
248 | + if (ret) { | 230 | + if (ret) { |
249 | + break; | 231 | + break; |
250 | + } | 232 | + } |
251 | + | 233 | + |
252 | + first_one_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, | 234 | + first_zero_bit = find_next_zero_bit(gmm->bitmap, gmm->bitmap_size, |
253 | + last_one_bit + 2); | 235 | + last_zero_bit + 2); |
254 | + } | 236 | + } |
255 | + | 237 | + |
256 | + return ret; | 238 | + return ret; |
257 | +} | 239 | +} |
258 | + | 240 | + |
... | ... | ||
275 | + g_assert(section->mr == gmm->mr); | 257 | + g_assert(section->mr == gmm->mr); |
276 | + rdl->section = memory_region_section_new_copy(section); | 258 | + rdl->section = memory_region_section_new_copy(section); |
277 | + | 259 | + |
278 | + QLIST_INSERT_HEAD(&gmm->rdl_list, rdl, next); | 260 | + QLIST_INSERT_HEAD(&gmm->rdl_list, rdl, next); |
279 | + | 261 | + |
280 | + ret = guest_memfd_for_each_populated_range(gmm, section, rdl, | 262 | + ret = guest_memfd_for_each_populated_section(gmm, section, rdl, |
281 | + guest_memfd_notify_populate_cb); | 263 | + guest_memfd_notify_populate_cb); |
282 | + if (ret) { | 264 | + if (ret) { |
283 | + error_report("%s: Failed to register RAM discard listener: %s", __func__, | 265 | + error_report("%s: Failed to register RAM discard listener: %s", __func__, |
284 | + strerror(-ret)); | 266 | + strerror(-ret)); |
285 | + } | 267 | + } |
286 | +} | 268 | +} |
... | ... | ||
292 | + int ret; | 274 | + int ret; |
293 | + | 275 | + |
294 | + g_assert(rdl->section); | 276 | + g_assert(rdl->section); |
295 | + g_assert(rdl->section->mr == gmm->mr); | 277 | + g_assert(rdl->section->mr == gmm->mr); |
296 | + | 278 | + |
297 | + ret = guest_memfd_for_each_populated_range(gmm, rdl->section, rdl, | 279 | + ret = guest_memfd_for_each_populated_section(gmm, rdl->section, rdl, |
298 | + guest_memfd_notify_discard_cb); | 280 | + guest_memfd_notify_discard_cb); |
299 | + if (ret) { | 281 | + if (ret) { |
300 | + error_report("%s: Failed to unregister RAM discard listener: %s", __func__, | 282 | + error_report("%s: Failed to unregister RAM discard listener: %s", __func__, |
301 | + strerror(-ret)); | 283 | + strerror(-ret)); |
302 | + } | 284 | + } |
303 | + | 285 | + |
... | ... | ||
327 | +{ | 309 | +{ |
328 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); | 310 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); |
329 | + struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; | 311 | + struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; |
330 | + | 312 | + |
331 | + g_assert(section->mr == gmm->mr); | 313 | + g_assert(section->mr == gmm->mr); |
332 | + return guest_memfd_for_each_populated_range(gmm, section, &data, | 314 | + return guest_memfd_for_each_populated_section(gmm, section, &data, |
333 | + guest_memfd_rdm_replay_populated_cb); | 315 | + guest_memfd_rdm_replay_populated_cb); |
334 | +} | 316 | +} |
335 | + | 317 | + |
336 | +static int guest_memfd_rdm_replay_discarded_cb(MemoryRegionSection *section, void *arg) | 318 | +static int guest_memfd_rdm_replay_discarded_cb(MemoryRegionSection *section, void *arg) |
337 | +{ | 319 | +{ |
338 | + struct GuestMemfdReplayData *data = arg; | 320 | + struct GuestMemfdReplayData *data = arg; |
... | ... | ||
350 | +{ | 332 | +{ |
351 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); | 333 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); |
352 | + struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; | 334 | + struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; |
353 | + | 335 | + |
354 | + g_assert(section->mr == gmm->mr); | 336 | + g_assert(section->mr == gmm->mr); |
355 | + guest_memfd_for_each_discarded_range(gmm, section, &data, | 337 | + guest_memfd_for_each_discarded_section(gmm, section, &data, |
356 | + guest_memfd_rdm_replay_discarded_cb); | 338 | + guest_memfd_rdm_replay_discarded_cb); |
357 | +} | 339 | +} |
358 | + | 340 | + |
359 | +static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, | 341 | +static void guest_memfd_manager_init(Object *obj) |
360 | + uint64_t region_size) | ||
361 | +{ | 342 | +{ |
362 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); | 343 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); |
363 | + uint64_t bitmap_size = ROUND_UP(region_size, gmm->block_size) / gmm->block_size; | 344 | + |
364 | + | ||
365 | + gmm->mr = mr; | ||
366 | + gmm->discard_bitmap_size = bitmap_size; | ||
367 | + gmm->discard_bitmap = bitmap_new(bitmap_size); | ||
368 | +} | ||
369 | + | ||
370 | +static void guest_memfd_manager_init(Object *obj) | ||
371 | +{ | ||
372 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); | ||
373 | + | ||
374 | + gmm->block_size = qemu_real_host_page_size(); | ||
375 | + QLIST_INIT(&gmm->rdl_list); | 345 | + QLIST_INIT(&gmm->rdl_list); |
376 | +} | 346 | +} |
377 | + | 347 | + |
378 | +static void guest_memfd_manager_finalize(Object *obj) | 348 | +static void guest_memfd_manager_finalize(Object *obj) |
379 | +{ | 349 | +{ |
380 | + g_free(GUEST_MEMFD_MANAGER(obj)->discard_bitmap); | 350 | + g_free(GUEST_MEMFD_MANAGER(obj)->bitmap); |
381 | +} | 351 | +} |
382 | + | 352 | + |
383 | +static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) | 353 | +static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) |
384 | +{ | 354 | +{ |
385 | + GuestMemfdManagerClass *gmmc = GUEST_MEMFD_MANAGER_CLASS(oc); | ||
386 | + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); | 355 | + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); |
387 | + | ||
388 | + gmmc->realize = guest_memfd_manager_realize; | ||
389 | + | 356 | + |
390 | + rdmc->get_min_granularity = guest_memfd_rdm_get_min_granularity; | 357 | + rdmc->get_min_granularity = guest_memfd_rdm_get_min_granularity; |
391 | + rdmc->register_listener = guest_memfd_rdm_register_listener; | 358 | + rdmc->register_listener = guest_memfd_rdm_register_listener; |
392 | + rdmc->unregister_listener = guest_memfd_rdm_unregister_listener; | 359 | + rdmc->unregister_listener = guest_memfd_rdm_unregister_listener; |
393 | + rdmc->is_populated = guest_memfd_rdm_is_populated; | 360 | + rdmc->is_populated = guest_memfd_rdm_is_populated; |
... | ... | diff view generated by jsdifflib |
1 | Introduce a helper function within RamDiscardManager to efficiently | 1 | Introduce a new state_change() callback in GuestMemfdManagerClass to |
---|---|---|---|
2 | notify all registered RamDiscardListeners, including VFIO listeners | 2 | efficiently notify all registered RamDiscardListeners, including VFIO |
3 | about the memory conversion events between shared and private in | 3 | listeners about the memory conversion events in guest_memfd. The |
4 | guest_memfd. The existing VFIO listener can dynamically DMA map/unmap | 4 | existing VFIO listener can dynamically DMA map/unmap the shared pages |
5 | the shared pages based on the conversion type: | 5 | based on conversion types: |
6 | - For conversions from shared to private, the VFIO system ensures the | 6 | - For conversions from shared to private, the VFIO system ensures the |
7 | discarding of shared mapping from the IOMMU. | 7 | discarding of shared mapping from the IOMMU. |
8 | - For conversions from private to shared, it triggers the population of | 8 | - For conversions from private to shared, it triggers the population of |
9 | the shared mapping into the IOMMU. | 9 | the shared mapping into the IOMMU. |
10 | 10 | ||
11 | Additionally, there could be some special conversion requests: | 11 | Additionally, there could be some special conversion requests: |
12 | - When a conversion request is made for a page already in the desired | 12 | - When a conversion request is made for a page already in the desired |
13 | state (either private or shared), the helper simply returns success. | 13 | state, the helper simply returns success. |
14 | - For requests involving a range partially in the desired state, only | 14 | - For requests involving a range partially in the desired state, only |
15 | the necessary segments are converted, ensuring the entire range | 15 | the necessary segments are converted, ensuring the entire range |
16 | complies with the request efficiently. | 16 | complies with the request efficiently. |
17 | - In scenarios where a conversion request is declined by other systems, | 17 | - In scenarios where a conversion request is declined by other systems, |
18 | such as a failure from VFIO during notify_populate(), the helper will | 18 | such as a failure from VFIO during notify_populate(), the helper will |
19 | roll back the request, maintaining consistency. | 19 | roll back the request, maintaining consistency. |
20 | 20 | ||
21 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | 21 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> |
22 | --- | 22 | --- |
23 | include/sysemu/guest-memfd-manager.h | 3 + | 23 | include/sysemu/guest-memfd-manager.h | 3 + |
24 | system/guest-memfd-manager.c | 141 +++++++++++++++++++++++++++ | 24 | system/guest-memfd-manager.c | 144 +++++++++++++++++++++++++++ |
25 | 2 files changed, 144 insertions(+) | 25 | 2 files changed, 147 insertions(+) |
26 | 26 | ||
27 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h | 27 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h |
28 | index XXXXXXX..XXXXXXX 100644 | 28 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/include/sysemu/guest-memfd-manager.h | 29 | --- a/include/sysemu/guest-memfd-manager.h |
30 | +++ b/include/sysemu/guest-memfd-manager.h | 30 | +++ b/include/sysemu/guest-memfd-manager.h |
31 | @@ -XXX,XX +XXX,XX @@ struct GuestMemfdManagerClass { | 31 | @@ -XXX,XX +XXX,XX @@ struct GuestMemfdManager { |
32 | void (*realize)(Object *gmm, MemoryRegion *mr, uint64_t region_size); | 32 | |
33 | struct GuestMemfdManagerClass { | ||
34 | ObjectClass parent_class; | ||
35 | + | ||
36 | + int (*state_change)(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, | ||
37 | + bool shared_to_private); | ||
33 | }; | 38 | }; |
34 | 39 | ||
35 | +int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, | ||
36 | + bool shared_to_private); | ||
37 | + | ||
38 | #endif | 40 | #endif |
39 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c | 41 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c |
40 | index XXXXXXX..XXXXXXX 100644 | 42 | index XXXXXXX..XXXXXXX 100644 |
41 | --- a/system/guest-memfd-manager.c | 43 | --- a/system/guest-memfd-manager.c |
42 | +++ b/system/guest-memfd-manager.c | 44 | +++ b/system/guest-memfd-manager.c |
43 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, | 45 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, |
44 | guest_memfd_rdm_replay_discarded_cb); | 46 | guest_memfd_rdm_replay_discarded_cb); |
45 | } | 47 | } |
46 | 48 | ||
47 | +static bool guest_memfd_is_valid_range(GuestMemfdManager *gmm, | 49 | +static bool guest_memfd_is_valid_range(GuestMemfdManager *gmm, |
48 | + uint64_t offset, uint64_t size) | 50 | + uint64_t offset, uint64_t size) |
49 | +{ | 51 | +{ |
... | ... | ||
70 | + RamDiscardListener *rdl; | 72 | + RamDiscardListener *rdl; |
71 | + | 73 | + |
72 | + QLIST_FOREACH(rdl, &gmm->rdl_list, next) { | 74 | + QLIST_FOREACH(rdl, &gmm->rdl_list, next) { |
73 | + MemoryRegionSection tmp = *rdl->section; | 75 | + MemoryRegionSection tmp = *rdl->section; |
74 | + | 76 | + |
75 | + if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { | 77 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { |
76 | + continue; | 78 | + continue; |
77 | + } | 79 | + } |
78 | + | 80 | + |
79 | + guest_memfd_for_each_populated_range(gmm, &tmp, rdl, | 81 | + guest_memfd_for_each_populated_section(gmm, &tmp, rdl, |
80 | + guest_memfd_notify_discard_cb); | 82 | + guest_memfd_notify_discard_cb); |
81 | + } | 83 | + } |
82 | +} | 84 | +} |
83 | + | 85 | + |
84 | + | 86 | + |
85 | +static int guest_memfd_notify_populate(GuestMemfdManager *gmm, | 87 | +static int guest_memfd_notify_populate(GuestMemfdManager *gmm, |
... | ... | ||
89 | + int ret = 0; | 91 | + int ret = 0; |
90 | + | 92 | + |
91 | + QLIST_FOREACH(rdl, &gmm->rdl_list, next) { | 93 | + QLIST_FOREACH(rdl, &gmm->rdl_list, next) { |
92 | + MemoryRegionSection tmp = *rdl->section; | 94 | + MemoryRegionSection tmp = *rdl->section; |
93 | + | 95 | + |
94 | + if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { | 96 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { |
95 | + continue; | 97 | + continue; |
96 | + } | 98 | + } |
97 | + | 99 | + |
98 | + ret = guest_memfd_for_each_discarded_range(gmm, &tmp, rdl, | 100 | + ret = guest_memfd_for_each_discarded_section(gmm, &tmp, rdl, |
99 | + guest_memfd_notify_populate_cb); | 101 | + guest_memfd_notify_populate_cb); |
100 | + if (ret) { | 102 | + if (ret) { |
101 | + break; | 103 | + break; |
102 | + } | 104 | + } |
103 | + } | 105 | + } |
104 | + | 106 | + |
... | ... | ||
108 | + MemoryRegionSection tmp = *rdl2->section; | 110 | + MemoryRegionSection tmp = *rdl2->section; |
109 | + | 111 | + |
110 | + if (rdl2 == rdl) { | 112 | + if (rdl2 == rdl) { |
111 | + break; | 113 | + break; |
112 | + } | 114 | + } |
113 | + if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { | 115 | + if (!memory_region_section_intersect_range(&tmp, offset, size)) { |
114 | + continue; | 116 | + continue; |
115 | + } | 117 | + } |
116 | + | 118 | + |
117 | + guest_memfd_for_each_discarded_range(gmm, &tmp, rdl2, | 119 | + guest_memfd_for_each_discarded_section(gmm, &tmp, rdl2, |
118 | + guest_memfd_notify_discard_cb); | 120 | + guest_memfd_notify_discard_cb); |
119 | + } | 121 | + } |
120 | + } | 122 | + } |
121 | + return ret; | 123 | + return ret; |
122 | +} | 124 | +} |
123 | + | 125 | + |
... | ... | ||
127 | + const unsigned long first_bit = offset / gmm->block_size; | 129 | + const unsigned long first_bit = offset / gmm->block_size; |
128 | + const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; | 130 | + const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; |
129 | + unsigned long found_bit; | 131 | + unsigned long found_bit; |
130 | + | 132 | + |
131 | + /* We fake a shorter bitmap to avoid searching too far. */ | 133 | + /* We fake a shorter bitmap to avoid searching too far. */ |
132 | + found_bit = find_next_bit(gmm->discard_bitmap, last_bit + 1, first_bit); | 134 | + found_bit = find_next_zero_bit(gmm->bitmap, last_bit + 1, first_bit); |
133 | + return found_bit > last_bit; | 135 | + return found_bit > last_bit; |
134 | +} | 136 | +} |
135 | + | 137 | + |
136 | +static bool guest_memfd_is_range_discarded(GuestMemfdManager *gmm, | 138 | +static bool guest_memfd_is_range_discarded(GuestMemfdManager *gmm, |
137 | + uint64_t offset, uint64_t size) | 139 | + uint64_t offset, uint64_t size) |
138 | +{ | 140 | +{ |
139 | + const unsigned long first_bit = offset / gmm->block_size; | 141 | + const unsigned long first_bit = offset / gmm->block_size; |
140 | + const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; | 142 | + const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; |
141 | + unsigned long found_bit; | 143 | + unsigned long found_bit; |
142 | + | 144 | + |
143 | + /* We fake a shorter bitmap to avoid searching too far. */ | 145 | + /* We fake a shorter bitmap to avoid searching too far. */ |
144 | + found_bit = find_next_zero_bit(gmm->discard_bitmap, last_bit + 1, first_bit); | 146 | + found_bit = find_next_bit(gmm->bitmap, last_bit + 1, first_bit); |
145 | + return found_bit > last_bit; | 147 | + return found_bit > last_bit; |
146 | +} | 148 | +} |
147 | + | 149 | + |
148 | +int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, | 150 | +static int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, |
149 | + bool shared_to_private) | 151 | + uint64_t size, bool shared_to_private) |
150 | +{ | 152 | +{ |
151 | + int ret = 0; | 153 | + int ret = 0; |
152 | + | 154 | + |
153 | + if (!guest_memfd_is_valid_range(gmm, offset, size)) { | 155 | + if (!guest_memfd_is_valid_range(gmm, offset, size)) { |
154 | + error_report("%s, invalid range: offset 0x%lx, size 0x%lx", | 156 | + error_report("%s, invalid range: offset 0x%lx, size 0x%lx", |
... | ... | ||
169 | + | 171 | + |
170 | + if (!ret) { | 172 | + if (!ret) { |
171 | + unsigned long first_bit = offset / gmm->block_size; | 173 | + unsigned long first_bit = offset / gmm->block_size; |
172 | + unsigned long nbits = size / gmm->block_size; | 174 | + unsigned long nbits = size / gmm->block_size; |
173 | + | 175 | + |
174 | + g_assert((first_bit + nbits) <= gmm->discard_bitmap_size); | 176 | + g_assert((first_bit + nbits) <= gmm->bitmap_size); |
175 | + | 177 | + |
176 | + if (shared_to_private) { | 178 | + if (shared_to_private) { |
177 | + bitmap_set(gmm->discard_bitmap, first_bit, nbits); | 179 | + bitmap_clear(gmm->bitmap, first_bit, nbits); |
178 | + } else { | 180 | + } else { |
179 | + bitmap_clear(gmm->discard_bitmap, first_bit, nbits); | 181 | + bitmap_set(gmm->bitmap, first_bit, nbits); |
180 | + } | 182 | + } |
181 | + | 183 | + |
182 | + return 0; | 184 | + return 0; |
183 | + } | 185 | + } |
184 | + | 186 | + |
185 | + return ret; | 187 | + return ret; |
186 | +} | 188 | +} |
187 | + | 189 | + |
188 | static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, | 190 | static void guest_memfd_manager_init(Object *obj) |
189 | uint64_t region_size) | ||
190 | { | 191 | { |
192 | GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); | ||
193 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_manager_finalize(Object *obj) | ||
194 | |||
195 | static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) | ||
196 | { | ||
197 | + GuestMemfdManagerClass *gmmc = GUEST_MEMFD_MANAGER_CLASS(oc); | ||
198 | RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); | ||
199 | |||
200 | + gmmc->state_change = guest_memfd_state_change; | ||
201 | + | ||
202 | rdmc->get_min_granularity = guest_memfd_rdm_get_min_granularity; | ||
203 | rdmc->register_listener = guest_memfd_rdm_register_listener; | ||
204 | rdmc->unregister_listener = guest_memfd_rdm_unregister_listener; | ||
191 | -- | 205 | -- |
192 | 2.43.5 | 206 | 2.43.5 | diff view generated by jsdifflib |
1 | Introduce a helper to trigger the state_change() callback of the class. | ||
---|---|---|---|
1 | Once exit to userspace to convert the page from private to shared or | 2 | Once exit to userspace to convert the page from private to shared or |
2 | vice versa at runtime, notify the state change via the | 3 | vice versa at runtime, notify the event via the helper so that other |
3 | guest_memfd_state_change() helper so that other registered subsystems | 4 | registered subsystems like VFIO can be notified. |
4 | like VFIO can be notified. | ||
5 | 5 | ||
6 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | 6 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> |
7 | --- | 7 | --- |
8 | accel/kvm/kvm-all.c | 7 +++++++ | 8 | accel/kvm/kvm-all.c | 4 ++++ |
9 | 1 file changed, 7 insertions(+) | 9 | include/sysemu/guest-memfd-manager.h | 15 +++++++++++++++ |
10 | 2 files changed, 19 insertions(+) | ||
10 | 11 | ||
11 | diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c | 12 | diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c |
12 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
13 | --- a/accel/kvm/kvm-all.c | 14 | --- a/accel/kvm/kvm-all.c |
14 | +++ b/accel/kvm/kvm-all.c | 15 | +++ b/accel/kvm/kvm-all.c |
... | ... | ||
19 | +#include "sysemu/guest-memfd-manager.h" | 20 | +#include "sysemu/guest-memfd-manager.h" |
20 | 21 | ||
21 | #include "hw/boards.h" | 22 | #include "hw/boards.h" |
22 | #include "sysemu/stats.h" | 23 | #include "sysemu/stats.h" |
23 | @@ -XXX,XX +XXX,XX @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) | 24 | @@ -XXX,XX +XXX,XX @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) |
24 | RAMBlock *rb; | ||
25 | void *addr; | ||
26 | int ret = -1; | ||
27 | + GuestMemfdManager *gmm; | ||
28 | |||
29 | trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared"); | ||
30 | |||
31 | @@ -XXX,XX +XXX,XX @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) | ||
32 | addr = memory_region_get_ram_ptr(mr) + section.offset_within_region; | 25 | addr = memory_region_get_ram_ptr(mr) + section.offset_within_region; |
33 | rb = qemu_ram_block_from_host(addr, false, &offset); | 26 | rb = qemu_ram_block_from_host(addr, false, &offset); |
34 | 27 | ||
35 | + gmm = GUEST_MEMFD_MANAGER(mr->rdm); | 28 | + guest_memfd_manager_state_change(GUEST_MEMFD_MANAGER(mr->rdm), offset, |
36 | + if (gmm) { | 29 | + size, to_private); |
37 | + guest_memfd_state_change(gmm, offset, size, to_private); | ||
38 | + } | ||
39 | + | 30 | + |
40 | if (to_private) { | 31 | if (to_private) { |
41 | if (rb->page_size != qemu_real_host_page_size()) { | 32 | if (rb->page_size != qemu_real_host_page_size()) { |
42 | /* | 33 | /* |
34 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h | ||
35 | index XXXXXXX..XXXXXXX 100644 | ||
36 | --- a/include/sysemu/guest-memfd-manager.h | ||
37 | +++ b/include/sysemu/guest-memfd-manager.h | ||
38 | @@ -XXX,XX +XXX,XX @@ struct GuestMemfdManagerClass { | ||
39 | bool shared_to_private); | ||
40 | }; | ||
41 | |||
42 | +static inline int guest_memfd_manager_state_change(GuestMemfdManager *gmm, uint64_t offset, | ||
43 | + uint64_t size, bool shared_to_private) | ||
44 | +{ | ||
45 | + GuestMemfdManagerClass *klass; | ||
46 | + | ||
47 | + g_assert(gmm); | ||
48 | + klass = GUEST_MEMFD_MANAGER_GET_CLASS(gmm); | ||
49 | + | ||
50 | + if (klass->state_change) { | ||
51 | + return klass->state_change(gmm, offset, size, shared_to_private); | ||
52 | + } | ||
53 | + | ||
54 | + return 0; | ||
55 | +} | ||
56 | + | ||
57 | #endif | ||
43 | -- | 58 | -- |
44 | 2.43.5 | 59 | 2.43.5 | diff view generated by jsdifflib |
1 | Instantiate a new guest_memfd_manager object and register it in the | 1 | Introduce the realize()/unrealize() callbacks to initialize/uninitialize |
---|---|---|---|
2 | target MemoryRegion. From this point, other subsystems such as VFIO can | 2 | the new guest_memfd_manager object and register/unregister it in the |
3 | register their listeners in guest_memfd_manager and receive conversion | 3 | target MemoryRegion. |
4 | events through RamDiscardManager. | 4 | |
5 | Guest_memfd was initially set to shared until the commit bd3bcf6962 | ||
6 | ("kvm/memory: Make memory type private by default if it has guest memfd | ||
7 | backend"). To align with this change, the default state in | ||
8 | guest_memfd_manager is set to private. (The bitmap is cleared to 0). | ||
9 | Additionally, setting the default to private can also reduce the | ||
10 | overhead of mapping shared pages into IOMMU by VFIO during the bootup stage. | ||
5 | 11 | ||
6 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | 12 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> |
7 | --- | 13 | --- |
8 | system/physmem.c | 9 +++++++++ | 14 | include/sysemu/guest-memfd-manager.h | 27 +++++++++++++++++++++++++++ |
9 | 1 file changed, 9 insertions(+) | 15 | system/guest-memfd-manager.c | 28 +++++++++++++++++++++++++++- |
16 | system/physmem.c | 7 +++++++ | ||
17 | 3 files changed, 61 insertions(+), 1 deletion(-) | ||
10 | 18 | ||
19 | diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/include/sysemu/guest-memfd-manager.h | ||
22 | +++ b/include/sysemu/guest-memfd-manager.h | ||
23 | @@ -XXX,XX +XXX,XX @@ struct GuestMemfdManager { | ||
24 | struct GuestMemfdManagerClass { | ||
25 | ObjectClass parent_class; | ||
26 | |||
27 | + void (*realize)(GuestMemfdManager *gmm, MemoryRegion *mr, uint64_t region_size); | ||
28 | + void (*unrealize)(GuestMemfdManager *gmm); | ||
29 | int (*state_change)(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, | ||
30 | bool shared_to_private); | ||
31 | }; | ||
32 | @@ -XXX,XX +XXX,XX @@ static inline int guest_memfd_manager_state_change(GuestMemfdManager *gmm, uint6 | ||
33 | return 0; | ||
34 | } | ||
35 | |||
36 | +static inline void guest_memfd_manager_realize(GuestMemfdManager *gmm, | ||
37 | + MemoryRegion *mr, uint64_t region_size) | ||
38 | +{ | ||
39 | + GuestMemfdManagerClass *klass; | ||
40 | + | ||
41 | + g_assert(gmm); | ||
42 | + klass = GUEST_MEMFD_MANAGER_GET_CLASS(gmm); | ||
43 | + | ||
44 | + if (klass->realize) { | ||
45 | + klass->realize(gmm, mr, region_size); | ||
46 | + } | ||
47 | +} | ||
48 | + | ||
49 | +static inline void guest_memfd_manager_unrealize(GuestMemfdManager *gmm) | ||
50 | +{ | ||
51 | + GuestMemfdManagerClass *klass; | ||
52 | + | ||
53 | + g_assert(gmm); | ||
54 | + klass = GUEST_MEMFD_MANAGER_GET_CLASS(gmm); | ||
55 | + | ||
56 | + if (klass->unrealize) { | ||
57 | + klass->unrealize(gmm); | ||
58 | + } | ||
59 | +} | ||
60 | + | ||
61 | #endif | ||
62 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c | ||
63 | index XXXXXXX..XXXXXXX 100644 | ||
64 | --- a/system/guest-memfd-manager.c | ||
65 | +++ b/system/guest-memfd-manager.c | ||
66 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, | ||
67 | return ret; | ||
68 | } | ||
69 | |||
70 | +static void guest_memfd_manager_realizefn(GuestMemfdManager *gmm, MemoryRegion *mr, | ||
71 | + uint64_t region_size) | ||
72 | +{ | ||
73 | + uint64_t bitmap_size; | ||
74 | + | ||
75 | + gmm->block_size = qemu_real_host_page_size(); | ||
76 | + bitmap_size = ROUND_UP(region_size, gmm->block_size) / gmm->block_size; | ||
77 | + | ||
78 | + gmm->mr = mr; | ||
79 | + gmm->bitmap_size = bitmap_size; | ||
80 | + gmm->bitmap = bitmap_new(bitmap_size); | ||
81 | + | ||
82 | + memory_region_set_ram_discard_manager(gmm->mr, RAM_DISCARD_MANAGER(gmm)); | ||
83 | +} | ||
84 | + | ||
85 | +static void guest_memfd_manager_unrealizefn(GuestMemfdManager *gmm) | ||
86 | +{ | ||
87 | + memory_region_set_ram_discard_manager(gmm->mr, NULL); | ||
88 | + | ||
89 | + g_free(gmm->bitmap); | ||
90 | + gmm->bitmap = NULL; | ||
91 | + gmm->bitmap_size = 0; | ||
92 | + gmm->mr = NULL; | ||
93 | +} | ||
94 | + | ||
95 | static void guest_memfd_manager_init(Object *obj) | ||
96 | { | ||
97 | GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); | ||
98 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_manager_init(Object *obj) | ||
99 | |||
100 | static void guest_memfd_manager_finalize(Object *obj) | ||
101 | { | ||
102 | - g_free(GUEST_MEMFD_MANAGER(obj)->bitmap); | ||
103 | } | ||
104 | |||
105 | static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) | ||
106 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) | ||
107 | RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); | ||
108 | |||
109 | gmmc->state_change = guest_memfd_state_change; | ||
110 | + gmmc->realize = guest_memfd_manager_realizefn; | ||
111 | + gmmc->unrealize = guest_memfd_manager_unrealizefn; | ||
112 | |||
113 | rdmc->get_min_granularity = guest_memfd_rdm_get_min_granularity; | ||
114 | rdmc->register_listener = guest_memfd_rdm_register_listener; | ||
11 | diff --git a/system/physmem.c b/system/physmem.c | 115 | diff --git a/system/physmem.c b/system/physmem.c |
12 | index XXXXXXX..XXXXXXX 100644 | 116 | index XXXXXXX..XXXXXXX 100644 |
13 | --- a/system/physmem.c | 117 | --- a/system/physmem.c |
14 | +++ b/system/physmem.c | 118 | +++ b/system/physmem.c |
15 | @@ -XXX,XX +XXX,XX @@ | 119 | @@ -XXX,XX +XXX,XX @@ |
16 | #include "sysemu/hostmem.h" | 120 | #include "sysemu/hostmem.h" |
17 | #include "sysemu/hw_accel.h" | 121 | #include "sysemu/hw_accel.h" |
18 | #include "sysemu/xen-mapcache.h" | 122 | #include "sysemu/xen-mapcache.h" |
19 | +#include "sysemu/guest-memfd-manager.h" | 123 | +#include "sysemu/guest-memfd-manager.h" |
20 | #include "trace/trace-root.h" | 124 | #include "trace.h" |
21 | 125 | ||
22 | #ifdef CONFIG_FALLOCATE_PUNCH_HOLE | 126 | #ifdef CONFIG_FALLOCATE_PUNCH_HOLE |
23 | @@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp) | 127 | @@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp) |
24 | qemu_mutex_unlock_ramlist(); | 128 | qemu_mutex_unlock_ramlist(); |
25 | goto out_free; | 129 | goto out_free; |
26 | } | 130 | } |
27 | + | 131 | + |
28 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(object_new(TYPE_GUEST_MEMFD_MANAGER)); | 132 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(object_new(TYPE_GUEST_MEMFD_MANAGER)); |
29 | + GuestMemfdManagerClass *gmmc = GUEST_MEMFD_MANAGER_GET_CLASS(gmm); | 133 | + guest_memfd_manager_realize(gmm, new_block->mr, new_block->mr->size); |
30 | + g_assert(new_block->mr); | ||
31 | + gmmc->realize(OBJECT(gmm), new_block->mr, new_block->mr->size); | ||
32 | + memory_region_set_ram_discard_manager(gmm->mr, RAM_DISCARD_MANAGER(gmm)); | ||
33 | } | 134 | } |
34 | 135 | ||
35 | new_ram_size = MAX(old_ram_size, | 136 | ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; |
36 | @@ -XXX,XX +XXX,XX @@ static void reclaim_ramblock(RAMBlock *block) | 137 | @@ -XXX,XX +XXX,XX @@ static void reclaim_ramblock(RAMBlock *block) |
37 | 138 | ||
38 | if (block->guest_memfd >= 0) { | 139 | if (block->guest_memfd >= 0) { |
39 | close(block->guest_memfd); | 140 | close(block->guest_memfd); |
40 | + g_assert(block->mr); | 141 | + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(block->mr->rdm); |
41 | + object_unref(OBJECT(block->mr->rdm)); | 142 | + guest_memfd_manager_unrealize(gmm); |
143 | + object_unref(OBJECT(gmm)); | ||
42 | ram_block_discard_require(false); | 144 | ram_block_discard_require(false); |
43 | } | 145 | } |
44 | 146 | ||
45 | -- | 147 | -- |
46 | 2.43.5 | 148 | 2.43.5 | diff view generated by jsdifflib |
... | ... | ||
---|---|---|---|
12 | +++ b/system/physmem.c | 12 | +++ b/system/physmem.c |
13 | @@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp) | 13 | @@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp) |
14 | assert(kvm_enabled()); | 14 | assert(kvm_enabled()); |
15 | assert(new_block->guest_memfd < 0); | 15 | assert(new_block->guest_memfd < 0); |
16 | 16 | ||
17 | - if (ram_block_discard_require(true) < 0) { | 17 | - ret = ram_block_discard_require(true); |
18 | + if (ram_block_coordinated_discard_require(true) < 0) { | 18 | + ret = ram_block_coordinated_discard_require(true); |
19 | error_setg_errno(errp, errno, | 19 | if (ret < 0) { |
20 | error_setg_errno(errp, -ret, | ||
20 | "cannot set up private guest memory: discard currently blocked"); | 21 | "cannot set up private guest memory: discard currently blocked"); |
21 | error_append_hint(errp, "Are you using assigned devices?\n"); | ||
22 | -- | 22 | -- |
23 | 2.43.5 | 23 | 2.43.5 | diff view generated by jsdifflib |
1 | guest_memfd was initially set to shared until the commit bd3bcf6962 | 1 | For each ram_discard_manager helper, add a new argument 'is_private' to |
---|---|---|---|
2 | ("kvm/memory: Make memory type private by default if it has guest memfd | 2 | indicate the request attribute. If is_private is true, the operation |
3 | backend"). To align with this change, the default state in | 3 | targets the private range in the section. For example, |
4 | guest_memfd_manager is set to discarded. | 4 | replay_populate(true) will replay the populate operation on private part |
5 | in the MemoryRegionSection, while replay_populate(false) will replay | ||
6 | population on shared part. | ||
5 | 7 | ||
6 | One concern raised by this commit is the handling of the virtual BIOS. | 8 | This helps to distinguish between the states of private/shared and |
7 | The virtual BIOS loads its image into the shared memory of guest_memfd. | 9 | discarded/populated. It is essential for guest_memfd_manager which uses |
8 | However, during the region_commit() stage, the memory attribute is | 10 | RamDiscardManager interface but can't treat private memory as discarded |
9 | set to private while its shared memory remains valid. This mismatch | 11 | memory. This is because it does not align with the expectation of |
10 | persists until the shared content is copied to the private region. | 12 | current RamDiscardManager users (e.g. live migration), who expect that |
11 | Fortunately, this interval only exits during setup stage and currently, | 13 | discarded memory is hot-removed and can be skipped when processing guest |
12 | only the guest_memfd_manager is concerned with the state of the | 14 | memory. Treating private memory as discarded won't work in the future if |
13 | guest_memfd at that stage. For simplicity, the default bitmap in | 15 | live migration needs to handle private memory. For example, live |
14 | guest_memfd_manager is set to discarded (private). This is feasible | 16 | migration needs to migrate private memory. |
15 | because the shared content of the virtual BIOS will eventually be | ||
16 | discarded and there are no requests to DMA access to this shared part | ||
17 | during this period. | ||
18 | 17 | ||
19 | Additionally, setting the default to private can also reduce the | 18 | The user of the helper needs to figure out which attribute to |
20 | overhead of mapping shared pages into IOMMU by VFIO at the bootup stage. | 19 | manipulate. For legacy VM case, use is_private=true by default. Private |
20 | attribute is only valid in a guest_memfd based VM. | ||
21 | |||
22 | Opportunistically rename the guest_memfd_for_each_{discarded, | ||
23 | populated}_section() to guest_memfd_for_each_{private, shared}_section() | ||
24 | to distinguish between private/shared and discarded/populated at the | ||
25 | same time. | ||
21 | 26 | ||
22 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> | 27 | Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> |
23 | --- | 28 | --- |
24 | system/guest-memfd-manager.c | 1 + | 29 | hw/vfio/common.c | 22 ++++++-- |
25 | 1 file changed, 1 insertion(+) | 30 | hw/virtio/virtio-mem.c | 23 ++++---- |
31 | include/exec/memory.h | 23 ++++++-- | ||
32 | migration/ram.c | 14 ++--- | ||
33 | system/guest-memfd-manager.c | 106 +++++++++++++++++++++++------------ | ||
34 | system/memory.c | 13 +++-- | ||
35 | system/memory_mapping.c | 4 +- | ||
36 | 7 files changed, 135 insertions(+), 70 deletions(-) | ||
26 | 37 | ||
38 | diff --git a/hw/vfio/common.c b/hw/vfio/common.c | ||
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/hw/vfio/common.c | ||
41 | +++ b/hw/vfio/common.c | ||
42 | @@ -XXX,XX +XXX,XX @@ out: | ||
43 | } | ||
44 | |||
45 | static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, | ||
46 | - MemoryRegionSection *section) | ||
47 | + MemoryRegionSection *section, | ||
48 | + bool is_private) | ||
49 | { | ||
50 | VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, | ||
51 | listener); | ||
52 | @@ -XXX,XX +XXX,XX @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, | ||
53 | const hwaddr iova = section->offset_within_address_space; | ||
54 | int ret; | ||
55 | |||
56 | + if (is_private) { | ||
57 | + /* Not support discard private memory yet. */ | ||
58 | + return; | ||
59 | + } | ||
60 | + | ||
61 | /* Unmap with a single call. */ | ||
62 | ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); | ||
63 | if (ret) { | ||
64 | @@ -XXX,XX +XXX,XX @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, | ||
65 | } | ||
66 | |||
67 | static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, | ||
68 | - MemoryRegionSection *section) | ||
69 | + MemoryRegionSection *section, | ||
70 | + bool is_private) | ||
71 | { | ||
72 | VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, | ||
73 | listener); | ||
74 | @@ -XXX,XX +XXX,XX @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, | ||
75 | void *vaddr; | ||
76 | int ret; | ||
77 | |||
78 | + if (is_private) { | ||
79 | + /* Not support populating private memory yet. */ | ||
80 | + return 0; | ||
81 | + } | ||
82 | + | ||
83 | /* | ||
84 | * Map in (aligned within memory region) minimum granularity, so we can | ||
85 | * unmap in minimum granularity later. | ||
86 | @@ -XXX,XX +XXX,XX @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, | ||
87 | vaddr, section->readonly); | ||
88 | if (ret) { | ||
89 | /* Rollback */ | ||
90 | - vfio_ram_discard_notify_discard(rdl, section); | ||
91 | + vfio_ram_discard_notify_discard(rdl, section, false); | ||
92 | return ret; | ||
93 | } | ||
94 | } | ||
95 | @@ -XXX,XX +XXX,XX @@ out: | ||
96 | } | ||
97 | |||
98 | static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, | ||
99 | - void *opaque) | ||
100 | + bool is_private, void *opaque) | ||
101 | { | ||
102 | const hwaddr size = int128_get64(section->size); | ||
103 | const hwaddr iova = section->offset_within_address_space; | ||
104 | @@ -XXX,XX +XXX,XX @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, | ||
105 | * We only want/can synchronize the bitmap for actually mapped parts - | ||
106 | * which correspond to populated parts. Replay all populated parts. | ||
107 | */ | ||
108 | - return ram_discard_manager_replay_populated(rdm, section, | ||
109 | + return ram_discard_manager_replay_populated(rdm, section, false, | ||
110 | vfio_ram_discard_get_dirty_bitmap, | ||
111 | &vrdl); | ||
112 | } | ||
113 | diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c | ||
114 | index XXXXXXX..XXXXXXX 100644 | ||
115 | --- a/hw/virtio/virtio-mem.c | ||
116 | +++ b/hw/virtio/virtio-mem.c | ||
117 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg) | ||
118 | { | ||
119 | RamDiscardListener *rdl = arg; | ||
120 | |||
121 | - return rdl->notify_populate(rdl, s); | ||
122 | + return rdl->notify_populate(rdl, s, false); | ||
123 | } | ||
124 | |||
125 | static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg) | ||
126 | { | ||
127 | RamDiscardListener *rdl = arg; | ||
128 | |||
129 | - rdl->notify_discard(rdl, s); | ||
130 | + rdl->notify_discard(rdl, s, false); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | @@ -XXX,XX +XXX,XX @@ static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, | ||
135 | if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
136 | continue; | ||
137 | } | ||
138 | - rdl->notify_discard(rdl, &tmp); | ||
139 | + rdl->notify_discard(rdl, &tmp, false); | ||
140 | } | ||
141 | } | ||
142 | |||
143 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, | ||
144 | if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
145 | continue; | ||
146 | } | ||
147 | - ret = rdl->notify_populate(rdl, &tmp); | ||
148 | + ret = rdl->notify_populate(rdl, &tmp, false); | ||
149 | if (ret) { | ||
150 | break; | ||
151 | } | ||
152 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, | ||
153 | if (!memory_region_section_intersect_range(&tmp, offset, size)) { | ||
154 | continue; | ||
155 | } | ||
156 | - rdl2->notify_discard(rdl2, &tmp); | ||
157 | + rdl2->notify_discard(rdl2, &tmp, false); | ||
158 | } | ||
159 | } | ||
160 | return ret; | ||
161 | @@ -XXX,XX +XXX,XX @@ static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem) | ||
162 | |||
163 | QLIST_FOREACH(rdl, &vmem->rdl_list, next) { | ||
164 | if (rdl->double_discard_supported) { | ||
165 | - rdl->notify_discard(rdl, rdl->section); | ||
166 | + rdl->notify_discard(rdl, rdl->section, false); | ||
167 | } else { | ||
168 | virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, | ||
169 | virtio_mem_notify_discard_cb); | ||
170 | @@ -XXX,XX +XXX,XX @@ static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm, | ||
171 | } | ||
172 | |||
173 | static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm, | ||
174 | - const MemoryRegionSection *s) | ||
175 | + const MemoryRegionSection *s, | ||
176 | + bool is_private) | ||
177 | { | ||
178 | const VirtIOMEM *vmem = VIRTIO_MEM(rdm); | ||
179 | uint64_t start_gpa = vmem->addr + s->offset_within_region; | ||
180 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg) | ||
181 | { | ||
182 | struct VirtIOMEMReplayData *data = arg; | ||
183 | |||
184 | - return ((ReplayRamPopulate)data->fn)(s, data->opaque); | ||
185 | + return ((ReplayRamPopulate)data->fn)(s, false, data->opaque); | ||
186 | } | ||
187 | |||
188 | static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm, | ||
189 | MemoryRegionSection *s, | ||
190 | + bool is_private, | ||
191 | ReplayRamPopulate replay_fn, | ||
192 | void *opaque) | ||
193 | { | ||
194 | @@ -XXX,XX +XXX,XX @@ static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s, | ||
195 | { | ||
196 | struct VirtIOMEMReplayData *data = arg; | ||
197 | |||
198 | - ((ReplayRamDiscard)data->fn)(s, data->opaque); | ||
199 | + ((ReplayRamDiscard)data->fn)(s, false, data->opaque); | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm, | ||
204 | MemoryRegionSection *s, | ||
205 | + bool is_private, | ||
206 | ReplayRamDiscard replay_fn, | ||
207 | void *opaque) | ||
208 | { | ||
209 | @@ -XXX,XX +XXX,XX @@ static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm, | ||
210 | g_assert(rdl->section->mr == &vmem->memdev->mr); | ||
211 | if (vmem->size) { | ||
212 | if (rdl->double_discard_supported) { | ||
213 | - rdl->notify_discard(rdl, rdl->section); | ||
214 | + rdl->notify_discard(rdl, rdl->section, false); | ||
215 | } else { | ||
216 | virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, | ||
217 | virtio_mem_notify_discard_cb); | ||
218 | diff --git a/include/exec/memory.h b/include/exec/memory.h | ||
219 | index XXXXXXX..XXXXXXX 100644 | ||
220 | --- a/include/exec/memory.h | ||
221 | +++ b/include/exec/memory.h | ||
222 | @@ -XXX,XX +XXX,XX @@ struct IOMMUMemoryRegionClass { | ||
223 | |||
224 | typedef struct RamDiscardListener RamDiscardListener; | ||
225 | typedef int (*NotifyRamPopulate)(RamDiscardListener *rdl, | ||
226 | - MemoryRegionSection *section); | ||
227 | + MemoryRegionSection *section, | ||
228 | + bool is_private); | ||
229 | typedef void (*NotifyRamDiscard)(RamDiscardListener *rdl, | ||
230 | - MemoryRegionSection *section); | ||
231 | + MemoryRegionSection *section, | ||
232 | + bool is_private); | ||
233 | |||
234 | struct RamDiscardListener { | ||
235 | /* | ||
236 | @@ -XXX,XX +XXX,XX @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl, | ||
237 | rdl->double_discard_supported = double_discard_supported; | ||
238 | } | ||
239 | |||
240 | -typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque); | ||
241 | -typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque); | ||
242 | +typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, bool is_private, void *opaque); | ||
243 | +typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, bool is_private, void *opaque); | ||
244 | |||
245 | /* | ||
246 | * RamDiscardManagerClass: | ||
247 | @@ -XXX,XX +XXX,XX @@ struct RamDiscardManagerClass { | ||
248 | * | ||
249 | * @rdm: the #RamDiscardManager | ||
250 | * @section: the #MemoryRegionSection | ||
251 | + * @is_private: the attribute of the request section | ||
252 | * | ||
253 | * Returns whether the given range is completely populated. | ||
254 | */ | ||
255 | bool (*is_populated)(const RamDiscardManager *rdm, | ||
256 | - const MemoryRegionSection *section); | ||
257 | + const MemoryRegionSection *section, | ||
258 | + bool is_private); | ||
259 | |||
260 | /** | ||
261 | * @replay_populated: | ||
262 | @@ -XXX,XX +XXX,XX @@ struct RamDiscardManagerClass { | ||
263 | * | ||
264 | * @rdm: the #RamDiscardManager | ||
265 | * @section: the #MemoryRegionSection | ||
266 | + * @is_private: the attribute of the populated parts | ||
267 | * @replay_fn: the #ReplayRamPopulate callback | ||
268 | * @opaque: pointer to forward to the callback | ||
269 | * | ||
270 | @@ -XXX,XX +XXX,XX @@ struct RamDiscardManagerClass { | ||
271 | */ | ||
272 | int (*replay_populated)(const RamDiscardManager *rdm, | ||
273 | MemoryRegionSection *section, | ||
274 | + bool is_private, | ||
275 | ReplayRamPopulate replay_fn, void *opaque); | ||
276 | |||
277 | /** | ||
278 | @@ -XXX,XX +XXX,XX @@ struct RamDiscardManagerClass { | ||
279 | * | ||
280 | * @rdm: the #RamDiscardManager | ||
281 | * @section: the #MemoryRegionSection | ||
282 | + * @is_private: the attribute of the discarded parts | ||
283 | * @replay_fn: the #ReplayRamDiscard callback | ||
284 | * @opaque: pointer to forward to the callback | ||
285 | */ | ||
286 | void (*replay_discarded)(const RamDiscardManager *rdm, | ||
287 | MemoryRegionSection *section, | ||
288 | + bool is_private, | ||
289 | ReplayRamDiscard replay_fn, void *opaque); | ||
290 | |||
291 | /** | ||
292 | @@ -XXX,XX +XXX,XX @@ uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, | ||
293 | const MemoryRegion *mr); | ||
294 | |||
295 | bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, | ||
296 | - const MemoryRegionSection *section); | ||
297 | + const MemoryRegionSection *section, | ||
298 | + bool is_private); | ||
299 | |||
300 | int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, | ||
301 | MemoryRegionSection *section, | ||
302 | + bool is_private, | ||
303 | ReplayRamPopulate replay_fn, | ||
304 | void *opaque); | ||
305 | |||
306 | void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, | ||
307 | MemoryRegionSection *section, | ||
308 | + bool is_private, | ||
309 | ReplayRamDiscard replay_fn, | ||
310 | void *opaque); | ||
311 | |||
312 | diff --git a/migration/ram.c b/migration/ram.c | ||
313 | index XXXXXXX..XXXXXXX 100644 | ||
314 | --- a/migration/ram.c | ||
315 | +++ b/migration/ram.c | ||
316 | @@ -XXX,XX +XXX,XX @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, | ||
317 | } | ||
318 | |||
319 | static void dirty_bitmap_clear_section(MemoryRegionSection *section, | ||
320 | - void *opaque) | ||
321 | + bool is_private, void *opaque) | ||
322 | { | ||
323 | const hwaddr offset = section->offset_within_region; | ||
324 | const hwaddr size = int128_get64(section->size); | ||
325 | @@ -XXX,XX +XXX,XX @@ static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) | ||
326 | .size = int128_make64(qemu_ram_get_used_length(rb)), | ||
327 | }; | ||
328 | |||
329 | - ram_discard_manager_replay_discarded(rdm, §ion, | ||
330 | + ram_discard_manager_replay_discarded(rdm, §ion, false, | ||
331 | dirty_bitmap_clear_section, | ||
332 | &cleared_bits); | ||
333 | } | ||
334 | @@ -XXX,XX +XXX,XX @@ bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) | ||
335 | .size = int128_make64(qemu_ram_pagesize(rb)), | ||
336 | }; | ||
337 | |||
338 | - return !ram_discard_manager_is_populated(rdm, §ion); | ||
339 | + return !ram_discard_manager_is_populated(rdm, §ion, false); | ||
340 | } | ||
341 | return false; | ||
342 | } | ||
343 | @@ -XXX,XX +XXX,XX @@ static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, | ||
344 | } | ||
345 | |||
346 | static inline int populate_read_section(MemoryRegionSection *section, | ||
347 | - void *opaque) | ||
348 | + bool is_private, void *opaque) | ||
349 | { | ||
350 | const hwaddr size = int128_get64(section->size); | ||
351 | hwaddr offset = section->offset_within_region; | ||
352 | @@ -XXX,XX +XXX,XX @@ static void ram_block_populate_read(RAMBlock *rb) | ||
353 | .size = rb->mr->size, | ||
354 | }; | ||
355 | |||
356 | - ram_discard_manager_replay_populated(rdm, §ion, | ||
357 | + ram_discard_manager_replay_populated(rdm, §ion, false, | ||
358 | populate_read_section, NULL); | ||
359 | } else { | ||
360 | populate_read_range(rb, 0, rb->used_length); | ||
361 | @@ -XXX,XX +XXX,XX @@ void ram_write_tracking_prepare(void) | ||
362 | } | ||
363 | |||
364 | static inline int uffd_protect_section(MemoryRegionSection *section, | ||
365 | - void *opaque) | ||
366 | + bool is_private, void *opaque) | ||
367 | { | ||
368 | const hwaddr size = int128_get64(section->size); | ||
369 | const hwaddr offset = section->offset_within_region; | ||
370 | @@ -XXX,XX +XXX,XX @@ static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) | ||
371 | .size = rb->mr->size, | ||
372 | }; | ||
373 | |||
374 | - return ram_discard_manager_replay_populated(rdm, §ion, | ||
375 | + return ram_discard_manager_replay_populated(rdm, §ion, false, | ||
376 | uffd_protect_section, | ||
377 | (void *)(uintptr_t)uffd_fd); | ||
378 | } | ||
27 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c | 379 | diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c |
28 | index XXXXXXX..XXXXXXX 100644 | 380 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/system/guest-memfd-manager.c | 381 | --- a/system/guest-memfd-manager.c |
30 | +++ b/system/guest-memfd-manager.c | 382 | +++ b/system/guest-memfd-manager.c |
31 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, | 383 | @@ -XXX,XX +XXX,XX @@ OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(GuestMemfdManager, |
32 | gmm->mr = mr; | 384 | { }) |
33 | gmm->discard_bitmap_size = bitmap_size; | 385 | |
34 | gmm->discard_bitmap = bitmap_new(bitmap_size); | 386 | static bool guest_memfd_rdm_is_populated(const RamDiscardManager *rdm, |
35 | + bitmap_fill(gmm->discard_bitmap, bitmap_size); | 387 | - const MemoryRegionSection *section) |
36 | } | 388 | + const MemoryRegionSection *section, |
37 | 389 | + bool is_private) | |
38 | static void guest_memfd_manager_init(Object *obj) | 390 | { |
391 | const GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); | ||
392 | uint64_t first_bit = section->offset_within_region / gmm->block_size; | ||
393 | uint64_t last_bit = first_bit + int128_get64(section->size) / gmm->block_size - 1; | ||
394 | unsigned long first_discard_bit; | ||
395 | |||
396 | - first_discard_bit = find_next_zero_bit(gmm->bitmap, last_bit + 1, first_bit); | ||
397 | + if (is_private) { | ||
398 | + /* Check if the private section is populated */ | ||
399 | + first_discard_bit = find_next_bit(gmm->bitmap, last_bit + 1, first_bit); | ||
400 | + } else { | ||
401 | + /* Check if the shared section is populated */ | ||
402 | + first_discard_bit = find_next_zero_bit(gmm->bitmap, last_bit + 1, first_bit); | ||
403 | + } | ||
404 | + | ||
405 | return first_discard_bit > last_bit; | ||
406 | } | ||
407 | |||
408 | -typedef int (*guest_memfd_section_cb)(MemoryRegionSection *s, void *arg); | ||
409 | +typedef int (*guest_memfd_section_cb)(MemoryRegionSection *s, bool is_private, | ||
410 | + void *arg); | ||
411 | |||
412 | -static int guest_memfd_notify_populate_cb(MemoryRegionSection *section, void *arg) | ||
413 | +static int guest_memfd_notify_populate_cb(MemoryRegionSection *section, bool is_private, | ||
414 | + void *arg) | ||
415 | { | ||
416 | RamDiscardListener *rdl = arg; | ||
417 | |||
418 | - return rdl->notify_populate(rdl, section); | ||
419 | + return rdl->notify_populate(rdl, section, is_private); | ||
420 | } | ||
421 | |||
422 | -static int guest_memfd_notify_discard_cb(MemoryRegionSection *section, void *arg) | ||
423 | +static int guest_memfd_notify_discard_cb(MemoryRegionSection *section, bool is_private, | ||
424 | + void *arg) | ||
425 | { | ||
426 | RamDiscardListener *rdl = arg; | ||
427 | |||
428 | - rdl->notify_discard(rdl, section); | ||
429 | + rdl->notify_discard(rdl, section, is_private); | ||
430 | |||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | -static int guest_memfd_for_each_populated_section(const GuestMemfdManager *gmm, | ||
435 | - MemoryRegionSection *section, | ||
436 | - void *arg, | ||
437 | - guest_memfd_section_cb cb) | ||
438 | +static int guest_memfd_for_each_shared_section(const GuestMemfdManager *gmm, | ||
439 | + MemoryRegionSection *section, | ||
440 | + bool is_private, | ||
441 | + void *arg, | ||
442 | + guest_memfd_section_cb cb) | ||
443 | { | ||
444 | unsigned long first_one_bit, last_one_bit; | ||
445 | uint64_t offset, size; | ||
446 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_for_each_populated_section(const GuestMemfdManager *gmm, | ||
447 | break; | ||
448 | } | ||
449 | |||
450 | - ret = cb(&tmp, arg); | ||
451 | + ret = cb(&tmp, is_private, arg); | ||
452 | if (ret) { | ||
453 | break; | ||
454 | } | ||
455 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_for_each_populated_section(const GuestMemfdManager *gmm, | ||
456 | return ret; | ||
457 | } | ||
458 | |||
459 | -static int guest_memfd_for_each_discarded_section(const GuestMemfdManager *gmm, | ||
460 | - MemoryRegionSection *section, | ||
461 | - void *arg, | ||
462 | - guest_memfd_section_cb cb) | ||
463 | +static int guest_memfd_for_each_private_section(const GuestMemfdManager *gmm, | ||
464 | + MemoryRegionSection *section, | ||
465 | + bool is_private, | ||
466 | + void *arg, | ||
467 | + guest_memfd_section_cb cb) | ||
468 | { | ||
469 | unsigned long first_zero_bit, last_zero_bit; | ||
470 | uint64_t offset, size; | ||
471 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_for_each_discarded_section(const GuestMemfdManager *gmm, | ||
472 | break; | ||
473 | } | ||
474 | |||
475 | - ret = cb(&tmp, arg); | ||
476 | + ret = cb(&tmp, is_private, arg); | ||
477 | if (ret) { | ||
478 | break; | ||
479 | } | ||
480 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_rdm_register_listener(RamDiscardManager *rdm, | ||
481 | |||
482 | QLIST_INSERT_HEAD(&gmm->rdl_list, rdl, next); | ||
483 | |||
484 | - ret = guest_memfd_for_each_populated_section(gmm, section, rdl, | ||
485 | - guest_memfd_notify_populate_cb); | ||
486 | + /* Populate shared part */ | ||
487 | + ret = guest_memfd_for_each_shared_section(gmm, section, false, rdl, | ||
488 | + guest_memfd_notify_populate_cb); | ||
489 | if (ret) { | ||
490 | error_report("%s: Failed to register RAM discard listener: %s", __func__, | ||
491 | strerror(-ret)); | ||
492 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_rdm_unregister_listener(RamDiscardManager *rdm, | ||
493 | g_assert(rdl->section); | ||
494 | g_assert(rdl->section->mr == gmm->mr); | ||
495 | |||
496 | - ret = guest_memfd_for_each_populated_section(gmm, rdl->section, rdl, | ||
497 | - guest_memfd_notify_discard_cb); | ||
498 | + /* Discard shared part */ | ||
499 | + ret = guest_memfd_for_each_shared_section(gmm, rdl->section, false, rdl, | ||
500 | + guest_memfd_notify_discard_cb); | ||
501 | if (ret) { | ||
502 | error_report("%s: Failed to unregister RAM discard listener: %s", __func__, | ||
503 | strerror(-ret)); | ||
504 | @@ -XXX,XX +XXX,XX @@ typedef struct GuestMemfdReplayData { | ||
505 | void *opaque; | ||
506 | } GuestMemfdReplayData; | ||
507 | |||
508 | -static int guest_memfd_rdm_replay_populated_cb(MemoryRegionSection *section, void *arg) | ||
509 | +static int guest_memfd_rdm_replay_populated_cb(MemoryRegionSection *section, | ||
510 | + bool is_private, void *arg) | ||
511 | { | ||
512 | struct GuestMemfdReplayData *data = arg; | ||
513 | ReplayRamPopulate replay_fn = data->fn; | ||
514 | |||
515 | - return replay_fn(section, data->opaque); | ||
516 | + return replay_fn(section, is_private, data->opaque); | ||
517 | } | ||
518 | |||
519 | static int guest_memfd_rdm_replay_populated(const RamDiscardManager *rdm, | ||
520 | MemoryRegionSection *section, | ||
521 | + bool is_private, | ||
522 | ReplayRamPopulate replay_fn, | ||
523 | void *opaque) | ||
524 | { | ||
525 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_rdm_replay_populated(const RamDiscardManager *rdm, | ||
526 | struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; | ||
527 | |||
528 | g_assert(section->mr == gmm->mr); | ||
529 | - return guest_memfd_for_each_populated_section(gmm, section, &data, | ||
530 | - guest_memfd_rdm_replay_populated_cb); | ||
531 | + if (is_private) { | ||
532 | + /* Replay populate on private section */ | ||
533 | + return guest_memfd_for_each_private_section(gmm, section, is_private, &data, | ||
534 | + guest_memfd_rdm_replay_populated_cb); | ||
535 | + } else { | ||
536 | + /* Replay populate on shared section */ | ||
537 | + return guest_memfd_for_each_shared_section(gmm, section, is_private, &data, | ||
538 | + guest_memfd_rdm_replay_populated_cb); | ||
539 | + } | ||
540 | } | ||
541 | |||
542 | -static int guest_memfd_rdm_replay_discarded_cb(MemoryRegionSection *section, void *arg) | ||
543 | +static int guest_memfd_rdm_replay_discarded_cb(MemoryRegionSection *section, | ||
544 | + bool is_private, void *arg) | ||
545 | { | ||
546 | struct GuestMemfdReplayData *data = arg; | ||
547 | ReplayRamDiscard replay_fn = data->fn; | ||
548 | |||
549 | - replay_fn(section, data->opaque); | ||
550 | + replay_fn(section, is_private, data->opaque); | ||
551 | |||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, | ||
556 | MemoryRegionSection *section, | ||
557 | + bool is_private, | ||
558 | ReplayRamDiscard replay_fn, | ||
559 | void *opaque) | ||
560 | { | ||
561 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, | ||
562 | struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; | ||
563 | |||
564 | g_assert(section->mr == gmm->mr); | ||
565 | - guest_memfd_for_each_discarded_section(gmm, section, &data, | ||
566 | - guest_memfd_rdm_replay_discarded_cb); | ||
567 | + | ||
568 | + if (is_private) { | ||
569 | + /* Replay discard on private section */ | ||
570 | + guest_memfd_for_each_private_section(gmm, section, is_private, &data, | ||
571 | + guest_memfd_rdm_replay_discarded_cb); | ||
572 | + } else { | ||
573 | + /* Replay discard on shared section */ | ||
574 | + guest_memfd_for_each_shared_section(gmm, section, is_private, &data, | ||
575 | + guest_memfd_rdm_replay_discarded_cb); | ||
576 | + } | ||
577 | } | ||
578 | |||
579 | static bool guest_memfd_is_valid_range(GuestMemfdManager *gmm, | ||
580 | @@ -XXX,XX +XXX,XX @@ static void guest_memfd_notify_discard(GuestMemfdManager *gmm, | ||
581 | continue; | ||
582 | } | ||
583 | |||
584 | - guest_memfd_for_each_populated_section(gmm, &tmp, rdl, | ||
585 | - guest_memfd_notify_discard_cb); | ||
586 | + /* For current shared section, notify to discard shared parts */ | ||
587 | + guest_memfd_for_each_shared_section(gmm, &tmp, false, rdl, | ||
588 | + guest_memfd_notify_discard_cb); | ||
589 | } | ||
590 | } | ||
591 | |||
592 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_notify_populate(GuestMemfdManager *gmm, | ||
593 | continue; | ||
594 | } | ||
595 | |||
596 | - ret = guest_memfd_for_each_discarded_section(gmm, &tmp, rdl, | ||
597 | - guest_memfd_notify_populate_cb); | ||
598 | + /* For current private section, notify to populate the shared parts */ | ||
599 | + ret = guest_memfd_for_each_private_section(gmm, &tmp, false, rdl, | ||
600 | + guest_memfd_notify_populate_cb); | ||
601 | if (ret) { | ||
602 | break; | ||
603 | } | ||
604 | @@ -XXX,XX +XXX,XX @@ static int guest_memfd_notify_populate(GuestMemfdManager *gmm, | ||
605 | continue; | ||
606 | } | ||
607 | |||
608 | - guest_memfd_for_each_discarded_section(gmm, &tmp, rdl2, | ||
609 | - guest_memfd_notify_discard_cb); | ||
610 | + guest_memfd_for_each_private_section(gmm, &tmp, false, rdl2, | ||
611 | + guest_memfd_notify_discard_cb); | ||
612 | } | ||
613 | } | ||
614 | return ret; | ||
615 | diff --git a/system/memory.c b/system/memory.c | ||
616 | index XXXXXXX..XXXXXXX 100644 | ||
617 | --- a/system/memory.c | ||
618 | +++ b/system/memory.c | ||
619 | @@ -XXX,XX +XXX,XX @@ uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, | ||
620 | } | ||
621 | |||
622 | bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, | ||
623 | - const MemoryRegionSection *section) | ||
624 | + const MemoryRegionSection *section, | ||
625 | + bool is_private) | ||
626 | { | ||
627 | RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); | ||
628 | |||
629 | g_assert(rdmc->is_populated); | ||
630 | - return rdmc->is_populated(rdm, section); | ||
631 | + return rdmc->is_populated(rdm, section, is_private); | ||
632 | } | ||
633 | |||
634 | int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, | ||
635 | MemoryRegionSection *section, | ||
636 | + bool is_private, | ||
637 | ReplayRamPopulate replay_fn, | ||
638 | void *opaque) | ||
639 | { | ||
640 | RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); | ||
641 | |||
642 | g_assert(rdmc->replay_populated); | ||
643 | - return rdmc->replay_populated(rdm, section, replay_fn, opaque); | ||
644 | + return rdmc->replay_populated(rdm, section, is_private, replay_fn, opaque); | ||
645 | } | ||
646 | |||
647 | void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, | ||
648 | MemoryRegionSection *section, | ||
649 | + bool is_private, | ||
650 | ReplayRamDiscard replay_fn, | ||
651 | void *opaque) | ||
652 | { | ||
653 | RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); | ||
654 | |||
655 | g_assert(rdmc->replay_discarded); | ||
656 | - rdmc->replay_discarded(rdm, section, replay_fn, opaque); | ||
657 | + rdmc->replay_discarded(rdm, section, is_private, replay_fn, opaque); | ||
658 | } | ||
659 | |||
660 | void ram_discard_manager_register_listener(RamDiscardManager *rdm, | ||
661 | @@ -XXX,XX +XXX,XX @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, | ||
662 | * Disallow that. vmstate priorities make sure any RamDiscardManager | ||
663 | * were already restored before IOMMUs are restored. | ||
664 | */ | ||
665 | - if (!ram_discard_manager_is_populated(rdm, &tmp)) { | ||
666 | + if (!ram_discard_manager_is_populated(rdm, &tmp, false)) { | ||
667 | error_setg(errp, "iommu map to discarded memory (e.g., unplugged" | ||
668 | " via virtio-mem): %" HWADDR_PRIx "", | ||
669 | iotlb->translated_addr); | ||
670 | diff --git a/system/memory_mapping.c b/system/memory_mapping.c | ||
671 | index XXXXXXX..XXXXXXX 100644 | ||
672 | --- a/system/memory_mapping.c | ||
673 | +++ b/system/memory_mapping.c | ||
674 | @@ -XXX,XX +XXX,XX @@ static void guest_phys_block_add_section(GuestPhysListener *g, | ||
675 | } | ||
676 | |||
677 | static int guest_phys_ram_populate_cb(MemoryRegionSection *section, | ||
678 | - void *opaque) | ||
679 | + bool is_private, void *opaque) | ||
680 | { | ||
681 | GuestPhysListener *g = opaque; | ||
682 | |||
683 | @@ -XXX,XX +XXX,XX @@ static void guest_phys_blocks_region_add(MemoryListener *listener, | ||
684 | RamDiscardManager *rdm; | ||
685 | |||
686 | rdm = memory_region_get_ram_discard_manager(section->mr); | ||
687 | - ram_discard_manager_replay_populated(rdm, section, | ||
688 | + ram_discard_manager_replay_populated(rdm, section, false, | ||
689 | guest_phys_ram_populate_cb, g); | ||
690 | return; | ||
691 | } | ||
39 | -- | 692 | -- |
40 | 2.43.5 | 693 | 2.43.5 | diff view generated by jsdifflib |