This patch adds support for Specific Purpose Memory (SPM) through
NUMA node configuration. When 'spm=on' is specified for a NUMA node,
QEMU will:
1. Set the RAM_SPM flag in the RAM block of the corresponding memory region
2. Update the overlapping E820 RAM entries before adding E820_SOFT_RESERVED
3. Set the E820 type to E820_SOFT_RESERVED for this memory region
This allows guest operating systems to recognize the memory as
soft-reserved, so it can be used for device-specific memory management
without E820 table conflicts.
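For example (illustrative addresses): marking [0x180000000, 0x1c0000000)
as SPM inside a single E820_RAM entry covering [0x100000000, 0x200000000)
splits that entry into three:

  0x100000000-0x180000000  E820_RAM
  0x180000000-0x1c0000000  E820_SOFT_RESERVED
  0x1c0000000-0x200000000  E820_RAM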
Usage:
-numa node,nodeid=0,memdev=m1,spm=on
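A complete command line might look like this (backend type, id and
size are illustrative):

  -object memory-backend-ram,id=m1,size=4G \
  -numa node,nodeid=0,memdev=m1,spm=on

With a Linux guest, the range should then appear as "Soft Reserved"
in /proc/iomem.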
Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
---
hw/core/numa.c | 3 ++
hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++
hw/i386/e820_memory_layout.h | 2 +
hw/i386/pc.c | 37 ++++++++++++++++++
include/exec/cpu-common.h | 1 +
include/system/memory.h | 3 ++
include/system/numa.h | 1 +
qapi/machine.json | 6 +++
system/physmem.c | 7 +++-
9 files changed, 132 insertions(+), 1 deletion(-)
diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f745..e680130460 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
+ /* Store spm configuration for later processing */
+ numa_info[nodenr].is_spm = node->has_spm && node->spm;
+
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..5b090ac6df 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
}
return false;
}
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+ uint64_t end = start + length;
+ bool updated = false;
+ assert(!e820_done);
+
+ /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
+ if (new_type == E820_SOFT_RESERVED) {
+ bool range_in_ram = false;
+ for (size_t j = 0; j < e820_entries; j++) {
+ uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+ uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+ uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+ if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+ range_in_ram = true;
+ break;
+ }
+ }
+ if (!range_in_ram) {
+ return false;
+ }
+ }
+
+ /* Find entry that contains the target range and update it */
+ for (size_t i = 0; i < e820_entries; i++) {
+ uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+ uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+ uint64_t entry_end = entry_start + entry_length;
+
+ if (entry_start <= start && entry_end >= end) {
+ uint32_t original_type = e820_table[i].type;
+
+ /* Remove original entry */
+ memmove(&e820_table[i], &e820_table[i + 1],
+ (e820_entries - i - 1) * sizeof(struct e820_entry));
+ e820_entries--;
+
+ /* Add split parts inline */
+ if (entry_start < start) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(entry_start);
+ e820_table[e820_entries].length =
+ cpu_to_le64(start - entry_start);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(start);
+ e820_table[e820_entries].length = cpu_to_le64(length);
+ e820_table[e820_entries].type = cpu_to_le32(new_type);
+ e820_entries++;
+
+ if (end < entry_end) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(end);
+ e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ updated = true;
+ break;
+ }
+ }
+
+ return updated;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..657cc679e2 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -15,6 +15,7 @@
#define E820_ACPI 3
#define E820_NVS 4
#define E820_UNUSABLE 5
+#define E820_SOFT_RESERVED 0xEFFFFFFF
struct e820_entry {
uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
bool e820_get_entry(int index, uint32_t type,
uint64_t *address, uint64_t *length);
int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index bc048a6d13..3e50570484 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -26,6 +26,7 @@
#include "qemu/units.h"
#include "exec/target_page.h"
#include "hw/i386/pc.h"
+#include "system/ramblock.h"
#include "hw/char/serial-isa.h"
#include "hw/char/parallel.h"
#include "hw/hyperv/hv-balloon.h"
@@ -787,6 +788,41 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
return pc_above_4g_end(pcms) - 1;
}
+static int pc_update_spm_memory(RAMBlock *rb, void *opaque)
+{
+ X86MachineState *x86ms = opaque;
+ MachineState *ms = MACHINE(x86ms);
+ ram_addr_t offset;
+ ram_addr_t length;
+ bool is_spm = false;
+
+ /* Check if this RAM block belongs to a NUMA node with spm=on */
+ for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+ NodeInfo *numa_info = &ms->numa_state->nodes[i];
+ if (numa_info->is_spm && numa_info->node_memdev) {
+ MemoryRegion *mr = &numa_info->node_memdev->mr;
+ if (mr->ram_block == rb) {
+ /* Mark this RAM block as SPM */
+ rb->flags |= RAM_SPM;
+ is_spm = true;
+ break;
+ }
+ }
+ }
+
+ if (is_spm) {
+ offset = qemu_ram_get_offset(rb) +
+ (0x100000000ULL - x86ms->below_4g_mem_size);
+ length = qemu_ram_get_used_length(rb);
+ if (!e820_update_entry_type(offset, length, E820_SOFT_RESERVED)) {
+ warn_report("Failed to update E820 entry for SPM at 0x%" PRIx64
+ " length 0x%" PRIx64, offset, length);
+ }
+ }
+
+ return 0;
+}
+
/*
* AMD systems with an IOMMU have an additional hole close to the
* 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -901,6 +937,7 @@ void pc_memory_init(PCMachineState *pcms,
if (pcms->sgx_epc.size != 0) {
e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
}
+ qemu_ram_foreach_block(pc_update_spm_memory, x86ms);
if (!pcmc->has_reserved_memory &&
(machine->ram_slots ||
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9b658a3f48..9b437eaa10 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -89,6 +89,7 @@ ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
bool qemu_ram_is_shared(RAMBlock *rb);
+bool qemu_ram_is_spm(RAMBlock *rb);
bool qemu_ram_is_noreserve(RAMBlock *rb);
bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
void qemu_ram_set_uf_zeroable(RAMBlock *rb);
diff --git a/include/system/memory.h b/include/system/memory.h
index aa85fc27a1..0d36cbd30d 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -275,6 +275,9 @@ typedef struct IOMMUTLBEvent {
*/
#define RAM_PRIVATE (1 << 13)
+/* RAM is Specific Purpose Memory */
+#define RAM_SPM (1 << 14)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..438511a756 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -41,6 +41,7 @@ typedef struct NodeInfo {
bool present;
bool has_cpu;
bool has_gi;
+ bool is_spm;
uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 038eab281c..1fa31b0224 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -500,6 +500,11 @@
# @memdev: memory backend object. If specified for one node, it must
# be specified for all nodes.
#
+# @spm: if true, mark the memory region of this node as Specific
+# Purpose Memory (SPM). This will set the RAM_SPM flag for the
+# corresponding memory region and set the E820 type to
+# E820_SOFT_RESERVED. (default: false, since 9.2)
+#
# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
# to the nodeid which has the memory controller responsible for
# this NUMA node. This field provides additional information as
@@ -514,6 +519,7 @@
'*cpus': ['uint16'],
'*mem': 'size',
'*memdev': 'str',
+ '*spm': 'bool',
'*initiator': 'uint16' }}
##
diff --git a/system/physmem.c b/system/physmem.c
index ae8ecd50ea..0090d9955d 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
return rb->flags & RAM_NORESERVE;
}
+bool qemu_ram_is_spm(RAMBlock *rb)
+{
+ return rb->flags & RAM_SPM;
+}
+
/* Note: Only set at the start of postcopy */
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
{
@@ -2032,7 +2037,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
ram_flags &= ~RAM_PRIVATE;
/* Just support these ram flags by now. */
- assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
+ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_SPM | RAM_NORESERVE |
RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
RAM_READONLY_FD | RAM_GUEST_MEMFD |
RAM_RESIZEABLE)) == 0);
--
2.34.1
On 20.10.25 11:07, fanhuang wrote:
> This patch adds support for Specific Purpose Memory (SPM) through the
> NUMA node configuration. When 'spm=on' is specified for a NUMA node,
> QEMU will:
>
> 1. Set the RAM_SPM flag in the RAM block of the corresponding memory region
> 2. Update the overlapping E820 RAM entries before adding E820_SOFT_RESERVED
> 3. Set the E820 type to E820_SOFT_RESERVED for this memory region
>
> This allows guest operating systems to recognize the memory as soft reserved
> memory, which can be used for device-specific memory management without
> E820 table conflicts.
>
> Usage:
> -numa node,nodeid=0,memdev=m1,spm=on
>
> Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
> ---
> hw/core/numa.c | 3 ++
> hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++
> hw/i386/e820_memory_layout.h | 2 +
> hw/i386/pc.c | 37 ++++++++++++++++++
> include/exec/cpu-common.h | 1 +
> include/system/memory.h | 3 ++
> include/system/numa.h | 1 +
> qapi/machine.json | 6 +++
> system/physmem.c | 7 +++-
> 9 files changed, 132 insertions(+), 1 deletion(-)
>
> diff --git a/hw/core/numa.c b/hw/core/numa.c
> index 218576f745..e680130460 100644
> --- a/hw/core/numa.c
> +++ b/hw/core/numa.c
> @@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
> numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
> }
>
> + /* Store spm configuration for later processing */
> + numa_info[nodenr].is_spm = node->has_spm && node->spm;
> +
> numa_info[nodenr].present = true;
> max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
> ms->numa_state->num_nodes++;
> diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
> index 3e848fb69c..5b090ac6df 100644
> --- a/hw/i386/e820_memory_layout.c
> +++ b/hw/i386/e820_memory_layout.c
> @@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
> }
> return false;
> }
> +
> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
> +{
> + uint64_t end = start + length;
> + bool updated = false;
> + assert(!e820_done);
> +
> + /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
> + if (new_type == E820_SOFT_RESERVED) {
> + bool range_in_ram = false;
> + for (size_t j = 0; j < e820_entries; j++) {
> + uint64_t ram_start = le64_to_cpu(e820_table[j].address);
> + uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
> + uint32_t ram_type = le32_to_cpu(e820_table[j].type);
> +
> + if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
> + range_in_ram = true;
> + break;
> + }
> + }
> + if (!range_in_ram) {
> + return false;
> + }
> + }
> +
> + /* Find entry that contains the target range and update it */
> + for (size_t i = 0; i < e820_entries; i++) {
> + uint64_t entry_start = le64_to_cpu(e820_table[i].address);
> + uint64_t entry_length = le64_to_cpu(e820_table[i].length);
> + uint64_t entry_end = entry_start + entry_length;
> +
> + if (entry_start <= start && entry_end >= end) {
> + uint32_t original_type = e820_table[i].type;
> +
> + /* Remove original entry */
> + memmove(&e820_table[i], &e820_table[i + 1],
> + (e820_entries - i - 1) * sizeof(struct e820_entry));
> + e820_entries--;
> +
> + /* Add split parts inline */
> + if (entry_start < start) {
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(entry_start);
> + e820_table[e820_entries].length =
> + cpu_to_le64(start - entry_start);
> + e820_table[e820_entries].type = original_type;
> + e820_entries++;
> + }
> +
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(start);
> + e820_table[e820_entries].length = cpu_to_le64(length);
> + e820_table[e820_entries].type = cpu_to_le32(new_type);
> + e820_entries++;
> +
> + if (end < entry_end) {
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(end);
> + e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
> + e820_table[e820_entries].type = original_type;
> + e820_entries++;
> + }
> +
> + updated = true;
> + break;
> + }
> + }
> +
> + return updated;
> +}
> diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
> index b50acfa201..657cc679e2 100644
> --- a/hw/i386/e820_memory_layout.h
> +++ b/hw/i386/e820_memory_layout.h
> @@ -15,6 +15,7 @@
> #define E820_ACPI 3
> #define E820_NVS 4
> #define E820_UNUSABLE 5
> +#define E820_SOFT_RESERVED 0xEFFFFFFF
>
> struct e820_entry {
> uint64_t address;
> @@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
> bool e820_get_entry(int index, uint32_t type,
> uint64_t *address, uint64_t *length);
> int e820_get_table(struct e820_entry **table);
> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
>
> #endif
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index bc048a6d13..3e50570484 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -26,6 +26,7 @@
> #include "qemu/units.h"
> #include "exec/target_page.h"
> #include "hw/i386/pc.h"
> +#include "system/ramblock.h"
> #include "hw/char/serial-isa.h"
> #include "hw/char/parallel.h"
> #include "hw/hyperv/hv-balloon.h"
> @@ -787,6 +788,41 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
> return pc_above_4g_end(pcms) - 1;
> }
>
> +static int pc_update_spm_memory(RAMBlock *rb, void *opaque)
> +{
> + X86MachineState *x86ms = opaque;
> + MachineState *ms = MACHINE(x86ms);
> + ram_addr_t offset;
> + ram_addr_t length;
> + bool is_spm = false;
> +
> + /* Check if this RAM block belongs to a NUMA node with spm=on */
> + for (int i = 0; i < ms->numa_state->num_nodes; i++) {
> + NodeInfo *numa_info = &ms->numa_state->nodes[i];
> + if (numa_info->is_spm && numa_info->node_memdev) {
> + MemoryRegion *mr = &numa_info->node_memdev->mr;
> + if (mr->ram_block == rb) {
> + /* Mark this RAM block as SPM and set the flag */
> + rb->flags |= RAM_SPM;
> + is_spm = true;
> + break;
> + }
> + }
> + }
> +
> + if (is_spm) {
> + offset = qemu_ram_get_offset(rb) +
> + (0x100000000ULL - x86ms->below_4g_mem_size);
> + length = qemu_ram_get_used_length(rb);
> + if (!e820_update_entry_type(offset, length, E820_SOFT_RESERVED)) {
> + warn_report("Failed to update E820 entry for SPM at 0x%" PRIx64
> + " length 0x%" PRIx64, offset, length);
> + }
> + }
> +
> + return 0;
> +}
> +
> /*
> * AMD systems with an IOMMU have an additional hole close to the
> * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> @@ -901,6 +937,7 @@ void pc_memory_init(PCMachineState *pcms,
> if (pcms->sgx_epc.size != 0) {
> e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
> }
> + qemu_ram_foreach_block(pc_update_spm_memory, x86ms);
>
> if (!pcmc->has_reserved_memory &&
> (machine->ram_slots ||
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 9b658a3f48..9b437eaa10 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -89,6 +89,7 @@ ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
> ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
> ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
> bool qemu_ram_is_shared(RAMBlock *rb);
> +bool qemu_ram_is_spm(RAMBlock *rb);
> bool qemu_ram_is_noreserve(RAMBlock *rb);
> bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
> void qemu_ram_set_uf_zeroable(RAMBlock *rb);
> diff --git a/include/system/memory.h b/include/system/memory.h
> index aa85fc27a1..0d36cbd30d 100644
> --- a/include/system/memory.h
> +++ b/include/system/memory.h
> @@ -275,6 +275,9 @@ typedef struct IOMMUTLBEvent {
> */
> #define RAM_PRIVATE (1 << 13)
>
> +/* RAM is Specific Purpose Memory */
> +#define RAM_SPM (1 << 14)
> +
> static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
> IOMMUNotifierFlag flags,
> hwaddr start, hwaddr end,
> diff --git a/include/system/numa.h b/include/system/numa.h
> index 1044b0eb6e..438511a756 100644
> --- a/include/system/numa.h
> +++ b/include/system/numa.h
> @@ -41,6 +41,7 @@ typedef struct NodeInfo {
> bool present;
> bool has_cpu;
> bool has_gi;
> + bool is_spm;
> uint8_t lb_info_provided;
> uint16_t initiator;
> uint8_t distance[MAX_NODES];
> diff --git a/qapi/machine.json b/qapi/machine.json
> index 038eab281c..1fa31b0224 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -500,6 +500,11 @@
> # @memdev: memory backend object. If specified for one node, it must
> # be specified for all nodes.
> #
> +# @spm: if true, mark the memory region of this node as Specific
> +# Purpose Memory (SPM). This will set the RAM_SPM flag for the
> +# corresponding memory region and set the E820 type to
> +# E820_SOFT_RESERVED. (default: false, since 9.2)
> +#
> # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
> # to the nodeid which has the memory controller responsible for
> # this NUMA node. This field provides additional information as
> @@ -514,6 +519,7 @@
> '*cpus': ['uint16'],
> '*mem': 'size',
> '*memdev': 'str',
> + '*spm': 'bool',
> '*initiator': 'uint16' }}
>
> ##
> diff --git a/system/physmem.c b/system/physmem.c
> index ae8ecd50ea..0090d9955d 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
> return rb->flags & RAM_NORESERVE;
> }
>
> +bool qemu_ram_is_spm(RAMBlock *rb)
> +{
> + return rb->flags & RAM_SPM;
> +}
> +
IIUC, this function is unused, and the only setter is in
pc_update_spm_memory().
Why do we have to modify the RAMBlock at all or walk over them?
Shouldn't it be sufficient to just walk over all
&ms->numa_state->nodes[i] and update e820 accordingly?
--
Cheers
David / dhildenb
On 11/3/2025 8:32 PM, David Hildenbrand wrote:
> On 20.10.25 11:07, fanhuang wrote:
>> This patch adds support for Specific Purpose Memory (SPM) through the
>> NUMA node configuration. When 'spm=on' is specified for a NUMA node,
>> QEMU will:
>>
>> 1. Set the RAM_SPM flag in the RAM block of the corresponding memory
>> region
>> 2. Update the overlapping E820 RAM entries before adding
>> E820_SOFT_RESERVED
>> 3. Set the E820 type to E820_SOFT_RESERVED for this memory region
>>
>> This allows guest operating systems to recognize the memory as soft
>> reserved
>> memory, which can be used for device-specific memory management without
>> E820 table conflicts.
>>
>> Usage:
>> -numa node,nodeid=0,memdev=m1,spm=on
>>
>> Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
>> ---
>> hw/core/numa.c | 3 ++
>> hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++
>> hw/i386/e820_memory_layout.h | 2 +
>> hw/i386/pc.c | 37 ++++++++++++++++++
>> include/exec/cpu-common.h | 1 +
>> include/system/memory.h | 3 ++
>> include/system/numa.h | 1 +
>> qapi/machine.json | 6 +++
>> system/physmem.c | 7 +++-
>> 9 files changed, 132 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/core/numa.c b/hw/core/numa.c
>> index 218576f745..e680130460 100644
>> --- a/hw/core/numa.c
>> +++ b/hw/core/numa.c
>> @@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms,
>> NumaNodeOptions *node,
>> numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>> }
>> + /* Store spm configuration for later processing */
>> + numa_info[nodenr].is_spm = node->has_spm && node->spm;
>> +
>> numa_info[nodenr].present = true;
>> max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
>> ms->numa_state->num_nodes++;
>> diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
>> index 3e848fb69c..5b090ac6df 100644
>> --- a/hw/i386/e820_memory_layout.c
>> +++ b/hw/i386/e820_memory_layout.c
>> @@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type,
>> uint64_t *address, uint64_t *length)
>> }
>> return false;
>> }
>> +
>> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t
>> new_type)
>> +{
>> + uint64_t end = start + length;
>> + bool updated = false;
>> + assert(!e820_done);
>> +
>> + /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
>> + if (new_type == E820_SOFT_RESERVED) {
>> + bool range_in_ram = false;
>> + for (size_t j = 0; j < e820_entries; j++) {
>> + uint64_t ram_start = le64_to_cpu(e820_table[j].address);
>> + uint64_t ram_end = ram_start +
>> le64_to_cpu(e820_table[j].length);
>> + uint32_t ram_type = le32_to_cpu(e820_table[j].type);
>> +
>> + if (ram_type == E820_RAM && ram_start <= start && ram_end
>> >= end) {
>> + range_in_ram = true;
>> + break;
>> + }
>> + }
>> + if (!range_in_ram) {
>> + return false;
>> + }
>> + }
>> +
>> + /* Find entry that contains the target range and update it */
>> + for (size_t i = 0; i < e820_entries; i++) {
>> + uint64_t entry_start = le64_to_cpu(e820_table[i].address);
>> + uint64_t entry_length = le64_to_cpu(e820_table[i].length);
>> + uint64_t entry_end = entry_start + entry_length;
>> +
>> + if (entry_start <= start && entry_end >= end) {
>> + uint32_t original_type = e820_table[i].type;
>> +
>> + /* Remove original entry */
>> + memmove(&e820_table[i], &e820_table[i + 1],
>> + (e820_entries - i - 1) * sizeof(struct e820_entry));
>> + e820_entries--;
>> +
>> + /* Add split parts inline */
>> + if (entry_start < start) {
>> + e820_table = g_renew(struct e820_entry, e820_table,
>> + e820_entries + 1);
>> + e820_table[e820_entries].address =
>> cpu_to_le64(entry_start);
>> + e820_table[e820_entries].length =
>> + cpu_to_le64(start - entry_start);
>> + e820_table[e820_entries].type = original_type;
>> + e820_entries++;
>> + }
>> +
>> + e820_table = g_renew(struct e820_entry, e820_table,
>> + e820_entries + 1);
>> + e820_table[e820_entries].address = cpu_to_le64(start);
>> + e820_table[e820_entries].length = cpu_to_le64(length);
>> + e820_table[e820_entries].type = cpu_to_le32(new_type);
>> + e820_entries++;
>> +
>> + if (end < entry_end) {
>> + e820_table = g_renew(struct e820_entry, e820_table,
>> + e820_entries + 1);
>> + e820_table[e820_entries].address = cpu_to_le64(end);
>> + e820_table[e820_entries].length =
>> cpu_to_le64(entry_end - end);
>> + e820_table[e820_entries].type = original_type;
>> + e820_entries++;
>> + }
>> +
>> + updated = true;
>> + break;
>> + }
>> + }
>> +
>> + return updated;
>> +}
>> diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
>> index b50acfa201..657cc679e2 100644
>> --- a/hw/i386/e820_memory_layout.h
>> +++ b/hw/i386/e820_memory_layout.h
>> @@ -15,6 +15,7 @@
>> #define E820_ACPI 3
>> #define E820_NVS 4
>> #define E820_UNUSABLE 5
>> +#define E820_SOFT_RESERVED 0xEFFFFFFF
>> struct e820_entry {
>> uint64_t address;
>> @@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t
>> length, uint32_t type);
>> bool e820_get_entry(int index, uint32_t type,
>> uint64_t *address, uint64_t *length);
>> int e820_get_table(struct e820_entry **table);
>> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t
>> new_type);
>> #endif
>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index bc048a6d13..3e50570484 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -26,6 +26,7 @@
>> #include "qemu/units.h"
>> #include "exec/target_page.h"
>> #include "hw/i386/pc.h"
>> +#include "system/ramblock.h"
>> #include "hw/char/serial-isa.h"
>> #include "hw/char/parallel.h"
>> #include "hw/hyperv/hv-balloon.h"
>> @@ -787,6 +788,41 @@ static hwaddr pc_max_used_gpa(PCMachineState
>> *pcms, uint64_t pci_hole64_size)
>> return pc_above_4g_end(pcms) - 1;
>> }
>> +static int pc_update_spm_memory(RAMBlock *rb, void *opaque)
>> +{
>> + X86MachineState *x86ms = opaque;
>> + MachineState *ms = MACHINE(x86ms);
>> + ram_addr_t offset;
>> + ram_addr_t length;
>> + bool is_spm = false;
>> +
>> + /* Check if this RAM block belongs to a NUMA node with spm=on */
>> + for (int i = 0; i < ms->numa_state->num_nodes; i++) {
>> + NodeInfo *numa_info = &ms->numa_state->nodes[i];
>> + if (numa_info->is_spm && numa_info->node_memdev) {
>> + MemoryRegion *mr = &numa_info->node_memdev->mr;
>> + if (mr->ram_block == rb) {
>> + /* Mark this RAM block as SPM and set the flag */
>> + rb->flags |= RAM_SPM;
>> + is_spm = true;
>> + break;
>> + }
>> + }
>> + }
>> +
>> + if (is_spm) {
>> + offset = qemu_ram_get_offset(rb) +
>> + (0x100000000ULL - x86ms->below_4g_mem_size);
>> + length = qemu_ram_get_used_length(rb);
>> + if (!e820_update_entry_type(offset, length,
>> E820_SOFT_RESERVED)) {
>> + warn_report("Failed to update E820 entry for SPM at 0x%"
>> PRIx64
>> + " length 0x%" PRIx64, offset, length);
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> /*
>> * AMD systems with an IOMMU have an additional hole close to the
>> * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
>> @@ -901,6 +937,7 @@ void pc_memory_init(PCMachineState *pcms,
>> if (pcms->sgx_epc.size != 0) {
>> e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size,
>> E820_RESERVED);
>> }
>> + qemu_ram_foreach_block(pc_update_spm_memory, x86ms);
>> if (!pcmc->has_reserved_memory &&
>> (machine->ram_slots ||
>> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
>> index 9b658a3f48..9b437eaa10 100644
>> --- a/include/exec/cpu-common.h
>> +++ b/include/exec/cpu-common.h
>> @@ -89,6 +89,7 @@ ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
>> ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
>> ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
>> bool qemu_ram_is_shared(RAMBlock *rb);
>> +bool qemu_ram_is_spm(RAMBlock *rb);
>> bool qemu_ram_is_noreserve(RAMBlock *rb);
>> bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
>> void qemu_ram_set_uf_zeroable(RAMBlock *rb);
>> diff --git a/include/system/memory.h b/include/system/memory.h
>> index aa85fc27a1..0d36cbd30d 100644
>> --- a/include/system/memory.h
>> +++ b/include/system/memory.h
>> @@ -275,6 +275,9 @@ typedef struct IOMMUTLBEvent {
>> */
>> #define RAM_PRIVATE (1 << 13)
>> +/* RAM is Specific Purpose Memory */
>> +#define RAM_SPM (1 << 14)
>> +
>> static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify
>> fn,
>> IOMMUNotifierFlag flags,
>> hwaddr start, hwaddr end,
>> diff --git a/include/system/numa.h b/include/system/numa.h
>> index 1044b0eb6e..438511a756 100644
>> --- a/include/system/numa.h
>> +++ b/include/system/numa.h
>> @@ -41,6 +41,7 @@ typedef struct NodeInfo {
>> bool present;
>> bool has_cpu;
>> bool has_gi;
>> + bool is_spm;
>> uint8_t lb_info_provided;
>> uint16_t initiator;
>> uint8_t distance[MAX_NODES];
>> diff --git a/qapi/machine.json b/qapi/machine.json
>> index 038eab281c..1fa31b0224 100644
>> --- a/qapi/machine.json
>> +++ b/qapi/machine.json
>> @@ -500,6 +500,11 @@
>> # @memdev: memory backend object. If specified for one node, it must
>> # be specified for all nodes.
>> #
>> +# @spm: if true, mark the memory region of this node as Specific
>> +# Purpose Memory (SPM). This will set the RAM_SPM flag for the
>> +# corresponding memory region and set the E820 type to
>> +# E820_SOFT_RESERVED. (default: false, since 9.2)
>> +#
>> # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
>> # to the nodeid which has the memory controller responsible for
>> # this NUMA node. This field provides additional information as
>> @@ -514,6 +519,7 @@
>> '*cpus': ['uint16'],
>> '*mem': 'size',
>> '*memdev': 'str',
>> + '*spm': 'bool',
>> '*initiator': 'uint16' }}
>> ##
>> diff --git a/system/physmem.c b/system/physmem.c
>> index ae8ecd50ea..0090d9955d 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
>> return rb->flags & RAM_NORESERVE;
>> }
>> +bool qemu_ram_is_spm(RAMBlock *rb)
>> +{
>> + return rb->flags & RAM_SPM;
>> +}
>> +
>
> IIUC, this function is unused, and the only setter is in
> pc_update_spm_memory().
>
> Why do we have to modify the RAMBlock at all or walk over them?
>
> Shouldn't it be sufficient to just walk over all &ms->numa_state-
> >nodes[i] and update e820 accordingly?
>
Hi David,
Thank you for the excellent review and the insightful suggestion!
You're absolutely right - I've simplified the implementation to
directly iterate over NUMA nodes instead of RAMBlocks.
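Roughly along these lines (untested sketch; I use
memory_region_get_ram_addr()/memory_region_size() instead of the
RAMBlock helpers, and keep the below-4G hole adjustment from v2 for
illustration):

static void pc_update_spm_memory(PCMachineState *pcms)
{
    X86MachineState *x86ms = X86_MACHINE(pcms);
    MachineState *ms = MACHINE(pcms);

    for (int i = 0; i < ms->numa_state->num_nodes; i++) {
        NodeInfo *node = &ms->numa_state->nodes[i];
        hwaddr start;
        uint64_t length;

        if (!node->is_spm || !node->node_memdev) {
            continue;
        }

        /* Same below-4G hole adjustment as in this version */
        start = memory_region_get_ram_addr(&node->node_memdev->mr) +
                (0x100000000ULL - x86ms->below_4g_mem_size);
        length = memory_region_size(&node->node_memdev->mr);

        if (!e820_update_entry_type(start, length, E820_SOFT_RESERVED)) {
            warn_report("Failed to update E820 entry for SPM at 0x%" PRIx64
                        " length 0x%" PRIx64, (uint64_t)start, length);
        }
    }
}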
I'll send v3 after internal review. Igor's feedback would also be
valuable, so I'll wait to hear whether he has any concerns before
posting.
Best regards,
Jerry Huang