Add a 'memmap-type' option to the -numa node configuration that allows
specifying the memory map (E820) type for the node's memory.
Supported values:
- normal: Regular system RAM (E820 type 1, default)
- spm: Specific Purpose Memory (E820 type 0xEFFFFFFF)
- reserved: Reserved memory (E820 type 2)
The 'spm' type indicates Specific Purpose Memory - a hint to the guest
that this memory might be managed by device drivers based on guest policy.
The 'reserved' type marks memory as not usable as RAM.
Note: This option is only supported on x86 platforms.
Usage:
-numa node,nodeid=1,memdev=m1,memmap-type=spm
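For example, a fuller invocation might look like this (a sketch only: the
memory backend objects m0/m1 and their sizes are illustrative, and only the
memmap-type parameter is added by this patch):
    -object memory-backend-ram,id=m0,size=4G \
    -object memory-backend-ram,id=m1,size=2G \
    -numa node,nodeid=0,memdev=m0 \
    -numa node,nodeid=1,memdev=m1,memmap-type=spm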
Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
---
hw/core/numa.c | 19 ++++++++++
hw/i386/e820_memory_layout.c | 72 ++++++++++++++++++++++++++++++++++++
hw/i386/e820_memory_layout.h | 12 +++---
hw/i386/pc.c | 61 ++++++++++++++++++++++++++++++
include/system/numa.h | 7 ++++
qapi/machine.json | 24 ++++++++++++
qemu-options.hx | 14 ++++++-
7 files changed, 202 insertions(+), 7 deletions(-)
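Note for reviewers: as a worked example of the splitting logic in
e820_update_entry_type() below, assume the E820 table contains a single
E820_RAM entry covering [0x100000000, 0x200000000) and a NUMA node with
memmap-type=spm lands at [0x180000000, 0x200000000). The containing RAM
entry is removed and replaced by an E820_RAM entry for
[0x100000000, 0x180000000) plus an E820_SOFT_RESERVED entry for
[0x180000000, 0x200000000). The addresses here are illustrative only.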
diff --git a/hw/core/numa.c b/hw/core/numa.c
index f462883c87..409b2e2bb9 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -38,6 +38,7 @@
#include "hw/mem/pc-dimm.h"
#include "hw/core/boards.h"
#include "hw/mem/memory-device.h"
+#include "hw/i386/x86.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/cutils.h"
@@ -164,6 +165,24 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
+ if (node->has_memmap_type && node->memmap_type != NUMA_MEMMAP_TYPE_NORMAL) {
+ if (!object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
+ error_setg(errp, "memmap-type=%s is only supported on x86 machines",
+ NumaMemmapType_str(node->memmap_type));
+ return;
+ }
+ switch (node->memmap_type) {
+ case NUMA_MEMMAP_TYPE_SPM:
+ numa_info[nodenr].memmap_type = NUMA_MEMMAP_SPM;
+ break;
+ case NUMA_MEMMAP_TYPE_RESERVED:
+ numa_info[nodenr].memmap_type = NUMA_MEMMAP_RESERVED;
+ break;
+ default:
+ break;
+ }
+ }
+
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..4c62b5ddea 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,75 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
}
return false;
}
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+ uint64_t end = start + length;
+ assert(!e820_done);
+
+ /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
+ if (new_type == E820_SOFT_RESERVED) {
+ bool range_in_ram = false;
+
+ for (size_t j = 0; j < e820_entries; j++) {
+ uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+ uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+ uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+ if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+ range_in_ram = true;
+ break;
+ }
+ }
+ if (!range_in_ram) {
+ return false;
+ }
+ }
+
+ /* Find entry that contains the target range and update it */
+ for (size_t i = 0; i < e820_entries; i++) {
+ uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+ uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+ uint64_t entry_end = entry_start + entry_length;
+
+ if (entry_start <= start && entry_end >= end) {
+ uint32_t original_type = e820_table[i].type;
+
+ /* Remove original entry */
+ memmove(&e820_table[i], &e820_table[i + 1],
+ (e820_entries - i - 1) * sizeof(struct e820_entry));
+ e820_entries--;
+
+ /* Add split parts inline */
+ if (entry_start < start) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(entry_start);
+ e820_table[e820_entries].length =
+ cpu_to_le64(start - entry_start);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(start);
+ e820_table[e820_entries].length = cpu_to_le64(length);
+ e820_table[e820_entries].type = cpu_to_le32(new_type);
+ e820_entries++;
+
+ if (end < entry_end) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(end);
+ e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..a85b4fd14c 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -10,11 +10,12 @@
#define HW_I386_E820_MEMORY_LAYOUT_H
/* e820 types */
-#define E820_RAM 1
-#define E820_RESERVED 2
-#define E820_ACPI 3
-#define E820_NVS 4
-#define E820_UNUSABLE 5
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+#define E820_SOFT_RESERVED 0xEFFFFFFF
struct e820_entry {
uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
bool e820_get_entry(int index, uint32_t type,
uint64_t *address, uint64_t *length);
int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 5cb074c0a0..d2230966f9 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -794,6 +794,64 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
return pc_above_4g_end(pcms) - 1;
}
+/*
+ * Update E820 entries for NUMA nodes with non-default memory types.
+ */
+static void pc_update_numa_memory_types(X86MachineState *x86ms)
+{
+ MachineState *ms = MACHINE(x86ms);
+ uint64_t addr = 0;
+
+ for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+ NodeInfo *numa_info = &ms->numa_state->nodes[i];
+ uint64_t node_size = numa_info->node_mem;
+
+ /* Process non-normal memory types */
+ if (numa_info->memmap_type != NUMA_MEMMAP_NORMAL &&
+ numa_info->node_memdev) {
+ uint64_t guest_addr;
+ uint32_t e820_type;
+
+ switch (numa_info->memmap_type) {
+ case NUMA_MEMMAP_SPM:
+ e820_type = E820_SOFT_RESERVED;
+ break;
+ case NUMA_MEMMAP_RESERVED:
+ e820_type = E820_RESERVED;
+ break;
+ default:
+ goto next;
+ }
+
+ /* Calculate guest physical address accounting for PCI hole */
+ if (addr < x86ms->below_4g_mem_size) {
+ if (addr + node_size <= x86ms->below_4g_mem_size) {
+ guest_addr = addr;
+ } else {
+ error_report("NUMA node %d with memmap-type spans across "
+ "4GB boundary, not supported", i);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ guest_addr = 0x100000000ULL +
+ (addr - x86ms->below_4g_mem_size);
+ }
+
+ if (!e820_update_entry_type(guest_addr, node_size, e820_type)) {
+ warn_report("Failed to update E820 entry for node %d "
+ "at 0x%" PRIx64 " length 0x%" PRIx64,
+ i, guest_addr, node_size);
+ }
+ }
+
+next:
+ /* Accumulate address for next node */
+ if (numa_info->node_memdev) {
+ addr += node_size;
+ }
+ }
+}
+
/*
* AMD systems with an IOMMU have an additional hole close to the
* 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -910,6 +968,9 @@ void pc_memory_init(PCMachineState *pcms,
e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
}
+ /* Update E820 for NUMA nodes with special memory types */
+ pc_update_numa_memory_types(x86ms);
+
if (!pcmc->has_reserved_memory &&
(machine->ram_slots ||
(machine->maxram_size > machine->ram_size))) {
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..64e8f63736 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -35,12 +35,19 @@ enum {
#define UINT16_BITS 16
+typedef enum {
+ NUMA_MEMMAP_NORMAL = 0,
+ NUMA_MEMMAP_SPM,
+ NUMA_MEMMAP_RESERVED,
+} NumaMemmapTypeInternal;
+
typedef struct NodeInfo {
uint64_t node_mem;
struct HostMemoryBackend *node_memdev;
bool present;
bool has_cpu;
bool has_gi;
+ NumaMemmapTypeInternal memmap_type;
uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 907cb25f75..b7fc8c564f 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -464,6 +464,22 @@
{ 'enum': 'NumaOptionsType',
'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] }
+##
+# @NumaMemmapType:
+#
+# Memory mapping type for a NUMA node.
+#
+# @normal: Normal system RAM (E820 type 1)
+#
+# @spm: Specific Purpose Memory (E820 type 0xEFFFFFFF)
+#
+# @reserved: Reserved memory (E820 type 2)
+#
+# Since: 10.2
+##
+{ 'enum': 'NumaMemmapType',
+ 'data': ['normal', 'spm', 'reserved'] }
+
##
# @NumaOptions:
#
@@ -500,6 +516,13 @@
# @memdev: memory backend object. If specified for one node, it must
# be specified for all nodes.
#
+# @memmap-type: specifies the memory type for this NUMA node.
+# 'normal' (default) is regular system RAM.
+# 'spm' is Specific Purpose Memory - a hint to the guest that
+# this memory might be managed by device drivers based on policy.
+# 'reserved' is reserved memory, not usable as RAM.
+# Currently only supported on x86. (since 10.2)
+#
# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
# to the nodeid which has the memory controller responsible for
# this NUMA node. This field provides additional information as
@@ -514,6 +537,7 @@
'*cpus': ['uint16'],
'*mem': 'size',
'*memdev': 'str',
+ '*memmap-type': 'NumaMemmapType',
'*initiator': 'uint16' }}
##
diff --git a/qemu-options.hx b/qemu-options.hx
index ec92723f10..4da17cbefb 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -433,7 +433,7 @@ ERST
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
+ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,memmap-type=normal|spm|reserved]\n"
"-numa dist,src=source,dst=destination,val=distance\n"
"-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
"-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
@@ -442,7 +442,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
SRST
``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
\
-``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
+``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,memmap-type=type]``
\
``-numa dist,src=source,dst=destination,val=distance``
\
@@ -510,6 +510,16 @@ SRST
largest bandwidth) to this NUMA node. Note that this option can be
set only when the machine property 'hmat' is set to 'on'.
+ '\ ``memmap-type``\ ' specifies the memory type for this NUMA node:
+
+ - ``normal`` (default): Regular system RAM (E820 type 1)
+ - ``spm``: Specific Purpose Memory (E820 type 0xEFFFFFFF). This is a
+ hint to the guest that the memory might be managed by device drivers
+ based on guest policy.
+ - ``reserved``: Reserved memory (E820 type 2), not usable as RAM.
+
+ This option is only supported on x86 platforms.
+
Following example creates a machine with 2 NUMA nodes, node 0 has
CPU. node 1 has only memory, and its initiator is node 0. Note that
because node 0 has CPU, by default the initiator of node 0 is itself
--
2.34.1