This patch adds support for Specific Purpose Memory (SPM) through the
NUMA node configuration. When 'spm=on' is specified for a NUMA node,
QEMU will:
1. Set the RAM_SPM flag in the RAM block of the corresponding memory region
2. Update the overlapping E820 RAM entries before adding E820_SOFT_RESERVED
3. Set the E820 type to E820_SOFT_RESERVED for this memory region
This allows guest operating systems to recognize the memory as soft reserved
memory, which can be used for device-specific memory management without
E820 table conflicts.
Usage:
-numa node,nodeid=0,memdev=m1,spm=on
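
To illustrate the split, a hypothetical E820 table before and after
marking a 2G node based at 4G as SPM (addresses assumed for this
example, not taken from the patch):

    before: 0x000000000 - 0x07fffffff  E820_RAM
            0x100000000 - 0x27fffffff  E820_RAM
    after:  0x000000000 - 0x07fffffff  E820_RAM
            0x100000000 - 0x17fffffff  E820_SOFT_RESERVED
            0x180000000 - 0x27fffffff  E820_RAM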
Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
---
hw/core/numa.c | 3 ++
hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++
hw/i386/e820_memory_layout.h | 2 +
hw/i386/pc.c | 37 ++++++++++++++++++
include/exec/cpu-common.h | 1 +
include/system/memory.h | 3 ++
include/system/numa.h | 1 +
qapi/machine.json | 6 +++
system/physmem.c | 7 +++-
9 files changed, 132 insertions(+), 1 deletion(-)
diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f745..e680130460 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
+ /* Store spm configuration for later processing */
+ numa_info[nodenr].is_spm = node->has_spm && node->spm;
+
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..5b090ac6df 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
}
return false;
}
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+ uint64_t end = start + length;
+ bool updated = false;
+ assert(!e820_done);
+
+ /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
+ if (new_type == E820_SOFT_RESERVED) {
+ bool range_in_ram = false;
+ for (size_t j = 0; j < e820_entries; j++) {
+ uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+ uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+ uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+ if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+ range_in_ram = true;
+ break;
+ }
+ }
+ if (!range_in_ram) {
+ return false;
+ }
+ }
+
+ /* Find entry that contains the target range and update it */
+ for (size_t i = 0; i < e820_entries; i++) {
+ uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+ uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+ uint64_t entry_end = entry_start + entry_length;
+
+ if (entry_start <= start && entry_end >= end) {
+ uint32_t original_type = e820_table[i].type;
+
+ /* Remove original entry */
+ memmove(&e820_table[i], &e820_table[i + 1],
+ (e820_entries - i - 1) * sizeof(struct e820_entry));
+ e820_entries--;
+
+ /* Add split parts inline */
+ if (entry_start < start) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(entry_start);
+ e820_table[e820_entries].length =
+ cpu_to_le64(start - entry_start);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(start);
+ e820_table[e820_entries].length = cpu_to_le64(length);
+ e820_table[e820_entries].type = cpu_to_le32(new_type);
+ e820_entries++;
+
+ if (end < entry_end) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(end);
+ e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ updated = true;
+ break;
+ }
+ }
+
+ return updated;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..657cc679e2 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -15,6 +15,7 @@
#define E820_ACPI 3
#define E820_NVS 4
#define E820_UNUSABLE 5
+#define E820_SOFT_RESERVED 0xEFFFFFFF
struct e820_entry {
uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
bool e820_get_entry(int index, uint32_t type,
uint64_t *address, uint64_t *length);
int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index bc048a6d13..3e50570484 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -26,6 +26,7 @@
#include "qemu/units.h"
#include "exec/target_page.h"
#include "hw/i386/pc.h"
+#include "system/ramblock.h"
#include "hw/char/serial-isa.h"
#include "hw/char/parallel.h"
#include "hw/hyperv/hv-balloon.h"
@@ -787,6 +788,41 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
return pc_above_4g_end(pcms) - 1;
}
+static int pc_update_spm_memory(RAMBlock *rb, void *opaque)
+{
+ X86MachineState *x86ms = opaque;
+ MachineState *ms = MACHINE(x86ms);
+ ram_addr_t offset;
+ ram_addr_t length;
+ bool is_spm = false;
+
+ /* Check if this RAM block belongs to a NUMA node with spm=on */
+ for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+ NodeInfo *numa_info = &ms->numa_state->nodes[i];
+ if (numa_info->is_spm && numa_info->node_memdev) {
+ MemoryRegion *mr = &numa_info->node_memdev->mr;
+ if (mr->ram_block == rb) {
+ /* Mark this RAM block as SPM and set the flag */
+ rb->flags |= RAM_SPM;
+ is_spm = true;
+ break;
+ }
+ }
+ }
+
+ if (is_spm) {
+ offset = qemu_ram_get_offset(rb) +
+ (0x100000000ULL - x86ms->below_4g_mem_size);
+ length = qemu_ram_get_used_length(rb);
+ if (!e820_update_entry_type(offset, length, E820_SOFT_RESERVED)) {
+ warn_report("Failed to update E820 entry for SPM at 0x%" PRIx64
+ " length 0x%" PRIx64, offset, length);
+ }
+ }
+
+ return 0;
+}
+
/*
* AMD systems with an IOMMU have an additional hole close to the
* 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -901,6 +937,7 @@ void pc_memory_init(PCMachineState *pcms,
if (pcms->sgx_epc.size != 0) {
e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
}
+ qemu_ram_foreach_block(pc_update_spm_memory, x86ms);
if (!pcmc->has_reserved_memory &&
(machine->ram_slots ||
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9b658a3f48..9b437eaa10 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -89,6 +89,7 @@ ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
bool qemu_ram_is_shared(RAMBlock *rb);
+bool qemu_ram_is_spm(RAMBlock *rb);
bool qemu_ram_is_noreserve(RAMBlock *rb);
bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
void qemu_ram_set_uf_zeroable(RAMBlock *rb);
diff --git a/include/system/memory.h b/include/system/memory.h
index aa85fc27a1..0d36cbd30d 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -275,6 +275,9 @@ typedef struct IOMMUTLBEvent {
*/
#define RAM_PRIVATE (1 << 13)
+/* RAM is Specific Purpose Memory */
+#define RAM_SPM (1 << 14)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..438511a756 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -41,6 +41,7 @@ typedef struct NodeInfo {
bool present;
bool has_cpu;
bool has_gi;
+ bool is_spm;
uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 038eab281c..1fa31b0224 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -500,6 +500,11 @@
# @memdev: memory backend object. If specified for one node, it must
# be specified for all nodes.
#
+# @spm: if true, mark the memory region of this node as Specific
+# Purpose Memory (SPM). This will set the RAM_SPM flag for the
+# corresponding memory region and set the E820 type to
+# E820_SOFT_RESERVED. (default: false, since 9.2)
+#
# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
# to the nodeid which has the memory controller responsible for
# this NUMA node. This field provides additional information as
@@ -514,6 +519,7 @@
'*cpus': ['uint16'],
'*mem': 'size',
'*memdev': 'str',
+ '*spm': 'bool',
'*initiator': 'uint16' }}
##
diff --git a/system/physmem.c b/system/physmem.c
index ae8ecd50ea..0090d9955d 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
return rb->flags & RAM_NORESERVE;
}
+bool qemu_ram_is_spm(RAMBlock *rb)
+{
+ return rb->flags & RAM_SPM;
+}
+
/* Note: Only set at the start of postcopy */
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
{
@@ -2032,7 +2037,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
ram_flags &= ~RAM_PRIVATE;
/* Just support these ram flags by now. */
- assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
+ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_SPM | RAM_NORESERVE |
RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
RAM_READONLY_FD | RAM_GUEST_MEMFD |
RAM_RESIZEABLE)) == 0);
--
2.34.1
On 20.10.25 11:07, fanhuang wrote:
> This patch adds support for Specific Purpose Memory (SPM) through the
> NUMA node configuration. When 'spm=on' is specified for a NUMA node,
> QEMU will:
>
> [...]
>
> diff --git a/system/physmem.c b/system/physmem.c
> index ae8ecd50ea..0090d9955d 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
> return rb->flags & RAM_NORESERVE;
> }
>
> +bool qemu_ram_is_spm(RAMBlock *rb)
> +{
> + return rb->flags & RAM_SPM;
> +}
> +
IIUC, this function is unused, and the only setter is in
pc_update_spm_memory().
Why do we have to modify the RAMBlock at all or walk over them?
Shouldn't it be sufficient to just walk over all
&ms->numa_state->nodes[i] and update e820 accordingly?
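
A minimal sketch of that shape (guest_addr_of() is a placeholder for
the address translation, which would still need to account for the PCI
hole; names otherwise follow the existing patch):

    static void pc_update_spm_memory(X86MachineState *x86ms)
    {
        MachineState *ms = MACHINE(x86ms);

        for (int i = 0; i < ms->numa_state->num_nodes; i++) {
            NodeInfo *node = &ms->numa_state->nodes[i];

            if (node->is_spm && node->node_memdev) {
                /* flip the E820 type for this node's range */
                e820_update_entry_type(guest_addr_of(x86ms, node),
                                       node->node_mem, E820_SOFT_RESERVED);
            }
        }
    }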
--
Cheers
David / dhildenb
On 11/3/2025 8:32 PM, David Hildenbrand wrote:
> On 20.10.25 11:07, fanhuang wrote:
>> This patch adds support for Specific Purpose Memory (SPM) through the
>> NUMA node configuration. When 'spm=on' is specified for a NUMA node,
>> QEMU will:
>>
>> [...]
>>
>> +bool qemu_ram_is_spm(RAMBlock *rb)
>> +{
>> + return rb->flags & RAM_SPM;
>> +}
>> +
>
> IIUC, this function is unused, and the only setter is in
> pc_update_spm_memory().
>
> Why do we have to modify the RAMBlock at all or walk over them?
>
> Shouldn't it be sufficient to just walk over all &ms->numa_state-
> >nodes[i] and update e820 accordingly?
>
Hi David,
Thank you for the excellent review and the insightful suggestion!
You're absolutely right - I've simplified the implementation to
directly iterate over NUMA nodes instead of RAMBlocks.
I'll send v3 after internal review. I also understand Igor's
feedback would be valuable - I'll wait to hear if he has any
concerns.
Best regards,
Jerry Huang
Hi David and Igor,

Thank you for your patience. It's been about a month since our last
discussion, and I apologize for the delay.

This is v3 of the SPM (Specific Purpose Memory) patch. Following
David's suggestion from the v2 review, I've simplified the
implementation significantly.

Changes in v3:
- Removed unnecessary RAMBlock traversal and RAM_SPM flag
- Now directly iterates over NUMA nodes to update E820 entries
- Added documentation in qemu-options.hx

Use case reminder: this feature allows passing EFI_MEMORY_SP (Specific
Purpose Memory) from the host to a guest VM, which is useful for memory
reserved for specific PCI devices (e.g., GPU memory via VFIO-PCI). The
SPM memory appears as E820_SOFT_RESERVED to the guest and is managed by
device drivers rather than the OS memory allocator.

Example usage:
-object memory-backend-ram,size=8G,id=m0
-object memory-backend-file,size=8G,id=m1,mem-path=/dev/dax0.0
-numa node,nodeid=0,memdev=m0
-numa node,nodeid=1,memdev=m1,spm=on

Please review. Thank you for your guidance on this implementation.

Best regards,
Jerry Huang
This patch adds support for Specific Purpose Memory (SPM) through the
NUMA node configuration. When 'spm=on' is specified for a NUMA node,
QEMU will set the E820 type to E820_SOFT_RESERVED for this memory region.
This allows guest operating systems to recognize the memory as soft reserved
memory, which can be used for device-specific memory management.
The implementation directly iterates over NUMA nodes to update E820 entries,
avoiding unnecessary RAMBlock traversal and flags, as suggested in code review.
Usage:
-numa node,nodeid=0,memdev=m1,spm=on
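
To make the address calculation concrete, a worked example with assumed
sizes (below_4g_mem_size = 2G, node 0 = 2G, node 1 = 8G with spm=on;
these values are illustrative only):

    node 0: addr = 0,  entirely below 4G -> guest base 0x0
    node 1: addr = 2G, at/above the 4G split
            -> guest base 0x100000000 + (2G - 2G) = 0x100000000

so the 8G SPM node is soft-reserved at [0x100000000, 0x300000000).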
Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
---
hw/core/numa.c | 3 ++
hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++
hw/i386/e820_memory_layout.h | 2 +
hw/i386/pc.c | 55 +++++++++++++++++++++++++++
include/system/numa.h | 1 +
qapi/machine.json | 5 +++
qemu-options.hx | 11 +++++-
7 files changed, 148 insertions(+), 2 deletions(-)
diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f745..e680130460 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
+ /* Store spm configuration for later processing */
+ numa_info[nodenr].is_spm = node->has_spm && node->spm;
+
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..5b090ac6df 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
}
return false;
}
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+ uint64_t end = start + length;
+ bool updated = false;
+ assert(!e820_done);
+
+ /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
+ if (new_type == E820_SOFT_RESERVED) {
+ bool range_in_ram = false;
+ for (size_t j = 0; j < e820_entries; j++) {
+ uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+ uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+ uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+ if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+ range_in_ram = true;
+ break;
+ }
+ }
+ if (!range_in_ram) {
+ return false;
+ }
+ }
+
+ /* Find entry that contains the target range and update it */
+ for (size_t i = 0; i < e820_entries; i++) {
+ uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+ uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+ uint64_t entry_end = entry_start + entry_length;
+
+ if (entry_start <= start && entry_end >= end) {
+ uint32_t original_type = e820_table[i].type;
+
+ /* Remove original entry */
+ memmove(&e820_table[i], &e820_table[i + 1],
+ (e820_entries - i - 1) * sizeof(struct e820_entry));
+ e820_entries--;
+
+ /* Add split parts inline */
+ if (entry_start < start) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(entry_start);
+ e820_table[e820_entries].length =
+ cpu_to_le64(start - entry_start);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(start);
+ e820_table[e820_entries].length = cpu_to_le64(length);
+ e820_table[e820_entries].type = cpu_to_le32(new_type);
+ e820_entries++;
+
+ if (end < entry_end) {
+ e820_table = g_renew(struct e820_entry, e820_table,
+ e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(end);
+ e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
+ e820_table[e820_entries].type = original_type;
+ e820_entries++;
+ }
+
+ updated = true;
+ break;
+ }
+ }
+
+ return updated;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..657cc679e2 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -15,6 +15,7 @@
#define E820_ACPI 3
#define E820_NVS 4
#define E820_UNUSABLE 5
+#define E820_SOFT_RESERVED 0xEFFFFFFF
struct e820_entry {
uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
bool e820_get_entry(int index, uint32_t type,
uint64_t *address, uint64_t *length);
int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f8b919cb6c..ccb2af2a56 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -791,6 +791,58 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
return pc_above_4g_end(pcms) - 1;
}
+/*
+ * Update E820 entries for NUMA nodes marked as SPM (Specific Purpose Memory).
+ * This function directly iterates over NUMA nodes instead of RAMBlocks,
+ * as suggested by code review to simplify the implementation.
+ */
+static void pc_update_spm_memory(X86MachineState *x86ms)
+{
+ MachineState *ms = MACHINE(x86ms);
+ uint64_t addr = 0;
+
+ for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+ NodeInfo *numa_info = &ms->numa_state->nodes[i];
+ uint64_t node_size = numa_info->node_mem;
+
+ /* Process SPM nodes */
+ if (numa_info->is_spm && numa_info->node_memdev) {
+ uint64_t guest_addr;
+
+ /* Calculate guest physical address accounting for PCI hole */
+ if (addr < x86ms->below_4g_mem_size) {
+ if (addr + node_size <= x86ms->below_4g_mem_size) {
+ /* Entirely below 4GB */
+ guest_addr = addr;
+ } else {
+ /* Spans across 4GB boundary - should not happen with proper config */
+ warn_report("SPM node %d spans 4GB boundary, "
+ "using address above 4GB", i);
+ guest_addr = 0x100000000ULL +
+ (addr + node_size - x86ms->below_4g_mem_size);
+ }
+ } else {
+ /* Above 4GB, account for PCI hole */
+ guest_addr = 0x100000000ULL +
+ (addr - x86ms->below_4g_mem_size);
+ }
+
+ /* Update E820 entry type to SOFT_RESERVED */
+ if (!e820_update_entry_type(guest_addr, node_size,
+ E820_SOFT_RESERVED)) {
+ warn_report("Failed to update E820 entry for SPM node %d "
+ "at 0x%" PRIx64 " length 0x%" PRIx64,
+ i, guest_addr, node_size);
+ }
+ }
+
+ /* Accumulate address for next node */
+ if (numa_info->node_memdev) {
+ addr += node_size;
+ }
+ }
+}
+
/*
* AMD systems with an IOMMU have an additional hole close to the
* 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -907,6 +959,9 @@ void pc_memory_init(PCMachineState *pcms,
e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
}
+ /* Update E820 for NUMA nodes marked as SPM */
+ pc_update_spm_memory(x86ms);
+
if (!pcmc->has_reserved_memory &&
(machine->ram_slots ||
(machine->maxram_size > machine->ram_size))) {
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..438511a756 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -41,6 +41,7 @@ typedef struct NodeInfo {
bool present;
bool has_cpu;
bool has_gi;
+ bool is_spm;
uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 907cb25f75..98c2367ee6 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -500,6 +500,10 @@
# @memdev: memory backend object. If specified for one node, it must
# be specified for all nodes.
#
+# @spm: if true, mark the memory region of this node as Specific
+# Purpose Memory (SPM). This will set the E820 type to
+# E820_SOFT_RESERVED for guest OS. (default: false, since 9.2)
+#
# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
# to the nodeid which has the memory controller responsible for
# this NUMA node. This field provides additional information as
@@ -514,6 +518,7 @@
'*cpus': ['uint16'],
'*mem': 'size',
'*memdev': 'str',
+ '*spm': 'bool',
'*initiator': 'uint16' }}
##
diff --git a/qemu-options.hx b/qemu-options.hx
index fca2b7bc74..7d914a9bc6 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -431,7 +431,7 @@ ERST
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
+ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,spm=on|off]\n"
"-numa dist,src=source,dst=destination,val=distance\n"
"-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
"-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
@@ -440,7 +440,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
SRST
``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
\
-``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
+``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,spm=on|off]``
\
``-numa dist,src=source,dst=destination,val=distance``
\
@@ -508,6 +508,13 @@ SRST
largest bandwidth) to this NUMA node. Note that this option can be
set only when the machine property 'hmat' is set to 'on'.
+ '\ ``spm``\ ' option marks the memory region of this NUMA node as
+ Specific Purpose Memory (SPM). When enabled, the memory will be
+ reported as soft reserved (E820 type 0xEFFFFFFF) to the guest OS,
+ which can then manage it separately from normal system RAM. This is
+ useful for device-specific memory that should not be used as general
+ purpose memory. This option is only supported on x86 platforms.
+
Following example creates a machine with 2 NUMA nodes, node 0 has
CPU. node 1 has only memory, and its initiator is node 0. Note that
because node 0 has CPU, by default the initiator of node 0 is itself
--
2.34.1
On Mon, 8 Dec 2025 18:51:37 +0800
fanhuang <FangSheng.Huang@amd.com> wrote:
> This patch adds support for Specific Purpose Memory (SPM) through the
> NUMA node configuration. When 'spm=on' is specified for a NUMA node,
> QEMU will set the E820 type to E820_SOFT_RESERVED for this memory region.
>
> This allows guest operating systems to recognize the memory as soft reserved
> memory, which can be used for device-specific memory management.
>
> The implementation directly iterates over NUMA nodes to update E820 entries,
> avoiding unnecessary RAMBlock traversal and flags, as suggested in code review.
>
> Usage:
> -numa node,nodeid=0,memdev=m1,spm=on
>
> Signed-off-by: fanhuang <FangSheng.Huang@amd.com>
Hi,
A few suggestions inline.
One general thing: I would never send a new version in reply to a
previous one. That tends to mean it ends up way back in reviewers'
inboxes and leads to confusing threads. The thread naming of the cover
letter is enough to associate the different versions.
> diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
> index 3e848fb69c..5b090ac6df 100644
> --- a/hw/i386/e820_memory_layout.c
> +++ b/hw/i386/e820_memory_layout.c
> @@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
> }
> return false;
> }
> +
> +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
> +{
> + uint64_t end = start + length;
> + bool updated = false;
> + assert(!e820_done);
> +
> + /* For E820_SOFT_RESERVED, validate range is within E820_RAM */
> + if (new_type == E820_SOFT_RESERVED) {
> + bool range_in_ram = false;
I'd put a blank line here for readability.
> + for (size_t j = 0; j < e820_entries; j++) {
> + uint64_t ram_start = le64_to_cpu(e820_table[j].address);
> + uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
> + uint32_t ram_type = le32_to_cpu(e820_table[j].type);
> +
> + if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
> + range_in_ram = true;
> + break;
> + }
> + }
> + if (!range_in_ram) {
> + return false;
> + }
> + }
> +
> + /* Find entry that contains the target range and update it */
> + for (size_t i = 0; i < e820_entries; i++) {
> + uint64_t entry_start = le64_to_cpu(e820_table[i].address);
> + uint64_t entry_length = le64_to_cpu(e820_table[i].length);
> + uint64_t entry_end = entry_start + entry_length;
> +
> + if (entry_start <= start && entry_end >= end) {
> + uint32_t original_type = e820_table[i].type;
> +
> + /* Remove original entry */
> + memmove(&e820_table[i], &e820_table[i + 1],
> + (e820_entries - i - 1) * sizeof(struct e820_entry));
> + e820_entries--;
> +
> + /* Add split parts inline */
> + if (entry_start < start) {
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(entry_start);
> + e820_table[e820_entries].length =
> + cpu_to_le64(start - entry_start);
> + e820_table[e820_entries].type = original_type;
> + e820_entries++;
> + }
> +
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(start);
> + e820_table[e820_entries].length = cpu_to_le64(length);
> + e820_table[e820_entries].type = cpu_to_le32(new_type);
> + e820_entries++;
> +
> + if (end < entry_end) {
> + e820_table = g_renew(struct e820_entry, e820_table,
> + e820_entries + 1);
> + e820_table[e820_entries].address = cpu_to_le64(end);
> + e820_table[e820_entries].length = cpu_to_le64(entry_end - end);
> + e820_table[e820_entries].type = original_type;
> + e820_entries++;
> + }
> +
> + updated = true;
Given that you break out of the for loop and then return, why not
return true;
> + break;
> + }
> + }
> +
> + return updated;
return false;
and get rid of the updated local variable.
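i.e., a sketch of the suggested control flow:

    for (size_t i = 0; i < e820_entries; i++) {
        ...
        if (entry_start <= start && entry_end >= end) {
            /* split/replace the entry as above */
            return true;
        }
    }

    return false;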
> +}
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index f8b919cb6c..ccb2af2a56 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -791,6 +791,58 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
> return pc_above_4g_end(pcms) - 1;
> }
>
> +/*
> + * Update E820 entries for NUMA nodes marked as SPM (Specific Purpose Memory).
> + * This function directly iterates over NUMA nodes instead of RAMBlocks,
> + * as suggested by code review to simplify the implementation.
Drop this sentence; it belongs in the patch description, not the code,
I think.
> + */
> +static void pc_update_spm_memory(X86MachineState *x86ms)
> +{
> + MachineState *ms = MACHINE(x86ms);
> + uint64_t addr = 0;
> +
> + for (int i = 0; i < ms->numa_state->num_nodes; i++) {
> + NodeInfo *numa_info = &ms->numa_state->nodes[i];
> + uint64_t node_size = numa_info->node_mem;
> +
> + /* Process SPM nodes */
> + if (numa_info->is_spm && numa_info->node_memdev) {
> + uint64_t guest_addr;
> +
> + /* Calculate guest physical address accounting for PCI hole */
> + if (addr < x86ms->below_4g_mem_size) {
> + if (addr + node_size <= x86ms->below_4g_mem_size) {
> + /* Entirely below 4GB */
> + guest_addr = addr;
> + } else {
> + /* Spans across 4GB boundary - should not happen with proper config */
Why not just error out then? It would be better not to have configs in
the wild that don't make sense, and having QEMU refuse to start with a
good error message is a great way to ensure no one does that.
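Something like (a sketch, using the stock error-reporting helpers):

    if (addr < x86ms->below_4g_mem_size &&
        addr + node_size > x86ms->below_4g_mem_size) {
        error_report("SPM node %d spans the 4GB boundary; adjust the "
                     "memory layout so the node sits entirely below or "
                     "above 4GB", i);
        exit(EXIT_FAILURE);
    }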
> + warn_report("SPM node %d spans 4GB boundary, "
> + "using address above 4GB", i);
> + guest_addr = 0x100000000ULL +
> + (addr + node_size - x86ms->below_4g_mem_size);
> + }
> + } else {
> + /* Above 4GB, account for PCI hole */
> + guest_addr = 0x100000000ULL +
> + (addr - x86ms->below_4g_mem_size);
> + }
> +
> + /* Update E820 entry type to SOFT_RESERVED */
> + if (!e820_update_entry_type(guest_addr, node_size,
> + E820_SOFT_RESERVED)) {
> + warn_report("Failed to update E820 entry for SPM node %d "
> + "at 0x%" PRIx64 " length 0x%" PRIx64,
> + i, guest_addr, node_size);
> + }
> + }
> +
> + /* Accumulate address for next node */
> + if (numa_info->node_memdev) {
> + addr += node_size;
> + }
> + }
> +}
> diff --git a/include/system/numa.h b/include/system/numa.h
> index 1044b0eb6e..438511a756 100644
> --- a/include/system/numa.h
> +++ b/include/system/numa.h
> @@ -41,6 +41,7 @@ typedef struct NodeInfo {
> bool present;
> bool has_cpu;
> bool has_gi;
> + bool is_spm;
> uint8_t lb_info_provided;
> uint16_t initiator;
> uint8_t distance[MAX_NODES];
> diff --git a/qapi/machine.json b/qapi/machine.json
> index 907cb25f75..98c2367ee6 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -500,6 +500,10 @@
> # @memdev: memory backend object. If specified for one node, it must
> # be specified for all nodes.
> #
> +# @spm: if true, mark the memory region of this node as Specific
> +# Purpose Memory (SPM). This will set the E820 type to
> +# E820_SOFT_RESERVED for guest OS. (default: false, since 9.2)
This is an arch-independent file, and only x86 has an E820 table to do
this in. Obviously we'll need to wire it up to the EFI memory map on
other architectures, but I'd definitely like to avoid arch-specific
documentation, or at least call out which architectures it applies to.
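e.g. a possible rewording (sketch only, not a drop-in replacement):

    # @spm: if true, expose this node's memory to the guest as Specific
    #     Purpose Memory, reported as soft reserved on targets that
    #     support it (currently only x86, via E820_SOFT_RESERVED).
    #     (default: false, since 9.2)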
> +#
> # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
> # to the nodeid which has the memory controller responsible for
> # this NUMA node. This field provides additional information as
> @@ -514,6 +518,7 @@
> '*cpus': ['uint16'],
> '*mem': 'size',
> '*memdev': 'str',
> + '*spm': 'bool',
> '*initiator': 'uint16' }}
>
> ##
> diff --git a/qemu-options.hx b/qemu-options.hx
> index fca2b7bc74..7d914a9bc6 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -431,7 +431,7 @@ ERST
>
> DEF("numa", HAS_ARG, QEMU_OPTION_numa,
> "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
> - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
> + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,spm=on|off]\n"
> "-numa dist,src=source,dst=destination,val=distance\n"
> "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
> "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
> @@ -440,7 +440,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
> SRST
> ``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
> \
> -``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]``
> +``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,spm=on|off]``
> \
> ``-numa dist,src=source,dst=destination,val=distance``
> \
> @@ -508,6 +508,13 @@ SRST
> largest bandwidth) to this NUMA node. Note that this option can be
> set only when the machine property 'hmat' is set to 'on'.
>
> + '\ ``spm``\ ' option marks the memory region of this NUMA node as
> + Specific Purpose Memory (SPM). When enabled, the memory will be
> + reported as soft reserved (E820 type 0xEFFFFFFF) to the guest OS,
> + which can then manage it separately from normal system RAM. This is
> + useful for device-specific memory that should not be used as general
> + purpose memory. This option is only supported on x86 platforms.
Do we error out if anyone tries to set it on other architectures?
From a quick look I think you are just ignoring it, which is a good
way to confuse users.
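One possible shape in parse_numa_node() (a sketch; numa_spm_supported
is a hypothetical MachineClass flag, not an existing field):

    if (node->has_spm && node->spm) {
        MachineClass *mc = MACHINE_GET_CLASS(ms);

        if (!mc->numa_spm_supported) {
            error_setg(errp, "spm=on is not supported by this machine type");
            return;
        }
    }
    numa_info[nodenr].is_spm = node->has_spm && node->spm;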
> +
> Following example creates a machine with 2 NUMA nodes, node 0 has
> CPU. node 1 has only memory, and its initiator is node 0. Note that
> because node 0 has CPU, by default the initiator of node 0 is itself