[PATCH 2/6] x86/sev: add support for enabling RMPOPT

Ashish Kalra posted 6 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Ashish Kalra 1 month, 2 weeks ago
From: Ashish Kalra <ashish.kalra@amd.com>

The new RMPOPT instruction sets bits in a per-CPU RMPOPT table, which
indicates whether specific 1GB physical memory regions contain SEV-SNP
guest memory.

Per-CPU RMPOPT tables support at most 2 TB of addressable memory for
RMP optimizations. To handle this limitation:

For systems with 2 TB of RAM or less, configure each per-CPU RMPOPT
table base to 0 so that all system RAM is RMP-optimized on every CPU.

For systems with more than 2 TB of RAM, configure per-CPU RMPOPT
tables to cover the memory local to each NUMA node so RMP
optimizations can take advantage of NUMA locality. This must also
accommodate virtualized NUMA software domains (for example, AMD NPS
configurations) and ensure that the 2 TB RAM range local to each
physical socket is RMP-optimized.

Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/include/asm/msr-index.h |   3 +
 arch/x86/virt/svm/sev.c          | 192 +++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index da5275d8eda6..8e7da03abd5b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -753,6 +753,9 @@
 #define MSR_AMD64_SEG_RMP_ENABLED_BIT	0
 #define MSR_AMD64_SEG_RMP_ENABLED	BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
 #define MSR_AMD64_RMP_SEGMENT_SHIFT(x)	(((x) & GENMASK_ULL(13, 8)) >> 8)
+#define MSR_AMD64_RMPOPT_BASE		0xc0010139
+#define MSR_AMD64_RMPOPT_ENABLE_BIT	0
+#define MSR_AMD64_RMPOPT_ENABLE		BIT_ULL(MSR_AMD64_RMPOPT_ENABLE_BIT)
 
 #define MSR_SVSM_CAA			0xc001f000
 
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index ee643a6cd691..e6b784d26c33 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -127,6 +127,17 @@ static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
 
 static unsigned long snp_nr_leaked_pages;
 
+#define RMPOPT_TABLE_MAX_LIMIT_IN_TB	2
+#define NUM_TB(pfn_min, pfn_max)	\
+	(((pfn_max) - (pfn_min)) / (1 << (40 - PAGE_SHIFT)))
+
+struct rmpopt_socket_config {
+	unsigned long start_pfn, end_pfn;
+	cpumask_var_t cpulist;
+	int *node_id;
+	int current_node_idx;
+};
+
 #undef pr_fmt
 #define pr_fmt(fmt)	"SEV-SNP: " fmt
 
@@ -500,6 +511,185 @@ static bool __init setup_rmptable(void)
 	}
 }
 
+/*
+ * Build a cpumask of online primary threads, accounting for primary threads
+ * that have been offlined while their secondary threads are still online.
+ */
+static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
+{
+	cpumask_t cpus;
+	int cpu;
+
+	cpumask_copy(&cpus, cpu_online_mask);
+	for_each_cpu(cpu, &cpus) {
+		cpumask_set_cpu(cpu, cpulist);
+		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
+	}
+}
+
+static void __configure_rmpopt(void *val)
+{
+	u64 rmpopt_base = ((u64)val & PUD_MASK) | MSR_AMD64_RMPOPT_ENABLE;
+
+	wrmsrq(MSR_AMD64_RMPOPT_BASE, rmpopt_base);
+}
+
+static void configure_rmpopt_non_numa(cpumask_var_t primary_threads_cpulist)
+{
+	on_each_cpu_mask(primary_threads_cpulist, __configure_rmpopt, (void *)0, true);
+}
+
+static void free_rmpopt_socket_config(struct rmpopt_socket_config *socket)
+{
+	int i;
+
+	if (!socket)
+		return;
+
+	for (i = 0; i < topology_max_packages(); i++) {
+		free_cpumask_var(socket[i].cpulist);
+		kfree(socket[i].node_id);
+	}
+
+	kfree(socket);
+}
+DEFINE_FREE(free_rmpopt_socket_config, struct rmpopt_socket_config *, free_rmpopt_socket_config(_T))
+
+static void configure_rmpopt_large_physmem(cpumask_var_t primary_threads_cpulist)
+{
+	struct rmpopt_socket_config *socket __free(free_rmpopt_socket_config) = NULL;
+	int max_packages = topology_max_packages();
+	struct rmpopt_socket_config *sc;
+	int cpu, i;
+
+	socket = kcalloc(max_packages, sizeof(struct rmpopt_socket_config), GFP_KERNEL);
+	if (!socket)
+		return;
+
+	for (i = 0; i < max_packages; i++) {
+		sc = &socket[i];
+		if (!zalloc_cpumask_var(&sc->cpulist, GFP_KERNEL))
+			return;
+		sc->node_id = kcalloc(nr_node_ids, sizeof(int), GFP_KERNEL);
+		if (!sc->node_id)
+			return;
+		sc->current_node_idx = -1;
+	}
+
+	/*
+	 * Handle case of virtualized NUMA software domains, such as AMD Nodes Per Socket(NPS)
+	 * configurations. The kernel does not have an abstraction for physical sockets,
+	 * therefore, enumerate the physical sockets and Nodes Per Socket(NPS) information by
+	 * walking the online CPU list.
+	 */
+	for_each_cpu(cpu, primary_threads_cpulist) {
+		int socket_id, nid;
+
+		socket_id = topology_logical_package_id(cpu);
+		nid = cpu_to_node(cpu);
+		sc = &socket[socket_id];
+
+		/*
+		 * For each socket, determine the corresponding nodes and the socket's start
+		 * and end PFNs.
+		 * Record the node and the start and end PFNs of the first node found on the
+		 * socket, then record each subsequent node and update the end PFN for that
+		 * socket as additional nodes are found.
+		 */
+		if (sc->current_node_idx == -1) {
+			sc->current_node_idx = 0;
+			sc->node_id[sc->current_node_idx] = nid;
+			sc->start_pfn = node_start_pfn(nid);
+			sc->end_pfn = node_end_pfn(nid);
+		} else if (sc->node_id[sc->current_node_idx] != nid) {
+			sc->current_node_idx++;
+			sc->node_id[sc->current_node_idx] = nid;
+			sc->end_pfn = node_end_pfn(nid);
+		}
+
+		cpumask_set_cpu(cpu, sc->cpulist);
+	}
+
+	/*
+	 * If the "physical" socket has up to 2TB of memory, the per-CPU RMPOPT tables are
+	 * configured to the starting physical address of the socket, otherwise the tables
+	 * are configured per-node.
+	 */
+	for (i = 0; i < max_packages; i++) {
+		int num_tb_socket;
+		phys_addr_t pa;
+		int j;
+
+		sc = &socket[i];
+		num_tb_socket = NUM_TB(sc->start_pfn, sc->end_pfn) + 1;
+
+		pr_debug("socket start_pfn 0x%lx, end_pfn 0x%lx, socket cpu mask %*pbl\n",
+			 sc->start_pfn, sc->end_pfn, cpumask_pr_args(sc->cpulist));
+
+		if (num_tb_socket <= RMPOPT_TABLE_MAX_LIMIT_IN_TB) {
+			pa = PFN_PHYS(sc->start_pfn);
+			on_each_cpu_mask(sc->cpulist, __configure_rmpopt, (void *)pa, true);
+			continue;
+		}
+
+		for (j = 0; j <= sc->current_node_idx; j++) {
+			int nid = sc->node_id[j];
+			struct cpumask node_mask;
+
+			cpumask_and(&node_mask, cpumask_of_node(nid), sc->cpulist);
+			pa = PFN_PHYS(node_start_pfn(nid));
+
+			pr_debug("RMPOPT_BASE MSR on nodeid %d cpu mask %*pbl set to 0x%llx\n",
+				 nid, cpumask_pr_args(&node_mask), pa);
+			on_each_cpu_mask(&node_mask, __configure_rmpopt, (void *)pa, true);
+		}
+	}
+}
+
+static __init void configure_and_enable_rmpopt(void)
+{
+	cpumask_var_t primary_threads_cpulist;
+	int num_tb;
+
+	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT)) {
+		pr_debug("RMPOPT not supported on this platform\n");
+		return;
+	}
+
+	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) {
+		pr_debug("RMPOPT optimizations not enabled as SNP support is not enabled\n");
+		return;
+	}
+
+	if (!(rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)) {
+		pr_info("RMPOPT optimizations not enabled, segmented RMP required\n");
+		return;
+	}
+
+	if (!zalloc_cpumask_var(&primary_threads_cpulist, GFP_KERNEL))
+		return;
+
+	num_tb = NUM_TB(min_low_pfn, max_pfn) + 1;
+	pr_debug("NUM_TB pages in system %d\n", num_tb);
+
+	/* Only one thread per core needs to set RMPOPT_BASE MSR as it is per-core */
+	get_cpumask_of_primary_threads(primary_threads_cpulist);
+
+	/*
+	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory for RMP optimizations.
+	 *
+	 * Fastpath RMPOPT configuration and setup:
+	 * For systems with <= 2 TB of RAM, configure each per-core RMPOPT base to 0,
+	 * ensuring all system RAM is RMP-optimized on all CPUs.
+	 */
+	if (num_tb <= RMPOPT_TABLE_MAX_LIMIT_IN_TB)
+		configure_rmpopt_non_numa(primary_threads_cpulist);
+	else
+		configure_rmpopt_large_physmem(primary_threads_cpulist);
+
+	free_cpumask_var(primary_threads_cpulist);
+}
+
 /*
  * Do the necessary preparations which are verified by the firmware as
  * described in the SNP_INIT_EX firmware command description in the SNP
@@ -555,6 +745,8 @@ int __init snp_rmptable_init(void)
 skip_enable:
 	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
 
+	configure_and_enable_rmpopt();
+
 	/*
 	 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
 	 * notifier is invoked to do SNP IOMMU shutdown before kdump.
-- 
2.43.0
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Dave Hansen 1 month, 2 weeks ago
> +#define RMPOPT_TABLE_MAX_LIMIT_IN_TB	2
> +#define NUM_TB(pfn_min, pfn_max)	\
> +	(((pfn_max) - (pfn_min)) / (1 << (40 - PAGE_SHIFT)))

IMNHO, you should just keep these in bytes. No reason to keep them in TB.

> +struct rmpopt_socket_config {
> +	unsigned long start_pfn, end_pfn;
> +	cpumask_var_t cpulist;
> +	int *node_id;
> +	int current_node_idx;
> +};

This looks like optimization complexity before the groundwork is in
place. Also, don't we *have* CPU lists for NUMA nodes? This seems rather
redundant.

> +/*
> + * Build a cpumask of online primary threads, accounting for primary threads
> + * that have been offlined while their secondary threads are still online.
> + */
> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
> +{
> +	cpumask_t cpus;
> +	int cpu;
> +
> +	cpumask_copy(&cpus, cpu_online_mask);
> +	for_each_cpu(cpu, &cpus) {
> +		cpumask_set_cpu(cpu, cpulist);
> +		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
> +	}
> +}

Don't we have a primary thread mask already? I thought we did.

> +static void __configure_rmpopt(void *val)
> +{
> +	u64 rmpopt_base = ((u64)val & PUD_MASK) | MSR_AMD64_RMPOPT_ENABLE;
> +
> +	wrmsrq(MSR_AMD64_RMPOPT_BASE, rmpopt_base);
> +}

I'd honestly just make the callers align the address..

> +static void configure_rmpopt_non_numa(cpumask_var_t primary_threads_cpulist)
> +{
> +	on_each_cpu_mask(primary_threads_cpulist, __configure_rmpopt, (void *)0, true);
> +}
> +
> +static void free_rmpopt_socket_config(struct rmpopt_socket_config *socket)
> +{
> +	int i;
> +
> +	if (!socket)
> +		return;
> +
> +	for (i = 0; i < topology_max_packages(); i++) {
> +		free_cpumask_var(socket[i].cpulist);
> +		kfree(socket[i].node_id);
> +	}
> +
> +	kfree(socket);
> +}
> +DEFINE_FREE(free_rmpopt_socket_config, struct rmpopt_socket_config *, free_rmpopt_socket_config(_T))

Looking at all this, I really think you need a more organized series.

Make something that's _functional_ and works for all <2TB configs. Then,
go add all this NUMA complexity in a follow-on patch or patches. There's
too much going on here.

> +static void configure_rmpopt_large_physmem(cpumask_var_t primary_threads_cpulist)
> +{
> +	struct rmpopt_socket_config *socket __free(free_rmpopt_socket_config) = NULL;
> +	int max_packages = topology_max_packages();
> +	struct rmpopt_socket_config *sc;
> +	int cpu, i;
> +
> +	socket = kcalloc(max_packages, sizeof(struct rmpopt_socket_config), GFP_KERNEL);
> +	if (!socket)
> +		return;
> +
> +	for (i = 0; i < max_packages; i++) {
> +		sc = &socket[i];
> +		if (!zalloc_cpumask_var(&sc->cpulist, GFP_KERNEL))
> +			return;
> +		sc->node_id = kcalloc(nr_node_ids, sizeof(int), GFP_KERNEL);
> +		if (!sc->node_id)
> +			return;
> +		sc->current_node_idx = -1;
> +	}
> +
> +	/*
> +	 * Handle case of virtualized NUMA software domains, such as AMD Nodes Per Socket(NPS)
> +	 * configurations. The kernel does not have an abstraction for physical sockets,
> +	 * therefore, enumerate the physical sockets and Nodes Per Socket(NPS) information by
> +	 * walking the online CPU list.
> +	 */

By this point, I've forgotten why sockets are important here.

Why are they important?

> +	for_each_cpu(cpu, primary_threads_cpulist) {
> +		int socket_id, nid;
> +
> +		socket_id = topology_logical_package_id(cpu);
> +		nid = cpu_to_node(cpu);
> +		sc = &socket[socket_id];
> +
> +		/*
> +		 * For each socket, determine the corresponding nodes and the socket's start
> +		 * and end PFNs.
> +		 * Record the node and the start and end PFNs of the first node found on the
> +		 * socket, then record each subsequent node and update the end PFN for that
> +		 * socket as additional nodes are found.
> +		 */
> +		if (sc->current_node_idx == -1) {
> +			sc->current_node_idx = 0;
> +			sc->node_id[sc->current_node_idx] = nid;
> +			sc->start_pfn = node_start_pfn(nid);
> +			sc->end_pfn = node_end_pfn(nid);
> +		} else if (sc->node_id[sc->current_node_idx] != nid) {
> +			sc->current_node_idx++;
> +			sc->node_id[sc->current_node_idx] = nid;
> +			sc->end_pfn = node_end_pfn(nid);
> +		}
> +
> +		cpumask_set_cpu(cpu, sc->cpulist);
> +	}
> +
> +	/*
> +	 * If the "physical" socket has up to 2TB of memory, the per-CPU RMPOPT tables are
> +	 * configured to the starting physical address of the socket, otherwise the tables
> +	 * are configured per-node.
> +	 */
> +	for (i = 0; i < max_packages; i++) {
> +		int num_tb_socket;
> +		phys_addr_t pa;
> +		int j;
> +
> +		sc = &socket[i];
> +		num_tb_socket = NUM_TB(sc->start_pfn, sc->end_pfn) + 1;
> +
> +		pr_debug("socket start_pfn 0x%lx, end_pfn 0x%lx, socket cpu mask %*pbl\n",
> +			 sc->start_pfn, sc->end_pfn, cpumask_pr_args(sc->cpulist));
> +
> +		if (num_tb_socket <= RMPOPT_TABLE_MAX_LIMIT_IN_TB) {
> +			pa = PFN_PHYS(sc->start_pfn);
> +			on_each_cpu_mask(sc->cpulist, __configure_rmpopt, (void *)pa, true);
> +			continue;
> +		}
> +
> +		for (j = 0; j <= sc->current_node_idx; j++) {
> +			int nid = sc->node_id[j];
> +			struct cpumask node_mask;
> +
> +			cpumask_and(&node_mask, cpumask_of_node(nid), sc->cpulist);
> +			pa = PFN_PHYS(node_start_pfn(nid));
> +
> +			pr_debug("RMPOPT_BASE MSR on nodeid %d cpu mask %*pbl set to 0x%llx\n",
> +				 nid, cpumask_pr_args(&node_mask), pa);
> +			on_each_cpu_mask(&node_mask, __configure_rmpopt, (void *)pa, true);
> +		}
> +	}
> +}

Ahh, so you're not optimizing by NUMA itself: you're assuming that there
are groups of NUMA nodes in a socket and then optimizing for those groups.

It would have been nice to say that. It would make great material for
the changelog for your broken out patches.

I have the feeling that the structure here could be one of these in a patch:

 1. Support systems with <2TB of memory
 2. Support a RMPOPT range per NUMA node
 3. Group NUMA nodes at socket boundaries and have them share a common
    RMPOPT config.

Right?

> +static __init void configure_and_enable_rmpopt(void)
> +{
> +	cpumask_var_t primary_threads_cpulist;
> +	int num_tb;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT)) {
> +		pr_debug("RMPOPT not supported on this platform\n");
> +		return;
> +	}
> +
> +	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) {
> +		pr_debug("RMPOPT optimizations not enabled as SNP support is not enabled\n");
> +		return;
> +	}
> +
> +	if (!(rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)) {
> +		pr_info("RMPOPT optimizations not enabled, segmented RMP required\n");
> +		return;
> +	}
> +
> +	if (!zalloc_cpumask_var(&primary_threads_cpulist, GFP_KERNEL))
> +		return;
> +
> +	num_tb = NUM_TB(min_low_pfn, max_pfn) + 1;
> +	pr_debug("NUM_TB pages in system %d\n", num_tb);

This looks wrong. Earlier, you program 0 as the base RMPOPT address into
the MSR. But this uses 'min_low_pfn'. Why not 0?

> +	/* Only one thread per core needs to set RMPOPT_BASE MSR as it is per-core */
> +	get_cpumask_of_primary_threads(primary_threads_cpulist);
> +
> +	/*
> +	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory for RMP optimizations.
> +	 *
> +	 * Fastpath RMPOPT configuration and setup:
> +	 * For systems with <= 2 TB of RAM, configure each per-core RMPOPT base to 0,
> +	 * ensuring all system RAM is RMP-optimized on all CPUs.
> +	 */
> +	if (num_tb <= RMPOPT_TABLE_MAX_LIMIT_IN_TB)
> +		configure_rmpopt_non_numa(primary_threads_cpulist);

this part:

> +	else
> +		configure_rmpopt_large_physmem(primary_threads_cpulist);

^^ needs to be broken out into a separate optimization patch.
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by K Prateek Nayak 1 month, 2 weeks ago
Hello Dave,

On 2/18/2026 3:36 AM, Dave Hansen wrote:
>> +/*
>> + * Build a cpumask of online primary threads, accounting for primary threads
>> + * that have been offlined while their secondary threads are still online.
>> + */
>> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
>> +{
>> +	cpumask_t cpus;
>> +	int cpu;
>> +
>> +	cpumask_copy(&cpus, cpu_online_mask);
>> +	for_each_cpu(cpu, &cpus) {
>> +		cpumask_set_cpu(cpu, cpulist);
>> +		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
>> +	}
>> +}
> 
> Don't we have a primary thread mask already? I thought we did.

If you are referring to cpu_primary_thread_mask(), the CPUs are set on it
based on the LSB of APICID, specifically:

    !(apicid & (__max_threads_per_core - 1))

It can so happen, the primary thread ((apicid & 1) == 0) of the core is
offline while the secondary thread ((apicid & 1) == 1) is online but the
traversal of (cpu_primary_thread_mask() & cpu_online_mask()) will simply
skip these cores.

Is there an equivalent mask that sets the first online CPU of each core?

-- 
Thanks and Regards,
Prateek
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Dave Hansen 1 month, 1 week ago
On 2/17/26 19:08, K Prateek Nayak wrote:
> Hello Dave,
> 
> On 2/18/2026 3:36 AM, Dave Hansen wrote:
>>> +/*
>>> + * Build a cpumask of online primary threads, accounting for primary threads
>>> + * that have been offlined while their secondary threads are still online.
>>> + */
>>> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
>>> +{
>>> +	cpumask_t cpus;
>>> +	int cpu;
>>> +
>>> +	cpumask_copy(&cpus, cpu_online_mask);
>>> +	for_each_cpu(cpu, &cpus) {
>>> +		cpumask_set_cpu(cpu, cpulist);
>>> +		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
>>> +	}
>>> +}
>>
>> Don't we have a primary thread mask already? I thought we did.
> 
> If you are referring to cpu_primary_thread_mask(), the CPUs are set on it
> based on the LSB of APICID, specifically:
> 
>     !(apicid & (__max_threads_per_core - 1))
> 
> It can so happen, the primary thread ((apicid & 1) == 0) of the core is
> offline while the secondary thread ((apicid & 1) == 1) is online but the
> traversal of (cpu_primary_thread_mask() & cpu_online_mask()) will simply
> skip these cores.
> 
> Is there an equivalent mask that sets the first online CPU of each core?

No I don't think we have that sitting around.

But, stepping back, why is this even necessary? Is it just saving a few
IPIs in the super rare case that someone has offlined the primary thread
but not a secondary one?

Why bother?
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Kalra, Ashish 1 month, 1 week ago

On 2/18/2026 8:59 AM, Dave Hansen wrote:
> On 2/17/26 19:08, K Prateek Nayak wrote:
>> Hello Dave,
>>
>> On 2/18/2026 3:36 AM, Dave Hansen wrote:
>>>> +/*
>>>> + * Build a cpumask of online primary threads, accounting for primary threads
>>>> + * that have been offlined while their secondary threads are still online.
>>>> + */
>>>> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
>>>> +{
>>>> +	cpumask_t cpus;
>>>> +	int cpu;
>>>> +
>>>> +	cpumask_copy(&cpus, cpu_online_mask);
>>>> +	for_each_cpu(cpu, &cpus) {
>>>> +		cpumask_set_cpu(cpu, cpulist);
>>>> +		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
>>>> +	}
>>>> +}
>>>
>>> Don't we have a primary thread mask already? I thought we did.
>>
>> If you are referring to cpu_primary_thread_mask(), the CPUs are set on it
>> based on the LSB of APICID, specifically:
>>
>>     !(apicid & (__max_threads_per_core - 1))
>>
>> It can so happen, the primary thread ((apicid & 1) == 0) of the core is
>> offline while the secondary thread ((apicid & 1) == 1) is online but the
>> traversal of (cpu_primary_thread_mask() & cpu_online_mask()) will simply
>> skip these cores.
>>
>> Is there an equivalent mask that sets the first online CPU of each core?
> 
> No I don't think we have that sitting around.
> 
> But, stepping back, why is this even necessary? Is it just saving a few
> IPIs in the super rare case that someone has offlined the primary thread
> but not a secondary one?
> 
> Why bother?

Because, setting RMPOPT_BASE MSR (which is a per-core MSR) and RMPOPT instruction
need to be issued on only one thread per core. If the primary thread is offlined
and secondary thread is not considered, we will miss/skip setting either the
RMPOPT_BASE MSR or not issuing the RMPOPT instruction for that physical CPU, which means
no RMP optimizations enabled for that physical CPU.

Thanks,
Ashish
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Dave Hansen 1 month, 1 week ago
On 2/18/26 08:55, Kalra, Ashish wrote:
> Because, setting RMPOPT_BASE MSR (which is a per-core MSR) and
> RMPOPT instruction need to be issued on only one thread per core. If
> the primary thread is offlined and secondary thread is not
> considered, we will miss/skip setting either the RMPOPT_BASE MSR or
> not issuing the RMPOPT instruction for that physical CPU, which
> means no RMP optimizations enabled for that physical CPU.
What is the harm of issuing it twice per core?
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Kalra, Ashish 1 month, 1 week ago
On 2/18/2026 11:01 AM, Dave Hansen wrote:
> On 2/18/26 08:55, Kalra, Ashish wrote:
>> Because, setting RMPOPT_BASE MSR (which is a per-core MSR) and
>> RMPOPT instruction need to be issued on only one thread per core. If
>> the primary thread is offlined and secondary thread is not
>> considered, we will miss/skip setting either the RMPOPT_BASE MSR or
>> not issuing the RMPOPT instruction for that physical CPU, which
>> means no RMP optimizations enabled for that physical CPU.
> What is the harm of issuing it twice per core?

Why issue it if we can avoid it?

It is not that complex to setup a cpumask containing the online primary
or secondary thread and then issue the RMPOPT instruction only once per
thread.

Thanks,
Ashish
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Dave Hansen 1 month, 1 week ago
On 2/18/26 09:07, Kalra, Ashish wrote:
> On 2/18/2026 11:01 AM, Dave Hansen wrote:
>> On 2/18/26 08:55, Kalra, Ashish wrote:
>>> Because, setting RMPOPT_BASE MSR (which is a per-core MSR) and
>>> RMPOPT instruction need to be issued on only one thread per core. If
>>> the primary thread is offlined and secondary thread is not
>>> considered, we will miss/skip setting either the RMPOPT_BASE MSR or
>>> not issuing the RMPOPT instruction for that physical CPU, which
>>> means no RMP optimizations enabled for that physical CPU.
>> What is the harm of issuing it twice per core?
> Why to issue it if we can avoid it.
> 
> It is not that complex to setup a cpumask containing the online primary
> or secondary thread and then issue the RMPOPT instruction only once per
> thread.

It's a non-zero amount of error-prone kernel code. It *is* complex. It
has to be reviewed and maintained.

Please remove this unnecessary optimization from the series. If you
would like to add it back, please do it in a patch at the end so it can
be evaluated on its own. Include performance numbers so the code
complexity can be balanced against the performance gain.
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Kalra, Ashish 1 month, 1 week ago
Hello Dave,

On 2/17/2026 4:06 PM, Dave Hansen wrote:
>> +#define RMPOPT_TABLE_MAX_LIMIT_IN_TB	2
>> +#define NUM_TB(pfn_min, pfn_max)	\
>> +	(((pfn_max) - (pfn_min)) / (1 << (40 - PAGE_SHIFT)))
> 
> IMNHO, you should just keep these in bytes. No reason to keep them in TB.
> 
>> +struct rmpopt_socket_config {
>> +	unsigned long start_pfn, end_pfn;
>> +	cpumask_var_t cpulist;
>> +	int *node_id;
>> +	int current_node_idx;
>> +};
> 
> This looks like optimization complexity before the groundwork is in
> place. Also, don't we *have* CPU lists for NUMA nodes? This seems rather
> redundant.
> 

Yes, we do have CPU lists for NUMA nodes, but we need a socket-specific
cpumask; let me explain more about that below.

>> +/*
>> + * Build a cpumask of online primary threads, accounting for primary threads
>> + * that have been offlined while their secondary threads are still online.
>> + */
>> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
>> +{
>> +	cpumask_t cpus;
>> +	int cpu;
>> +
>> +	cpumask_copy(&cpus, cpu_online_mask);
>> +	for_each_cpu(cpu, &cpus) {
>> +		cpumask_set_cpu(cpu, cpulist);
>> +		cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
>> +	}
>> +}
> 
> Don't we have a primary thread mask already? I thought we did.
> 

Already discussed this. 

>> +static void __configure_rmpopt(void *val)
>> +{
>> +	u64 rmpopt_base = ((u64)val & PUD_MASK) | MSR_AMD64_RMPOPT_ENABLE;
>> +
>> +	wrmsrq(MSR_AMD64_RMPOPT_BASE, rmpopt_base);
>> +}
> 
> I'd honestly just make the callers align the address..
> 
>> +static void configure_rmpopt_non_numa(cpumask_var_t primary_threads_cpulist)
>> +{
>> +	on_each_cpu_mask(primary_threads_cpulist, __configure_rmpopt, (void *)0, true);
>> +}
>> +
>> +static void free_rmpopt_socket_config(struct rmpopt_socket_config *socket)
>> +{
>> +	int i;
>> +
>> +	if (!socket)
>> +		return;
>> +
>> +	for (i = 0; i < topology_max_packages(); i++) {
>> +		free_cpumask_var(socket[i].cpulist);
>> +		kfree(socket[i].node_id);
>> +	}
>> +
>> +	kfree(socket);
>> +}
>> +DEFINE_FREE(free_rmpopt_socket_config, struct rmpopt_socket_config *, free_rmpopt_socket_config(_T))
> 
> Looking at all this, I really think you need a more organized series.
> 
> Make something that's _functional_ and works for all <2TB configs. Then,
> go add all this NUMA complexity in a follow-on patch or patches. There's
> too much going on here.

Sure. 

> 
>> +static void configure_rmpopt_large_physmem(cpumask_var_t primary_threads_cpulist)
>> +{
>> +	struct rmpopt_socket_config *socket __free(free_rmpopt_socket_config) = NULL;
>> +	int max_packages = topology_max_packages();
>> +	struct rmpopt_socket_config *sc;
>> +	int cpu, i;
>> +
>> +	socket = kcalloc(max_packages, sizeof(struct rmpopt_socket_config), GFP_KERNEL);
>> +	if (!socket)
>> +		return;
>> +
>> +	for (i = 0; i < max_packages; i++) {
>> +		sc = &socket[i];
>> +		if (!zalloc_cpumask_var(&sc->cpulist, GFP_KERNEL))
>> +			return;
>> +		sc->node_id = kcalloc(nr_node_ids, sizeof(int), GFP_KERNEL);
>> +		if (!sc->node_id)
>> +			return;
>> +		sc->current_node_idx = -1;
>> +	}
>> +
>> +	/*
>> +	 * Handle case of virtualized NUMA software domains, such as AMD Nodes Per Socket(NPS)
>> +	 * configurations. The kernel does not have an abstraction for physical sockets,
>> +	 * therefore, enumerate the physical sockets and Nodes Per Socket(NPS) information by
>> +	 * walking the online CPU list.
>> +	 */
> 
> By this point, I've forgotten why sockets are important here.
> 
> Why are they important?
Because the Nodes per Socket (NPS) configuration is enabled by default, we have to
look at sockets instead of simply NUMA nodes, collect/aggregate all the node data per socket,
and then set up the RMPOPT tables accordingly, so that the 2TB limit of the RMPOPT tables is
covered appropriately and we map the maximum possible memory in the RMPOPT tables per socket
rather than per node.

And as there is no per-Socket information available in kernel, we walk through the online
CPU list and collect all this per-Socket information (including socket's start, end addresses,
NUMA nodes in the socket, cpumask of the socket, etc.)

>  
>> +	for_each_cpu(cpu, primary_threads_cpulist) {
>> +		int socket_id, nid;
>> +
>> +		socket_id = topology_logical_package_id(cpu);
>> +		nid = cpu_to_node(cpu);
>> +		sc = &socket[socket_id];
>> +
>> +		/*
>> +		 * For each socket, determine the corresponding nodes and the socket's start
>> +		 * and end PFNs.
>> +		 * Record the node and the start and end PFNs of the first node found on the
>> +		 * socket, then record each subsequent node and update the end PFN for that
>> +		 * socket as additional nodes are found.
>> +		 */
>> +		if (sc->current_node_idx == -1) {
>> +			sc->current_node_idx = 0;
>> +			sc->node_id[sc->current_node_idx] = nid;
>> +			sc->start_pfn = node_start_pfn(nid);
>> +			sc->end_pfn = node_end_pfn(nid);
>> +		} else if (sc->node_id[sc->current_node_idx] != nid) {
>> +			sc->current_node_idx++;
>> +			sc->node_id[sc->current_node_idx] = nid;
>> +			sc->end_pfn = node_end_pfn(nid);
>> +		}
>> +
>> +		cpumask_set_cpu(cpu, sc->cpulist);
>> +	}
>> +
>> +	/*
>> +	 * If the "physical" socket has up to 2TB of memory, the per-CPU RMPOPT tables are
>> +	 * configured to the starting physical address of the socket, otherwise the tables
>> +	 * are configured per-node.
>> +	 */
>> +	for (i = 0; i < max_packages; i++) {
>> +		int num_tb_socket;
>> +		phys_addr_t pa;
>> +		int j;
>> +
>> +		sc = &socket[i];
>> +		num_tb_socket = NUM_TB(sc->start_pfn, sc->end_pfn) + 1;
>> +
>> +		pr_debug("socket start_pfn 0x%lx, end_pfn 0x%lx, socket cpu mask %*pbl\n",
>> +			 sc->start_pfn, sc->end_pfn, cpumask_pr_args(sc->cpulist));
>> +
>> +		if (num_tb_socket <= RMPOPT_TABLE_MAX_LIMIT_IN_TB) {
>> +			pa = PFN_PHYS(sc->start_pfn);
>> +			on_each_cpu_mask(sc->cpulist, __configure_rmpopt, (void *)pa, true);
>> +			continue;
>> +		}
>> +
>> +		for (j = 0; j <= sc->current_node_idx; j++) {
>> +			int nid = sc->node_id[j];
>> +			struct cpumask node_mask;
>> +
>> +			cpumask_and(&node_mask, cpumask_of_node(nid), sc->cpulist);
>> +			pa = PFN_PHYS(node_start_pfn(nid));
>> +
>> +			pr_debug("RMPOPT_BASE MSR on nodeid %d cpu mask %*pbl set to 0x%llx\n",
>> +				 nid, cpumask_pr_args(&node_mask), pa);
>> +			on_each_cpu_mask(&node_mask, __configure_rmpopt, (void *)pa, true);
>> +		}
>> +	}
>> +}
> 
> Ahh, so you're not optimizing by NUMA itself: you're assuming that there
> are groups of NUMA nodes in a socket and then optimizing for those groups.
>
Yes, the Venice platform has the NPS2 configuration enabled by default,
so we have 'X' nodes per socket and we have to consider this NPSx configuration
and optimize for those groups.

> It would have been nice to say that. It would make great material for
> the changelog for your broken out patches.

Ok. 

> 
> I have the feeling that the structure here could be one of these in a patch:
> 
>  1. Support systems with <2TB of memory
>  2. Support a RMPOPT range per NUMA node
>  3. Group NUMA nodes at socket boundaries and have them share a common
>     RMPOPT config.
> 
> Right?

Yes, sure.

> 
>> +static __init void configure_and_enable_rmpopt(void)
>> +{
>> +	cpumask_var_t primary_threads_cpulist;
>> +	int num_tb;
>> +
>> +	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT)) {
>> +		pr_debug("RMPOPT not supported on this platform\n");
>> +		return;
>> +	}
>> +
>> +	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) {
>> +		pr_debug("RMPOPT optimizations not enabled as SNP support is not enabled\n");
>> +		return;
>> +	}
>> +
>> +	if (!(rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)) {
>> +		pr_info("RMPOPT optimizations not enabled, segmented RMP required\n");
>> +		return;
>> +	}
>> +
>> +	if (!zalloc_cpumask_var(&primary_threads_cpulist, GFP_KERNEL))
>> +		return;
>> +
>> +	num_tb = NUM_TB(min_low_pfn, max_pfn) + 1;
>> +	pr_debug("NUM_TB pages in system %d\n", num_tb);
> 
> This looks wrong. Earlier, you program 0 as the base RMPOPT address into
> the MSR. But this uses 'min_low_pfn'. Why not 0?

You are right, we should have used min_low_pfn earlier to program the 
base RMPOPT address into the MSR. 

> 
>> +	/* Only one thread per core needs to set RMPOPT_BASE MSR as it is per-core */
>> +	get_cpumask_of_primary_threads(primary_threads_cpulist);
>> +
>> +	/*
>> +	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory for RMP optimizations.
>> +	 *
>> +	 * Fastpath RMPOPT configuration and setup:
>> +	 * For systems with <= 2 TB of RAM, configure each per-core RMPOPT base to 0,
>> +	 * ensuring all system RAM is RMP-optimized on all CPUs.
>> +	 */
>> +	if (num_tb <= RMPOPT_TABLE_MAX_LIMIT_IN_TB)
>> +		configure_rmpopt_non_numa(primary_threads_cpulist);
> 
> this part:
> 
>> +	else
>> +		configure_rmpopt_large_physmem(primary_threads_cpulist);
> 
> ^^ needs to be broken out into a separate optimization patch.
> 

Ok.

Thanks,
Ashish
Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
Posted by Dave Hansen 1 month, 1 week ago
On 2/18/26 14:17, Kalra, Ashish wrote:
> Yes, by default Venice platform has the NPS2 configuration enabled by default,
> so we have 'X' nodes per socket and we have to consider this NPSx configuration
> and optimize for those groups. 

Why, though?

You keep saying: "We have NPS so we must configure sockets". But not *why*.

I suspect this is another premature optimization. Nodes are a bit too
small so if you configure via nodes, the later nodes will have RMPOPT
tables that cover empty address space off the end of system memory.

Honestly, I think this is all just done wrong. It doesn't need to even
consider sockets. Sockets might even be the wrong thing to look at.

Basically, RMPOPT gives you a 2TB window of potentially "fast" memory.
The rest of memory is "slow". If you're lucky, the memory that's fast
because of RMPOPT is also in a low-distance NUMA node.

Sockets are a good thing to use, for sure. But they're not even optimal!
Just imagine what's going to happen if you have more than 2TB in a
socket. You just turn off the per-socket optimization. If that happens,
the last node in the socket will end up with an RMPOPT table that has
itself at the beginning, but probably a nonzero amount of off-socket memory.

I'd probably just do something like this:

Given a NUMA node, go through each 1GB of memory in the system and see
what the average NUMA distance of that 2TB window of memory is. Find the
2TB window with the lowest average distance. That'll give you a more or
less optimal RMPOPT window. It'll work with NPS or regular NUMA or
whatever bonkers future fancy thing shows up.

But that's all optimization territory. Please squirrel that away to go
look at in 6 months once you get the rest of this merged.