[v1] xen/x86: Change stub page allocation/free

[PATCH for-4.22] xen/x86: Change stub page allocation/free

Posted by Roger Pau Monne 1 day, 22 hours ago

From: Jason Andryuk <jason.andryuk@amd.com>

Today the inline tracking of the stub page is problematic.  0xcc is used to
indicate unused, but it is also a "clear value."  A !CONFIG_PV build with
smt=0 will bring up CPU0, bring up CPU1, bring down CPU1, and free the
in-use stub page.  CPU0 or subsequent onlined CPUs can write to the re-used
page.

The new approach uses a global, CPU-indexed dynamically allocated array of
stub addresses.  However, to handle NUMA aware allocations, we cannot
allocate all the memory in advance because of the NUMA dependency.  Take
advantage of the fact that Xen will attempt to contiguously pack CPUs on
the same NUMA node (see normalise_cpu_order()), and on CPU bringup use the
same stubs page the previous CPU did if suitable.  Note the code would
still function properly even if CPUs from NUMA nodes are not contiguously
packed, it just consumes more memory.

stub pages are no longer freed.  They remain referenced in the global
CPU-indexed array and are re-used if the CPU is re-onlined.

stubs and node_stubs don't have an explicit lock.  During boot they are
accessed single threaded.  During runtime, &cpu_add_remove_lock serializes
access.

Fixes: 7a66ac8d1633 ("x86: move syscall trampolines off the stack")
Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
There are other even more simple options here: for example Andrew proposed
to pack stubs contiguously in both the physical and the linear address
spaces, at the cost of possibly loosing the NUMA memory affinity between
the allocated page and the CPU using it.  We have decided to go for a more
conservative approach here, that keeps the same properties as the current
logic regarding NUMA memory affinity of the stub region.
---
 xen/arch/x86/include/asm/stubs.h |  2 +-
 xen/arch/x86/setup.c             |  4 +-
 xen/arch/x86/smpboot.c           | 91 +++++++++++++++++---------------
 3 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/xen/arch/x86/include/asm/stubs.h b/xen/arch/x86/include/asm/stubs.h
index a520928e9a50..7d8d302e0623 100644
--- a/xen/arch/x86/include/asm/stubs.h
+++ b/xen/arch/x86/include/asm/stubs.h
@@ -32,6 +32,6 @@ struct stubs {
 };
 
 DECLARE_PER_CPU(struct stubs, stubs);
-unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+void init_bsp_stub(void);
 
 #endif /* X86_ASM_STUBS_H */
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 4192edf635b6..cddf8806c877 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -2089,9 +2089,7 @@ void asmlinkage __init noreturn __start_xen(void)
 
     init_idle_domain();
 
-    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
-                                           &this_cpu(stubs).mfn);
-    BUG_ON(!this_cpu(stubs.addr));
+    init_bsp_stub();
 
     bsp_traps_reinit(); /* Needs stubs allocated, must be before presmp_initcalls. */
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index d8fd71ffab37..3282392317f4 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -20,6 +20,7 @@
 #include <xen/serial.h>
 #include <xen/softirq.h>
 #include <xen/tasklet.h>
+#include <xen/xvmalloc.h>
 
 #include <asm/apic.h>
 #include <asm/cpuidle.h>
@@ -641,41 +642,61 @@ static int do_boot_cpu(int apicid, int cpu)
     return rc;
 }
 
-#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
+/* Dynamically allocated, indexed by CPU.  Store physical address of stubs. */
+static paddr_t *__ro_after_init stubs;
 
-unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+static bool assign_stub_page(unsigned int cpu)
 {
     unsigned long stub_va;
-    struct page_info *pg;
+    paddr_t addr = stubs[cpu];
 
-    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
-
-    if ( *mfn )
-        pg = mfn_to_page(_mfn(*mfn));
-    else
+    if ( addr == INVALID_PADDR )
     {
-        nodeid_t node = cpu_to_node(cpu);
-        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+        nodeid_t nid = cpu_to_node(cpu);
 
-        pg = alloc_domheap_page(NULL, memflags);
-        if ( !pg )
-            return 0;
+        /*
+         * Attempt to use the same page as the previous CPU if possible,
+         * otherwise allocate a new one.
+         */
+        if ( cpu && nid == cpu_to_node(cpu - 1) &&
+             PAGE_OFFSET(stubs[cpu - 1] + STUB_BUF_SIZE) )
+            addr = stubs[cpu - 1] + STUB_BUF_SIZE;
+        else
+        {
+            struct page_info *pg = alloc_domheap_page(NULL, MEMF_node(nid));
 
-        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+            if ( !pg )
+                return false;
+            addr = page_to_maddr(pg);
+        }
+        stubs[cpu] = addr;
     }
 
     stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
-    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+    if ( map_pages_to_xen(stub_va, maddr_to_mfn(addr), 1,
                           PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
-    {
-        if ( !*mfn )
-            free_domheap_page(pg);
-        stub_va = 0;
-    }
-    else if ( !*mfn )
-        *mfn = mfn_x(page_to_mfn(pg));
+        return false;
 
-    return stub_va;
+    per_cpu(stubs.mfn, cpu) = PFN_DOWN(addr);
+    per_cpu(stubs.addr, cpu) = stub_va + PAGE_OFFSET(addr);
+    return true;
+}
+
+void __init init_bsp_stub(void)
+{
+    const unsigned int num_cpus = num_present_cpus();
+    unsigned int i;
+
+    ASSERT(!stubs);
+    stubs = xvmalloc_array(typeof(*stubs), num_cpus);
+    if ( !stubs )
+        panic("Unable to allocate stub array");
+
+    for ( i = 0; i < num_cpus; i++ )
+        stubs[i] = INVALID_PADDR;
+
+    if ( !assign_stub_page(0) )
+        panic("Unable to initialize BSP stub region");
 }
 
 void cpu_exit_clear(unsigned int cpu)
@@ -990,19 +1011,12 @@ static void cpu_smpboot_free(unsigned int cpu, bool remove)
     {
         mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
         unsigned char *stub_page = map_domain_page(mfn);
-        unsigned int i;
 
-        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
-        for ( i = 0; i < STUBS_PER_PAGE; ++i )
-            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
-                break;
+        memset(stub_page + PAGE_OFFSET(stubs[cpu]), 0xcc, STUB_BUF_SIZE);
         unmap_domain_page(stub_page);
         destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
                              (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
         per_cpu(stubs.addr, cpu) = 0;
-        per_cpu(stubs.mfn, cpu) = 0;
-        if ( i == STUBS_PER_PAGE )
-            free_domheap_page(mfn_to_page(mfn));
     }
 
     if ( IS_ENABLED(CONFIG_PV32) )
@@ -1041,10 +1055,9 @@ void *cpu_alloc_stack(unsigned int cpu)
 static int cpu_smpboot_alloc(unsigned int cpu)
 {
     struct cpu_info *info;
-    unsigned int i, memflags = 0;
+    unsigned int memflags = 0;
     nodeid_t node = cpu_to_node(cpu);
     seg_desc_t *gdt;
-    unsigned long stub_page;
     int rc = -ENOMEM;
 
     if ( node != NUMA_NO_NODE )
@@ -1092,18 +1105,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     memcpy(per_cpu(idt, cpu), bsp_idt, sizeof(bsp_idt));
     disable_each_ist(per_cpu(idt, cpu));
 
-    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
-          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
-        if ( cpu_online(i) && cpu_to_node(i) == node )
-        {
-            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
-            break;
-        }
-    BUG_ON(i == cpu);
-    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
-    if ( !stub_page )
+    if ( !assign_stub_page(cpu) )
         goto out;
-    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
 
     rc = setup_cpu_root_pgt(cpu);
     if ( rc )
-- 
2.53.0

Re: [PATCH for-4.22] xen/x86: Change stub page allocation/free

Posted by Andrew Cooper 1 day, 18 hours ago

On 11/06/2026 8:53 am, Roger Pau Monne wrote:
> From: Jason Andryuk <jason.andryuk@amd.com>
>
> Today the inline tracking of the stub page is problematic.  0xcc is used to
> indicate unused, but it is also a "clear value."  A !CONFIG_PV build with
> smt=0 will bring up CPU0, bring up CPU1, bring down CPU1, and free the
> in-use stub page.  CPU0 or subsequent onlined CPUs can write to the re-used
> page.

I'm pretty sure a CONFIG_PV build booted on a FRED enabled system will
do the same.

This is the other case where we (now) forgo writing out the LSTAR/CSTAR
stubs.

>
> The new approach uses a global, CPU-indexed dynamically allocated array of
> stub addresses.  However, to handle NUMA aware allocations, we cannot
> allocate all the memory in advance because of the NUMA dependency.  Take
> advantage of the fact that Xen will attempt to contiguously pack CPUs on
> the same NUMA node (see normalise_cpu_order()), and on CPU bringup use the
> same stubs page the previous CPU did if suitable.  Note the code would
> still function properly even if CPUs from NUMA nodes are not contiguously
> packed, it just consumes more memory.
>
> stub pages are no longer freed.  They remain referenced in the global
> CPU-indexed array and are re-used if the CPU is re-onlined.
>
> stubs and node_stubs don't have an explicit lock.  During boot they are
> accessed single threaded.  During runtime, &cpu_add_remove_lock serializes
> access.

Is node_stubs stale?  Stub(s) should be capitalised at the start of the
sentence.  In context, it's not clear that it's a variable name, and it
doesn't need to be the literal variable name to convey the intended meaning.

>
> Fixes: 7a66ac8d1633 ("x86: move syscall trampolines off the stack")
> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> There are other even more simple options here: for example Andrew proposed
> to pack stubs contiguously in both the physical and the linear address
> spaces, at the cost of possibly loosing the NUMA memory affinity between
> the allocated page and the CPU using it.  We have decided to go for a more
> conservative approach here, that keeps the same properties as the current
> logic regarding NUMA memory affinity of the stub region.

Part of the suggestion was made in error, but there's a second aspect
which I'll discuss at the end of the email.

> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 4192edf635b6..cddf8806c877 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -2089,9 +2089,7 @@ void asmlinkage __init noreturn __start_xen(void)
>  
>      init_idle_domain();
>  
> -    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
> -                                           &this_cpu(stubs).mfn);
> -    BUG_ON(!this_cpu(stubs.addr));
> +    init_bsp_stub();

Personally, I'd name this init_stubs().  It does work for more than just
the BSP, and the bsp_* part is only really needed for clarity when
there's a matching ap_* variant, which is not the case here.

>  
>      bsp_traps_reinit(); /* Needs stubs allocated, must be before presmp_initcalls. */
>  
> diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
> index d8fd71ffab37..3282392317f4 100644
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -641,41 +642,61 @@ static int do_boot_cpu(int apicid, int cpu)
>      return rc;
>  }
>  
> -#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
> +/* Dynamically allocated, indexed by CPU.  Store physical address of stubs. */
> +static paddr_t *__ro_after_init stubs;
>  
> -unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
> +static bool assign_stub_page(unsigned int cpu)
>  {
>      unsigned long stub_va;
> -    struct page_info *pg;
> +    paddr_t addr = stubs[cpu];
>  
> -    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
> -
> -    if ( *mfn )
> -        pg = mfn_to_page(_mfn(*mfn));
> -    else
> +    if ( addr == INVALID_PADDR )
>      {
> -        nodeid_t node = cpu_to_node(cpu);
> -        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;

I think you need to retain this calculation of memflags. 
MEMF_node(NUMA_NO_NODE) doesn't work as expected.

> +        nodeid_t nid = cpu_to_node(cpu);
>  
> -        pg = alloc_domheap_page(NULL, memflags);
> -        if ( !pg )
> -            return 0;
> +        /*
> +         * Attempt to use the same page as the previous CPU if possible,
> +         * otherwise allocate a new one.
> +         */
> +        if ( cpu && nid == cpu_to_node(cpu - 1) &&
> +             PAGE_OFFSET(stubs[cpu - 1] + STUB_BUF_SIZE) )
> +            addr = stubs[cpu - 1] + STUB_BUF_SIZE;
> +        else
> +        {
> +            struct page_info *pg = alloc_domheap_page(NULL, MEMF_node(nid));
>  
> -        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));

You've dropped this memset() of the whole page to 0xcc.

As a consequence, the stubs for not-yet-onlined CPUs, or for gaps
because of NUMA, are rubble yet mapped executably.

> +            if ( !pg )
> +                return false;
> +            addr = page_to_maddr(pg);
> +        }
> +        stubs[cpu] = addr;
>      }
>  
>      stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
> -    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
> +    if ( map_pages_to_xen(stub_va, maddr_to_mfn(addr), 1,
>                            PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
> -    {
> -        if ( !*mfn )
> -            free_domheap_page(pg);
> -        stub_va = 0;
> -    }
> -    else if ( !*mfn )
> -        *mfn = mfn_x(page_to_mfn(pg));
> +        return false;
>  
> -    return stub_va;
> +    per_cpu(stubs.mfn, cpu) = PFN_DOWN(addr);
> +    per_cpu(stubs.addr, cpu) = stub_va + PAGE_OFFSET(addr);
> +    return true;
> +}
> +
> +void __init init_bsp_stub(void)
> +{
> +    const unsigned int num_cpus = num_present_cpus();
> +    unsigned int i;
> +
> +    ASSERT(!stubs);
> +    stubs = xvmalloc_array(typeof(*stubs), num_cpus);
> +    if ( !stubs )
> +        panic("Unable to allocate stub array");
> +
> +    for ( i = 0; i < num_cpus; i++ )
> +        stubs[i] = INVALID_PADDR;
> +
> +    if ( !assign_stub_page(0) )
> +        panic("Unable to initialize BSP stub region");

\n's for both panic messages.

With the above stuff addressed, I think this is looking ok, but
definitely subject to Jason confirming it resolves his issue.  And for
4.22, that might even be sufficient to go in.

The other thing I want to discuss is this:

>      stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;

because it creates 32 virtual aliases of every stub.

AIUI, this was a hard requirement for the old freeing scheme, but the
optimisation guides recommend against creating aliases like this. 
Besides microarchitectural tracking/safety effects, one consequence is
that we end up with 31 aliases which have unsafe branches in them;
disp32's depend on the linear address the code is executed at.

The ideal solution would be allocate VAs just like we allocate paddrs,
and for the map_pages_to_xen() be beside the alloc_domheap_page(),
rather than outside of INVALID_PADDR check.

This reduces the amount of VA space (and L1 pagetables) used by 32
times, and removes the risk of accidentally using the wrong alias.

~Andrew

Re: [PATCH for-4.22] xen/x86: Change stub page allocation/free

Posted by Roger Pau Monné 1 day, 15 hours ago

On Thu, Jun 11, 2026 at 12:38:46PM +0100, Andrew Cooper wrote:
> On 11/06/2026 8:53 am, Roger Pau Monne wrote:
> > From: Jason Andryuk <jason.andryuk@amd.com>
> >
> > Today the inline tracking of the stub page is problematic.  0xcc is used to
> > indicate unused, but it is also a "clear value."  A !CONFIG_PV build with
> > smt=0 will bring up CPU0, bring up CPU1, bring down CPU1, and free the
> > in-use stub page.  CPU0 or subsequent onlined CPUs can write to the re-used
> > page.
> 
> I'm pretty sure a CONFIG_PV build booted on a FRED enabled system will
> do the same.
> 
> This is the other case where we (now) forgo writing out the LSTAR/CSTAR
> stubs.
> 
> >
> > The new approach uses a global, CPU-indexed dynamically allocated array of
> > stub addresses.  However, to handle NUMA aware allocations, we cannot
> > allocate all the memory in advance because of the NUMA dependency.  Take
> > advantage of the fact that Xen will attempt to contiguously pack CPUs on
> > the same NUMA node (see normalise_cpu_order()), and on CPU bringup use the
> > same stubs page the previous CPU did if suitable.  Note the code would
> > still function properly even if CPUs from NUMA nodes are not contiguously
> > packed, it just consumes more memory.
> >
> > stub pages are no longer freed.  They remain referenced in the global
> > CPU-indexed array and are re-used if the CPU is re-onlined.
> >
> > stubs and node_stubs don't have an explicit lock.  During boot they are
> > accessed single threaded.  During runtime, &cpu_add_remove_lock serializes
> > access.
> 
> Is node_stubs stale?  Stub(s) should be capitalised at the start of the
> sentence.  In context, it's not clear that it's a variable name, and it
> doesn't need to be the literal variable name to convey the intended meaning.

Oh, yes, that's a leftover from the previous version that I picked up
from Jason.  What about using "The stubs array don't have..." to make
it clear it's referring to the variable.

> >
> > Fixes: 7a66ac8d1633 ("x86: move syscall trampolines off the stack")
> > Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > There are other even more simple options here: for example Andrew proposed
> > to pack stubs contiguously in both the physical and the linear address
> > spaces, at the cost of possibly loosing the NUMA memory affinity between
> > the allocated page and the CPU using it.  We have decided to go for a more
> > conservative approach here, that keeps the same properties as the current
> > logic regarding NUMA memory affinity of the stub region.
> 
> Part of the suggestion was made in error, but there's a second aspect
> which I'll discuss at the end of the email.
> 
> > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> > index 4192edf635b6..cddf8806c877 100644
> > --- a/xen/arch/x86/setup.c
> > +++ b/xen/arch/x86/setup.c
> > @@ -2089,9 +2089,7 @@ void asmlinkage __init noreturn __start_xen(void)
> >  
> >      init_idle_domain();
> >  
> > -    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
> > -                                           &this_cpu(stubs).mfn);
> > -    BUG_ON(!this_cpu(stubs.addr));
> > +    init_bsp_stub();
> 
> Personally, I'd name this init_stubs().  It does work for more than just
> the BSP, and the bsp_* part is only really needed for clarity when
> there's a matching ap_* variant, which is not the case here.

Np, I will change it.

> >  
> >      bsp_traps_reinit(); /* Needs stubs allocated, must be before presmp_initcalls. */
> >  
> > diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
> > index d8fd71ffab37..3282392317f4 100644
> > --- a/xen/arch/x86/smpboot.c
> > +++ b/xen/arch/x86/smpboot.c
> > @@ -641,41 +642,61 @@ static int do_boot_cpu(int apicid, int cpu)
> >      return rc;
> >  }
> >  
> > -#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
> > +/* Dynamically allocated, indexed by CPU.  Store physical address of stubs. */
> > +static paddr_t *__ro_after_init stubs;
> >  
> > -unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
> > +static bool assign_stub_page(unsigned int cpu)
> >  {
> >      unsigned long stub_va;
> > -    struct page_info *pg;
> > +    paddr_t addr = stubs[cpu];
> >  
> > -    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
> > -
> > -    if ( *mfn )
> > -        pg = mfn_to_page(_mfn(*mfn));
> > -    else
> > +    if ( addr == INVALID_PADDR )
> >      {
> > -        nodeid_t node = cpu_to_node(cpu);
> > -        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
> 
> I think you need to retain this calculation of memflags. 
> MEMF_node(NUMA_NO_NODE) doesn't work as expected.

I think I'm confused, MEMF_node() macro is:

((((n) + 1) & MEMF_node_mask) << _MEMF_node)

So when n == 0xff the memflags result is 0 (0x100 & 0xff).  And then doing
MEMF_get_node(0) gives you NUMA_NO_NODE, so works as expected?

> > +        nodeid_t nid = cpu_to_node(cpu);
> >  
> > -        pg = alloc_domheap_page(NULL, memflags);
> > -        if ( !pg )
> > -            return 0;
> > +        /*
> > +         * Attempt to use the same page as the previous CPU if possible,
> > +         * otherwise allocate a new one.
> > +         */
> > +        if ( cpu && nid == cpu_to_node(cpu - 1) &&
> > +             PAGE_OFFSET(stubs[cpu - 1] + STUB_BUF_SIZE) )
> > +            addr = stubs[cpu - 1] + STUB_BUF_SIZE;
> > +        else
> > +        {
> > +            struct page_info *pg = alloc_domheap_page(NULL, MEMF_node(nid));
> >  
> > -        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
> 
> You've dropped this memset() of the whole page to 0xcc.
> 
> As a consequence, the stubs for not-yet-onlined CPUs, or for gaps
> because of NUMA, are rubble yet mapped executably.

My bad, sorry, I will re-add it.

> > +            if ( !pg )
> > +                return false;
> > +            addr = page_to_maddr(pg);
> > +        }
> > +        stubs[cpu] = addr;
> >      }
> >  
> >      stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
> > -    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
> > +    if ( map_pages_to_xen(stub_va, maddr_to_mfn(addr), 1,
> >                            PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
> > -    {
> > -        if ( !*mfn )
> > -            free_domheap_page(pg);
> > -        stub_va = 0;
> > -    }
> > -    else if ( !*mfn )
> > -        *mfn = mfn_x(page_to_mfn(pg));
> > +        return false;
> >  
> > -    return stub_va;
> > +    per_cpu(stubs.mfn, cpu) = PFN_DOWN(addr);
> > +    per_cpu(stubs.addr, cpu) = stub_va + PAGE_OFFSET(addr);
> > +    return true;
> > +}
> > +
> > +void __init init_bsp_stub(void)
> > +{
> > +    const unsigned int num_cpus = num_present_cpus();
> > +    unsigned int i;
> > +
> > +    ASSERT(!stubs);
> > +    stubs = xvmalloc_array(typeof(*stubs), num_cpus);
> > +    if ( !stubs )
> > +        panic("Unable to allocate stub array");
> > +
> > +    for ( i = 0; i < num_cpus; i++ )
> > +        stubs[i] = INVALID_PADDR;
> > +
> > +    if ( !assign_stub_page(0) )
> > +        panic("Unable to initialize BSP stub region");
> 
> \n's for both panic messages.
> 
> With the above stuff addressed, I think this is looking ok, but
> definitely subject to Jason confirming it resolves his issue.  And for
> 4.22, that might even be sufficient to go in.

I will go with the adjusted version of this patch for 4.22, if it
fixes Jason's issue.

> The other thing I want to discuss is this:
> 
> >      stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
> 
> because it creates 32 virtual aliases of every stub.
> 
> AIUI, this was a hard requirement for the old freeing scheme, but the
> optimisation guides recommend against creating aliases like this. 
> Besides microarchitectural tracking/safety effects, one consequence is
> that we end up with 31 aliases which have unsafe branches in them;
> disp32's depend on the linear address the code is executed at.
> 
> The ideal solution would be allocate VAs just like we allocate paddrs,
> and for the map_pages_to_xen() be beside the alloc_domheap_page(),
> rather than outside of INVALID_PADDR check.
> 
> This reduces the amount of VA space (and L1 pagetables) used by 32
> times, and removes the risk of accidentally using the wrong alias.

I think the point of having a different page for each CPU is that we
can remove the page-table entry when the CPU goes offline, and hence
any mismatched attempts to use that stub region by a different CPU
would result in a page-fault.  In practice I'm not sure how useful
that is, at the end we already clobber the stubs region with int3's.

I don't mind looking into that as a followup change, but for 4.22 I
would rather take this less risky version.  The mapping adjustments
will be a different patch anyway, I don't think it should be squashed
with the content here.

Thanks, Roger.

Re: [PATCH for-4.22] xen/x86: Change stub page allocation/free

Posted by Oleksii Kurochko 1 day, 16 hours ago


On 6/11/26 1:38 PM, Andrew Cooper wrote:
> On 11/06/2026 8:53 am, Roger Pau Monne wrote:
>> From: Jason Andryuk <jason.andryuk@amd.com>
>>
>> Today the inline tracking of the stub page is problematic.  0xcc is used to
>> indicate unused, but it is also a "clear value."  A !CONFIG_PV build with
>> smt=0 will bring up CPU0, bring up CPU1, bring down CPU1, and free the
>> in-use stub page.  CPU0 or subsequent onlined CPUs can write to the re-used
>> page.
> 
> I'm pretty sure a CONFIG_PV build booted on a FRED enabled system will
> do the same.
> 
> This is the other case where we (now) forgo writing out the LSTAR/CSTAR
> stubs.
> 
>>
>> The new approach uses a global, CPU-indexed dynamically allocated array of
>> stub addresses.  However, to handle NUMA aware allocations, we cannot
>> allocate all the memory in advance because of the NUMA dependency.  Take
>> advantage of the fact that Xen will attempt to contiguously pack CPUs on
>> the same NUMA node (see normalise_cpu_order()), and on CPU bringup use the
>> same stubs page the previous CPU did if suitable.  Note the code would
>> still function properly even if CPUs from NUMA nodes are not contiguously
>> packed, it just consumes more memory.
>>
>> stub pages are no longer freed.  They remain referenced in the global
>> CPU-indexed array and are re-used if the CPU is re-onlined.
>>
>> stubs and node_stubs don't have an explicit lock.  During boot they are
>> accessed single threaded.  During runtime, &cpu_add_remove_lock serializes
>> access.
> 
> Is node_stubs stale?  Stub(s) should be capitalised at the start of the
> sentence.  In context, it's not clear that it's a variable name, and it
> doesn't need to be the literal variable name to convey the intended meaning.
> 
>>
>> Fixes: 7a66ac8d1633 ("x86: move syscall trampolines off the stack")
>> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> ---
>> There are other even more simple options here: for example Andrew proposed
>> to pack stubs contiguously in both the physical and the linear address
>> spaces, at the cost of possibly loosing the NUMA memory affinity between
>> the allocated page and the CPU using it.  We have decided to go for a more
>> conservative approach here, that keeps the same properties as the current
>> logic regarding NUMA memory affinity of the stub region.
> 
> Part of the suggestion was made in error, but there's a second aspect
> which I'll discuss at the end of the email.
> 
>> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
>> index 4192edf635b6..cddf8806c877 100644
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -2089,9 +2089,7 @@ void asmlinkage __init noreturn __start_xen(void)
>>   
>>       init_idle_domain();
>>   
>> -    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
>> -                                           &this_cpu(stubs).mfn);
>> -    BUG_ON(!this_cpu(stubs.addr));
>> +    init_bsp_stub();
> 
> Personally, I'd name this init_stubs().  It does work for more than just
> the BSP, and the bsp_* part is only really needed for clarity when
> there's a matching ap_* variant, which is not the case here.
> 
>>   
>>       bsp_traps_reinit(); /* Needs stubs allocated, must be before presmp_initcalls. */
>>   
>> diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
>> index d8fd71ffab37..3282392317f4 100644
>> --- a/xen/arch/x86/smpboot.c
>> +++ b/xen/arch/x86/smpboot.c
>> @@ -641,41 +642,61 @@ static int do_boot_cpu(int apicid, int cpu)
>>       return rc;
>>   }
>>   
>> -#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
>> +/* Dynamically allocated, indexed by CPU.  Store physical address of stubs. */
>> +static paddr_t *__ro_after_init stubs;
>>   
>> -unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
>> +static bool assign_stub_page(unsigned int cpu)
>>   {
>>       unsigned long stub_va;
>> -    struct page_info *pg;
>> +    paddr_t addr = stubs[cpu];
>>   
>> -    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
>> -
>> -    if ( *mfn )
>> -        pg = mfn_to_page(_mfn(*mfn));
>> -    else
>> +    if ( addr == INVALID_PADDR )
>>       {
>> -        nodeid_t node = cpu_to_node(cpu);
>> -        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
> 
> I think you need to retain this calculation of memflags.
> MEMF_node(NUMA_NO_NODE) doesn't work as expected.
> 
>> +        nodeid_t nid = cpu_to_node(cpu);
>>   
>> -        pg = alloc_domheap_page(NULL, memflags);
>> -        if ( !pg )
>> -            return 0;
>> +        /*
>> +         * Attempt to use the same page as the previous CPU if possible,
>> +         * otherwise allocate a new one.
>> +         */
>> +        if ( cpu && nid == cpu_to_node(cpu - 1) &&
>> +             PAGE_OFFSET(stubs[cpu - 1] + STUB_BUF_SIZE) )
>> +            addr = stubs[cpu - 1] + STUB_BUF_SIZE;
>> +        else
>> +        {
>> +            struct page_info *pg = alloc_domheap_page(NULL, MEMF_node(nid));
>>   
>> -        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
> 
> You've dropped this memset() of the whole page to 0xcc.
> 
> As a consequence, the stubs for not-yet-onlined CPUs, or for gaps
> because of NUMA, are rubble yet mapped executably.
> 
>> +            if ( !pg )
>> +                return false;
>> +            addr = page_to_maddr(pg);
>> +        }
>> +        stubs[cpu] = addr;
>>       }
>>   
>>       stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
>> -    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
>> +    if ( map_pages_to_xen(stub_va, maddr_to_mfn(addr), 1,
>>                             PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
>> -    {
>> -        if ( !*mfn )
>> -            free_domheap_page(pg);
>> -        stub_va = 0;
>> -    }
>> -    else if ( !*mfn )
>> -        *mfn = mfn_x(page_to_mfn(pg));
>> +        return false;
>>   
>> -    return stub_va;
>> +    per_cpu(stubs.mfn, cpu) = PFN_DOWN(addr);
>> +    per_cpu(stubs.addr, cpu) = stub_va + PAGE_OFFSET(addr);
>> +    return true;
>> +}
>> +
>> +void __init init_bsp_stub(void)
>> +{
>> +    const unsigned int num_cpus = num_present_cpus();
>> +    unsigned int i;
>> +
>> +    ASSERT(!stubs);
>> +    stubs = xvmalloc_array(typeof(*stubs), num_cpus);
>> +    if ( !stubs )
>> +        panic("Unable to allocate stub array");
>> +
>> +    for ( i = 0; i < num_cpus; i++ )
>> +        stubs[i] = INVALID_PADDR;
>> +
>> +    if ( !assign_stub_page(0) )
>> +        panic("Unable to initialize BSP stub region");
> 
> \n's for both panic messages.
> 
> With the above stuff addressed, I think this is looking ok, but
> definitely subject to Jason confirming it resolves his issue.  And for
> 4.22, that might even be sufficient to go in.

Release-Acked-by: Oleksii Kurochko <oleskii.kurochko@gmail.com>

Thanks.

~ Oleksii

> 
> The other thing I want to discuss is this:
> 
>>       stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
> 
> because it creates 32 virtual aliases of every stub.
> 
> AIUI, this was a hard requirement for the old freeing scheme, but the
> optimisation guides recommend against creating aliases like this.
> Besides microarchitectural tracking/safety effects, one consequence is
> that we end up with 31 aliases which have unsafe branches in them;
> disp32's depend on the linear address the code is executed at.
> 
> The ideal solution would be allocate VAs just like we allocate paddrs,
> and for the map_pages_to_xen() be beside the alloc_domheap_page(),
> rather than outside of INVALID_PADDR check.
> 
> This reduces the amount of VA space (and L1 pagetables) used by 32
> times, and removes the risk of accidentally using the wrong alias.
> 
> ~Andrew