From: Hongyan Xia <hongyxia@amazon.com>
When we do not have a direct map, arch_mfns_in_directmap() will always
return false, so init_node_heap() will allocate xenheap pages from an
existing node for the metadata of a new node. This means that the
metadata of a new node ends up on a different node, slowing down heap
allocation.

Since we now have early vmap, vmap the metadata locally in the new node.

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
---

Changes in v4:
    * Change type of the parameters to paddr_t
    * Use clear_domain_page() instead of open-coding it

Changes in v2:
    * vmap_contig_pages() was renamed to vmap_contig()
    * Fix indentation and coding style

Changes from Hongyan's version:
    * arch_mfn_in_direct_map() was renamed to
      arch_mfns_in_directmap()
    * Use vmap_contig_pages() rather than __vmap(...).
    * Add missing include (xen/vmap.h) so it compiles on Arm
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 2cef521ad85a..62cdeb5013a3 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -137,6 +137,7 @@
#include <xen/sections.h>
#include <xen/softirq.h>
#include <xen/spinlock.h>
+#include <xen/vmap.h>

#include <asm/flushtlb.h>
#include <asm/page.h>
@@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
needed = 0;
}
else if ( *use_tail && nr >= needed &&
- arch_mfns_in_directmap(mfn + nr - needed, needed) &&
(!xenheap_bits ||
!((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn + nr - needed);
- avail[node] = mfn_to_virt(mfn + nr - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
+ _heap[node] = mfn_to_virt(mfn + nr - needed);
+ else
+ _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
+
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+
}
else if ( nr >= needed &&
- arch_mfns_in_directmap(mfn, needed) &&
(!xenheap_bits ||
- !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn);
- avail[node] = mfn_to_virt(mfn + needed - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
+ _heap[node] = mfn_to_virt(mfn + nr - needed);
+ else
+ _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
+
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+
*use_tail = false;
}
else if ( get_order_from_bytes(sizeof(**_heap)) ==
--
2.40.1
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> [...]
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 2cef521ad85a..62cdeb5013a3 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -137,6 +137,7 @@
> #include <xen/sections.h>
> #include <xen/softirq.h>
> #include <xen/spinlock.h>
> +#include <xen/vmap.h>
>
> #include <asm/flushtlb.h>
> #include <asm/page.h>
> @@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
> needed = 0;
> }
> else if ( *use_tail && nr >= needed &&
> - arch_mfns_in_directmap(mfn + nr - needed, needed) &&
> (!xenheap_bits ||
> !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn + nr - needed);
> - avail[node] = mfn_to_virt(mfn + nr - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
... and looking more carefully, couldn't we simply map_pages_to_xen() on the
directmap using mfn_to_virt() as the target? It's not like the NUMA information
is a secret, and even if it were, the vmap is no less exposed.

I _GUESS_ this was done with the intent of eventually removing the directmap
altogether, but it's probably a lot better to keep it around for things like
the p2m structures and other global data (like these per-node structures).
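Something along these lines, perhaps (an untested sketch for the tail case,
with error handling elided; I'm assuming PAGE_HYPERVISOR is the right set of
attributes for heap metadata):

    /*
     * Populate the directmap entries covering the metadata range and use
     * the directmap address directly, instead of a separate vmap area.
     */
    if ( map_pages_to_xen((unsigned long)mfn_to_virt(mfn + nr - needed),
                          _mfn(mfn + nr - needed), needed, PAGE_HYPERVISOR) )
        BUG();
    _heap[node] = mfn_to_virt(mfn + nr - needed);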
I'm compiling all these fixes/enhancements into a separate branch while testing
the whole thing.
Cheers,
Alejandro
Hi,

I'm seeing crashes on NUMA machines, which can be attributed to the bug pointed
out below:
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> [...]
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 2cef521ad85a..62cdeb5013a3 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -137,6 +137,7 @@
> #include <xen/sections.h>
> #include <xen/softirq.h>
> #include <xen/spinlock.h>
> +#include <xen/vmap.h>
>
> #include <asm/flushtlb.h>
> #include <asm/page.h>
> @@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
> needed = 0;
> }
> else if ( *use_tail && nr >= needed &&
> - arch_mfns_in_directmap(mfn + nr - needed, needed) &&
> (!xenheap_bits ||
> !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn + nr - needed);
> - avail[node] = mfn_to_virt(mfn + nr - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
> +
> + BUG_ON(!_heap[node]);
> + avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
> + sizeof(**avail) * NR_ZONES;
> +
> }
> else if ( nr >= needed &&
> - arch_mfns_in_directmap(mfn, needed) &&
> (!xenheap_bits ||
> - !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> + !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn);
> - avail[node] = mfn_to_virt(mfn + needed - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
This isn't quite the same thing; I think it regressed in v4 when acting on
Roger's feedback. It should be:
if ( arch_mfns_in_directmap(mfn, needed) )
_heap[node] = mfn_to_virt(mfn);
else
_heap[node] = vmap_contig(_mfn(mfn), needed);
Otherwise `use_tail` is unconditionally considered as set: the metadata lands
in the tail of the range even though the caller goes on to skip the head
pages, so the pages actually holding the metadata are later handed back as
free heap. With this change in place, I can boot on NUMA machines.
Cheers,
Alejandro