[PATCH RFC] x86/xen: attempt to inflate the memory balloon on PVH
Posted by Roger Pau Monne 2 months, 1 week ago
When running as PVH or HVM, Linux will use holes in the memory map as scratch
space to map grants, foreign domain pages and possibly miscellaneous other
stuff.  However the usage of such memory map holes for Xen purposes can be
problematic.  The request for holes by Xen happens quite early in the kernel boot
process (grant table setup already uses scratch map space), and it's possible
that by then not all devices have reclaimed their MMIO space.  It's not
unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO
window memory, which (as expected) causes quite a lot of issues in the system.

At least for PVH dom0 we have the possibility of using regions marked as
UNUSABLE in the e820 memory map.  Whether the region is UNUSABLE in the
native memory map, or it has been converted into UNUSABLE in order to hide RAM
regions from dom0, the second stage translation page-tables can populate those
areas without issues.

PV already has this kind of logic, where the balloon driver is inflated at
boot.  Re-use the current logic in order to also inflate it when running as
PVH.  Convert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to
RAM, while reserving them using xen_add_extra_mem() (which is also moved so
it's no longer tied to CONFIG_PV).

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
RFC reasons:

 * Note that it would be preferred for the hypervisor to provide an explicit
   range to be used as scratch mapping space, but that requires changes to Xen,
   and it's not fully clear whether Xen can figure out the position of all MMIO
   regions at boot in order to suggest a scratch mapping region for dom0.

 * Should the whole set of xen_{add,del,chk,inv}_extra_mem() functions be moved
   to a different file?  For the purposes of PVH only xen_add_extra_mem() is
   moved and the chk and inv ones are PV specific and might not want moving to
   a separate file just to guard them with CONFIG_PV.
---
 arch/x86/include/asm/xen/hypervisor.h |  1 +
 arch/x86/platform/pvh/enlighten.c     |  3 ++
 arch/x86/xen/enlighten.c              | 32 +++++++++++++
 arch/x86/xen/enlighten_pvh.c          | 68 +++++++++++++++++++++++++++
 arch/x86/xen/setup.c                  | 44 -----------------
 arch/x86/xen/xen-ops.h                | 14 ++++++
 drivers/xen/balloon.c                 |  2 -
 7 files changed, 118 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a9088250770f..31e2bf8d5db7 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,6 +62,7 @@ void xen_arch_unregister_cpu(int num);
 #ifdef CONFIG_PVH
 void __init xen_pvh_init(struct boot_params *boot_params);
 void __init mem_map_via_hcall(struct boot_params *boot_params_p);
+void __init xen_reserve_extra_memory(struct boot_params *bootp);
 #endif
 
 /* Lazy mode for batching updates / context switch */
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 00a92cb2c814..a12117f3d4de 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
 	} else
 		xen_raw_printk("Warning: Can fit ISA range into e820\n");
 
+	if (xen_guest)
+		xen_reserve_extra_memory(&pvh_bootparams);
+
 	pvh_bootparams.hdr.cmd_line_ptr =
 		pvh_start_info.cmdline_paddr;
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3c61bb98c10e..a01ca255b0c6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/kexec.h>
+#include <linux/memblock.h>
 #include <linux/slab.h>
 #include <linux/panic_notifier.h>
 
@@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(xen_arch_unregister_cpu);
 #endif
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
+{
+	unsigned int i;
+
+	/*
+	 * No need to check for zero size, should happen rarely and will only
+	 * write a new entry regarded to be unused due to zero size.
+	 */
+	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+		/* Add new region. */
+		if (xen_extra_mem[i].n_pfns == 0) {
+			xen_extra_mem[i].start_pfn = start_pfn;
+			xen_extra_mem[i].n_pfns = n_pfns;
+			break;
+		}
+		/* Append to existing region. */
+		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+		    start_pfn) {
+			xen_extra_mem[i].n_pfns += n_pfns;
+			break;
+		}
+	}
+	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index ada3868c02c2..c28f073c1df5 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/acpi.h>
 #include <linux/export.h>
+#include <linux/mm.h>
 
 #include <xen/hvc-console.h>
 
@@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
 	}
 	boot_params_p->e820_entries = memmap.nr_entries;
 }
+
+/*
+ * Reserve e820 UNUSABLE regions to inflate the memory balloon.
+ *
+ * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
+ * located at the same place as in the native memory map, but since dom0 gets
+ * less memory than the total amount of host RAM the ranges that can't be
+ * populated are converted from RAM -> UNUSABLE.  Use such regions (up to the
+ * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
+ * boot.  Doing so prevents the guest (even if just temporarily) from using holes
+ * in the memory map in order to map grants or foreign addresses, and
+ * hopefully limits the risk of a clash with a device MMIO region.  Ideally the
+ * hypervisor should notify us which memory ranges are suitable for creating
+ * foreign mappings, but that's not yet implemented.
+ */
+void __init xen_reserve_extra_memory(struct boot_params *bootp)
+{
+	unsigned int i, ram_pages = 0, extra_pages;
+
+	for (i = 0; i < bootp->e820_entries; i++) {
+		struct boot_e820_entry *e = &bootp->e820_table[i];
+
+		if (e->type != E820_TYPE_RAM)
+			continue;
+		ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
+	}
+
+	/* Max amount of extra memory. */
+	extra_pages = EXTRA_MEM_RATIO * ram_pages;
+
+	/*
+	 * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
+	 * purposes.
+	 */
+	for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
+		struct boot_e820_entry *e = &bootp->e820_table[i];
+		unsigned long pages;
+
+		if (e->type != E820_TYPE_UNUSABLE)
+			continue;
+
+		pages = min(extra_pages,
+			PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
+
+		if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
+			struct boot_e820_entry *next;
+
+			if (bootp->e820_entries ==
+			    ARRAY_SIZE(bootp->e820_table))
+				/* No space left to split - skip region. */
+				continue;
+
+			/* Split entry. */
+			next = e + 1;
+			memmove(next, e,
+				(bootp->e820_entries - i) * sizeof(*e));
+			bootp->e820_entries++;
+			next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
+			e->size = next->addr - e->addr;
+			next->size -= e->size;
+		}
+		e->type = E820_TYPE_RAM;
+		extra_pages -= pages;
+
+		xen_add_extra_mem(PFN_UP(e->addr), pages);
+	}
+}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b3e37961065a..380591028cb8 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,9 +38,6 @@
 
 #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
 
-/* Amount of extra memory space we add to the e820 ranges */
-struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
@@ -64,18 +61,6 @@ static struct {
 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
 
-/*
- * The maximum amount of extra memory compared to the base size.  The
- * main scaling factor is the size of struct page.  At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO		(10)
-
 static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
 
 static void __init xen_parse_512gb(void)
@@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void)
 	xen_512gb_limit = val;
 }
 
-static void __init xen_add_extra_mem(unsigned long start_pfn,
-				     unsigned long n_pfns)
-{
-	int i;
-
-	/*
-	 * No need to check for zero size, should happen rarely and will only
-	 * write a new entry regarded to be unused due to zero size.
-	 */
-	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		/* Add new region. */
-		if (xen_extra_mem[i].n_pfns == 0) {
-			xen_extra_mem[i].start_pfn = start_pfn;
-			xen_extra_mem[i].n_pfns = n_pfns;
-			break;
-		}
-		/* Append to existing region. */
-		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
-		    start_pfn) {
-			xen_extra_mem[i].n_pfns += n_pfns;
-			break;
-		}
-	}
-	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
-		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
-
-	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
-}
-
 static void __init xen_del_extra_mem(unsigned long start_pfn,
 				     unsigned long n_pfns)
 {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index a87ab36889e7..79cf93f2c92f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled);
 static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
 #endif
 
+/*
+ * The maximum amount of extra memory compared to the base size.  The
+ * main scaling factor is the size of struct page.  At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO		(10)
+
+void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+
 #endif /* XEN_OPS_H */
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 976c6cdf9ee6..aaf2514fcfa4 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -672,7 +672,6 @@ EXPORT_SYMBOL(xen_free_ballooned_pages);
 
 static void __init balloon_add_regions(void)
 {
-#if defined(CONFIG_XEN_PV)
 	unsigned long start_pfn, pages;
 	unsigned long pfn, extra_pfn_end;
 	unsigned int i;
@@ -696,7 +695,6 @@ static void __init balloon_add_regions(void)
 
 		balloon_stats.total_pages += extra_pfn_end - start_pfn;
 	}
-#endif
 }
 
 static int __init balloon_init(void)
-- 
2.43.0

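For reference, the entry-splitting step in xen_reserve_extra_memory() can
be exercised in isolation.  The sketch below is illustrative only: the
table contents are made up, struct entry is a simplified stand-in for the
kernel's boot_e820_entry, and the entry start is assumed to be page
aligned (the real code uses PAGE_ALIGN() and PFN_UP() to cope with
unaligned entries).

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_PHYS(x)	((uint64_t)(x) << PAGE_SHIFT)

/* Simplified stand-in for the kernel's boot_e820_entry. */
struct entry {
	uint64_t addr;
	uint64_t size;
	unsigned int type;	/* 0 = RAM, 1 = UNUSABLE */
};

int main(void)
{
	/* Made-up map: one 16-page UNUSABLE entry, only 10 extra pages wanted. */
	struct entry table[4] = {
		{ 0x100000, 16ULL << PAGE_SHIFT, 1 },
	};
	unsigned int entries = 1, i = 0;
	uint64_t extra_pages = 10;
	struct entry *e = &table[i];
	uint64_t pages = PFN_DOWN(e->addr + e->size) - PFN_DOWN(e->addr);

	if (pages > extra_pages) {
		/* Split: the head becomes RAM, the tail stays UNUSABLE. */
		struct entry *next = e + 1;

		memmove(next, e, (entries - i) * sizeof(*e));
		entries++;
		next->addr = e->addr + PFN_PHYS(extra_pages);
		e->size = next->addr - e->addr;
		next->size -= e->size;
		pages = extra_pages;
	}
	e->type = 0;		/* convert to RAM */
	extra_pages -= pages;

	for (i = 0; i < entries; i++)
		printf("[%#llx-%#llx] %s\n",
		       (unsigned long long)table[i].addr,
		       (unsigned long long)(table[i].addr + table[i].size - 1),
		       table[i].type ? "UNUSABLE" : "RAM");

	return 0;
}

Running it prints [0x100000-0x109fff] RAM followed by [0x10a000-0x10ffff]
UNUSABLE: the head of the region is converted to RAM (and would then be
handed to xen_add_extra_mem()), while the remainder is left untouched.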

Re: [PATCH RFC] x86/xen: attempt to inflate the memory balloon on PVH
Posted by Juergen Gross 1 month, 2 weeks ago
On 20.02.24 18:43, Roger Pau Monne wrote:
> When running as PVH or HVM, Linux will use holes in the memory map as scratch
> space to map grants, foreign domain pages and possibly miscellaneous other
> stuff.  However the usage of such memory map holes for Xen purposes can be
> problematic.  The request for holes by Xen happens quite early in the kernel boot
> process (grant table setup already uses scratch map space), and it's possible
> that by then not all devices have reclaimed their MMIO space.  It's not
> unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO
> window memory, which (as expected) causes quite a lot of issues in the system.
> 
> At least for PVH dom0 we have the possibility of using regions marked as
> UNUSABLE in the e820 memory map.  Whether the region is UNUSABLE in the
> native memory map, or it has been converted into UNUSABLE in order to hide RAM
> regions from dom0, the second stage translation page-tables can populate those
> areas without issues.
> 
> PV already has this kind of logic, where the balloon driver is inflated at
> boot.  Re-use the current logic in order to also inflate it when running as
> PVH.  Convert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to
> RAM, while reserving them using xen_add_extra_mem() (which is also moved so
> it's no longer tied to CONFIG_PV).
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

Re: [PATCH RFC] x86/xen: attempt to inflate the memory balloon on PVH
Posted by Stefano Stabellini 2 months ago
On Tue, 20 Feb 2024, Roger Pau Monne wrote:
> When running as PVH or HVM, Linux will use holes in the memory map as scratch
> space to map grants, foreign domain pages and possibly miscellaneous other
> stuff.  However the usage of such memory map holes for Xen purposes can be
> problematic.  The request for holes by Xen happens quite early in the kernel boot
> process (grant table setup already uses scratch map space), and it's possible
> that by then not all devices have reclaimed their MMIO space.  It's not
> unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO
> window memory, which (as expected) causes quite a lot of issues in the system.

Am I understanding correctly that XEN_BALLOON_MEMORY_HOTPLUG doesn't
help because it becomes available too late in the PVH boot sequence? 



> At least for PVH dom0 we have the possibility of using regions marked as
> UNUSABLE in the e820 memory map.  Whether the region is UNUSABLE in the
> native memory map, or it has been converted into UNUSABLE in order to hide RAM
> regions from dom0, the second stage translation page-tables can populate those
> areas without issues.
> 
> PV already has this kind of logic, where the balloon driver is inflated at
> boot.  Re-use the current logic in order to also inflate it when running as
> PVH.  Convert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to
> RAM, while reserving them using xen_add_extra_mem() (which is also moved so
> it's no longer tied to CONFIG_PV).
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> RFC reasons:
> 
>  * Note that it would be preferred for the hypervisor to provide an explicit
>    range to be used as scratch mapping space, but that requires changes to Xen,
>    and it's not fully clear whether Xen can figure out the position of all MMIO
>    regions at boot in order to suggest a scratch mapping region for dom0.
> 
>  * Should the whole set of xen_{add,del,chk,inv}_extra_mem() functions be moved
>    to a different file?  For the purposes of PVH only xen_add_extra_mem() is
>    moved and the chk and inv ones are PV specific and might not want moving to
>    a separate file just to guard them with CONFIG_PV.
> ---
>  arch/x86/include/asm/xen/hypervisor.h |  1 +
>  arch/x86/platform/pvh/enlighten.c     |  3 ++
>  arch/x86/xen/enlighten.c              | 32 +++++++++++++
>  arch/x86/xen/enlighten_pvh.c          | 68 +++++++++++++++++++++++++++
>  arch/x86/xen/setup.c                  | 44 -----------------
>  arch/x86/xen/xen-ops.h                | 14 ++++++
>  drivers/xen/balloon.c                 |  2 -
>  7 files changed, 118 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
> index a9088250770f..31e2bf8d5db7 100644
> --- a/arch/x86/include/asm/xen/hypervisor.h
> +++ b/arch/x86/include/asm/xen/hypervisor.h
> @@ -62,6 +62,7 @@ void xen_arch_unregister_cpu(int num);
>  #ifdef CONFIG_PVH
>  void __init xen_pvh_init(struct boot_params *boot_params);
>  void __init mem_map_via_hcall(struct boot_params *boot_params_p);
> +void __init xen_reserve_extra_memory(struct boot_params *bootp);
>  #endif
>  
>  /* Lazy mode for batching updates / context switch */
> diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
> index 00a92cb2c814..a12117f3d4de 100644
> --- a/arch/x86/platform/pvh/enlighten.c
> +++ b/arch/x86/platform/pvh/enlighten.c
> @@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
>  	} else
>  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
>  
> +	if (xen_guest)
> +		xen_reserve_extra_memory(&pvh_bootparams);
> +
>  	pvh_bootparams.hdr.cmd_line_ptr =
>  		pvh_start_info.cmdline_paddr;
>  
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 3c61bb98c10e..a01ca255b0c6 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -6,6 +6,7 @@
>  #include <linux/console.h>
>  #include <linux/cpu.h>
>  #include <linux/kexec.h>
> +#include <linux/memblock.h>
>  #include <linux/slab.h>
>  #include <linux/panic_notifier.h>
>  
> @@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
>  }
>  EXPORT_SYMBOL(xen_arch_unregister_cpu);
>  #endif
> +
> +/* Amount of extra memory space we add to the e820 ranges */
> +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
> +
> +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
> +{
> +	unsigned int i;
> +
> +	/*
> +	 * No need to check for zero size, should happen rarely and will only
> +	 * write a new entry regarded to be unused due to zero size.
> +	 */
> +	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
> +		/* Add new region. */
> +		if (xen_extra_mem[i].n_pfns == 0) {
> +			xen_extra_mem[i].start_pfn = start_pfn;
> +			xen_extra_mem[i].n_pfns = n_pfns;
> +			break;
> +		}
> +		/* Append to existing region. */
> +		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
> +		    start_pfn) {
> +			xen_extra_mem[i].n_pfns += n_pfns;
> +			break;
> +		}
> +	}
> +	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
> +		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
> +
> +	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
> +}
> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> index ada3868c02c2..c28f073c1df5 100644
> --- a/arch/x86/xen/enlighten_pvh.c
> +++ b/arch/x86/xen/enlighten_pvh.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/acpi.h>
>  #include <linux/export.h>
> +#include <linux/mm.h>
>  
>  #include <xen/hvc-console.h>
>  
> @@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
>  	}
>  	boot_params_p->e820_entries = memmap.nr_entries;
>  }
> +
> +/*
> + * Reserve e820 UNUSABLE regions to inflate the memory balloon.
> + *
> + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
> + * located at the same place as in the native memory map, but since dom0 gets
> + * less memory than the total amount of host RAM the ranges that can't be
> + * populated are converted from RAM -> UNUSABLE.  Use such regions (up to the
> + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
> + * boot.  Doing so prevents the guest (even if just temporarily) from using holes
> + * in the memory map in order to map grants or foreign addresses, and
> + * hopefully limits the risk of a clash with a device MMIO region.  Ideally the
> + * hypervisor should notify us which memory ranges are suitable for creating
> + * foreign mappings, but that's not yet implemented.
> + */
> +void __init xen_reserve_extra_memory(struct boot_params *bootp)
> +{
> +	unsigned int i, ram_pages = 0, extra_pages;
> +
> +	for (i = 0; i < bootp->e820_entries; i++) {
> +		struct boot_e820_entry *e = &bootp->e820_table[i];
> +
> +		if (e->type != E820_TYPE_RAM)
> +			continue;
> +		ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
> +	}
> +
> +	/* Max amount of extra memory. */
> +	extra_pages = EXTRA_MEM_RATIO * ram_pages;
> +
> +	/*
> +	 * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
> +	 * purposes.
> +	 */
> +	for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
> +		struct boot_e820_entry *e = &bootp->e820_table[i];
> +		unsigned long pages;
> +
> +		if (e->type != E820_TYPE_UNUSABLE)
> +			continue;
> +
> +		pages = min(extra_pages,
> +			PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
> +
> +		if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
> +			struct boot_e820_entry *next;
> +
> +			if (bootp->e820_entries ==
> +			    ARRAY_SIZE(bootp->e820_table))
> +				/* No space left to split - skip region. */
> +				continue;
> +
> +			/* Split entry. */
> +			next = e + 1;
> +			memmove(next, e,
> +				(bootp->e820_entries - i) * sizeof(*e));
> +			bootp->e820_entries++;
> +			next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
> +			e->size = next->addr - e->addr;
> +			next->size -= e->size;

Is this really worth doing? Can we just skip this range and continue or
simply break out and call it a day? Or even add the whole range instead?

The reason I am asking is that I am expecting E820_TYPE_UNUSABLE regions
not to be huge. Splitting one just to cover the few remaining pages out
of extra_pages doesn't seem worth it?

Everything else looks OK to me.


> +		}
> +		e->type = E820_TYPE_RAM;
> +		extra_pages -= pages;
> +
> +		xen_add_extra_mem(PFN_UP(e->addr), pages);
> +	}
> +}
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index b3e37961065a..380591028cb8 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -38,9 +38,6 @@
>  
>  #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
>  
> -/* Amount of extra memory space we add to the e820 ranges */
> -struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
> -
>  /* Number of pages released from the initial allocation. */
>  unsigned long xen_released_pages;
>  
> @@ -64,18 +61,6 @@ static struct {
>  } xen_remap_buf __initdata __aligned(PAGE_SIZE);
>  static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
>  
> -/*
> - * The maximum amount of extra memory compared to the base size.  The
> - * main scaling factor is the size of struct page.  At extreme ratios
> - * of base:extra, all the base memory can be filled with page
> - * structures for the extra memory, leaving no space for anything
> - * else.
> - *
> - * 10x seems like a reasonable balance between scaling flexibility and
> - * leaving a practically usable system.
> - */
> -#define EXTRA_MEM_RATIO		(10)
> -
>  static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
>  
>  static void __init xen_parse_512gb(void)
> @@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void)
>  	xen_512gb_limit = val;
>  }
>  
> -static void __init xen_add_extra_mem(unsigned long start_pfn,
> -				     unsigned long n_pfns)
> -{
> -	int i;
> -
> -	/*
> -	 * No need to check for zero size, should happen rarely and will only
> -	 * write a new entry regarded to be unused due to zero size.
> -	 */
> -	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
> -		/* Add new region. */
> -		if (xen_extra_mem[i].n_pfns == 0) {
> -			xen_extra_mem[i].start_pfn = start_pfn;
> -			xen_extra_mem[i].n_pfns = n_pfns;
> -			break;
> -		}
> -		/* Append to existing region. */
> -		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
> -		    start_pfn) {
> -			xen_extra_mem[i].n_pfns += n_pfns;
> -			break;
> -		}
> -	}
> -	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
> -		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
> -
> -	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
> -}
> -
>  static void __init xen_del_extra_mem(unsigned long start_pfn,
>  				     unsigned long n_pfns)
>  {
> diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
> index a87ab36889e7..79cf93f2c92f 100644
> --- a/arch/x86/xen/xen-ops.h
> +++ b/arch/x86/xen/xen-ops.h
> @@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled);
>  static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
>  #endif
>  
> +/*
> + * The maximum amount of extra memory compared to the base size.  The
> + * main scaling factor is the size of struct page.  At extreme ratios
> + * of base:extra, all the base memory can be filled with page
> + * structures for the extra memory, leaving no space for anything
> + * else.
> + *
> + * 10x seems like a reasonable balance between scaling flexibility and
> + * leaving a practically usable system.
> + */
> +#define EXTRA_MEM_RATIO		(10)
> +
> +void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
> +
>  #endif /* XEN_OPS_H */
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index 976c6cdf9ee6..aaf2514fcfa4 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -672,7 +672,6 @@ EXPORT_SYMBOL(xen_free_ballooned_pages);
>  
>  static void __init balloon_add_regions(void)
>  {
> -#if defined(CONFIG_XEN_PV)
>  	unsigned long start_pfn, pages;
>  	unsigned long pfn, extra_pfn_end;
>  	unsigned int i;
> @@ -696,7 +695,6 @@ static void __init balloon_add_regions(void)
>  
>  		balloon_stats.total_pages += extra_pfn_end - start_pfn;
>  	}
> -#endif
>  }
>  
>  static int __init balloon_init(void)
> -- 
> 2.43.0
> 
Re: [PATCH RFC] x86/xen: attempt to inflate the memory balloon on PVH
Posted by Roger Pau Monné 1 month, 3 weeks ago
On Thu, Feb 22, 2024 at 05:16:09PM -0800, Stefano Stabellini wrote:
> On Tue, 20 Feb 2024, Roger Pau Monne wrote:
> > When running as PVH or HVM, Linux will use holes in the memory map as scratch
> > space to map grants, foreign domain pages and possibly miscellaneous other
> > stuff.  However the usage of such memory map holes for Xen purposes can be
> > problematic.  The request for holes by Xen happens quite early in the kernel boot
> > process (grant table setup already uses scratch map space), and it's possible
> > that by then not all devices have reclaimed their MMIO space.  It's not
> > unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO
> > window memory, which (as expected) causes quite a lot of issues in the system.
> 
> Am I understanding correctly that XEN_BALLOON_MEMORY_HOTPLUG doesn't
> help because it becomes available too late in the PVH boot sequence? 

No, not really: the hotplug mechanism is available as early as the
balloon driver requires.  The issue is that when Linux starts making
use of such unpopulated ranges (for example in order to map the shared
info page) many drivers have not yet reserved their MMIO regions, and so it's
not uncommon for the balloon driver to end up using address ranges that
would otherwise be used by device BARs for example.

This causes havoc: Linux starts to reposition device BARs; sometimes
it manages to re-position them, otherwise some devices are not
usable.

> > At least for PVH dom0 we have the possibility of using regions marked as
> > UNUSABLE in the e820 memory map.  Whether the region is UNUSABLE in the
> > native memory map, or it has been converted into UNUSABLE in order to hide RAM
> > regions from dom0, the second stage translation page-tables can populate those
> > areas without issues.
> > 
> > PV already has this kind of logic, where the balloon driver is inflated at
> > boot.  Re-use the current logic in order to also inflate it when running as
> > PVH.  Convert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to
> > RAM, while reserving them using xen_add_extra_mem() (which is also moved so
> > it's no longer tied to CONFIG_PV).
> > 
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > RFC reasons:
> > 
> >  * Note that it would be preferred for the hypervisor to provide an explicit
> >    range to be used as scratch mapping space, but that requires changes to Xen,
> >    and it's not fully clear whether Xen can figure out the position of all MMIO
> >    regions at boot in order to suggest a scratch mapping region for dom0.
> > 
> >  * Should the whole set of xen_{add,del,chk,inv}_extra_mem() functions be moved
> >    to a different file?  For the purposes of PVH only xen_add_extra_mem() is
> >    moved and the chk and inv ones are PV specific and might not want moving to
> >    a separate file just to guard them with CONFIG_PV.
> > ---
> >  arch/x86/include/asm/xen/hypervisor.h |  1 +
> >  arch/x86/platform/pvh/enlighten.c     |  3 ++
> >  arch/x86/xen/enlighten.c              | 32 +++++++++++++
> >  arch/x86/xen/enlighten_pvh.c          | 68 +++++++++++++++++++++++++++
> >  arch/x86/xen/setup.c                  | 44 -----------------
> >  arch/x86/xen/xen-ops.h                | 14 ++++++
> >  drivers/xen/balloon.c                 |  2 -
> >  7 files changed, 118 insertions(+), 46 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
> > index a9088250770f..31e2bf8d5db7 100644
> > --- a/arch/x86/include/asm/xen/hypervisor.h
> > +++ b/arch/x86/include/asm/xen/hypervisor.h
> > @@ -62,6 +62,7 @@ void xen_arch_unregister_cpu(int num);
> >  #ifdef CONFIG_PVH
> >  void __init xen_pvh_init(struct boot_params *boot_params);
> >  void __init mem_map_via_hcall(struct boot_params *boot_params_p);
> > +void __init xen_reserve_extra_memory(struct boot_params *bootp);
> >  #endif
> >  
> >  /* Lazy mode for batching updates / context switch */
> > diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
> > index 00a92cb2c814..a12117f3d4de 100644
> > --- a/arch/x86/platform/pvh/enlighten.c
> > +++ b/arch/x86/platform/pvh/enlighten.c
> > @@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
> >  	} else
> >  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
> >  
> > +	if (xen_guest)
> > +		xen_reserve_extra_memory(&pvh_bootparams);
> > +
> >  	pvh_bootparams.hdr.cmd_line_ptr =
> >  		pvh_start_info.cmdline_paddr;
> >  
> > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> > index 3c61bb98c10e..a01ca255b0c6 100644
> > --- a/arch/x86/xen/enlighten.c
> > +++ b/arch/x86/xen/enlighten.c
> > @@ -6,6 +6,7 @@
> >  #include <linux/console.h>
> >  #include <linux/cpu.h>
> >  #include <linux/kexec.h>
> > +#include <linux/memblock.h>
> >  #include <linux/slab.h>
> >  #include <linux/panic_notifier.h>
> >  
> > @@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
> >  }
> >  EXPORT_SYMBOL(xen_arch_unregister_cpu);
> >  #endif
> > +
> > +/* Amount of extra memory space we add to the e820 ranges */
> > +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
> > +
> > +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
> > +{
> > +	unsigned int i;
> > +
> > +	/*
> > +	 * No need to check for zero size, should happen rarely and will only
> > +	 * write a new entry regarded to be unused due to zero size.
> > +	 */
> > +	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
> > +		/* Add new region. */
> > +		if (xen_extra_mem[i].n_pfns == 0) {
> > +			xen_extra_mem[i].start_pfn = start_pfn;
> > +			xen_extra_mem[i].n_pfns = n_pfns;
> > +			break;
> > +		}
> > +		/* Append to existing region. */
> > +		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
> > +		    start_pfn) {
> > +			xen_extra_mem[i].n_pfns += n_pfns;
> > +			break;
> > +		}
> > +	}
> > +	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
> > +		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
> > +
> > +	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
> > +}
> > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> > index ada3868c02c2..c28f073c1df5 100644
> > --- a/arch/x86/xen/enlighten_pvh.c
> > +++ b/arch/x86/xen/enlighten_pvh.c
> > @@ -1,6 +1,7 @@
> >  // SPDX-License-Identifier: GPL-2.0
> >  #include <linux/acpi.h>
> >  #include <linux/export.h>
> > +#include <linux/mm.h>
> >  
> >  #include <xen/hvc-console.h>
> >  
> > @@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
> >  	}
> >  	boot_params_p->e820_entries = memmap.nr_entries;
> >  }
> > +
> > +/*
> > + * Reserve e820 UNUSABLE regions to inflate the memory balloon.
> > + *
> > + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
> > + * located at the same place as in the native memory map, but since dom0 gets
> > + * less memory than the total amount of host RAM the ranges that can't be
> > + * populated are converted from RAM -> UNUSABLE.  Use such regions (up to the
> > + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
> > + * boot.  Doing so prevents the guest (even if just temporarily) from using holes
> > + * in the memory map in order to map grants or foreign addresses, and
> > + * hopefully limits the risk of a clash with a device MMIO region.  Ideally the
> > + * hypervisor should notify us which memory ranges are suitable for creating
> > + * foreign mappings, but that's not yet implemented.
> > + */
> > +void __init xen_reserve_extra_memory(struct boot_params *bootp)
> > +{
> > +	unsigned int i, ram_pages = 0, extra_pages;
> > +
> > +	for (i = 0; i < bootp->e820_entries; i++) {
> > +		struct boot_e820_entry *e = &bootp->e820_table[i];
> > +
> > +		if (e->type != E820_TYPE_RAM)
> > +			continue;
> > +		ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
> > +	}
> > +
> > +	/* Max amount of extra memory. */
> > +	extra_pages = EXTRA_MEM_RATIO * ram_pages;
> > +
> > +	/*
> > +	 * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
> > +	 * purposes.
> > +	 */
> > +	for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
> > +		struct boot_e820_entry *e = &bootp->e820_table[i];
> > +		unsigned long pages;
> > +
> > +		if (e->type != E820_TYPE_UNUSABLE)
> > +			continue;
> > +
> > +		pages = min(extra_pages,
> > +			PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
> > +
> > +		if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
> > +			struct boot_e820_entry *next;
> > +
> > +			if (bootp->e820_entries ==
> > +			    ARRAY_SIZE(bootp->e820_table))
> > +				/* No space left to split - skip region. */
> > +				continue;
> > +
> > +			/* Split entry. */
> > +			next = e + 1;
> > +			memmove(next, e,
> > +				(bootp->e820_entries - i) * sizeof(*e));
> > +			bootp->e820_entries++;
> > +			next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
> > +			e->size = next->addr - e->addr;
> > +			next->size -= e->size;
> 
> Is this really worth doing? Can we just skip this range and continue or
> simply break out and call it a day? Or even add the whole range instead?
> 
> The reason I am asking is that I am expecting E820_TYPE_UNUSABLE regions
> not to be huge. Splitting one just to cover the few remaining pages out
> of extra_pages doesn't seem worth it?

No, they are usually quite huge on PVH dom0, because when building a
PVH dom0 the E820_TYPE_RAM ranges that are not made available to dom0
because of a dom0_mem option end up being reported as
E820_TYPE_UNUSABLE in the e820 provided to dom0.

That's mostly the motivation for the change: to be able to reuse those
ranges as scratch space for foreign mappings.

Ideally the hypervisor should somehow report suitable ranges in the
address space for domains to create foreign mappings, but this does
require an amount of extra work I don't have time to do ATM, hence
this stopgap proposal.
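
As a made-up illustration (addresses invented, MMIO holes and other
entries omitted): on a host with 8GiB of RAM booted with dom0_mem=4G,
the e820 handed to dom0 could look roughly like:

  [0x0000100000-0x00ffffffff] RAM       (memory populated for dom0)
  [0x0100000000-0x01ffffffff] UNUSABLE  (host RAM hidden from dom0)

so a single UNUSABLE entry can easily cover gigabytes of address space
that's known-safe to reuse for foreign mappings.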

> Everything else looks OK to me.

Thanks, Roger.
Re: [PATCH RFC] x86/xen: attempt to inflate the memory balloon on PVH
Posted by Stefano Stabellini 1 month, 3 weeks ago
On Tue, 5 Mar 2024, Roger Pau Monné wrote:
> On Thu, Feb 22, 2024 at 05:16:09PM -0800, Stefano Stabellini wrote:
> > On Tue, 20 Feb 2024, Roger Pau Monne wrote:
> > > When running as PVH or HVM, Linux will use holes in the memory map as scratch
> > > space to map grants, foreign domain pages and possibly miscellaneous other
> > > stuff.  However the usage of such memory map holes for Xen purposes can be
> > > problematic.  The request for holes by Xen happens quite early in the kernel boot
> > > process (grant table setup already uses scratch map space), and it's possible
> > > that by then not all devices have reclaimed their MMIO space.  It's not
> > > unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO
> > > window memory, which (as expected) causes quite a lot of issues in the system.
> > 
> > Am I understanding correctly that XEN_BALLOON_MEMORY_HOTPLUG doesn't
> > help because it becomes available too late in the PVH boot sequence? 
> 
> No, not really: the hotplug mechanism is available as early as the
> balloon driver requires.  The issue is that when Linux starts making
> use of such unpopulated ranges (for example in order to map the shared
> info page) many drivers have not yet reserved their MMIO regions, and so it's
> not uncommon for the balloon driver to end up using address ranges that
> would otherwise be used by device BARs for example.
> 
> This causes havoc: Linux starts to reposition device BARs; sometimes
> it manages to re-position them, otherwise some devices are not
> usable.

OK this is bad


> > > At least for PVH dom0 we have the possibility of using regions marked as
> > > UNUSABLE in the e820 memory map.  Whether the region is UNUSABLE in the
> > > native memory map, or it has been converted into UNUSABLE in order to hide RAM
> > > regions from dom0, the second stage translation page-tables can populate those
> > > areas without issues.
> > > 
> > > PV already has this kind of logic, where the balloon driver is inflated at
> > > boot.  Re-use the current logic in order to also inflate it when running as
> > > PVH.  Convert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to
> > > RAM, while reserving them using xen_add_extra_mem() (which is also moved so
> > > it's no longer tied to CONFIG_PV).
> > > 
> > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > ---
> > > RFC reasons:
> > > 
> > >  * Note that it would be preferred for the hypervisor to provide an explicit
> > >    range to be used as scratch mapping space, but that requires changes to Xen,
> > >    and it's not fully clear whether Xen can figure out the position of all MMIO
> > >    regions at boot in order to suggest a scratch mapping region for dom0.
> > > 
> > >  * Should the whole set of xen_{add,del,chk,inv}_extra_mem() functions be moved
> > >    to a different file?  For the purposes of PVH only xen_add_extra_mem() is
> > >    moved and the chk and inv ones are PV specific and might not want moving to
> > >    a separate file just to guard them with CONFIG_PV.
> > > ---
> > >  arch/x86/include/asm/xen/hypervisor.h |  1 +
> > >  arch/x86/platform/pvh/enlighten.c     |  3 ++
> > >  arch/x86/xen/enlighten.c              | 32 +++++++++++++
> > >  arch/x86/xen/enlighten_pvh.c          | 68 +++++++++++++++++++++++++++
> > >  arch/x86/xen/setup.c                  | 44 -----------------
> > >  arch/x86/xen/xen-ops.h                | 14 ++++++
> > >  drivers/xen/balloon.c                 |  2 -
> > >  7 files changed, 118 insertions(+), 46 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
> > > index a9088250770f..31e2bf8d5db7 100644
> > > --- a/arch/x86/include/asm/xen/hypervisor.h
> > > +++ b/arch/x86/include/asm/xen/hypervisor.h
> > > @@ -62,6 +62,7 @@ void xen_arch_unregister_cpu(int num);
> > >  #ifdef CONFIG_PVH
> > >  void __init xen_pvh_init(struct boot_params *boot_params);
> > >  void __init mem_map_via_hcall(struct boot_params *boot_params_p);
> > > +void __init xen_reserve_extra_memory(struct boot_params *bootp);
> > >  #endif
> > >  
> > >  /* Lazy mode for batching updates / context switch */
> > > diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
> > > index 00a92cb2c814..a12117f3d4de 100644
> > > --- a/arch/x86/platform/pvh/enlighten.c
> > > +++ b/arch/x86/platform/pvh/enlighten.c
> > > @@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
> > >  	} else
> > >  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
> > >  
> > > +	if (xen_guest)
> > > +		xen_reserve_extra_memory(&pvh_bootparams);
> > > +
> > >  	pvh_bootparams.hdr.cmd_line_ptr =
> > >  		pvh_start_info.cmdline_paddr;
> > >  
> > > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> > > index 3c61bb98c10e..a01ca255b0c6 100644
> > > --- a/arch/x86/xen/enlighten.c
> > > +++ b/arch/x86/xen/enlighten.c
> > > @@ -6,6 +6,7 @@
> > >  #include <linux/console.h>
> > >  #include <linux/cpu.h>
> > >  #include <linux/kexec.h>
> > > +#include <linux/memblock.h>
> > >  #include <linux/slab.h>
> > >  #include <linux/panic_notifier.h>
> > >  
> > > @@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
> > >  }
> > >  EXPORT_SYMBOL(xen_arch_unregister_cpu);
> > >  #endif
> > > +
> > > +/* Amount of extra memory space we add to the e820 ranges */
> > > +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
> > > +
> > > +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
> > > +{
> > > +	unsigned int i;
> > > +
> > > +	/*
> > > +	 * No need to check for zero size, should happen rarely and will only
> > > +	 * write a new entry regarded to be unused due to zero size.
> > > +	 */
> > > +	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
> > > +		/* Add new region. */
> > > +		if (xen_extra_mem[i].n_pfns == 0) {
> > > +			xen_extra_mem[i].start_pfn = start_pfn;
> > > +			xen_extra_mem[i].n_pfns = n_pfns;
> > > +			break;
> > > +		}
> > > +		/* Append to existing region. */
> > > +		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
> > > +		    start_pfn) {
> > > +			xen_extra_mem[i].n_pfns += n_pfns;
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
> > > +		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
> > > +
> > > +	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
> > > +}
> > > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> > > index ada3868c02c2..c28f073c1df5 100644
> > > --- a/arch/x86/xen/enlighten_pvh.c
> > > +++ b/arch/x86/xen/enlighten_pvh.c
> > > @@ -1,6 +1,7 @@
> > >  // SPDX-License-Identifier: GPL-2.0
> > >  #include <linux/acpi.h>
> > >  #include <linux/export.h>
> > > +#include <linux/mm.h>
> > >  
> > >  #include <xen/hvc-console.h>
> > >  
> > > @@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
> > >  	}
> > >  	boot_params_p->e820_entries = memmap.nr_entries;
> > >  }
> > > +
> > > +/*
> > > + * Reserve e820 UNUSABLE regions to inflate the memory balloon.
> > > + *
> > > + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
> > > + * located at the same place as in the native memory map, but since dom0 gets
> > > + * less memory than the total amount of host RAM the ranges that can't be
> > > + * populated are converted from RAM -> UNUSABLE.  Use such regions (up to the
> > > + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
> > > + * boot.  Doing so prevents the guest (even if just temporarily) from using holes
> > > + * in the memory map in order to map grants or foreign addresses, and
> > > + * hopefully limits the risk of a clash with a device MMIO region.  Ideally the
> > > + * hypervisor should notify us which memory ranges are suitable for creating
> > > + * foreign mappings, but that's not yet implemented.
> > > + */
> > > +void __init xen_reserve_extra_memory(struct boot_params *bootp)
> > > +{
> > > +	unsigned int i, ram_pages = 0, extra_pages;
> > > +
> > > +	for (i = 0; i < bootp->e820_entries; i++) {
> > > +		struct boot_e820_entry *e = &bootp->e820_table[i];
> > > +
> > > +		if (e->type != E820_TYPE_RAM)
> > > +			continue;
> > > +		ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
> > > +	}
> > > +
> > > +	/* Max amount of extra memory. */
> > > +	extra_pages = EXTRA_MEM_RATIO * ram_pages;
> > > +
> > > +	/*
> > > +	 * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
> > > +	 * purposes.
> > > +	 */
> > > +	for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
> > > +		struct boot_e820_entry *e = &bootp->e820_table[i];
> > > +		unsigned long pages;
> > > +
> > > +		if (e->type != E820_TYPE_UNUSABLE)
> > > +			continue;
> > > +
> > > +		pages = min(extra_pages,
> > > +			PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
> > > +
> > > +		if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
> > > +			struct boot_e820_entry *next;
> > > +
> > > +			if (bootp->e820_entries ==
> > > +			    ARRAY_SIZE(bootp->e820_table))
> > > +				/* No space left to split - skip region. */
> > > +				continue;
> > > +
> > > +			/* Split entry. */
> > > +			next = e + 1;
> > > +			memmove(next, e,
> > > +				(bootp->e820_entries - i) * sizeof(*e));
> > > +			bootp->e820_entries++;
> > > +			next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
> > > +			e->size = next->addr - e->addr;
> > > +			next->size -= e->size;
> > 
> > Is this really worth doing? Can we just skip this range and continue or
> > simply break out and call it a day? Or even add the whole range instead?
> > 
> > The reason I am asking is that I am expecting E820_TYPE_UNUSABLE regions
> > not to be huge. Splitting one just to cover the few remaining pages out
> > of extra_pages doesn't seem worth it?
> 
> No, they are usually quite huge on PVH dom0, because when building a
> PVH dom0 the E820_TYPE_RAM ranges that are not made available to dom0
> because of a dom0_mem option end up being reported as
> E820_TYPE_UNUSABLE in the e820 provided to dom0.
> 
> That's mostly the motivation for the change: to be able to reuse those
> ranges as scratch space for foreign mappings.
> 
> Ideally the hypervisor should somehow report suitable ranges in the
> address space for domains to create foreign mappings, but this does
> require an amount of extra work I don't have time to do ATM, hence
> this stopgap proposal.

I see. We gained this feature on ARM not long ago for Dom0 and
Dom0less guests.

All right, I have no reservations. The patch looks OK to me. Juergen?