In order to minimize required special handling for running as Xen PV
dom0, the memory layout is modified to match that of the host. This
requires that only RAM is present at the locations where Xen-allocated
memory is living. Unfortunately there seem to be some machines where
ACPI NVS is located at 64 MB, resulting in a conflict with the loaded
kernel or the initial page tables built by Xen.
As ACPI NVS needs to be accessed by the kernel only for saving and
restoring it across suspend operations, it can be relocated in the
dom0's memory map by swapping it with unused RAM (this is possible
via modification of the dom0 P2M map).
While the E820 map can (and should) be modified right away, the P2M
map can be updated only after memory allocation is working, as the P2M
map might need to be extended.
Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout")
Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
arch/x86/xen/setup.c | 133 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 132 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d678c0330971..dbb5d13ca61a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -49,6 +49,15 @@ static struct e820_table xen_e820_table __initdata;
/* Number of initially usable memory pages. */
static unsigned long ini_nr_pages __initdata;
+/* Remapped non-RAM areas */
+#define NR_NONRAM_REMAP 4
+static struct nonram_remap {
+ unsigned long maddr;
+ unsigned long size;
+ unsigned long paddr;
+} xen_nonram_remap[NR_NONRAM_REMAP] __initdata;
+static unsigned int nr_nonram_remap __initdata;
+
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -452,6 +461,8 @@ static unsigned long __init xen_foreach_remap_area(
* to be remapped memory itself in a linked list anchored at xen_remap_mfn.
* This scheme allows to remap the different chunks in arbitrary order while
* the resulting mapping will be independent from the order.
+ * In case xen_e820_resolve_conflicts() did relocate some non-RAM E820
+ * entries, set the correct P2M information for the affected pages.
*/
void __init xen_remap_memory(void)
{
@@ -495,6 +506,29 @@ void __init xen_remap_memory(void)
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
+
+ if (nr_nonram_remap == 0)
+ return;
+
+ remapped = 0;
+ for (i = 0; i < nr_nonram_remap; i++) {
+ struct nonram_remap *remap = xen_nonram_remap + i;
+
+ pfn = PFN_DOWN(remap->paddr);
+ mfn_save = PFN_DOWN(remap->maddr);
+ for (len = 0; len < remap->size; len += PAGE_SIZE) {
+ if (!set_phys_to_machine(pfn, mfn_save)) {
+ WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+ pfn, mfn_save);
+ BUG();
+ }
+ pfn++;
+ mfn_save++;
+ remapped++;
+ }
+ }
+
+ pr_info("Remapped %ld non-RAM page(s)\n", remapped);
}
static unsigned long __init xen_get_pages_limit(void)
@@ -625,14 +659,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size)
return 0;
}
+/*
+ * Swap a non-RAM E820 map entry with RAM above ini_nr_pages.
+ * Note that the E820 map is modified accordingly, but the P2M map isn't yet.
+ * The adaptation of the P2M must be deferred until page allocation is possible.
+ */
+static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t mem_end = PFN_PHYS(ini_nr_pages);
+ struct nonram_remap *remap;
+ phys_addr_t swap_addr, swap_size, entry_end;
+
+ if (nr_nonram_remap == NR_NONRAM_REMAP) {
+ xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n");
+ BUG();
+ }
+
+ swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr);
+ swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size);
+ remap = xen_nonram_remap + nr_nonram_remap;
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ entry_end = entry->addr + entry->size;
+ if (entry->type == E820_TYPE_RAM && entry->size >= swap_size &&
+ entry_end - swap_size >= mem_end) {
+ /* Reduce RAM entry by needed space (whole pages). */
+ entry->size -= swap_size;
+
+ /* Add new entry at the end of E820 map. */
+ entry = xen_e820_table.entries +
+ xen_e820_table.nr_entries;
+ xen_e820_table.nr_entries++;
+
+ /* Fill new entry (keep size and page offset). */
+ entry->type = swap_entry->type;
+ entry->addr = entry_end - swap_size +
+ swap_addr - swap_entry->addr;
+ entry->size = swap_entry->size;
+
+ /* Convert old entry to RAM, align to pages. */
+ swap_entry->type = E820_TYPE_RAM;
+ swap_entry->addr = swap_addr;
+ swap_entry->size = swap_size;
+
+ /* Remember PFN<->MFN relation for P2M update. */
+ remap->maddr = swap_addr;
+ remap->size = swap_size;
+ remap->paddr = entry_end - swap_size;
+ nr_nonram_remap++;
+
+ /* Order E820 table and merge entries. */
+ e820__update_table(&xen_e820_table);
+
+ return;
+ }
+
+ entry++;
+ }
+
+ xen_raw_console_write("No suitable area found for required E820 entry remapping action\n");
+ BUG();
+}
+
+/*
+ * Look for non-RAM memory types in a specific guest physical area and move
+ * those away if possible (ACPI NVS only for now).
+ */
+static void __init xen_e820_resolve_conflicts(phys_addr_t start,
+ phys_addr_t size)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return;
+
+ end = start + size;
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ if (entry->addr >= end)
+ return;
+
+ if (entry->addr + entry->size > start &&
+ entry->type == E820_TYPE_NVS)
+ xen_e820_swap_entry_with_ram(entry);
+
+ entry++;
+ }
+}
+
/*
* Check for an area in physical memory to be usable for non-movable purposes.
- * An area is considered to usable if the used E820 map lists it to be RAM.
+ * An area is considered usable if the used E820 map lists it to be RAM or
+ * some other type which can be moved to higher PFNs while keeping the MFNs.
* In case the area is not usable, crash the system with an error message.
*/
void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
const char *component)
{
+ xen_e820_resolve_conflicts(start, size);
+
if (!xen_is_e820_reserved(start, size))
return;
--
2.43.0
On 07.08.2024 12:41, Juergen Gross wrote: > In order to minimize required special handling for running as Xen PV > dom0, the memory layout is modified to match that of the host. This > requires to have only RAM at the locations where Xen allocated memory > is living. Unfortunately there seem to be some machines, where ACPI > NVS is located at 64 MB, resulting in a conflict with the loaded > kernel or the initial page tables built by Xen. > > As ACPI NVS needs to be accessed by the kernel only for saving and > restoring it across suspend operations, it can be relocated in the > dom0's memory map by swapping it with unused RAM (this is possible > via modification of the dom0 P2M map). While the kernel may not (directly) need to access it for other purposes, what about AML accessing it? As you can't advertise the movement to ACPI, and as non-RAM mappings are carried out by drivers/acpi/osl.c:acpi_map() using acpi_os_ioremap(), phys-to-machine translations won't cover for that (unless I'm overlooking something, which unfortunately seems like I might be). Even without that I wonder in how far tools (kexec?) may be misguided by relocating non-RAM memory ranges. Even more so when stuff traditionally living below the 4G boundary suddenly moves far beyond that. Jan
On 07.08.2024 14:05, Jan Beulich wrote: > On 07.08.2024 12:41, Juergen Gross wrote: >> In order to minimize required special handling for running as Xen PV >> dom0, the memory layout is modified to match that of the host. This >> requires to have only RAM at the locations where Xen allocated memory >> is living. Unfortunately there seem to be some machines, where ACPI >> NVS is located at 64 MB, resulting in a conflict with the loaded >> kernel or the initial page tables built by Xen. >> >> As ACPI NVS needs to be accessed by the kernel only for saving and >> restoring it across suspend operations, it can be relocated in the >> dom0's memory map by swapping it with unused RAM (this is possible >> via modification of the dom0 P2M map). > > While the kernel may not (directly) need to access it for other purposes, > what about AML accessing it? As you can't advertise the movement to ACPI, > and as non-RAM mappings are carried out by drivers/acpi/osl.c:acpi_map() > using acpi_os_ioremap(), phys-to-machine translations won't cover for > that (unless I'm overlooking something, which unfortunately seems like I > might be). Thinking some more about this, the above may be pointing in the wrong direction. If from acpi_os_ioremap() downwards no address translation (PFN -> MFN) occurred, then what's coming from AML would still be handled correctly as far as page table entries go. The problem then might instead be that the mapping would appear to be covering RAM, not the ACPI NVS region (and there may be checks for that). Jan
On 09.08.24 11:45, Jan Beulich wrote: > On 07.08.2024 14:05, Jan Beulich wrote: >> On 07.08.2024 12:41, Juergen Gross wrote: >>> In order to minimize required special handling for running as Xen PV >>> dom0, the memory layout is modified to match that of the host. This >>> requires to have only RAM at the locations where Xen allocated memory >>> is living. Unfortunately there seem to be some machines, where ACPI >>> NVS is located at 64 MB, resulting in a conflict with the loaded >>> kernel or the initial page tables built by Xen. >>> >>> As ACPI NVS needs to be accessed by the kernel only for saving and >>> restoring it across suspend operations, it can be relocated in the >>> dom0's memory map by swapping it with unused RAM (this is possible >>> via modification of the dom0 P2M map). >> >> While the kernel may not (directly) need to access it for other purposes, >> what about AML accessing it? As you can't advertise the movement to ACPI, >> and as non-RAM mappings are carried out by drivers/acpi/osl.c:acpi_map() >> using acpi_os_ioremap(), phys-to-machine translations won't cover for >> that (unless I'm overlooking something, which unfortunately seems like I >> might be). > > Thinking some more about this, the above may be pointing in the wrong > direction. If from acpi_os_ioremap() downwards no address translation > (PFN -> MFN) occurred, then what's coming from AML would still be > handled correctly as far as page table entries go. The problem then > might instead be that the mapping would appear to be covering RAM, not > the ACPI NVS region (and there may be checks for that). All PTE entries written go through the P->M translation. Juergen
On 07.08.24 14:05, Jan Beulich wrote: > On 07.08.2024 12:41, Juergen Gross wrote: >> In order to minimize required special handling for running as Xen PV >> dom0, the memory layout is modified to match that of the host. This >> requires to have only RAM at the locations where Xen allocated memory >> is living. Unfortunately there seem to be some machines, where ACPI >> NVS is located at 64 MB, resulting in a conflict with the loaded >> kernel or the initial page tables built by Xen. >> >> As ACPI NVS needs to be accessed by the kernel only for saving and >> restoring it across suspend operations, it can be relocated in the >> dom0's memory map by swapping it with unused RAM (this is possible >> via modification of the dom0 P2M map). > > While the kernel may not (directly) need to access it for other purposes, > what about AML accessing it? As you can't advertise the movement to ACPI, > and as non-RAM mappings are carried out by drivers/acpi/osl.c:acpi_map() > using acpi_os_ioremap(), phys-to-machine translations won't cover for > that (unless I'm overlooking something, which unfortunately seems like I > might be). Seems I need to hook into acpi_os_ioremap() in order to handle that case. > Even without that I wonder in how far tools (kexec?) may be misguided by > relocating non-RAM memory ranges. Even more so when stuff traditionally > living below the 4G boundary suddenly moves far beyond that. I don't think kexec is working in PV mode. Other tools: time will tell, I guess. I prefer a generally working system with maybe some tools failing over one not booting at all. The only other general solution I could think of would be something similar to vmlinuz on bare metal: a "small" boot code looking for an appropriate location for the kernel and then relocating the "real" kernel accordingly. This would require quite some effort, though, as the boot code would need to create the related page tables and p2m table, too. I'd like to avoid that. Juergen
On 08.08.2024 16:42, Jürgen Groß wrote: > On 07.08.24 14:05, Jan Beulich wrote: >> Even without that I wonder in how far tools (kexec?) may be misguided by >> relocating non-RAM memory ranges. Even more so when stuff traditionally >> living below the 4G boundary suddenly moves far beyond that. > > I don't think kexec is working in PV mode. I wasn't thinking of in-Dom0 kexec, but about host one. Iirc the kexec tool will collect information it needs from various data sources, not uniformly everything from the hypervisor. But I may be wrong with this. Jan
© 2016 - 2024 Red Hat, Inc.