From nobody Tue Feb 10 04:03:26 2026 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E227C190477 for ; Mon, 25 Nov 2024 10:08:32 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1732529314; cv=none; b=SUu0BRcwgDiqx9QKjXmmb8F4W4hLQQvB0x16gzpOGBn+I0nQCjvIIn/e0DnrWY32JPVBTO/35VCr/qO8M0+bXJP8UHx1EeILnnjx8jsNLAFwKit/Y9CISlEUYeEoVNpswE7vxVm/yL9OlZJyw/rH+nPWNfYrC07M0WEhGR8gyIU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1732529314; c=relaxed/simple; bh=rYKDSwIIsHtQF+bMMhhkGfS4OJ5i6kPPKGKgXYrFnLk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JX4+icFBudSl3Qh7T1OhRrKuH4oqY9dVTfEvlZqY8z3isxyha41oymLUVdN7MY5XkrYx4/Xljbu/YPfj5llVAqEOe4ap2S2IH3dJB8d9p1y2XdIn/xzrG4wV6uWu3u17OOcd61DOdWG8deft8CCPolcg2mIpXBdomA6L18F4PgM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=casper.srs.infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=sXbVyejq; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=casper.srs.infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="sXbVyejq" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Sender:Content-Transfer-Encoding: MIME-Version:References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From: Reply-To:Content-Type:Content-ID:Content-Description; 
bh=VikHqdMQVVh2ZNQodkzQFhw0qQ4X8ckzUTlNk/VeSq0=; b=sXbVyejqtKyciEp1oR3xbL3soU GovLjDnTwdmK6ZCcnmYKgy7A4wVV+UYj/GsyBJhrv5FURbQAwgnmnkR44VnxEfqiiz1tDJUul4MU1 WU1Jx2sLkcpJ7ziWp5KMrOJzd3c4sAjfRpTwg9nAfXT8Na65aCJCgKC8cY7cNrz5wwIRt/uGEv+P/ iDtuYD/ljn/eT3ZqiWY4M6faD73nCDH8/8QTleOaDUrv+dL8thoLQsWntfKAJZ+fHgtj3VC6CuzRL vlSWFH42UYlN1oXgUMft3MuzMB40iuSjTIile9mKJaQGxFy2E4pbgXT3Ylsj7ZShzA5g3v21M/zY0 yPVC/tgQ==; Received: from [2001:8b0:10b:1::ebe] (helo=i7.infradead.org) by casper.infradead.org with esmtpsa (Exim 4.98 #2 (Red Hat Linux)) id 1tFW0y-0000000Biqu-3iG9; Mon, 25 Nov 2024 10:08:22 +0000 Received: from dwoodhou by i7.infradead.org with local (Exim 4.98 #2 (Red Hat Linux)) id 1tFW0z-000000000l5-1yeC; Mon, 25 Nov 2024 10:08:21 +0000 From: David Woodhouse To: kexec@lists.infradead.org Cc: Thomas Gleixner , Ingo Molnar , Borislav Petkov , Dave Hansen , x86@kernel.org, "H. Peter Anvin" , David Woodhouse , "Kirill A. Shutemov" , Kai Huang , Nikolay Borisov , linux-kernel@vger.kernel.org, Simon Horman , Dave Young , Peter Zijlstra , jpoimboe@kernel.org, bsz@amazon.de Subject: [RFC PATCH v3 06/20] x86/kexec: Allocate PGD for x86_64 transition page tables separately Date: Mon, 25 Nov 2024 09:54:36 +0000 Message-ID: <20241125100815.2512-7-dwmw2@infradead.org> X-Mailer: git-send-email 2.47.0 In-Reply-To: <20241125100815.2512-1-dwmw2@infradead.org> References: <20241125100815.2512-1-dwmw2@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Sender: David Woodhouse X-SRS-Rewrite: SMTP reverse-path rewritten from by casper.infradead.org. See http://www.infradead.org/rpr.html Content-Type: text/plain; charset="utf-8" From: David Woodhouse There's no good reason for this to be part of the control_code_page; just allocate it separately on x86_64 like i386 does. 
Signed-off-by: David Woodhouse --- arch/x86/include/asm/kexec.h | 18 ++++++++--- arch/x86/kernel/machine_kexec_64.c | 49 ++++++++++++++++-------------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index ae5482a2f0ca..ccb8ff37fa9d 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -16,6 +16,7 @@ # define PAGES_NR 4 #endif =20 +# define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 =20 #ifndef __ASSEMBLY__ @@ -43,7 +44,6 @@ struct kimage; /* Maximum address we can use for the control code buffer */ # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE =20 -# define KEXEC_CONTROL_PAGE_SIZE 4096 =20 /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_386 @@ -58,9 +58,6 @@ struct kimage; /* Maximum address we can use for the control pages */ # define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) =20 -/* Allocate one page for the pdp and the second for the code */ -# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) - /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif @@ -145,6 +142,19 @@ struct kimage_arch { }; #else struct kimage_arch { + /* + * This is a kimage control page, as it must not overlap with either + * source or destination address ranges. + */ + pgd_t *pgd; + /* + * The virtual mapping of the control code page itself is used only + * during the transition, while the current kernel's pages are all + * in place. Thus the intermediate page table pages used to map it + * are not control pages, but instead just normal pages obtained + * with get_zeroed_page(). They have to be tracked (below) so that + * they can be freed. 
+ */ p4d_t *p4d; pud_t *pud; pmd_t *pmd; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_k= exec_64.c index b9b6243ee223..c9ae65c9a27c 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *imag= e) image->arch.pte =3D NULL; } =20 -static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, + unsigned long control_page) { pgprot_t prot =3D PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; @@ -157,7 +158,7 @@ static int init_transition_pgtable(struct kimage *image= , pgd_t *pgd) pte_t *pte; =20 vaddr =3D (unsigned long)relocate_kernel; - paddr =3D __pa(page_address(image->control_code_page)+PAGE_SIZE); + paddr =3D control_page; pgd +=3D pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d =3D (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -216,7 +217,7 @@ static void *alloc_pgt_page(void *data) return p; } =20 -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +static int init_pgtable(struct kimage *image, unsigned long control_page) { struct x86_mapping_info info =3D { .alloc_pgt_page =3D alloc_pgt_page, @@ -225,12 +226,12 @@ static int init_pgtable(struct kimage *image, unsigne= d long start_pgtable) .kernpg_flag =3D _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; - pgd_t *level4p; int result; int i; =20 - level4p =3D (pgd_t *)__va(start_pgtable); - clear_page(level4p); + image->arch.pgd =3D alloc_pgt_page(image); + if (!image->arch.pgd) + return -ENOMEM; =20 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |=3D _PAGE_ENC; @@ -241,8 +242,8 @@ static int init_pgtable(struct kimage *image, unsigned = long start_pgtable) info.direct_gbpages =3D true; =20 /* Ensure the control code page itself is in the direct map */ - result =3D kernel_ident_mapping_init(&info, level4p, start_pgtable + PAGE= _SIZE, - start_pgtable + 
KEXEC_CONTROL_CODE_MAX_SIZE); + result =3D kernel_ident_mapping_init(&info, image->arch.pgd, control_page, + control_page + KEXEC_CONTROL_CODE_MAX_SIZE); if (result) return result; =20 @@ -250,8 +251,8 @@ static int init_pgtable(struct kimage *image, unsigned = long start_pgtable) mstart =3D pfn_mapped[i].start << PAGE_SHIFT; mend =3D pfn_mapped[i].end << PAGE_SHIFT; =20 - result =3D kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result =3D kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; } @@ -266,8 +267,8 @@ static int init_pgtable(struct kimage *image, unsigned = long start_pgtable) mstart =3D image->segment[i].mem; mend =3D mstart + image->segment[i].memsz; =20 - result =3D kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result =3D kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); =20 if (result) return result; @@ -277,15 +278,19 @@ static int init_pgtable(struct kimage *image, unsigne= d long start_pgtable) * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ - result =3D map_efi_systab(&info, level4p); + result =3D map_efi_systab(&info, image->arch.pgd); if (result) return result; =20 - result =3D map_acpi_tables(&info, level4p); + result =3D map_acpi_tables(&info, image->arch.pgd); if (result) return result; =20 - return init_transition_pgtable(image, level4p); + /* + * This must be last because the intermediate page table pages it + * allocates will not be control pages and may overlap the image. 
+ */ + return init_transition_pgtable(image, image->arch.pgd, control_page); } =20 static void load_segments(void) @@ -302,14 +307,14 @@ static void load_segments(void) =20 int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable; + unsigned long control_page; int result; =20 /* Calculate the offsets */ - start_pgtable =3D page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_page =3D page_to_pfn(image->control_code_page) << PAGE_SHIFT; =20 /* Setup the identity mapped 64bit page table */ - result =3D init_pgtable(image, start_pgtable); + result =3D init_pgtable(image, control_page); if (result) return result; =20 @@ -363,13 +368,12 @@ void machine_kexec(struct kimage *image) #endif } =20 - control_page =3D page_address(image->control_code_page) + PAGE_SIZE; + control_page =3D page_address(image->control_code_page); __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); =20 page_list[PA_CONTROL_PAGE] =3D virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] =3D (unsigned long)control_page; - page_list[PA_TABLE_PAGE] =3D - (unsigned long)__pa(page_address(image->control_code_page)); + page_list[PA_TABLE_PAGE] =3D (unsigned long)__pa(image->arch.pgd); =20 if (image->type =3D=3D KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] =3D (page_to_pfn(image->swap_page) @@ -579,8 +583,7 @@ static void kexec_mark_crashkres(bool protect) =20 /* Don't touch the control code page used in crash_kexec().*/ control =3D PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); - /* Control code page is located in the 2nd page. */ - kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + kexec_mark_range(crashk_res.start, control - 1, protect); control +=3D KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } --=20 2.47.0