From: Sohil Mehta <sohil.mehta@intel.com>
Separate out the actual vsyscall emulation from the page fault specific
handling in preparation for the upcoming #GP fault emulation.
No functional change intended.
Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
arch/x86/entry/vsyscall/vsyscall_64.c | 42 +++++++++++++++------------
arch/x86/include/asm/vsyscall.h | 8 ++---
arch/x86/mm/fault.c | 2 +-
3 files changed, 29 insertions(+), 23 deletions(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index c9103a6fa06e..f18b08c03553 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -112,30 +112,13 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
}
}
-bool emulate_vsyscall(unsigned long error_code,
- struct pt_regs *regs, unsigned long address)
+static bool __emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
long ret;
unsigned long orig_dx;
- /* Write faults or kernel-privilege faults never get fixed up. */
- if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
- return false;
-
- if (!(error_code & X86_PF_INSTR)) {
- /* Failed vsyscall read */
- if (vsyscall_mode == EMULATE)
- return false;
-
- /*
- * User code tried and failed to read the vsyscall page.
- */
- warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
- return false;
- }
-
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
@@ -270,6 +253,29 @@ bool emulate_vsyscall(unsigned long error_code,
return true;
}
+bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs,
+ unsigned long address)
+{
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ if (!(error_code & X86_PF_INSTR)) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
+ return __emulate_vsyscall(regs, address);
+}
+
/*
* A pseudo VMA to allow ptrace access for the vsyscall page. This only
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index 472f0263dbc6..214977f4fa11 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -14,12 +14,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
* Called on instruction fetch fault in vsyscall page.
* Returns true if handled.
*/
-extern bool emulate_vsyscall(unsigned long error_code,
- struct pt_regs *regs, unsigned long address);
+extern bool emulate_vsyscall_pf(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address);
#else
static inline void map_vsyscall(void) {}
-static inline bool emulate_vsyscall(unsigned long error_code,
- struct pt_regs *regs, unsigned long address)
+static inline bool emulate_vsyscall_pf(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
return false;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 998bd807fc7b..fbcc2da75fd6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1316,7 +1316,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(error_code, regs, address))
+ if (emulate_vsyscall_pf(error_code, regs, address))
return;
}
#endif
--
2.47.2
On 6/20/25 06:53, Kirill A. Shutemov wrote: > +bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs, > + unsigned long address) > +{ > + /* Write faults or kernel-privilege faults never get fixed up. */ > + if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) > + return false; > + > + if (!(error_code & X86_PF_INSTR)) { > + /* Failed vsyscall read */ > + if (vsyscall_mode == EMULATE) > + return false; > + > + /* > + * User code tried and failed to read the vsyscall page. > + */ > + warn_bad_vsyscall(KERN_INFO, regs, > + "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); > + return false; > + } > + > + return __emulate_vsyscall(regs, address); > +} For this patch that just moves the code: Acked-by: Dave Hansen <dave.hansen@linux.intel.com> But, the resulting code is wonky. It needs to do something more like this: if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; if (error_code & X86_PF_INSTR)) return __emulate_vsyscall(regs, address); /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* * User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, ... return false; That's much more linear to read.
> But, the resulting code is wonky. It needs to do something more like this: > > if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) > return false; > > if (error_code & X86_PF_INSTR)) > return __emulate_vsyscall(regs, address); To do this, LASS needs a proper interlink against NX || SMEP. If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, meaning that fetches are reported as plain reads. This leads to some fun corner cases in SMAP and now LASS too for virt. ~Andrew
On 6/20/25 16:08, Andrew Cooper wrote: >> But, the resulting code is wonky. It needs to do something more like this: >> >> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) >> return false; >> >> if (error_code & X86_PF_INSTR)) >> return __emulate_vsyscall(regs, address); > To do this, LASS needs a proper interlink against NX || SMEP. > > If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, > meaning that fetches are reported as plain reads. Interesting point. I think the easiest way to do this is just make a cpuid_deps[] entry for LASS and NX. If there's a CPU where LASS is available but where NX isn't available, we have much bigger problems on our hands.
On Fri, Jun 20, 2025 at 04:21:38PM -0700, Dave Hansen wrote: > On 6/20/25 16:08, Andrew Cooper wrote: > >> But, the resulting code is wonky. It needs to do something more like this: > >> > >> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) > >> return false; > >> > >> if (error_code & X86_PF_INSTR)) > >> return __emulate_vsyscall(regs, address); > > To do this, LASS needs a proper interlink against NX || SMEP. > > > > If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, > > meaning that fetches are reported as plain reads. > Interesting point. > > I think the easiest way to do this is just make a cpuid_deps[] entry for > LASS and NX. If there's a CPU where LASS is available but where NX isn't > available, we have much bigger problems on our hands. I am not sure what I suppose to do here. Sohil pointed out that with LASS we get #GP on vsyscall, not #PF and PFEC is not relevant for LASS. So, IIUC, that's dependency of vsyscall PF on NX. Do we want to disable vsyscall on boot if NX is not available? BTW, why do we even support !NX on X86_64? Is there such HW? -- Kiryl Shutsemau / Kirill A. Shutemov
On 6/23/25 05:41, Kirill A. Shutemov wrote: > So, IIUC, that's dependency of vsyscall PF on NX. Do we want to disable > vsyscall on boot if NX is not available? Well, vsyscall=none can break old userspace, so forcing it on old hardware doesn't seem like a great idea. But, either way, this doesn't really appear to be a LASS issue. This code: > if (!(error_code & X86_PF_INSTR)) { > /* Failed vsyscall read */ > if (vsyscall_mode == EMULATE) > return false; Is really asking the question: Is this #PF from an instruction fetch in the vsyscall page? That _should_ be able to be done by comparing CR2 and regs->rip. In fact, that's done just below anyway: WARN_ON_ONCE(address != regs->ip); So I think we can fix this up with something like the attached patch which just drives the if() from regs->rip and make the warning NX-only. But this code has been like this a long time and I'm 99% sure the x86 selftests poke at all these cases. I'm curious what they do on those old P4's (or a 64-bit VM with NX turned off), but it's not super important either way.
On Mon, Jun 23, 2025 at 08:32:53AM -0700, Dave Hansen wrote: > On 6/23/25 05:41, Kirill A. Shutemov wrote: > > So, IIUC, that's dependency of vsyscall PF on NX. Do we want to disable > > vsyscall on boot if NX is not available? > > Well, vsyscall=none can break old userspace, so forcing it on old > hardware doesn't seem like a great idea. > > But, either way, this doesn't really appear to be a LASS issue. This code: > > > if (!(error_code & X86_PF_INSTR)) { > > /* Failed vsyscall read */ > > if (vsyscall_mode == EMULATE) > > return false; > > Is really asking the question: > > Is this #PF from an instruction fetch in the vsyscall page? > > That _should_ be able to be done by comparing CR2 and regs->rip. In > fact, that's done just below anyway: > > WARN_ON_ONCE(address != regs->ip); > > So I think we can fix this up with something like the attached patch > which just drives the if() from regs->rip and make the warning NX-only. Looks good to me. Do you want me to include it into this patchset or will you apply it separately? -- Kiryl Shutsemau / Kirill A. Shutemov
On 6/24/25 04:37, Kirill A. Shutemov wrote: > Do you want me to include it into this patchset or will you apply it > separately? Actually, if you want to break it out and just submit it separately, I'll probably just apply it.
emulate_vsyscall() expects to see X86_PF_INSTR in PFEC on a vsyscall
page fault, but the CPU does not report X86_PF_INSTR if neither
X86_FEATURE_NX nor X86_FEATURE_SMEP are enabled.
X86_FEATURE_NX should be enabled on nearly all 64-bit CPUs, except for
early P4 processors that did not support this feature.
Instead of explicitly checking for X86_PF_INSTR, compare the fault
address to RIP.
On machines with X86_FEATURE_NX enabled, issue a warning if RIP is equal
to fault address but X86_PF_INSTR is absent.
Originally-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lore.kernel.org/all/bd81a98b-f8d4-4304-ac55-d4151a1a77ab@intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
arch/x86/entry/vsyscall/vsyscall_64.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index c9103a6fa06e..0b0e0283994f 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -124,7 +124,8 @@ bool emulate_vsyscall(unsigned long error_code,
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
- if (!(error_code & X86_PF_INSTR)) {
+ /* Avoid emulation unless userspace was executing from vsyscall page: */
+ if (address != regs->ip) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
@@ -136,13 +137,16 @@ bool emulate_vsyscall(unsigned long error_code,
return false;
}
+
+ /* X86_PF_INSTR is only set when NX is supported: */
+ if (cpu_feature_enabled(X86_FEATURE_NX))
+ WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
+
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
- WARN_ON_ONCE(address != regs->ip);
-
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
--
2.47.2
On 23/06/2025 4:32 pm, Dave Hansen wrote: > On 6/23/25 05:41, Kirill A. Shutemov wrote: >> So, IIUC, that's dependency of vsyscall PF on NX. Do we want to disable >> vsyscall on boot if NX is not available? > Well, vsyscall=none can break old userspace, so forcing it on old > hardware doesn't seem like a great idea. > > But, either way, this doesn't really appear to be a LASS issue. This code: > >> if (!(error_code & X86_PF_INSTR)) { >> /* Failed vsyscall read */ >> if (vsyscall_mode == EMULATE) >> return false; > Is really asking the question: > > Is this #PF from an instruction fetch in the vsyscall page? > > That _should_ be able to be done by comparing CR2 and regs->rip. In > fact, that's done just below anyway: > > WARN_ON_ONCE(address != regs->ip); > > So I think we can fix this up with something like the attached patch > which just drives the if() from regs->rip and make the warning NX-only. Yeah, that looks good. Furthermore, it means that the LASS #GP path (patch 9) will be consistent with this path. (i.e. both doing a regs->rip check.) Patch Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> if that counts for anything. ~Andrew
The following commit has been merged into the x86/entry branch of tip:
Commit-ID: 8ba38a7a9a699905b84fa97578a8291010dec273
Gitweb: https://git.kernel.org/tip/8ba38a7a9a699905b84fa97578a8291010dec273
Author: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
AuthorDate: Tue, 24 Jun 2025 17:59:18 +03:00
Committer: Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Wed, 13 Aug 2025 15:02:12 -07:00
x86/vsyscall: Do not require X86_PF_INSTR to emulate vsyscall
emulate_vsyscall() expects to see X86_PF_INSTR in PFEC on a vsyscall
page fault, but the CPU does not report X86_PF_INSTR if neither
X86_FEATURE_NX nor X86_FEATURE_SMEP are enabled.
X86_FEATURE_NX should be enabled on nearly all 64-bit CPUs, except for
early P4 processors that did not support this feature.
Instead of explicitly checking for X86_PF_INSTR, compare the fault
address to RIP.
On machines with X86_FEATURE_NX enabled, issue a warning if RIP is equal
to fault address but X86_PF_INSTR is absent.
[ dhansen: flesh out code comments ]
Originally-by: Dave Hansen <dave.hansen@intel.com>
Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Link: https://lore.kernel.org/all/bd81a98b-f8d4-4304-ac55-d4151a1a77ab@intel.com
Link: https://lore.kernel.org/all/20250624145918.2720487-1-kirill.shutemov%40linux.intel.com
---
arch/x86/entry/vsyscall/vsyscall_64.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index c9103a6..6e6c0a7 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -124,7 +124,12 @@ bool emulate_vsyscall(unsigned long error_code,
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
- if (!(error_code & X86_PF_INSTR)) {
+ /*
+ * Assume that faults at regs->ip are because of an
+ * instruction fetch. Return early and avoid
+ * emulation for faults during data accesses:
+ */
+ if (address != regs->ip) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
@@ -137,12 +142,18 @@ bool emulate_vsyscall(unsigned long error_code,
}
/*
+ * X86_PF_INSTR is only set when NX is supported. When
+ * available, use it to double-check that the emulation code
+ * is only being used for instruction fetches:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_NX))
+ WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
+
+ /*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
- WARN_ON_ONCE(address != regs->ip);
-
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
On 23/06/2025 1:41 pm, Kirill A. Shutemov wrote: > On Fri, Jun 20, 2025 at 04:21:38PM -0700, Dave Hansen wrote: >> On 6/20/25 16:08, Andrew Cooper wrote: >>>> But, the resulting code is wonky. It needs to do something more like this: >>>> >>>> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) >>>> return false; >>>> >>>> if (error_code & X86_PF_INSTR)) >>>> return __emulate_vsyscall(regs, address); >>> To do this, LASS needs a proper interlink against NX || SMEP. >>> >>> If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, >>> meaning that fetches are reported as plain reads. >> Interesting point. >> >> I think the easiest way to do this is just make a cpuid_deps[] entry for >> LASS and NX. If there's a CPU where LASS is available but where NX isn't >> available, we have much bigger problems on our hands. > I am not sure what I suppose to do here. > > Sohil pointed out that with LASS we get #GP on vsyscall, not #PF and PFEC > is not relevant for LASS. Correct. That was my mistake originally. > > So, IIUC, that's dependency of vsyscall PF on NX. Do we want to disable > vsyscall on boot if NX is not available? > > BTW, why do we even support !NX on X86_64? Is there such HW? Yes. Early P4 steppings had no NX at all. ~Andrew
On June 20, 2025 4:21:38 PM PDT, Dave Hansen <dave.hansen@intel.com> wrote: >On 6/20/25 16:08, Andrew Cooper wrote: >>> But, the resulting code is wonky. It needs to do something more like this: >>> >>> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) >>> return false; >>> >>> if (error_code & X86_PF_INSTR)) >>> return __emulate_vsyscall(regs, address); >> To do this, LASS needs a proper interlink against NX || SMEP. >> >> If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, >> meaning that fetches are reported as plain reads. >Interesting point. > >I think the easiest way to do this is just make a cpuid_deps[] entry for >LASS and NX. If there's a CPU where LASS is available but where NX isn't >available, we have much bigger problems on our hands. Indeed. There is exactly zero reason to support this case.
On 6/20/2025 4:08 PM, Andrew Cooper wrote: >> But, the resulting code is wonky. It needs to do something more like this: >> >> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) >> return false; >> >> if (error_code & X86_PF_INSTR)) >> return __emulate_vsyscall(regs, address); > > To do this, LASS needs a proper interlink against NX || SMEP. > > If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, > meaning that fetches are reported as plain reads. > > This leads to some fun corner cases in SMAP and now LASS too for virt. Maybe I am missing something, but LASS works pre-paging so it wouldn't generate a PF, right? We have a new vsyscall emulation for #GP (patch 9) that wouldn't follow this path.
On 21/06/2025 12:18 am, Sohil Mehta wrote: > On 6/20/2025 4:08 PM, Andrew Cooper wrote: >>> But, the resulting code is wonky. It needs to do something more like this: >>> >>> if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) >>> return false; >>> >>> if (error_code & X86_PF_INSTR)) >>> return __emulate_vsyscall(regs, address); >> To do this, LASS needs a proper interlink against NX || SMEP. >> >> If neither NX nor SMEP are active, the CPU does not report X86_PF_INSTR, >> meaning that fetches are reported as plain reads. >> >> This leads to some fun corner cases in SMAP and now LASS too for virt. > Maybe I am missing something, but LASS works pre-paging so it wouldn't > generate a PF, right? Oh right, yes. This is a preexisting bug in vsyscall #PF handling. It simply became obvious with Dave's suggested rearrangement. ~Andrew
© 2016 - 2025 Red Hat, Inc.