Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++-
arch/x86/entry/entry_64.S | 55 ++--------------------------------
arch/x86/include/asm/syscall.h | 2 +-
3 files changed, 52 insertions(+), 55 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..afe79c3f1c5b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
+ long rip;
+ unsigned int shift_rip;
+
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
instrumentation_end();
syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+ shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1);
+ rip = (long) regs->ip;
+ rip <<= shift_rip;
+ rip >>= shift_rip;
+ if (unlikely((unsigned long) rip != regs->ip))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c01776a51545..b1288e22cae8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -123,60 +123,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
- * In the Xen PV case we must use iret anyway.
*/
-
- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ testb %al, %al
+ jz swapgs_restore_regs_and_return_to_usermode
/*
* We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 4fb36fba4b5a..be6c5515e0b9 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);
#endif /* CONFIG_X86_32 */
--
2.41.0
Hi, On 18.7.2023 16.44, Brian Gerst wrote: > Signed-off-by: Brian Gerst <brgerst@gmail.com> > --- > arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++- > arch/x86/entry/entry_64.S | 55 ++-------------------------------- > arch/x86/include/asm/syscall.h | 2 +- > 3 files changed, 52 insertions(+), 55 deletions(-) > > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c > index 6c2826417b33..afe79c3f1c5b 100644 > --- a/arch/x86/entry/common.c > +++ b/arch/x86/entry/common.c > @@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) > return false; > } > > -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > +/* Returns true to return using SYSRET, or false to use IRET */ > +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) > { > + long rip; > + unsigned int shift_rip; > + > add_random_kstack_offset(); > nr = syscall_enter_from_user_mode(regs, nr); > > @@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > > instrumentation_end(); > syscall_exit_to_user_mode(regs); > + > + /* > + * Check that the register state is valid for using SYSRET to exit > + * to userspace. Otherwise use the slower but fully capable IRET > + * exit path. > + */ > + > + /* XEN PV guests always use IRET path */ > + if (cpu_feature_enabled(X86_FEATURE_XENPV)) > + return false; > + > + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ > + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) > + return false; > + > + /* CS and SS must match the values set in MSR_STAR */ > + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) > + return false; > + > + /* > + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP > + * in kernel space. This essentially lets the user take over > + * the kernel, since userspace controls RSP. > + * > + * Change top bits to match most significant bit (47th or 56th bit > + * depending on paging mode) in the address. > + */ > + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); Should this be: shift_rip = (64 - __VIRTUAL_MASK_SHIFT - 1); ? > + rip = (long) regs->ip; > + rip <<= shift_rip; > + rip >>= shift_rip; > + if (unlikely((unsigned long) rip != regs->ip)) > + return false; > + > + /* > + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, > + * restoring TF results in a trap from userspace immediately after > + * SYSRET. > + */ > + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) > + return false; > + > + /* Use SYSRET to exit to userspace */ > + return true; > } > #endif > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S > index c01776a51545..b1288e22cae8 100644 > --- a/arch/x86/entry/entry_64.S > +++ b/arch/x86/entry/entry_64.S > @@ -123,60 +123,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) > * Try to use SYSRET instead of IRET if we're returning to > * a completely clean 64-bit userspace context. If we're not, > * go to the slow exit path. > - * In the Xen PV case we must use iret anyway. > */ > - > - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ > - X86_FEATURE_XENPV > - > - movq RCX(%rsp), %rcx > - movq RIP(%rsp), %r11 > - > - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ > - jne swapgs_restore_regs_and_return_to_usermode > - > - /* > - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP > - * in kernel space. This essentially lets the user take over > - * the kernel, since userspace controls RSP. > - * > - * If width of "canonical tail" ever becomes variable, this will need > - * to be updated to remain correct on both old and new CPUs. > - * > - * Change top bits to match most significant bit (47th or 56th bit > - * depending on paging mode) in the address. > - */ > -#ifdef CONFIG_X86_5LEVEL > - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ > - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 > -#else > - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx > - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx > -#endif > - > - /* If this changed %rcx, it was not canonical */ > - cmpq %rcx, %r11 > - jne swapgs_restore_regs_and_return_to_usermode > - > - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ > - jne swapgs_restore_regs_and_return_to_usermode > - > - movq R11(%rsp), %r11 > - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ > - jne swapgs_restore_regs_and_return_to_usermode > - > - /* > - * SYSRET cannot restore RF. It can restore TF, but unlike IRET, > - * restoring TF results in a trap from userspace immediately after > - * SYSRET. > - */ > - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 > - jnz swapgs_restore_regs_and_return_to_usermode > - > - /* nothing to check for RSP */ > - > - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ > - jne swapgs_restore_regs_and_return_to_usermode > + testb %al, %al > + jz swapgs_restore_regs_and_return_to_usermode > > /* > * We win! This label is here just for ease of understanding > diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h > index 4fb36fba4b5a..be6c5515e0b9 100644 > --- a/arch/x86/include/asm/syscall.h > +++ b/arch/x86/include/asm/syscall.h > @@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task) > ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; > } > > -void do_syscall_64(struct pt_regs *regs, int nr); > +bool do_syscall_64(struct pt_regs *regs, int nr); > > #endif /* CONFIG_X86_32 */ > --Mika
On Tue, Jul 18, 2023 at 10:17 AM Mika Penttilä <mpenttil@redhat.com> wrote: > > Hi, > > > On 18.7.2023 16.44, Brian Gerst wrote: > > Signed-off-by: Brian Gerst <brgerst@gmail.com> > > --- > > arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++- > > arch/x86/entry/entry_64.S | 55 ++-------------------------------- > > arch/x86/include/asm/syscall.h | 2 +- > > 3 files changed, 52 insertions(+), 55 deletions(-) > > > > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c > > index 6c2826417b33..afe79c3f1c5b 100644 > > --- a/arch/x86/entry/common.c > > +++ b/arch/x86/entry/common.c > > @@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) > > return false; > > } > > > > -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > > +/* Returns true to return using SYSRET, or false to use IRET */ > > +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) > > { > > + long rip; > > + unsigned int shift_rip; > > + > > add_random_kstack_offset(); > > nr = syscall_enter_from_user_mode(regs, nr); > > > > @@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > > > > instrumentation_end(); > > syscall_exit_to_user_mode(regs); > > + > > + /* > > + * Check that the register state is valid for using SYSRET to exit > > + * to userspace. Otherwise use the slower but fully capable IRET > > + * exit path. > > + */ > > + > > + /* XEN PV guests always use IRET path */ > > + if (cpu_feature_enabled(X86_FEATURE_XENPV)) > > + return false; > > + > > + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ > > + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) > > + return false; > > + > > + /* CS and SS must match the values set in MSR_STAR */ > > + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) > > + return false; > > + > > + /* > > + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP > > + * in kernel space. This essentially lets the user take over > > + * the kernel, since userspace controls RSP. > > + * > > + * Change top bits to match most significant bit (47th or 56th bit > > + * depending on paging mode) in the address. > > + */ > > + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); > > Should this be: > > shift_rip = (64 - __VIRTUAL_MASK_SHIFT - 1); > ? I removed a set of parentheses, which switched the sign from -1 to +1. I could put it back if that's less confusing. Brian Gerst
On 18.7.2023 17.25, Brian Gerst wrote: > On Tue, Jul 18, 2023 at 10:17 AM Mika Penttilä <mpenttil@redhat.com> wrote: >> >> Hi, >> >> >> On 18.7.2023 16.44, Brian Gerst wrote: >>> Signed-off-by: Brian Gerst <brgerst@gmail.com> >>> --- >>> arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++- >>> arch/x86/entry/entry_64.S | 55 ++-------------------------------- >>> arch/x86/include/asm/syscall.h | 2 +- >>> 3 files changed, 52 insertions(+), 55 deletions(-) >>> >>> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c >>> index 6c2826417b33..afe79c3f1c5b 100644 >>> --- a/arch/x86/entry/common.c >>> +++ b/arch/x86/entry/common.c >>> @@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) >>> return false; >>> } >>> >>> -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) >>> +/* Returns true to return using SYSRET, or false to use IRET */ >>> +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) >>> { >>> + long rip; >>> + unsigned int shift_rip; >>> + >>> add_random_kstack_offset(); >>> nr = syscall_enter_from_user_mode(regs, nr); >>> >>> @@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) >>> >>> instrumentation_end(); >>> syscall_exit_to_user_mode(regs); >>> + >>> + /* >>> + * Check that the register state is valid for using SYSRET to exit >>> + * to userspace. Otherwise use the slower but fully capable IRET >>> + * exit path. >>> + */ >>> + >>> + /* XEN PV guests always use IRET path */ >>> + if (cpu_feature_enabled(X86_FEATURE_XENPV)) >>> + return false; >>> + >>> + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ >>> + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) >>> + return false; >>> + >>> + /* CS and SS must match the values set in MSR_STAR */ >>> + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) >>> + return false; >>> + >>> + /* >>> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP >>> + * in kernel space. This essentially lets the user take over >>> + * the kernel, since userspace controls RSP. >>> + * >>> + * Change top bits to match most significant bit (47th or 56th bit >>> + * depending on paging mode) in the address. >>> + */ >>> + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); >> >> Should this be: >> >> shift_rip = (64 - __VIRTUAL_MASK_SHIFT - 1); >> ? > > I removed a set of parentheses, which switched the sign from -1 to +1. > I could put it back if that's less confusing. > I mean isn't it supposed to be: shift_rip = (64 - 48) for 4 level, now it's shift_rip = (64 - 46) __VIRTUAL_MASK_SHIFT == 47 > Brian Gerst >
On Tue, Jul 18, 2023 at 10:49 AM Mika Penttilä <mpenttil@redhat.com> wrote: > > > > On 18.7.2023 17.25, Brian Gerst wrote: > > On Tue, Jul 18, 2023 at 10:17 AM Mika Penttilä <mpenttil@redhat.com> wrote: > >> > >> Hi, > >> > >> > >> On 18.7.2023 16.44, Brian Gerst wrote: > >>> Signed-off-by: Brian Gerst <brgerst@gmail.com> > >>> --- > >>> arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++- > >>> arch/x86/entry/entry_64.S | 55 ++-------------------------------- > >>> arch/x86/include/asm/syscall.h | 2 +- > >>> 3 files changed, 52 insertions(+), 55 deletions(-) > >>> > >>> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c > >>> index 6c2826417b33..afe79c3f1c5b 100644 > >>> --- a/arch/x86/entry/common.c > >>> +++ b/arch/x86/entry/common.c > >>> @@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) > >>> return false; > >>> } > >>> > >>> -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > >>> +/* Returns true to return using SYSRET, or false to use IRET */ > >>> +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) > >>> { > >>> + long rip; > >>> + unsigned int shift_rip; > >>> + > >>> add_random_kstack_offset(); > >>> nr = syscall_enter_from_user_mode(regs, nr); > >>> > >>> @@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > >>> > >>> instrumentation_end(); > >>> syscall_exit_to_user_mode(regs); > >>> + > >>> + /* > >>> + * Check that the register state is valid for using SYSRET to exit > >>> + * to userspace. Otherwise use the slower but fully capable IRET > >>> + * exit path. > >>> + */ > >>> + > >>> + /* XEN PV guests always use IRET path */ > >>> + if (cpu_feature_enabled(X86_FEATURE_XENPV)) > >>> + return false; > >>> + > >>> + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ > >>> + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) > >>> + return false; > >>> + > >>> + /* CS and SS must match the values set in MSR_STAR */ > >>> + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) > >>> + return false; > >>> + > >>> + /* > >>> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP > >>> + * in kernel space. This essentially lets the user take over > >>> + * the kernel, since userspace controls RSP. > >>> + * > >>> + * Change top bits to match most significant bit (47th or 56th bit > >>> + * depending on paging mode) in the address. > >>> + */ > >>> + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); > >> > >> Should this be: > >> > >> shift_rip = (64 - __VIRTUAL_MASK_SHIFT - 1); > >> ? > > > > I removed a set of parentheses, which switched the sign from -1 to +1. > > I could put it back if that's less confusing. > > > > I mean isn't it supposed to be: > shift_rip = (64 - 48) for 4 level, now it's > shift_rip = (64 - 46) > > __VIRTUAL_MASK_SHIFT == 47 Original: (64 - (47 + 1)) = (64 - 48) = 16 c5: 48 c1 e1 10 shl $0x10,%rcx c9: 48 c1 f9 10 sar $0x10,%rcx New: (64 - 47 - 1) = (17 - 1) = 16 18b: b9 10 00 00 00 mov $0x10,%ecx 193: 48 d3 e2 shl %cl,%rdx 196: 48 d3 fa sar %cl,%rdx Anyways, I'll switch it back to the original formula. I'm not going to argue any more about basic math. Brian Gerst
On Tue, Jul 18, 2023 at 11:21 AM Brian Gerst <brgerst@gmail.com> wrote: > > On Tue, Jul 18, 2023 at 10:49 AM Mika Penttilä <mpenttil@redhat.com> wrote: > > > > > > > > On 18.7.2023 17.25, Brian Gerst wrote: > > > On Tue, Jul 18, 2023 at 10:17 AM Mika Penttilä <mpenttil@redhat.com> wrote: > > >> > > >> Hi, > > >> > > >> > > >> On 18.7.2023 16.44, Brian Gerst wrote: > > >>> Signed-off-by: Brian Gerst <brgerst@gmail.com> > > >>> --- > > >>> arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++++- > > >>> arch/x86/entry/entry_64.S | 55 ++-------------------------------- > > >>> arch/x86/include/asm/syscall.h | 2 +- > > >>> 3 files changed, 52 insertions(+), 55 deletions(-) > > >>> > > >>> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c > > >>> index 6c2826417b33..afe79c3f1c5b 100644 > > >>> --- a/arch/x86/entry/common.c > > >>> +++ b/arch/x86/entry/common.c > > >>> @@ -70,8 +70,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) > > >>> return false; > > >>> } > > >>> > > >>> -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > > >>> +/* Returns true to return using SYSRET, or false to use IRET */ > > >>> +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) > > >>> { > > >>> + long rip; > > >>> + unsigned int shift_rip; > > >>> + > > >>> add_random_kstack_offset(); > > >>> nr = syscall_enter_from_user_mode(regs, nr); > > >>> > > >>> @@ -84,6 +88,50 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) > > >>> > > >>> instrumentation_end(); > > >>> syscall_exit_to_user_mode(regs); > > >>> + > > >>> + /* > > >>> + * Check that the register state is valid for using SYSRET to exit > > >>> + * to userspace. Otherwise use the slower but fully capable IRET > > >>> + * exit path. > > >>> + */ > > >>> + > > >>> + /* XEN PV guests always use IRET path */ > > >>> + if (cpu_feature_enabled(X86_FEATURE_XENPV)) > > >>> + return false; > > >>> + > > >>> + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ > > >>> + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) > > >>> + return false; > > >>> + > > >>> + /* CS and SS must match the values set in MSR_STAR */ > > >>> + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) > > >>> + return false; > > >>> + > > >>> + /* > > >>> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP > > >>> + * in kernel space. This essentially lets the user take over > > >>> + * the kernel, since userspace controls RSP. > > >>> + * > > >>> + * Change top bits to match most significant bit (47th or 56th bit > > >>> + * depending on paging mode) in the address. > > >>> + */ > > >>> + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); > > >> > > >> Should this be: > > >> > > >> shift_rip = (64 - __VIRTUAL_MASK_SHIFT - 1); > > >> ? > > > > > > I removed a set of parentheses, which switched the sign from -1 to +1. > > > I could put it back if that's less confusing. > > > > > > > I mean isn't it supposed to be: > > shift_rip = (64 - 48) for 4 level, now it's > > shift_rip = (64 - 46) > > > > __VIRTUAL_MASK_SHIFT == 47 My apologies, you were right. I've been sitting on this series for a while and finally got around to posting it and didn't catch that error. > > Original: > (64 - (47 + 1)) = (64 - 48) = 16 > > c5: 48 c1 e1 10 shl $0x10,%rcx > c9: 48 c1 f9 10 sar $0x10,%rcx This was wrong. I hastily compiled this after I had reverted to the original formula. > New: > (64 - 47 - 1) = (17 - 1) = 16 > > 18b: b9 10 00 00 00 mov $0x10,%ecx > 193: 48 d3 e2 shl %cl,%rdx > 196: 48 d3 fa sar %cl,%rdx > > Anyways, I'll switch it back to the original formula. I'm not going > to argue any more about basic math. I'll send a v2 later after any more feedback. Thanks. Brian Gerst
© 2016 - 2025 Red Hat, Inc.