In addition to disabling 32bit syscall interface let's also disable the
ability to run 32bit processes altogether. This is achieved by setting
the GDT_ENTRY_DEFAULT_USER32_CS descriptor to not present which would
cause 32 bit processes to trap with a #NP exception. Furthermore,
forbid loading compat processes as well.
Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
---
arch/x86/include/asm/elf.h | 5 +++--
arch/x86/kernel/cpu/common.c | 8 ++++++++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 18fd06f7936a..406245bc0fb0 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -148,9 +148,10 @@ do { \
#define elf_check_arch(x) \
((x)->e_machine == EM_X86_64)
+extern bool ia32_disabled;
#define compat_elf_check_arch(x) \
- (elf_check_arch_ia32(x) || \
- (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
+ (!ia32_disabled && (elf_check_arch_ia32(x) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)))
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 71f8b55f70c9..ddc301c09419 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2359,6 +2359,11 @@ void microcode_check(struct cpuinfo_x86 *prev_info)
}
#endif
+static void remove_user32cs_from_gdt(void * __unused)
+{
+ get_current_gdt_rw()[GDT_ENTRY_DEFAULT_USER32_CS].p = 0;
+}
+
/*
* Invoked from core CPU hotplug code after hotplug operations
*/
@@ -2368,4 +2373,7 @@ void arch_smt_update(void)
cpu_bugs_smt_update();
/* Check whether IPI broadcasting can be enabled */
apic_smt_update();
+ if (ia32_disabled)
+ on_each_cpu(remove_user32cs_from_gdt, NULL, 1);
+
}
--
2.34.1
On Wed, Jun 7, 2023 at 3:41 AM Nikolay Borisov <nik.borisov@suse.com> wrote:
>
> In addition to disabling 32bit syscall interface let's also disable the
> ability to run 32bit processes altogether. This is achieved by setting
> the GDT_ENTRY_DEFAULT_USER32_CS descriptor to not present which would
> cause 32 bit processes to trap with a #NP exception. Furthermore,
> forbid loading compat processes as well.
>
> Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
> ---
> arch/x86/include/asm/elf.h | 5 +++--
> arch/x86/kernel/cpu/common.c | 8 ++++++++
> 2 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 18fd06f7936a..406245bc0fb0 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -148,9 +148,10 @@ do { \
> #define elf_check_arch(x) \
> ((x)->e_machine == EM_X86_64)
>
> +extern bool ia32_disabled;
> #define compat_elf_check_arch(x) \
> - (elf_check_arch_ia32(x) || \
> - (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
> + (!ia32_disabled && (elf_check_arch_ia32(x) || \
> + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)))
>
> static inline void elf_common_init(struct thread_struct *t,
> struct pt_regs *regs, const u16 ds)
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 71f8b55f70c9..ddc301c09419 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2359,6 +2359,11 @@ void microcode_check(struct cpuinfo_x86 *prev_info)
> }
> #endif
>
> +static void remove_user32cs_from_gdt(void * __unused)
> +{
> + get_current_gdt_rw()[GDT_ENTRY_DEFAULT_USER32_CS].p = 0;
> +}
> +
> /*
> * Invoked from core CPU hotplug code after hotplug operations
> */
> @@ -2368,4 +2373,7 @@ void arch_smt_update(void)
> cpu_bugs_smt_update();
> /* Check whether IPI broadcasting can be enabled */
> apic_smt_update();
> + if (ia32_disabled)
> + on_each_cpu(remove_user32cs_from_gdt, NULL, 1);
> +
> }
Disabling USER32_CS isn't really necessary, as simply running 32-bit
user code (in a normally 64-bit process) doesn't pose much of a threat
to the kernel with 32-bit syscalls disabled.
Wine, for example, is moving to a model where the main code runs in
64-bit mode, uses only 64-bit unix libraries, and the 32->64 bit
transitions are done entirely in userspace. It will still need the
ability to use 32-bit code segments, but won't need 32-bit unix
libraries or syscalls to run 32-bit Windows code.
Brian Gerst
On Wed, Jun 07 2023 at 10:29, Nikolay Borisov wrote:
> In addition to disabling 32bit syscall interface let's also disable the
> ability to run 32bit processes altogether. This is achieved by setting
> the GDT_ENTRY_DEFAULT_USER32_CS descriptor to not present which would
> cause 32 bit processes to trap with a #NP exception. Furthermore,
> forbid loading compat processes as well.
This is obviously the wrong order of things. Prevent loading of compat
processes is the first step, no?
>
> +extern bool ia32_disabled;
> #define compat_elf_check_arch(x) \
So in patch 1 you add the declaration with #ifdef guards and now you add
another one without. Fortunately this is the last patch otherwise we'd
might end up with another incarnation in the next header file.
> - (elf_check_arch_ia32(x) || \
> - (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
> + (!ia32_disabled && (elf_check_arch_ia32(x) || \
> + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)))
If I'm reading this correctly then ia32_disabled also prevents binaries
with X32 ABI to be loaded.
That might be intentional but I'm failing to find any explanation for
this in the changelog.
X32_ABI != IA32_EMULATION
> static inline void elf_common_init(struct thread_struct *t,
> struct pt_regs *regs, const u16 ds)
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 71f8b55f70c9..ddc301c09419 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2359,6 +2359,11 @@ void microcode_check(struct cpuinfo_x86 *prev_info)
> }
> #endif
>
> +static void remove_user32cs_from_gdt(void * __unused)
> +{
> + get_current_gdt_rw()[GDT_ENTRY_DEFAULT_USER32_CS].p = 0;
> +}
> +
> /*
> * Invoked from core CPU hotplug code after hotplug operations
> */
> @@ -2368,4 +2373,7 @@ void arch_smt_update(void)
> cpu_bugs_smt_update();
> /* Check whether IPI broadcasting can be enabled */
> apic_smt_update();
> + if (ia32_disabled)
> + on_each_cpu(remove_user32cs_from_gdt, NULL, 1);
> +
> }
This issues a SMP function call on all online CPUs to set these entries
to 0 on _every_ CPU hotplug operation.
I'm sure there is a reason why these bits need to be cleared over and
over. It's just not obvious to me why clearing them _ONCE_ per boot is
not sufficient. It's neither clear to me why CPU0 must do that ($NCPUS -
1) times, but for the last CPU it's enough to do it once.
That aside, what's the justification for doing this in the first place
and why is this again inconsistent vs. CONFIG_IA32_EMULATION=n?
The changelog is full of void in that aspect.
Thanks,
tglx
On Wed, Jun 07 2023 at 14:01, Thomas Gleixner wrote: > On Wed, Jun 07 2023 at 10:29, Nikolay Borisov wrote: >> @@ -2368,4 +2373,7 @@ void arch_smt_update(void) >> cpu_bugs_smt_update(); >> /* Check whether IPI broadcasting can be enabled */ >> apic_smt_update(); >> + if (ia32_disabled) >> + on_each_cpu(remove_user32cs_from_gdt, NULL, 1); My brain based compiler tells me, that this breaks the 32bit build and the 64bit build with CONFIG_IA32_EMULATION=n. I'm pretty confident that a real compiler will agree.
On 7.06.23 г. 15:01 ч., Thomas Gleixner wrote:
> On Wed, Jun 07 2023 at 10:29, Nikolay Borisov wrote:
>> In addition to disabling 32bit syscall interface let's also disable the
>> ability to run 32bit processes altogether. This is achieved by setting
>> the GDT_ENTRY_DEFAULT_USER32_CS descriptor to not present which would
>> cause 32 bit processes to trap with a #NP exception. Furthermore,
>> forbid loading compat processes as well.
>
> This is obviously the wrong order of things. Prevent loading of compat
> processes is the first step, no?
You mean to change the sequence in which those things are mentioned in
the log?
>
>>
>> +extern bool ia32_disabled;
>> #define compat_elf_check_arch(x) \
>
> So in patch 1 you add the declaration with #ifdef guards and now you add
> another one without. Fortunately this is the last patch otherwise we'd
> might end up with another incarnation in the next header file.
My bad, will fix it.
>
>> - (elf_check_arch_ia32(x) || \
>> - (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
>> + (!ia32_disabled && (elf_check_arch_ia32(x) || \
>> + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)))
>
> If I'm reading this correctly then ia32_disabled also prevents binaries
> with X32 ABI to be loaded.
>
> That might be intentional but I'm failing to find any explanation for
> this in the changelog.
>
> X32_ABI != IA32_EMULATION
Right, however given the other changes (i.e disabling sysenter/int 0x80)
can we really have a working X32 abi when ia32_disabled is true? Now I'm
thinking can we really have IA32_EMULATION && X32_ABI && ia32_disabled,
I guess the answer is no?
>
>> static inline void elf_common_init(struct thread_struct *t,
>> struct pt_regs *regs, const u16 ds)
>> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>> index 71f8b55f70c9..ddc301c09419 100644
>> --- a/arch/x86/kernel/cpu/common.c
>> +++ b/arch/x86/kernel/cpu/common.c
>> @@ -2359,6 +2359,11 @@ void microcode_check(struct cpuinfo_x86 *prev_info)
>> }
>> #endif
>>
>> +static void remove_user32cs_from_gdt(void * __unused)
>> +{
>> + get_current_gdt_rw()[GDT_ENTRY_DEFAULT_USER32_CS].p = 0;
>> +}
>> +
>> /*
>> * Invoked from core CPU hotplug code after hotplug operations
>> */
>> @@ -2368,4 +2373,7 @@ void arch_smt_update(void)
>> cpu_bugs_smt_update();
>> /* Check whether IPI broadcasting can be enabled */
>> apic_smt_update();
>> + if (ia32_disabled)
>> + on_each_cpu(remove_user32cs_from_gdt, NULL, 1);
>> +
>> }
>
> This issues a SMP function call on all online CPUs to set these entries
> to 0 on _every_ CPU hotplug operation.
>
> I'm sure there is a reason why these bits need to be cleared over and
> over. It's just not obvious to me why clearing them _ONCE_ per boot is
> not sufficient. It's neither clear to me why CPU0 must do that ($NCPUS -
> 1) times, but for the last CPU it's enough to do it once.
Actually clearing them once per-cpu is perfectly fine. Looking around
the code i saw arch_smt_update() to be the only place where a
on_each_cpu() call is being made hence I put the code there. Another
aspect I was thinking of is what if a cpu gets onlined at a later stage
and a 32bit process is scheduled on that cpu, if the gdt entry wasn't
cleared on that CPU then it would be possible to run 32bit processes on
it. I guess a better alternative is to use arch_initcall() ?
>
> That aside, what's the justification for doing this in the first place
> and why is this again inconsistent vs. CONFIG_IA32_EMULATION=n?
I'll put it under an ifdef CONFIG_IA32_EMULATION, unfortunately the
traps.h header can't be included in elf.h without causing build breakage.
>
> The changelog is full of void in that aspect.
>
> Thanks,
>
> tglx
>
On Wed, Jun 07 2023 at 15:19, Nikolay Borisov wrote:
> On 7.06.23 г. 15:01 ч., Thomas Gleixner wrote:
>>
>>> - (elf_check_arch_ia32(x) || \
>>> - (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
>>> + (!ia32_disabled && (elf_check_arch_ia32(x) || \
>>> + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)))
>>
>> If I'm reading this correctly then ia32_disabled also prevents binaries
>> with X32 ABI to be loaded.
>>
>> That might be intentional but I'm failing to find any explanation for
>> this in the changelog.
>>
>> X32_ABI != IA32_EMULATION
>
> Right, however given the other changes (i.e disabling sysenter/int 0x80)
> can we really have a working X32 abi when ia32_disabled is true? Now I'm
> thinking can we really have IA32_EMULATION && X32_ABI && ia32_disabled,
> I guess the answer is no?
X32_ABI is completely _independent_ from IA32_EMULATION.
It just shares some of the required compat code, but it does not use
sysenter/int 0x80 at all. It uses the regular 64bit system call.
You can build a working kernel with X32_ABI=y and IA32_EMULATION=n.
So why would boottime disabling of IA32_EMULATION affect X32_ABI in any
way?
>>
>> This issues a SMP function call on all online CPUs to set these entries
>> to 0 on _every_ CPU hotplug operation.
>>
>> I'm sure there is a reason why these bits need to be cleared over and
>> over. It's just not obvious to me why clearing them _ONCE_ per boot is
>> not sufficient. It's neither clear to me why CPU0 must do that ($NCPUS -
>> 1) times, but for the last CPU it's enough to do it once.
>
> Actually clearing them once per-cpu is perfectly fine. Looking around
> the code i saw arch_smt_update() to be the only place where a
> on_each_cpu() call is being made hence I put the code there. Another
> aspect I was thinking of is what if a cpu gets onlined at a later stage
> and a 32bit process is scheduled on that cpu, if the gdt entry wasn't
> cleared on that CPU then it would be possible to run 32bit processes on
> it. I guess a better alternative is to use arch_initcall() ?
Why do you need an on_each_cpu() function call at all? Why would you
need an extra arch_initcall()?
The obvious place to clear this is when a CPU is initialized, no?
>> That aside, what's the justification for doing this in the first place
>> and why is this again inconsistent vs. CONFIG_IA32_EMULATION=n?
>
> I'll put it under an ifdef CONFIG_IA32_EMULATION, unfortunately the
> traps.h header can't be included in elf.h without causing build breakage.
You are not answering my question at all and neither the elf nor the
traps header have anything to do with it. I'm happy to rephrase it:
1) What is the justification for setting the 'present' bit of
GDT_ENTRY_DEFAULT_USER32_CS to 0?
2) Why is #1 inconsistent with CONFIG_IA32_EMULATION=n?
Thanks,
tglx
On 7.06.23 г. 15:53 ч., Thomas Gleixner wrote: > On Wed, Jun 07 2023 at 15:19, Nikolay Borisov wrote: >> On 7.06.23 г. 15:01 ч., Thomas Gleixner wrote: >>> >>>> - (elf_check_arch_ia32(x) || \ >>>> - (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) >>>> + (!ia32_disabled && (elf_check_arch_ia32(x) || \ >>>> + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))) >>> >>> If I'm reading this correctly then ia32_disabled also prevents binaries >>> with X32 ABI to be loaded. >>> >>> That might be intentional but I'm failing to find any explanation for >>> this in the changelog. >>> >>> X32_ABI != IA32_EMULATION >> >> Right, however given the other changes (i.e disabling sysenter/int 0x80) >> can we really have a working X32 abi when ia32_disabled is true? Now I'm >> thinking can we really have IA32_EMULATION && X32_ABI && ia32_disabled, >> I guess the answer is no? > > X32_ABI is completely _independent_ from IA32_EMULATION. > > It just shares some of the required compat code, but it does not use > sysenter/int 0x80 at all. It uses the regular 64bit system call. > > You can build a working kernel with X32_ABI=y and IA32_EMULATION=n. > > So why would boottime disabling of IA32_EMULATION affect X32_ABI in any > way? In this case it shouldn't affect it and the check should be ((elf_check_arch_ia32(x) && !ia32_disabled) || (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)). > >>> >>> This issues a SMP function call on all online CPUs to set these entries >>> to 0 on _every_ CPU hotplug operation. >>> >>> I'm sure there is a reason why these bits need to be cleared over and >>> over. It's just not obvious to me why clearing them _ONCE_ per boot is >>> not sufficient. It's neither clear to me why CPU0 must do that ($NCPUS - >>> 1) times, but for the last CPU it's enough to do it once. >> >> Actually clearing them once per-cpu is perfectly fine. Looking around >> the code i saw arch_smt_update() to be the only place where a >> on_each_cpu() call is being made hence I put the code there. Another >> aspect I was thinking of is what if a cpu gets onlined at a later stage >> and a 32bit process is scheduled on that cpu, if the gdt entry wasn't >> cleared on that CPU then it would be possible to run 32bit processes on >> it. I guess a better alternative is to use arch_initcall() ? > > Why do you need an on_each_cpu() function call at all? Why would you > need an extra arch_initcall()? > > The obvious place to clear this is when a CPU is initialized, no? > >>> That aside, what's the justification for doing this in the first place >>> and why is this again inconsistent vs. CONFIG_IA32_EMULATION=n? >> >> I'll put it under an ifdef CONFIG_IA32_EMULATION, unfortunately the >> traps.h header can't be included in elf.h without causing build breakage. > > You are not answering my question at all and neither the elf nor the > traps header have anything to do with it. I'm happy to rephrase it: > > 1) What is the justification for setting the 'present' bit of > GDT_ENTRY_DEFAULT_USER32_CS to 0? This was something which was suggested by Andrew Cooper on irc, to my understanding the idea is that by not having a 32bit capable descriptor it's impossible to run a 32bit code. I guess the scenario where it might be relevant if someone starts a 64bit process and with inline assembly tries to run 32bit code somehow, though it might be a far fetched example and also the fact that the compat_elf_check_arch() forbids loading 32bit processes might be sufficient. > > 2) Why is #1 inconsistent with CONFIG_IA32_EMULATION=n? Because I forgot doing it. > > Thanks, > > tglx > > >
On Wed, Jun 07 2023 at 16:38, Nikolay Borisov wrote:
> On 7.06.23 г. 15:53 ч., Thomas Gleixner wrote:
>> So why would boottime disabling of IA32_EMULATION affect X32_ABI in any
>> way?
>
> In this case it shouldn't affect it and the check should be
>
> ((elf_check_arch_ia32(x) && !ia32_disabled) ||
> (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)).
Correct.
>> 1) What is the justification for setting the 'present' bit of
>> GDT_ENTRY_DEFAULT_USER32_CS to 0?
>
> This was something which was suggested by Andrew Cooper on irc, to my
> understanding the idea is that by not having a 32bit capable descriptor
> it's impossible to run a 32bit code.
Right, but that's a completely separate change. If it is agreed on then
it needs to be consistent and not depend on this command line parameter.
> I guess the scenario where it might be relevant if someone starts a
> 64bit process and with inline assembly tries to run 32bit code
> somehow, though it might be a far fetched example and also the fact
> that the compat_elf_check_arch() forbids loading 32bit processes might
> be sufficient.
Guesswork is not helpful. Facts matter.
Fact is that today a 64bit application can do a far jump of far call
into a 32bit code segment based on the default descriptors or by setting
them up via LDT. That 32bit code obviously can't do compat syscalls if
IA32_EMULATION is disabled, but other than that it just works.
Whether that makes sense or not is a completely different question.
Ergo fact is that clearing the present bit is a user space visible
change, which can't be done nilly willy and burried into a patch
which is about making CONFIG_IA32_EMULATION a boot time switch.
Thanks,
tglx
On 07/06/2023 3:49 pm, Thomas Gleixner wrote: > On Wed, Jun 07 2023 at 16:38, Nikolay Borisov wrote: >> On 7.06.23 г. 15:53 ч., Thomas Gleixner wrote: >>> 1) What is the justification for setting the 'present' bit of >>> GDT_ENTRY_DEFAULT_USER32_CS to 0? >> This was something which was suggested by Andrew Cooper on irc, to my >> understanding the idea is that by not having a 32bit capable descriptor >> it's impossible to run a 32bit code. > Right, but that's a completely separate change. If it is agreed on then > it needs to be consistent and not depend on this command line parameter. And the recommendation was only for debugging purposes. Segments are not like pagetables. The present bit is the last of the checks, not the first, so you'll e.g. get #GP (bad segment type) ahead of getting #NP[sel]. If you're looking to block 32bit, you should zero the entire GDT entry. That way you get #GP to convert into a signal for userspace, rather than starting down a segment-"paged out" looking route. >> I guess the scenario where it might be relevant if someone starts a >> 64bit process and with inline assembly tries to run 32bit code >> somehow, though it might be a far fetched example and also the fact >> that the compat_elf_check_arch() forbids loading 32bit processes might >> be sufficient. > Guesswork is not helpful. Facts matter. > > Fact is that today a 64bit application can do a far jump of far call > into a 32bit code segment based on the default descriptors or by setting > them up via LDT. That 32bit code obviously can't do compat syscalls if > IA32_EMULATION is disabled, but other than that it just works. > > Whether that makes sense or not is a completely different question. > > Ergo fact is that clearing the present bit is a user space visible > change, which can't be done nilly willy and burried into a patch > which is about making CONFIG_IA32_EMULATION a boot time switch. Removing GDT_ENTRY_DEFAULT_USER32_CS is necessary but not sufficient to block userspace getting into 32bit mode. You also have to block Linux from taking any SYSRETL or SYSEXITL path out of the kernel, as these will load fixed 32bit mode attributes into %cs without reference to the GDT. And you need to prevent any userspace use of the LDT, which might be as simple as just blocking SYS_modify_ldt, but it's been a while since I last looked. ~Andrew
On Wed, Jun 07 2023 at 18:25, Andrew Cooper wrote:
> On 07/06/2023 3:49 pm, Thomas Gleixner wrote:
>> Ergo fact is that clearing the present bit is a user space visible
>> change, which can't be done nilly willy and burried into a patch
>> which is about making CONFIG_IA32_EMULATION a boot time switch.
>
> Removing GDT_ENTRY_DEFAULT_USER32_CS is necessary but not sufficient to
> block userspace getting into 32bit mode.
Correct.
> You also have to block Linux from taking any SYSRETL or SYSEXITL path
> out of the kernel, as these will load fixed 32bit mode attributes into
> %cs without reference to the GDT.
That's non-trivial as there is no way to disable 32bit SYSCALL on AMD
(Intel does not support 32bit syscall and you get #UD if CS.L != 1). So
to be safe you'd need to make ignore_sysret() kill the process w/o
returning to user space.
Though arguably if GDT does not have USER32_CS and LDT is disabled (or
the creation of code segments is blocked) then invoking SYSCALL from
compat mode requires quite some advanced magic (assumed there are no CPU
and kernel bugs), no?
> And you need to prevent any userspace use of the LDT, which might be as
> simple as just blocking SYS_modify_ldt, but it's been a while since I
> last looked.
CONFIG_MODIFY_LDT_SYSCALL=n is the only in kernel option right now, but
that could be made boottime disabled trivially. Extending LDT to reject
the creation of code segments is not rocket science either.
Though the real question is:
What is the benefit of such a change?
So far I haven't seen any argument for that. Maybe there is none :)
Thanks,
tglx
On 07/06/2023 10:52 pm, Thomas Gleixner wrote: > On Wed, Jun 07 2023 at 18:25, Andrew Cooper wrote: >> You also have to block Linux from taking any SYSRETL or SYSEXITL path >> out of the kernel, as these will load fixed 32bit mode attributes into >> %cs without reference to the GDT. > That's non-trivial as there is no way to disable 32bit SYSCALL on AMD > (Intel does not support 32bit syscall and you get #UD if CS.L != 1). So > to be safe you'd need to make ignore_sysret() kill the process w/o > returning to user space. ignore_sysret() desperately needs renaming to entry_SYSCALL32_ignore() or similar. And yes, wiring this into SIGSEGV/etc would be a sensible move. The same applies to 32bit SYSENTER if configured. (Linux doesn't, but other OSes do.) > Though arguably if GDT does not have USER32_CS and LDT is disabled (or > the creation of code segments is blocked) then invoking SYSCALL from > compat mode requires quite some advanced magic (assumed there are no CPU > and kernel bugs), no? Plenty of arcane magic exists. Rowhammer the GDT, or exploit a VMM, SMM or ACM bug (all 3 of which can load segments behind the kernel's back), or Red-Unlock mode which can write the segments registers directly, or you could play with the AVX512 brownout erratum some more - the descriptor L bit is only a few bits along from DPL... But if we're assuming no bugs, then no - I'm not aware of any way of doing this. There are only 4 instructions which can reduce privilege: LRET, IRET, SYSEXIT and SYSRET. >> And you need to prevent any userspace use of the LDT, which might be as >> simple as just blocking SYS_modify_ldt, but it's been a while since I >> last looked. > CONFIG_MODIFY_LDT_SYSCALL=n is the only in kernel option right now, but > that could be made boottime disabled trivially. Extending LDT to reject > the creation of code segments is not rocket science either. > > Though the real question is: > > What is the benefit of such a change? > > So far I haven't seen any argument for that. Maybe there is none :) Hardening. The general purpose distros definitely won't care, but special purpose ones will. An x86 bytestream is decoded differently in different modes, and malware can hide in the differences. Standard tooling can't cope with multi-mode binaries, and if it happens by accident you tend get very obscure crash to diagnose. Furthermore, despite CET-SS explicitly trying to account for and protect against accidental mismatches, there are errata in some parts which let userspace forge legal return addresses on the shadow stack by dropping into 32bit mode because, there's a #GP check missing in a microflow. For usecases where there ought not to be any 32bit code at all (and there absolutely are), it would be lovely if this could be enforced, rather than relying on blind hope that it doesn't happen. Thanks, ~Andrew
On Thu, Jun 08 2023 at 00:43, Andrew Cooper wrote:
> On 07/06/2023 10:52 pm, Thomas Gleixner wrote:
>> On Wed, Jun 07 2023 at 18:25, Andrew Cooper wrote:
>>> You also have to block Linux from taking any SYSRETL or SYSEXITL path
>>> out of the kernel, as these will load fixed 32bit mode attributes into
>>> %cs without reference to the GDT.
>> That's non-trivial as there is no way to disable 32bit SYSCALL on AMD
>> (Intel does not support 32bit syscall and you get #UD if CS.L != 1). So
>> to be safe you'd need to make ignore_sysret() kill the process w/o
>> returning to user space.
>
> ignore_sysret() desperately needs renaming to entry_SYSCALL32_ignore()
> or similar.
No argument about that.
> And yes, wiring this into SIGSEGV/etc would be a sensible move.
The only option is to wire it into die_hard_crash_and_burn(). There is
no sane way to deliver a signal to the process which managed to run into
this. Appropriate info to parent or ptracer will still be delivered.
> The same applies to 32bit SYSENTER if configured. (Linux doesn't, but
> other OSes do.)
Why the heck are they doing that?
I really wish that we could disable syscall32 reliably on AMD and make
it raise #UD as it does on Intal.
>> Though the real question is:
>>
>> What is the benefit of such a change?
>>
>> So far I haven't seen any argument for that. Maybe there is none :)
>
> Hardening. The general purpose distros definitely won't care, but
> special purpose ones will.
>
> An x86 bytestream is decoded differently in different modes, and malware
> can hide in the differences. Standard tooling can't cope with
> multi-mode binaries, and if it happens by accident you tend get very
> obscure crash to diagnose.
IOW, you are talking about defense in depth, right? I can buy that one.
> Furthermore, despite CET-SS explicitly trying to account for and protect
> against accidental mismatches, there are errata in some parts which let
> userspace forge legal return addresses on the shadow stack by dropping
> into 32bit mode because, there's a #GP check missing in a microflow.
Didn't we assume that there are no CPU bugs? :)
> For usecases where there ought not to be any 32bit code at all (and
> there absolutely are), it would be lovely if this could be enforced,
> rather than relying on blind hope that it doesn't happen.
I think it's rather clear what needs to be done here to achieve that,
but that's completely orthogonal to the intent of the patch series in
question which aims to make CONFIG_IA32_EMULATION a boot time decision.
Thanks,
tglx
On 08/06/2023 1:25 am, Thomas Gleixner wrote: > On Thu, Jun 08 2023 at 00:43, Andrew Cooper wrote: >> And yes, wiring this into SIGSEGV/etc would be a sensible move. > The only option is to wire it into die_hard_crash_and_burn(). There is > no sane way to deliver a signal to the process which managed to run into > this. Appropriate info to parent or ptracer will still be delivered. Hmm yeah. Something has already gone seriously wrong to end up here, so terminating it is probably best. > I really wish that we could disable syscall32 reliably on AMD and make > it raise #UD as it does on Intal. You know that's changing in FRED, and will follow the AMD model? The SYSCALL instruction is lower latency if it doesn't need to check %cs to conditionally #UD. >> Furthermore, despite CET-SS explicitly trying to account for and protect >> against accidental mismatches, there are errata in some parts which let >> userspace forge legal return addresses on the shadow stack by dropping >> into 32bit mode because, there's a #GP check missing in a microflow. > Didn't we assume that there are no CPU bugs? :) Tell me, is such a CPU delivered with or without a complimentary unicorn? :) >> For usecases where there ought not to be any 32bit code at all (and >> there absolutely are), it would be lovely if this could be enforced, >> rather than relying on blind hope that it doesn't happen. > I think it's rather clear what needs to be done here to achieve that, > but that's completely orthogonal to the intent of the patch series in > question which aims to make CONFIG_IA32_EMULATION a boot time decision. Fully inhibiting 32bit mode is stronger, because it impacts code which would otherwise operate in CONFIG_IA32_EMULATION=n configuration. Which is fine, but I agree that it doesn't want to be confused with being a "runtime CONFIG_IA32_EMULATION" knob. If the runtime form could also come with an equivalent Kconfig form, that would be awesome too. ~Andrew
On 8.06.23 г. 14:25 ч., Andrew Cooper wrote: > On 08/06/2023 1:25 am, Thomas Gleixner wrote: >> On Thu, Jun 08 2023 at 00:43, Andrew Cooper wrote: >>> And yes, wiring this into SIGSEGV/etc would be a sensible move. >> The only option is to wire it into die_hard_crash_and_burn(). There is >> no sane way to deliver a signal to the process which managed to run into >> this. Appropriate info to parent or ptracer will still be delivered. > > Hmm yeah. Something has already gone seriously wrong to end up here, so > terminating it is probably best. > >> I really wish that we could disable syscall32 reliably on AMD and make >> it raise #UD as it does on Intal. > > You know that's changing in FRED, and will follow the AMD model? > > The SYSCALL instruction is lower latency if it doesn't need to check %cs > to conditionally #UD. > >>> Furthermore, despite CET-SS explicitly trying to account for and protect >>> against accidental mismatches, there are errata in some parts which let >>> userspace forge legal return addresses on the shadow stack by dropping >>> into 32bit mode because, there's a #GP check missing in a microflow. >> Didn't we assume that there are no CPU bugs? :) > > Tell me, is such a CPU delivered with or without a complimentary unicorn? :) > >>> For usecases where there ought not to be any 32bit code at all (and >>> there absolutely are), it would be lovely if this could be enforced, >>> rather than relying on blind hope that it doesn't happen. >> I think it's rather clear what needs to be done here to achieve that, >> but that's completely orthogonal to the intent of the patch series in >> question which aims to make CONFIG_IA32_EMULATION a boot time decision. > > Fully inhibiting 32bit mode is stronger, because it impacts code which > would otherwise operate in CONFIG_IA32_EMULATION=n configuration. > > Which is fine, but I agree that it doesn't want to be confused with > being a "runtime CONFIG_IA32_EMULATION" knob. > > If the runtime form could also come with an equivalent Kconfig form, > that would be awesome too. Actually that's something I'm working on. I.e be able to set the default state of IA32_EMULATION at compile time (i.e disabled or enabled) and provide a boottime switch to override this. That way upstream can retain the current behavior, while distros can set the "default disabled" config-time switch and finally users will have the ability to override the config-switch at boottime. > > ~Andrew
On Thu, Jun 08 2023 at 12:25, Andrew Cooper wrote:
> On 08/06/2023 1:25 am, Thomas Gleixner wrote:
>> I really wish that we could disable syscall32 reliably on AMD and make
>> it raise #UD as it does on Intal.
>
> You know that's changing in FRED, and will follow the AMD model?
>
> The SYSCALL instruction is lower latency if it doesn't need to check %cs
> to conditionally #UD.
Yes, but with FRED the CPL0 context is fully consistent. There are no
CPL3 leftovers.
>> Didn't we assume that there are no CPU bugs? :)
>
> Tell me, is such a CPU delivered with or without a complimentary unicorn? :)
It's deliverd by a fairytale princess :)
Thanks,
tglx
On 08. 06. 23, 2:25, Thomas Gleixner wrote: >> For usecases where there ought not to be any 32bit code at all (and >> there absolutely are), it would be lovely if this could be enforced, >> rather than relying on blind hope that it doesn't happen. > > I think it's rather clear what needs to be done here to achieve that, > but that's completely orthogonal to the intent of the patch series in > question which aims to make CONFIG_IA32_EMULATION a boot time decision. Agreed. The original intent was only to disable the 32bit paths in the kernel. I.e. set CONFIG_IA32_EMULATION=n by a runtime switch. Then we came up with idea to disallow loading 32bit binaries to WARN people as the bins won't (mostly) work anyway. (We are/were aware this doesn't forbid running 32bit code.) But now, when we are doing that, I think we should disable 32 bits completely by the switch. I.e. don't provide 32bit segments and whatever. And make that clear and documented in the series. Because as it stands now, it's half-way. Agreed? This whole attempt served as a call for opinions which we got and which is perfect. thanks, -- js suse labs
On 08. 06. 23, 2:25, Thomas Gleixner wrote: > I really wish that we could disable syscall32 reliably on AMD and make > it raise #UD as it does on Intal. Sorry, I am likely missing something, but why is not #GP enough when we set CSTAR = 0? It's of course not as good as Intel's *defined* #UD, but why is not the above sufficient/reliable? thanks, -- js suse labs
On 08/06/2023 7:16 am, Jiri Slaby wrote: > On 08. 06. 23, 2:25, Thomas Gleixner wrote: >> I really wish that we could disable syscall32 reliably on AMD and make >> it raise #UD as it does on Intal. > > Sorry, I am likely missing something, but why is not #GP enough when > we set CSTAR = 0? Yeah, don't be setting CSTAR to 0. If you set CSTAR to 0, and userspace has mapped something at 0, then the CPU will start executing from 0 in kernel mode. If you've got SMEP active, this doesn't help. Instead of executing from 0, you'll take #PF. Except you were already in kernel mode and #PF isn't an IST vector, so you'll then start executing the #PF handler on the same stack as before... which is the user stack, and it can still hijack execution by hooking a return address. If you've got (just) SMAP active, then this doesn't help. The hijacked execution doesn't need to touch the stack to execute STAC and re-permit user data accesses. If you've got SMEP, SMAP, *and* FMASK configured to clear AC automatically on syscall, then you end up in #DF from a SMEP violation trying to fetch the code, and a SMAP violation while trying to push the SMEP violation's #PF IRET frame. It's almost as if not switching the stack was a terrible terrible idea... ~Andrew
On Thu, Jun 08 2023 at 08:16, Jiri Slaby wrote:
> On 08. 06. 23, 2:25, Thomas Gleixner wrote:
>> I really wish that we could disable syscall32 reliably on AMD and make
>> it raise #UD as it does on Intal.
>
> Sorry, I am likely missing something, but why is not #GP enough when we
> set CSTAR = 0?
Because you are not getting a #GP.
It will try to execute from virtual address 0 in CPL 0 and with RSP
still pointing to the user space stack. So you have several
possibilities:
1) 0 is mapped in user space and SMEP/SMAP is off.
Attacker won
2) 0 is not mapped or SMEP is on.
You get #PF from CPL0 and RSP is still pointing to the user space
stack. If SMAP is on this results in #DF
If SMAP is off, kernel uses an attacker controlled stack...
Similar sillies when you set it to a valid kernel address which is not
mapped or lacks X or contains invalid opcode ....
So no. CSTAR _must_ be a valid kernel text address which handles the
32bit syscall. Right now all it does is SYSRETL when IA32_EMULATION is
disabled.
So the only way to handle that is to have proper entry code which
switches to kernel context and then runs "syscall32_kill_myself()" which
kills the process hard and it exits without the chance to attempt a
return to user.
Anything else wont work.
Bah. Was it really necessary to bring this up so I hade to page in the
gory details of this hardware insanity again?
Thanks,
tglx
On 08. 06. 23, 8:16, Jiri Slaby wrote: > On 08. 06. 23, 2:25, Thomas Gleixner wrote: >> I really wish that we could disable syscall32 reliably on AMD and make >> it raise #UD as it does on Intal. > > Sorry, I am likely missing something, but why is not #GP enough when we > set CSTAR = 0? Or rather to some hole (to avoid mappings when mmap_min_addr=0) or to something like entry_SYSCALL32_kill which you suggested elsewhere. But that is maybe what you consider not being "reliable". > It's of course not as good as Intel's *defined* #UD, but > why is not the above sufficient/reliable? > > thanks, -- js suse labs
© 2016 - 2026 Red Hat, Inc.