Provide support for the LKGS insn, which is a prereq to FRED. CPUID-wise,
while its and FRED's enumerators were already introduced, the dependency
of LKGS on LM still needs adding.
While adding a testcase, also add a SWAPGS one. In order to not affect
the behavior of pre-existing tests, install write_{segment,msr} hooks
only transiently.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
For PV save_segments() would need adjustment, but the insn being
restricted to ring 0 means PV guests can't use it anyway (unless we
wanted to emulate it as another privileged insn).
I've also dropped the test harness read_segment() change. It generally
would be correct to have, but isn't needed anymore with neither SWAPGS
nor LKGS handling using the hook.
---
v10: Drop FRED dependency on LKGS. Replace "best effort unwind". Avoid
->read_segment(). Re-base.
v9: Re-base.
v8: Re-base.
v6: Use MSR constants in test harness. S->s in cpufeatureset.h. Add
NMI_SRC feature bits. Re-base.
v5: Re-base.
v3: Add dependency on LM. Re-base.
v2: Use X86_EXC_*. Add comments.
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -326,6 +326,7 @@ static const struct {
{ { 0x00, 0x18 }, { 2, 2 }, T, R }, /* ltr */
{ { 0x00, 0x20 }, { 2, 2 }, T, R }, /* verr */
{ { 0x00, 0x28 }, { 2, 2 }, T, R }, /* verw */
+ { { 0x00, 0x30 }, { 0, 2 }, T, R, pfx_f2 }, /* lkgs */
{ { 0x01, 0x00 }, { 2, 2 }, F, W }, /* sgdt */
{ { 0x01, 0x08 }, { 2, 2 }, F, W }, /* sidt */
{ { 0x01, 0x10 }, { 2, 2 }, F, R }, /* lgdt */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -673,6 +673,10 @@ static int blk(
return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt);
}
+#ifdef __x86_64__
+static unsigned long gs_base, gs_base_shadow;
+#endif
+
static int read_segment(
enum x86_segment seg,
struct segment_register *reg,
@@ -682,8 +686,25 @@ static int read_segment(
return X86EMUL_UNHANDLEABLE;
memset(reg, 0, sizeof(*reg));
reg->p = 1;
+
+ return X86EMUL_OKAY;
+}
+
+#ifdef __x86_64__
+static int write_segment(
+ enum x86_segment seg,
+ const struct segment_register *reg,
+ struct x86_emulate_ctxt *ctxt)
+{
+ if ( !is_x86_user_segment(seg) )
+ return X86EMUL_UNHANDLEABLE;
+
+ if ( seg == x86_seg_gs )
+ gs_base = reg->base;
+
return X86EMUL_OKAY;
}
+#endif
static int read_msr(
unsigned int reg,
@@ -696,6 +717,20 @@ static int read_msr(
*val = ctxt->addr_size > 32 ? EFER_LME | EFER_LMA : 0;
return X86EMUL_OKAY;
+#ifdef __x86_64__
+ case MSR_GS_BASE:
+ if ( ctxt->addr_size < 64 )
+ break;
+ *val = gs_base;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( ctxt->addr_size < 64 )
+ break;
+ *val = gs_base_shadow;
+ return X86EMUL_OKAY;
+#endif
+
case MSR_TSC_AUX:
#define TSC_AUX_VALUE 0xCACACACA
*val = TSC_AUX_VALUE;
@@ -705,6 +740,32 @@ static int read_msr(
return X86EMUL_UNHANDLEABLE;
}
+#ifdef __x86_64__
+static int write_msr(
+ unsigned int reg,
+ uint64_t val,
+ struct x86_emulate_ctxt *ctxt,
+ bool explicit)
+{
+ switch ( reg )
+ {
+ case MSR_GS_BASE:
+ if ( ctxt->addr_size < 64 || !is_canonical_address(val) )
+ break;
+ gs_base = val;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( ctxt->addr_size < 64 || !is_canonical_address(val) )
+ break;
+ gs_base_shadow = val;
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+#endif
+
#define INVPCID_ADDR 0x12345678
#define INVPCID_PCID 0x123
@@ -1339,6 +1400,41 @@ int main(int argc, char **argv)
printf("%u bytes read - ", bytes_read);
goto fail;
}
+ printf("okay\n");
+
+ emulops.write_segment = write_segment;
+ emulops.write_msr = write_msr;
+
+ printf("%-40s", "Testing swapgs...");
+ instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xf8;
+ regs.eip = (unsigned long)&instr[0];
+ gs_base = 0xffffeeeecccc8888UL;
+ gs_base_shadow = 0x0000111122224444UL;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) ||
+ (regs.eip != (unsigned long)&instr[3]) ||
+ (gs_base != 0x0000111122224444UL) ||
+ (gs_base_shadow != 0xffffeeeecccc8888UL) )
+ goto fail;
+ printf("okay\n");
+
+ printf("%-40s", "Testing lkgs 2(%rdx)...");
+ instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x00; instr[3] = 0x72; instr[4] = 0x02;
+ regs.eip = (unsigned long)&instr[0];
+ regs.edx = (unsigned long)res;
+ res[0] = 0x00004444;
+ res[1] = 0x8888cccc;
+ i = cpu_policy.extd.nscb; cpu_policy.extd.nscb = true; /* for AMD */
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) ||
+ (regs.eip != (unsigned long)&instr[5]) ||
+ (gs_base != 0x0000111122224444UL) ||
+ gs_base_shadow )
+ goto fail;
+
+ cpu_policy.extd.nscb = i;
+ emulops.write_segment = NULL;
+ emulops.write_msr = NULL;
#endif
printf("okay\n");
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -85,6 +85,7 @@ bool emul_test_init(void)
cpu_policy.feat.invpcid = true;
cpu_policy.feat.adx = true;
cpu_policy.feat.rdpid = true;
+ cpu_policy.feat.lkgs = true;
cpu_policy.feat.wrmsrns = true;
cpu_policy.extd.clzero = true;
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -744,8 +744,12 @@ decode_twobyte(struct x86_emulate_state
case 0:
s->desc |= DstMem | SrcImplicit | Mov;
break;
+ case 6:
+ if ( !(s->modrm_reg & 1) && mode_64bit() )
+ {
case 2: case 4:
- s->desc |= SrcMem16;
+ s->desc |= SrcMem16;
+ }
break;
}
break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -608,6 +608,7 @@ amd_like(const struct x86_emulate_ctxt *
#define vcpu_has_avx_vnni() (ctxt->cpuid->feat.avx_vnni)
#define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
#define vcpu_has_cmpccxadd() (ctxt->cpuid->feat.cmpccxadd)
+#define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs)
#define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns)
#define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma)
#define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2899,8 +2899,37 @@ x86_emulate(
break;
}
break;
- default:
- generate_exception_if(true, X86_EXC_UD);
+
+ case 6: /* lkgs */
+ generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2,
+ X86_EXC_UD);
+ generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD);
+ vcpu_must_have(lkgs);
+ fail_if(!ops->read_msr || !ops->write_segment || !ops->write_msr);
+ if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
+ ctxt)) != X86EMUL_OKAY ||
+ (rc = ops->read_msr(MSR_GS_BASE, &sreg.base,
+ ctxt)) != X86EMUL_OKAY )
+ goto done;
+ dst.orig_val = sreg.base; /* Preserve full GS Base. */
+ if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg,
+ ctxt, ops)) != X86EMUL_OKAY )
+ goto done;
+ /* Write (32-bit) base into SHADOW_GS. */
+ if ( (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
+ ctxt, false)) != X86EMUL_OKAY ||
+ (sreg.base = dst.orig_val, /* Reinstate full GS Base. */
+ (rc = ops->write_segment(x86_seg_gs, &sreg,
+ ctxt)) != X86EMUL_OKAY) )
+ {
+ /*
+ * In real hardware, access to the registers cannot fail. It
+ * is an error in Xen if the writes fail.
+ */
+ ASSERT_UNREACHABLE();
+ x86_emul_reset_event(ctxt);
+ generate_exception(X86_EXC_DF, 0);
+ }
break;
}
break;
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -282,7 +282,8 @@ def crunch_numbers(state):
# superpages, PCID and PKU are only available in 4 level paging.
# NO_LMSL indicates the absense of Long Mode Segment Limits, which
# have been dropped in hardware.
- LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD],
+ LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
+ LKGS],
# AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
# standard 3DNow in the earlier K6 processors.
On 08/04/2026 11:22 am, Jan Beulich wrote:
> Provide support for this insn, which is a prereq to FRED. CPUID-wise,
> while its and FRED's enumerators were already introduced, their dependency
> still needs adding.
>
> While adding a testcase, also add a SWAPGS one. In order to not affect
> the behavior of pre-existing tests, install write_{segment,msr} hooks
> only transiently.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> For PV save_segments() would need adjustment,
Not really. CPL3 must never have a way of modifying GS_KERN, hence ...
> but the insn being restricted to ring 0 means PV guests can't use it anyway
... the CPL0 restriction.
Arguably I should have had this in one of the FRED patches:
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 1151997758c6..3364e774ada7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1952,7 +1952,7 @@ static void load_segments(struct vcpu *n)
* changes to bases can also be made with the WR{FS,GS}BASE instructions, when
* enabled.
*
- * Guests however cannot use SWAPGS, so there is no mechanism to modify the
+ * Guests cannot use SWAPGS or LKGS, so there is no mechanism to modify the
* inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive
* GS base is still accurate, and doesn't need reading back from hardware.
*
but I don't think it's appropriate to merge into this patch.
> (unless we wanted to emulate it as another privileged insn).
We already have "LKGS" in hypercall form. It's spelt
SEGBASE_GS_USER_SEL and has existed for 20 years or so.
I don't see any reason to extend emul_priv_op().
>
> I've also dropped the test harness read_segment() change. It generally
> would be correct to have, but isn't needed anymore with neither SWAPGS
> nor LKGS handling using the hook.
Dropping read_segment() makes your patch depend on Teddy's, now that
test_x86_emulator is blocking in CI.
This matters for backports. I expect I'll be backporting guest support
in not-too-long.
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -2899,8 +2899,37 @@ x86_emulate(
> break;
> }
> break;
> - default:
> - generate_exception_if(true, X86_EXC_UD);
> +
> + case 6: /* lkgs */
> + generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2,
> + X86_EXC_UD);
> + generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD);
> + vcpu_must_have(lkgs);
> + fail_if(!ops->read_msr || !ops->write_segment || !ops->write_msr);
> + if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
> + ctxt)) != X86EMUL_OKAY ||
> + (rc = ops->read_msr(MSR_GS_BASE, &sreg.base,
> + ctxt)) != X86EMUL_OKAY )
> + goto done;
> + dst.orig_val = sreg.base; /* Preserve full GS Base. */
"Preserve current GS Base."
> + if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg,
> + ctxt, ops)) != X86EMUL_OKAY )
> + goto done;
> + /* Write (32-bit) base into SHADOW_GS. */
"Write new base into SHADOW_GS. Zero extended from GDT/LDT."
> + if ( (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
> + ctxt, false)) != X86EMUL_OKAY ||
> + (sreg.base = dst.orig_val, /* Reinstate full GS Base. */
"Reinstate original GS base."
> + (rc = ops->write_segment(x86_seg_gs, &sreg,
> + ctxt)) != X86EMUL_OKAY) )
> + {
> + /*
> + * In real hardware, access to the registers cannot fail. It
> + * is an error in Xen if the writes fail.
> + */
> + ASSERT_UNREACHABLE();
> + x86_emul_reset_event(ctxt);
> + generate_exception(X86_EXC_DF, 0);
> + }
> break;
> }
> break;
This patch needs one more hunk:
diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
index 5273fe0ae435..9d8195e2ae56 100644
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -765,14 +765,25 @@ static void __init calculate_hvm_max_policy(void)
*/
__set_bit(X86_FEATURE_NO_LMSL, fs);
- /*
- * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
- * long mode (and init_amd() has cleared it out of host capabilities), but
- * HVM guests are able if running in protected mode.
- */
- if ( (boot_cpu_data.vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
- raw_cpu_policy.basic.sep )
- __set_bit(X86_FEATURE_SEP, fs);
+ if ( boot_cpu_data.vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
+ {
+ /*
+ * On AMD, PV guests are unable to use SYSENTER as Xen runs in long
+ * mode (and init_amd() has cleared it out of host capabilities), but
+ * HVM guests are able if running in protected mode.
+ */
+ if ( raw_cpu_policy.basic.sep )
+ __set_bit(X86_FEATURE_SEP, fs);
+
+ /*
+ * NullSelectorClearsBase is really a "hardware doesn't have this bug
+ * any more" bit. All FRED-capable hardware has NSCB properties, so
+ * disallow a configuration which suggests/causes behaviour the OS isn't
+ * expecting.
+ */
+ if ( !test_bit(X86_FEATURE_NSCB, fs) )
+ __clear_bit(X86_FEATURE_LKGS, fs);
+ }
/*
* VIRT_SSBD is exposed in the default policy as a result of
because otherwise a CPU Policy could hide NSCB and LKGS would behave
correctly when executed normally but malfunction in the emulator.
This hunk is in lieu of having vendor-dependent deep-deps calculations,
although it would need duplicating in userspace too.
Because this is only a link between an AMD-only feature and a common
feature, I think I can express it by only having a per-vendor
deep_features bitmap and keeping a shared deep_deps matrix.
Perhaps I should prototype that instead, but it would become another
dependency for this patch.
~Andrew
On 08.04.2026 13:34, Andrew Cooper wrote:
> On 08/04/2026 11:22 am, Jan Beulich wrote:
>> ---
>> For PV save_segments() would need adjustment,
>
> Not really. CPL3 must never have a way of modifying GS_KERN, hence ...
>
>> but the insn being restricted to ring 0 means PV guests can't use it anyway
>
> ... the CPL0 restriction.
>
> Arguably I should have had this in one of the FRED patches:
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1952,7 +1952,7 @@ static void load_segments(struct vcpu *n)
> * changes to bases can also be made with the WR{FS,GS}BASE instructions, when
> * enabled.
> *
> - * Guests however cannot use SWAPGS, so there is no mechanism to modify the
> + * Guests cannot use SWAPGS or LKGS, so there is no mechanism to modify the
> * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive
> * GS base is still accurate, and doesn't need reading back from hardware.
> *
>
>
> but I don't think it's appropriate to merge into this patch.
>
>> (unless we wanted to emulate it as another privileged insn).
>
> We already have "LKGS" in hypercall form. It's spelt
> SEGBASE_GS_USER_SEL and has existed for 20 years or so.
Hmm, yes.
> I don't see any reason to extend emul_priv_op().
Nor do I. Nevertheless I wanted to mention the PV aspect.
>> I've also dropped the test harness read_segment() change. It generally
>> would be correct to have, but isn't needed anymore with neither SWAPGS
>> nor LKGS handling using the hook.
>
> Dropping read_segment() makes your patch depend on Teddy's, now that
> test_x86_emulator is blocking in CI.
I'm not dropping read_segment() from there. I've dropped a change to
that function that v9 had. That depends on your change (which has gone
in), but not Teddy's. Or else I may not understand what you mean.
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -2899,8 +2899,37 @@ x86_emulate(
>> break;
>> }
>> break;
>> - default:
>> - generate_exception_if(true, X86_EXC_UD);
>> +
>> + case 6: /* lkgs */
>> + generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2,
>> + X86_EXC_UD);
>> + generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD);
>> + vcpu_must_have(lkgs);
>> + fail_if(!ops->read_msr || !ops->write_segment || !ops->write_msr);
>> + if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
>> + ctxt)) != X86EMUL_OKAY ||
>> + (rc = ops->read_msr(MSR_GS_BASE, &sreg.base,
>> + ctxt)) != X86EMUL_OKAY )
>> + goto done;
>> + dst.orig_val = sreg.base; /* Preserve full GS Base. */
>
> "Preserve current GS Base."
>
>> + if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg,
>> + ctxt, ops)) != X86EMUL_OKAY )
>> + goto done;
>> + /* Write (32-bit) base into SHADOW_GS. */
>
> "Write new base into SHADOW_GS. Zero extended from GDT/LDT."
>
>> + if ( (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
>> + ctxt, false)) != X86EMUL_OKAY ||
>> + (sreg.base = dst.orig_val, /* Reinstate full GS Base. */
>
> "Reinstate original GS base."
I can make these adjustments, sure, yet I think my forms were clear enough.
> This patch needs one more hunk:
>
> --- a/xen/arch/x86/cpu-policy.c
> +++ b/xen/arch/x86/cpu-policy.c
> @@ -765,14 +765,25 @@ static void __init calculate_hvm_max_policy(void)
> */
> __set_bit(X86_FEATURE_NO_LMSL, fs);
>
> - /*
> - * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
> - * long mode (and init_amd() has cleared it out of host capabilities), but
> - * HVM guests are able if running in protected mode.
> - */
> - if ( (boot_cpu_data.vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
> - raw_cpu_policy.basic.sep )
> - __set_bit(X86_FEATURE_SEP, fs);
> + if ( boot_cpu_data.vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
> + {
> + /*
> + * On AMD, PV guests are unable to use SYSENTER as Xen runs in long
> + * mode (and init_amd() has cleared it out of host capabilities), but
> + * HVM guests are able if running in protected mode.
> + */
> + if ( raw_cpu_policy.basic.sep )
> + __set_bit(X86_FEATURE_SEP, fs);
> +
> + /*
> + * NullSelectorClearsBase is really a "hardware doesn't have this bug
> + * any more" bit. All FRED-capable hardware has NSCB properties, so
> + * disallow a configuration which suggests/causes behaviour the OS isn't
> + * expecting.
> + */
> + if ( !test_bit(X86_FEATURE_NSCB, fs) )
> + __clear_bit(X86_FEATURE_LKGS, fs);
> + }
>
> /*
> * VIRT_SSBD is exposed in the default policy as a result of
>
>
> because otherwise a CPU Policy could hide NSCB and LKGS would behave
> correctly when executed normally but malfunction in the emulator.
A policy cannot validly hide NSCB, as the flag - whichever way it is set -
describes how the underlying hardware works. We'd need to intercept and
emulate all selector loads to allow flag and hardware behavior to be out
of sync. I.e. what you say for LKGS would be true for all selector loads.
> This hunk is in lieu of having vendor-dependent deep-deps calculations,
> although it would need duplicating in userspace too.
>
> Because this is only a link between an AMD-only feature and a common
> feature, I think I can express it by only having a per-vendor
> deep_features bitmap and keeping a shared deep_deps matrix.
>
> Perhaps I should prototype that instead, but it would become another
> dependency for this patch.
Please do, albeit as per above I don't think it's truly a prereq to the
one here.
Jan
© 2016 - 2026 Red Hat, Inc.