Previously this code updated only the vCPU's in-memory value, which is good,
but not enough, as there will be no context switch after exiting the
exception handler, so the in-memory value will not get into the actual
register.
It worked well enough for VHE guests because the KVM code tried the fast
path, which of course updated the real PAR_EL1.
Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
---
arch/arm64/kvm/sys_regs.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 7b8a0a6f26468..ab2b5e261d312 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -3463,6 +3463,9 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
__kvm_at_s1e2(vcpu, op, p->regval);
+ /* No context switch happened, so we need to update PAR_EL1 manually */
+ write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1);
+
return true;
}
@@ -3473,6 +3476,9 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
__kvm_at_s12(vcpu, op, p->regval);
+ /* No context switch happened, so we need to update PAR_EL1 manually */
+ write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1);
+
return true;
}
--
2.50.1
On Wed, 06 Aug 2025 15:17:55 +0100,
Volodymyr Babchuk <Volodymyr_Babchuk@epam.com> wrote:
>
> Previously this code update only vCPU's in-memory value, which is good,
> but not enough, as there will be no context switch after exiting
> exception handler, so in-memory value will not get into actual
> register.
>
> It worked good enough for VHE guests because KVM code tried fast path,
> which of course updated real PAR_EL1.
Nothing to do with VHE, I'm afraid. We can take the slow path for any
odd reason, even on VHE. This is more of a structural problem, see
below.
>
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
> ---
> arch/arm64/kvm/sys_regs.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 7b8a0a6f26468..ab2b5e261d312 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -3463,6 +3463,9 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>
> __kvm_at_s1e2(vcpu, op, p->regval);
>
> + /* No context switch happened, so we need to update PAR_EL1 manually */
> + write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1);
> +
This looks like the wrong fix, as it papers over another issue.
The core problem is vcpu_write_sys_reg() (resp. read) does the wrong
thing with registers such as PAR_EL1, which are not translated between
EL1 and EL2, and therefore are always live, no matter what.
Can you please try the hack below? I don't like it, but at least it
shows where the actual problem lies.
Thanks,
M.
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index ad25484772574..167f0d411a708 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -95,7 +95,13 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
return true; \
}
-static bool get_el2_to_el1_mapping(unsigned int reg,
+#define COMMON_SYSREG(r) \
+ case r: { \
+ *el1r = __INVALID_SYSREG__; \
+ return is_hyp_ctxt(vcpu); \
+ }
+
+static bool get_el2_to_el1_mapping(const struct kvm_vcpu *vcpu, unsigned int reg,
unsigned int *el1r, u64 (**xlate)(u64))
{
switch (reg) {
@@ -119,6 +125,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
PURE_EL2_SYSREG( HAFGRTR_EL2 );
PURE_EL2_SYSREG( CNTVOFF_EL2 );
PURE_EL2_SYSREG( CNTHCTL_EL2 );
+ COMMON_SYSREG( PAR_EL1 );
MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
translate_sctlr_el2_to_sctlr_el1 );
MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
@@ -158,7 +165,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_read;
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
+ if (unlikely(get_el2_to_el1_mapping(vcpu, reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_read;
@@ -219,7 +226,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_write;
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
+ if (unlikely(get_el2_to_el1_mapping(vcpu, reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_write;
--
Without deviation from the norm, progress is not possible.
Hi Marc,
Marc Zyngier <maz@kernel.org> writes:
> On Wed, 06 Aug 2025 15:17:55 +0100,
> Volodymyr Babchuk <Volodymyr_Babchuk@epam.com> wrote:
>>
>> Previously this code update only vCPU's in-memory value, which is good,
>> but not enough, as there will be no context switch after exiting
>> exception handler, so in-memory value will not get into actual
>> register.
>>
>> It worked good enough for VHE guests because KVM code tried fast path,
>> which of course updated real PAR_EL1.
>
> Nothing to do with VHE, I'm afraid. We can take the slow path for any
> odd reason, even on VHE.
Yeah, I just didn't convey my idea well. As a VHE guest does not require
S2 translation, calling the real "at s1*" was enough to get the correct value in
PAR_EL1 in that case. That would hide the issue when using VHE
guests. Also, I believe that most hypervisors will try to do their own
software pagewalk if "at s1" fails...
Anyway, I'll rewrite the commit message to make this clearer.
> This is more of a structural problem, see
> below.
>
>>
>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>> ---
>> arch/arm64/kvm/sys_regs.c | 6 ++++++
>> 1 file changed, 6 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 7b8a0a6f26468..ab2b5e261d312 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -3463,6 +3463,9 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>>
>> __kvm_at_s1e2(vcpu, op, p->regval);
>>
>> + /* No context switch happened, so we need to update PAR_EL1 manually */
>> + write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1);
>> +
>
> This looks like the wrong fix, as it papers over another issue.
>
> The core problem is vcpu_write_sys_reg() (resp. read) does the wrong
> thing with registers such as PAR_EL1, which are not translated between
> EL1 and EL2, and therefore are always live, no matter what.
>
> Can you please try the hack below? I don't like it, but at least it
> shows where the actual problem lies.
It does not work, see below.
Also, what do you think about Oliver's approach? I'll try it next.
>
> Thanks,
>
> M.
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index ad25484772574..167f0d411a708 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -95,7 +95,13 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
> return true; \
> }
>
> -static bool get_el2_to_el1_mapping(unsigned int reg,
> +#define COMMON_SYSREG(r) \
> + case r: { \
> + *el1r = __INVALID_SYSREG__;
> \
This didn't work right away, as the code in vcpu_read_sys_reg()
will just do __vcpu_read_sys_reg_from_cpu(el1r, &val);
(the same is true for the write counterpart, of course)
I tried to put *el1r = r here, but this didn't work either, because of
this check:
/*
* If this register does not have an EL1 counterpart,
* then read the stored EL2 version.
*/
if (reg == el1r)
goto memory_read;
So, either we need to add one more check, like
if (el1r == INVALID_REG)
__vcpu_read_sys_reg_from_cpu(reg, &val);
Or use Oliver's approach.
> + return is_hyp_ctxt(vcpu); \
> + }
> +
> +static bool get_el2_to_el1_mapping(const struct kvm_vcpu *vcpu, unsigned int reg,
> unsigned int *el1r, u64 (**xlate)(u64))
> {
> switch (reg) {
> @@ -119,6 +125,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
> PURE_EL2_SYSREG( HAFGRTR_EL2 );
> PURE_EL2_SYSREG( CNTVOFF_EL2 );
> PURE_EL2_SYSREG( CNTHCTL_EL2 );
> + COMMON_SYSREG( PAR_EL1 );
> MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
> translate_sctlr_el2_to_sctlr_el1 );
> MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
> @@ -158,7 +165,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
> if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
> goto memory_read;
>
> - if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
> + if (unlikely(get_el2_to_el1_mapping(vcpu, reg, &el1r, &xlate))) {
> if (!is_hyp_ctxt(vcpu))
> goto memory_read;
>
> @@ -219,7 +226,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
> if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
> goto memory_write;
>
> - if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
> + if (unlikely(get_el2_to_el1_mapping(vcpu, reg, &el1r, &xlate))) {
> if (!is_hyp_ctxt(vcpu))
> goto memory_write;
--
WBR, Volodymyr
On Wed, Aug 06, 2025 at 02:17:55PM +0000, Volodymyr Babchuk wrote:
> Previously this code update only vCPU's in-memory value, which is good,
> but not enough, as there will be no context switch after exiting
> exception handler, so in-memory value will not get into actual
> register.
>
> It worked good enough for VHE guests because KVM code tried fast path,
> which of course updated real PAR_EL1.
>
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
> ---
> arch/arm64/kvm/sys_regs.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 7b8a0a6f26468..ab2b5e261d312 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -3463,6 +3463,9 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>
> __kvm_at_s1e2(vcpu, op, p->regval);
>
> + /* No context switch happened, so we need to update PAR_EL1 manually */
> + write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1);
> +
Ok, this had me thoroughly confused for a moment. The bug is actually in
kvm_write_sys_reg() which is supposed to update the sysreg value when
things are loaded on the CPU. __kvm_at_s1e2() is doing the right thing
by calling this accessor.
For registers like PAR_EL1 that don't have an EL2->EL1 mapping we assume
they belong to the EL1 context and thus are in-memory when in a hyp
context. TPIDR(RO)_EL0 is similarly affected.
This is a bit of an ugly hack, but something like the following should
get things working if you're able to test it:
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index ad2548477257..32f8d1de8f1a 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -149,6 +149,22 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
}
}
+/*
+ * Special-cased registers that do not have an ELx mapping and are always
+ * loaded on the CPU.
+ */
+static bool reg_has_elx_mapping(int reg)
+{
+ switch (reg) {
+ case TPIDR_EL0:
+ case TPIDRRO_EL0:
+ case PAR_EL1:
+ return false;
+ default:
+ return true;
+ }
+}
+
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
u64 val = 0x8badf00d8badf00d;
@@ -158,6 +174,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_read;
+ if (!reg_has_elx_mapping(reg))
+ goto sysreg_read;
+
if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_read;
@@ -204,6 +223,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
if (unlikely(is_hyp_ctxt(vcpu)))
goto memory_read;
+sysreg_read:
if (__vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
@@ -219,6 +239,9 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_write;
+ if (!reg_has_elx_mapping(reg))
+ goto sysreg_write;
+
if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_write;
@@ -259,6 +282,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
if (unlikely(is_hyp_ctxt(vcpu)))
goto memory_write;
+sysreg_write:
if (__vcpu_write_sys_reg_to_cpu(val, reg))
return;
--
Thanks,
Oliver
Hi Oliver, Oliver Upton <oliver.upton@linux.dev> writes: > On Wed, Aug 06, 2025 at 02:17:55PM +0000, Volodymyr Babchuk wrote: >> Previously this code update only vCPU's in-memory value, which is good, >> but not enough, as there will be no context switch after exiting >> exception handler, so in-memory value will not get into actual >> register. >> >> It worked good enough for VHE guests because KVM code tried fast path, >> which of course updated real PAR_EL1. >> >> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com> >> --- >> arch/arm64/kvm/sys_regs.c | 6 ++++++ >> 1 file changed, 6 insertions(+) >> >> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c >> index 7b8a0a6f26468..ab2b5e261d312 100644 >> --- a/arch/arm64/kvm/sys_regs.c >> +++ b/arch/arm64/kvm/sys_regs.c >> @@ -3463,6 +3463,9 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, >> >> __kvm_at_s1e2(vcpu, op, p->regval); >> >> + /* No context switch happened, so we need to update PAR_EL1 manually */ >> + write_sysreg(vcpu_read_sys_reg(vcpu, PAR_EL1), par_el1); >> + > > Ok, this had me thoroughly confused for a moment. The bug is actually in > kvm_write_sys_reg() which is supposed to update the sysreg value when > things are loaded on the CPU. __kvm_at_s1e2() is doing the right thing > by calling this accessor. > > For registers like PAR_EL1 that don't have an EL2->EL1 mapping we assume > they belong to the EL1 context and thus are in-memory when in a hyp > context. TPIDR(RO)_EL0 is similarly affected. > > This is a bit of an ugly hack, but something like the following should > get things working if you're able to test it: Yep, it is working pretty good. Will you send the patch? I'll add my Tested-by. Or will you prefer that I'll send the patch with your Suggested-by: ? [...] -- WBR, Volodymyr
© 2016 - 2026 Red Hat, Inc.