From: Xin Li <xin3.li@intel.com>
Set the VMX nested-exception bit in the VM-entry interruption-information
VMCS field when injecting a nested exception using FRED event delivery to
ensure:

1) The nested exception is injected on the correct stack level.

2) The nested bit defined in the FRED stack frame is set.

The event stack level used by FRED event delivery depends on whether the
event was a nested exception encountered during delivery of another event,
because a nested exception is "regarded" as happening in ring 0.  E.g.,
when #PF is configured to use stack level 1 in the IA32_FRED_STKLVLS MSR:

- nested #PF will be delivered on stack level 1 when encountered in
  ring 3.

- normal #PF will be delivered on stack level 0 when encountered in
  ring 3.

The VMX nested-exception support ensures the correct event stack level is
chosen when a VM entry injects a nested exception.
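
For illustration only, the stack-level choice can be sketched roughly as
follows (this is not the architectural pseudocode; stklvl() merely stands
in for the per-vector field of the IA32_FRED_STKLVLS MSR):

  unsigned int fred_event_stack_level(unsigned int vector, bool from_ring3,
                                      bool nested)
  {
          /* A non-nested exception encountered in ring 3 uses stack level 0. */
          if (from_ring3 && !nested)
                  return 0;

          /*
           * A nested exception is regarded as occurring in ring 0, so it
           * uses the stack level configured for its vector (simplified;
           * the real rules also take the current stack level into account).
           */
          return stklvl(vector);
  }
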
Signed-off-by: Xin Li <xin3.li@intel.com>
[ Sean: reworked kvm_requeue_exception() to simplify the code changes ]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
---
Changes since v2:
* Rework kvm_requeue_exception() to simplify the code changes (Sean
  Christopherson).

Changes since v1:
* Set the nested flag when there is an original interrupt (Chao Gao).
---
arch/x86/include/asm/kvm_host.h | 4 +++-
arch/x86/include/asm/vmx.h | 5 ++++-
arch/x86/kvm/svm/svm.c | 2 +-
arch/x86/kvm/vmx/vmx.c | 6 +++++-
arch/x86/kvm/x86.c | 14 +++++++++++++-
arch/x86/kvm/x86.h | 1 +
6 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b9b82aaea9a3..3830084b569b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -736,6 +736,7 @@ struct kvm_queued_exception {
u32 error_code;
unsigned long payload;
bool has_payload;
+ bool nested;
u64 event_data;
};
@@ -2114,7 +2115,8 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
- bool has_error_code, u32 error_code, u64 event_data);
+ bool has_error_code, u32 error_code, bool nested,
+ u64 event_data);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 3696e763c231..06c52fee5dcd 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -137,6 +137,7 @@
#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49)
#define VMX_BASIC_INOUT BIT_ULL(54)
#define VMX_BASIC_TRUE_CTLS BIT_ULL(55)
+#define VMX_BASIC_NESTED_EXCEPTION BIT_ULL(58)
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
@@ -416,13 +417,15 @@ enum vmcs_field {
#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
+#define INTR_INFO_NESTED_EXCEPTION_MASK 0x2000 /* 13 */
#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
-#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
+#define INTR_INFO_RESVD_BITS_MASK 0x7fffd000
#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
+#define VECTORING_INFO_NESTED_EXCEPTION_MASK INTR_INFO_NESTED_EXCEPTION_MASK
#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */
#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7fa8f842f116..e479e0208efe 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4126,7 +4126,7 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
kvm_requeue_exception(vcpu, vector,
exitintinfo & SVM_EXITINTINFO_VALID_ERR,
- error_code, 0);
+ error_code, false, 0);
break;
}
case SVM_EXITINTINFO_TYPE_INTR:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d81144bd648f..03f42b218554 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1910,8 +1910,11 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
- } else
+ } else {
intr_info |= INTR_TYPE_HARD_EXCEPTION;
+ if (ex->nested)
+ intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
@@ -7290,6 +7293,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
kvm_requeue_exception(vcpu, vector,
idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
error_code,
+ idt_vectoring_info & VECTORING_INFO_NESTED_EXCEPTION_MASK,
event_data);
break;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a55c1eb5297..8546629166e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -874,6 +874,11 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.nested = vcpu->arch.exception.nested ||
+ (is_fred_enabled(vcpu) &&
+ (vcpu->arch.nmi_injected ||
+ vcpu->arch.interrupt.injected));
+
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.vector = nr;
vcpu->arch.exception.error_code = error_code;
@@ -903,8 +908,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
vcpu->arch.exception.injected = false;
vcpu->arch.exception.pending = false;
+ /* #DF is NOT a nested event, per its definition. */
+ vcpu->arch.exception.nested = false;
+
kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
} else {
+ vcpu->arch.exception.nested = is_fred_enabled(vcpu);
+
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
exception */
@@ -933,7 +943,8 @@ static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
}
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
- bool has_error_code, u32 error_code, u64 event_data)
+ bool has_error_code, u32 error_code, bool nested,
+ u64 event_data)
{
/*
@@ -958,6 +969,7 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = false;
vcpu->arch.exception.payload = 0;
+ vcpu->arch.exception.nested = nested;
vcpu->arch.exception.event_data = event_data;
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 578fea05ff18..992e73ee2ec5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -134,6 +134,7 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.nested = false;
vcpu->arch.exception_vmexit.pending = false;
}
--
2.46.2
>diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>index b9b82aaea9a3..3830084b569b 100644
>--- a/arch/x86/include/asm/kvm_host.h
>+++ b/arch/x86/include/asm/kvm_host.h
>@@ -736,6 +736,7 @@ struct kvm_queued_exception {
> u32 error_code;
> unsigned long payload;
> bool has_payload;
>+ bool nested;
> u64 event_data;
how "nested" is migrated in live migration?
> };
[..]
>
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index d81144bd648f..03f42b218554 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -1910,8 +1910,11 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
> vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> vmx->vcpu.arch.event_exit_inst_len);
> intr_info |= INTR_TYPE_SOFT_EXCEPTION;
>- } else
>+ } else {
> intr_info |= INTR_TYPE_HARD_EXCEPTION;
>+ if (ex->nested)
>+ intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
How about moving the is_fred_enabled() check from kvm_multiple_exception() to here? I.e.,
if (ex->nested && is_fred_enabled(vcpu))
intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
It is slightly clearer because FRED details don't bleed into kvm_multiple_exception().
>+ }
>
> vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
>
>@@ -7290,6 +7293,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
> kvm_requeue_exception(vcpu, vector,
> idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
> error_code,
>+ idt_vectoring_info & VECTORING_INFO_NESTED_EXCEPTION_MASK,
> event_data);
> break;
> }
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 7a55c1eb5297..8546629166e9 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -874,6 +874,11 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
> vcpu->arch.exception.pending = true;
> vcpu->arch.exception.injected = false;
>
>+ vcpu->arch.exception.nested = vcpu->arch.exception.nested ||
>+ (is_fred_enabled(vcpu) &&
>+ (vcpu->arch.nmi_injected ||
>+ vcpu->arch.interrupt.injected));
>+
> vcpu->arch.exception.has_error_code = has_error;
> vcpu->arch.exception.vector = nr;
> vcpu->arch.exception.error_code = error_code;
>@@ -903,8 +908,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
> vcpu->arch.exception.injected = false;
> vcpu->arch.exception.pending = false;
>
>+ /* #DF is NOT a nested event, per its definition. */
>+ vcpu->arch.exception.nested = false;
>+
> kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
> } else {
>+ vcpu->arch.exception.nested = is_fred_enabled(vcpu);
>+
> /* replace previous exception with a new one in a hope
> that instruction re-execution will regenerate lost
> exception */
On 10/23/2024 11:24 PM, Chao Gao wrote:
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index b9b82aaea9a3..3830084b569b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -736,6 +736,7 @@ struct kvm_queued_exception {
>> u32 error_code;
>> unsigned long payload;
>> bool has_payload;
>> + bool nested;
>> u64 event_data;
>
> how "nested" is migrated in live migration?
Damn, I forgot it!
Looks like we need to add it to kvm_vcpu_ioctl_x86_{get,set}_vcpu_events(),
but the real question is how to add it to struct kvm_vcpu_events.
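
One option -- purely a sketch, the flag bit and field name below are made
up and not existing UAPI, and only the tail of the struct is shown -- is to
claim one of the reserved bytes and guard it with a new flags bit:

  /* Hypothetical sketch only: the flag value and field name are invented. */
  #define KVM_VCPUEVENT_VALID_NESTED_EXC	0x00000040

  struct kvm_vcpu_events {
  	...
  	__u8 reserved[25];		/* one reserved byte claimed below */
  	__u8 exception_is_nested;	/* valid iff VALID_NESTED_EXC is set */
  	__u8 exception_has_payload;
  	__u64 exception_payload;
  };

kvm_vcpu_ioctl_x86_get_vcpu_events() would then copy
vcpu->arch.exception.nested into the new byte and set the flag, and the set
side would only honor the byte when the flag is present, so existing
userspace keeps working.
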
>
>> };
>
> [..]
>
>>
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index d81144bd648f..03f42b218554 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -1910,8 +1910,11 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
>> vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
>> vmx->vcpu.arch.event_exit_inst_len);
>> intr_info |= INTR_TYPE_SOFT_EXCEPTION;
>> - } else
>> + } else {
>> intr_info |= INTR_TYPE_HARD_EXCEPTION;
>> + if (ex->nested)
>> + intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>
> How about moving the is_fred_enabled() check from kvm_multiple_exception() to here? I.e.,
>
> if (ex->nested && is_fred_enabled(vcpu))
> intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>
> It is slightly clearer because FRED details don't bleed into kvm_multiple_exception().
But FRED is all about events, including exception/interrupt/trap/...

Logically, VMX nested exception only works when FRED is enabled; see how
it is set in two places in kvm_multiple_exception().
>
>> + }
>>
>> vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
>>
>> @@ -7290,6 +7293,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
>> kvm_requeue_exception(vcpu, vector,
>> idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
>> error_code,
>> + idt_vectoring_info & VECTORING_INFO_NESTED_EXCEPTION_MASK,
>> event_data);
>> break;
>> }
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 7a55c1eb5297..8546629166e9 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -874,6 +874,11 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
>> vcpu->arch.exception.pending = true;
>> vcpu->arch.exception.injected = false;
>>
>> + vcpu->arch.exception.nested = vcpu->arch.exception.nested ||
>> + (is_fred_enabled(vcpu) &&
>> + (vcpu->arch.nmi_injected ||
>> + vcpu->arch.interrupt.injected));
>> +
>> vcpu->arch.exception.has_error_code = has_error;
>> vcpu->arch.exception.vector = nr;
>> vcpu->arch.exception.error_code = error_code;
>> @@ -903,8 +908,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
>> vcpu->arch.exception.injected = false;
>> vcpu->arch.exception.pending = false;
>>
>> + /* #DF is NOT a nested event, per its definition. */
>> + vcpu->arch.exception.nested = false;
>> +
>> kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
>> } else {
>> + vcpu->arch.exception.nested = is_fred_enabled(vcpu);
>> +
>> /* replace previous exception with a new one in a hope
>> that instruction re-execution will regenerate lost
>> exception */
>
>> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> > index d81144bd648f..03f42b218554 100644
>> > --- a/arch/x86/kvm/vmx/vmx.c
>> > +++ b/arch/x86/kvm/vmx/vmx.c
>> > @@ -1910,8 +1910,11 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
>> > vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
>> > vmx->vcpu.arch.event_exit_inst_len);
>> > intr_info |= INTR_TYPE_SOFT_EXCEPTION;
>> > - } else
>> > + } else {
>> > intr_info |= INTR_TYPE_HARD_EXCEPTION;
>> > + if (ex->nested)
>> > + intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>>
>> How about moving the is_fred_enabled() check from kvm_multiple_exception() to here? I.e.,
>>
>> if (ex->nested && is_fred_enabled(vcpu))
>> intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>>
>> It is slightly clearer because FRED details don't bleed into kvm_multiple_exception().
>
>But FRED is all about events, including exception/interrupt/trap/...
>
>Logically, VMX nested exception only works when FRED is enabled; see how it is
>set in two places in kvm_multiple_exception().
"VMX nested exception only works ..." is what I referred to as "FRED details"
I believe there are several reasons to decouple the "nested exception" concept
from FRED:
1. Readers new to FRED can understand kvm_multiple_exception() without needing
to know FRED details. Readers just need to know nested exceptions are
exceptions encountered during delivering another event (exception/NMI/interrupts).
2. Developing KVM's generic "nested exception" concept can support other vendors.
"nested" becomes a property of an exception. only how nested exceptions are
reported to guests is specific to vendors (i.e., VMX/SVM).
3. This series handles ex->event_data in a similar approach: set it regardless
of FRED enablement and let VMX/SVM code decide to consume or ignore it.
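
Concretely, the split I have in mind looks roughly like this (just a sketch
of where each piece would live; moving the is_fred_enabled() check into the
VMX code is the suggestion, not what the patch currently does):

  /* generic x86 code: "nested" is simply a property of the exception */
  vcpu->arch.exception.nested = vcpu->arch.nmi_injected ||
  				vcpu->arch.interrupt.injected;

  /* VMX code: how it is reported to the guest is FRED-specific */
  if (ex->nested && is_fred_enabled(vcpu))
  	intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
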
On 10/27/2024 11:33 PM, Chao Gao wrote:
>>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>>> index d81144bd648f..03f42b218554 100644
>>>> --- a/arch/x86/kvm/vmx/vmx.c
>>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>>> @@ -1910,8 +1910,11 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
>>>> vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
>>>> vmx->vcpu.arch.event_exit_inst_len);
>>>> intr_info |= INTR_TYPE_SOFT_EXCEPTION;
>>>> - } else
>>>> + } else {
>>>> intr_info |= INTR_TYPE_HARD_EXCEPTION;
>>>> + if (ex->nested)
>>>> + intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>>>
>>> How about moving the is_fred_enabled() check from kvm_multiple_exception() to here? I.e.,
>>>
>>> if (ex->nested && is_fred_enabled(vcpu))
>>> intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
>>>
>>> It is slightly clearer because FRED details don't bleed into kvm_multiple_exception().
>>
>> But FRED is all about events, including exception/interrupt/trap/...
>>
>> Logically, VMX nested exception only works when FRED is enabled; see how it is
>> set in two places in kvm_multiple_exception().
>
> "VMX nested exception only works ..." is what I referred to as "FRED details"
>
> I believe there are several reasons to decouple the "nested exception" concept
> from FRED:
>
> 1. Readers new to FRED can understand kvm_multiple_exception() without needing
> to know FRED details. Readers just need to know nested exceptions are
> exceptions encountered during delivering another event (exception/NMI/interrupts).
>
> 2. Developing KVM's generic "nested exception" concept can support other vendors.
> "nested" becomes a property of an exception. only how nested exceptions are
> reported to guests is specific to vendors (i.e., VMX/SVM).
>
> 3. This series handles ex->event_data in a similar approach: set it regardless
> of FRED enablement and let VMX/SVM code decide to consume or ignore it.
This is a nice way to look at the nature of nested exceptions, and I have
made the change for the next iteration.
Thanks!
Xin