Add 3 new tracepoints for nested VM exits, intended to capture extra
information and give insight into nested guest behavior.

The new tracepoints are:

- kvm_nested_msr
- kvm_nested_hypercall

These tracepoints capture extra register state, making it possible to
tell which MSR was accessed or which hypercall was made.

- kvm_nested_page_fault

This tracepoint captures extra info about which host page fault
error code caused the nested page fault.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
arch/x86/kvm/svm/nested.c | 22 +++++++++++
arch/x86/kvm/trace.h | 82 +++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/vmx/nested.c | 27 +++++++++++++
arch/x86/kvm/x86.c | 3 ++
4 files changed, 131 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6f704c1037e51..2020307481553 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -38,6 +38,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
+ u64 host_error_code = vmcb->control.exit_info_1;
+
if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
@@ -48,11 +50,15 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
vmcb->control.exit_code_hi = 0;
vmcb->control.exit_info_1 = (1ULL << 32);
vmcb->control.exit_info_2 = fault->address;
+ host_error_code = 0;
}
vmcb->control.exit_info_1 &= ~0xffffffffULL;
vmcb->control.exit_info_1 |= fault->error_code;
+ trace_kvm_nested_page_fault(fault->address, host_error_code,
+ fault->error_code);
+
nested_svm_vmexit(svm);
}
@@ -1126,6 +1132,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
+ /* Collect some info about nested VM exits */
+ switch (vmcb12->control.exit_code) {
+ case SVM_EXIT_MSR:
+ trace_kvm_nested_msr(vmcb12->control.exit_info_1 == 1,
+ kvm_rcx_read(vcpu),
+ (vmcb12->save.rax & 0xFFFFFFFFull) |
+ (((u64)kvm_rdx_read(vcpu) << 32)));
+ break;
+ case SVM_EXIT_VMMCALL:
+ trace_kvm_nested_hypercall(vmcb12->save.rax,
+ kvm_rbx_read(vcpu),
+ kvm_rcx_read(vcpu),
+ kvm_rdx_read(vcpu));
+ break;
+ }
+
kvm_vcpu_unmap(vcpu, &map, true);
nested_svm_transition_tlb_flush(vcpu);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 5a5b7757e8456..6074b4f85d5e2 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -613,7 +613,7 @@ TRACE_EVENT(kvm_pv_eoi,
);
/*
- * Tracepoint for nested VMRUN
+ * Tracepoint for nested VMRUN/VMENTER
*/
TRACE_EVENT(kvm_nested_vmenter,
TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
@@ -746,8 +746,84 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
TP_printk("rip: 0x%016llx", __entry->rip)
);
+
/*
- * Tracepoint for nested #vmexit because of interrupt pending
+ * Tracepoint for nested guest MSR access.
+ */
+TRACE_EVENT(kvm_nested_msr,
+ TP_PROTO(bool write, u32 ecx, u64 data),
+ TP_ARGS(write, ecx, data),
+
+ TP_STRUCT__entry(
+ __field( bool, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data)
+);
+
+/*
+ * Tracepoint for nested hypercalls, capturing generic info about the
+ * hypercall
+ */
+
+TRACE_EVENT(kvm_nested_hypercall,
+ TP_PROTO(u64 rax, u64 rbx, u64 rcx, u64 rdx),
+ TP_ARGS(rax, rbx, rcx, rdx),
+
+ TP_STRUCT__entry(
+ __field( u64, rax )
+ __field( u64, rbx )
+ __field( u64, rcx )
+ __field( u64, rdx )
+ ),
+
+ TP_fast_assign(
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ ),
+
+ TP_printk("rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx",
+ __entry->rax, __entry->rbx, __entry->rcx, __entry->rdx)
+);
+
+
+TRACE_EVENT(kvm_nested_page_fault,
+ TP_PROTO(u64 gpa, u64 host_error_code, u64 guest_error_code),
+ TP_ARGS(gpa, host_error_code, guest_error_code),
+
+ TP_STRUCT__entry(
+ __field( u64, gpa )
+ __field( u64, host_error_code )
+ __field( u64, guest_error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ __entry->host_error_code = host_error_code;
+ __entry->guest_error_code = guest_error_code;
+ ),
+
+ TP_printk("gpa 0x%llx host err 0x%llx guest err 0x%llx",
+ __entry->gpa,
+ __entry->host_error_code,
+ __entry->guest_error_code)
+);
+
+
+/*
+ * Tracepoint for invlpga
*/
TRACE_EVENT(kvm_invlpga,
TP_PROTO(__u64 rip, unsigned int asid, u64 address),
@@ -770,7 +846,7 @@ TRACE_EVENT(kvm_invlpga,
);
/*
- * Tracepoint for nested #vmexit because of interrupt pending
+ * Tracepoint for skinit
*/
TRACE_EVENT(kvm_skinit,
TP_PROTO(__u64 rip, __u32 slb),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2392a7ef254df..3881a02694fc2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -454,6 +454,16 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
*/
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
fault->address);
+
+ /*
+ * vmx_get_exit_qual() returns the original exit qualification,
+ * before it was overridden with exit qualification that
+ * is about to be injected to the guest.
+ */
+
+ trace_kvm_nested_page_fault(fault->address,
+ vmx_get_exit_qual(vcpu),
+ exit_qualification);
}
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
@@ -4985,6 +4995,23 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
+ switch ((u16)vmcs12->vm_exit_reason) {
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ trace_kvm_nested_msr(vmcs12->vm_exit_reason == EXIT_REASON_MSR_WRITE,
+ kvm_rcx_read(vcpu),
+ (kvm_rax_read(vcpu) & 0xFFFFFFFFull) |
+ (((u64)kvm_rdx_read(vcpu)) << 32));
+ break;
+ case EXIT_REASON_VMCALL:
+ trace_kvm_nested_hypercall(kvm_rax_read(vcpu),
+ kvm_rbx_read(vcpu),
+ kvm_rcx_read(vcpu),
+ kvm_rdx_read(vcpu));
+ break;
+
+ }
+
load_vmcs12_host_state(vcpu, vmcs12);
return;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f72e5d89e942d..cb01cf2ad6ac9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -14032,6 +14032,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_hypercall);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
--
2.26.3
On 9/10/24 22:03, Maxim Levitsky wrote:
> Add 3 new tracepoints for nested VM exits which are intended
> to capture extra information to gain insights about the nested guest
> behavior.
>
> The new tracepoints are:
>
> - kvm_nested_msr
> - kvm_nested_hypercall
>
> These tracepoints capture extra register state to be able to know
> which MSR or which hypercall was done.
>
> - kvm_nested_page_fault
>
> This tracepoint allows to capture extra info about which host pagefault
> error code caused the nested page fault.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
> arch/x86/kvm/svm/nested.c | 22 +++++++++++
> arch/x86/kvm/trace.h | 82 +++++++++++++++++++++++++++++++++++++--
> arch/x86/kvm/vmx/nested.c | 27 +++++++++++++
> arch/x86/kvm/x86.c | 3 ++
> 4 files changed, 131 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 6f704c1037e51..2020307481553 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -38,6 +38,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> struct vmcb *vmcb = svm->vmcb;
> + u64 host_error_code = vmcb->control.exit_info_1;
> +
>
> if (vmcb->control.exit_code != SVM_EXIT_NPF) {
> /*
> @@ -48,11 +50,15 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
> vmcb->control.exit_code_hi = 0;
> vmcb->control.exit_info_1 = (1ULL << 32);
> vmcb->control.exit_info_2 = fault->address;
> + host_error_code = 0;
> }
>
> vmcb->control.exit_info_1 &= ~0xffffffffULL;
> vmcb->control.exit_info_1 |= fault->error_code;
>
> + trace_kvm_nested_page_fault(fault->address, host_error_code,
> + fault->error_code);
> +
I disagree with Sean about trace_kvm_nested_page_fault. It's a useful
addition and it is easier to understand what's happening with a
dedicated tracepoint (especially on VMX).
Tracepoints are not an exact science and they aren't entirely kernel API.
At least they can just go away at any time (changing them is a lot
more tricky, but their presence is not guaranteed). The one below has
the slight ugliness of having to do some computation in
nested_svm_vmexit(); this one should go in.
> nested_svm_vmexit(svm);
> }
>
> @@ -1126,6 +1132,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> vmcb12->control.exit_int_info_err,
> KVM_ISA_SVM);
>
> + /* Collect some info about nested VM exits */
> + switch (vmcb12->control.exit_code) {
> + case SVM_EXIT_MSR:
> + trace_kvm_nested_msr(vmcb12->control.exit_info_1 == 1,
> + kvm_rcx_read(vcpu),
> + (vmcb12->save.rax & 0xFFFFFFFFull) |
> + (((u64)kvm_rdx_read(vcpu) << 32)));
> + break;
> + case SVM_EXIT_VMMCALL:
> + trace_kvm_nested_hypercall(vmcb12->save.rax,
> + kvm_rbx_read(vcpu),
> + kvm_rcx_read(vcpu),
> + kvm_rdx_read(vcpu));
> + break;
Here I probably would have preferred an unconditional tracepoint giving
RAX/RBX/RCX/RDX after a nested vmexit. This is not exactly what Sean
wanted but perhaps it strikes a middle ground? I know you wrote this
for a debugging tool, do you really need to have everything in a single
tracepoint, or can you correlate the existing exit tracepoint with this
hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
Paolo
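For concreteness, a minimal sketch of what the hypothetical
trace_kvm_nested_exit_regs tracepoint could look like, assuming it also
carries the L1 exit reason so it can be filtered from userspace; the event
name, arguments and layout below are illustrative only and not part of the
patch:

/*
 * Sketch only: one tracepoint for all nested (L2 -> L0 -> L1) VM exits,
 * recording the exit reason plus the GPRs most exit handlers decode.
 */
TRACE_EVENT(kvm_nested_exit_regs,
	TP_PROTO(u32 exit_reason, u64 rax, u64 rbx, u64 rcx, u64 rdx),
	TP_ARGS(exit_reason, rax, rbx, rcx, rdx),

	TP_STRUCT__entry(
		__field( u32, exit_reason )
		__field( u64, rax )
		__field( u64, rbx )
		__field( u64, rcx )
		__field( u64, rdx )
	),

	TP_fast_assign(
		__entry->exit_reason = exit_reason;
		__entry->rax = rax;
		__entry->rbx = rbx;
		__entry->rcx = rcx;
		__entry->rdx = rdx;
	),

	TP_printk("reason %u rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx",
		  __entry->exit_reason, __entry->rax, __entry->rbx,
		  __entry->rcx, __entry->rdx)
);

Because the exit reason is a field, userspace can attach an event filter to
each enabled instance (e.g. one filtered on the MSR exit code and one on
VMMCALL/VMCALL), which is the usage Maxim describes in the reply below.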
On Thu, 2024-12-19 at 18:33 +0100, Paolo Bonzini wrote:
> On 9/10/24 22:03, Maxim Levitsky wrote:
> > Add 3 new tracepoints for nested VM exits which are intended
> > to capture extra information to gain insights about the nested guest
> > behavior.
> >
> > The new tracepoints are:
> >
> > - kvm_nested_msr
> > - kvm_nested_hypercall
> >
> > These tracepoints capture extra register state to be able to know
> > which MSR or which hypercall was done.
> >
> > - kvm_nested_page_fault
> >
> > This tracepoint allows to capture extra info about which host pagefault
> > error code caused the nested page fault.
> >
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> > arch/x86/kvm/svm/nested.c | 22 +++++++++++
> > arch/x86/kvm/trace.h | 82 +++++++++++++++++++++++++++++++++++++--
> > arch/x86/kvm/vmx/nested.c | 27 +++++++++++++
> > arch/x86/kvm/x86.c | 3 ++
> > 4 files changed, 131 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 6f704c1037e51..2020307481553 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -38,6 +38,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
> > {
> > struct vcpu_svm *svm = to_svm(vcpu);
> > struct vmcb *vmcb = svm->vmcb;
> > + u64 host_error_code = vmcb->control.exit_info_1;
> > +
> >
> > if (vmcb->control.exit_code != SVM_EXIT_NPF) {
> > /*
> > @@ -48,11 +50,15 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
> > vmcb->control.exit_code_hi = 0;
> > vmcb->control.exit_info_1 = (1ULL << 32);
> > vmcb->control.exit_info_2 = fault->address;
> > + host_error_code = 0;
> > }
> >
> > vmcb->control.exit_info_1 &= ~0xffffffffULL;
> > vmcb->control.exit_info_1 |= fault->error_code;
> >
> > + trace_kvm_nested_page_fault(fault->address, host_error_code,
> > + fault->error_code);
> > +
>
> I disagree with Sean about trace_kvm_nested_page_fault. It's a useful
> addition and it is easier to understand what's happening with a
> dedicated tracepoint (especially on VMX).
>
> Tracepoint are not an exact science and they aren't entirely kernel API.
> At least they can just go away at any time (changing them is a lot
> more tricky, but their presence is not guaranteed). The one below has
> the slight ugliness of having to do some computation in
> nested_svm_vmexit(), this one should go in.
>
> > nested_svm_vmexit(svm);
> > }
> >
> > @@ -1126,6 +1132,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > vmcb12->control.exit_int_info_err,
> > KVM_ISA_SVM);
> >
> > + /* Collect some info about nested VM exits */
> > + switch (vmcb12->control.exit_code) {
> > + case SVM_EXIT_MSR:
> > + trace_kvm_nested_msr(vmcb12->control.exit_info_1 == 1,
> > + kvm_rcx_read(vcpu),
> > + (vmcb12->save.rax & 0xFFFFFFFFull) |
> > + (((u64)kvm_rdx_read(vcpu) << 32)));
> > + break;
> > + case SVM_EXIT_VMMCALL:
> > + trace_kvm_nested_hypercall(vmcb12->save.rax,
> > + kvm_rbx_read(vcpu),
> > + kvm_rcx_read(vcpu),
> > + kvm_rdx_read(vcpu));
> > + break;
>
> Here I probably would have preferred an unconditional tracepoint giving
> RAX/RBX/RCX/RDX after a nested vmexit. This is not exactly what Sean
> wanted but perhaps it strikes a middle ground? I know you wrote this
> for a debugging tool, do you really need to have everything in a single
> tracepoint, or can you correlate the existing exit tracepoint with this
> hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
Hi!
If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number argument, then
I can enable this new tracepoint twice with a different filter (vm_exit_num == msr and vm_exit_num == vmcall),
and each instance will count the events that I need.
So this can work.
Thanks!
Best regards,
Maxim Levitsky
>
> Paolo
>
On 12/19/24 18:49, Maxim Levitsky wrote:
>> Here I probably would have preferred an unconditional tracepoint giving
>> RAX/RBX/RCX/RDX after a nested vmexit. This is not exactly what Sean
>> wanted but perhaps it strikes a middle ground? I know you wrote this
>> for a debugging tool, do you really need to have everything in a single
>> tracepoint, or can you correlate the existing exit tracepoint with this
>> hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
>
> Hi!
>
> If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number argument, then
> I can enable this new tracepoint twice with a different filter (vm_exit_num == msr and vm_exit_num == vmcall),
> and each instance will count the events that I need.
>
> So this can work.

Ok, thanks. On one hand it may make sense to have trace_kvm_exit_regs and
trace_kvm_nested_exit_regs (you can even extend the TRACE_EVENT_KVM_EXIT
macro to generate both the exit and the exit_regs tracepoint). On the other
hand it seems to me that this new tracepoint is kinda reinventing the wheel;
your patch adding nested equivalents of trace_kvm_hypercall and
trace_kvm_msr seems more obvious to me.

I see Sean's point in not wanting one-off tracepoints, on the other hand
there is value in having similar tracepoints for the L1->L0 and L2->L0
cases. I'll let him choose between the two possibilities (a new *_exit_regs
pair, or just apply this patch) but I think there should be one of these
two.

Paolo
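Paolo's aside about generating both tracepoints could also be expressed with
the stock tracing macros rather than by extending TRACE_EVENT_KVM_EXIT; a
rough sketch, with hypothetical event names, of one event class backing both
an L1->L0 and an L2->L0 variant:

/*
 * Sketch only: a shared event class so that kvm_exit_regs and
 * kvm_nested_exit_regs report the same register snapshot.  The class and
 * event names are hypothetical.
 */
DECLARE_EVENT_CLASS(kvm_exit_regs_class,
	TP_PROTO(u64 rax, u64 rbx, u64 rcx, u64 rdx),
	TP_ARGS(rax, rbx, rcx, rdx),

	TP_STRUCT__entry(
		__field( u64, rax )
		__field( u64, rbx )
		__field( u64, rcx )
		__field( u64, rdx )
	),

	TP_fast_assign(
		__entry->rax = rax;
		__entry->rbx = rbx;
		__entry->rcx = rcx;
		__entry->rdx = rdx;
	),

	TP_printk("rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx",
		  __entry->rax, __entry->rbx, __entry->rcx, __entry->rdx)
);

DEFINE_EVENT(kvm_exit_regs_class, kvm_exit_regs,
	TP_PROTO(u64 rax, u64 rbx, u64 rcx, u64 rdx),
	TP_ARGS(rax, rbx, rcx, rdx));

DEFINE_EVENT(kvm_exit_regs_class, kvm_nested_exit_regs,
	TP_PROTO(u64 rax, u64 rbx, u64 rcx, u64 rdx),
	TP_ARGS(rax, rbx, rcx, rdx));

Both events would then show up separately in tracefs, so the L1->L0 and
L2->L0 cases stay distinguishable without duplicating the field definitions.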
On Thu, Dec 19, 2024, Paolo Bonzini wrote:
> On 12/19/24 18:49, Maxim Levitsky wrote:
> > > Here I probably would have preferred an unconditional tracepoint giving
> > > RAX/RBX/RCX/RDX after a nested vmexit. This is not exactly what Sean
> > > wanted but perhaps it strikes a middle ground? I know you wrote this
> > > for a debugging tool, do you really need to have everything in a single
> > > tracepoint, or can you correlate the existing exit tracepoint with this
> > > hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
> >
> > Hi!
> >
> > If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number
> > argument, then I can enable this new tracepoint twice with a different
> > filter (vm_exit_num == msr and vm_exit_num == vmcall), and each
> > instance will count the events that I need.
> >
> > So this can work.
>
> Ok, thanks. On one hand it may make sense to have trace_kvm_exit_regs and
> trace_kvm_nested_exit_regs (you can even extend the TRACE_EVENT_KVM_EXIT
> macro to generate both the exit and the exit_regs tracepoint). On the other
> hand it seems to me that this new tracepoint is kinda reinventing the wheel;
> your patch adding nested equivalents of trace_kvm_hypercall and
> trace_kvm_msr seems more obvious to me.
>
> I see Sean's point in not wanting one-off tracepoints, on the other hand
> there is value in having similar tracepoints for the L1->L0 and L2->L0
> cases.

I don't understand why we want two (or three, or five) tracepoints for the
same thing. I want to go the opposite direction and (a) delete
kvm_nested_vmexit and then (b) rename kvm_nested_vmexit_inject =>
kvm_nested_vmexit so that it pairs with kvm_nested_vmenter.

Similarly, having kvm_nested_intr_vmexit is asinine when
kvm_nested_vmexit_inject captures *more* information about the IRQ itself.

I don't see the point of trace_kvm_nested_exit_regs. Except for L1 vs. L2,
it's redundant. kvm_nested_vmexit_inject and kvm_nested_vmenter are useful
because they capture novel information.

> I'll let him choose between the two possibilities (a new *_exit_regs
> pair, or just apply this patch) but I think there should be one of these
> two.

Anything but a pair. Why can't we capture L1 vs. L2 in the tracepoints and
call it a day?
On Tue, Sep 10, 2024, Maxim Levitsky wrote:
> Add 3 new tracepoints for nested VM exits which are intended
> to capture extra information to gain insights about the nested guest
> behavior.
>
> The new tracepoints are:
>
> - kvm_nested_msr
> - kvm_nested_hypercall
I 100% agree that not having register state in the exit tracepoints is obnoxious,
but I don't think we should add one-off tracepoints for the most annoying cases.
I would much prefer to figure out a way to capture register state in kvm_entry
and kvm_exit. E.g. I've lost track of the number of times I've observed an MSR
exit without having trace_kvm_msr enabled.
One idea would be to capture E{A,B,C,D}X, which would cover MSRs, CPUID, and
most hypercalls. And then we might even be able to drop the dedicated MSR and
CPUID tracepoints (not sure if that's a good idea).
Side topic, arch/s390/kvm/trace.h has the concept of COMMON information that is
captured for multiple tracepoints. I haven't looked closely, but I gotta imagine
we can/should use a similar approach for x86.
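The s390 scheme boils down to a handful of shared field/assign/printk macros
that individual TRACE_EVENTs drop into their definitions; a rough sketch of
what an x86 equivalent might look like, with invented macro names and a
register set chosen purely for illustration:

/*
 * Sketch only: s390-style "common" helpers so that several x86 KVM
 * tracepoints can capture the same register snapshot without duplicating
 * the field lists.  All macro names here are hypothetical.
 */
#define KVM_X86_TP_REGS_FIELDS			\
	__field( u64, rax )			\
	__field( u64, rbx )			\
	__field( u64, rcx )			\
	__field( u64, rdx )

#define KVM_X86_TP_REGS_ASSIGN(vcpu) do {	\
	__entry->rax = kvm_rax_read(vcpu);	\
	__entry->rbx = kvm_rbx_read(vcpu);	\
	__entry->rcx = kvm_rcx_read(vcpu);	\
	__entry->rdx = kvm_rdx_read(vcpu);	\
} while (0)

#define KVM_X86_TP_REGS_FMT	"rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx"
#define KVM_X86_TP_REGS_ARGS	__entry->rax, __entry->rbx, __entry->rcx, __entry->rdx

Each tracepoint that wants the snapshot would then use these in its
TP_STRUCT__entry(), TP_fast_assign() and TP_printk() blocks, which is
roughly how the s390 COMMON macros are consumed.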
> These tracepoints capture extra register state to be able to know
> which MSR or which hypercall was done.
>
> - kvm_nested_page_fault
>
> This tracepoint allows to capture extra info about which host pagefault
> error code caused the nested page fault.
The host error code, a.k.a. qualification info, is readily available in the
kvm_exit (or nested variant) tracepoint. I don't think letting userspace skip a
tracepoint that's probably already enabled is worth the extra code to support
this tracepoint. The nested_svm_inject_npf_exit() code in particular is wonky,
and I think it's a good example of why userspace "needs" trace_kvm_exit, e.g. to
observe that a nested stage-2 page fault didn't originate from a hardware stage-2
fault.