From: Andrew Cooper <andrew.cooper3@citrix.com>
All #DB exceptions result in an update of %dr6, but this isn't captured in
Xen's handling.
PV guests generally work by modifying %dr6 before raising #DB, whereas HVM
guests do nothing and have a single-step special case in the lowest levels of
{vmx,svm}_inject_event(). All of this is buggy, but in particular, task
switches with the trace flag never end up signalling BT in %dr6.
To begin resolving this issue, add a new pending_dbg field to x86_event
(unioned with cr2 to avoid taking any extra space), and introduce
{pv,hvm}_inject_debug_exn() helpers to replace the current callers using
{pv,hvm}_inject_hw_exception().
A key property is that pending_dbg is taken with positive polarity to deal
with RTM sensibly. Most callers pass in a constant, but callers passing in a
hardware %dr6 value need to xor the value with X86_DR6_DEFAULT to flip the
polarity of RTM and reserved fields.
For PV guests, move the ad-hoc updating of %dr6 into pv_inject_event(). This
in principle breaks the handing of RTM in do_debug(), but PV guests can't
actually enable MSR_DEBUGCTL.RTM yet, so this doesn't matter in practice.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
[ jinoh: Rebase onto staging, forward declare struct vcpu ]
Signed-off-by: Jinoh Kang <jinoh.kang.kr@gmail.com>
---
CC: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wl@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Jun Nakajima <jun.nakajima@intel.com>
CC: Kevin Tian <kevin.tian@intel.com>
CC: Tim Deegan <tim@xen.org>
CC: Tamas K Lengyel <tamas@tklengyel.com>
CC: Alexandru Isaila <aisaila@bitdefender.com>
CC: Petre Pircalabu <ppircalabu@bitdefender.com>
v1 -> v2: [S-o-b fixes. More details below.]
- Update DR6 for gdbsx when trapped in PV guest kernel mode
---
xen/arch/x86/hvm/emulate.c | 3 ++-
xen/arch/x86/hvm/hvm.c | 2 +-
xen/arch/x86/hvm/svm/svm.c | 9 ++++++---
xen/arch/x86/hvm/vmx/vmx.c | 13 ++++++++-----
xen/arch/x86/include/asm/debugreg.h | 3 +++
xen/arch/x86/include/asm/domain.h | 12 ++++++++++++
xen/arch/x86/include/asm/hvm/hvm.h | 15 ++++++++++++++-
xen/arch/x86/mm/shadow/multi.c | 5 +++--
xen/arch/x86/pv/emul-priv-op.c | 11 +++++------
xen/arch/x86/pv/emulate.c | 6 ++----
xen/arch/x86/pv/ro-page-fault.c | 3 ++-
xen/arch/x86/pv/traps.c | 16 ++++++++++++----
xen/arch/x86/traps.c | 10 +++++-----
xen/arch/x86/x86_emulate/x86_emulate.h | 5 ++++-
14 files changed, 79 insertions(+), 34 deletions(-)
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 9b6e4c8bc6..129403ad90 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -16,6 +16,7 @@
#include <xen/paging.h>
#include <xen/trace.h>
#include <xen/vm_event.h>
+#include <asm/debugreg.h>
#include <asm/event.h>
#include <asm/i387.h>
#include <asm/xstate.h>
@@ -2673,7 +2674,7 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
}
if ( hvmemul_ctxt->ctxt.retire.singlestep )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BS);
new_intr_shadow = hvmemul_ctxt->intr_shadow;
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index c726947ccb..f795ef9bc7 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3234,7 +3234,7 @@ void hvm_task_switch(
}
if ( (tss.trace & 1) && !exn_raised )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BT);
out:
hvm_unmap_entry(optss_desc);
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index beb076ea8d..3d0402cb10 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -96,7 +96,7 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
curr->arch.hvm.svm.vmcb->int_stat.intr_shadow = 0;
if ( regs->eflags & X86_EFLAGS_TF )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BS);
}
static void cf_check svm_cpu_down(void)
@@ -2755,7 +2755,10 @@ void svm_vmexit_handler(void)
goto unexpected_exit_type;
if ( !rc )
hvm_inject_exception(X86_EXC_DB,
- trap_type, insn_len, X86_EVENT_NO_EC);
+ trap_type, insn_len, X86_EVENT_NO_EC,
+ exit_reason == VMEXIT_ICEBP ? 0 :
+ /* #DB - Hardware already updated dr6. */
+ vmcb_get_dr6(vmcb) ^ X86_DR6_DEFAULT);
}
else
domain_pause_for_debugger();
@@ -2785,7 +2788,7 @@ void svm_vmexit_handler(void)
if ( !rc )
hvm_inject_exception(X86_EXC_BP,
X86_EVENTTYPE_SW_EXCEPTION,
- insn_len, X86_EVENT_NO_EC);
+ insn_len, X86_EVENT_NO_EC, 0 /* N/A */);
}
break;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 1edc7f1e91..9c92d2be92 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3068,7 +3068,7 @@ void update_guest_eip(void)
}
if ( regs->eflags & X86_EFLAGS_TF )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BS);
}
static void cf_check vmx_fpu_dirty_intercept(void)
@@ -3911,7 +3911,7 @@ static int vmx_handle_eoi_write(void)
* It is the callers responsibility to ensure that this function is only used
* in the context of an appropriate vmexit.
*/
-static void vmx_propagate_intr(unsigned long intr)
+static void vmx_propagate_intr(unsigned long intr, unsigned long pending_dbg)
{
struct x86_event event = {
.vector = MASK_EXTR(intr, INTR_INFO_VECTOR_MASK),
@@ -3935,6 +3935,9 @@ static void vmx_propagate_intr(unsigned long intr)
else
event.insn_len = 0;
+ if ( event.vector == X86_EXC_DB )
+ event.pending_dbg = pending_dbg;
+
hvm_inject_event(&event);
}
@@ -4300,7 +4303,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
if ( rc < 0 )
goto exit_and_crash;
if ( !rc )
- vmx_propagate_intr(intr_info);
+ vmx_propagate_intr(intr_info, exit_qualification);
}
else
domain_pause_for_debugger();
@@ -4321,7 +4324,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
if ( rc < 0 )
goto exit_and_crash;
if ( !rc )
- vmx_propagate_intr(intr_info);
+ vmx_propagate_intr(intr_info, 0 /* N/A */);
}
else
{
@@ -4361,7 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
break;
case X86_EXC_AC:
HVMTRACE_1D(TRAP, vector);
- vmx_propagate_intr(intr_info);
+ vmx_propagate_intr(intr_info, 0 /* N/A */);
break;
case X86_EXC_NMI:
if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
diff --git a/xen/arch/x86/include/asm/debugreg.h b/xen/arch/x86/include/asm/debugreg.h
index 74344555d2..f83b1b96ec 100644
--- a/xen/arch/x86/include/asm/debugreg.h
+++ b/xen/arch/x86/include/asm/debugreg.h
@@ -75,6 +75,9 @@
asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \
__val; \
})
+
+struct vcpu;
+
long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
void activate_debugregs(const struct vcpu *);
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index c2d9fc333b..eba11adf33 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -729,6 +729,18 @@ static inline void pv_inject_hw_exception(unsigned int vector, int errcode)
pv_inject_event(&event);
}
+static inline void pv_inject_debug_exn(unsigned long pending_dbg)
+{
+ struct x86_event event = {
+ .vector = X86_EXC_DB,
+ .type = X86_EVENTTYPE_HW_EXCEPTION,
+ .error_code = X86_EVENT_NO_EC,
+ .pending_dbg = pending_dbg,
+ };
+
+ pv_inject_event(&event);
+}
+
static inline void pv_inject_page_fault(int errcode, unsigned long cr2)
{
const struct x86_event event = {
diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h
index 6d53713fc3..43989f1681 100644
--- a/xen/arch/x86/include/asm/hvm/hvm.h
+++ b/xen/arch/x86/include/asm/hvm/hvm.h
@@ -503,13 +503,14 @@ hvm_get_cpl(struct vcpu *v)
static inline void hvm_inject_exception(
unsigned int vector, unsigned int type,
- unsigned int insn_len, int error_code)
+ unsigned int insn_len, int error_code, unsigned long extra)
{
struct x86_event event = {
.vector = vector,
.type = type,
.insn_len = insn_len,
.error_code = error_code,
+ .cr2 = extra, /* Any union field will do. */
};
hvm_inject_event(&event);
@@ -526,6 +527,18 @@ static inline void hvm_inject_hw_exception(unsigned int vector, int errcode)
hvm_inject_event(&event);
}
+static inline void hvm_inject_debug_exn(unsigned long pending_dbg)
+{
+ struct x86_event event = {
+ .vector = X86_EXC_DB,
+ .type = X86_EVENTTYPE_HW_EXCEPTION,
+ .error_code = X86_EVENT_NO_EC,
+ .pending_dbg = pending_dbg,
+ };
+
+ hvm_inject_event(&event);
+}
+
static inline void hvm_inject_page_fault(int errcode, unsigned long cr2)
{
struct x86_event event = {
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index cf74fdf5dd..6056626912 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -15,6 +15,7 @@
#include <xen/perfc.h>
#include <xen/domain_page.h>
#include <xen/iocap.h>
+#include <asm/debugreg.h>
#include <xsm/xsm.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -2788,7 +2789,7 @@ static int cf_check sh_page_fault(
#endif
if ( emul_ctxt.ctxt.retire.singlestep )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BS);
#if GUEST_PAGING_LEVELS == 3 /* PAE guest */
/*
@@ -2829,7 +2830,7 @@ static int cf_check sh_page_fault(
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
if ( emul_ctxt.ctxt.retire.singlestep )
- hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ hvm_inject_debug_exn(X86_DR6_BS);
break; /* Don't emulate again if we failed! */
}
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 142bc4818c..72d0514e74 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -1360,12 +1360,11 @@ int pv_emulate_privileged_op(struct cpu_user_regs *regs)
case X86EMUL_OKAY:
if ( ctxt.ctxt.retire.singlestep )
ctxt.bpmatch |= DR_STEP;
- if ( ctxt.bpmatch )
- {
- curr->arch.dr6 |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
- if ( !(curr->arch.pv.trap_bounce.flags & TBF_EXCEPTION) )
- pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
- }
+
+ if ( ctxt.bpmatch &&
+ !(curr->arch.pv.trap_bounce.flags & TBF_EXCEPTION) )
+ pv_inject_debug_exn(ctxt.bpmatch);
+
/* fall through */
case X86EMUL_RETRY:
return EXCRET_fault_fixed;
diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c
index e7a1c0a2cc..aa8af96c30 100644
--- a/xen/arch/x86/pv/emulate.c
+++ b/xen/arch/x86/pv/emulate.c
@@ -71,11 +71,9 @@ void pv_emul_instruction_done(struct cpu_user_regs *regs, unsigned long rip)
{
regs->rip = rip;
regs->eflags &= ~X86_EFLAGS_RF;
+
if ( regs->eflags & X86_EFLAGS_TF )
- {
- current->arch.dr6 |= DR_STEP | DR_STATUS_RESERVED_ONE;
- pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
- }
+ pv_inject_debug_exn(X86_DR6_BS);
}
uint64_t pv_get_reg(struct vcpu *v, unsigned int reg)
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index cad28ef928..50c37fbd25 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -9,6 +9,7 @@
*/
#include <asm/pv/trace.h>
+#include <asm/debugreg.h>
#include <asm/shadow.h>
#include "emulate.h"
@@ -390,7 +391,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
/* Fallthrough */
case X86EMUL_OKAY:
if ( ctxt.retire.singlestep )
- pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ pv_inject_debug_exn(X86_DR6_BS);
/* Fallthrough */
case X86EMUL_RETRY:
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index 74f333da7e..4f5641a47c 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -13,6 +13,7 @@
#include <xen/softirq.h>
#include <asm/pv/trace.h>
+#include <asm/debugreg.h>
#include <asm/shared.h>
#include <asm/traps.h>
#include <irq_vectors.h>
@@ -50,9 +51,9 @@ void pv_inject_event(const struct x86_event *event)
tb->cs = ti->cs;
tb->eip = ti->address;
- if ( event->type == X86_EVENTTYPE_HW_EXCEPTION &&
- vector == X86_EXC_PF )
+ switch ( vector | -(event->type == X86_EVENTTYPE_SW_INTERRUPT) )
{
+ case X86_EXC_PF:
curr->arch.pv.ctrlreg[2] = event->cr2;
arch_set_cr2(curr, event->cr2);
@@ -62,9 +63,16 @@ void pv_inject_event(const struct x86_event *event)
error_code |= PFEC_user_mode;
trace_pv_page_fault(event->cr2, error_code);
- }
- else
+ break;
+
+ case X86_EXC_DB:
+ curr->arch.dr6 |= event->pending_dbg;
+ /* Fallthrough */
+
+ default:
trace_pv_trap(vector, regs->rip, use_error_code, error_code);
+ break;
+ }
if ( use_error_code )
{
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index a898e1f2d7..e2acfbcb9e 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1989,17 +1989,17 @@ void do_debug(struct cpu_user_regs *regs)
return;
}
- /* Save debug status register where guest OS can peek at it */
- v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
- v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
-
if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
{
+ /* Save debug status register where gdbsx can peek at it */
+ v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
+ v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
+
domain_pause_for_debugger();
return;
}
- pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+ pv_inject_debug_exn(dr6 ^ X86_DR6_DEFAULT);
}
void do_entry_CP(struct cpu_user_regs *regs)
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index 698750267a..e348e3c1d3 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -78,7 +78,10 @@ struct x86_event {
uint8_t type; /* X86_EVENTTYPE_* */
uint8_t insn_len; /* Instruction length */
int32_t error_code; /* X86_EVENT_NO_EC if n/a */
- unsigned long cr2; /* Only for X86_EXC_PF h/w exception */
+ union {
+ unsigned long cr2; /* #PF */
+ unsigned long pending_dbg; /* #DB (new DR6 bits, positive polarity) */
+ };
};
/*
--
2.41.0
On 24.08.2023 17:26, Jinoh Kang wrote:
> @@ -62,9 +63,16 @@ void pv_inject_event(const struct x86_event *event)
> error_code |= PFEC_user_mode;
>
> trace_pv_page_fault(event->cr2, error_code);
> - }
> - else
> + break;
> +
> + case X86_EXC_DB:
> + curr->arch.dr6 |= event->pending_dbg;
> + /* Fallthrough */
I guess I have another question here, perhaps more to Andrew: How come
this is just an OR? Not only with some of the bits having inverted sense
and earlier logic being ...
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1989,17 +1989,17 @@ void do_debug(struct cpu_user_regs *regs)
> return;
> }
>
> - /* Save debug status register where guest OS can peek at it */
> - v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
> - v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
... an OR and an AND, but also with ...
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -78,7 +78,10 @@ struct x86_event {
> uint8_t type; /* X86_EVENTTYPE_* */
> uint8_t insn_len; /* Instruction length */
> int32_t error_code; /* X86_EVENT_NO_EC if n/a */
> - unsigned long cr2; /* Only for X86_EXC_PF h/w exception */
> + union {
> + unsigned long cr2; /* #PF */
> + unsigned long pending_dbg; /* #DB (new DR6 bits, positive polarity) */
... the comment here saying "positive polarity", which I understand
to mean that inverted bits need inverting by the consumer of this
field. If this is solely because none of the inverted bits are
supported for PV, then I guess this wants a comment at the use site
(not the least because it would need adjusting as soon as one such
would become supported).
Jan
On 30/08/2023 2:39 pm, Jan Beulich wrote:
> On 24.08.2023 17:26, Jinoh Kang wrote:
>> @@ -62,9 +63,16 @@ void pv_inject_event(const struct x86_event *event)
>> error_code |= PFEC_user_mode;
>>
>> trace_pv_page_fault(event->cr2, error_code);
>> - }
>> - else
>> + break;
>> +
>> + case X86_EXC_DB:
>> + curr->arch.dr6 |= event->pending_dbg;
>> + /* Fallthrough */
> I guess I have another question here, perhaps more to Andrew: How come
> this is just an OR? Not only with some of the bits having inverted sense
> and earlier logic being ...
>
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -1989,17 +1989,17 @@ void do_debug(struct cpu_user_regs *regs)
>> return;
>> }
>>
>> - /* Save debug status register where guest OS can peek at it */
>> - v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>> - v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
> ... an OR and an AND, but also with ...
>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>> @@ -78,7 +78,10 @@ struct x86_event {
>> uint8_t type; /* X86_EVENTTYPE_* */
>> uint8_t insn_len; /* Instruction length */
>> int32_t error_code; /* X86_EVENT_NO_EC if n/a */
>> - unsigned long cr2; /* Only for X86_EXC_PF h/w exception */
>> + union {
>> + unsigned long cr2; /* #PF */
>> + unsigned long pending_dbg; /* #DB (new DR6 bits, positive polarity) */
As a tangent, since I wrote the original series, there's #NM and
MSR_XFD_ERR which needs to fit into this union for AMX support.
Sadly, the only AMX hardware on the market right now has an errata where
XFD_ERR doesn't behave properly here.
> ... the comment here saying "positive polarity", which I understand
> to mean that inverted bits need inverting by the consumer of this
> field. If this is solely because none of the inverted bits are
> supported for PV, then I guess this wants a comment at the use site
> (not the least because it would need adjusting as soon as one such
> would become supported).
Part of this patch is (or was) introducing pending_dbg with no logical
change, but as I said, I don't think I had the original series split up
quite correctly either.
This field is more than just the inversion. It needs to match the
semantics of the VMCS PENDING_DBG field, which is architectural but
otherwise hidden pipeline state, similar to the segment descriptor
cache. The other necessary property is the (lack of) stickiness of bits
in the pending_dbg field.
All of that said, having talked to some pipeline people recent, I think
pending_dbg needs to be elsewhere. Or perhaps a second copy elsewhere.
Some bits stay pending in pending_dbg across multiple instructions.
This is how we get the MovSS-delays-breakpoints property that is a
security disaster elsewhere.
The problem with this is that we can't get at the pipeline pending_dbg
state for PV guests (where we've only got an architectural #DB to work
with) or for SVM guests (where this state isn't presented in the VMCB
despite existing internally).
~Andrew
On 30.08.2023 16:20, Andrew Cooper wrote:
> On 30/08/2023 2:39 pm, Jan Beulich wrote:
>> On 24.08.2023 17:26, Jinoh Kang wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>>> @@ -78,7 +78,10 @@ struct x86_event {
>>> uint8_t type; /* X86_EVENTTYPE_* */
>>> uint8_t insn_len; /* Instruction length */
>>> int32_t error_code; /* X86_EVENT_NO_EC if n/a */
>>> - unsigned long cr2; /* Only for X86_EXC_PF h/w exception */
>>> + union {
>>> + unsigned long cr2; /* #PF */
>>> + unsigned long pending_dbg; /* #DB (new DR6 bits, positive polarity) */
>
> As a tangent, since I wrote the original series, there's #NM and
> MSR_XFD_ERR which needs to fit into this union for AMX support.
In "x86: XFD enabling" (posted over 2 years ago) I'm getting away
without this quite fine, and I didn't think it's wrong to write the
MSR right from the emulator (using the write_msr() hook).
Jan
On 24.08.2023 17:26, Jinoh Kang wrote:
> From: Andrew Cooper <andrew.cooper3@citrix.com>
>
> All #DB exceptions result in an update of %dr6, but this isn't captured in
> Xen's handling.
>
> PV guests generally work by modifying %dr6 before raising #DB, whereas HVM
> guests do nothing and have a single-step special case in the lowest levels of
> {vmx,svm}_inject_event(). All of this is buggy, but in particular, task
> switches with the trace flag never end up signalling BT in %dr6.
>
> To begin resolving this issue, add a new pending_dbg field to x86_event
> (unioned with cr2 to avoid taking any extra space), and introduce
> {pv,hvm}_inject_debug_exn() helpers to replace the current callers using
> {pv,hvm}_inject_hw_exception().
>
> A key property is that pending_dbg is taken with positive polarity to deal
> with RTM sensibly. Most callers pass in a constant, but callers passing in a
> hardware %dr6 value need to xor the value with X86_DR6_DEFAULT to flip the
> polarity of RTM and reserved fields.
>
> For PV guests, move the ad-hoc updating of %dr6 into pv_inject_event(). This
> in principle breaks the handing of RTM in do_debug(), but PV guests can't
> actually enable MSR_DEBUGCTL.RTM yet, so this doesn't matter in practice.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> [ jinoh: Rebase onto staging, forward declare struct vcpu ]
> Signed-off-by: Jinoh Kang <jinoh.kang.kr@gmail.com>
> ---
> CC: Andrew Cooper <andrew.cooper3@citrix.com>
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Wei Liu <wl@xen.org>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Jun Nakajima <jun.nakajima@intel.com>
> CC: Kevin Tian <kevin.tian@intel.com>
> CC: Tim Deegan <tim@xen.org>
> CC: Tamas K Lengyel <tamas@tklengyel.com>
> CC: Alexandru Isaila <aisaila@bitdefender.com>
> CC: Petre Pircalabu <ppircalabu@bitdefender.com>
>
> v1 -> v2: [S-o-b fixes. More details below.]
>
> - Update DR6 for gdbsx when trapped in PV guest kernel mode
I take it that this refers to ...
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1989,17 +1989,17 @@ void do_debug(struct cpu_user_regs *regs)
> return;
> }
>
> - /* Save debug status register where guest OS can peek at it */
> - v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
> - v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
> -
> if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
> {
> + /* Save debug status register where gdbsx can peek at it */
> + v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
> + v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
> +
> domain_pause_for_debugger();
> return;
> }
... this code movement. I'm afraid this should have resulted in you
dropping the earlier R-b, and I'm further afraid I'm not convinced
this is correct, despite seeing why you would want to do this. The
issue is that this way you also alter guest-visible state, when the
intention is that such now happen via ...
> - pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
> + pv_inject_debug_exn(dr6 ^ X86_DR6_DEFAULT);
> }
... this call alone. I fear I can't currently see how to get both
aspects right, other than by breaking up
Jan
On 30/08/2023 2:30 pm, Jan Beulich wrote:
> On 24.08.2023 17:26, Jinoh Kang wrote:
>> From: Andrew Cooper <andrew.cooper3@citrix.com>
>>
>> All #DB exceptions result in an update of %dr6, but this isn't captured in
>> Xen's handling.
>>
>> PV guests generally work by modifying %dr6 before raising #DB, whereas HVM
>> guests do nothing and have a single-step special case in the lowest levels of
>> {vmx,svm}_inject_event(). All of this is buggy, but in particular, task
>> switches with the trace flag never end up signalling BT in %dr6.
>>
>> To begin resolving this issue, add a new pending_dbg field to x86_event
>> (unioned with cr2 to avoid taking any extra space), and introduce
>> {pv,hvm}_inject_debug_exn() helpers to replace the current callers using
>> {pv,hvm}_inject_hw_exception().
>>
>> A key property is that pending_dbg is taken with positive polarity to deal
>> with RTM sensibly. Most callers pass in a constant, but callers passing in a
>> hardware %dr6 value need to xor the value with X86_DR6_DEFAULT to flip the
>> polarity of RTM and reserved fields.
>>
>> For PV guests, move the ad-hoc updating of %dr6 into pv_inject_event(). This
>> in principle breaks the handing of RTM in do_debug(), but PV guests can't
>> actually enable MSR_DEBUGCTL.RTM yet, so this doesn't matter in practice.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>> [ jinoh: Rebase onto staging, forward declare struct vcpu ]
>> Signed-off-by: Jinoh Kang <jinoh.kang.kr@gmail.com>
>> ---
>> CC: Andrew Cooper <andrew.cooper3@citrix.com>
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Wei Liu <wl@xen.org>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Jun Nakajima <jun.nakajima@intel.com>
>> CC: Kevin Tian <kevin.tian@intel.com>
>> CC: Tim Deegan <tim@xen.org>
>> CC: Tamas K Lengyel <tamas@tklengyel.com>
>> CC: Alexandru Isaila <aisaila@bitdefender.com>
>> CC: Petre Pircalabu <ppircalabu@bitdefender.com>
>>
>> v1 -> v2: [S-o-b fixes. More details below.]
>>
>> - Update DR6 for gdbsx when trapped in PV guest kernel mode
> I take it that this refers to ...
>
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -1989,17 +1989,17 @@ void do_debug(struct cpu_user_regs *regs)
>> return;
>> }
>>
>> - /* Save debug status register where guest OS can peek at it */
>> - v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>> - v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
>> -
>> if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
>> {
>> + /* Save debug status register where gdbsx can peek at it */
>> + v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>> + v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
>> +
>> domain_pause_for_debugger();
>> return;
>> }
> ... this code movement. I'm afraid this should have resulted in you
> dropping the earlier R-b, and I'm further afraid I'm not convinced
> this is correct, despite seeing why you would want to do this. The
> issue is that this way you also alter guest-visible state, when the
> intention is that such now happen via ...
>
>> - pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
>> + pv_inject_debug_exn(dr6 ^ X86_DR6_DEFAULT);
>> }
> ... this call alone. I fear I can't currently see how to get both
> aspects right, other than by breaking up
I think it was wrongly broken up in my RFC series too.
With hindsight, I think the series wants rearranging to introduce
x86_merge_dr6() first, and then fix up PV and HVM separately. They're
independent logic paths.
~Andrew
© 2016 - 2026 Red Hat, Inc.