[PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash

Juergen Gross posted 1 patch 2 years ago
Test gitlab-ci passed
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20220419101850.3045-1-jgross@suse.com
xen/arch/x86/debug.c                | 10 ++--------
xen/arch/x86/domctl.c               |  6 +++---
xen/arch/x86/include/asm/debugger.h |  2 +-
xen/common/domctl.c                 |  1 -
4 files changed, 6 insertions(+), 13 deletions(-)
[PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Juergen Gross 2 years ago
A hypervisor built without CONFIG_GDBSX will crash in case the
XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
end up in iommu_do_domctl() with d == NULL:

(XEN) CPU:    6
(XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
(XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
(XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx: ffff830856277fff
...
(XEN) Xen call trace:
(XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
(XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
(XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
(XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
(XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
(XEN)    [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
(XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
(XEN)
(XEN) Pagetable walk from 0000000000000144:
(XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 6:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0000]
(XEN) Faulting linear address: 0000000000000144

Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
take the already known domain pointer instead of the domid.

Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
Fixes: e726a82ca0dc ("xen: make gdbsx support configurable")
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- use gdbsx_guest_mem_io() interface modification (Jan Beulich)
---
 xen/arch/x86/debug.c                | 10 ++--------
 xen/arch/x86/domctl.c               |  6 +++---
 xen/arch/x86/include/asm/debugger.h |  2 +-
 xen/common/domctl.c                 |  1 -
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
index d90dc93056..c0dd6eaf15 100644
--- a/xen/arch/x86/debug.c
+++ b/xen/arch/x86/debug.c
@@ -159,17 +159,11 @@ static unsigned int dbg_rw_guest_mem(struct domain *dp, unsigned long addr,
  * Returns: number of bytes remaining to be copied.
  */
 unsigned int dbg_rw_mem(unsigned long gva, XEN_GUEST_HANDLE_PARAM(void) buf,
-                        unsigned int len, domid_t domid, bool toaddr,
+                        unsigned int len, struct domain *d, bool toaddr,
                         uint64_t pgd3)
 {
-    struct domain *d = rcu_lock_domain_by_id(domid);
-
-    if ( d )
-    {
-        if ( !d->is_dying )
+    if ( d && !d->is_dying )
             len = dbg_rw_guest_mem(d, gva, buf, len, toaddr, pgd3);
-        rcu_unlock_domain(d);
-    }
 
     return len;
 }
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index e49f9e91b9..a6aae500a3 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -38,10 +38,10 @@
 #include <asm/cpuid.h>
 
 #ifdef CONFIG_GDBSX
-static int gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio *iop)
+static int gdbsx_guest_mem_io(struct domain *d, struct xen_domctl_gdbsx_memio *iop)
 {
     iop->remain = dbg_rw_mem(iop->gva, guest_handle_from_ptr(iop->uva, void),
-                             iop->len, domid, iop->gwr, iop->pgd3val);
+                             iop->len, d, iop->gwr, iop->pgd3val);
 
     return iop->remain ? -EFAULT : 0;
 }
@@ -828,7 +828,7 @@ long arch_do_domctl(
 #ifdef CONFIG_GDBSX
     case XEN_DOMCTL_gdbsx_guestmemio:
         domctl->u.gdbsx_guest_memio.remain = domctl->u.gdbsx_guest_memio.len;
-        ret = gdbsx_guest_mem_io(domctl->domain, &domctl->u.gdbsx_guest_memio);
+        ret = gdbsx_guest_mem_io(d, &domctl->u.gdbsx_guest_memio);
         if ( !ret )
            copyback = true;
         break;
diff --git a/xen/arch/x86/include/asm/debugger.h b/xen/arch/x86/include/asm/debugger.h
index 99803bfd0c..221bcde137 100644
--- a/xen/arch/x86/include/asm/debugger.h
+++ b/xen/arch/x86/include/asm/debugger.h
@@ -94,7 +94,7 @@ static inline bool debugger_trap_entry(
 
 #ifdef CONFIG_GDBSX
 unsigned int dbg_rw_mem(unsigned long gva, XEN_GUEST_HANDLE_PARAM(void) buf,
-                        unsigned int len, domid_t domid, bool toaddr,
+                        unsigned int len, struct domain *d, bool toaddr,
                         uint64_t pgd3);
 #endif
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 57135d4478..5879117580 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -308,7 +308,6 @@ long cf_check do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
         if ( op->domain == DOMID_INVALID )
         {
     case XEN_DOMCTL_createdomain:
-    case XEN_DOMCTL_gdbsx_guestmemio:
             d = NULL;
             break;
         }
-- 
2.34.1
Re: [PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Andrew Cooper 2 years ago
On 19/04/2022 11:18, Juergen Gross wrote:
> A hypervisor built without CONFIG_GDBSX will crash in case the
> XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
> end up in iommu_do_domctl() with d == NULL:
>
> (XEN) CPU:    6
> (XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
> (XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
> (XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx: ffff830856277fff
> ...
> (XEN) Xen call trace:
> (XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
> (XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
> (XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
> (XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
> (XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
> (XEN)    [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
> (XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
> (XEN)
> (XEN) Pagetable walk from 0000000000000144:
> (XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
> (XEN)
> (XEN) ****************************************
> (XEN) Panic on CPU 6:
> (XEN) FATAL PAGE FAULT
> (XEN) [error_code=0000]
> (XEN) Faulting linear address: 0000000000000144
>
> Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
> take the already known domain pointer instead of the domid.

There is some explanation missing here.  The adjustments you make are
within CONFIG_GDBSX, with the exception of the final hunk.


The actual bug is that non-IOMMU subops end up in iommu_do_domctl(), so
while this is good cleanup to gdbsx_guest_mem_io() it, along with Jan's
adjustment to iommu_do_domctl(), are not suitable fixes to the crash as
reported.

So someone's going to have to write a 3rd patch which actually fixes the
root of the crash, and this will become cleanup.

~Andrew

> diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
> index d90dc93056..c0dd6eaf15 100644
> --- a/xen/arch/x86/debug.c
> +++ b/xen/arch/x86/debug.c
> @@ -159,17 +159,11 @@ static unsigned int dbg_rw_guest_mem(struct domain *dp, unsigned long addr,
>   * Returns: number of bytes remaining to be copied.
>   */
>  unsigned int dbg_rw_mem(unsigned long gva, XEN_GUEST_HANDLE_PARAM(void) buf,
> -                        unsigned int len, domid_t domid, bool toaddr,
> +                        unsigned int len, struct domain *d, bool toaddr,
>                          uint64_t pgd3)
>  {
> -    struct domain *d = rcu_lock_domain_by_id(domid);
> -
> -    if ( d )
> -    {
> -        if ( !d->is_dying )
> +    if ( d && !d->is_dying )
>              len = dbg_rw_guest_mem(d, gva, buf, len, toaddr, pgd3);

This needs re-indenting.

Re: [PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Jan Beulich 2 years ago
On 19.04.2022 12:40, Andrew Cooper wrote:
> On 19/04/2022 11:18, Juergen Gross wrote:
>> A hypervisor built without CONFIG_GDBSX will crash in case the
>> XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
>> end up in iommu_do_domctl() with d == NULL:
>>
>> (XEN) CPU:    6
>> (XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
>> (XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
>> (XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx: ffff830856277fff
>> ...
>> (XEN) Xen call trace:
>> (XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
>> (XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
>> (XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
>> (XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
>> (XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
>> (XEN)    [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
>> (XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
>> (XEN)
>> (XEN) Pagetable walk from 0000000000000144:
>> (XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
>> (XEN)
>> (XEN) ****************************************
>> (XEN) Panic on CPU 6:
>> (XEN) FATAL PAGE FAULT
>> (XEN) [error_code=0000]
>> (XEN) Faulting linear address: 0000000000000144
>>
>> Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
>> take the already known domain pointer instead of the domid.
> 
> There is some explanation missing here.  The adjustments you make are
> within CONFIG_GDBSX, with the exception of the final hunk.
> 
> 
> The actual bug is that non-IOMMU subops end up in iommu_do_domctl(), so
> while this is good cleanup to gdbsx_guest_mem_io() it, along with Jan's
> adjustment to iommu_do_domctl(), are not suitable fixes to the crash as
> reported.

Whether non-IOMMU subops ending up in iommu_do_domctl() is a bug is a
matter of the position you take: Considering how we chain common -> arch
-> IOMMU domctls, this can also be viewed as intentional, with further
chaining going to be added anywhere in this set. The downside of your
approach (which otherwise I think would have been the way to go already
when the IOMMU domctls gained their own function) is that at least one
higher layer will need to know which specific sub-ops the function is
going to handle. If that was acceptable, I'd then question whether the
top layer shouldn't also know which sub-ops the per-arch functions are
going to handle.

Jan
Re: [PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Juergen Gross 2 years ago
On 19.04.22 12:40, Andrew Cooper wrote:
> On 19/04/2022 11:18, Juergen Gross wrote:
>> A hypervisor built without CONFIG_GDBSX will crash in case the
>> XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
>> end up in iommu_do_domctl() with d == NULL:
>>
>> (XEN) CPU:    6
>> (XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
>> (XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
>> (XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx: ffff830856277fff
>> ...
>> (XEN) Xen call trace:
>> (XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
>> (XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
>> (XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
>> (XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
>> (XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
>> (XEN)    [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
>> (XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
>> (XEN)
>> (XEN) Pagetable walk from 0000000000000144:
>> (XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
>> (XEN)
>> (XEN) ****************************************
>> (XEN) Panic on CPU 6:
>> (XEN) FATAL PAGE FAULT
>> (XEN) [error_code=0000]
>> (XEN) Faulting linear address: 0000000000000144
>>
>> Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
>> take the already known domain pointer instead of the domid.
> 
> There is some explanation missing here.  The adjustments you make are
> within CONFIG_GDBSX, with the exception of the final hunk.

Yeah, and this is the one really fixing the issue, while the
other hunks are needed to cope with the way the problem is
fixed.

> The actual bug is that non-IOMMU subops end up in iommu_do_domctl(), so
> while this is good cleanup to gdbsx_guest_mem_io() it, along with Jan's
> adjustment to iommu_do_domctl(), are not suitable fixes to the crash as
> reported.

The same way non-arch subops might end up in arch_do_domctl().

What would be the right way to fix that in your opinion?

IMO any subop handler called under the default label of a switch() should
be able to handle unknown subops. This is done for iommu_do_domctl() in
Jan's patch by not dereferencing d unconditionally.

My patch is fixing the original patch referred to in the Fixes: tag.
V1 was another way to fix that, but V2 is IMO the better variant, as it
is even simplifying the code.


Juergen
Re: [PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Andrew Cooper 2 years ago
On 19/04/2022 11:51, Juergen Gross wrote:
> On 19.04.22 12:40, Andrew Cooper wrote:
>> On 19/04/2022 11:18, Juergen Gross wrote:
>>> A hypervisor built without CONFIG_GDBSX will crash in case the
>>> XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
>>> end up in iommu_do_domctl() with d == NULL:
>>>
>>> (XEN) CPU:    6
>>> (XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
>>> (XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
>>> (XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx:
>>> ffff830856277fff
>>> ...
>>> (XEN) Xen call trace:
>>> (XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
>>> (XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
>>> (XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
>>> (XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
>>> (XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
>>> (XEN)    [<ffff82d0402f5161>] S
>>> arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
>>> (XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
>>> (XEN)
>>> (XEN) Pagetable walk from 0000000000000144:
>>> (XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
>>> (XEN)
>>> (XEN) ****************************************
>>> (XEN) Panic on CPU 6:
>>> (XEN) FATAL PAGE FAULT
>>> (XEN) [error_code=0000]
>>> (XEN) Faulting linear address: 0000000000000144
>>>
>>> Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
>>> take the already known domain pointer instead of the domid.
>>
>> There is some explanation missing here.  The adjustments you make are
>> within CONFIG_GDBSX, with the exception of the final hunk.
>
> Yeah, and this is the one really fixing the issue, while the
> other hunks are needed to cope with the way the problem is
> fixed.

In which case the salient point needed in the commit message is "reject
the use of XEN_DOMCTL_gdbsx_guestmemio without a valid domain".

I'd go so far as to say that that ought to be a oneliner fix, which also
discusses why it's safe now (it didn't used to be IIRC), and the cleanup
in a separate patch.

This also reminds me that there's a pile of almost complete series of
debugger/gdbsx/traps cleanup in need of pushing over the line.

>
>> The actual bug is that non-IOMMU subops end up in iommu_do_domctl(), so
>> while this is good cleanup to gdbsx_guest_mem_io() it, along with Jan's
>> adjustment to iommu_do_domctl(), are not suitable fixes to the crash as
>> reported.
>
> The same way non-arch subops might end up in arch_do_domctl().
>
> What would be the right way to fix that in your opinion?
>
> IMO any subop handler called under the default label of a switch() should
> be able to handle unknown subops. This is done for iommu_do_domctl() in
> Jan's patch by not dereferencing d unconditionally.

The problem isn't (specifically) how they're chained; it's the name. 
arch_do_domctl() is clearly a dumping ground for arbitrary subops. 
iommu_do_domctl() is clearly not.

The bug was ultimately chaining iommu_do_domctl() in a default, which is
why it *was* reasonable for "use is_iommu_enabled() where
appropriate..." to assume that only iommu subops would reach the function.

iommu_do_domctl() either needs re-chaining under several case xxx:, or
it needs renaming to something generic like arch_do_domctl2().

Nothing else is going to leave the code in a position where it's harder
to make mistakes like this.

~Andrew
Re: [PATCH v2] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
Posted by Jan Beulich 2 years ago
On 19.04.2022 12:18, Juergen Gross wrote:
> A hypervisor built without CONFIG_GDBSX will crash in case the
> XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
> end up in iommu_do_domctl() with d == NULL:
> 
> (XEN) CPU:    6
> (XEN) RIP:    e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
> (XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d0v0)
> (XEN) rax: 00000000000003e8   rbx: ffff830856277ef8   rcx: ffff830856277fff
> ...
> (XEN) Xen call trace:
> (XEN)    [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
> (XEN)    [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
> (XEN)    [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
> (XEN)    [<ffff82d040238ff0>] S do_domctl+0/0x1930
> (XEN)    [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
> (XEN)    [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
> (XEN)    [<ffff82d040366288>] S lstar_enter+0x128/0x130
> (XEN)
> (XEN) Pagetable walk from 0000000000000144:
> (XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
> (XEN)
> (XEN) ****************************************
> (XEN) Panic on CPU 6:
> (XEN) FATAL PAGE FAULT
> (XEN) [error_code=0000]
> (XEN) Faulting linear address: 0000000000000144
> 
> Fix this issue by modifying the interface of gdbsx_guest_mem_io() to
> take the already known domain pointer instead of the domid.
> 
> Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
> Fixes: e726a82ca0dc ("xen: make gdbsx support configurable")
> Suggested-by: Jan Beulich <jbeulich@suse.com>
> Signed-off-by: Juergen Gross <jgross@suse.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
with one nit (which can be taken care of while committing):

> --- a/xen/arch/x86/debug.c
> +++ b/xen/arch/x86/debug.c
> @@ -159,17 +159,11 @@ static unsigned int dbg_rw_guest_mem(struct domain *dp, unsigned long addr,
>   * Returns: number of bytes remaining to be copied.
>   */
>  unsigned int dbg_rw_mem(unsigned long gva, XEN_GUEST_HANDLE_PARAM(void) buf,
> -                        unsigned int len, domid_t domid, bool toaddr,
> +                        unsigned int len, struct domain *d, bool toaddr,
>                          uint64_t pgd3)
>  {
> -    struct domain *d = rcu_lock_domain_by_id(domid);
> -
> -    if ( d )
> -    {
> -        if ( !d->is_dying )
> +    if ( d && !d->is_dying )
>              len = dbg_rw_guest_mem(d, gva, buf, len, toaddr, pgd3);

This line now wants its indentation adjusted.

Jan