[v2] vpci/msix: handle accesses adjacent to the MSI-X table

[PATCH v2] vpci/msix: handle accesses adjacent to the MSI-X table

Posted by Roger Pau Monne 2 years, 10 months ago

The handling of the MSI-X table accesses by Xen requires that any
pages part of the MSI-X related tables are not mapped into the domain
physmap.  As a result, any device registers in the same pages as the
start or the end of the MSIX or PBA tables are not currently
accessible, as the accesses are just dropped.

Note the spec forbids such placement of registers, as the MSIX and PBA
tables must be 4K isolated from any other registers:

"If a Base Address register that maps address space for the MSI-X
Table or MSI-X PBA also maps other usable address space that is not
associated with MSI-X structures, locations (e.g., for CSRs) used in
the other address space must not share any naturally aligned 4-KB
address range with one where either MSI-X structure resides."

Yet the 'Intel Wi-Fi 6 AX201' device on one of my boxes has registers
in the same page as the MSIX tables, and thus won't work on a PVH dom0
without this fix.

In order to cope with this behavior, pass through to the underlying
hardware any accesses that fall on the same page as the MSIX tables
(but don't fall in between).  Such forwarding also takes care of the
PBA accesses, so it allows removing the code doing this handling in
msix_{read,write}.  Note that as a result accesses to the PBA array
are no longer limited to 4 and 8 byte sizes: there's no access size
restriction for PBA accesses documented in the specification.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
 - Properly handle the PBA also.
 - Merge the handlers for adjacent writes into the existing MSIX table
   ones.
---
 xen/drivers/vpci/msix.c | 329 +++++++++++++++++++++++++++++-----------
 xen/drivers/vpci/vpci.c |   7 +-
 xen/include/xen/vpci.h  |   8 +-
 3 files changed, 255 insertions(+), 89 deletions(-)

diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
index bea0cc7aed..060b74d62b 100644
--- a/xen/drivers/vpci/msix.c
+++ b/xen/drivers/vpci/msix.c
@@ -27,6 +27,11 @@
     ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
      (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
 
+#define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr)                              \
+    (PFN_DOWN(addr) >= PFN_DOWN(vmsix_table_addr(vpci, nr)) &&            \
+     PFN_DOWN((addr)) < PFN_UP(vmsix_table_addr(vpci, nr) +               \
+                               vmsix_table_size(vpci, nr) - 1))
+
 static uint32_t cf_check control_read(
     const struct pci_dev *pdev, unsigned int reg, void *data)
 {
@@ -149,7 +154,7 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
 
         for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
             if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
-                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
+                 VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
                 return msix;
     }
 
@@ -182,93 +187,201 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
     return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
 }
 
-static void __iomem *get_pba(struct vpci *vpci)
+static void __iomem *get_table(struct vpci *vpci, unsigned int slot)
 {
     struct vpci_msix *msix = vpci->msix;
     /*
-     * PBA will only be unmapped when the device is deassigned, so access it
-     * without holding the vpci lock.
+     * Regions will only be unmapped when the device is deassigned, so access
+     * them without holding the vpci lock.
      */
-    void __iomem *pba = read_atomic(&msix->pba);
+    void __iomem *table = read_atomic(&msix->table[slot]);
+    paddr_t addr = 0;
 
-    if ( likely(pba) )
-        return pba;
+    if ( likely(table) )
+        return table;
 
-    pba = ioremap(vmsix_table_addr(vpci, VPCI_MSIX_PBA),
-                  vmsix_table_size(vpci, VPCI_MSIX_PBA));
-    if ( !pba )
-        return read_atomic(&msix->pba);
+    switch ( slot )
+    {
+    case VPCI_MSIX_TBL_TAIL:
+        addr = vmsix_table_size(vpci, VPCI_MSIX_TABLE);
+        fallthrough;
+    case VPCI_MSIX_TBL_HEAD:
+        addr += vmsix_table_addr(vpci, VPCI_MSIX_TABLE);
+        break;
+
+    case VPCI_MSIX_PBA_TAIL:
+        addr = vmsix_table_size(vpci, VPCI_MSIX_PBA);
+        fallthrough;
+    case VPCI_MSIX_PBA_HEAD:
+        addr += vmsix_table_addr(vpci, VPCI_MSIX_PBA);
+        break;
+    }
+
+    table = ioremap(round_pgdown(addr), PAGE_SIZE);
+    if ( !table )
+        return read_atomic(&msix->table[slot]);
 
     spin_lock(&vpci->lock);
-    if ( !msix->pba )
+    if ( !msix->table[slot] )
     {
-        write_atomic(&msix->pba, pba);
+        write_atomic(&msix->table[slot], table);
         spin_unlock(&vpci->lock);
     }
     else
     {
         spin_unlock(&vpci->lock);
-        iounmap(pba);
+        iounmap(table);
     }
 
-    return read_atomic(&msix->pba);
+    return read_atomic(&msix->table[slot]);
 }
 
-static int cf_check msix_read(
-    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
+unsigned int get_slot(const struct vpci *vpci, unsigned long addr)
 {
-    const struct domain *d = v->domain;
-    struct vpci_msix *msix = msix_find(d, addr);
-    const struct vpci_msix_entry *entry;
-    unsigned int offset;
+    unsigned long pfn = PFN_DOWN(addr);
 
-    *data = ~0ul;
+    /*
+     * The logic below relies on having the tables identity mapped to the guest
+     * address space, or for the `addr` parameter to be translated into its
+     * host physical memory address equivalent.
+     */
 
-    if ( !msix )
-        return X86EMUL_RETRY;
+    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE)) )
+        return VPCI_MSIX_TBL_HEAD;
+    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE) +
+                         vmsix_table_size(vpci, VPCI_MSIX_TABLE) - 1) )
+        return VPCI_MSIX_TBL_TAIL;
+    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA)) )
+        return VPCI_MSIX_PBA_HEAD;
+    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA) +
+                         vmsix_table_size(vpci, VPCI_MSIX_PBA) - 1) )
+        return VPCI_MSIX_PBA_TAIL;
+
+    ASSERT_UNREACHABLE();
+    return -1;
+}
 
-    if ( !access_allowed(msix->pdev, addr, len) )
-        return X86EMUL_OKAY;
+static bool adjacent_handle(const struct vpci_msix *msix, unsigned long addr)
+{
+    unsigned int i;
 
     if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
+        return true;
+
+    if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_TABLE) )
+        return false;
+
+    for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
+        if ( VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
+            return true;
+
+    return false;
+}
+
+static int adjacent_read(const struct domain *d, const struct vpci_msix *msix,
+                         unsigned long addr, unsigned int len,
+                         unsigned long *data)
+{
+    const void __iomem *mem;
+    struct vpci *vpci = msix->pdev->vpci;
+    unsigned int slot;
+
+    *data = ~0ul;
+
+    if ( !adjacent_handle(msix, addr + len - 1) )
+        return X86EMUL_OKAY;
+
+    if ( addr & (len - 1) )
     {
-        struct vpci *vpci = msix->pdev->vpci;
-        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
-        const void __iomem *pba = get_pba(vpci);
+        unsigned int i;
+
+        gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n",
+                &msix->pdev->sbdf);
 
         /*
-         * Access to PBA.
+         * Split unaligned accesses into byte sized ones. Shouldn't happen in
+         * the first place, but devices shouldn't have registers in the same 4K
+         * page as the MSIX tables either.
          *
-         * TODO: note that this relies on having the PBA identity mapped to the
-         * guest address space. If this changes the address will need to be
-         * translated.
+         * It's unclear whether this could cause issues if a guest expects a
+         * registers to be accessed atomically, it better use an aligned access
+         * if it has such expectations.
          */
-        if ( !pba )
-        {
-            gprintk(XENLOG_WARNING,
-                    "%pp: unable to map MSI-X PBA, report all pending\n",
-                    &msix->pdev->sbdf);
-            return X86EMUL_OKAY;
-        }
 
-        switch ( len )
+        for ( i = 0; i < len; i++ )
         {
-        case 4:
-            *data = readl(pba + idx);
-            break;
+            unsigned long partial = ~0ul;
+            int rc = adjacent_read(d, msix, addr + i, 1, &partial);
 
-        case 8:
-            *data = readq(pba + idx);
-            break;
+            if ( rc != X86EMUL_OKAY )
+                return rc;
 
-        default:
-            ASSERT_UNREACHABLE();
-            break;
+            *data &= ~(0xfful << (i * 8));
+            *data |= ((uint8_t)partial) << (i * 8);
         }
 
         return X86EMUL_OKAY;
     }
 
+    slot = get_slot(vpci, addr);
+    if ( slot >= ARRAY_SIZE(msix->table) )
+        return X86EMUL_OKAY;
+
+    mem = get_table(vpci, slot);
+    if ( !mem )
+    {
+        gprintk(XENLOG_WARNING,
+                "%pp: unable to map MSI-X page, returning all bits set\n",
+                &msix->pdev->sbdf);
+        return X86EMUL_OKAY;
+    }
+
+    switch ( len )
+    {
+    case 1:
+        *data = readb(mem + PAGE_OFFSET(addr));
+        break;
+
+    case 2:
+        *data = readw(mem + PAGE_OFFSET(addr));
+        break;
+
+    case 4:
+        *data = readl(mem + PAGE_OFFSET(addr));
+        break;
+
+    case 8:
+        *data = readq(mem + PAGE_OFFSET(addr));
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int cf_check msix_read(
+    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
+{
+    const struct domain *d = v->domain;
+    struct vpci_msix *msix = msix_find(d, addr);
+    const struct vpci_msix_entry *entry;
+    struct vpci *vpci;
+    unsigned int offset;
+
+    *data = ~0ul;
+
+    if ( !msix )
+        return X86EMUL_RETRY;
+
+    if ( adjacent_handle(msix, addr) )
+        return adjacent_read(d, msix, addr, len, data);
+
+    vpci = msix->pdev->vpci;
+    if ( !access_allowed(msix->pdev, addr, len) )
+        return X86EMUL_OKAY;
+
     spin_lock(&msix->pdev->vpci->lock);
     entry = get_entry(msix, addr);
     offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
@@ -303,57 +416,103 @@ static int cf_check msix_read(
     return X86EMUL_OKAY;
 }
 
-static int cf_check msix_write(
-    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
+static int adjacent_write(const struct domain *d, const struct vpci_msix *msix,
+                          unsigned long addr, unsigned int len,
+                          unsigned long data)
 {
-    const struct domain *d = v->domain;
-    struct vpci_msix *msix = msix_find(d, addr);
-    struct vpci_msix_entry *entry;
-    unsigned int offset;
+    void __iomem *mem;
+    struct vpci *vpci = msix->pdev->vpci;
+    unsigned int slot;
 
-    if ( !msix )
-        return X86EMUL_RETRY;
+    if ( !adjacent_handle(msix, addr + len - 1) )
+        return X86EMUL_OKAY;
 
-    if ( !access_allowed(msix->pdev, addr, len) )
+    /*
+     * Only check start and end of the access because the size of the PBA is
+     * assumed to be equal or bigger (8 bytes) than the length of any access
+     * handled here.
+     */
+    if ( (VMSIX_ADDR_IN_RANGE(addr, vpci, VPCI_MSIX_PBA) ||
+          VMSIX_ADDR_IN_RANGE(addr + len - 1, vpci, VPCI_MSIX_PBA)) &&
+         !is_hardware_domain(d) )
+        /* Ignore writes to PBA for DomUs, it's undefined behavior. */
         return X86EMUL_OKAY;
 
-    if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
+    if ( addr & (len - 1) )
     {
-        struct vpci *vpci = msix->pdev->vpci;
-        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
-        const void __iomem *pba = get_pba(vpci);
+        unsigned int i;
 
-        if ( !is_hardware_domain(d) )
-            /* Ignore writes to PBA for DomUs, it's behavior is undefined. */
-            return X86EMUL_OKAY;
+        gprintk(XENLOG_DEBUG, "%pp: unaligned write to MSI-X related page\n",
+                &msix->pdev->sbdf);
 
-        if ( !pba )
+        for ( i = 0; i < len; i++ )
         {
-            /* Unable to map the PBA, ignore write. */
-            gprintk(XENLOG_WARNING,
-                    "%pp: unable to map MSI-X PBA, write ignored\n",
-                    &msix->pdev->sbdf);
-            return X86EMUL_OKAY;
-        }
+            int rc = adjacent_write(d, msix, addr + i, 1, data >> (i * 8));
 
-        switch ( len )
-        {
-        case 4:
-            writel(data, pba + idx);
-            break;
+            if ( rc != X86EMUL_OKAY )
+                return rc;
+        }
 
-        case 8:
-            writeq(data, pba + idx);
-            break;
+        return X86EMUL_OKAY;
+    }
 
-        default:
-            ASSERT_UNREACHABLE();
-            break;
-        }
+    slot = get_slot(vpci, addr);
+    if ( slot >= ARRAY_SIZE(msix->table) )
+        return X86EMUL_OKAY;
 
+    mem = get_table(vpci, slot);
+    if ( !mem )
+    {
+        gprintk(XENLOG_WARNING,
+                "%pp: unable to map MSI-X page, dropping write\n",
+                &msix->pdev->sbdf);
         return X86EMUL_OKAY;
     }
 
+    switch ( len )
+    {
+    case 1:
+        writeb(data, mem + PAGE_OFFSET(addr));
+        break;
+
+    case 2:
+        writew(data, mem + PAGE_OFFSET(addr));
+        break;
+
+    case 4:
+        writel(data, mem + PAGE_OFFSET(addr));
+        break;
+
+    case 8:
+        writeq(data, mem + PAGE_OFFSET(addr));
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int cf_check msix_write(
+    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
+{
+    const struct domain *d = v->domain;
+    struct vpci_msix *msix = msix_find(d, addr);
+    struct vpci_msix_entry *entry;
+    struct vpci *vpci;
+    unsigned int offset;
+
+    if ( !msix )
+        return X86EMUL_RETRY;
+
+    if ( adjacent_handle(msix, addr) )
+        return adjacent_write(d, msix, addr, len, data);
+
+    vpci = msix->pdev->vpci;
+    if ( !access_allowed(msix->pdev, addr, len) )
+        return X86EMUL_OKAY;
+
     spin_lock(&msix->pdev->vpci->lock);
     entry = get_entry(msix, addr);
     offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 6d48d496bb..652807a4a4 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -54,9 +54,12 @@ void vpci_remove_device(struct pci_dev *pdev)
     spin_unlock(&pdev->vpci->lock);
     if ( pdev->vpci->msix )
     {
+        unsigned int i;
+
         list_del(&pdev->vpci->msix->next);
-        if ( pdev->vpci->msix->pba )
-            iounmap(pdev->vpci->msix->pba);
+        for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->table); i++ )
+            if ( pdev->vpci->msix->table[i] )
+                iounmap(pdev->vpci->msix->table[i]);
     }
     xfree(pdev->vpci->msix);
     xfree(pdev->vpci->msi);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index d8acfeba8a..0b8a2a3c74 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -133,8 +133,12 @@ struct vpci {
         bool enabled         : 1;
         /* Masked? */
         bool masked          : 1;
-        /* PBA map */
-        void __iomem *pba;
+        /* Partial table map. */
+#define VPCI_MSIX_TBL_HEAD 0
+#define VPCI_MSIX_TBL_TAIL 1
+#define VPCI_MSIX_PBA_HEAD 2
+#define VPCI_MSIX_PBA_TAIL 3
+        void __iomem *table[4];
         /* Entries. */
         struct vpci_msix_entry {
             uint64_t addr;
-- 
2.39.0

Re: [PATCH v2] vpci/msix: handle accesses adjacent to the MSI-X table

Posted by Jan Beulich 2 years, 10 months ago

On 16.03.2023 13:07, Roger Pau Monne wrote:
> --- a/xen/drivers/vpci/msix.c
> +++ b/xen/drivers/vpci/msix.c
> @@ -27,6 +27,11 @@
>      ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
>       (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
>  
> +#define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr)                              \
> +    (PFN_DOWN(addr) >= PFN_DOWN(vmsix_table_addr(vpci, nr)) &&            \
> +     PFN_DOWN((addr)) < PFN_UP(vmsix_table_addr(vpci, nr) +               \
> +                               vmsix_table_size(vpci, nr) - 1))

Looks like this would be better in line with get_slot() (and slightly
cheaper) if the 2nd comparison was ... <= PFN_DOWN().

> @@ -149,7 +154,7 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
>  
>          for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
>              if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
> -                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
> +                 VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
>                  return msix;
>      }
>  
> @@ -182,93 +187,201 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
>      return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
>  }
>  
> -static void __iomem *get_pba(struct vpci *vpci)
> +static void __iomem *get_table(struct vpci *vpci, unsigned int slot)
>  {
>      struct vpci_msix *msix = vpci->msix;
>      /*
> -     * PBA will only be unmapped when the device is deassigned, so access it
> -     * without holding the vpci lock.
> +     * Regions will only be unmapped when the device is deassigned, so access
> +     * them without holding the vpci lock.
>       */
> -    void __iomem *pba = read_atomic(&msix->pba);
> +    void __iomem *table = read_atomic(&msix->table[slot]);
> +    paddr_t addr = 0;
>  
> -    if ( likely(pba) )
> -        return pba;
> +    if ( likely(table) )
> +        return table;
>  
> -    pba = ioremap(vmsix_table_addr(vpci, VPCI_MSIX_PBA),
> -                  vmsix_table_size(vpci, VPCI_MSIX_PBA));
> -    if ( !pba )
> -        return read_atomic(&msix->pba);
> +    switch ( slot )
> +    {
> +    case VPCI_MSIX_TBL_TAIL:
> +        addr = vmsix_table_size(vpci, VPCI_MSIX_TABLE);
> +        fallthrough;
> +    case VPCI_MSIX_TBL_HEAD:
> +        addr += vmsix_table_addr(vpci, VPCI_MSIX_TABLE);
> +        break;
> +
> +    case VPCI_MSIX_PBA_TAIL:
> +        addr = vmsix_table_size(vpci, VPCI_MSIX_PBA);
> +        fallthrough;
> +    case VPCI_MSIX_PBA_HEAD:
> +        addr += vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> +        break;

Hmm, wasn't the plan to stop special-casing the PBA, including its
special treatment wrt the p2m? Reading on I realize you need this for
the (future) DomU case (to enforce r/o-ness, albeit having looked at
the spec again the other day I'm not really convinced anymore we
really need to squash writes), but we should be able to avoid the
extra overhead for Dom0? (Granted it may make sense to leave this to
a separate patch, if we want to keep the DomU handling despite not
presently needing it.)

> +    }
> +
> +    table = ioremap(round_pgdown(addr), PAGE_SIZE);
> +    if ( !table )
> +        return read_atomic(&msix->table[slot]);
>  
>      spin_lock(&vpci->lock);
> -    if ( !msix->pba )
> +    if ( !msix->table[slot] )
>      {
> -        write_atomic(&msix->pba, pba);
> +        write_atomic(&msix->table[slot], table);
>          spin_unlock(&vpci->lock);
>      }
>      else
>      {
>          spin_unlock(&vpci->lock);
> -        iounmap(pba);
> +        iounmap(table);
>      }
>  
> -    return read_atomic(&msix->pba);
> +    return read_atomic(&msix->table[slot]);
>  }
>  
> -static int cf_check msix_read(
> -    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
> +unsigned int get_slot(const struct vpci *vpci, unsigned long addr)
>  {
> -    const struct domain *d = v->domain;
> -    struct vpci_msix *msix = msix_find(d, addr);
> -    const struct vpci_msix_entry *entry;
> -    unsigned int offset;
> +    unsigned long pfn = PFN_DOWN(addr);
>  
> -    *data = ~0ul;
> +    /*
> +     * The logic below relies on having the tables identity mapped to the guest
> +     * address space, or for the `addr` parameter to be translated into its
> +     * host physical memory address equivalent.
> +     */
>  
> -    if ( !msix )
> -        return X86EMUL_RETRY;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE)) )
> +        return VPCI_MSIX_TBL_HEAD;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE) +
> +                         vmsix_table_size(vpci, VPCI_MSIX_TABLE) - 1) )
> +        return VPCI_MSIX_TBL_TAIL;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA)) )
> +        return VPCI_MSIX_PBA_HEAD;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA) +
> +                         vmsix_table_size(vpci, VPCI_MSIX_PBA) - 1) )
> +        return VPCI_MSIX_PBA_TAIL;
> +
> +    ASSERT_UNREACHABLE();
> +    return -1;
> +}
>  
> -    if ( !access_allowed(msix->pdev, addr, len) )
> -        return X86EMUL_OKAY;
> +static bool adjacent_handle(const struct vpci_msix *msix, unsigned long addr)
> +{
> +    unsigned int i;
>  
>      if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
> +        return true;
> +
> +    if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_TABLE) )
> +        return false;
> +
> +    for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
> +        if ( VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
> +            return true;
> +
> +    return false;
> +}
> +
> +static int adjacent_read(const struct domain *d, const struct vpci_msix *msix,
> +                         unsigned long addr, unsigned int len,
> +                         unsigned long *data)
> +{
> +    const void __iomem *mem;
> +    struct vpci *vpci = msix->pdev->vpci;
> +    unsigned int slot;
> +
> +    *data = ~0ul;
> +
> +    if ( !adjacent_handle(msix, addr + len - 1) )
> +        return X86EMUL_OKAY;
> +
> +    if ( addr & (len - 1) )

unlikely()?

>      {
> -        struct vpci *vpci = msix->pdev->vpci;
> -        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> -        const void __iomem *pba = get_pba(vpci);
> +        unsigned int i;
> +
> +        gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n",
> +                &msix->pdev->sbdf);
>  
>          /*
> -         * Access to PBA.
> +         * Split unaligned accesses into byte sized ones. Shouldn't happen in
> +         * the first place, but devices shouldn't have registers in the same 4K
> +         * page as the MSIX tables either.
>           *
> -         * TODO: note that this relies on having the PBA identity mapped to the
> -         * guest address space. If this changes the address will need to be
> -         * translated.
> +         * It's unclear whether this could cause issues if a guest expects a
> +         * registers to be accessed atomically, it better use an aligned access
> +         * if it has such expectations.
>           */
> -        if ( !pba )
> -        {
> -            gprintk(XENLOG_WARNING,
> -                    "%pp: unable to map MSI-X PBA, report all pending\n",
> -                    &msix->pdev->sbdf);
> -            return X86EMUL_OKAY;
> -        }
>  
> -        switch ( len )
> +        for ( i = 0; i < len; i++ )
>          {
> -        case 4:
> -            *data = readl(pba + idx);
> -            break;
> +            unsigned long partial = ~0ul;

Pointless initializer (~0ul is written first thing above, i.e. also in
the recursive invocation). Then again that setting is also redundant
with msix_read()'s. So I guess the initializer wants to stay but the
setting at the top of the function can be dropped.

> +            int rc = adjacent_read(d, msix, addr + i, 1, &partial);
>  
> -        case 8:
> -            *data = readq(pba + idx);
> -            break;
> +            if ( rc != X86EMUL_OKAY )
> +                return rc;
>  
> -        default:
> -            ASSERT_UNREACHABLE();
> -            break;
> +            *data &= ~(0xfful << (i * 8));
> +            *data |= ((uint8_t)partial) << (i * 8);

This is UB for i >= 4; you'd need to cast back to unsigned long, at which
point I think the cast-free variant of masking by 0xff is to be preferred.

>          }
>  
>          return X86EMUL_OKAY;
>      }
>  
> +    slot = get_slot(vpci, addr);
> +    if ( slot >= ARRAY_SIZE(msix->table) )
> +        return X86EMUL_OKAY;
> +
> +    mem = get_table(vpci, slot);
> +    if ( !mem )
> +    {
> +        gprintk(XENLOG_WARNING,
> +                "%pp: unable to map MSI-X page, returning all bits set\n",
> +                &msix->pdev->sbdf);
> +        return X86EMUL_OKAY;
> +    }

Could this be moved ahead of the unaligned handling, so there would be
(commonly) only one such log message even for accesses we split?

Jan

Re: [PATCH v2] vpci/msix: handle accesses adjacent to the MSI-X table

Posted by Roger Pau Monné 2 years, 10 months ago

On Mon, Mar 20, 2023 at 01:08:48PM +0100, Jan Beulich wrote:
> On 16.03.2023 13:07, Roger Pau Monne wrote:
> > --- a/xen/drivers/vpci/msix.c
> > +++ b/xen/drivers/vpci/msix.c
> > @@ -27,6 +27,11 @@
> >      ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
> >       (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
> >  
> > +#define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr)                              \
> > +    (PFN_DOWN(addr) >= PFN_DOWN(vmsix_table_addr(vpci, nr)) &&            \
> > +     PFN_DOWN((addr)) < PFN_UP(vmsix_table_addr(vpci, nr) +               \
> > +                               vmsix_table_size(vpci, nr) - 1))
> 
> Looks like this would be better in line with get_slot() (and slightly
> cheaper) if the 2nd comparison was ... <= PFN_DOWN().

Can adjust, I don't have a strong opinion.

> 
> > @@ -149,7 +154,7 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
> >  
> >          for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
> >              if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
> > -                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
> > +                 VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
> >                  return msix;
> >      }
> >  
> > @@ -182,93 +187,201 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
> >      return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
> >  }
> >  
> > -static void __iomem *get_pba(struct vpci *vpci)
> > +static void __iomem *get_table(struct vpci *vpci, unsigned int slot)
> >  {
> >      struct vpci_msix *msix = vpci->msix;
> >      /*
> > -     * PBA will only be unmapped when the device is deassigned, so access it
> > -     * without holding the vpci lock.
> > +     * Regions will only be unmapped when the device is deassigned, so access
> > +     * them without holding the vpci lock.
> >       */
> > -    void __iomem *pba = read_atomic(&msix->pba);
> > +    void __iomem *table = read_atomic(&msix->table[slot]);
> > +    paddr_t addr = 0;
> >  
> > -    if ( likely(pba) )
> > -        return pba;
> > +    if ( likely(table) )
> > +        return table;
> >  
> > -    pba = ioremap(vmsix_table_addr(vpci, VPCI_MSIX_PBA),
> > -                  vmsix_table_size(vpci, VPCI_MSIX_PBA));
> > -    if ( !pba )
> > -        return read_atomic(&msix->pba);
> > +    switch ( slot )
> > +    {
> > +    case VPCI_MSIX_TBL_TAIL:
> > +        addr = vmsix_table_size(vpci, VPCI_MSIX_TABLE);
> > +        fallthrough;
> > +    case VPCI_MSIX_TBL_HEAD:
> > +        addr += vmsix_table_addr(vpci, VPCI_MSIX_TABLE);
> > +        break;
> > +
> > +    case VPCI_MSIX_PBA_TAIL:
> > +        addr = vmsix_table_size(vpci, VPCI_MSIX_PBA);
> > +        fallthrough;
> > +    case VPCI_MSIX_PBA_HEAD:
> > +        addr += vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> > +        break;
> 
> Hmm, wasn't the plan to stop special-casing the PBA, including its
> special treatment wrt the p2m?

I had a pre-patch to do that, but TBH I'm not sure it's worth handling
dom0 vs domU differently here, all the more so since PBA accesses
aren't that common anyway.

> Reading on I realize you need this for
> the (future) DomU case (to enforce r/o-ness, albeit having looked at
> the spec again the other day I'm not really convinced anymore we
> really need to squash writes), but we should be able to avoid the
> extra overhead for Dom0? (Granted it may make sense to leave this to
> a separate patch, if we want to keep the DomU handling despite not
> presently needing it.)

I can indeed avoid the trapping for dom0, it's just a 2 line change to
vpci_make_msix_hole() IIRC.  Since I've already done the code, let's
try to get the handling done for both dom0 and domU, and I can submit
the improvement for dom0 as a separate patch.

> > +    }
> > +
> > +    table = ioremap(round_pgdown(addr), PAGE_SIZE);
> > +    if ( !table )
> > +        return read_atomic(&msix->table[slot]);
> >  
> >      spin_lock(&vpci->lock);
> > -    if ( !msix->pba )
> > +    if ( !msix->table[slot] )
> >      {
> > -        write_atomic(&msix->pba, pba);
> > +        write_atomic(&msix->table[slot], table);
> >          spin_unlock(&vpci->lock);
> >      }
> >      else
> >      {
> >          spin_unlock(&vpci->lock);
> > -        iounmap(pba);
> > +        iounmap(table);
> >      }
> >  
> > -    return read_atomic(&msix->pba);
> > +    return read_atomic(&msix->table[slot]);
> >  }
> >  
> > -static int cf_check msix_read(
> > -    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
> > +unsigned int get_slot(const struct vpci *vpci, unsigned long addr)
> >  {
> > -    const struct domain *d = v->domain;
> > -    struct vpci_msix *msix = msix_find(d, addr);
> > -    const struct vpci_msix_entry *entry;
> > -    unsigned int offset;
> > +    unsigned long pfn = PFN_DOWN(addr);
> >  
> > -    *data = ~0ul;
> > +    /*
> > +     * The logic below relies on having the tables identity mapped to the guest
> > +     * address space, or for the `addr` parameter to be translated into its
> > +     * host physical memory address equivalent.
> > +     */
> >  
> > -    if ( !msix )
> > -        return X86EMUL_RETRY;
> > +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE)) )
> > +        return VPCI_MSIX_TBL_HEAD;
> > +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE) +
> > +                         vmsix_table_size(vpci, VPCI_MSIX_TABLE) - 1) )
> > +        return VPCI_MSIX_TBL_TAIL;
> > +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA)) )
> > +        return VPCI_MSIX_PBA_HEAD;
> > +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA) +
> > +                         vmsix_table_size(vpci, VPCI_MSIX_PBA) - 1) )
> > +        return VPCI_MSIX_PBA_TAIL;
> > +
> > +    ASSERT_UNREACHABLE();
> > +    return -1;
> > +}
> >  
> > -    if ( !access_allowed(msix->pdev, addr, len) )
> > -        return X86EMUL_OKAY;
> > +static bool adjacent_handle(const struct vpci_msix *msix, unsigned long addr)
> > +{
> > +    unsigned int i;
> >  
> >      if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
> > +        return true;
> > +
> > +    if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_TABLE) )
> > +        return false;
> > +
> > +    for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
> > +        if ( VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
> > +            return true;
> > +
> > +    return false;
> > +}
> > +
> > +static int adjacent_read(const struct domain *d, const struct vpci_msix *msix,
> > +                         unsigned long addr, unsigned int len,
> > +                         unsigned long *data)
> > +{
> > +    const void __iomem *mem;
> > +    struct vpci *vpci = msix->pdev->vpci;
> > +    unsigned int slot;
> > +
> > +    *data = ~0ul;
> > +
> > +    if ( !adjacent_handle(msix, addr + len - 1) )
> > +        return X86EMUL_OKAY;
> > +
> > +    if ( addr & (len - 1) )
> 
> unlikely()?

Done.

> >      {
> > -        struct vpci *vpci = msix->pdev->vpci;
> > -        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> > -        const void __iomem *pba = get_pba(vpci);
> > +        unsigned int i;
> > +
> > +        gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n",
> > +                &msix->pdev->sbdf);
> >  
> >          /*
> > -         * Access to PBA.
> > +         * Split unaligned accesses into byte sized ones. Shouldn't happen in
> > +         * the first place, but devices shouldn't have registers in the same 4K
> > +         * page as the MSIX tables either.
> >           *
> > -         * TODO: note that this relies on having the PBA identity mapped to the
> > -         * guest address space. If this changes the address will need to be
> > -         * translated.
> > +         * It's unclear whether this could cause issues if a guest expects
> > +         * registers to be accessed atomically, it better use an aligned access
> > +         * if it has such expectations.
> >           */
> > -        if ( !pba )
> > -        {
> > -            gprintk(XENLOG_WARNING,
> > -                    "%pp: unable to map MSI-X PBA, report all pending\n",
> > -                    &msix->pdev->sbdf);
> > -            return X86EMUL_OKAY;
> > -        }
> >  
> > -        switch ( len )
> > +        for ( i = 0; i < len; i++ )
> >          {
> > -        case 4:
> > -            *data = readl(pba + idx);
> > -            break;
> > +            unsigned long partial = ~0ul;
> 
> Pointless initializer (~0ul is written first thing above, i.e. also in
> the recursive invocation). Then again that setting is also redundant
> with msix_read()'s. So I guess the initializer wants to stay but the
> setting at the top of the function can be dropped.

I'm always extra cautious with variables on the stack that contain
data to be returned to the guest.  All your points are valid and
correct, but I like to explicitly initialize them so that further
changes in the functions don't end up leaking them.  If you don't mind
that much I would rather leave it as-is.

> 
> > +            int rc = adjacent_read(d, msix, addr + i, 1, &partial);
> >  
> > -        case 8:
> > -            *data = readq(pba + idx);
> > -            break;
> > +            if ( rc != X86EMUL_OKAY )
> > +                return rc;
> >  
> > -        default:
> > -            ASSERT_UNREACHABLE();
> > -            break;
> > +            *data &= ~(0xfful << (i * 8));
> > +            *data |= ((uint8_t)partial) << (i * 8);
> 
> This is UB for i >= 4; you'd need to cast back to unsigned long, at which
> point I think the cast-free variant of masking by 0xff is to be preferred.

Indeed.

> >          }
> >  
> >          return X86EMUL_OKAY;
> >      }
> >  
> > +    slot = get_slot(vpci, addr);
> > +    if ( slot >= ARRAY_SIZE(msix->table) )
> > +        return X86EMUL_OKAY;
> > +
> > +    mem = get_table(vpci, slot);
> > +    if ( !mem )
> > +    {
> > +        gprintk(XENLOG_WARNING,
> > +                "%pp: unable to map MSI-X page, returning all bits set\n",
> > +                &msix->pdev->sbdf);
> > +        return X86EMUL_OKAY;
> > +    }
> 
> Could this be moved ahead of the unaligned handling, so there would be
> (commonly) only one such log message even for accesses we split?

Quite likely, let me take a look.

Thanks, Roger.

Re: [PATCH v2] vpci/msix: handle accesses adjacent to the MSI-X table

Posted by Jan Beulich 2 years, 10 months ago

On 21.03.2023 16:31, Roger Pau Monné wrote:
> On Mon, Mar 20, 2023 at 01:08:48PM +0100, Jan Beulich wrote:
>> On 16.03.2023 13:07, Roger Pau Monne wrote:
>>>      {
>>> -        struct vpci *vpci = msix->pdev->vpci;
>>> -        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
>>> -        const void __iomem *pba = get_pba(vpci);
>>> +        unsigned int i;
>>> +
>>> +        gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n",
>>> +                &msix->pdev->sbdf);
>>>  
>>>          /*
>>> -         * Access to PBA.
>>> +         * Split unaligned accesses into byte sized ones. Shouldn't happen in
>>> +         * the first place, but devices shouldn't have registers in the same 4K
>>> +         * page as the MSIX tables either.
>>>           *
>>> -         * TODO: note that this relies on having the PBA identity mapped to the
>>> -         * guest address space. If this changes the address will need to be
>>> -         * translated.
>>> +         * It's unclear whether this could cause issues if a guest expects
>>> +         * registers to be accessed atomically, it better use an aligned access
>>> +         * if it has such expectations.
>>>           */
>>> -        if ( !pba )
>>> -        {
>>> -            gprintk(XENLOG_WARNING,
>>> -                    "%pp: unable to map MSI-X PBA, report all pending\n",
>>> -                    &msix->pdev->sbdf);
>>> -            return X86EMUL_OKAY;
>>> -        }
>>>  
>>> -        switch ( len )
>>> +        for ( i = 0; i < len; i++ )
>>>          {
>>> -        case 4:
>>> -            *data = readl(pba + idx);
>>> -            break;
>>> +            unsigned long partial = ~0ul;
>>
>> Pointless initializer (~0ul is written first thing above, i.e. also in
>> the recursive invocation). Then again that setting is also redundant
>> with msix_read()'s. So I guess the initializer wants to stay but the
>> setting at the top of the function can be dropped.
> 
> I'm always extra cautious with variables on the stack that contain
> data to be returned to the guest.  All your points are valid and
> correct, but I like to explicitly initialize them so that further
> changes in the functions don't end up leaking them.  If you don't mind
> that much I would rather leave it as-is.

Well, such extra code always raises the question "Why is this here?"
But no, I won't insist if you prefer to keep the redundancy.

Jan