[Xen-devel] [PATCH v5 00/10] AMD IOMMU: further improvements

Jan Beulich posted 10 patches 18 weeks ago
Only 0 patches received!

[Xen-devel] [PATCH v5 00/10] AMD IOMMU: further improvements

Posted by Jan Beulich 18 weeks ago
Only the first patch here is carried over from v4; everything else is
new, yet still related. The main goal is to reduce the huge memory
overhead that we've noticed. Along the way a number of other issues
were noticed as well. Unfortunately, before I was able to also test the
last two patches in the series, my Rome box broke again. Hence those
two patches have been tested only on a (less affected) Fam15 system.

01: miscellaneous DTE handling adjustments
02: drop stray "else"
03: don't free shared IRT multiple times
04: introduce a "valid" flag for IVRS mappings
05: let callers of amd_iommu_alloc_intremap_table() handle errors
06: don't blindly allocate interrupt remapping tables
07: make phantom functions share interrupt remapping tables
08: x86/PCI: read MSI-X table entry count early
09: replace INTREMAP_ENTRIES
10: restrict interrupt remapping table sizes

The full set of patches is once again attached here, due to
still-unresolved email issues on my end.

Jan


AMD/IOMMU: miscellaneous DTE handling adjustments

First and foremost switch boolean fields to bool. Adjust a few related
function parameters as well. Then
- in amd_iommu_set_intremap_table() don't use literal numbers,
- in iommu_dte_add_device_entry() use a compound literal instead of many
  assignments,
- in amd_iommu_setup_domain_device()
  - eliminate a pointless local variable,
  - use || instead of && when deciding whether to clear an entry,
  - clear the I field without any checking of ATS / IOTLB state,
- leave reserved fields unnamed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v5: IOMMU_INTREMAP_LENGTH -> IOMMU_INTREMAP_ORDER. Adjust comment.
v4: New.

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,8 +69,7 @@ union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_LENGTH 0xB
-#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
+#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -101,51 +101,52 @@ static unsigned int set_iommu_pte_presen
 
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
                                    uint64_t root_ptr, uint16_t domain_id,
-                                   uint8_t paging_mode, uint8_t valid)
+                                   uint8_t paging_mode, bool valid)
 {
     dte->domain_id = domain_id;
     dte->pt_root = paddr_to_pfn(root_ptr);
-    dte->iw = 1;
-    dte->ir = 1;
+    dte->iw = true;
+    dte->ir = true;
     dte->paging_mode = paging_mode;
-    dte->tv = 1;
+    dte->tv = true;
     dte->v = valid;
 }
 
 void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, uint8_t int_valid)
+    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
 {
     dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = 0xb; /* 2048 entries */
-    dte->int_ctl = 2; /* fixed and arbitrated interrupts remapped */
-    dte->ig = 0; /* unmapped interrupt results io page faults */
-    dte->iv = int_valid;
+    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
+    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->ig = false; /* unmapped interrupts result in i/o page faults */
+    dte->iv = valid;
 }
 
 void __init iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                       struct ivrs_mappings *ivrs_dev)
+                                       const struct ivrs_mappings *ivrs_dev)
 {
     uint8_t flags = ivrs_dev->device_flags;
 
-    memset(dte, 0, sizeof(*dte));
-
-    dte->init_pass = MASK_EXTR(flags, ACPI_IVHD_INIT_PASS);
-    dte->ext_int_pass = MASK_EXTR(flags, ACPI_IVHD_EINT_PASS);
-    dte->nmi_pass = MASK_EXTR(flags, ACPI_IVHD_NMI_PASS);
-    dte->lint0_pass = MASK_EXTR(flags, ACPI_IVHD_LINT0_PASS);
-    dte->lint1_pass = MASK_EXTR(flags, ACPI_IVHD_LINT1_PASS);
-    dte->sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT);
-    dte->ex = ivrs_dev->dte_allow_exclusion;
+    *dte = (struct amd_iommu_dte){
+        .init_pass = flags & ACPI_IVHD_INIT_PASS,
+        .ext_int_pass = flags & ACPI_IVHD_EINT_PASS,
+        .nmi_pass = flags & ACPI_IVHD_NMI_PASS,
+        .lint0_pass = flags & ACPI_IVHD_LINT0_PASS,
+        .lint1_pass = flags & ACPI_IVHD_LINT1_PASS,
+        .ioctl = IOMMU_DEV_TABLE_IO_CONTROL_ABORTED,
+        .sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT),
+        .ex = ivrs_dev->dte_allow_exclusion,
+    };
 }
 
 void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx)
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx)
 {
 #define GCR3_MASK(hi, lo) (((1ul << ((hi) + 1)) - 1) & ~((1ul << (lo)) - 1))
 #define GCR3_SHIFT(lo) ((lo) - PAGE_SHIFT)
 
     /* I bit must be set when gcr3 is enabled */
-    dte->i = 1;
+    dte->i = true;
 
     dte->gcr3_trp_14_12 = (gcr3_mfn & GCR3_MASK(14, 12)) >> GCR3_SHIFT(12);
     dte->gcr3_trp_30_15 = (gcr3_mfn & GCR3_MASK(30, 15)) >> GCR3_SHIFT(15);
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -93,7 +93,6 @@ static void amd_iommu_setup_domain_devic
     struct amd_iommu_dte *table, *dte;
     unsigned long flags;
     int req_id, valid = 1;
-    int dte_i = 0;
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);
 
@@ -103,9 +102,6 @@ static void amd_iommu_setup_domain_devic
     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
         valid = 0;
 
-    if ( ats_enabled )
-        dte_i = 1;
-
     /* get device-table entry */
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     table = iommu->dev_table.buffer;
@@ -122,7 +118,7 @@ static void amd_iommu_setup_domain_devic
 
         if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
              iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = dte_i;
+            dte->i = ats_enabled;
 
         amd_iommu_flush_device(iommu, req_id);
 
@@ -288,14 +284,11 @@ void amd_iommu_disable_domain_device(str
     dte = &table[req_id];
 
     spin_lock_irqsave(&iommu->lock, flags);
-    if ( dte->tv && dte->v )
+    if ( dte->tv || dte->v )
     {
-        dte->tv = 0;
-        dte->v = 0;
-
-        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
-             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = 0;
+        dte->tv = false;
+        dte->v = false;
+        dte->i = false;
 
         amd_iommu_flush_device(iommu, req_id);
 
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,57 +107,60 @@
 #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
 #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
 
+/* For now, we always allocate the maximum: 2048 entries. */
+#define IOMMU_INTREMAP_ORDER			0xB
+
 struct amd_iommu_dte {
     /* 0 - 63 */
-    uint64_t v:1;
-    uint64_t tv:1;
-    uint64_t reserved0:5;
-    uint64_t had:2;
-    uint64_t paging_mode:3;
+    bool v:1;
+    bool tv:1;
+    unsigned int :5;
+    unsigned int had:2;
+    unsigned int paging_mode:3;
     uint64_t pt_root:40;
-    uint64_t ppr:1;
-    uint64_t gprp:1;
-    uint64_t giov:1;
-    uint64_t gv:1;
-    uint64_t glx:2;
-    uint64_t gcr3_trp_14_12:3;
-    uint64_t ir:1;
-    uint64_t iw:1;
-    uint64_t reserved1:1;
+    bool ppr:1;
+    bool gprp:1;
+    bool giov:1;
+    bool gv:1;
+    unsigned int glx:2;
+    unsigned int gcr3_trp_14_12:3;
+    bool ir:1;
+    bool iw:1;
+    unsigned int :1;
 
     /* 64 - 127 */
-    uint64_t domain_id:16;
-    uint64_t gcr3_trp_30_15:16;
-    uint64_t i:1;
-    uint64_t se:1;
-    uint64_t sa:1;
-    uint64_t ioctl:2;
-    uint64_t cache:1;
-    uint64_t sd:1;
-    uint64_t ex:1;
-    uint64_t sys_mgt:2;
-    uint64_t reserved2:1;
-    uint64_t gcr3_trp_51_31:21;
+    unsigned int domain_id:16;
+    unsigned int gcr3_trp_30_15:16;
+    bool i:1;
+    bool se:1;
+    bool sa:1;
+    unsigned int ioctl:2;
+    bool cache:1;
+    bool sd:1;
+    bool ex:1;
+    unsigned int sys_mgt:2;
+    unsigned int :1;
+    unsigned int gcr3_trp_51_31:21;
 
     /* 128 - 191 */
-    uint64_t iv:1;
-    uint64_t int_tab_len:4;
-    uint64_t ig:1;
+    bool iv:1;
+    unsigned int int_tab_len:4;
+    bool ig:1;
     uint64_t it_root:46;
-    uint64_t reserved3:4;
-    uint64_t init_pass:1;
-    uint64_t ext_int_pass:1;
-    uint64_t nmi_pass:1;
-    uint64_t reserved4:1;
-    uint64_t int_ctl:2;
-    uint64_t lint0_pass:1;
-    uint64_t lint1_pass:1;
+    unsigned int :4;
+    bool init_pass:1;
+    bool ext_int_pass:1;
+    bool nmi_pass:1;
+    unsigned int :1;
+    unsigned int int_ctl:2;
+    bool lint0_pass:1;
+    bool lint1_pass:1;
 
     /* 192 - 255 */
-    uint64_t reserved5:54;
-    uint64_t attr_v:1;
-    uint64_t mode0_fc:1;
-    uint64_t snoop_attr:8;
+    uint64_t :54;
+    bool attr_v:1;
+    bool mode0_fc:1;
+    unsigned int snoop_attr:8;
 };
 
 /* Command Buffer */
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -73,14 +73,14 @@ int __must_check amd_iommu_flush_iotlb_a
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
 void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
                                   uint64_t intremap_ptr,
-                                  uint8_t int_valid);
+                                  bool valid);
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
 				   uint64_t root_ptr, uint16_t domain_id,
-				   uint8_t paging_mode, uint8_t valid);
+				   uint8_t paging_mode, bool valid);
 void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                struct ivrs_mappings *ivrs_dev);
+                                const struct ivrs_mappings *ivrs_dev);
 void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx);
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx);
 
 /* send cmd to iommu */
 void amd_iommu_flush_all_pages(struct domain *d);
AMD/IOMMU: drop stray "else"

The blank line between it and the prior if() clearly indicates that this
was meant to be a standalone if().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -166,8 +166,8 @@ static int __init iov_detect(void)
     if ( !iommu_enable && !iommu_intremap )
         return 0;
 
-    else if ( (init_done ? amd_iommu_init_interrupt()
-                         : amd_iommu_init(false)) != 0 )
+    if ( (init_done ? amd_iommu_init_interrupt()
+                    : amd_iommu_init(false)) != 0 )
     {
         printk("AMD-Vi: Error initialization\n");
         return -ENODEV;
AMD/IOMMU: don't free shared IRT multiple times

Calling amd_iommu_free_intremap_table() for every IVRS entry is correct
only in per-device-IRT mode. Use a NULL 2nd argument to indicate that
the shared table should be freed, and call the function exactly once in
shared mode.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1106,6 +1106,15 @@ static void __init amd_iommu_init_cleanu
 {
     struct amd_iommu *iommu, *next;
 
+    /* free interrupt remapping table */
+    if ( amd_iommu_perdev_intremap )
+        iterate_ivrs_entries(amd_iommu_free_intremap_table);
+    else if ( shared_intremap_table )
+        amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
+                                                       struct amd_iommu,
+                                                       list),
+                                      NULL);
+
     /* free amd iommu list */
     list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
     {
@@ -1128,9 +1137,6 @@ static void __init amd_iommu_init_cleanu
         xfree(iommu);
     }
 
-    /* free interrupt remapping table */
-    iterate_ivrs_entries(amd_iommu_free_intremap_table);
-
     /* free device table */
     deallocate_device_table(&device_table);
 
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -792,14 +792,23 @@ void amd_iommu_read_msi_from_ire(
 int __init amd_iommu_free_intremap_table(
     const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
 {
-    void *tb = ivrs_mapping->intremap_table;
+    void **tblp;
 
-    XFREE(ivrs_mapping->intremap_inuse);
+    if ( ivrs_mapping )
+    {
+        XFREE(ivrs_mapping->intremap_inuse);
+        tblp = &ivrs_mapping->intremap_table;
+    }
+    else
+    {
+        XFREE(shared_intremap_inuse);
+        tblp = &shared_intremap_table;
+    }
 
-    if ( tb )
+    if ( *tblp )
     {
-        __free_amd_iommu_tables(tb, intremap_table_order(iommu));
-        ivrs_mapping->intremap_table = NULL;
+        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        *tblp = NULL;
     }
 
     return 0;
AMD/IOMMU: introduce a "valid" flag for IVRS mappings

For us to no longer blindly allocate interrupt remapping tables for
everything the ACPI tables name, we can't use struct ivrs_mappings'
intremap_table field anymore to also have the meaning of "this entry
is valid". Add a separate boolean field instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -88,6 +88,8 @@ static void __init add_ivrs_mapping_entr
          }
     }
 
+    ivrs_mappings[alias_id].valid = true;
+
     /* Assign IOMMU hardware. */
     ivrs_mappings[bdf].iommu = iommu;
 }
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1244,7 +1244,6 @@ static int __init amd_iommu_setup_device
     u16 seg, struct ivrs_mappings *ivrs_mappings)
 {
     unsigned int bdf;
-    void *intr_tb, *dte;
 
     BUG_ON( (ivrs_bdf_entries == 0) );
 
@@ -1264,16 +1263,17 @@ static int __init amd_iommu_setup_device
     /* Add device table entries */
     for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
     {
-        intr_tb = ivrs_mappings[bdf].intremap_table;
-
-        if ( intr_tb )
+        if ( ivrs_mappings[bdf].valid )
         {
+            void *dte;
+
             /* add device table entry */
             dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
             iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
 
             amd_iommu_set_intremap_table(
-                dte, (u64)virt_to_maddr(intr_tb), iommu_intremap);
+                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                iommu_intremap);
         }
     }
 
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -69,8 +69,8 @@ struct amd_iommu *find_iommu_for_device(
  * table and I/O page table respectively. Such devices will have
  * both alias entry and select entry in IVRS structure.
  *
- * Return original device id, if device has valid interrupt remapping
- * table setup for both select entry and alias entry.
+ * Return original device id if both the specific entry and the alias entry
+ * have been marked valid.
  */
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf)
 {
@@ -79,8 +79,7 @@ int get_dma_requestor_id(uint16_t seg, u
 
     BUG_ON ( bdf >= ivrs_bdf_entries );
     req_id = ivrs_mappings[bdf].dte_requestor_id;
-    if ( (ivrs_mappings[bdf].intremap_table != NULL) &&
-         (ivrs_mappings[req_id].intremap_table != NULL) )
+    if ( ivrs_mappings[bdf].valid && ivrs_mappings[req_id].valid )
         req_id = bdf;
 
     return req_id;
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -111,6 +111,7 @@ struct ivrs_mappings {
     u8 unity_map_enable;
     u8 write_permission;
     u8 read_permission;
+    bool valid;
     unsigned long addr_range_start;
     unsigned long addr_range_length;
     struct amd_iommu *iommu;
AMD/IOMMU: let callers of amd_iommu_alloc_intremap_table() handle errors

Additional users of the function will want to handle errors more
gracefully. Remove the BUG_ON()s and make the current caller panic()
instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -86,6 +86,10 @@ static void __init add_ivrs_mapping_entr
              ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
              ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
          }
+
+         if ( !ivrs_mappings[alias_id].intremap_table )
+             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
+                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
     }
 
     ivrs_mappings[alias_id].valid = true;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -817,12 +817,22 @@ int __init amd_iommu_free_intremap_table
 void *__init amd_iommu_alloc_intremap_table(
     const struct amd_iommu *iommu, unsigned long **inuse_map)
 {
-    void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+    unsigned int order = intremap_table_order(iommu);
+    void *tb = __alloc_amd_iommu_tables(order);
+
+    if ( tb )
+    {
+        *inuse_map = xzalloc_array(unsigned long,
+                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        if ( *inuse_map )
+            memset(tb, 0, PAGE_SIZE << order);
+        else
+        {
+            __free_amd_iommu_tables(tb, order);
+            tb = NULL;
+        }
+    }
 
-    BUG_ON(tb == NULL);
-    memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
-    *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
-    BUG_ON(*inuse_map == NULL);
     return tb;
 }
 
AMD/IOMMU: don't blindly allocate interrupt remapping tables

ACPI tables are free to list far more device coordinates than there are
actual devices. By delaying the table allocations for most cases, and
doing them only when an actual device is known to be present at a given
position, overall memory used for the tables goes down from over 500k
pages to just over 1k ones.

Tables continue to get allocated right away for special entries
(IO-APIC, HPET) as well as for alias IDs. While in the former case
that's simply because there may not be any device at a given position,
in the latter case this is to avoid having to introduce ref-counting of
table usage.

The change involves invoking
iterate_ivrs_mappings(amd_iommu_setup_device_table) a second time,
because the function now wants to be able to find PCI devices, which
isn't possible yet when IOMMU setup happens very early during x2APIC
mode setup. In this context amd_iommu_init_interrupt() gets renamed as
well.

The logic adjusting a DTE's interrupt remapping attributes also gets
changed, such that the lack of an IRT would result in target aborted
rather than not remapped interrupts (should any occur).

Note that for now phantom functions get separate IRTs allocated, as was
the case before.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.
---
TBD: This retains prior (but suspicious) behavior of not calling
     amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
     Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
     changing.

Backporting note: This depends on b5fbe81196 "iommu / x86: move call to
scan_pci_devices() out of vendor code"!

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -53,7 +53,8 @@ union acpi_ivhd_device {
 };
 
 static void __init add_ivrs_mapping_entry(
-    u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu)
+    uint16_t bdf, uint16_t alias_id, uint8_t flags, bool alloc_irt,
+    struct amd_iommu *iommu)
 {
     struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
 
@@ -69,27 +70,32 @@ static void __init add_ivrs_mapping_entr
     if ( iommu->bdf == bdf )
         return;
 
-    if ( !ivrs_mappings[alias_id].intremap_table )
+    /* Allocate interrupt remapping table if needed. */
+    if ( iommu_intremap && !ivrs_mappings[alias_id].intremap_table )
     {
-         /* allocate per-device interrupt remapping table */
-         if ( amd_iommu_perdev_intremap )
-             ivrs_mappings[alias_id].intremap_table =
-                 amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &ivrs_mappings[alias_id].intremap_inuse);
-         else
-         {
-             if ( shared_intremap_table == NULL  )
-                 shared_intremap_table = amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &shared_intremap_inuse);
-             ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
-             ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
-         }
-
-         if ( !ivrs_mappings[alias_id].intremap_table )
-             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
-                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
+        if ( !amd_iommu_perdev_intremap )
+        {
+            if ( !shared_intremap_table )
+                shared_intremap_table = amd_iommu_alloc_intremap_table(
+                    iommu, &shared_intremap_inuse);
+
+            if ( !shared_intremap_table )
+                panic("No memory for shared IRT\n");
+
+            ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
+            ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
+        }
+        else if ( alloc_irt )
+        {
+            ivrs_mappings[alias_id].intremap_table =
+                amd_iommu_alloc_intremap_table(
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+
+            if ( !ivrs_mappings[alias_id].intremap_table )
+                panic("No memory for %04x:%02x:%02x.%u's IRT\n",
+                      iommu->seg, PCI_BUS(alias_id), PCI_SLOT(alias_id),
+                      PCI_FUNC(alias_id));
+        }
     }
 
     ivrs_mappings[alias_id].valid = true;
@@ -433,7 +439,8 @@ static u16 __init parse_ivhd_device_sele
         return 0;
     }
 
-    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, false,
+                           iommu);
 
     return sizeof(*select);
 }
@@ -479,7 +486,7 @@ static u16 __init parse_ivhd_device_rang
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, bdf, range->start.header.data_setting,
-                               iommu);
+                               false, iommu);
 
     return dev_length;
 }
@@ -513,7 +520,8 @@ static u16 __init parse_ivhd_device_alia
 
     AMD_IOMMU_DEBUG(" Dev_Id Alias: %#x\n", alias_id);
 
-    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, true,
+                           iommu);
 
     return dev_length;
 }
@@ -568,7 +576,7 @@ static u16 __init parse_ivhd_device_alia
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, alias_id, range->alias.header.data_setting,
-                               iommu);
+                               true, iommu);
 
     return dev_length;
 }
@@ -593,7 +601,7 @@ static u16 __init parse_ivhd_device_exte
         return 0;
     }
 
-    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, false, iommu);
 
     return dev_length;
 }
@@ -640,7 +648,7 @@ static u16 __init parse_ivhd_device_exte
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, bdf, range->extended.header.data_setting,
-                               iommu);
+                               false, iommu);
 
     return dev_length;
 }
@@ -733,7 +741,8 @@ static u16 __init parse_ivhd_device_spec
     AMD_IOMMU_DEBUG("IVHD Special: %04x:%02x:%02x.%u variety %#x handle %#x\n",
                     seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf),
                     special->variety, special->handle);
-    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, true,
+                           iommu);
 
     switch ( special->variety )
     {
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -30,6 +30,7 @@
 #include <xen/delay.h>
 
 static int __initdata nr_amd_iommus;
+static bool __initdata pci_init;
 
 static void do_amd_iommu_irq(unsigned long data);
 static DECLARE_SOFTIRQ_TASKLET(amd_iommu_irq_tasklet, do_amd_iommu_irq, 0);
@@ -1247,17 +1248,20 @@ static int __init amd_iommu_setup_device
 
     BUG_ON( (ivrs_bdf_entries == 0) );
 
-    /* allocate 'device table' on a 4K boundary */
-    device_table.alloc_size = PAGE_SIZE <<
-                              get_order_from_bytes(
-                              PAGE_ALIGN(ivrs_bdf_entries *
-                              IOMMU_DEV_TABLE_ENTRY_SIZE));
-    device_table.entries = device_table.alloc_size /
-                           IOMMU_DEV_TABLE_ENTRY_SIZE;
-
-    device_table.buffer = allocate_buffer(device_table.alloc_size,
-                                          "Device Table");
-    if  ( device_table.buffer == NULL )
+    if ( !device_table.buffer )
+    {
+        /* allocate 'device table' on a 4K boundary */
+        device_table.alloc_size = PAGE_SIZE <<
+                                  get_order_from_bytes(
+                                  PAGE_ALIGN(ivrs_bdf_entries *
+                                  IOMMU_DEV_TABLE_ENTRY_SIZE));
+        device_table.entries = device_table.alloc_size /
+                               IOMMU_DEV_TABLE_ENTRY_SIZE;
+
+        device_table.buffer = allocate_buffer(device_table.alloc_size,
+                                              "Device Table");
+    }
+    if ( !device_table.buffer )
         return -ENOMEM;
 
     /* Add device table entries */
@@ -1266,13 +1270,46 @@ static int __init amd_iommu_setup_device
         if ( ivrs_mappings[bdf].valid )
         {
             void *dte;
+            const struct pci_dev *pdev = NULL;
 
             /* add device table entry */
             dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
             iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
 
+            if ( iommu_intremap &&
+                 ivrs_mappings[bdf].dte_requestor_id == bdf &&
+                 !ivrs_mappings[bdf].intremap_table )
+            {
+                if ( !pci_init )
+                    continue;
+                pcidevs_lock();
+                pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+                pcidevs_unlock();
+            }
+
+            if ( pdev )
+            {
+                unsigned int req_id = bdf;
+
+                do {
+                    ivrs_mappings[req_id].intremap_table =
+                        amd_iommu_alloc_intremap_table(
+                            ivrs_mappings[bdf].iommu,
+                            &ivrs_mappings[req_id].intremap_inuse);
+                    if ( !ivrs_mappings[req_id].intremap_table )
+                        return -ENOMEM;
+
+                    if ( !pdev->phantom_stride )
+                        break;
+                    req_id += pdev->phantom_stride;
+                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+            }
+
             amd_iommu_set_intremap_table(
-                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                dte,
+                ivrs_mappings[bdf].intremap_table
+                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+                : 0,
                 iommu_intremap);
         }
     }
@@ -1405,7 +1442,8 @@ int __init amd_iommu_init(bool xt)
     if ( rc )
         goto error_out;
 
-    /* allocate and initialize a global device table shared by all iommus */
+    /* Allocate and initialize device table(s). */
+    pci_init = !xt;
     rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
     if ( rc )
         goto error_out;
@@ -1424,7 +1462,7 @@ int __init amd_iommu_init(bool xt)
         /*
          * Setting up of the IOMMU interrupts cannot occur yet at the (very
          * early) time we get here when enabling x2APIC mode. Suppress it
-         * here, and do it explicitly in amd_iommu_init_interrupt().
+         * here, and do it explicitly in amd_iommu_init_late().
          */
         rc = amd_iommu_init_one(iommu, !xt);
         if ( rc )
@@ -1438,11 +1476,16 @@ error_out:
     return rc;
 }
 
-int __init amd_iommu_init_interrupt(void)
+int __init amd_iommu_init_late(void)
 {
     struct amd_iommu *iommu;
     int rc = 0;
 
+    /* Further initialize the device table(s). */
+    pci_init = true;
+    if ( iommu_intremap )
+        rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
+
     for_each_amd_iommu ( iommu )
     {
         struct irq_desc *desc;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -789,7 +789,7 @@ void amd_iommu_read_msi_from_ire(
     }
 }
 
-int __init amd_iommu_free_intremap_table(
+int amd_iommu_free_intremap_table(
     const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
 {
     void **tblp;
@@ -814,7 +814,7 @@ int __init amd_iommu_free_intremap_table
     return 0;
 }
 
-void *__init amd_iommu_alloc_intremap_table(
+void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *iommu, unsigned long **inuse_map)
 {
     unsigned int order = intremap_table_order(iommu);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -116,8 +116,9 @@ void __init amd_iommu_set_intremap_table
     struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
 {
     dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
-    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
+    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
+                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
     dte->ig = false; /* unmapped interrupts result in i/o page faults */
     dte->iv = valid;
 }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -165,7 +165,7 @@ static int __init iov_detect(void)
     if ( !iommu_enable && !iommu_intremap )
         return 0;
 
-    if ( (init_done ? amd_iommu_init_interrupt()
+    if ( (init_done ? amd_iommu_init_late()
                     : amd_iommu_init(false)) != 0 )
     {
         printk("AMD-Vi: Error initialization\n");
@@ -429,6 +429,7 @@ static int amd_iommu_add_device(u8 devfn
 {
     struct amd_iommu *iommu;
     u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
 
     if ( !pdev->domain )
         return -EINVAL;
@@ -458,6 +459,32 @@ static int amd_iommu_add_device(u8 devfn
         return -ENODEV;
     }
 
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( !ivrs_mappings ||
+         !ivrs_mappings[ivrs_mappings[bdf].dte_requestor_id].valid )
+        return -EPERM;
+
+    if ( iommu_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         !ivrs_mappings[bdf].intremap_table )
+    {
+        ivrs_mappings[bdf].intremap_table =
+            amd_iommu_alloc_intremap_table(
+                iommu, &ivrs_mappings[bdf].intremap_inuse);
+        if ( !ivrs_mappings[bdf].intremap_table )
+            return -ENOMEM;
+
+        amd_iommu_set_intremap_table(
+            iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
+            ivrs_mappings[bdf].intremap_table
+            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+            : 0,
+            iommu_intremap);
+
+        amd_iommu_flush_device(iommu, bdf);
+    }
+
     amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
     return 0;
 }
@@ -466,6 +493,8 @@ static int amd_iommu_remove_device(u8 de
 {
     struct amd_iommu *iommu;
     u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
+
     if ( !pdev->domain )
         return -EINVAL;
 
@@ -481,6 +510,14 @@ static int amd_iommu_remove_device(u8 de
     }
 
     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( amd_iommu_perdev_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         ivrs_mappings[bdf].intremap_table )
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+
     return 0;
 }
 
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -50,7 +50,7 @@ void get_iommu_features(struct amd_iommu
 /* amd-iommu-init functions */
 int amd_iommu_prepare(bool xt);
 int amd_iommu_init(bool xt);
-int amd_iommu_init_interrupt(void);
+int amd_iommu_init_late(void);
 int amd_iommu_update_ivrs_mapping_acpi(void);
 int iov_adjust_irq_affinities(void);
 
AMD/IOMMU: make phantom functions share interrupt remapping tables

Rather than duplicating entries in amd_iommu_msi_msg_update_ire(), share
the tables. This mainly requires some care while freeing them, to avoid
freeing memory blocks twice.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1114,7 +1114,7 @@ static void __init amd_iommu_init_cleanu
         amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
                                                        struct amd_iommu,
                                                        list),
-                                      NULL);
+                                      NULL, 0);
 
     /* free amd iommu list */
     list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
@@ -1179,7 +1179,7 @@ int iterate_ivrs_mappings(int (*handler)
 }
 
 int iterate_ivrs_entries(int (*handler)(const struct amd_iommu *,
-                                        struct ivrs_mappings *))
+                                        struct ivrs_mappings *, uint16_t bdf))
 {
     u16 seg = 0;
     int rc = 0;
@@ -1196,7 +1196,7 @@ int iterate_ivrs_entries(int (*handler)(
             const struct amd_iommu *iommu = map[bdf].iommu;
 
             if ( iommu && map[bdf].dte_requestor_id == bdf )
-                rc = handler(iommu, &map[bdf]);
+                rc = handler(iommu, &map[bdf], bdf);
         }
     } while ( !rc && ++seg );
 
@@ -1289,20 +1289,29 @@ static int __init amd_iommu_setup_device
 
             if ( pdev )
             {
-                unsigned int req_id = bdf;
-
-                do {
-                    ivrs_mappings[req_id].intremap_table =
-                        amd_iommu_alloc_intremap_table(
-                            ivrs_mappings[bdf].iommu,
-                            &ivrs_mappings[req_id].intremap_inuse);
-                    if ( !ivrs_mappings[req_id].intremap_table )
-                        return -ENOMEM;
-
-                    if ( !pdev->phantom_stride )
-                        break;
-                    req_id += pdev->phantom_stride;
-                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+                ivrs_mappings[bdf].intremap_table =
+                    amd_iommu_alloc_intremap_table(
+                        ivrs_mappings[bdf].iommu,
+                        &ivrs_mappings[bdf].intremap_inuse);
+                if ( !ivrs_mappings[bdf].intremap_table )
+                    return -ENOMEM;
+
+                if ( pdev->phantom_stride )
+                {
+                    unsigned int req_id = bdf;
+
+                    for ( ; ; )
+                    {
+                        req_id += pdev->phantom_stride;
+                        if ( PCI_SLOT(req_id) != pdev->sbdf.dev )
+                            break;
+
+                        ivrs_mappings[req_id].intremap_table =
+                            ivrs_mappings[bdf].intremap_table;
+                        ivrs_mappings[req_id].intremap_inuse =
+                            ivrs_mappings[bdf].intremap_inuse;
+                    }
+                }
             }
 
             amd_iommu_set_intremap_table(
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -711,33 +711,20 @@ int amd_iommu_msi_msg_update_ire(
 
     if ( msi_desc->remap_index >= 0 && !msg )
     {
-        do {
-            update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                               &msi_desc->remap_index,
-                                               NULL, NULL);
-            if ( !pdev || !pdev->phantom_stride )
-                break;
-            bdf += pdev->phantom_stride;
-        } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
+        update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                           &msi_desc->remap_index,
+                                           NULL, NULL);
 
         for ( i = 0; i < nr; ++i )
             msi_desc[i].remap_index = -1;
-        if ( pdev )
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
     }
 
     if ( !msg )
         return 0;
 
-    do {
-        rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                                &msi_desc->remap_index,
-                                                msg, &data);
-        if ( rc || !pdev || !pdev->phantom_stride )
-            break;
-        bdf += pdev->phantom_stride;
-    } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-
+    rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                            &msi_desc->remap_index,
+                                            msg, &data);
     if ( !rc )
     {
         for ( i = 1; i < nr; ++i )
@@ -790,12 +777,27 @@ void amd_iommu_read_msi_from_ire(
 }
 
 int amd_iommu_free_intremap_table(
-    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
+    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping,
+    uint16_t bdf)
 {
     void **tblp;
 
     if ( ivrs_mapping )
     {
+        unsigned int i;
+
+        /*
+         * PCI device phantom functions use the same tables as their "base"
+         * function: Look ahead to zap the pointers.
+         */
+        for ( i = 1; PCI_FUNC(bdf + i) && bdf + i < ivrs_bdf_entries; ++i )
+            if ( ivrs_mapping[i].intremap_table ==
+                 ivrs_mapping->intremap_table )
+            {
+                ivrs_mapping[i].intremap_table = NULL;
+                ivrs_mapping[i].intremap_inuse = NULL;
+            }
+
         XFREE(ivrs_mapping->intremap_inuse);
         tblp = &ivrs_mapping->intremap_table;
     }
@@ -934,7 +936,8 @@ static void dump_intremap_table(const st
 }
 
 static int dump_intremap_mapping(const struct amd_iommu *iommu,
-                                 struct ivrs_mappings *ivrs_mapping)
+                                 struct ivrs_mappings *ivrs_mapping,
+                                 uint16_t unused)
 {
     unsigned long flags;
 
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -516,7 +516,7 @@ static int amd_iommu_remove_device(u8 de
     if ( amd_iommu_perdev_intremap &&
          ivrs_mappings[bdf].dte_requestor_id == bdf &&
          ivrs_mappings[bdf].intremap_table )
-        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf], bdf);
 
     return 0;
 }
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -131,7 +131,7 @@ extern u8 ivhd_type;
 struct ivrs_mappings *get_ivrs_mappings(u16 seg);
 int iterate_ivrs_mappings(int (*)(u16 seg, struct ivrs_mappings *));
 int iterate_ivrs_entries(int (*)(const struct amd_iommu *,
-                                 struct ivrs_mappings *));
+                                 struct ivrs_mappings *, uint16_t));
 
 /* iommu tables in guest space */
 struct mmio_reg {
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -101,7 +101,7 @@ int amd_iommu_setup_ioapic_remapping(voi
 void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *, unsigned long **);
 int amd_iommu_free_intremap_table(
-    const struct amd_iommu *, struct ivrs_mappings *);
+    const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
 void amd_iommu_ioapic_update_ire(
     unsigned int apic, unsigned int reg, unsigned int value);
 unsigned int amd_iommu_read_ioapic_from_ire(
x86/PCI: read MSI-X table entry count early

Rather than doing this every time we set up interrupts for a device
anew (and then in two distinct places) fill this invariant field
right after allocating struct arch_msix.

While at it also obtain the MSI-X capability structure position just
once, in msix_capability_init(), rather than in each caller.

Furthermore take the opportunity and eliminate the multi_msix_capable()
alias of msix_table_size().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -821,10 +821,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
-                                unsigned int pos,
                                 struct msi_info *msi,
-                                struct msi_desc **desc,
-                                unsigned int nr_entries)
+                                struct msi_desc **desc)
 {
     struct arch_msix *msix = dev->msix;
     struct msi_desc *entry = NULL;
@@ -838,6 +836,11 @@ static int msix_capability_init(struct p
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
     bool maskall = msix->host_maskall;
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
 
     ASSERT(pcidevs_locked());
 
@@ -912,10 +915,9 @@ static int msix_capability_init(struct p
         u64 pba_paddr;
         u32 pba_offset;
 
-        msix->nr_entries = nr_entries;
         msix->table.first = PFN_DOWN(table_paddr);
         msix->table.last = PFN_DOWN(table_paddr +
-                                    nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+                                    msix->nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first,
                                         msix->table.last));
 
@@ -927,7 +929,7 @@ static int msix_capability_init(struct p
 
         msix->pba.first = PFN_DOWN(pba_paddr);
         msix->pba.last = PFN_DOWN(pba_paddr +
-                                  BITS_TO_LONGS(nr_entries) - 1);
+                                  BITS_TO_LONGS(msix->nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
                                         msix->pba.last));
     }
@@ -999,7 +1001,6 @@ static int msix_capability_init(struct p
             /* XXX How to deal with existing mappings? */
         }
     }
-    WARN_ON(msix->nr_entries != nr_entries);
     WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
     ++msix->used_entries;
 
@@ -1093,22 +1094,17 @@ static void __pci_disable_msi(struct msi
  **/
 static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
 {
-    int pos, nr_entries;
     struct pci_dev *pdev;
-    u16 control;
     u8 slot = PCI_SLOT(msi->devfn);
     u8 func = PCI_FUNC(msi->devfn);
     struct msi_desc *old_desc;
 
     ASSERT(pcidevs_locked());
     pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
-    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
-    if ( !pdev || !pos )
+    if ( !pdev || !pdev->msix )
         return -ENODEV;
 
-    control = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
-    nr_entries = multi_msix_capable(control);
-    if ( msi->entry_nr >= nr_entries )
+    if ( msi->entry_nr >= pdev->msix->nr_entries )
         return -EINVAL;
 
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
@@ -1127,7 +1123,7 @@ static int __pci_enable_msix(struct msi_
         __pci_disable_msi(old_desc);
     }
 
-    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
+    return msix_capability_init(pdev, msi, desc);
 }
 
 static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1187,16 +1183,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
 {
     int rc;
     struct pci_dev *pdev;
-    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
-    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
-                                           PCI_CAP_ID_MSIX);
 
     if ( !use_msi )
         return 0;
 
-    if ( !pos )
-        return -ENODEV;
-
     pcidevs_lock();
     pdev = pci_get_pdev(seg, bus, devfn);
     if ( !pdev )
@@ -1209,13 +1199,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
         rc = 0;
     }
     else
-    {
-        uint16_t control = pci_conf_read16(PCI_SBDF3(seg, bus, devfn),
-                                           msix_control_reg(pos));
-
-        rc = msix_capability_init(pdev, pos, NULL, NULL,
-                                  multi_msix_capable(control));
-    }
+        rc = msix_capability_init(pdev, NULL, NULL);
     pcidevs_unlock();
 
     return rc;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
 static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
+    unsigned int pos;
 
     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
         if ( pdev->bus == bus && pdev->devfn == devfn )
@@ -339,10 +340,12 @@ static struct pci_dev *alloc_pdev(struct
     pdev->domain = NULL;
     INIT_LIST_HEAD(&pdev->msi_list);
 
-    if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-                             PCI_CAP_ID_MSIX) )
+    pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                              PCI_CAP_ID_MSIX);
+    if ( pos )
     {
         struct arch_msix *msix = xzalloc(struct arch_msix);
+        uint16_t ctrl;
 
         if ( !msix )
         {
@@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
             return NULL;
         }
         spin_lock_init(&msix->table_lock);
+
+        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
+        msix->nr_entries = msix_table_size(ctrl);
+
         pdev->msix = msix;
     }
 
@@ -358,7 +365,6 @@ static struct pci_dev *alloc_pdev(struct
     /* update bus2bridge */
     switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
     {
-        int pos;
         u16 cap;
         u8 sec_bus, sub_bus;
 
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -171,7 +171,6 @@ int msi_free_irq(struct msi_desc *entry)
 #define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
 #define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
 #define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define multi_msix_capable		msix_table_size
 #define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
 #define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
 
AMD/IOMMU: replace INTREMAP_ENTRIES

Prepare for the number of entries to not be the maximum possible, by
separating checks against maximum size from ones against actual size.
For caller side simplicity have alloc_intremap_entry() return the
maximum possible value upon allocation failure, rather than the first
just out-of-bounds one.

Have the involved functions take all the subsequently needed
arguments here already, to reduce code churn in the patch actually
making the allocation size dynamic.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,7 @@ union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
@@ -83,8 +83,20 @@ static void dump_intremap_tables(unsigne
 static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
 {
     return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte32));
+           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
+           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
+}
+
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return IOMMU_INTREMAP_ORDER;
+}
+
+static unsigned int intremap_table_entries(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return 1u << amd_iommu_intremap_table_order(irt, iommu);
 }
 
 unsigned int ioapic_id_to_index(unsigned int apic_id)
@@ -122,20 +134,24 @@ static int get_intremap_requestor_id(int
     return get_ivrs_mappings(seg)[bdf].dte_requestor_id;
 }
 
-static unsigned int alloc_intremap_entry(int seg, int bdf, unsigned int nr)
+static unsigned int alloc_intremap_entry(const struct amd_iommu *iommu,
+                                         unsigned int bdf, unsigned int nr)
 {
-    unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse;
-    unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES);
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
+    unsigned long *inuse = ivrs_mappings[bdf].intremap_inuse;
+    unsigned int nr_ents =
+        intremap_table_entries(ivrs_mappings[bdf].intremap_table, iommu);
+    unsigned int slot = find_first_zero_bit(inuse, nr_ents);
 
     for ( ; ; )
     {
         unsigned int end;
 
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
             break;
-        end = find_next_bit(inuse, INTREMAP_ENTRIES, slot + 1);
-        if ( end > INTREMAP_ENTRIES )
-            end = INTREMAP_ENTRIES;
+        end = find_next_bit(inuse, nr_ents, slot + 1);
+        if ( end > nr_ents )
+            end = nr_ents;
         slot = (slot + nr - 1) & ~(nr - 1);
         if ( slot + nr <= end )
         {
@@ -144,12 +160,12 @@ static unsigned int alloc_intremap_entry
             break;
         }
         slot = (end + nr) & ~(nr - 1);
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
             break;
-        slot = find_next_zero_bit(inuse, INTREMAP_ENTRIES, slot);
+        slot = find_next_zero_bit(inuse, nr_ents, slot);
     }
 
-    return slot;
+    return slot < nr_ents ? slot : INTREMAP_MAX_ENTRIES;
 }
 
 static union irte_ptr get_intremap_entry(const struct amd_iommu *iommu,
@@ -159,7 +175,7 @@ static union irte_ptr get_intremap_entry
         .ptr = get_ivrs_mappings(iommu->seg)[bdf].intremap_table
     };
 
-    ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
+    ASSERT(table.ptr && (index < intremap_table_entries(table.ptr, iommu)));
 
     if ( iommu->ctrl.ga_en )
         table.ptr128 += index;
@@ -279,10 +295,10 @@ static int update_intremap_entry_from_io
     spin_lock_irqsave(lock, flags);
 
     offset = *index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
     {
-        offset = alloc_intremap_entry(iommu->seg, req_id, 1);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, req_id, 1);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
         {
             spin_unlock_irqrestore(lock, flags);
             rte->mask = 1;
@@ -400,8 +416,8 @@ int __init amd_iommu_setup_ioapic_remapp
             }
 
             spin_lock_irqsave(lock, flags);
-            offset = alloc_intremap_entry(seg, req_id, 1);
-            BUG_ON(offset >= INTREMAP_ENTRIES);
+            offset = alloc_intremap_entry(iommu, req_id, 1);
+            BUG_ON(offset >= INTREMAP_MAX_ENTRIES);
             entry = get_intremap_entry(iommu, req_id, offset);
             update_intremap_entry(iommu, entry, vector,
                                   delivery_mode, dest_mode, dest);
@@ -476,7 +492,7 @@ void amd_iommu_ioapic_update_ire(
         *(((u32 *)&new_rte) + 1) = value;
     }
 
-    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
+    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_MAX_ENTRIES )
     {
         ASSERT(saved_mask);
 
@@ -548,7 +564,7 @@ unsigned int amd_iommu_read_ioapic_from_
         return val;
 
     offset = ioapic_sbdf[idx].pin_2_idx[pin];
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
         return val;
 
     seg = ioapic_sbdf[idx].seg;
@@ -561,8 +577,8 @@ unsigned int amd_iommu_read_ioapic_from_
 
     if ( !(reg & 1) )
     {
-        ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
-        val &= ~(INTREMAP_ENTRIES - 1);
+        ASSERT(offset == (val & (INTREMAP_MAX_ENTRIES - 1)));
+        val &= ~(INTREMAP_MAX_ENTRIES - 1);
         /* The IntType fields match for both formats. */
         val |= MASK_INSR(entry.ptr32->flds.int_type,
                          IO_APIC_REDIR_DELIV_MODE_MASK);
@@ -622,11 +638,11 @@ static int update_intremap_entry_from_ms
         dest = MASK_EXTR(msg->address_lo, MSI_ADDR_DEST_ID_MASK);
 
     offset = *remap_index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
     {
         ASSERT(nr);
-        offset = alloc_intremap_entry(iommu->seg, bdf, nr);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, bdf, nr);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
         {
             spin_unlock_irqrestore(lock, flags);
             return -ENOSPC;
@@ -654,7 +670,7 @@ static int update_intremap_entry_from_ms
     update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
     spin_unlock_irqrestore(lock, flags);
 
-    *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;
+    *data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset;
 
     /*
      * In some special cases, a pci-e device(e.g SATA controller in IDE mode)
@@ -738,7 +754,7 @@ int amd_iommu_msi_msg_update_ire(
 void amd_iommu_read_msi_from_ire(
     struct msi_desc *msi_desc, struct msi_msg *msg)
 {
-    unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1);
+    unsigned int offset = msg->data & (INTREMAP_MAX_ENTRIES - 1);
     const struct pci_dev *pdev = msi_desc->dev;
     u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf;
     u16 seg = pdev ? pdev->seg : hpet_sbdf.seg;
@@ -758,7 +774,7 @@ void amd_iommu_read_msi_from_ire(
         offset |= nr;
     }
 
-    msg->data &= ~(INTREMAP_ENTRIES - 1);
+    msg->data &= ~(INTREMAP_MAX_ENTRIES - 1);
     /* The IntType fields match for both formats. */
     msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
                            MSI_DATA_DELIVERY_MODE_MASK);
@@ -824,8 +840,9 @@ void *amd_iommu_alloc_intremap_table(
 
     if ( tb )
     {
-        *inuse_map = xzalloc_array(unsigned long,
-                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        unsigned int nr = intremap_table_entries(tb, iommu);
+
+        *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
         if ( *inuse_map )
             memset(tb, 0, PAGE_SIZE << order);
         else
@@ -869,6 +886,7 @@ bool __init iov_supports_xt(void)
 
 int __init amd_setup_hpet_msi(struct msi_desc *msi_desc)
 {
+    const struct amd_iommu *iommu;
     spinlock_t *lock;
     unsigned long flags;
     int rc = 0;
@@ -886,12 +904,15 @@ int __init amd_setup_hpet_msi(struct msi
         return -ENODEV;
     }
 
+    iommu = find_iommu_for_device(hpet_sbdf.seg, hpet_sbdf.bdf);
+    if ( !iommu )
+        return -ENXIO;
+
     lock = get_intremap_lock(hpet_sbdf.seg, hpet_sbdf.bdf);
     spin_lock_irqsave(lock, flags);
 
-    msi_desc->remap_index = alloc_intremap_entry(hpet_sbdf.seg,
-                                                 hpet_sbdf.bdf, 1);
-    if ( msi_desc->remap_index >= INTREMAP_ENTRIES )
+    msi_desc->remap_index = alloc_intremap_entry(iommu, hpet_sbdf.bdf, 1);
+    if ( msi_desc->remap_index >= INTREMAP_MAX_ENTRIES )
     {
         msi_desc->remap_index = -1;
         rc = -ENXIO;
@@ -906,12 +927,12 @@ static void dump_intremap_table(const st
                                 union irte_cptr tbl,
                                 const struct ivrs_mappings *ivrs_mapping)
 {
-    unsigned int count;
+    unsigned int count, nr = intremap_table_entries(tbl.ptr, iommu);
 
     if ( !tbl.ptr )
         return;
 
-    for ( count = 0; count < INTREMAP_ENTRIES; count++ )
+    for ( count = 0; count < nr; count++ )
     {
         if ( iommu->ctrl.ga_en
              ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -102,6 +102,8 @@ void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *, unsigned long **);
 int amd_iommu_free_intremap_table(
     const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu);
 void amd_iommu_ioapic_update_ire(
     unsigned int apic, unsigned int reg, unsigned int value);
 unsigned int amd_iommu_read_ioapic_from_ire(
AMD/IOMMU: restrict interrupt remapping table sizes

There's no point setting up tables with more space than a PCI device can
use. For both MSI and MSI-X we can determine how many interrupts could
be set up at most. Tables allocated during ACPI table parsing, however,
will (for now at least) continue to be set up to have maximum size.

Note that, until we would want to use sub-page allocations here, there's
no point checking whether MSI is supported by a device: 1, or up to 32
(or actually 128, due to the change effectively using a reserved
encoding), IRTEs always mean an order-0 allocation anyway.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -77,7 +77,7 @@ static void __init add_ivrs_mapping_entr
         {
             if ( !shared_intremap_table )
                 shared_intremap_table = amd_iommu_alloc_intremap_table(
-                    iommu, &shared_intremap_inuse);
+                    iommu, &shared_intremap_inuse, 0);
 
             if ( !shared_intremap_table )
                 panic("No memory for shared IRT\n");
@@ -89,7 +89,7 @@ static void __init add_ivrs_mapping_entr
         {
             ivrs_mappings[alias_id].intremap_table =
                 amd_iommu_alloc_intremap_table(
-                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse, 0);
 
             if ( !ivrs_mappings[alias_id].intremap_table )
                 panic("No memory for %04x:%02x:%02x.%u's IRT\n",
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1292,7 +1292,9 @@ static int __init amd_iommu_setup_device
                 ivrs_mappings[bdf].intremap_table =
                     amd_iommu_alloc_intremap_table(
                         ivrs_mappings[bdf].iommu,
-                        &ivrs_mappings[bdf].intremap_inuse);
+                        &ivrs_mappings[bdf].intremap_inuse,
+                        pdev->msix ? pdev->msix->nr_entries
+                                   : multi_msi_capable(~0u));
                 if ( !ivrs_mappings[bdf].intremap_table )
                     return -ENOMEM;
 
@@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
             }
 
             amd_iommu_set_intremap_table(
-                dte,
-                ivrs_mappings[bdf].intremap_table
-                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-                : 0,
-                iommu_intremap);
+                dte, ivrs_mappings[bdf].intremap_table,
+                ivrs_mappings[bdf].iommu, iommu_intremap);
         }
     }
 
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,8 @@ union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ORDER   0xB
+#define INTREMAP_MAX_ENTRIES (1 << INTREMAP_MAX_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
@@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
 
 static void dump_intremap_tables(unsigned char key);
 
-static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
-{
-    return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
-}
+#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
 
 unsigned int amd_iommu_intremap_table_order(
     const void *irt, const struct amd_iommu *iommu)
 {
-    return IOMMU_INTREMAP_ORDER;
+    return intremap_page_order(irt) + PAGE_SHIFT -
+           (iommu->ctrl.ga_en ? 4 : 2);
 }
 
 static unsigned int intremap_table_entries(
@@ -825,7 +822,10 @@ int amd_iommu_free_intremap_table(
 
     if ( *tblp )
     {
-        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        unsigned int order = intremap_page_order(*tblp);
+
+        intremap_page_order(*tblp) = 0;
+        __free_amd_iommu_tables(*tblp, order);
         *tblp = NULL;
     }
 
@@ -833,15 +833,23 @@ int amd_iommu_free_intremap_table(
 }
 
 void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *iommu, unsigned long **inuse_map)
+    const struct amd_iommu *iommu, unsigned long **inuse_map, unsigned int nr)
 {
-    unsigned int order = intremap_table_order(iommu);
-    void *tb = __alloc_amd_iommu_tables(order);
+    unsigned int order;
+    void *tb;
 
+    if ( !nr )
+        nr = INTREMAP_MAX_ENTRIES;
+
+    order = iommu->ctrl.ga_en
+            ? get_order_from_bytes(nr * sizeof(union irte128))
+            : get_order_from_bytes(nr * sizeof(union irte32));
+
+    tb = __alloc_amd_iommu_tables(order);
     if ( tb )
     {
-        unsigned int nr = intremap_table_entries(tb, iommu);
-
+        intremap_page_order(tb) = order;
+        nr = intremap_table_entries(tb, iommu);
         *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
         if ( *inuse_map )
             memset(tb, 0, PAGE_SIZE << order);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -113,12 +113,22 @@ void amd_iommu_set_root_page_table(struc
 }
 
 void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
+    struct amd_iommu_dte *dte, const void *ptr,
+    const struct amd_iommu *iommu, bool valid)
 {
-    dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
-    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
-                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    if ( ptr )
+    {
+        dte->it_root = virt_to_maddr(ptr) >> 6;
+        dte->int_tab_len = amd_iommu_intremap_table_order(ptr, iommu);
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    }
+    else
+    {
+        dte->it_root = 0;
+        dte->int_tab_len = 0;
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    }
+
     dte->ig = false; /* unmapped interrupts result in i/o page faults */
     dte->iv = valid;
 }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -471,16 +471,15 @@ static int amd_iommu_add_device(u8 devfn
     {
         ivrs_mappings[bdf].intremap_table =
             amd_iommu_alloc_intremap_table(
-                iommu, &ivrs_mappings[bdf].intremap_inuse);
+                iommu, &ivrs_mappings[bdf].intremap_inuse,
+                pdev->msix ? pdev->msix->nr_entries
+                           : multi_msi_capable(~0u));
         if ( !ivrs_mappings[bdf].intremap_table )
             return -ENOMEM;
 
         amd_iommu_set_intremap_table(
             iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
-            ivrs_mappings[bdf].intremap_table
-            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-            : 0,
-            iommu_intremap);
+            ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
 
         amd_iommu_flush_device(iommu, bdf);
     }
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,9 +107,6 @@
 #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
 #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
 
-/* For now, we always allocate the maximum: 2048 entries. */
-#define IOMMU_INTREMAP_ORDER			0xB
-
 struct amd_iommu_dte {
     /* 0 - 63 */
     bool v:1;
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -72,7 +72,8 @@ int __must_check amd_iommu_flush_iotlb_a
 /* device table functions */
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
 void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
-                                  uint64_t intremap_ptr,
+                                  const void *ptr,
+                                  const struct amd_iommu *iommu,
                                   bool valid);
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
 				   uint64_t root_ptr, uint16_t domain_id,
@@ -99,7 +100,7 @@ struct amd_iommu *find_iommu_for_device(
 bool iov_supports_xt(void);
 int amd_iommu_setup_ioapic_remapping(void);
 void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *, unsigned long **);
+    const struct amd_iommu *, unsigned long **, unsigned int nr);
 int amd_iommu_free_intremap_table(
     const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
 unsigned int amd_iommu_intremap_table_order(
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 00/10] AMD IOMMU: further improvements

Posted by Woods, Brian 17 weeks ago
On Tue, Aug 06, 2019 at 03:05:36PM +0200, Jan Beulich wrote:
> Only the first patch here is left from v4, everything else is new,
> yet still related. The main goal is to reduce the huge memory
> overhead that we've noticed. On the way there a number of other
> things were once again noticed. Unfortunately before I was able to
> also test the last two patches there, my Rome box broke again.
> Hence these two patches have been tested on a (less affected)
> Fam15 system only.
> 
> 01: miscellaneous DTE handling adjustments
> 02: drop stray "else"
> 03: don't free shared IRT multiple times
> 04: introduce a "valid" flag for IVRS mappings
> 05: let callers of amd_iommu_alloc_intremap_table() handle errors
> 06: don't blindly allocate interrupt remapping tables
> 07: make phantom functions share interrupt remapping tables
> 08: x86/PCI: read MSI-X table entry count early
> 09: replace INTREMAP_ENTRIES
> 10: restrict interrupt remapping table sizes
> 
> Full set of patches once again attached here due to still unresolved
> email issues over here.
> 
> Jan
> 

I don't think I have enough time here left to review these, but I've
tested them via PCI device passthrough on an AMD Rome system.

-- 
Brian Woods

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 01/10] AMD/IOMMU: miscellaneous DTE handling adjustments

Posted by Jan Beulich 18 weeks ago
First and foremost switch boolean fields to bool. Adjust a few related
function parameters as well. Then
- in amd_iommu_set_intremap_table() don't use literal numbers,
- in iommu_dte_add_device_entry() use a compound literal instead of many
   assignments,
- in amd_iommu_setup_domain_device()
   - eliminate a pointless local variable,
   - use || instead of && when deciding whether to clear an entry,
   - clear the I field without any checking of ATS / IOTLB state,
- leave reserved fields unnamed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v5: IOMMU_INTREMAP_LENGTH -> IOMMU_INTREMAP_ORDER. Adjust comment.
v4: New.

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,8 +69,7 @@ union irte_cptr {
      const union irte128 *ptr128;
  } __transparent__;
  
-#define INTREMAP_LENGTH 0xB
-#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
+#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
  
  struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
  struct hpet_sbdf hpet_sbdf;
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -101,51 +101,52 @@ static unsigned int set_iommu_pte_presen
  
  void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
                                     uint64_t root_ptr, uint16_t domain_id,
-                                   uint8_t paging_mode, uint8_t valid)
+                                   uint8_t paging_mode, bool valid)
  {
      dte->domain_id = domain_id;
      dte->pt_root = paddr_to_pfn(root_ptr);
-    dte->iw = 1;
-    dte->ir = 1;
+    dte->iw = true;
+    dte->ir = true;
      dte->paging_mode = paging_mode;
-    dte->tv = 1;
+    dte->tv = true;
      dte->v = valid;
  }
  
  void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, uint8_t int_valid)
+    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
  {
      dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = 0xb; /* 2048 entries */
-    dte->int_ctl = 2; /* fixed and arbitrated interrupts remapped */
-    dte->ig = 0; /* unmapped interrupt results io page faults */
-    dte->iv = int_valid;
+    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
+    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->ig = false; /* unmapped interrupts result in i/o page faults */
+    dte->iv = valid;
  }
  
  void __init iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                       struct ivrs_mappings *ivrs_dev)
+                                       const struct ivrs_mappings *ivrs_dev)
  {
      uint8_t flags = ivrs_dev->device_flags;
  
-    memset(dte, 0, sizeof(*dte));
-
-    dte->init_pass = MASK_EXTR(flags, ACPI_IVHD_INIT_PASS);
-    dte->ext_int_pass = MASK_EXTR(flags, ACPI_IVHD_EINT_PASS);
-    dte->nmi_pass = MASK_EXTR(flags, ACPI_IVHD_NMI_PASS);
-    dte->lint0_pass = MASK_EXTR(flags, ACPI_IVHD_LINT0_PASS);
-    dte->lint1_pass = MASK_EXTR(flags, ACPI_IVHD_LINT1_PASS);
-    dte->sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT);
-    dte->ex = ivrs_dev->dte_allow_exclusion;
+    *dte = (struct amd_iommu_dte){
+        .init_pass = flags & ACPI_IVHD_INIT_PASS,
+        .ext_int_pass = flags & ACPI_IVHD_EINT_PASS,
+        .nmi_pass = flags & ACPI_IVHD_NMI_PASS,
+        .lint0_pass = flags & ACPI_IVHD_LINT0_PASS,
+        .lint1_pass = flags & ACPI_IVHD_LINT1_PASS,
+        .ioctl = IOMMU_DEV_TABLE_IO_CONTROL_ABORTED,
+        .sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT),
+        .ex = ivrs_dev->dte_allow_exclusion,
+    };
  }
  
  void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx)
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx)
  {
  #define GCR3_MASK(hi, lo) (((1ul << ((hi) + 1)) - 1) & ~((1ul << (lo)) - 1))
  #define GCR3_SHIFT(lo) ((lo) - PAGE_SHIFT)
  
      /* I bit must be set when gcr3 is enabled */
-    dte->i = 1;
+    dte->i = true;
  
      dte->gcr3_trp_14_12 = (gcr3_mfn & GCR3_MASK(14, 12)) >> GCR3_SHIFT(12);
      dte->gcr3_trp_30_15 = (gcr3_mfn & GCR3_MASK(30, 15)) >> GCR3_SHIFT(15);
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -93,7 +93,6 @@ static void amd_iommu_setup_domain_devic
      struct amd_iommu_dte *table, *dte;
      unsigned long flags;
      int req_id, valid = 1;
-    int dte_i = 0;
      u8 bus = pdev->bus;
      const struct domain_iommu *hd = dom_iommu(domain);
  
@@ -103,9 +102,6 @@ static void amd_iommu_setup_domain_devic
      if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
          valid = 0;
  
-    if ( ats_enabled )
-        dte_i = 1;
-
      /* get device-table entry */
      req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
      table = iommu->dev_table.buffer;
@@ -122,7 +118,7 @@ static void amd_iommu_setup_domain_devic
  
          if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
               iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = dte_i;
+            dte->i = ats_enabled;
  
          amd_iommu_flush_device(iommu, req_id);
  
@@ -288,14 +284,11 @@ void amd_iommu_disable_domain_device(str
      dte = &table[req_id];
  
      spin_lock_irqsave(&iommu->lock, flags);
-    if ( dte->tv && dte->v )
+    if ( dte->tv || dte->v )
      {
-        dte->tv = 0;
-        dte->v = 0;
-
-        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
-             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = 0;
+        dte->tv = false;
+        dte->v = false;
+        dte->i = false;
  
          amd_iommu_flush_device(iommu, req_id);
  
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,57 +107,60 @@
  #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
  #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
  
+/* For now, we always allocate the maximum: 2048 entries. */
+#define IOMMU_INTREMAP_ORDER			0xB
+
  struct amd_iommu_dte {
      /* 0 - 63 */
-    uint64_t v:1;
-    uint64_t tv:1;
-    uint64_t reserved0:5;
-    uint64_t had:2;
-    uint64_t paging_mode:3;
+    bool v:1;
+    bool tv:1;
+    unsigned int :5;
+    unsigned int had:2;
+    unsigned int paging_mode:3;
      uint64_t pt_root:40;
-    uint64_t ppr:1;
-    uint64_t gprp:1;
-    uint64_t giov:1;
-    uint64_t gv:1;
-    uint64_t glx:2;
-    uint64_t gcr3_trp_14_12:3;
-    uint64_t ir:1;
-    uint64_t iw:1;
-    uint64_t reserved1:1;
+    bool ppr:1;
+    bool gprp:1;
+    bool giov:1;
+    bool gv:1;
+    unsigned int glx:2;
+    unsigned int gcr3_trp_14_12:3;
+    bool ir:1;
+    bool iw:1;
+    unsigned int :1;
  
      /* 64 - 127 */
-    uint64_t domain_id:16;
-    uint64_t gcr3_trp_30_15:16;
-    uint64_t i:1;
-    uint64_t se:1;
-    uint64_t sa:1;
-    uint64_t ioctl:2;
-    uint64_t cache:1;
-    uint64_t sd:1;
-    uint64_t ex:1;
-    uint64_t sys_mgt:2;
-    uint64_t reserved2:1;
-    uint64_t gcr3_trp_51_31:21;
+    unsigned int domain_id:16;
+    unsigned int gcr3_trp_30_15:16;
+    bool i:1;
+    bool se:1;
+    bool sa:1;
+    unsigned int ioctl:2;
+    bool cache:1;
+    bool sd:1;
+    bool ex:1;
+    unsigned int sys_mgt:2;
+    unsigned int :1;
+    unsigned int gcr3_trp_51_31:21;
  
      /* 128 - 191 */
-    uint64_t iv:1;
-    uint64_t int_tab_len:4;
-    uint64_t ig:1;
+    bool iv:1;
+    unsigned int int_tab_len:4;
+    bool ig:1;
      uint64_t it_root:46;
-    uint64_t reserved3:4;
-    uint64_t init_pass:1;
-    uint64_t ext_int_pass:1;
-    uint64_t nmi_pass:1;
-    uint64_t reserved4:1;
-    uint64_t int_ctl:2;
-    uint64_t lint0_pass:1;
-    uint64_t lint1_pass:1;
+    unsigned int :4;
+    bool init_pass:1;
+    bool ext_int_pass:1;
+    bool nmi_pass:1;
+    unsigned int :1;
+    unsigned int int_ctl:2;
+    bool lint0_pass:1;
+    bool lint1_pass:1;
  
      /* 192 - 255 */
-    uint64_t reserved5:54;
-    uint64_t attr_v:1;
-    uint64_t mode0_fc:1;
-    uint64_t snoop_attr:8;
+    uint64_t :54;
+    bool attr_v:1;
+    bool mode0_fc:1;
+    unsigned int snoop_attr:8;
  };
  
  /* Command Buffer */
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -73,14 +73,14 @@ int __must_check amd_iommu_flush_iotlb_a
  int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
  void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
                                    uint64_t intremap_ptr,
-                                  uint8_t int_valid);
+                                  bool valid);
  void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
  				   uint64_t root_ptr, uint16_t domain_id,
-				   uint8_t paging_mode, uint8_t valid);
+				   uint8_t paging_mode, bool valid);
  void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                struct ivrs_mappings *ivrs_dev);
+                                const struct ivrs_mappings *ivrs_dev);
  void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx);
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx);
  
  /* send cmd to iommu */
  void amd_iommu_flush_all_pages(struct domain *d);


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 01/10] AMD/IOMMU: miscellaneous DTE handling adjustments

Posted by Woods, Brian 18 weeks ago
On Tue, Aug 06, 2019 at 03:07:48PM +0200, Jan Beulich wrote:
> First and foremost switch boolean fields to bool. Adjust a few related
> function parameters as well. Then
> - in amd_iommu_set_intremap_table() don't use literal numbers,
> - in iommu_dte_add_device_entry() use a compound literal instead of many
>   assignments,
> - in amd_iommu_setup_domain_device()
>   - eliminate a pointless local variable,
>   - use || instead of && when deciding whether to clear an entry,
>   - clear the I field without any checking of ATS / IOTLB state,
> - leave reserved fields unnamed.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Ignore my ack on the old patch that was part of the other series (was
still catching up).

Acked-by: Brian Woods <brian.woods@amd.com>

> ---
> v5: IOMMU_INTREMAP_LENGTH -> IOMMU_INTREMAP_ORDER. Adjust comment.
> v4: New.
> 
> --- a/xen/drivers/passthrough/amd/iommu_intr.c
> +++ b/xen/drivers/passthrough/amd/iommu_intr.c
> @@ -69,8 +69,7 @@ union irte_cptr {
>      const union irte128 *ptr128;
>  } __transparent__;
> -#define INTREMAP_LENGTH 0xB
> -#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
> +#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
>  struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
>  struct hpet_sbdf hpet_sbdf;
> --- a/xen/drivers/passthrough/amd/iommu_map.c
> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> @@ -101,51 +101,52 @@ static unsigned int set_iommu_pte_presen
>  void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
>                                     uint64_t root_ptr, uint16_t domain_id,
> -                                   uint8_t paging_mode, uint8_t valid)
> +                                   uint8_t paging_mode, bool valid)
>  {
>      dte->domain_id = domain_id;
>      dte->pt_root = paddr_to_pfn(root_ptr);
> -    dte->iw = 1;
> -    dte->ir = 1;
> +    dte->iw = true;
> +    dte->ir = true;
>      dte->paging_mode = paging_mode;
> -    dte->tv = 1;
> +    dte->tv = true;
>      dte->v = valid;
>  }
>  void __init amd_iommu_set_intremap_table(
> -    struct amd_iommu_dte *dte, uint64_t intremap_ptr, uint8_t int_valid)
> +    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
>  {
>      dte->it_root = intremap_ptr >> 6;
> -    dte->int_tab_len = 0xb; /* 2048 entries */
> -    dte->int_ctl = 2; /* fixed and arbitrated interrupts remapped */
> -    dte->ig = 0; /* unmapped interrupt results io page faults */
> -    dte->iv = int_valid;
> +    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
> +    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
> +    dte->ig = false; /* unmapped interrupts result in i/o page faults */
> +    dte->iv = valid;
>  }
>  void __init iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
> -                                       struct ivrs_mappings *ivrs_dev)
> +                                       const struct ivrs_mappings *ivrs_dev)
>  {
>      uint8_t flags = ivrs_dev->device_flags;
> -    memset(dte, 0, sizeof(*dte));
> -
> -    dte->init_pass = MASK_EXTR(flags, ACPI_IVHD_INIT_PASS);
> -    dte->ext_int_pass = MASK_EXTR(flags, ACPI_IVHD_EINT_PASS);
> -    dte->nmi_pass = MASK_EXTR(flags, ACPI_IVHD_NMI_PASS);
> -    dte->lint0_pass = MASK_EXTR(flags, ACPI_IVHD_LINT0_PASS);
> -    dte->lint1_pass = MASK_EXTR(flags, ACPI_IVHD_LINT1_PASS);
> -    dte->sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT);
> -    dte->ex = ivrs_dev->dte_allow_exclusion;
> +    *dte = (struct amd_iommu_dte){
> +        .init_pass = flags & ACPI_IVHD_INIT_PASS,
> +        .ext_int_pass = flags & ACPI_IVHD_EINT_PASS,
> +        .nmi_pass = flags & ACPI_IVHD_NMI_PASS,
> +        .lint0_pass = flags & ACPI_IVHD_LINT0_PASS,
> +        .lint1_pass = flags & ACPI_IVHD_LINT1_PASS,
> +        .ioctl = IOMMU_DEV_TABLE_IO_CONTROL_ABORTED,
> +        .sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT),
> +        .ex = ivrs_dev->dte_allow_exclusion,
> +    };
>  }
>  void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
> -                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx)
> +                             uint64_t gcr3_mfn, bool gv, uint8_t glx)
>  {
>  #define GCR3_MASK(hi, lo) (((1ul << ((hi) + 1)) - 1) & ~((1ul << (lo)) - 1))
>  #define GCR3_SHIFT(lo) ((lo) - PAGE_SHIFT)
>      /* I bit must be set when gcr3 is enabled */
> -    dte->i = 1;
> +    dte->i = true;
>      dte->gcr3_trp_14_12 = (gcr3_mfn & GCR3_MASK(14, 12)) >> GCR3_SHIFT(12);
>      dte->gcr3_trp_30_15 = (gcr3_mfn & GCR3_MASK(30, 15)) >> GCR3_SHIFT(15);
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -93,7 +93,6 @@ static void amd_iommu_setup_domain_devic
>      struct amd_iommu_dte *table, *dte;
>      unsigned long flags;
>      int req_id, valid = 1;
> -    int dte_i = 0;
>      u8 bus = pdev->bus;
>      const struct domain_iommu *hd = dom_iommu(domain);
> @@ -103,9 +102,6 @@ static void amd_iommu_setup_domain_devic
>      if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
>          valid = 0;
> -    if ( ats_enabled )
> -        dte_i = 1;
> -
>      /* get device-table entry */
>      req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
>      table = iommu->dev_table.buffer;
> @@ -122,7 +118,7 @@ static void amd_iommu_setup_domain_devic
>          if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
>               iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
> -            dte->i = dte_i;
> +            dte->i = ats_enabled;
>          amd_iommu_flush_device(iommu, req_id);
> @@ -288,14 +284,11 @@ void amd_iommu_disable_domain_device(str
>      dte = &table[req_id];
>      spin_lock_irqsave(&iommu->lock, flags);
> -    if ( dte->tv && dte->v )
> +    if ( dte->tv || dte->v )
>      {
> -        dte->tv = 0;
> -        dte->v = 0;
> -
> -        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
> -             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
> -            dte->i = 0;
> +        dte->tv = false;
> +        dte->v = false;
> +        dte->i = false;
>          amd_iommu_flush_device(iommu, req_id);
> --- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
> +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
> @@ -107,57 +107,60 @@
>  #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
>  #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
> +/* For now, we always allocate the maximum: 2048 entries. */
> +#define IOMMU_INTREMAP_ORDER			0xB
> +
>  struct amd_iommu_dte {
>      /* 0 - 63 */
> -    uint64_t v:1;
> -    uint64_t tv:1;
> -    uint64_t reserved0:5;
> -    uint64_t had:2;
> -    uint64_t paging_mode:3;
> +    bool v:1;
> +    bool tv:1;
> +    unsigned int :5;
> +    unsigned int had:2;
> +    unsigned int paging_mode:3;
>      uint64_t pt_root:40;
> -    uint64_t ppr:1;
> -    uint64_t gprp:1;
> -    uint64_t giov:1;
> -    uint64_t gv:1;
> -    uint64_t glx:2;
> -    uint64_t gcr3_trp_14_12:3;
> -    uint64_t ir:1;
> -    uint64_t iw:1;
> -    uint64_t reserved1:1;
> +    bool ppr:1;
> +    bool gprp:1;
> +    bool giov:1;
> +    bool gv:1;
> +    unsigned int glx:2;
> +    unsigned int gcr3_trp_14_12:3;
> +    bool ir:1;
> +    bool iw:1;
> +    unsigned int :1;
>      /* 64 - 127 */
> -    uint64_t domain_id:16;
> -    uint64_t gcr3_trp_30_15:16;
> -    uint64_t i:1;
> -    uint64_t se:1;
> -    uint64_t sa:1;
> -    uint64_t ioctl:2;
> -    uint64_t cache:1;
> -    uint64_t sd:1;
> -    uint64_t ex:1;
> -    uint64_t sys_mgt:2;
> -    uint64_t reserved2:1;
> -    uint64_t gcr3_trp_51_31:21;
> +    unsigned int domain_id:16;
> +    unsigned int gcr3_trp_30_15:16;
> +    bool i:1;
> +    bool se:1;
> +    bool sa:1;
> +    unsigned int ioctl:2;
> +    bool cache:1;
> +    bool sd:1;
> +    bool ex:1;
> +    unsigned int sys_mgt:2;
> +    unsigned int :1;
> +    unsigned int gcr3_trp_51_31:21;
>      /* 128 - 191 */
> -    uint64_t iv:1;
> -    uint64_t int_tab_len:4;
> -    uint64_t ig:1;
> +    bool iv:1;
> +    unsigned int int_tab_len:4;
> +    bool ig:1;
>      uint64_t it_root:46;
> -    uint64_t reserved3:4;
> -    uint64_t init_pass:1;
> -    uint64_t ext_int_pass:1;
> -    uint64_t nmi_pass:1;
> -    uint64_t reserved4:1;
> -    uint64_t int_ctl:2;
> -    uint64_t lint0_pass:1;
> -    uint64_t lint1_pass:1;
> +    unsigned int :4;
> +    bool init_pass:1;
> +    bool ext_int_pass:1;
> +    bool nmi_pass:1;
> +    unsigned int :1;
> +    unsigned int int_ctl:2;
> +    bool lint0_pass:1;
> +    bool lint1_pass:1;
>      /* 192 - 255 */
> -    uint64_t reserved5:54;
> -    uint64_t attr_v:1;
> -    uint64_t mode0_fc:1;
> -    uint64_t snoop_attr:8;
> +    uint64_t :54;
> +    bool attr_v:1;
> +    bool mode0_fc:1;
> +    unsigned int snoop_attr:8;
>  };
>  /* Command Buffer */
> --- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
> +++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
> @@ -73,14 +73,14 @@ int __must_check amd_iommu_flush_iotlb_a
>  int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
>  void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
>                                    uint64_t intremap_ptr,
> -                                  uint8_t int_valid);
> +                                  bool valid);
>  void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
>  				   uint64_t root_ptr, uint16_t domain_id,
> -				   uint8_t paging_mode, uint8_t valid);
> +				   uint8_t paging_mode, bool valid);
>  void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
> -                                struct ivrs_mappings *ivrs_dev);
> +                                const struct ivrs_mappings *ivrs_dev);
>  void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
> -                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx);
> +                             uint64_t gcr3_mfn, bool gv, uint8_t glx);
>  /* send cmd to iommu */
>  void amd_iommu_flush_all_pages(struct domain *d);
> 

-- 
Brian Woods

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 02/10] AMD/IOMMU: drop stray "else"

Posted by Jan Beulich 18 weeks ago
The blank line between it and the prior if() clearly indicates that this
was meant to be a standalone if().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -166,8 +166,8 @@ static int __init iov_detect(void)
      if ( !iommu_enable && !iommu_intremap )
          return 0;
  
-    else if ( (init_done ? amd_iommu_init_interrupt()
-                         : amd_iommu_init(false)) != 0 )
+    if ( (init_done ? amd_iommu_init_interrupt()
+                    : amd_iommu_init(false)) != 0 )
      {
          printk("AMD-Vi: Error initialization\n");
          return -ENODEV;


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 02/10] AMD/IOMMU: drop stray "else"

Posted by Woods, Brian 18 weeks ago
On Tue, Aug 06, 2019 at 03:08:11PM +0200, Jan Beulich wrote:
> The blank line between it and the prior if() clearly indicates that this
> was meant to be a standalone if().
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Brian Woods <brian.woods@amd.com>

> ---
> v5: New.
> 
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -166,8 +166,8 @@ static int __init iov_detect(void)
>      if ( !iommu_enable && !iommu_intremap )
>          return 0;
> -    else if ( (init_done ? amd_iommu_init_interrupt()
> -                         : amd_iommu_init(false)) != 0 )
> +    if ( (init_done ? amd_iommu_init_interrupt()
> +                    : amd_iommu_init(false)) != 0 )
>      {
>          printk("AMD-Vi: Error initialization\n");
>          return -ENODEV;
> 

-- 
Brian Woods

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 02/10] AMD/IOMMU: drop stray "else"

Posted by Andrew Cooper 18 weeks ago
On 06/08/2019 14:08, Jan Beulich wrote:
> The blank line between it and the prior if() clearly indicates that this
> was meant to be a standalone if().
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Looks like this was you in c/s 0e8e0a0854a

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 02/10] AMD/IOMMU: drop stray "else"

Posted by Jan Beulich 18 weeks ago
On 07.08.2019 11:48, Andrew Cooper wrote:
> On 06/08/2019 14:08, Jan Beulich wrote:
>> The blank line between it and the prior if() clearly indicates that this
>> was meant to be a standalone if().
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Looks like this was you in c/s 0e8e0a0854a

Right - quite embarrassing. I can only assume that there was a
plain if() paired with it earlier on, and I didn't notice that
the else should go away when dropping/moving it.

> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 03/10] AMD/IOMMU: don't free shared IRT multiple times

Posted by Jan Beulich 18 weeks ago
Calling amd_iommu_free_intremap_table() for every IVRS entry is correct
only in per-device-IRT mode. Use a NULL 2nd argument to indicate that
the shared table should be freed, and call the function exactly once in
shared mode.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1106,6 +1106,15 @@ static void __init amd_iommu_init_cleanu
  {
      struct amd_iommu *iommu, *next;
  
+    /* free interrupt remapping table */
+    if ( amd_iommu_perdev_intremap )
+        iterate_ivrs_entries(amd_iommu_free_intremap_table);
+    else if ( shared_intremap_table )
+        amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
+                                                       struct amd_iommu,
+                                                       list),
+                                      NULL);
+
      /* free amd iommu list */
      list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
      {
@@ -1128,9 +1137,6 @@ static void __init amd_iommu_init_cleanu
          xfree(iommu);
      }
  
-    /* free interrupt remapping table */
-    iterate_ivrs_entries(amd_iommu_free_intremap_table);
-
      /* free device table */
      deallocate_device_table(&device_table);
  
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -792,14 +792,23 @@ void amd_iommu_read_msi_from_ire(
  int __init amd_iommu_free_intremap_table(
      const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
  {
-    void *tb = ivrs_mapping->intremap_table;
+    void **tblp;
  
-    XFREE(ivrs_mapping->intremap_inuse);
+    if ( ivrs_mapping )
+    {
+        XFREE(ivrs_mapping->intremap_inuse);
+        tblp = &ivrs_mapping->intremap_table;
+    }
+    else
+    {
+        XFREE(shared_intremap_inuse);
+        tblp = &shared_intremap_table;
+    }
  
-    if ( tb )
+    if ( *tblp )
      {
-        __free_amd_iommu_tables(tb, intremap_table_order(iommu));
-        ivrs_mapping->intremap_table = NULL;
+        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        *tblp = NULL;
      }
  
      return 0;


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 03/10] AMD/IOMMU: don't free shared IRT multiple times

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:08, Jan Beulich wrote:
> Calling amd_iommu_free_intremap_table() for every IVRS entry is correct
> only in per-device-IRT mode. Use a NULL 2nd argument to indicate that
> the shared table should be freed, and call the function exactly once in
> shared mode.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 04/10] AMD/IOMMU: introduce a "valid" flag for IVRS mappings

Posted by Jan Beulich 18 weeks ago
For us to no longer blindly allocate interrupt remapping tables for
everything the ACPI tables name, we can't use struct ivrs_mappings'
intremap_table field anymore to also have the meaning of "this entry
is valid". Add a separate boolean field instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -88,6 +88,8 @@ static void __init add_ivrs_mapping_entr
           }
      }
  
+    ivrs_mappings[alias_id].valid = true;
+
      /* Assign IOMMU hardware. */
      ivrs_mappings[bdf].iommu = iommu;
  }
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1244,7 +1244,6 @@ static int __init amd_iommu_setup_device
      u16 seg, struct ivrs_mappings *ivrs_mappings)
  {
      unsigned int bdf;
-    void *intr_tb, *dte;
  
      BUG_ON( (ivrs_bdf_entries == 0) );
  
@@ -1264,16 +1263,17 @@ static int __init amd_iommu_setup_device
      /* Add device table entries */
      for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
      {
-        intr_tb = ivrs_mappings[bdf].intremap_table;
-
-        if ( intr_tb )
+        if ( ivrs_mappings[bdf].valid )
          {
+            void *dte;
+
              /* add device table entry */
              dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
              iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
  
              amd_iommu_set_intremap_table(
-                dte, (u64)virt_to_maddr(intr_tb), iommu_intremap);
+                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                iommu_intremap);
          }
      }
  
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -69,8 +69,8 @@ struct amd_iommu *find_iommu_for_device(
   * table and I/O page table respectively. Such devices will have
   * both alias entry and select entry in IVRS structure.
   *
- * Return original device id, if device has valid interrupt remapping
- * table setup for both select entry and alias entry.
+ * Return original device id if both the specific entry and the alias entry
+ * have been marked valid.
   */
  int get_dma_requestor_id(uint16_t seg, uint16_t bdf)
  {
@@ -79,8 +79,7 @@ int get_dma_requestor_id(uint16_t seg, u
  
      BUG_ON ( bdf >= ivrs_bdf_entries );
      req_id = ivrs_mappings[bdf].dte_requestor_id;
-    if ( (ivrs_mappings[bdf].intremap_table != NULL) &&
-         (ivrs_mappings[req_id].intremap_table != NULL) )
+    if ( ivrs_mappings[bdf].valid && ivrs_mappings[req_id].valid )
          req_id = bdf;
  
      return req_id;
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -111,6 +111,7 @@ struct ivrs_mappings {
      u8 unity_map_enable;
      u8 write_permission;
      u8 read_permission;
+    bool valid;
      unsigned long addr_range_start;
      unsigned long addr_range_length;
      struct amd_iommu *iommu;


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 04/10] AMD/IOMMU: introduce a "valid" flag for IVRS mappings

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:08, Jan Beulich wrote:
> For us to no longer blindly allocate interrupt remapping tables for
> everything the ACPI tables name, we can't use struct ivrs_mappings'
> intremap_table field anymore to also have the meaning of "this entry
> is valid". Add a separate boolean field instead.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 05/10] AMD/IOMMU: let callers of amd_iommu_alloc_intremap_table() handle errors

Posted by Jan Beulich 18 weeks ago
Additional users of the function will want to handle errors more
gracefully. Remove the BUG_ON()s and make the current caller panic()
instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -86,6 +86,10 @@ static void __init add_ivrs_mapping_entr
               ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
               ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
           }
+
+         if ( !ivrs_mappings[alias_id].intremap_table )
+             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
+                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
      }
  
      ivrs_mappings[alias_id].valid = true;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -817,12 +817,22 @@ int __init amd_iommu_free_intremap_table
  void *__init amd_iommu_alloc_intremap_table(
      const struct amd_iommu *iommu, unsigned long **inuse_map)
  {
-    void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+    unsigned int order = intremap_table_order(iommu);
+    void *tb = __alloc_amd_iommu_tables(order);
+
+    if ( tb )
+    {
+        *inuse_map = xzalloc_array(unsigned long,
+                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        if ( *inuse_map )
+            memset(tb, 0, PAGE_SIZE << order);
+        else
+        {
+            __free_amd_iommu_tables(tb, order);
+            tb = NULL;
+        }
+    }
  
-    BUG_ON(tb == NULL);
-    memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
-    *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
-    BUG_ON(*inuse_map == NULL);
      return tb;
  }
  


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 05/10] AMD/IOMMU: let callers of amd_iommu_alloc_intremap_table() handle errors

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:09, Jan Beulich wrote:
> Additional users of the function will want to handle errors more
> gracefully. Remove the BUG_ON()s and make the current caller panic()
> instead.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 06/10] AMD/IOMMU: don't blindly allocate interrupt remapping tables

Posted by Jan Beulich 18 weeks ago
ACPI tables are free to list far more device coordinates than there are
actual devices. By delaying the table allocations for most cases, and
doing them only when an actual device is known to be present at a given
position, overall memory used for the tables goes down from over 500k
pages to just over 1k ones.

Tables continue to get allocated right away for special entries
(IO-APIC, HPET) as well as for alias IDs. While in the former case
that's simply because there may not be any device at a given position,
in the latter case this is to avoid having to introduce ref-counting of
table usage.

The change involves invoking
iterate_ivrs_mappings(amd_iommu_setup_device_table) a second time,
because the function now wants to be able to find PCI devices, which
isn't possible yet when IOMMU setup happens very early during x2APIC
mode setup. In this context amd_iommu_init_interrupt() gets renamed as
well.

The logic adjusting a DTE's interrupt remapping attributes also gets
changed, such that the lack of an IRT would result in target aborted
rather than not remapped interrupts (should any occur).

Note that for now phantom functions get separate IRTs allocated, as was
the case before.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.
---
TBD: This retains prior (but suspicious) behavior of not calling
      amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
      Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
      changing.

Backporting note: This depends on b5fbe81196 "iommu / x86: move call to
scan_pci_devices() out of vendor code"!

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -53,7 +53,8 @@ union acpi_ivhd_device {
  };
  
  static void __init add_ivrs_mapping_entry(
-    u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu)
+    uint16_t bdf, uint16_t alias_id, uint8_t flags, bool alloc_irt,
+    struct amd_iommu *iommu)
  {
      struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
  
@@ -69,27 +70,32 @@ static void __init add_ivrs_mapping_entr
      if ( iommu->bdf == bdf )
          return;
  
-    if ( !ivrs_mappings[alias_id].intremap_table )
+    /* Allocate interrupt remapping table if needed. */
+    if ( iommu_intremap && !ivrs_mappings[alias_id].intremap_table )
      {
-         /* allocate per-device interrupt remapping table */
-         if ( amd_iommu_perdev_intremap )
-             ivrs_mappings[alias_id].intremap_table =
-                 amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &ivrs_mappings[alias_id].intremap_inuse);
-         else
-         {
-             if ( shared_intremap_table == NULL  )
-                 shared_intremap_table = amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &shared_intremap_inuse);
-             ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
-             ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
-         }
-
-         if ( !ivrs_mappings[alias_id].intremap_table )
-             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
-                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
+        if ( !amd_iommu_perdev_intremap )
+        {
+            if ( !shared_intremap_table )
+                shared_intremap_table = amd_iommu_alloc_intremap_table(
+                    iommu, &shared_intremap_inuse);
+
+            if ( !shared_intremap_table )
+                panic("No memory for shared IRT\n");
+
+            ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
+            ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
+        }
+        else if ( alloc_irt )
+        {
+            ivrs_mappings[alias_id].intremap_table =
+                amd_iommu_alloc_intremap_table(
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+
+            if ( !ivrs_mappings[alias_id].intremap_table )
+                panic("No memory for %04x:%02x:%02x.%u's IRT\n",
+                      iommu->seg, PCI_BUS(alias_id), PCI_SLOT(alias_id),
+                      PCI_FUNC(alias_id));
+        }
      }
  
      ivrs_mappings[alias_id].valid = true;
@@ -433,7 +439,8 @@ static u16 __init parse_ivhd_device_sele
          return 0;
      }
  
-    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, false,
+                           iommu);
  
      return sizeof(*select);
  }
@@ -479,7 +486,7 @@ static u16 __init parse_ivhd_device_rang
  
      for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
          add_ivrs_mapping_entry(bdf, bdf, range->start.header.data_setting,
-                               iommu);
+                               false, iommu);
  
      return dev_length;
  }
@@ -513,7 +520,8 @@ static u16 __init parse_ivhd_device_alia
  
      AMD_IOMMU_DEBUG(" Dev_Id Alias: %#x\n", alias_id);
  
-    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, true,
+                           iommu);
  
      return dev_length;
  }
@@ -568,7 +576,7 @@ static u16 __init parse_ivhd_device_alia
  
      for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
          add_ivrs_mapping_entry(bdf, alias_id, range->alias.header.data_setting,
-                               iommu);
+                               true, iommu);
  
      return dev_length;
  }
@@ -593,7 +601,7 @@ static u16 __init parse_ivhd_device_exte
          return 0;
      }
  
-    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, false, iommu);
  
      return dev_length;
  }
@@ -640,7 +648,7 @@ static u16 __init parse_ivhd_device_exte
  
      for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
          add_ivrs_mapping_entry(bdf, bdf, range->extended.header.data_setting,
-                               iommu);
+                               false, iommu);
  
      return dev_length;
  }
@@ -733,7 +741,8 @@ static u16 __init parse_ivhd_device_spec
      AMD_IOMMU_DEBUG("IVHD Special: %04x:%02x:%02x.%u variety %#x handle %#x\n",
                      seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf),
                      special->variety, special->handle);
-    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, true,
+                           iommu);
  
      switch ( special->variety )
      {
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -30,6 +30,7 @@
  #include <xen/delay.h>
  
  static int __initdata nr_amd_iommus;
+static bool __initdata pci_init;
  
  static void do_amd_iommu_irq(unsigned long data);
  static DECLARE_SOFTIRQ_TASKLET(amd_iommu_irq_tasklet, do_amd_iommu_irq, 0);
@@ -1247,17 +1248,20 @@ static int __init amd_iommu_setup_device
  
      BUG_ON( (ivrs_bdf_entries == 0) );
  
-    /* allocate 'device table' on a 4K boundary */
-    device_table.alloc_size = PAGE_SIZE <<
-                              get_order_from_bytes(
-                              PAGE_ALIGN(ivrs_bdf_entries *
-                              IOMMU_DEV_TABLE_ENTRY_SIZE));
-    device_table.entries = device_table.alloc_size /
-                           IOMMU_DEV_TABLE_ENTRY_SIZE;
-
-    device_table.buffer = allocate_buffer(device_table.alloc_size,
-                                          "Device Table");
-    if  ( device_table.buffer == NULL )
+    if ( !device_table.buffer )
+    {
+        /* allocate 'device table' on a 4K boundary */
+        device_table.alloc_size = PAGE_SIZE <<
+                                  get_order_from_bytes(
+                                  PAGE_ALIGN(ivrs_bdf_entries *
+                                  IOMMU_DEV_TABLE_ENTRY_SIZE));
+        device_table.entries = device_table.alloc_size /
+                               IOMMU_DEV_TABLE_ENTRY_SIZE;
+
+        device_table.buffer = allocate_buffer(device_table.alloc_size,
+                                              "Device Table");
+    }
+    if ( !device_table.buffer )
          return -ENOMEM;
  
      /* Add device table entries */
@@ -1266,13 +1270,46 @@ static int __init amd_iommu_setup_device
          if ( ivrs_mappings[bdf].valid )
          {
              void *dte;
+            const struct pci_dev *pdev = NULL;
  
              /* add device table entry */
              dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
              iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
  
+            if ( iommu_intremap &&
+                 ivrs_mappings[bdf].dte_requestor_id == bdf &&
+                 !ivrs_mappings[bdf].intremap_table )
+            {
+                if ( !pci_init )
+                    continue;
+                pcidevs_lock();
+                pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+                pcidevs_unlock();
+            }
+
+            if ( pdev )
+            {
+                unsigned int req_id = bdf;
+
+                do {
+                    ivrs_mappings[req_id].intremap_table =
+                        amd_iommu_alloc_intremap_table(
+                            ivrs_mappings[bdf].iommu,
+                            &ivrs_mappings[req_id].intremap_inuse);
+                    if ( !ivrs_mappings[req_id].intremap_table )
+                        return -ENOMEM;
+
+                    if ( !pdev->phantom_stride )
+                        break;
+                    req_id += pdev->phantom_stride;
+                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+            }
+
              amd_iommu_set_intremap_table(
-                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                dte,
+                ivrs_mappings[bdf].intremap_table
+                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+                : 0,
                  iommu_intremap);
          }
      }
@@ -1405,7 +1442,8 @@ int __init amd_iommu_init(bool xt)
      if ( rc )
          goto error_out;
  
-    /* allocate and initialize a global device table shared by all iommus */
+    /* Allocate and initialize device table(s). */
+    pci_init = !xt;
      rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
      if ( rc )
          goto error_out;
@@ -1424,7 +1462,7 @@ int __init amd_iommu_init(bool xt)
          /*
           * Setting up of the IOMMU interrupts cannot occur yet at the (very
           * early) time we get here when enabling x2APIC mode. Suppress it
-         * here, and do it explicitly in amd_iommu_init_interrupt().
+         * here, and do it explicitly in amd_iommu_init_late().
           */
          rc = amd_iommu_init_one(iommu, !xt);
          if ( rc )
@@ -1438,11 +1476,16 @@ error_out:
      return rc;
  }
  
-int __init amd_iommu_init_interrupt(void)
+int __init amd_iommu_init_late(void)
  {
      struct amd_iommu *iommu;
      int rc = 0;
  
+    /* Further initialize the device table(s). */
+    pci_init = true;
+    if ( iommu_intremap )
+        rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
+
      for_each_amd_iommu ( iommu )
      {
          struct irq_desc *desc;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -789,7 +789,7 @@ void amd_iommu_read_msi_from_ire(
      }
  }
  
-int __init amd_iommu_free_intremap_table(
+int amd_iommu_free_intremap_table(
      const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
  {
      void **tblp;
@@ -814,7 +814,7 @@ int __init amd_iommu_free_intremap_table
      return 0;
  }
  
-void *__init amd_iommu_alloc_intremap_table(
+void *amd_iommu_alloc_intremap_table(
      const struct amd_iommu *iommu, unsigned long **inuse_map)
  {
      unsigned int order = intremap_table_order(iommu);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -116,8 +116,9 @@ void __init amd_iommu_set_intremap_table
      struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
  {
      dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
-    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
+    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
+                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
      dte->ig = false; /* unmapped interrupts result in i/o page faults */
      dte->iv = valid;
  }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -165,7 +165,7 @@ static int __init iov_detect(void)
      if ( !iommu_enable && !iommu_intremap )
          return 0;
  
-    if ( (init_done ? amd_iommu_init_interrupt()
+    if ( (init_done ? amd_iommu_init_late()
                      : amd_iommu_init(false)) != 0 )
      {
          printk("AMD-Vi: Error initialization\n");
@@ -429,6 +429,7 @@ static int amd_iommu_add_device(u8 devfn
  {
      struct amd_iommu *iommu;
      u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
  
      if ( !pdev->domain )
          return -EINVAL;
@@ -458,6 +459,32 @@ static int amd_iommu_add_device(u8 devfn
          return -ENODEV;
      }
  
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( !ivrs_mappings ||
+         !ivrs_mappings[ivrs_mappings[bdf].dte_requestor_id].valid )
+        return -EPERM;
+
+    if ( iommu_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         !ivrs_mappings[bdf].intremap_table )
+    {
+        ivrs_mappings[bdf].intremap_table =
+            amd_iommu_alloc_intremap_table(
+                iommu, &ivrs_mappings[bdf].intremap_inuse);
+        if ( !ivrs_mappings[bdf].intremap_table )
+            return -ENOMEM;
+
+        amd_iommu_set_intremap_table(
+            iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
+            ivrs_mappings[bdf].intremap_table
+            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+            : 0,
+            iommu_intremap);
+
+        amd_iommu_flush_device(iommu, bdf);
+    }
+
      amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
      return 0;
  }
@@ -466,6 +493,8 @@ static int amd_iommu_remove_device(u8 de
  {
      struct amd_iommu *iommu;
      u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
+
      if ( !pdev->domain )
          return -EINVAL;
  
@@ -481,6 +510,14 @@ static int amd_iommu_remove_device(u8 de
      }
  
      amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( amd_iommu_perdev_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         ivrs_mappings[bdf].intremap_table )
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+
      return 0;
  }
  
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -50,7 +50,7 @@ void get_iommu_features(struct amd_iommu
  /* amd-iommu-init functions */
  int amd_iommu_prepare(bool xt);
  int amd_iommu_init(bool xt);
-int amd_iommu_init_interrupt(void);
+int amd_iommu_init_late(void);
  int amd_iommu_update_ivrs_mapping_acpi(void);
  int iov_adjust_irq_affinities(void);
  


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 06/10] AMD/IOMMU: don't blindly allocate interrupt remapping tables

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:09, Jan Beulich wrote:
> ACPI tables are free to list far more device coordinates than there are
> actual devices. By delaying the table allocations for most cases, and
> doing them only when an actual device is known to be present at a given
> position, overall memory used for the tables goes down from over 500k
> pages to just over 1k ones.

This is slightly awkward grammar.  While I don't think it is strictly
speaking incorrect, it would be more normal to phrase as "just over 1k
pages".

>
> Tables continue to get allocated right away for special entries
> (IO-APIC, HPET) as well as for alias IDs. While in the former case
> that's simply because there may not be any device at a given position,
> in the latter case this is to avoid having to introduce ref-counting of
> table usage.
>
> The change involves invoking
> iterate_ivrs_mappings(amd_iommu_setup_device_table) a second time,
> because the function now wants to be able to find PCI devices, which
> isn't possible yet when IOMMU setup happens very early during x2APIC
> mode setup. In this context amd_iommu_init_interrupt() gets renamed as
> well.
>
> The logic adjusting a DTE's interrupt remapping attributes also gets
> changed, such that the lack of an IRT would result in target aborted
> rather than not remapped interrupts (should any occur).
>
> Note that for now phantom functions get separate IRTs allocated, as was
> the case before.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v5: New.
> ---
> TBD: This retains prior (but suspicious) behavior of not calling
>      amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
>      Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
>      changing.

How would an invalid entry get DTE.V set in the first place?

Whatever the old behaviour may have been, we shouldn't have code which
comes with a chance of having non-remapped interrupts by accident.

>
> @@ -1266,13 +1270,46 @@ static int __init amd_iommu_setup_device
>          if ( ivrs_mappings[bdf].valid )
>          {
>              void *dte;
> +            const struct pci_dev *pdev = NULL;
>  
>              /* add device table entry */
>              dte = device_table.buffer + (bdf *
> IOMMU_DEV_TABLE_ENTRY_SIZE);
>              iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
>  
> +            if ( iommu_intremap &&
> +                 ivrs_mappings[bdf].dte_requestor_id == bdf &&
> +                 !ivrs_mappings[bdf].intremap_table )
> +            {
> +                if ( !pci_init )
> +                    continue;
> +                pcidevs_lock();
> +                pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
> +                pcidevs_unlock();
> +            }
> +
> +            if ( pdev )
> +            {
> +                unsigned int req_id = bdf;
> +
> +                do {
> +                    ivrs_mappings[req_id].intremap_table =
> +                        amd_iommu_alloc_intremap_table(
> +                            ivrs_mappings[bdf].iommu,
> +                            &ivrs_mappings[req_id].intremap_inuse);
> +                    if ( !ivrs_mappings[req_id].intremap_table )
> +                        return -ENOMEM;
> +
> +                    if ( !pdev->phantom_stride )
> +                        break;
> +                    req_id += pdev->phantom_stride;
> +                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
> +            }
> +
>              amd_iommu_set_intremap_table(
> -                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
> +                dte,
> +                ivrs_mappings[bdf].intremap_table
> +                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
> +                : 0,

Under what circumstances can ivrs_mappings[bdf].intremap_table be NULL,
given the ENOMEM above?

This case isn't very clear cut given the !pdev possibility, but...

> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -458,6 +459,32 @@ static int amd_iommu_add_device(u8 devfn
>          return -ENODEV;
>      }
>  
> +    ivrs_mappings = get_ivrs_mappings(pdev->seg);
> +    bdf = PCI_BDF2(pdev->bus, devfn);
> +    if ( !ivrs_mappings ||
> +         !ivrs_mappings[ivrs_mappings[bdf].dte_requestor_id].valid )
> +        return -EPERM;
> +
> +    if ( iommu_intremap &&
> +         ivrs_mappings[bdf].dte_requestor_id == bdf &&
> +         !ivrs_mappings[bdf].intremap_table )
> +    {
> +        ivrs_mappings[bdf].intremap_table =
> +            amd_iommu_alloc_intremap_table(
> +                iommu, &ivrs_mappings[bdf].intremap_inuse);
> +        if ( !ivrs_mappings[bdf].intremap_table )
> +            return -ENOMEM;
> +
> +        amd_iommu_set_intremap_table(
> +            iommu->dev_table.buffer + (bdf *
> IOMMU_DEV_TABLE_ENTRY_SIZE),
> +            ivrs_mappings[bdf].intremap_table
> +            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
> +            : 0,

... this case definitely cannot occur.

~Andrew

> +            iommu_intremap);
> +
> +        amd_iommu_flush_device(iommu, bdf);
> +    }
> +
>      amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
>      return 0;
>  }
>


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 06/10] AMD/IOMMU: don't blindly allocate interrupt remapping tables

Posted by Jan Beulich 12 weeks ago
On 17.09.2019 15:10, Andrew Cooper wrote:
> On 06/08/2019 14:09, Jan Beulich wrote:
>> ACPI tables are free to list far more device coordinates than there are
>> actual devices. By delaying the table allocations for most cases, and
>> doing them only when an actual device is known to be present at a given
>> position, overall memory used for the tables goes down from over 500k
>> pages to just over 1k ones.
> 
> This is slightly awkward grammar.  While I don't think it is strictly
> speaking incorrect, it would be more normal to phrase as "just over 1k
> pages".

Changed, albeit to me the double "pages" sounds odd as well. Would
"of them" be any better than "ones"?

>> ---
>> TBD: This retains prior (but suspicious) behavior of not calling
>>      amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
>>      Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
>>      changing.
> 
> How would an invalid entry get DTE.V set in the first place?

DTE.V gets set by amd_iommu_set_root_page_table(), which in turn gets
called from amd_iommu_setup_domain_device() alone. It's only the
latter function's callers which obtain (and possibly check) the
corresponding IVRS mappings entry. Hence to me there's a sufficient
disconnect between setting of DTE.V and DTE.IV.

Plus, looking at e.g. amd_iommu_add_device(), there's ample room for
not even making it to amd_iommu_set_intremap_table(), due to earlier
errors.

> Whatever the old behaviour may have been, we shouldn't have code which
> comes with a chance of having non-remapped interrupts by accident.

We can't make amd_iommu_set_root_page_table() set dte->iv to 1, as
it gets called only after amd_iommu_set_intremap_table() in
amd_iommu_add_device(). But we could of course make it do so when
it finds dte->it_root be zero. Yet I wonder if it wouldn't be safer
to have DTEs start out with the field set this way.

Along these lines I'm also not convinced it is a good idea for
amd_iommu_set_intremap_table() to have a "valid" parameter in the
first place: It's okay as long as all callers pass iommu_intremap,
but it would seem to me that - as already said - we'd want DTEs be
set this way right when a DT gets allocated. If you agree, I'll
happily add a patch doing so to the end of this series (there's
meanwhile a patch re-arranging DT allocation anyway).

>> @@ -1266,13 +1270,46 @@ static int __init amd_iommu_setup_device
>>          if ( ivrs_mappings[bdf].valid )
>>          {
>>              void *dte;
>> +            const struct pci_dev *pdev = NULL;
>>  
>>              /* add device table entry */
>>              dte = device_table.buffer + (bdf *
>> IOMMU_DEV_TABLE_ENTRY_SIZE);
>>              iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
>>  
>> +            if ( iommu_intremap &&
>> +                 ivrs_mappings[bdf].dte_requestor_id == bdf &&
>> +                 !ivrs_mappings[bdf].intremap_table )
>> +            {
>> +                if ( !pci_init )
>> +                    continue;
>> +                pcidevs_lock();
>> +                pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
>> +                pcidevs_unlock();
>> +            }
>> +
>> +            if ( pdev )
>> +            {
>> +                unsigned int req_id = bdf;
>> +
>> +                do {
>> +                    ivrs_mappings[req_id].intremap_table =
>> +                        amd_iommu_alloc_intremap_table(
>> +                            ivrs_mappings[bdf].iommu,
>> +                            &ivrs_mappings[req_id].intremap_inuse);
>> +                    if ( !ivrs_mappings[req_id].intremap_table )
>> +                        return -ENOMEM;
>> +
>> +                    if ( !pdev->phantom_stride )
>> +                        break;
>> +                    req_id += pdev->phantom_stride;
>> +                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
>> +            }
>> +
>>              amd_iommu_set_intremap_table(
>> -                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
>> +                dte,
>> +                ivrs_mappings[bdf].intremap_table
>> +                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
>> +                : 0,
> 
> Under what circumstances can ivrs_mappings[bdf].intremap_table be NULL,
> given the ENOMEM above?

The "ivrs_mappings[bdf].dte_requestor_id == bdf" part of the conditional
around the setting of pdev can result in the allocation (and hence
the ENOMEM error path) being bypassed.

> This case isn't very clear cut given the !pdev possibility, but...
> 
>> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
>> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
>> @@ -458,6 +459,32 @@ static int amd_iommu_add_device(u8 devfn
>>          return -ENODEV;
>>      }
>>  
>> +    ivrs_mappings = get_ivrs_mappings(pdev->seg);
>> +    bdf = PCI_BDF2(pdev->bus, devfn);
>> +    if ( !ivrs_mappings ||
>> +         !ivrs_mappings[ivrs_mappings[bdf].dte_requestor_id].valid )
>> +        return -EPERM;
>> +
>> +    if ( iommu_intremap &&
>> +         ivrs_mappings[bdf].dte_requestor_id == bdf &&
>> +         !ivrs_mappings[bdf].intremap_table )
>> +    {
>> +        ivrs_mappings[bdf].intremap_table =
>> +            amd_iommu_alloc_intremap_table(
>> +                iommu, &ivrs_mappings[bdf].intremap_inuse);
>> +        if ( !ivrs_mappings[bdf].intremap_table )
>> +            return -ENOMEM;
>> +
>> +        amd_iommu_set_intremap_table(
>> +            iommu->dev_table.buffer + (bdf *
>> IOMMU_DEV_TABLE_ENTRY_SIZE),
>> +            ivrs_mappings[bdf].intremap_table
>> +            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
>> +            : 0,
> 
> ... this case definitely cannot occur.

Oh, missing cleanup after copy-and-paste. Thanks for noticing.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 06/10] AMD/IOMMU: don't blindly allocate interrupt remapping tables

Posted by Andrew Cooper 12 weeks ago
On 18/09/2019 09:55, Jan Beulich wrote:
> On 17.09.2019 15:10, Andrew Cooper wrote:
>> On 06/08/2019 14:09, Jan Beulich wrote:
>>> ACPI tables are free to list far more device coordinates than there are
>>> actual devices. By delaying the table allocations for most cases, and
>>> doing them only when an actual device is known to be present at a given
>>> position, overall memory used for the tables goes down from over 500k
>>> pages to just over 1k ones.
>> This is slightly awkward grammar.  While I don't think it is strictly
>> speaking incorrect, it would be more normal to phrase as "just over 1k
>> pages".
> Changed, albeit to me the double "pages" sounds odd as well. Would
> "of them" be any better than "ones"?

You are drawing a comparison between 500k pages, and 1k pages.

Changing the 'pages' qualifier introduced ambiguity - consider the case
where "ones" is substituted for "bytes", which would be a legitimate way
to describe the memory reduction, but is misleading due to the
difference in units.

Alternatively, the trailing qualifier can be dropped, and the sentence
end at "1k.", as the "pages" is implied.

>
>>> ---
>>> TBD: This retains prior (but suspicious) behavior of not calling
>>>      amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
>>>      Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
>>>      changing.
>> How would an invalid entry get DTE.V set in the first place?
> DTE.V gets set by amd_iommu_set_root_page_table(), which in turn gets
> called from amd_iommu_setup_domain_device() alone. It's only the
> latter function's callers which obtain (and possibly check) the
> corresponding IVRS mappings entry. Hence to me there's a sufficient
> disconnect between setting of DTE.V and DTE.IV.
>
> Plus, looking at e.g. amd_iommu_add_device(), there's ample room for
> not even making it to amd_iommu_set_intremap_table(), due to earlier
> errors.
>
>> Whatever the old behaviour may have been, we shouldn't have code which
>> comes with a chance of having non-remapped interrupts by accident.
> We can't make amd_iommu_set_root_page_table() set dte->iv to 1, as
> it gets called only after amd_iommu_set_intremap_table() in
> amd_iommu_add_device(). But we could of course make it do so when
> it finds dte->it_root be zero. Yet I wonder if it wasn't more safe
> to have DTEs start out with the field set this way.
>
> Along these lines I'm also not convinced it is a good idea for
> amd_iommu_set_intremap_table() to have a "valid" parameter in the
> first place: It's okay as long as all callers pass iommu_intremap,
> but it would seem to me that - as already said - we'd want DTEs be
> set this way right when a DT gets allocated. If you agree, I'll
> happily add a patch doing so to the end of this series (there's
> meanwhile a patch re-arranging DT allocation anyway).

I've been looking through the spec, and this is rather complicated.  We
need to set V and TV to inhibit DMA, and IV and IntCtl to inhibit
interrupts.

Why not initialise every entry in the device table when we create it to
a safe, blocked state.  That way, the only way a device starts
functioning appropriately is via a successful call to 
amd_iommu_set_root_page_table() and amd_iommu_set_intremap_table(),
which seems to be far safer behaviour overall.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 06/10] AMD/IOMMU: don't blindly allocate interrupt remapping tables

Posted by Jan Beulich 12 weeks ago
On 18.09.2019 13:36, Andrew Cooper wrote:
> On 18/09/2019 09:55, Jan Beulich wrote:
>> On 17.09.2019 15:10, Andrew Cooper wrote:
>>> On 06/08/2019 14:09, Jan Beulich wrote:
>>>> TBD: This retains prior (but suspicious) behavior of not calling
>>>>      amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
>>>>      Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
>>>>      changing.
>>> How would an invalid entry get DTE.V set in the first place?
>> DTE.V gets set by amd_iommu_set_root_page_table(), which in turn gets
>> called from amd_iommu_setup_domain_device() alone. It's only the
>> latter function's callers which obtain (and possibly check) the
>> corresponding IVRS mappings entry. Hence to me there's a sufficient
>> disconnect between setting of DTE.V and DTE.IV.
>>
>> Plus, looking at e.g. amd_iommu_add_device(), there's ample room for
>> not even making it to amd_iommu_set_intremap_table(), due to earlier
>> errors.
>>
>>> Whatever the old behaviour may have been, we shouldn't have code which
>>> comes with a chance of having non-remapped interrupts by accident.
>> We can't make amd_iommu_set_root_page_table() set dte->iv to 1, as
>> it gets called only after amd_iommu_set_intremap_table() in
>> amd_iommu_add_device(). But we could of course make it do so when
>> it finds dte->it_root be zero. Yet I wonder if it wasn't more safe
>> to have DTEs start out with the field set this way.
>>
>> Along these lines I'm also not convinced it is a good idea for
>> amd_iommu_set_intremap_table() to have a "valid" parameter in the
>> first place: It's okay as long as all callers pass iommu_intremap,
>> but it would seem to me that - as already said - we'd want DTEs be
>> set this way right when a DT gets allocated. If you agree, I'll
>> happily add a patch doing so to the end of this series (there's
>> meanwhile a patch re-arranging DT allocation anyway).
> 
> I've been looking through the spec, and this is rather complicated.  We
> need to set V and TV to inhibit DMA, and IV and IntCtl to inhibit
> interrupts.

By "set V and TV", you don't mean setting both to 1, do you? My
reading of the respective tables is that we want V=1, TV=0, IV=1,
and IntCtl=0. The problem with setting V early is that some
other fields than also are assumed to be valid. I.e. along with
the above we'd also need SysMgt=0, EX=0, SD=?, Cache=?, IoCtl=0,
SA=?, SE=?, and I=0; question marks indicate either setting would
appear to be fine. In the end all zeros except V and IV would
look to be what we want, albeit setting one of SE or SA may be
worth considering.

> Why not initialise every entry in the device table when we create it to
> a safe, blocked state.  That way, the only way a device starts
> functioning appropriately is via a successful call to 
> amd_iommu_set_root_page_table() and amd_iommu_set_intremap_table(),
> which seems to be far safer behaviour overall.

I intend to add a respective patch to the series, if we manage to
agree (see above) what the initial settings should be.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 07/10] AMD/IOMMU: make phantom functions share interrupt remapping tables

Posted by Jan Beulich 18 weeks ago
Rather than duplicating entries in amd_iommu_msi_msg_update_ire(), share
the tables. This mainly requires some care while freeing them, to avoid
freeing memory blocks twice.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1114,7 +1114,7 @@ static void __init amd_iommu_init_cleanu
          amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
                                                         struct amd_iommu,
                                                         list),
-                                      NULL);
+                                      NULL, 0);
  
      /* free amd iommu list */
      list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
@@ -1179,7 +1179,7 @@ int iterate_ivrs_mappings(int (*handler)
  }
  
  int iterate_ivrs_entries(int (*handler)(const struct amd_iommu *,
-                                        struct ivrs_mappings *))
+                                        struct ivrs_mappings *, uint16_t bdf))
  {
      u16 seg = 0;
      int rc = 0;
@@ -1196,7 +1196,7 @@ int iterate_ivrs_entries(int (*handler)(
              const struct amd_iommu *iommu = map[bdf].iommu;
  
              if ( iommu && map[bdf].dte_requestor_id == bdf )
-                rc = handler(iommu, &map[bdf]);
+                rc = handler(iommu, &map[bdf], bdf);
          }
      } while ( !rc && ++seg );
  
@@ -1289,20 +1289,29 @@ static int __init amd_iommu_setup_device
  
              if ( pdev )
              {
-                unsigned int req_id = bdf;
-
-                do {
-                    ivrs_mappings[req_id].intremap_table =
-                        amd_iommu_alloc_intremap_table(
-                            ivrs_mappings[bdf].iommu,
-                            &ivrs_mappings[req_id].intremap_inuse);
-                    if ( !ivrs_mappings[req_id].intremap_table )
-                        return -ENOMEM;
-
-                    if ( !pdev->phantom_stride )
-                        break;
-                    req_id += pdev->phantom_stride;
-                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+                ivrs_mappings[bdf].intremap_table =
+                    amd_iommu_alloc_intremap_table(
+                        ivrs_mappings[bdf].iommu,
+                        &ivrs_mappings[bdf].intremap_inuse);
+                if ( !ivrs_mappings[bdf].intremap_table )
+                    return -ENOMEM;
+
+                if ( pdev->phantom_stride )
+                {
+                    unsigned int req_id = bdf;
+
+                    for ( ; ; )
+                    {
+                        req_id += pdev->phantom_stride;
+                        if ( PCI_SLOT(req_id) != pdev->sbdf.dev )
+                            break;
+
+                        ivrs_mappings[req_id].intremap_table =
+                            ivrs_mappings[bdf].intremap_table;
+                        ivrs_mappings[req_id].intremap_inuse =
+                            ivrs_mappings[bdf].intremap_inuse;
+                    }
+                }
              }
  
              amd_iommu_set_intremap_table(
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -711,33 +711,20 @@ int amd_iommu_msi_msg_update_ire(
  
      if ( msi_desc->remap_index >= 0 && !msg )
      {
-        do {
-            update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                               &msi_desc->remap_index,
-                                               NULL, NULL);
-            if ( !pdev || !pdev->phantom_stride )
-                break;
-            bdf += pdev->phantom_stride;
-        } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
+        update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                           &msi_desc->remap_index,
+                                           NULL, NULL);
  
          for ( i = 0; i < nr; ++i )
              msi_desc[i].remap_index = -1;
-        if ( pdev )
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
      }
  
      if ( !msg )
          return 0;
  
-    do {
-        rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                                &msi_desc->remap_index,
-                                                msg, &data);
-        if ( rc || !pdev || !pdev->phantom_stride )
-            break;
-        bdf += pdev->phantom_stride;
-    } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-
+    rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                            &msi_desc->remap_index,
+                                            msg, &data);
      if ( !rc )
      {
          for ( i = 1; i < nr; ++i )
@@ -790,12 +777,27 @@ void amd_iommu_read_msi_from_ire(
  }
  
  int amd_iommu_free_intremap_table(
-    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
+    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping,
+    uint16_t bdf)
  {
      void **tblp;
  
      if ( ivrs_mapping )
      {
+        unsigned int i;
+
+        /*
+         * PCI device phantom functions use the same tables as their "base"
+         * function: Look ahead to zap the pointers.
+         */
+        for ( i = 1; PCI_FUNC(bdf + i) && bdf + i < ivrs_bdf_entries; ++i )
+            if ( ivrs_mapping[i].intremap_table ==
+                 ivrs_mapping->intremap_table )
+            {
+                ivrs_mapping[i].intremap_table = NULL;
+                ivrs_mapping[i].intremap_inuse = NULL;
+            }
+
          XFREE(ivrs_mapping->intremap_inuse);
          tblp = &ivrs_mapping->intremap_table;
      }
@@ -934,7 +936,8 @@ static void dump_intremap_table(const st
  }
  
  static int dump_intremap_mapping(const struct amd_iommu *iommu,
-                                 struct ivrs_mappings *ivrs_mapping)
+                                 struct ivrs_mappings *ivrs_mapping,
+                                 uint16_t unused)
  {
      unsigned long flags;
  
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -516,7 +516,7 @@ static int amd_iommu_remove_device(u8 de
      if ( amd_iommu_perdev_intremap &&
           ivrs_mappings[bdf].dte_requestor_id == bdf &&
           ivrs_mappings[bdf].intremap_table )
-        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf], bdf);
  
      return 0;
  }
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -131,7 +131,7 @@ extern u8 ivhd_type;
  struct ivrs_mappings *get_ivrs_mappings(u16 seg);
  int iterate_ivrs_mappings(int (*)(u16 seg, struct ivrs_mappings *));
  int iterate_ivrs_entries(int (*)(const struct amd_iommu *,
-                                 struct ivrs_mappings *));
+                                 struct ivrs_mappings *, uint16_t));
  
  /* iommu tables in guest space */
  struct mmio_reg {
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -101,7 +101,7 @@ int amd_iommu_setup_ioapic_remapping(voi
  void *amd_iommu_alloc_intremap_table(
      const struct amd_iommu *, unsigned long **);
  int amd_iommu_free_intremap_table(
-    const struct amd_iommu *, struct ivrs_mappings *);
+    const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
  void amd_iommu_ioapic_update_ire(
      unsigned int apic, unsigned int reg, unsigned int value);
  unsigned int amd_iommu_read_ioapic_from_ire(


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 07/10] AMD/IOMMU: make phantom functions share interrupt remapping tables

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:09, Jan Beulich wrote:
> Rather than duplicating entries in amd_iommu_msi_msg_update_ire(), share
> the tables. This mainly requires some care while freeing them, to avoid
> freeing memory blocks twice.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Jan Beulich 18 weeks ago
Rather than doing this every time we set up interrupts for a device
anew (and then in two distinct places) fill this invariant field
right after allocating struct arch_msix.

While at it also obtain the MSI-X capability structure position just
once, in msix_capability_init(), rather than in each caller.

Furthermore take the opportunity and eliminate the multi_msix_capable()
alias of msix_table_size().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -821,10 +821,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
   * requested MSI-X entries with allocated irqs or non-zero for otherwise.
   **/
  static int msix_capability_init(struct pci_dev *dev,
-                                unsigned int pos,
                                  struct msi_info *msi,
-                                struct msi_desc **desc,
-                                unsigned int nr_entries)
+                                struct msi_desc **desc)
  {
      struct arch_msix *msix = dev->msix;
      struct msi_desc *entry = NULL;
@@ -838,6 +836,11 @@ static int msix_capability_init(struct p
      u8 slot = PCI_SLOT(dev->devfn);
      u8 func = PCI_FUNC(dev->devfn);
      bool maskall = msix->host_maskall;
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
  
      ASSERT(pcidevs_locked());
  
@@ -912,10 +915,9 @@ static int msix_capability_init(struct p
          u64 pba_paddr;
          u32 pba_offset;
  
-        msix->nr_entries = nr_entries;
          msix->table.first = PFN_DOWN(table_paddr);
          msix->table.last = PFN_DOWN(table_paddr +
-                                    nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+                                    msix->nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first,
                                          msix->table.last));
  
@@ -927,7 +929,7 @@ static int msix_capability_init(struct p
  
          msix->pba.first = PFN_DOWN(pba_paddr);
          msix->pba.last = PFN_DOWN(pba_paddr +
-                                  BITS_TO_LONGS(nr_entries) - 1);
+                                  BITS_TO_LONGS(msix->nr_entries) - 1);
          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
                                          msix->pba.last));
      }
@@ -999,7 +1001,6 @@ static int msix_capability_init(struct p
              /* XXX How to deal with existing mappings? */
          }
      }
-    WARN_ON(msix->nr_entries != nr_entries);
      WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
      ++msix->used_entries;
  
@@ -1093,22 +1094,17 @@ static void __pci_disable_msi(struct msi
   **/
  static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
  {
-    int pos, nr_entries;
      struct pci_dev *pdev;
-    u16 control;
      u8 slot = PCI_SLOT(msi->devfn);
      u8 func = PCI_FUNC(msi->devfn);
      struct msi_desc *old_desc;
  
      ASSERT(pcidevs_locked());
      pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
-    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
-    if ( !pdev || !pos )
+    if ( !pdev || !pdev->msix )
          return -ENODEV;
  
-    control = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
-    nr_entries = multi_msix_capable(control);
-    if ( msi->entry_nr >= nr_entries )
+    if ( msi->entry_nr >= pdev->msix->nr_entries )
          return -EINVAL;
  
      old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
@@ -1127,7 +1123,7 @@ static int __pci_enable_msix(struct msi_
          __pci_disable_msi(old_desc);
      }
  
-    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
+    return msix_capability_init(pdev, msi, desc);
  }
  
  static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1187,16 +1183,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
  {
      int rc;
      struct pci_dev *pdev;
-    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
-    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
-                                           PCI_CAP_ID_MSIX);
  
      if ( !use_msi )
          return 0;
  
-    if ( !pos )
-        return -ENODEV;
-
      pcidevs_lock();
      pdev = pci_get_pdev(seg, bus, devfn);
      if ( !pdev )
@@ -1209,13 +1199,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
          rc = 0;
      }
      else
-    {
-        uint16_t control = pci_conf_read16(PCI_SBDF3(seg, bus, devfn),
-                                           msix_control_reg(pos));
-
-        rc = msix_capability_init(pdev, pos, NULL, NULL,
-                                  multi_msix_capable(control));
-    }
+        rc = msix_capability_init(pdev, NULL, NULL);
      pcidevs_unlock();
  
      return rc;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
  {
      struct pci_dev *pdev;
+    unsigned int pos;
  
      list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
          if ( pdev->bus == bus && pdev->devfn == devfn )
@@ -339,10 +340,12 @@ static struct pci_dev *alloc_pdev(struct
      pdev->domain = NULL;
      INIT_LIST_HEAD(&pdev->msi_list);
  
-    if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-                             PCI_CAP_ID_MSIX) )
+    pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                              PCI_CAP_ID_MSIX);
+    if ( pos )
      {
          struct arch_msix *msix = xzalloc(struct arch_msix);
+        uint16_t ctrl;
  
          if ( !msix )
          {
@@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
              return NULL;
          }
          spin_lock_init(&msix->table_lock);
+
+        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
+        msix->nr_entries = msix_table_size(ctrl);
+
          pdev->msix = msix;
      }
  
@@ -358,7 +365,6 @@ static struct pci_dev *alloc_pdev(struct
      /* update bus2bridge */
      switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
      {
-        int pos;
          u16 cap;
          u8 sec_bus, sub_bus;
  
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -171,7 +171,6 @@ int msi_free_irq(struct msi_desc *entry)
  #define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
  #define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
  #define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define multi_msix_capable		msix_table_size
  #define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
  #define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:10, Jan Beulich wrote:
> Rather than doing this every time we set up interrupts for a device
> anew (and then in two distinct places) fill this invariant field
> right after allocating struct arch_msix.
>
> While at it also obtain the MSI-X capability structure position just
> once, in msix_capability_init(), rather than in each caller.
>
> Furthermore take the opportunity and eliminate the multi_msix_capable()
> alias of msix_table_size().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Roger Pau Monné 18 weeks ago
On Tue, Aug 06, 2019 at 03:10:40PM +0200, Jan Beulich wrote:
> Rather than doing this every time we set up interrupts for a device
> anew (and then in two distinct places) fill this invariant field
> right after allocating struct arch_msix.
> 
> While at it also obtain the MSI-X capability structure position just
> once, in msix_capability_init(), rather than in each caller.
> 
> Furthermore take the opportunity and eliminate the multi_msix_capable()
> alias of msix_table_size().
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v5: New.
> 
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -821,10 +821,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
>   * requested MSI-X entries with allocated irqs or non-zero for otherwise.
>   **/
>  static int msix_capability_init(struct pci_dev *dev,
> -                                unsigned int pos,
>                                  struct msi_info *msi,
> -                                struct msi_desc **desc,
> -                                unsigned int nr_entries)
> +                                struct msi_desc **desc)
>  {
>      struct arch_msix *msix = dev->msix;
>      struct msi_desc *entry = NULL;
> @@ -838,6 +836,11 @@ static int msix_capability_init(struct p
>      u8 slot = PCI_SLOT(dev->devfn);
>      u8 func = PCI_FUNC(dev->devfn);
>      bool maskall = msix->host_maskall;
> +    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
> +                                           PCI_CAP_ID_MSIX);
> +
> +    if ( !pos )
> +        return -ENODEV;
>      ASSERT(pcidevs_locked());
> @@ -912,10 +915,9 @@ static int msix_capability_init(struct p
>          u64 pba_paddr;
>          u32 pba_offset;
> -        msix->nr_entries = nr_entries;
>          msix->table.first = PFN_DOWN(table_paddr);
>          msix->table.last = PFN_DOWN(table_paddr +
> -                                    nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
> +                                    msix->nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
>          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first,
>                                          msix->table.last));
> @@ -927,7 +929,7 @@ static int msix_capability_init(struct p
>          msix->pba.first = PFN_DOWN(pba_paddr);
>          msix->pba.last = PFN_DOWN(pba_paddr +
> -                                  BITS_TO_LONGS(nr_entries) - 1);
> +                                  BITS_TO_LONGS(msix->nr_entries) - 1);
>          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
>                                          msix->pba.last));
>      }
> @@ -999,7 +1001,6 @@ static int msix_capability_init(struct p
>              /* XXX How to deal with existing mappings? */
>          }
>      }
> -    WARN_ON(msix->nr_entries != nr_entries);
>      WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
>      ++msix->used_entries;
> @@ -1093,22 +1094,17 @@ static void __pci_disable_msi(struct msi
>   **/
>  static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
>  {
> -    int pos, nr_entries;
>      struct pci_dev *pdev;
> -    u16 control;
>      u8 slot = PCI_SLOT(msi->devfn);
>      u8 func = PCI_FUNC(msi->devfn);
>      struct msi_desc *old_desc;

Missing newline.

>      ASSERT(pcidevs_locked());
>      pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
> -    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
> -    if ( !pdev || !pos )
> +    if ( !pdev || !pdev->msix )
>          return -ENODEV;
> -    control = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
> -    nr_entries = multi_msix_capable(control);
> -    if ( msi->entry_nr >= nr_entries )
> +    if ( msi->entry_nr >= pdev->msix->nr_entries )
>          return -EINVAL;
>      old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
> @@ -1127,7 +1123,7 @@ static int __pci_enable_msix(struct msi_
>          __pci_disable_msi(old_desc);
>      }
> -    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
> +    return msix_capability_init(pdev, msi, desc);
>  }
>  static void _pci_cleanup_msix(struct arch_msix *msix)
> @@ -1187,16 +1183,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
>  {
>      int rc;
>      struct pci_dev *pdev;
> -    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
> -    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
> -                                           PCI_CAP_ID_MSIX);

You seem to be missing an empty newline here?

>      if ( !use_msi )
>          return 0;

Same here.

> -    if ( !pos )
> -        return -ENODEV;
> -
>      pcidevs_lock();
>      pdev = pci_get_pdev(seg, bus, devfn);
>      if ( !pdev )
> @@ -1209,13 +1199,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
>          rc = 0;
>      }
>      else
> -    {
> -        uint16_t control = pci_conf_read16(PCI_SBDF3(seg, bus, devfn),
> -                                           msix_control_reg(pos));
> -
> -        rc = msix_capability_init(pdev, pos, NULL, NULL,
> -                                  multi_msix_capable(control));
> -    }
> +        rc = msix_capability_init(pdev, NULL, NULL);
>      pcidevs_unlock();
>      return rc;
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
>  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>  {
>      struct pci_dev *pdev;
> +    unsigned int pos;

This chunk doesn't seem to match my current code, as there's an empty
newline between the declarations and list_for_each_entry.

>      list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
>          if ( pdev->bus == bus && pdev->devfn == devfn )
> @@ -339,10 +340,12 @@ static struct pci_dev *alloc_pdev(struct
>      pdev->domain = NULL;
>      INIT_LIST_HEAD(&pdev->msi_list);
> -    if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> -                             PCI_CAP_ID_MSIX) )
> +    pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> +                              PCI_CAP_ID_MSIX);
> +    if ( pos )
>      {
>          struct arch_msix *msix = xzalloc(struct arch_msix);
> +        uint16_t ctrl;
>          if ( !msix )
>          {
> @@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
>              return NULL;
>          }
>          spin_lock_init(&msix->table_lock);
> +
> +        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
> +        msix->nr_entries = msix_table_size(ctrl);

Since you store the number of entries here, why not also store the
position of the MSI-X capability since it's also immutable?

That would prevent having to fetch it again in msix_capability_init.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Jan Beulich 18 weeks ago
On 06.08.2019 16:25, Roger Pau Monné  wrote:
> On Tue, Aug 06, 2019 at 03:10:40PM +0200, Jan Beulich wrote:
>> @@ -1093,22 +1094,17 @@ static void __pci_disable_msi(struct msi
>>    **/
>>   static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
>>   {
>> -    int pos, nr_entries;
>>       struct pci_dev *pdev;
>> -    u16 control;
>>       u8 slot = PCI_SLOT(msi->devfn);
>>       u8 func = PCI_FUNC(msi->devfn);
>>       struct msi_desc *old_desc;
> 
> Missing newline.

For one this is patch context only anyway. And then I'm afraid this is
an artifact of the ongoing email issue mentioned in the cover letter.
In the list archives this also reflects itself by ...

>>       ASSERT(pcidevs_locked());

... this line not having any indentation at all. Then again, looking
at the copy I have received back from xen-devel, I can't see any issue
there at all.

>> @@ -1187,16 +1183,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
>>   {
>>       int rc;
>>       struct pci_dev *pdev;
>> -    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>> -    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
>> -                                           PCI_CAP_ID_MSIX);
> 
> You seem to be missing an empty newline here?
> 
>>       if ( !use_msi )
>>           return 0;
> 
> Same here.

Same as above.

>> @@ -1209,13 +1199,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
>>           rc = 0;
>>       }
>>       else
>> -    {
>> -        uint16_t control = pci_conf_read16(PCI_SBDF3(seg, bus, devfn),
>> -                                           msix_control_reg(pos));
>> -
>> -        rc = msix_capability_init(pdev, pos, NULL, NULL,
>> -                                  multi_msix_capable(control));
>> -    }
>> +        rc = msix_capability_init(pdev, NULL, NULL);
>>       pcidevs_unlock();
>>       return rc;
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
>>   static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>>   {
>>       struct pci_dev *pdev;
>> +    unsigned int pos;
> 
> This chunk doesn't seem to match my current code, as there's an empty
> newline between the declarations and list_for_each_entry.

Same issue as above.

>> @@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
>>               return NULL;
>>           }
>>           spin_lock_init(&msix->table_lock);
>> +
>> +        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
>> +        msix->nr_entries = msix_table_size(ctrl);
> 
> Since you store the number of entries here, why not also store the
> position of the MSI-X capability since it's also immutable?
> 
> That would prevent having to fetch it again in msix_capability_init.

I do consider this as something worthwhile to do in the future, but
not here: The field to store this doesn't exist in struct arch_msix
(yet), and hence would likely want moving from struct msi_attrib.
This is beyond the scope of this patch.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Roger Pau Monné 18 weeks ago
On Tue, Aug 06, 2019 at 04:47:28PM +0200, Jan Beulich wrote:
> On 06.08.2019 16:25, Roger Pau Monné  wrote:
> > On Tue, Aug 06, 2019 at 03:10:40PM +0200, Jan Beulich wrote:
> > > --- a/xen/drivers/passthrough/pci.c
> > > +++ b/xen/drivers/passthrough/pci.c
> > > @@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
> > >   static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
> > >   {
> > >       struct pci_dev *pdev;
> > > +    unsigned int pos;
> > 
> > This chunk doesn't seem to match my current code, as there's an empty
> > newline between the declarations and list_for_each_entry.
> 
> Same issue as above.

Sorry for the noise. I jumped straight into this patch.

> > > @@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
> > >               return NULL;
> > >           }
> > >           spin_lock_init(&msix->table_lock);
> > > +
> > > +        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
> > > +        msix->nr_entries = msix_table_size(ctrl);
> > 
> > Since you store the number of entries here, why not also store the
> > position of the MSI-X capability since it's also immutable?
> > 
> > That would prevent having to fetch it again in msix_capability_init.
> 
> I do consider this as something worthwhile to do in the future, but
> not here: The field to store this doesn't exist in struct arch_msix
> (yet), and hence would likely want moving from struct msi_attrib.
> This is beyond the scope of this patch.

Oh I see. So the position is actually stored in msi_attrib, and is
used by both MSI and MSI-X, in which case what I'm proposing would be
worse, since the field would only be used by MSI-X.

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 08/10] x86/PCI: read MSI-X table entry count early

Posted by Jan Beulich 18 weeks ago
On 06.08.2019 17:16, Roger Pau Monné  wrote:
> On Tue, Aug 06, 2019 at 04:47:28PM +0200, Jan Beulich wrote:
>> On 06.08.2019 16:25, Roger Pau Monné  wrote:
>>> On Tue, Aug 06, 2019 at 03:10:40PM +0200, Jan Beulich wrote:
>>>> @@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
>>>>                return NULL;
>>>>            }
>>>>            spin_lock_init(&msix->table_lock);
>>>> +
>>>> +        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
>>>> +        msix->nr_entries = msix_table_size(ctrl);
>>>
>>> Since you store the number of entries here, why not also store the
>>> position of the MSI-X capability since it's also immutable?
>>>
>>> That would prevent having to fetch it again in msix_capability_init.
>>
>> I do consider this as something worthwhile to do in the future, but
>> not here: The field to store this doesn't exist in struct arch_msix
>> (yet), and hence would likely want moving from struct msi_attrib.
>> This is beyond the scope of this patch.
> 
> Oh I see. So the position is actually stored in msi_attrib, and is
> used by both MSI and MSI-X, in which case what I'm proposing would be
> worse, since the field would only be used by MSI-X.

Right, if we wanted to store it, we'd want to cover both MSI and
MSI-X (and hence it would need to be a field outside of struct
arch_msix).

> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 09/10] AMD/IOMMU: replace INTREMAP_ENTRIES

Posted by Jan Beulich 18 weeks ago
Prepare for the number of entries to not be the maximum possible, by
separating checks against maximum size from ones against actual size.
For caller side simplicity have alloc_intremap_entry() return the
maximum possible value upon allocation failure, rather than the first
just out-of-bounds one.

Have the involved functions take all the subsequently needed
arguments here already, to reduce code churn in the patch actually
making the allocation size dynamic.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,7 @@ union irte_cptr {
      const union irte128 *ptr128;
  } __transparent__;
  
-#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
  
  struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
  struct hpet_sbdf hpet_sbdf;
@@ -83,8 +83,20 @@ static void dump_intremap_tables(unsigne
  static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
  {
      return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte32));
+           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
+           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
+}
+
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return IOMMU_INTREMAP_ORDER;
+}
+
+static unsigned int intremap_table_entries(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return 1u << amd_iommu_intremap_table_order(irt, iommu);
  }
  
  unsigned int ioapic_id_to_index(unsigned int apic_id)
@@ -122,20 +134,24 @@ static int get_intremap_requestor_id(int
      return get_ivrs_mappings(seg)[bdf].dte_requestor_id;
  }
  
-static unsigned int alloc_intremap_entry(int seg, int bdf, unsigned int nr)
+static unsigned int alloc_intremap_entry(const struct amd_iommu *iommu,
+                                         unsigned int bdf, unsigned int nr)
  {
-    unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse;
-    unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES);
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
+    unsigned long *inuse = ivrs_mappings[bdf].intremap_inuse;
+    unsigned int nr_ents =
+        intremap_table_entries(ivrs_mappings[bdf].intremap_table, iommu);
+    unsigned int slot = find_first_zero_bit(inuse, nr_ents);
  
      for ( ; ; )
      {
          unsigned int end;
  
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
              break;
-        end = find_next_bit(inuse, INTREMAP_ENTRIES, slot + 1);
-        if ( end > INTREMAP_ENTRIES )
-            end = INTREMAP_ENTRIES;
+        end = find_next_bit(inuse, nr_ents, slot + 1);
+        if ( end > nr_ents )
+            end = nr_ents;
          slot = (slot + nr - 1) & ~(nr - 1);
          if ( slot + nr <= end )
          {
@@ -144,12 +160,12 @@ static unsigned int alloc_intremap_entry
              break;
          }
          slot = (end + nr) & ~(nr - 1);
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
              break;
-        slot = find_next_zero_bit(inuse, INTREMAP_ENTRIES, slot);
+        slot = find_next_zero_bit(inuse, nr_ents, slot);
      }
  
-    return slot;
+    return slot < nr_ents ? slot : INTREMAP_MAX_ENTRIES;
  }
  
  static union irte_ptr get_intremap_entry(const struct amd_iommu *iommu,
@@ -159,7 +175,7 @@ static union irte_ptr get_intremap_entry
          .ptr = get_ivrs_mappings(iommu->seg)[bdf].intremap_table
      };
  
-    ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
+    ASSERT(table.ptr && (index < intremap_table_entries(table.ptr, iommu)));
  
      if ( iommu->ctrl.ga_en )
          table.ptr128 += index;
@@ -279,10 +295,10 @@ static int update_intremap_entry_from_io
      spin_lock_irqsave(lock, flags);
  
      offset = *index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
      {
-        offset = alloc_intremap_entry(iommu->seg, req_id, 1);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, req_id, 1);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
          {
              spin_unlock_irqrestore(lock, flags);
              rte->mask = 1;
@@ -400,8 +416,8 @@ int __init amd_iommu_setup_ioapic_remapp
              }
  
              spin_lock_irqsave(lock, flags);
-            offset = alloc_intremap_entry(seg, req_id, 1);
-            BUG_ON(offset >= INTREMAP_ENTRIES);
+            offset = alloc_intremap_entry(iommu, req_id, 1);
+            BUG_ON(offset >= INTREMAP_MAX_ENTRIES);
              entry = get_intremap_entry(iommu, req_id, offset);
              update_intremap_entry(iommu, entry, vector,
                                    delivery_mode, dest_mode, dest);
@@ -476,7 +492,7 @@ void amd_iommu_ioapic_update_ire(
          *(((u32 *)&new_rte) + 1) = value;
      }
  
-    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
+    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_MAX_ENTRIES )
      {
          ASSERT(saved_mask);
  
@@ -548,7 +564,7 @@ unsigned int amd_iommu_read_ioapic_from_
          return val;
  
      offset = ioapic_sbdf[idx].pin_2_idx[pin];
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
          return val;
  
      seg = ioapic_sbdf[idx].seg;
@@ -561,8 +577,8 @@ unsigned int amd_iommu_read_ioapic_from_
  
      if ( !(reg & 1) )
      {
-        ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
-        val &= ~(INTREMAP_ENTRIES - 1);
+        ASSERT(offset == (val & (INTREMAP_MAX_ENTRIES - 1)));
+        val &= ~(INTREMAP_MAX_ENTRIES - 1);
          /* The IntType fields match for both formats. */
          val |= MASK_INSR(entry.ptr32->flds.int_type,
                           IO_APIC_REDIR_DELIV_MODE_MASK);
@@ -622,11 +638,11 @@ static int update_intremap_entry_from_ms
          dest = MASK_EXTR(msg->address_lo, MSI_ADDR_DEST_ID_MASK);
  
      offset = *remap_index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
      {
          ASSERT(nr);
-        offset = alloc_intremap_entry(iommu->seg, bdf, nr);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, bdf, nr);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
          {
              spin_unlock_irqrestore(lock, flags);
              return -ENOSPC;
@@ -654,7 +670,7 @@ static int update_intremap_entry_from_ms
      update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
      spin_unlock_irqrestore(lock, flags);
  
-    *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;
+    *data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset;
  
      /*
       * In some special cases, a pci-e device(e.g SATA controller in IDE mode)
@@ -738,7 +754,7 @@ int amd_iommu_msi_msg_update_ire(
  void amd_iommu_read_msi_from_ire(
      struct msi_desc *msi_desc, struct msi_msg *msg)
  {
-    unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1);
+    unsigned int offset = msg->data & (INTREMAP_MAX_ENTRIES - 1);
      const struct pci_dev *pdev = msi_desc->dev;
      u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf;
      u16 seg = pdev ? pdev->seg : hpet_sbdf.seg;
@@ -758,7 +774,7 @@ void amd_iommu_read_msi_from_ire(
          offset |= nr;
      }
  
-    msg->data &= ~(INTREMAP_ENTRIES - 1);
+    msg->data &= ~(INTREMAP_MAX_ENTRIES - 1);
      /* The IntType fields match for both formats. */
      msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
                             MSI_DATA_DELIVERY_MODE_MASK);
@@ -824,8 +840,9 @@ void *amd_iommu_alloc_intremap_table(
  
      if ( tb )
      {
-        *inuse_map = xzalloc_array(unsigned long,
-                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        unsigned int nr = intremap_table_entries(tb, iommu);
+
+        *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
          if ( *inuse_map )
              memset(tb, 0, PAGE_SIZE << order);
          else
@@ -869,6 +886,7 @@ bool __init iov_supports_xt(void)
  
  int __init amd_setup_hpet_msi(struct msi_desc *msi_desc)
  {
+    const struct amd_iommu *iommu;
      spinlock_t *lock;
      unsigned long flags;
      int rc = 0;
@@ -886,12 +904,15 @@ int __init amd_setup_hpet_msi(struct msi
          return -ENODEV;
      }
  
+    iommu = find_iommu_for_device(hpet_sbdf.seg, hpet_sbdf.bdf);
+    if ( !iommu )
+        return -ENXIO;
+
      lock = get_intremap_lock(hpet_sbdf.seg, hpet_sbdf.bdf);
      spin_lock_irqsave(lock, flags);
  
-    msi_desc->remap_index = alloc_intremap_entry(hpet_sbdf.seg,
-                                                 hpet_sbdf.bdf, 1);
-    if ( msi_desc->remap_index >= INTREMAP_ENTRIES )
+    msi_desc->remap_index = alloc_intremap_entry(iommu, hpet_sbdf.bdf, 1);
+    if ( msi_desc->remap_index >= INTREMAP_MAX_ENTRIES )
      {
          msi_desc->remap_index = -1;
          rc = -ENXIO;
@@ -906,12 +927,12 @@ static void dump_intremap_table(const st
                                  union irte_cptr tbl,
                                  const struct ivrs_mappings *ivrs_mapping)
  {
-    unsigned int count;
+    unsigned int count, nr = intremap_table_entries(tbl.ptr, iommu);
  
      if ( !tbl.ptr )
          return;
  
-    for ( count = 0; count < INTREMAP_ENTRIES; count++ )
+    for ( count = 0; count < nr; count++ )
      {
          if ( iommu->ctrl.ga_en
               ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -102,6 +102,8 @@ void *amd_iommu_alloc_intremap_table(
      const struct amd_iommu *, unsigned long **);
  int amd_iommu_free_intremap_table(
      const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu);
  void amd_iommu_ioapic_update_ire(
      unsigned int apic, unsigned int reg, unsigned int value);
  unsigned int amd_iommu_read_ioapic_from_ire(


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 09/10] AMD/IOMMU: replace INTREMAP_ENTRIES

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:11, Jan Beulich wrote:
> Prepare for the number of entries to not be the maximum possible, by
> separating checks against maximum size from ones against actual size.
> For caller side simplicity have alloc_intremap_entry() return the
> maximum possible value upon allocation failure, rather than the first
> just out-of-bounds one.
>
> Have the involved functions take all the subsequently needed
> arguments here already, to reduce code churn in the patch actually
> making the allocation size dynamic.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v5 10/10] AMD/IOMMU: restrict interrupt remapping table sizes

Posted by Jan Beulich 18 weeks ago
There's no point setting up tables with more space than a PCI device can
use. For both MSI and MSI-X we can determine how many interrupts could
be set up at most. Tables allocated during ACPI table parsing, however,
will (for now at least) continue to be set up to have maximum size.

Note that until we would want to use sub-page allocations here there's
no point checking whether MSI is supported by a device - 1 or up to 32
(or actually 128, due to the change effectively using a reserved
encoding) IRTEs always mean an order-0 allocation anyway.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -77,7 +77,7 @@ static void __init add_ivrs_mapping_entr
          {
              if ( !shared_intremap_table )
                  shared_intremap_table = amd_iommu_alloc_intremap_table(
-                    iommu, &shared_intremap_inuse);
+                    iommu, &shared_intremap_inuse, 0);
  
              if ( !shared_intremap_table )
                  panic("No memory for shared IRT\n");
@@ -89,7 +89,7 @@ static void __init add_ivrs_mapping_entr
          {
              ivrs_mappings[alias_id].intremap_table =
                  amd_iommu_alloc_intremap_table(
-                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse, 0);
  
              if ( !ivrs_mappings[alias_id].intremap_table )
                  panic("No memory for %04x:%02x:%02x.%u's IRT\n",
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1292,7 +1292,9 @@ static int __init amd_iommu_setup_device
                  ivrs_mappings[bdf].intremap_table =
                      amd_iommu_alloc_intremap_table(
                          ivrs_mappings[bdf].iommu,
-                        &ivrs_mappings[bdf].intremap_inuse);
+                        &ivrs_mappings[bdf].intremap_inuse,
+                        pdev->msix ? pdev->msix->nr_entries
+                                   : multi_msi_capable(~0u));
                  if ( !ivrs_mappings[bdf].intremap_table )
                      return -ENOMEM;
  
@@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
              }
  
              amd_iommu_set_intremap_table(
-                dte,
-                ivrs_mappings[bdf].intremap_table
-                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-                : 0,
-                iommu_intremap);
+                dte, ivrs_mappings[bdf].intremap_table,
+                ivrs_mappings[bdf].iommu, iommu_intremap);
          }
      }
  
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,8 @@ union irte_cptr {
      const union irte128 *ptr128;
  } __transparent__;
  
-#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ORDER   0xB
+#define INTREMAP_MAX_ENTRIES (1 << INTREMAP_MAX_ORDER)
  
  struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
  struct hpet_sbdf hpet_sbdf;
@@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
  
  static void dump_intremap_tables(unsigned char key);
  
-static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
-{
-    return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
-}
+#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
  
  unsigned int amd_iommu_intremap_table_order(
      const void *irt, const struct amd_iommu *iommu)
  {
-    return IOMMU_INTREMAP_ORDER;
+    return intremap_page_order(irt) + PAGE_SHIFT -
+           (iommu->ctrl.ga_en ? 4 : 2);
  }
  
  static unsigned int intremap_table_entries(
@@ -825,7 +822,10 @@ int amd_iommu_free_intremap_table(
  
      if ( *tblp )
      {
-        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        unsigned int order = intremap_page_order(*tblp);
+
+        intremap_page_order(*tblp) = 0;
+        __free_amd_iommu_tables(*tblp, order);
          *tblp = NULL;
      }
  
@@ -833,15 +833,23 @@ int amd_iommu_free_intremap_table(
  }
  
  void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *iommu, unsigned long **inuse_map)
+    const struct amd_iommu *iommu, unsigned long **inuse_map, unsigned int nr)
  {
-    unsigned int order = intremap_table_order(iommu);
-    void *tb = __alloc_amd_iommu_tables(order);
+    unsigned int order;
+    void *tb;
  
+    if ( !nr )
+        nr = INTREMAP_MAX_ENTRIES;
+
+    order = iommu->ctrl.ga_en
+            ? get_order_from_bytes(nr * sizeof(union irte128))
+            : get_order_from_bytes(nr * sizeof(union irte32));
+
+    tb = __alloc_amd_iommu_tables(order);
      if ( tb )
      {
-        unsigned int nr = intremap_table_entries(tb, iommu);
-
+        intremap_page_order(tb) = order;
+        nr = intremap_table_entries(tb, iommu);
          *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
          if ( *inuse_map )
              memset(tb, 0, PAGE_SIZE << order);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -113,12 +113,22 @@ void amd_iommu_set_root_page_table(struc
  }
  
  void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
+    struct amd_iommu_dte *dte, const void *ptr,
+    const struct amd_iommu *iommu, bool valid)
  {
-    dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
-    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
-                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    if ( ptr )
+    {
+        dte->it_root = virt_to_maddr(ptr) >> 6;
+        dte->int_tab_len = amd_iommu_intremap_table_order(ptr, iommu);
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    }
+    else
+    {
+        dte->it_root = 0;
+        dte->int_tab_len = 0;
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    }
+
      dte->ig = false; /* unmapped interrupts result in i/o page faults */
      dte->iv = valid;
  }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -471,16 +471,15 @@ static int amd_iommu_add_device(u8 devfn
      {
          ivrs_mappings[bdf].intremap_table =
              amd_iommu_alloc_intremap_table(
-                iommu, &ivrs_mappings[bdf].intremap_inuse);
+                iommu, &ivrs_mappings[bdf].intremap_inuse,
+                pdev->msix ? pdev->msix->nr_entries
+                           : multi_msi_capable(~0u));
          if ( !ivrs_mappings[bdf].intremap_table )
              return -ENOMEM;
  
          amd_iommu_set_intremap_table(
              iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
-            ivrs_mappings[bdf].intremap_table
-            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-            : 0,
-            iommu_intremap);
+            ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
  
          amd_iommu_flush_device(iommu, bdf);
      }
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,9 +107,6 @@
  #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
  #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
  
-/* For now, we always allocate the maximum: 2048 entries. */
-#define IOMMU_INTREMAP_ORDER			0xB
-
  struct amd_iommu_dte {
      /* 0 - 63 */
      bool v:1;
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -72,7 +72,8 @@ int __must_check amd_iommu_flush_iotlb_a
  /* device table functions */
  int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
  void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
-                                  uint64_t intremap_ptr,
+                                  const void *ptr,
+                                  const struct amd_iommu *iommu,
                                    bool valid);
  void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
  				   uint64_t root_ptr, uint16_t domain_id,
@@ -99,7 +100,7 @@ struct amd_iommu *find_iommu_for_device(
  bool iov_supports_xt(void);
  int amd_iommu_setup_ioapic_remapping(void);
  void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *, unsigned long **);
+    const struct amd_iommu *, unsigned long **, unsigned int nr);
  int amd_iommu_free_intremap_table(
      const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
  unsigned int amd_iommu_intremap_table_order(


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 10/10] AMD/IOMMU: restrict interrupt remapping table sizes

Posted by Andrew Cooper 12 weeks ago
On 06/08/2019 14:11, Jan Beulich wrote:
> There's no point setting up tables with more space than a PCI device can
> use. For both MSI and MSI-X we can determine how many interrupts could
> be set up at most. Tables allocated during ACPI table parsing, however,
> will (for now at least) continue to be set up to have maximum size.
>
> Note that until we would want to use sub-page allocations here there's
> no point checking whether MSI is supported by a device - 1 or up to 32
> (or actually 128, due to the change effectively using a reserved
> encoding) IRTEs always mean an order-0 allocation anyway.

Devices which are not MSI-capable don't need an interrupt remapping
table at all.

Per my calculations, the Rome SDP has 62 devices with MSI/MSI-X support,
and 98 devices which are CPU-internals that have no interrupt support at
all.

In comparison, for a production Cascade Lake system I have to hand, the
stats are 92 non-MSI devices and 18 MSI-capable devices (which isn't a
valid direct comparison due to how VT-d's remapping tables work, but is
a datapoint on "similar looking systems").

I'm happy to leave "no IRT's for non-capable devices" for future work,
but at the very least, I don't think the commit message wants phrasing
in exactly this way.

>
> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
>              }
>  
>              amd_iommu_set_intremap_table(
> -                dte,
> -                ivrs_mappings[bdf].intremap_table
> -                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
> -                : 0,
> -                iommu_intremap);
> +                dte, ivrs_mappings[bdf].intremap_table,
> +                ivrs_mappings[bdf].iommu, iommu_intremap);

Ah - half of this looks like it wants to be in patch 6, rather than here.

>          }
>      }
>  
> --- a/xen/drivers/passthrough/amd/iommu_intr.c
> +++ b/xen/drivers/passthrough/amd/iommu_intr.c
> @@ -69,7 +69,8 @@ union irte_cptr {
>      const union irte128 *ptr128;
>  } __transparent__;
>  
> -#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
> +#define INTREMAP_MAX_ORDER   0xB
> +#define INTREMAP_MAX_ENTRIES (1 << INTREMAP_MAX_ORDER)
>  
>  struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
>  struct hpet_sbdf hpet_sbdf;
> @@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
>  
>  static void dump_intremap_tables(unsigned char key);
>  
> -static unsigned int __init intremap_table_order(const struct
> amd_iommu *iommu)
> -{
> -    return iommu->ctrl.ga_en
> -           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
> irte128))
> -           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
> irte32));
> -}
> +#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))

What makes the frameable order field safe to use?  It reaches into
(pg)->v.free.order which fairly obviously isn't safe for allocated pages.

virt_to_page() is a non-trivial calculation, which is now used in a
large number of circumstances.  I don't have an easy judgement of
whether they are hotpaths, but surely it would be easier to just store
another unsigned int per device.

Furthermore, it would work around a preexisting issue where we can
allocate beyond the number of interrupts for the device, up to the next
order boundary.

> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -471,16 +471,15 @@ static int amd_iommu_add_device(u8 devfn
>      {
>          ivrs_mappings[bdf].intremap_table =
>              amd_iommu_alloc_intremap_table(
> -                iommu, &ivrs_mappings[bdf].intremap_inuse);
> +                iommu, &ivrs_mappings[bdf].intremap_inuse,
> +                pdev->msix ? pdev->msix->nr_entries
> +                           : multi_msi_capable(~0u));
>          if ( !ivrs_mappings[bdf].intremap_table )
>              return -ENOMEM;
>  
>          amd_iommu_set_intremap_table(
>              iommu->dev_table.buffer + (bdf *
> IOMMU_DEV_TABLE_ENTRY_SIZE),
> -            ivrs_mappings[bdf].intremap_table
> -            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
> -            : 0,
> -            iommu_intremap);
> +            ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
>  

Similarly for patch 6 here.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 10/10] AMD/IOMMU: restrict interrupt remapping table sizes

Posted by Jan Beulich 12 weeks ago
On 17.09.2019 17:30, Andrew Cooper wrote:
> On 06/08/2019 14:11, Jan Beulich wrote:
>> There's no point setting up tables with more space than a PCI device can
>> use. For both MSI and MSI-X we can determine how many interrupts could
>> be set up at most. Tables allocated during ACPI table parsing, however,
>> will (for now at least) continue to be set up to have maximum size.
>>
>> Note that until we would want to use sub-page allocations here there's
>> no point checking whether MSI is supported by a device - 1 or up to 32
>> (or actually 128, due to the change effectively using a reserved
>> encoding) IRTEs always mean an order-0 allocation anyway.
> 
> Devices which are not MSI-capable don't need an interrupt remapping
> table at all.

Oh, good point - pin-based interrupts use the respective IO-APIC's
IRTE.

> Per my calculations, the Rome SDP has 62 devices with MSI/MSI-X support,
> and 98 devices which are CPU-internals that have no interrupt support at
> all.
> 
> In comparison, for a production Cascade Lake system I have to hand, the
> stats are 92 non-MSI devices and 18 MSI-capable devices (which isn't a
> valid direct comparison due to how VT-d's remapping tables work, but is
> a datapoint on "similar looking systems").
> 
> I'm happy to leave "no IRT's for non-capable devices" for future work,
> but at the very least, I don't think the commit message wants phrasing
> in exactly this way.

I think it would be better to correct this right away, before it
goes in. I don't think it'll be overly much of a change to add an
MSI capability check next to the MSI-X one.

>> --- a/xen/drivers/passthrough/amd/iommu_init.c
>> +++ b/xen/drivers/passthrough/amd/iommu_init.c
>> @@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
>>              }
>>  
>>              amd_iommu_set_intremap_table(
>> -                dte,
>> -                ivrs_mappings[bdf].intremap_table
>> -                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
>> -                : 0,
>> -                iommu_intremap);
>> +                dte, ivrs_mappings[bdf].intremap_table,
>> +                ivrs_mappings[bdf].iommu, iommu_intremap);
> 
> Ah - half of this looks like it wants to be in patch 6, rather than here.

Hmm, which half? I don't see anything misplaced here. The signature
of amd_iommu_set_intremap_table) changes only in this patch, not in
patch 6.

>> @@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
>>  
>>  static void dump_intremap_tables(unsigned char key);
>>  
>> -static unsigned int __init intremap_table_order(const struct
>> amd_iommu *iommu)
>> -{
>> -    return iommu->ctrl.ga_en
>> -           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>> irte128))
>> -           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>> irte32));
>> -}
>> +#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
> 
> What makes the frameable order field safe to use?  It reaches into
> (pg)->v.free.order which fairly obviously isn't safe for allocated pages.

The same argument that allows xmalloc_whole_pages() and xfree() to
use this field: It is the owner of a page who controls how the
individual sub-fields of a union get used. As long as v.inuse and
v.sh don't get used, (ab)using v.free for an allocated page is
quite fine.

> virt_to_page() is a non-trivial calculation, which is now used in a
> large number of circumstances.  I don't have an easy judgement of
> whether they are hotpaths, but surely it would be easier to just store
> another unsigned int per device.

Except this would be a vendor specific field in a supposedly
vendor agnostic structure. I'm not very keen to add such a field.
Also I don't think interrupt setup/teardown paths would normally
be considered "hot".

What I could be talked into (to limit code size in case the
compiler doesn't expand things inline overly aggressively) is to
make this a static function instead of a macro.

> Furthermore, it would work around a preexisting issue where we can
> allocate beyond the number of interrupts for the device, up to the next
> order boundary.

Is this really an "issue", rather than just an irrelevant side
effect (which is never going to be hit as long as other layers
work correctly, in that they bound requests appropriately)?

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 10/10] AMD/IOMMU: restrict interrupt remapping table sizes

Posted by Andrew Cooper 12 weeks ago
On 18/09/2019 09:11, Jan Beulich wrote:
> On 17.09.2019 17:30, Andrew Cooper wrote:
>> On 06/08/2019 14:11, Jan Beulich wrote:
>>> There's no point setting up tables with more space than a PCI device can
>>> use. For both MSI and MSI-X we can determine how many interrupts could
>>> be set up at most. Tables allocated during ACPI table parsing, however,
>>> will (for now at least) continue to be set up to have maximum size.
>>>
>>> Note that until we would want to use sub-page allocations here there's
>>> no point checking whether MSI is supported by a device - 1 or up to 32
>>> (or actually 128, due to the change effectively using a reserved
>>> encoding) IRTEs always mean an order-0 allocation anyway.
>> Devices which are not MSI-capable don't need an interrupt remapping
>> table at all.
> Oh, good point - pin based interrupts use the respective IO-APIC's
> IRTE.

A lot of these devices have no interrupt capabilities at all.

>
>> Per my calculations, the Rome SDP has 62 devices with MSI/MSI-X support,
>> and 98 devices which are CPU-internals that have no interrupt support at
>> all.
>>
>> In comparison, for a production Cascade Lake system I have to hand, the
>> stats are 92 non-MSI devices and 18 MSI-capable devices (which isn't a
>> valid direct comparison due to how VT-d's remapping tables work, but is
>> a datapoint on "similar looking systems").
>>
>> I'm happy to leave "no IRTs for non-capable devices" for future work,
>> but at the very least, I don't think the commit message wants phrasing
>> in exactly this way.
> I think it would be better to correct this right away, before it
> goes in. I don't think it'll be overly much of a change to add an
> MSI capability check next to the MSI-X one.

Ok.  Even better.

>
>>> --- a/xen/drivers/passthrough/amd/iommu_init.c
>>> +++ b/xen/drivers/passthrough/amd/iommu_init.c
>>> @@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
>>>              }
>>>  
>>>              amd_iommu_set_intremap_table(
>>> -                dte,
>>> -                ivrs_mappings[bdf].intremap_table
>>> -                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
>>> -                : 0,
>>> -                iommu_intremap);
>>> +                dte, ivrs_mappings[bdf].intremap_table,
>>> +                ivrs_mappings[bdf].iommu, iommu_intremap);
>> Ah - half of this looks like it wants to be in patch 6, rather than here.
> Hmm, which half?

The dropping of the ternary expression.

> I don't see anything misplaced here. The signature
> of amd_iommu_set_intremap_table() changes only in this patch, not in
> patch 6.

If the code isn't misplaced, I can't spot why it is necessary before
this patch.

>
>>> @@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
>>>  
>>>  static void dump_intremap_tables(unsigned char key);
>>>  
>>> -static unsigned int __init intremap_table_order(const struct
>>> amd_iommu *iommu)
>>> -{
>>> -    return iommu->ctrl.ga_en
>>> -           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>>> irte128))
>>> -           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>>> irte32));
>>> -}
>>> +#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
>> What makes the frameable order field safe to use?  It reaches into
>> (pg)->v.free.order which fairly obviously isn't safe for allocated pages.
> The same argument that allows xmalloc_whole_pages() and xfree() to
> use this field: It is the owner of a page who controls how the
> individual sub-fields of a union get used. As long as v.inuse and
> v.sh don't get used, (ab)using v.free for an allocated page is
> quite fine.

In which case I think v.free needs renaming and/or the comment for
PFN_ORDER() needs rewriting.

The current code/documentation does not give the impression that the
current uses of PFN_ORDER() are safe.

Judging by the other users (particularly the IOMMU code), it would be
best in a field called opaque or similar.

>
>> virt_to_page() is a non-trivial calculation, which is now used in a
>> large number of circumstances.  I don't have an easy judgement of
>> whether they are hotpaths, but surely it would be easier to just store
>> another unsigned int per device.
> Except this would be a vendor specific field in a supposedly
> vendor agnostic structure. I'm not very keen to add such a field.
> Also I don't think interrupt setup/teardown paths would normally
> be considered "hot".
>
> What I could be talked into (to limit code size in case the
> compiler doesn't expand things inline overly aggressively) is to
> make this a static function instead of a macro.

I considered asking for that, but I expected not to get very far, given
that the use of PFN_ORDER() as an lvalue seems to be the prevailing idiom.

>> Furthermore, it would work around a preexisting issue where we can
>> allocate beyond the number of interrupts for the device, up to the next
>> order boundary.
> Is this really an "issue", rather than just an irrelevant side
> effect (which is never going to be hit as long as other layers
> work correctly, in that they bound requests appropriately)?

It's not major, certainly (and I wouldn't object to the patch for this
issue in isolation), but defensive coding is something to consider when
the underlying algorithm is up for debate.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 10/10] AMD/IOMMU: restrict interrupt remapping table sizes

Posted by Jan Beulich 12 weeks ago
On 18.09.2019 12:45, Andrew Cooper wrote:
> On 18/09/2019 09:11, Jan Beulich wrote:
>> On 17.09.2019 17:30, Andrew Cooper wrote:
>>> On 06/08/2019 14:11, Jan Beulich wrote:
>>>> There's no point setting up tables with more space than a PCI device can
>>>> use. For both MSI and MSI-X we can determine how many interrupts could
>>>> be set up at most. Tables allocated during ACPI table parsing, however,
>>>> will (for now at least) continue to be set up to have maximum size.
>>>>
>>>> Note that until we would want to use sub-page allocations here there's
>>>> no point checking whether MSI is supported by a device - 1 or up to 32
>>>> (or actually 128, due to the change effectively using a reserved
>>>> encoding) IRTEs always mean an order-0 allocation anyway.
>>> Devices which are not MSI-capable don't need an interrupt remapping
>>> table at all.
>> Oh, good point - pin based interrupts use the respective IO-APIC's
>> IRTE.
> 
> A lot of these devices have no interrupt capabilities at all.

Right, but this would be harder to tell than "is neither MSI-X nor
MSI capable".

>>>> --- a/xen/drivers/passthrough/amd/iommu_init.c
>>>> +++ b/xen/drivers/passthrough/amd/iommu_init.c
>>>> @@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
>>>>              }
>>>>  
>>>>              amd_iommu_set_intremap_table(
>>>> -                dte,
>>>> -                ivrs_mappings[bdf].intremap_table
>>>> -                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
>>>> -                : 0,
>>>> -                iommu_intremap);
>>>> +                dte, ivrs_mappings[bdf].intremap_table,
>>>> +                ivrs_mappings[bdf].iommu, iommu_intremap);
>>> Ah - half of this looks like it wants to be in patch 6, rather than here.
>> Hmm, which half?
> 
> The dropping of the ternary expression.
> 
>> I don't see anything misplaced here. The signature
>> of amd_iommu_set_intremap_table() changes only in this patch, not in
>> patch 6.
> 
> If the code isn't misplaced, I can't spot why it is necessary before
> this patch.

As explained in the context of the patch introducing it - there
is a way for ivrs_mappings[bdf].intremap_table to be NULL here.
The handling of this case merely gets moved into
amd_iommu_set_intremap_table() by the patch here.

>>>> @@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
>>>>  
>>>>  static void dump_intremap_tables(unsigned char key);
>>>>  
>>>> -static unsigned int __init intremap_table_order(const struct
>>>> amd_iommu *iommu)
>>>> -{
>>>> -    return iommu->ctrl.ga_en
>>>> -           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>>>> irte128))
>>>> -           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union
>>>> irte32));
>>>> -}
>>>> +#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
>>> What makes the frameable order field safe to use?  It reaches into
>>> (pg)->v.free.order which fairly obviously isn't safe for allocated pages.
>> The same argument that allows xmalloc_whole_pages() and xfree() to
>> use this field: It is the owner of a page who controls how the
>> individual sub-fields of a union get used. As long as v.inuse and
>> v.sh don't get used, (ab)using v.free for an allocated page is
>> quite fine.
> 
> In which case I think v.free needs renaming and/or the comment for
> PFN_ORDER() needs rewriting.
> 
> The current code/documentation does not give the impression that the
> current uses of PFN_ORDER() are safe.
> 
> Judging by the other users (particularly the IOMMU code), it would be
> best in a field called opaque or similar.

Well, perhaps. But I don't fancy doing this in this series.

>>> virt_to_page() is a non-trivial calculation, which is now used in a
>>> large number of circumstances.  I don't have an easy judgement of
>>> whether they are hotpaths, but surely it would be easier to just store
>>> another unsigned int per device.
>> Except this would be a vendor specific field in a supposedly
>> vendor agnostic structure. I'm not very keen to add such a field.
>> Also I don't think interrupt setup/teardown paths would normally
>> be considered "hot".
>>
>> What I could be talked into (to limit code size in case the
>> compiler doesn't expand things inline overly aggressively) is to
>> make this a static function instead of a macro.
> 
> I considered asking for that, but I expected not to get very far, given
> that the use of PFN_ORDER() as an lvalue seems to be the prevailing idiom.

Oh, of course - that's why it's a macro in the first place. It's
been a long while since I put together this change ...

>>> Furthermore, it would work around a preexisting issue where we can
>>> allocate beyond the number of interrupts for the device, up to the next
>>> order boundary.
>> Is this really an "issue", rather than just an irrelevant side
>> effect (which is never going to be hit as long as other layers
>> work correctly, in that they bound requests appropriately)?
> 
> It's not major, certainly (and I wouldn't object to the patch for this
> issue in isolation), but defensive coding is something to consider when
> the underlying algorithm is up for debate.

Understood. But since you didn't comment on why I wouldn't like
to introduce a per-device field as you suggest, it's not clear
whether you accept that argument. But wait - I effectively have
such a field now (after introducing msi_maxvec, i.e. it's sort
of a split field). The real problem is that the device isn't
always available when we want to get at this value (and there
may not even be one, like for the shared table, or for IO-APIC
and HPET entries). Having checked again, what might work out is
to add a field to struct ivrs_mappings.

But if I went there, then I could as well do away with full
page allocations, yet I didn't want to go quite as far for now.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel