[v2] IOMMU: superpage support when not sharing pagetables

[PATCH v2 13/18] VT-d: allow use of superpage mappings

Posted by Jan Beulich 4 years, 4 months ago

... depending on feature availability (and absence of quirks).

Also make the page table dumping function aware of superpages.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -743,18 +743,37 @@ static int __must_check iommu_flush_iotl
     return iommu_flush_iotlb(d, INVALID_DFN, 0, 0);
 }
 
+static void queue_free_pt(struct domain *d, mfn_t mfn, unsigned int next_level)
+{
+    if ( next_level > 1 )
+    {
+        struct dma_pte *pt = map_domain_page(mfn);
+        unsigned int i;
+
+        for ( i = 0; i < PTE_NUM; ++i )
+            if ( dma_pte_present(pt[i]) && !dma_pte_superpage(pt[i]) )
+                queue_free_pt(d, maddr_to_mfn(dma_pte_addr(pt[i])),
+                              next_level - 1);
+
+        unmap_domain_page(pt);
+    }
+
+    iommu_queue_free_pgtable(d, mfn_to_page(mfn));
+}
+
 /* clear one page's page table */
 static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
                              unsigned int order,
                              unsigned int *flush_flags)
 {
     struct domain_iommu *hd = dom_iommu(domain);
-    struct dma_pte *page = NULL, *pte = NULL;
+    struct dma_pte *page = NULL, *pte = NULL, old;
     u64 pg_maddr;
+    unsigned int level = (order / LEVEL_STRIDE) + 1;
 
     spin_lock(&hd->arch.mapping_lock);
-    /* get last level pte */
-    pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags, false);
+    /* get target level pte */
+    pg_maddr = addr_to_dma_page_maddr(domain, addr, level, flush_flags, false);
     if ( pg_maddr < PAGE_SIZE )
     {
         spin_unlock(&hd->arch.mapping_lock);
@@ -762,7 +781,7 @@ static int dma_pte_clear_one(struct doma
     }
 
     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-    pte = page + address_level_offset(addr, 1);
+    pte = &page[address_level_offset(addr, level)];
 
     if ( !dma_pte_present(*pte) )
     {
@@ -771,14 +790,19 @@ static int dma_pte_clear_one(struct doma
         return 0;
     }
 
+    old = *pte;
     dma_clear_pte(*pte);
-    *flush_flags |= IOMMU_FLUSHF_modified;
 
     spin_unlock(&hd->arch.mapping_lock);
     iommu_sync_cache(pte, sizeof(struct dma_pte));
 
     unmap_vtd_domain_page(page);
 
+    *flush_flags |= IOMMU_FLUSHF_modified;
+
+    if ( level > 1 && !dma_pte_superpage(old) )
+        queue_free_pt(domain, maddr_to_mfn(dma_pte_addr(old)), level - 1);
+
     return 0;
 }
 
@@ -1868,6 +1892,7 @@ static int __must_check intel_iommu_map_
     struct domain_iommu *hd = dom_iommu(d);
     struct dma_pte *page, *pte, old, new = {};
     u64 pg_maddr;
+    unsigned int level = (IOMMUF_order(flags) / LEVEL_STRIDE) + 1;
     int rc = 0;
 
     /* Do nothing if VT-d shares EPT page table */
@@ -1892,7 +1917,7 @@ static int __must_check intel_iommu_map_
         return 0;
     }
 
-    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1, flush_flags,
+    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), level, flush_flags,
                                       true);
     if ( pg_maddr < PAGE_SIZE )
     {
@@ -1901,13 +1926,15 @@ static int __must_check intel_iommu_map_
     }
 
     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-    pte = &page[dfn_x(dfn) & LEVEL_MASK];
+    pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
     old = *pte;
 
     dma_set_pte_addr(new, mfn_to_maddr(mfn));
     dma_set_pte_prot(new,
                      ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
                      ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
+    if ( IOMMUF_order(flags) )
+        dma_set_pte_superpage(new);
 
     /* Set the SNP on leaf page table if Snoop Control available */
     if ( iommu_snoop )
@@ -1928,8 +1955,13 @@ static int __must_check intel_iommu_map_
 
     *flush_flags |= IOMMU_FLUSHF_added;
     if ( dma_pte_present(old) )
+    {
         *flush_flags |= IOMMU_FLUSHF_modified;
 
+        if ( level > 1 && !dma_pte_superpage(old) )
+            queue_free_pt(d, maddr_to_mfn(dma_pte_addr(old)), level - 1);
+    }
+
     return rc;
 }
 
@@ -2286,6 +2318,7 @@ static int __init vtd_setup(void)
 {
     struct acpi_drhd_unit *drhd;
     struct vtd_iommu *iommu;
+    unsigned int large_sizes = PAGE_SIZE_2M | PAGE_SIZE_1G;
     int ret;
     bool reg_inval_supported = true;
 
@@ -2328,6 +2361,11 @@ static int __init vtd_setup(void)
                cap_sps_2mb(iommu->cap) ? ", 2MB" : "",
                cap_sps_1gb(iommu->cap) ? ", 1GB" : "");
 
+        if ( !cap_sps_2mb(iommu->cap) )
+            large_sizes &= ~PAGE_SIZE_2M;
+        if ( !cap_sps_1gb(iommu->cap) )
+            large_sizes &= ~PAGE_SIZE_1G;
+
 #ifndef iommu_snoop
         if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
             iommu_snoop = false;
@@ -2399,6 +2437,9 @@ static int __init vtd_setup(void)
     if ( ret )
         goto error;
 
+    ASSERT(iommu_ops.page_sizes & PAGE_SIZE_4K);
+    iommu_ops.page_sizes |= large_sizes;
+
     register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1);
 
     return 0;
@@ -2712,7 +2753,7 @@ static void vtd_dump_page_table_level(pa
             continue;
 
         address = gpa + offset_level_address(i, level);
-        if ( next_level >= 1 ) 
+        if ( next_level && !dma_pte_superpage(*pte) )
             vtd_dump_page_table_level(dma_pte_addr(*pte), next_level,
                                       address, indent + 1);
         else

Re: [PATCH v2 13/18] VT-d: allow use of superpage mappings

Posted by Roger Pau Monné 4 years, 1 month ago

On Fri, Sep 24, 2021 at 11:52:47AM +0200, Jan Beulich wrote:
> ... depending on feature availability (and absence of quirks).
> 
> Also make the page table dumping function aware of superpages.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Just some minor nits.

> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -743,18 +743,37 @@ static int __must_check iommu_flush_iotl
>      return iommu_flush_iotlb(d, INVALID_DFN, 0, 0);
>  }
>  
> +static void queue_free_pt(struct domain *d, mfn_t mfn, unsigned int next_level)

Same comment as the AMD side patch, about naming the parameter just
level.

> @@ -1901,13 +1926,15 @@ static int __must_check intel_iommu_map_
>      }
>  
>      page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
> -    pte = &page[dfn_x(dfn) & LEVEL_MASK];
> +    pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
>      old = *pte;
>  
>      dma_set_pte_addr(new, mfn_to_maddr(mfn));
>      dma_set_pte_prot(new,
>                       ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
>                       ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
> +    if ( IOMMUF_order(flags) )

You seem to use level > 1 in other places to check for whether the
entry is intended to be a super-page. Is there any reason to use
IOMMUF_order here instead?


> @@ -2328,6 +2361,11 @@ static int __init vtd_setup(void)
>                 cap_sps_2mb(iommu->cap) ? ", 2MB" : "",
>                 cap_sps_1gb(iommu->cap) ? ", 1GB" : "");
>  
> +        if ( !cap_sps_2mb(iommu->cap) )
> +            large_sizes &= ~PAGE_SIZE_2M;
> +        if ( !cap_sps_1gb(iommu->cap) )
> +            large_sizes &= ~PAGE_SIZE_1G;
> +
>  #ifndef iommu_snoop
>          if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
>              iommu_snoop = false;
> @@ -2399,6 +2437,9 @@ static int __init vtd_setup(void)
>      if ( ret )
>          goto error;
>  
> +    ASSERT(iommu_ops.page_sizes & PAGE_SIZE_4K);

Since you are adding the assert, it might be more complete to check no
other page sizes are set, iommu_ops.page_sizes == PAGE_SIZE_4K?

Thanks, Roger.

Re: [PATCH v2 13/18] VT-d: allow use of superpage mappings

Posted by Jan Beulich 4 years, 1 month ago

On 13.12.2021 12:54, Roger Pau Monné wrote:
> On Fri, Sep 24, 2021 at 11:52:47AM +0200, Jan Beulich wrote:
>> --- a/xen/drivers/passthrough/vtd/iommu.c
>> +++ b/xen/drivers/passthrough/vtd/iommu.c
>> @@ -743,18 +743,37 @@ static int __must_check iommu_flush_iotl
>>      return iommu_flush_iotlb(d, INVALID_DFN, 0, 0);
>>  }
>>  
>> +static void queue_free_pt(struct domain *d, mfn_t mfn, unsigned int next_level)
> 
> Same comment as the AMD side patch, about naming the parameter just
> level.

Sure, will change.

>> @@ -1901,13 +1926,15 @@ static int __must_check intel_iommu_map_
>>      }
>>  
>>      page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
>> -    pte = &page[dfn_x(dfn) & LEVEL_MASK];
>> +    pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
>>      old = *pte;
>>  
>>      dma_set_pte_addr(new, mfn_to_maddr(mfn));
>>      dma_set_pte_prot(new,
>>                       ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
>>                       ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
>> +    if ( IOMMUF_order(flags) )
> 
> You seem to use level > 1 in other places to check for whether the
> entry is intended to be a super-page. Is there any reason to use
> IOMMUF_order here instead?

"flags" is the original source of information here, so it seemed more
natural to use it. The following hunk uses "level > 1" to better
match the similar unmap logic as well as AMD code. Maybe I should
change those to also use "flags" (or "order" in the unmap case), as
that would allow re-using the local variable in the new patches in v3
doing the re-coalescing of present superpages (right now I'm using a
second, not very nicely named variable there).

I'll have to think about this some more and check whether other
issues would arise if I made such a change.

>> @@ -2328,6 +2361,11 @@ static int __init vtd_setup(void)
>>                 cap_sps_2mb(iommu->cap) ? ", 2MB" : "",
>>                 cap_sps_1gb(iommu->cap) ? ", 1GB" : "");
>>  
>> +        if ( !cap_sps_2mb(iommu->cap) )
>> +            large_sizes &= ~PAGE_SIZE_2M;
>> +        if ( !cap_sps_1gb(iommu->cap) )
>> +            large_sizes &= ~PAGE_SIZE_1G;
>> +
>>  #ifndef iommu_snoop
>>          if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
>>              iommu_snoop = false;
>> @@ -2399,6 +2437,9 @@ static int __init vtd_setup(void)
>>      if ( ret )
>>          goto error;
>>  
>> +    ASSERT(iommu_ops.page_sizes & PAGE_SIZE_4K);
> 
> Since you are adding the assert, it might be more complete to check no
> other page sizes are set, iommu_ops.page_sizes == PAGE_SIZE_4K?

Ah yes, would make sense. Let me change this.

Jan