[PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Jan Beulich 3 years, 2 months ago
This is to aid diagnosing issues and largely matches VT-d's behavior.
Since I'm adding permissions output here as well, take the opportunity
to also display them in amd_dump_page_table_level().

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/drivers/passthrough/amd/iommu.h
+++ b/xen/drivers/passthrough/amd/iommu.h
@@ -243,6 +243,8 @@ int __must_check amd_iommu_flush_iotlb_p
                                              unsigned long page_count,
                                              unsigned int flush_flags);
 int __must_check amd_iommu_flush_iotlb_all(struct domain *d);
+void amd_iommu_print_entries(const struct amd_iommu *iommu, unsigned int dev_id,
+                             dfn_t dfn);
 
 /* device table functions */
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -573,6 +573,9 @@ static void parse_event_log_entry(struct
                (flags & 0x002) ? " NX" : "",
                (flags & 0x001) ? " GN" : "");
 
+        if ( iommu_verbose )
+            amd_iommu_print_entries(iommu, device_id, daddr_to_dfn(addr));
+
         for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
             if ( get_dma_requestor_id(iommu->seg, bdf) == device_id )
                 pci_check_disable_device(iommu->seg, PCI_BUS(bdf),
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -363,6 +363,50 @@ int amd_iommu_unmap_page(struct domain *
     return 0;
 }
 
+void amd_iommu_print_entries(const struct amd_iommu *iommu, unsigned int dev_id,
+                             dfn_t dfn)
+{
+    mfn_t pt_mfn;
+    unsigned int level;
+    const struct amd_iommu_dte *dt = iommu->dev_table.buffer;
+
+    if ( !dt[dev_id].tv )
+    {
+        printk("%pp: no root\n", &PCI_SBDF2(iommu->seg, dev_id));
+        return;
+    }
+
+    pt_mfn = _mfn(dt[dev_id].pt_root);
+    level = dt[dev_id].paging_mode;
+    printk("%pp root @ %"PRI_mfn" (%u levels) dfn=%"PRI_dfn"\n",
+           &PCI_SBDF2(iommu->seg, dev_id), mfn_x(pt_mfn), level, dfn_x(dfn));
+
+    while ( level )
+    {
+        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
+        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
+        union amd_iommu_pte pte = pt[idx];
+
+        unmap_domain_page(pt);
+
+        printk("  L%u[%03x] = %"PRIx64" %c%c\n", level, idx, pte.raw,
+               pte.pr ? pte.ir ? 'r' : '-' : 'n',
+               pte.pr ? pte.iw ? 'w' : '-' : 'p');
+
+        if ( !pte.pr )
+            break;
+
+        if ( pte.next_level >= level )
+        {
+            printk("  L%u[%03x]: next: %u\n", level, idx, pte.next_level);
+            break;
+        }
+
+        pt_mfn = _mfn(pte.mfn);
+        level = pte.next_level;
+    }
+}
+
 static unsigned long flush_count(unsigned long dfn, unsigned long page_count,
                                  unsigned int order)
 {
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -607,10 +607,11 @@ static void amd_dump_page_table_level(st
                 mfn_to_page(_mfn(pde->mfn)), pde->next_level,
                 address, indent + 1);
         else
-            printk("%*sdfn: %08lx  mfn: %08lx\n",
+            printk("%*sdfn: %08lx  mfn: %08lx  %c%c\n",
                    indent, "",
                    (unsigned long)PFN_DOWN(address),
-                   (unsigned long)PFN_DOWN(pfn_to_paddr(pde->mfn)));
+                   (unsigned long)PFN_DOWN(pfn_to_paddr(pde->mfn)),
+                   pde->ir ? 'r' : '-', pde->iw ? 'w' : '-');
     }
 
     unmap_domain_page(table_vaddr);
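
As an aside, the per-level index computation used by the walk can be
pictured with a small standalone sketch. This is a hypothetical
simplification of what pfn_to_pde_idx() computes (AMD IOMMU page
tables hold 512 entries per level, i.e. 9 DFN bits per level); the
actual Xen macro may differ in detail:

    /*
     * Illustration only, not the Xen macro: each level of an AMD
     * IOMMU page table decodes 9 bits of the DFN, with level 1
     * covering the least significant 9 bits.
     */
    static unsigned int pde_idx(unsigned long dfn, unsigned int level)
    {
        return (dfn >> (9 * (level - 1))) & 0x1ff;
    }

For instance, with 3 levels a dfn of 0x40201 decodes to index 1 at
each of levels 3, 2 and 1.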


Re: [PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Roger Pau Monné 2 years, 12 months ago
On Fri, Sep 24, 2021 at 11:51:15AM +0200, Jan Beulich wrote:
> This is to aid diagnosing issues and largely matches VT-d's behavior.
> Since I'm adding permissions output here as well, take the opportunity
> to also display them in amd_dump_page_table_level().
> [...]
> +    while ( level )
> +    {
> +        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
> +        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
> +        union amd_iommu_pte pte = pt[idx];

Don't you need to take a lock here (mapping_lock maybe?) in order to
prevent changes to the IOMMU page tables while walking them?
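
For concreteness, a minimal sketch of what such locking might look
like, assuming a struct domain *d were available at this point (the
names follow the existing mapping-lock usage elsewhere in the driver,
but this is an assumption, not a proposal):

    /*
     * Hypothetical sketch only: guard the walk with the owning
     * domain's mapping lock so the tables cannot change mid-walk.
     */
    spin_lock(&dom_iommu(d)->arch.mapping_lock);
    /* ... the level-by-level walk as above ... */
    spin_unlock(&dom_iommu(d)->arch.mapping_lock);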

Thanks, Roger.

Re: [PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Jan Beulich 2 years, 12 months ago
On 03.12.2021 10:03, Roger Pau Monné wrote:
> On Fri, Sep 24, 2021 at 11:51:15AM +0200, Jan Beulich wrote:
>> This is to aid diagnosing issues and largely matches VT-d's behavior.
>> Since I'm adding permissions output here as well, take the opportunity
>> to also display them in amd_dump_page_table_level().
>> [...]
>> +    while ( level )
>> +    {
>> +        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
>> +        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
>> +        union amd_iommu_pte pte = pt[idx];
> 
> Don't you need to take a lock here (mapping_lock maybe?) in order to
> prevent changes to the IOMMU page tables while walking them?

Generally speaking - yes. But see the description saying "largely
matches VT-d's behavior". On VT-d both the IOMMU lock and the mapping
lock would need acquiring to be safe (the former could perhaps be
dropped early). Likewise here. If I wanted to do so here, I ought to
add a prereq patch adjusting the VT-d function. The main "excuse" not
to do so is/was probably the size of the series.
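
Schematically, a fully safe walk might then be ordered like this.
Purely a hypothetical sketch: read_dte_root() and walk_levels() are
made-up stand-ins for the code in the patch, and 'd' would still need
determining:

    /* Hypothetical lock-ordering sketch, not actual Xen code. */
    spin_lock(&iommu->lock);       /* stabilize the DTE / root pointer */
    root = read_dte_root(iommu, dev_id);
    spin_unlock(&iommu->lock);     /* could perhaps be dropped early */

    spin_lock(&dom_iommu(d)->arch.mapping_lock);
    walk_levels(root, dfn);        /* the level-by-level walk */
    spin_unlock(&dom_iommu(d)->arch.mapping_lock);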

Jan


Re: [PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Jan Beulich 2 years, 12 months ago
On 03.12.2021 10:49, Jan Beulich wrote:
> On 03.12.2021 10:03, Roger Pau Monné wrote:
>> On Fri, Sep 24, 2021 at 11:51:15AM +0200, Jan Beulich wrote:
>>> This is to aid diagnosing issues and largely matches VT-d's behavior.
>>> Since I'm adding permissions output here as well, take the opportunity
>>> to also display them in amd_dump_page_table_level().
>>> [...]
>>> +    while ( level )
>>> +    {
>>> +        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
>>> +        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
>>> +        union amd_iommu_pte pte = pt[idx];
>>
>> Don't you need to take a lock here (mapping_lock maybe?) in order to
>> prevent changes to the IOMMU page tables while walking them?
> 
> Generally speaking - yes. But see the description saying "largely
> matches VT-d's behavior". On VT-d both the IOMMU lock and the mapping
> lock would need acquiring to be safe (the former could perhaps be
> dropped early). Likewise here. If I wanted to do so here, I ought to
> add a prereq patch adjusting the VT-d function. The main "excuse" not
> to do so is/was probably the size of the series.

Which in turn would call for {amd,vtd}_dump_page_tables() to gain proper
locking. Not sure where this would end; these further items are more and
more unrelated to the actual purpose of this series (whereas I needed the
patch here anyway for debugging purposes) ...

Jan


Re: [PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Roger Pau Monné 2 years, 11 months ago
On Fri, Dec 03, 2021 at 10:55:54AM +0100, Jan Beulich wrote:
> On 03.12.2021 10:49, Jan Beulich wrote:
> > On 03.12.2021 10:03, Roger Pau Monné wrote:
> >> On Fri, Sep 24, 2021 at 11:51:15AM +0200, Jan Beulich wrote:
> >>> This is to aid diagnosing issues and largely matches VT-d's behavior.
> >>> Since I'm adding permissions output here as well, take the opportunity
> >>> to also display them in amd_dump_page_table_level().
> >>> [...]
> >>> +    while ( level )
> >>> +    {
> >>> +        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
> >>> +        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
> >>> +        union amd_iommu_pte pte = pt[idx];
> >>
> >> Don't you need to take a lock here (mapping_lock maybe?) in order to
> >> prevent changes to the IOMMU page tables while walking them?
> > 
> > Generally speaking - yes. But see the description saying "largely
> > matches VT-d's behavior". On VT-d both the IOMMU lock and the mapping
> > lock would need acquiring to be safe (the former could perhaps be
> > dropped early). Likewise here. If I wanted to do so here, I ought to
> > add a prereq patch adjusting the VT-d function. The main "excuse" not
> > to do so is/was probably the size of the series.
> 
> Which in turn would call for {amd,vtd}_dump_page_tables() to gain proper
> locking. Not sure where this would end; these further items are more and
> more unrelated to the actual purpose of this series (whereas I needed the
> patch here anyway for debugging purposes) ...

I think adding a comment regarding the lack of locking, due to the
function only being used as a debug aid, would help clarify this. I
don't think we treat running with iommu debug or verbose modes as a
supported configuration anyway.
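
Such a comment might read, for instance (wording only a suggestion):

    /*
     * No locking: this function is merely a debug aid, invoked from
     * the event log handler. The tables may change under our feet,
     * and a stale or torn view is acceptable for diagnostic output.
     */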

Thanks, Roger.

Re: [PATCH v2 10/18] AMD/IOMMU: walk trees upon page fault
Posted by Jan Beulich 2 years, 12 months ago
On 03.12.2021 10:03, Roger Pau Monné wrote:
> On Fri, Sep 24, 2021 at 11:51:15AM +0200, Jan Beulich wrote:
>> This is to aid diagnosing issues and largely matches VT-d's behavior.
>> Since I'm adding permissions output here as well, take the opportunity
>> to also display them in amd_dump_page_table_level().
>> [...]
>> +    while ( level )
>> +    {
>> +        const union amd_iommu_pte *pt = map_domain_page(pt_mfn);
>> +        unsigned int idx = pfn_to_pde_idx(dfn_x(dfn), level);
>> +        union amd_iommu_pte pte = pt[idx];
> 
> Don't you need to take a lock here (mapping_lock maybe?) in order to
> prevent changes to the IOMMU page tables while walking them?

Further to my earlier reply, taking the mapping lock here isn't
straightforward, as that would mean determining the correct domain.
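
To illustrate: only the IOMMU and the requestor ID are in hand here,
so something along the lines of the sketch below would be needed
first. This is approximate code only; it would itself want the
pcidevs lock, and the requestor ID need not map 1:1 to a (single)
device:

    /* Hypothetical sketch of the domain lookup alluded to above. */
    const struct pci_dev *pdev;
    struct domain *d = NULL;

    pcidevs_lock();
    pdev = pci_get_pdev(iommu->seg, PCI_BUS(dev_id), PCI_DEVFN2(dev_id));
    if ( pdev )
        d = pdev->domain;
    pcidevs_unlock();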

Jan