To be able to insert/remove super-pages we need to allow callers of the
walking function to specify at which level to stop the walk.
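
For illustration only (a sketch, not part of this patch): a hypothetical
superpage-aware caller wanting to install a 2M mapping could stop the
walk at level 2, using the same return value convention as the existing
level-1 callers below.

    /*
     * Illustrative sketch: stop the walk at the level-2 table (2M leaf).
     * Return values below PAGE_SIZE signal "not present" (0) or an
     * allocation failure (the level at which it occurred).
     */
    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 2, flush_flags,
                                      true);
    if ( pg_maddr < PAGE_SIZE )
    {
        spin_unlock(&hd->arch.mapping_lock);
        return -ENOMEM;
    }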
For intel_iommu_lookup_page(), integrate the last-level access into the
main walking function.
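
With target == 0 the walk hands back the full leaf PTE, its address
already adjusted for the residual of the walk, so the caller merely
decodes it.  A sketch (mirroring the hunk further down, locking omitted):

    val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false);
    if ( val < PAGE_SIZE )        /* non-present */
        return -ENOENT;

    /*
     * For a (future) superpage leaf the in-superpage offset is already
     * folded into val, so this yields the precise 4k frame.
     */
    *mfn = maddr_to_mfn(val);
    *flags = val & DMA_PTE_READ ? IOMMUF_readable : 0;
    *flags |= val & DMA_PTE_WRITE ? IOMMUF_writable : 0;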
dma_pte_clear_one() gets only partly adjusted for now: Error handling
and an order parameter get put in place, but the order parameter remains
ignored (just like the order part of intel_iommu_map_page()'s flags).
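
For reference, a sketch of the return value convention the call sites
now rely on (locking omitted; this mirrors the dma_pte_clear_one() hunk):

    /*
     * addr_to_dma_page_maddr() now returns:
     *   0                     - no mapping present (nothing to clear),
     *   0 < value < PAGE_SIZE - allocation failure at that level,
     *   >= PAGE_SIZE          - maddr of the table holding the leaf PTE.
     */
    pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags, false);
    if ( pg_maddr < PAGE_SIZE )
        return pg_maddr ? -ENOMEM : 0;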
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I have to admit that I don't understand why domain_pgd_maddr() wants to
populate all page table levels for DFN 0.
I was actually wondering whether it wouldn't make sense to integrate
dma_pte_clear_one() into its only caller intel_iommu_unmap_page(), for
better symmetry with intel_iommu_map_page().
---
v2: Fix build.
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -264,63 +264,116 @@ static u64 bus_to_context_maddr(struct v
return maddr;
}
-static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
+/*
+ * This function walks (and if requested allocates) page tables to the
+ * designated target level. It returns
+ * - 0 when a non-present entry was encountered and no allocation was
+ * requested,
+ * - a small positive value (the level, i.e. below PAGE_SIZE) upon allocation
+ * failure,
+ * - for target > 0 the address of the page table holding the leaf PTE for
+ * the requested address,
+ * - for target == 0 the full PTE.
+ */
+static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr,
+ unsigned int target,
+ unsigned int *flush_flags, bool alloc)
{
struct domain_iommu *hd = dom_iommu(domain);
int addr_width = agaw_to_width(hd->arch.vtd.agaw);
struct dma_pte *parent, *pte = NULL;
- int level = agaw_to_level(hd->arch.vtd.agaw);
- int offset;
+ unsigned int level = agaw_to_level(hd->arch.vtd.agaw), offset;
u64 pte_maddr = 0;
addr &= (((u64)1) << addr_width) - 1;
ASSERT(spin_is_locked(&hd->arch.mapping_lock));
+ ASSERT(target || !alloc);
+
if ( !hd->arch.vtd.pgd_maddr )
{
struct page_info *pg;
- if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
+ if ( !alloc )
+ goto out;
+
+ pte_maddr = level;
+ if ( !(pg = iommu_alloc_pgtable(domain)) )
goto out;
hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
}
- parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
- while ( level > 1 )
+ pte_maddr = hd->arch.vtd.pgd_maddr;
+ parent = map_vtd_domain_page(pte_maddr);
+ while ( level > target )
{
offset = address_level_offset(addr, level);
pte = &parent[offset];
pte_maddr = dma_pte_addr(*pte);
- if ( !pte_maddr )
+ if ( !dma_pte_present(*pte) || (level > 1 && dma_pte_superpage(*pte)) )
{
struct page_info *pg;
+ /*
+ * Higher level tables always set r/w, last level page table
+ * controls read/write.
+ */
+ struct dma_pte new_pte = { DMA_PTE_PROT };
if ( !alloc )
- break;
+ {
+ pte_maddr = 0;
+ if ( !dma_pte_present(*pte) )
+ break;
+
+ /*
+ * When the leaf entry was requested, pass back the full PTE,
+ * with the address adjusted to account for the residual of
+ * the walk.
+ */
+ pte_maddr = pte->val +
+ (addr & ((1UL << level_to_offset_bits(level)) - 1) &
+ PAGE_MASK);
+ if ( !target )
+ break;
+ }
+ pte_maddr = level - 1;
pg = iommu_alloc_pgtable(domain);
if ( !pg )
break;
pte_maddr = page_to_maddr(pg);
- dma_set_pte_addr(*pte, pte_maddr);
+ dma_set_pte_addr(new_pte, pte_maddr);
- /*
- * high level table always sets r/w, last level
- * page table control read/write
- */
- dma_set_pte_readable(*pte);
- dma_set_pte_writable(*pte);
+ if ( dma_pte_present(*pte) )
+ {
+ struct dma_pte *split = map_vtd_domain_page(pte_maddr);
+ unsigned long inc = 1UL << level_to_offset_bits(level - 1);
+
+ split[0].val = pte->val;
+ if ( inc == PAGE_SIZE )
+ split[0].val &= ~DMA_PTE_SP;
+
+ for ( offset = 1; offset < PTE_NUM; ++offset )
+ split[offset].val = split[offset - 1].val + inc;
+
+ iommu_sync_cache(split, PAGE_SIZE);
+ unmap_vtd_domain_page(split);
+
+ if ( flush_flags )
+ *flush_flags |= IOMMU_FLUSHF_modified;
+ }
+
+ write_atomic(&pte->val, new_pte.val);
iommu_sync_cache(pte, sizeof(struct dma_pte));
}
- if ( level == 2 )
+ if ( --level == target )
break;
unmap_vtd_domain_page(parent);
parent = map_vtd_domain_page(pte_maddr);
- level--;
}
unmap_vtd_domain_page(parent);
@@ -346,7 +399,7 @@ static uint64_t domain_pgd_maddr(struct
if ( !hd->arch.vtd.pgd_maddr )
{
/* Ensure we have pagetables allocated down to leaf PTE. */
- addr_to_dma_page_maddr(d, 0, 1);
+ addr_to_dma_page_maddr(d, 0, 1, NULL, true);
if ( !hd->arch.vtd.pgd_maddr )
return 0;
@@ -691,8 +744,9 @@ static int __must_check iommu_flush_iotl
}
/* clear one page's page table */
-static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
- unsigned int *flush_flags)
+static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
+ unsigned int order,
+ unsigned int *flush_flags)
{
struct domain_iommu *hd = dom_iommu(domain);
struct dma_pte *page = NULL, *pte = NULL;
@@ -700,11 +754,11 @@ static void dma_pte_clear_one(struct dom
spin_lock(&hd->arch.mapping_lock);
/* get last level pte */
- pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
- if ( pg_maddr == 0 )
+ pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags, false);
+ if ( pg_maddr < PAGE_SIZE )
{
spin_unlock(&hd->arch.mapping_lock);
- return;
+ return pg_maddr ? -ENOMEM : 0;
}
page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
@@ -714,7 +768,7 @@ static void dma_pte_clear_one(struct dom
{
spin_unlock(&hd->arch.mapping_lock);
unmap_vtd_domain_page(page);
- return;
+ return 0;
}
dma_clear_pte(*pte);
@@ -724,6 +778,8 @@ static void dma_pte_clear_one(struct dom
iommu_sync_cache(pte, sizeof(struct dma_pte));
unmap_vtd_domain_page(page);
+
+ return 0;
}
static int iommu_set_root_entry(struct vtd_iommu *iommu)
@@ -1836,8 +1892,9 @@ static int __must_check intel_iommu_map_
return 0;
}
- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1);
- if ( !pg_maddr )
+ pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1, flush_flags,
+ true);
+ if ( pg_maddr < PAGE_SIZE )
{
spin_unlock(&hd->arch.mapping_lock);
return -ENOMEM;
@@ -1887,17 +1944,14 @@ static int __must_check intel_iommu_unma
if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
return 0;
- dma_pte_clear_one(d, dfn_to_daddr(dfn), flush_flags);
-
- return 0;
+ return dma_pte_clear_one(d, dfn_to_daddr(dfn), 0, flush_flags);
}
static int intel_iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn,
unsigned int *flags)
{
struct domain_iommu *hd = dom_iommu(d);
- struct dma_pte *page, val;
- u64 pg_maddr;
+ uint64_t val;
/*
* If VT-d shares EPT page table or if the domain is the hardware
@@ -1909,25 +1963,16 @@ static int intel_iommu_lookup_page(struc
spin_lock(&hd->arch.mapping_lock);
- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0);
- if ( !pg_maddr )
- {
- spin_unlock(&hd->arch.mapping_lock);
- return -ENOENT;
- }
-
- page = map_vtd_domain_page(pg_maddr);
- val = page[dfn_x(dfn) & LEVEL_MASK];
+ val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false);
- unmap_vtd_domain_page(page);
spin_unlock(&hd->arch.mapping_lock);
- if ( !dma_pte_present(val) )
+ if ( val < PAGE_SIZE )
return -ENOENT;
- *mfn = maddr_to_mfn(dma_pte_addr(val));
- *flags = dma_pte_read(val) ? IOMMUF_readable : 0;
- *flags |= dma_pte_write(val) ? IOMMUF_writable : 0;
+ *mfn = maddr_to_mfn(val);
+ *flags = val & DMA_PTE_READ ? IOMMUF_readable : 0;
+ *flags |= val & DMA_PTE_WRITE ? IOMMUF_writable : 0;
return 0;
}