softmmu/memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
From: Xiang Chen <chenxiang66@hisilicon.com>
Currently memory_region_iommu_replay() does a full page table walk with
a fixed granularity (the page size) regardless of whether translate()
succeeds or not. Actually, if translate() succeeds, we can skip ahead by
the translation size (iotlb.addr_mask + 1) instead of the fixed granularity.
Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
---
softmmu/memory.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index bfa5d5178c..ccfa19cf71 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -1924,7 +1924,7 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
{
MemoryRegion *mr = MEMORY_REGION(iommu_mr);
IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
- hwaddr addr, granularity;
+ hwaddr addr, granularity, def_granu;
IOMMUTLBEntry iotlb;
/* If the IOMMU has its own replay callback, override */
@@ -1933,12 +1933,15 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
return;
}
- granularity = memory_region_iommu_get_min_page_size(iommu_mr);
+ def_granu = memory_region_iommu_get_min_page_size(iommu_mr);
for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
if (iotlb.perm != IOMMU_NONE) {
n->notify(n, &iotlb);
+ granularity = iotlb.addr_mask + 1;
+ } else {
+ granularity = def_granu;
}
/* if (2^64 - MR size) < granularity, it's possible to get an
--
2.33.0
On 16.04.22 10:34, chenxiang via wrote: > From: Xiang Chen <chenxiang66@hisilicon.com> > > Currently memory_region_iommu_replay() does full page table walk with > fixed granularity (page size) no matter translate() succeeds or not. > Actually if translate() successfully, we can skip translation size > (iotlb.addr_mask + 1) instead of fixed granularity. > > Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> > --- > softmmu/memory.c | 7 +++++-- > 1 file changed, 5 insertions(+), 2 deletions(-) > > diff --git a/softmmu/memory.c b/softmmu/memory.c > index bfa5d5178c..ccfa19cf71 100644 > --- a/softmmu/memory.c > +++ b/softmmu/memory.c > @@ -1924,7 +1924,7 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) > { > MemoryRegion *mr = MEMORY_REGION(iommu_mr); > IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > - hwaddr addr, granularity; > + hwaddr addr, granularity, def_granu; > IOMMUTLBEntry iotlb; > > /* If the IOMMU has its own replay callback, override */ > @@ -1933,12 +1933,15 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) > return; > } > > - granularity = memory_region_iommu_get_min_page_size(iommu_mr); > + def_granu = memory_region_iommu_get_min_page_size(iommu_mr); "granu" sounds weird. I'd suggest calling this "min_granularity". > > for (addr = 0; addr < memory_region_size(mr); addr += granularity) { > iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx); > if (iotlb.perm != IOMMU_NONE) { > n->notify(n, &iotlb); > + granularity = iotlb.addr_mask + 1; > + } else { > + granularity = def_granu; > } I do wonder if there are cases where we would actually have to round up the addr instead of simply adding the granularity if we suddenly use bigger steps than the min granularity. 
i.e., there would be a difference between addr += granularity and addr = QEMU_ALIGN_UP(addr + min_granularity, granularity); I wonder if there are corner cases where translate() could fail on the first part of a granularity increment but succeed on the second part. My gut feeling is that it should be fine, -- Thanks, David / dhildenb
On Tue, Apr 19, 2022 at 12:14:53PM +0200, David Hildenbrand wrote: > On 16.04.22 10:34, chenxiang via wrote: > > From: Xiang Chen <chenxiang66@hisilicon.com> > > > > Currently memory_region_iommu_replay() does full page table walk with > > fixed granularity (page size) no matter translate() succeeds or not. > > Actually if translate() successfully, we can skip translation size > > (iotlb.addr_mask + 1) instead of fixed granularity. > > > > Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> > > --- > > softmmu/memory.c | 7 +++++-- > > 1 file changed, 5 insertions(+), 2 deletions(-) > > > > diff --git a/softmmu/memory.c b/softmmu/memory.c > > index bfa5d5178c..ccfa19cf71 100644 > > --- a/softmmu/memory.c > > +++ b/softmmu/memory.c > > @@ -1924,7 +1924,7 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) > > { > > MemoryRegion *mr = MEMORY_REGION(iommu_mr); > > IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > > - hwaddr addr, granularity; > > + hwaddr addr, granularity, def_granu; > > IOMMUTLBEntry iotlb; > > > > /* If the IOMMU has its own replay callback, override */ > > @@ -1933,12 +1933,15 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) > > return; > > } > > > > - granularity = memory_region_iommu_get_min_page_size(iommu_mr); > > + def_granu = memory_region_iommu_get_min_page_size(iommu_mr); > > "granu" sounds weird. I'd suggest calling this "min_granularity". > > > > > for (addr = 0; addr < memory_region_size(mr); addr += granularity) { > > iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx); > > if (iotlb.perm != IOMMU_NONE) { > > n->notify(n, &iotlb); > > + granularity = iotlb.addr_mask + 1; > > + } else { > > + granularity = def_granu; > > } > > I do wonder if there are cases where we would actually have to round up > the addr instead of simply adding the granularity if we suddenly use > bigger steps then the min granularity. 
> > i.e., there would be a difference between > > addr += granularity > > and > > addr = QEMU_ALIGN_UP(addr + min_granularity, granularity); > > I wonder if there are corner cases where translate() could fail on the > first part of a granularity increment but succeed on the second part. My > gut feeling is that it should be fine, I'd make it a harder requirement, since afaik the translation should always happen with one or multiple of min page size, so logically that's: granularity = iotlb.addr_mask + 1; assert(QEMU_IS_ALIGNED(granularity, def_granu)); Chenxiang, could you introduce more context of this change? In what case you found it slow and how this speeds things up? Obviously this is not for x86 because x86 doesn't even run this code path. Then I'm curious in what scenario would this be helpful, and whether it would always help. Say, the IOMMU_NONE cases are not being covered by this speedup, and if there're a bunch of IOMMU_NONE mappings under this region, it'll still be slow, won't it? Thanks, -- Peter Xu
© 2016 - 2024 Red Hat, Inc.