mm/memory.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-)
We use pte_range_none() to determine whether contiguous PTEs are empty
for an mTHP allocation. Instead of iterating the while loop for every
order, use some information, which is the first set PTE found, from the
previous iteration, to eliminate some cases. The key to understanding
the correctness of the patch is that the ranges we want to examine
form a strictly decreasing sequence of nested intervals.
Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
mm/memory.c | 30 +++++++++++++++++++++++-------
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..ffc24a48ef15 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4409,26 +4409,27 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
return ret;
}
-static bool pte_range_none(pte_t *pte, int nr_pages)
+static int pte_range_none(pte_t *pte, int nr_pages)
{
int i;
for (i = 0; i < nr_pages; i++) {
if (!pte_none(ptep_get_lockless(pte + i)))
- return false;
+ return i;
}
- return true;
+ return nr_pages;
}
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pte_t *first_set_pte = NULL, *align_pte, *pte;
unsigned long orders;
struct folio *folio;
unsigned long addr;
- pte_t *pte;
+ int max_empty;
gfp_t gfp;
int order;
@@ -4463,8 +4464,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
order = highest_order(orders);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- if (pte_range_none(pte + pte_index(addr), 1 << order))
+ align_pte = pte + pte_index(addr);
+
+ /* Range to be scanned known to be empty */
+ if (align_pte + (1 << order) <= first_set_pte)
break;
+
+ /* Range to be scanned contains first_set_pte */
+ if (align_pte <= first_set_pte)
+ goto repeat;
+
+ /* align_pte > first_set_pte, so need to check properly */
+ max_empty = pte_range_none(align_pte, 1 << order);
+ if (max_empty == 1 << order)
+ break;
+
+ first_set_pte = align_pte + max_empty;
+repeat:
order = next_order(&orders, order);
}
@@ -4579,7 +4595,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (nr_pages == 1 && vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, addr, vmf->pte);
goto release;
- } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+ } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
goto release;
}
@@ -4915,7 +4931,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
update_mmu_tlb(vma, addr, vmf->pte);
ret = VM_FAULT_NOPAGE;
goto unlock;
- } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+ } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
ret = VM_FAULT_NOPAGE;
goto unlock;
--
2.30.2
On Fri, Sep 13, 2024 at 02:49:02PM +0530, Dev Jain wrote: > We use pte_range_none() to determine whether contiguous PTEs are empty > for an mTHP allocation. Instead of iterating the while loop for every > order, use some information, which is the first set PTE found, from the > previous iteration, to eliminate some cases. The key to understanding > the correctness of the patch is that the ranges we want to examine > form a strictly decreasing sequence of nested intervals. This is a lot more complicated. Do you have any numbers that indicate that it's faster? Yes, it's fewer memory references, but you've gone from a simple linear scan that's easy to prefetch to an exponential scan that might confuse the prefetchers.
On 9/16/24 18:54, Matthew Wilcox wrote: > On Fri, Sep 13, 2024 at 02:49:02PM +0530, Dev Jain wrote: >> We use pte_range_none() to determine whether contiguous PTEs are empty >> for an mTHP allocation. Instead of iterating the while loop for every >> order, use some information, which is the first set PTE found, from the >> previous iteration, to eliminate some cases. The key to understanding >> the correctness of the patch is that the ranges we want to examine >> form a strictly decreasing sequence of nested intervals. > This is a lot more complicated. Do you have any numbers that indicate > that it's faster? Yes, it's fewer memory references, but you've gone > from a simple linear scan that's easy to prefetch to an exponential scan > that might confuse the prefetchers. I do have some numbers, I tested with a simple program, and also used ktime API, with the latter, enclosing from "order = highest_order(orders)" till "pte_unmap(pte)" (enclosing the entire while loop), a rough average estimate is that without the patch, it takes 1700 ns to execute, with the patch, on an average it takes 80 - 100ns less. I cannot think of a good testing program... For the prefetching thingy, I am still doing a linear scan, and in each iteration, with the patch, the range I am scanning is going to strictly lie inside the range I would have scanned without the patch. Won't the compiler and the CPU still do prefetching, but on a smaller range; where does the prefetcher get confused? I confess, I do not understand this very well.
On 17.09.24 05:55, Dev Jain wrote: > > On 9/16/24 18:54, Matthew Wilcox wrote: >> On Fri, Sep 13, 2024 at 02:49:02PM +0530, Dev Jain wrote: >>> We use pte_range_none() to determine whether contiguous PTEs are empty >>> for an mTHP allocation. Instead of iterating the while loop for every >>> order, use some information, which is the first set PTE found, from the >>> previous iteration, to eliminate some cases. The key to understanding >>> the correctness of the patch is that the ranges we want to examine >>> form a strictly decreasing sequence of nested intervals. >> This is a lot more complicated. Do you have any numbers that indicate >> that it's faster? Yes, it's fewer memory references, but you've gone >> from a simple linear scan that's easy to prefetch to an exponential scan >> that might confuse the prefetchers. > > I do have some numbers, I tested with a simple program, and also used > ktime API, with the latter, enclosing from "order = highest_order(orders)" > till "pte_unmap(pte)" (enclosing the entire while loop), a rough average > estimate is that without the patch, it takes 1700 ns to execute, with the > patch, on an average it takes 80 - 100ns less. I cannot think of a good > testing program... And that is likely what Willy is actually wondering about: does it have any real world impact or is the benefit just noise. :) Change does not look too wild to me, but yes, it increases complexity. -- Cheers, David / dhildenb
On 17/09/2024 04:55, Dev Jain wrote: > > On 9/16/24 18:54, Matthew Wilcox wrote: >> On Fri, Sep 13, 2024 at 02:49:02PM +0530, Dev Jain wrote: >>> We use pte_range_none() to determine whether contiguous PTEs are empty >>> for an mTHP allocation. Instead of iterating the while loop for every >>> order, use some information, which is the first set PTE found, from the >>> previous iteration, to eliminate some cases. The key to understanding >>> the correctness of the patch is that the ranges we want to examine >>> form a strictly decreasing sequence of nested intervals. >> This is a lot more complicated. Do you have any numbers that indicate >> that it's faster? Yes, it's fewer memory references, but you've gone >> from a simple linear scan that's easy to prefetch to an exponential scan >> that might confuse the prefetchers. > > I do have some numbers, I tested with a simple program, and also used > ktime API, with the latter, enclosing from "order = highest_order(orders)" > till "pte_unmap(pte)" (enclosing the entire while loop), a rough average > estimate is that without the patch, it takes 1700 ns to execute, with the > patch, on an average it takes 80 - 100ns less. I cannot think of a good > testing program... > > For the prefetching thingy, I am still doing a linear scan, and in each > iteration, with the patch, the range I am scanning is going to strictly > lie inside the range I would have scanned without the patch. Won't the > compiler and the CPU still do prefetching, but on a smaller range; where > does the prefetcher get confused? I confess, I do not understand this > very well. > A little history on this; My original "RFC v2" for mTHP included this optimization [1], but Yu Zhou suggested dropping it to keep things simple, which I did. Then at v8, DavidH suggested we could benefit from this sort of optimization, but we agreed to do it later as a separate change [2]: """ >> Comment: Likely it would make sense to scan only once and determine the >> "largest none range" around that address, having the largest suitable order >> in mind. > > Yes, that's how I used to do it, but Yu Zhou requested simplifying to this, > IIRC. Perhaps this an optimization opportunity for later? Yes, definetly. """ Dev independently discovered this opportunity while reading the code, and I pointed him to the history, and suggested it would likely be worthwhile to send a patch. My view is that I don't see how this can harm performance; in the common case, when a single order is enabled, this is essentially the same as before. But when there are multiple orders enabled, we are now just doing a single linear scan of the ptes rather than multiple scans. There will likely be some stack accesses interleved, but I'd be gobsmacked if the prefetchers can't tell the difference between the stack and other areas of memory. Perhaps some perf numbers would help; I think the simplest way to gather some numbers would be to create a microbenchmark to allocate a large VMA, then fault in single pages at a given stride (say, 1 every 128K), then enable 1M, 512K, 256K, 128K and 64K mTHP, then memset the entire VMA. It's a bit contrived, but this patch will show improvement if the scan is currently a significant portion of the page fault. If the proposed benchmark shows an improvement, and we don't see any regression when only enabling 64K, then my vote would be to accept the patch. [1] https://lore.kernel.org/linux-mm/20230414130303.2345383-7-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-mm/ca649aad-7b76-4c6d-b513-26b3d58f8e68@redhat.com/ Thanks, Ryan
On Mon, Sep 16, 2024 at 9:25 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Fri, Sep 13, 2024 at 02:49:02PM +0530, Dev Jain wrote: > > We use pte_range_none() to determine whether contiguous PTEs are empty > > for an mTHP allocation. Instead of iterating the while loop for every > > order, use some information, which is the first set PTE found, from the > > previous iteration, to eliminate some cases. The key to understanding > > the correctness of the patch is that the ranges we want to examine > > form a strictly decreasing sequence of nested intervals. > > This is a lot more complicated. Do you have any numbers that indicate > that it's faster? Yes, it's fewer memory references, but you've gone > from a simple linear scan that's easy to prefetch to an exponential scan > that might confuse the prefetchers. +1 I'm not sure if multiple mthp sizes will be enabled for common cases ;) If not, this could be a bit more complicated, IMO. @Barry, could you share whether OPPO typically uses multiple mthp sizes in their scenarios? Thanks, Lance
On Fri, Sep 13, 2024 at 5:19 PM Dev Jain <dev.jain@arm.com> wrote: > > We use pte_range_none() to determine whether contiguous PTEs are empty > for an mTHP allocation. Instead of iterating the while loop for every > order, use some information, which is the first set PTE found, from the > previous iteration, to eliminate some cases. The key to understanding > the correctness of the patch is that the ranges we want to examine > form a strictly decreasing sequence of nested intervals. > > Suggested-by: Ryan Roberts <ryan.roberts@arm.com> > Signed-off-by: Dev Jain <dev.jain@arm.com> I like this patch, but could we come up with a better subject for pte_range_none()? The subject is really incorrect. Also, I'd prefer the change for alloc_anon_folio() to be separated into its own patch. So, one patchset with two patches, please. > --- > mm/memory.c | 30 +++++++++++++++++++++++------- > 1 file changed, 23 insertions(+), 7 deletions(-) > > diff --git a/mm/memory.c b/mm/memory.c > index 3c01d68065be..ffc24a48ef15 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4409,26 +4409,27 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > return ret; > } > > -static bool pte_range_none(pte_t *pte, int nr_pages) > +static int pte_range_none(pte_t *pte, int nr_pages) > { > int i; > > for (i = 0; i < nr_pages; i++) { > if (!pte_none(ptep_get_lockless(pte + i))) > - return false; > + return i; > } > > - return true; > + return nr_pages; > } > > static struct folio *alloc_anon_folio(struct vm_fault *vmf) > { > struct vm_area_struct *vma = vmf->vma; > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > + pte_t *first_set_pte = NULL, *align_pte, *pte; > unsigned long orders; > struct folio *folio; > unsigned long addr; > - pte_t *pte; > + int max_empty; > gfp_t gfp; > int order; > > @@ -4463,8 +4464,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > order = highest_order(orders); > while (orders) { > addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); > - if (pte_range_none(pte + pte_index(addr), 1 << order)) > + align_pte = pte + pte_index(addr); > + > + /* Range to be scanned known to be empty */ > + if (align_pte + (1 << order) <= first_set_pte) > break; > + > + /* Range to be scanned contains first_set_pte */ > + if (align_pte <= first_set_pte) > + goto repeat; > + > + /* align_pte > first_set_pte, so need to check properly */ > + max_empty = pte_range_none(align_pte, 1 << order); > + if (max_empty == 1 << order) > + break; > + > + first_set_pte = align_pte + max_empty; > +repeat: > order = next_order(&orders, order); > } > > @@ -4579,7 +4595,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) > if (nr_pages == 1 && vmf_pte_changed(vmf)) { > update_mmu_tlb(vma, addr, vmf->pte); > goto release; > - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { > + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { > update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); > goto release; > } > @@ -4915,7 +4931,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) > update_mmu_tlb(vma, addr, vmf->pte); > ret = VM_FAULT_NOPAGE; > goto unlock; > - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { > + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { > update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); > ret = VM_FAULT_NOPAGE; > goto unlock; > -- > 2.30.2 > Thanks Barry
On 9/16/24 10:42, Barry Song wrote: > On Fri, Sep 13, 2024 at 5:19 PM Dev Jain <dev.jain@arm.com> wrote: >> We use pte_range_none() to determine whether contiguous PTEs are empty >> for an mTHP allocation. Instead of iterating the while loop for every >> order, use some information, which is the first set PTE found, from the >> previous iteration, to eliminate some cases. The key to understanding >> the correctness of the patch is that the ranges we want to examine >> form a strictly decreasing sequence of nested intervals. >> >> Suggested-by: Ryan Roberts <ryan.roberts@arm.com> >> Signed-off-by: Dev Jain <dev.jain@arm.com> > I like this patch, but could we come up with a better subject for > pte_range_none()? > The subject is really incorrect. Are you asking me to change "Compute mTHP order efficiently" to something else? > > Also, I'd prefer the change for alloc_anon_folio() to be separated > into its own patch. > So, one patchset with two patches, please. Fine by me. > >> --- >> mm/memory.c | 30 +++++++++++++++++++++++------- >> 1 file changed, 23 insertions(+), 7 deletions(-) >> >> diff --git a/mm/memory.c b/mm/memory.c >> index 3c01d68065be..ffc24a48ef15 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -4409,26 +4409,27 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> return ret; >> } >> >> -static bool pte_range_none(pte_t *pte, int nr_pages) >> +static int pte_range_none(pte_t *pte, int nr_pages) >> { >> int i; >> >> for (i = 0; i < nr_pages; i++) { >> if (!pte_none(ptep_get_lockless(pte + i))) >> - return false; >> + return i; >> } >> >> - return true; >> + return nr_pages; >> } >> >> static struct folio *alloc_anon_folio(struct vm_fault *vmf) >> { >> struct vm_area_struct *vma = vmf->vma; >> #ifdef CONFIG_TRANSPARENT_HUGEPAGE >> + pte_t *first_set_pte = NULL, *align_pte, *pte; >> unsigned long orders; >> struct folio *folio; >> unsigned long addr; >> - pte_t *pte; >> + int max_empty; >> gfp_t gfp; >> int order; >> >> @@ -4463,8 +4464,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) >> order = highest_order(orders); >> while (orders) { >> addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); >> - if (pte_range_none(pte + pte_index(addr), 1 << order)) >> + align_pte = pte + pte_index(addr); >> + >> + /* Range to be scanned known to be empty */ >> + if (align_pte + (1 << order) <= first_set_pte) >> break; >> + >> + /* Range to be scanned contains first_set_pte */ >> + if (align_pte <= first_set_pte) >> + goto repeat; >> + >> + /* align_pte > first_set_pte, so need to check properly */ >> + max_empty = pte_range_none(align_pte, 1 << order); >> + if (max_empty == 1 << order) >> + break; >> + >> + first_set_pte = align_pte + max_empty; >> +repeat: >> order = next_order(&orders, order); >> } >> >> @@ -4579,7 +4595,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) >> if (nr_pages == 1 && vmf_pte_changed(vmf)) { >> update_mmu_tlb(vma, addr, vmf->pte); >> goto release; >> - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { >> + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { >> update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); >> goto release; >> } >> @@ -4915,7 +4931,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) >> update_mmu_tlb(vma, addr, vmf->pte); >> ret = VM_FAULT_NOPAGE; >> goto unlock; >> - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { >> + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { >> update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); >> ret = VM_FAULT_NOPAGE; >> goto unlock; >> -- >> 2.30.2 >> > Thanks > Barry
On Mon, Sep 16, 2024 at 1:20 PM Dev Jain <dev.jain@arm.com> wrote: > > > On 9/16/24 10:42, Barry Song wrote: > > On Fri, Sep 13, 2024 at 5:19 PM Dev Jain <dev.jain@arm.com> wrote: > >> We use pte_range_none() to determine whether contiguous PTEs are empty > >> for an mTHP allocation. Instead of iterating the while loop for every > >> order, use some information, which is the first set PTE found, from the > >> previous iteration, to eliminate some cases. The key to understanding > >> the correctness of the patch is that the ranges we want to examine > >> form a strictly decreasing sequence of nested intervals. > >> > >> Suggested-by: Ryan Roberts <ryan.roberts@arm.com> > >> Signed-off-by: Dev Jain <dev.jain@arm.com> > > I like this patch, but could we come up with a better subject for > > pte_range_none()? > > The subject is really incorrect. > > Are you asking me to change "Compute mTHP order efficiently" to > something else? Right. Adjust the subject to more accurately reflect the specific changes being made. > > > > > Also, I'd prefer the change for alloc_anon_folio() to be separated > > into its own patch. > > So, one patchset with two patches, please. > > Fine by me. > > > > >> --- > >> mm/memory.c | 30 +++++++++++++++++++++++------- > >> 1 file changed, 23 insertions(+), 7 deletions(-) > >> > >> diff --git a/mm/memory.c b/mm/memory.c > >> index 3c01d68065be..ffc24a48ef15 100644 > >> --- a/mm/memory.c > >> +++ b/mm/memory.c > >> @@ -4409,26 +4409,27 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> return ret; > >> } > >> > >> -static bool pte_range_none(pte_t *pte, int nr_pages) > >> +static int pte_range_none(pte_t *pte, int nr_pages) > >> { > >> int i; > >> > >> for (i = 0; i < nr_pages; i++) { > >> if (!pte_none(ptep_get_lockless(pte + i))) > >> - return false; > >> + return i; > >> } > >> > >> - return true; > >> + return nr_pages; > >> } > >> > >> static struct folio *alloc_anon_folio(struct vm_fault *vmf) > >> { > >> struct vm_area_struct *vma = vmf->vma; > >> #ifdef CONFIG_TRANSPARENT_HUGEPAGE > >> + pte_t *first_set_pte = NULL, *align_pte, *pte; > >> unsigned long orders; > >> struct folio *folio; > >> unsigned long addr; > >> - pte_t *pte; > >> + int max_empty; > >> gfp_t gfp; > >> int order; > >> > >> @@ -4463,8 +4464,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > >> order = highest_order(orders); > >> while (orders) { > >> addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); > >> - if (pte_range_none(pte + pte_index(addr), 1 << order)) > >> + align_pte = pte + pte_index(addr); > >> + > >> + /* Range to be scanned known to be empty */ > >> + if (align_pte + (1 << order) <= first_set_pte) > >> break; > >> + > >> + /* Range to be scanned contains first_set_pte */ > >> + if (align_pte <= first_set_pte) > >> + goto repeat; > >> + > >> + /* align_pte > first_set_pte, so need to check properly */ > >> + max_empty = pte_range_none(align_pte, 1 << order); > >> + if (max_empty == 1 << order) > >> + break; > >> + > >> + first_set_pte = align_pte + max_empty; > >> +repeat: > >> order = next_order(&orders, order); > >> } > >> > >> @@ -4579,7 +4595,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) > >> if (nr_pages == 1 && vmf_pte_changed(vmf)) { > >> update_mmu_tlb(vma, addr, vmf->pte); > >> goto release; > >> - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { > >> + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { > >> update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); > >> goto release; > >> } > >> @@ -4915,7 +4931,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) > >> update_mmu_tlb(vma, addr, vmf->pte); > >> ret = VM_FAULT_NOPAGE; > >> goto unlock; > >> - } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { > >> + } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) { > >> update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); > >> ret = VM_FAULT_NOPAGE; > >> goto unlock; > >> -- > >> 2.30.2 > >> > > Thanks > > Barry >
© 2016 - 2024 Red Hat, Inc.