Instead, let's just allow for specifying through flags whether we want
to have bits merged into the original PTE.
For the madvise() case, simplify by having only a single parameter for
merging young+dirty. For madvise_cold_or_pageout_pte_range() merging the
dirty bit is not required, but also not harmful. After all, this code is not
so performance-critical that we should force every micro-optimization.
As we now have two pte_t * parameters, use PageTable() to make sure we
are actually given a pointer to a copy of the PTE, not a pointer into
an actual page table.
Signed-off-by: David Hildenbrand <david@redhat.com>
---
mm/internal.h | 58 +++++++++++++++++++++++++++++++--------------------
mm/madvise.c | 26 +++++------------------
mm/memory.c | 8 ++-----
mm/util.c | 2 +-
4 files changed, 43 insertions(+), 51 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 6000b683f68ee..fe69e21b34a24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -208,6 +208,18 @@ typedef int __bitwise fpb_t;
/* Compare PTEs honoring the soft-dirty bit. */
#define FPB_HONOR_SOFT_DIRTY ((__force fpb_t)BIT(1))
+/*
+ * Merge PTE write bits: if any PTE in the batch is writable, modify the
+ * PTE at @ptentp to be writable.
+ */
+#define FPB_MERGE_WRITE ((__force fpb_t)BIT(2))
+
+/*
+ * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
+ * modify the PTE at @ptentp to be young or dirty, respectively.
+ */
+#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(3))
+
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
if (!(flags & FPB_HONOR_DIRTY))
@@ -220,16 +232,11 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
/**
* folio_pte_batch_ext - detect a PTE batch for a large folio
* @folio: The large folio to detect a PTE batch for.
+ * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
* @ptep: Page table pointer for the first entry.
- * @pte: Page table entry for the first page.
+ * @ptentp: Pointer at a copy of the first page table entry.
* @max_nr: The maximum number of table entries to consider.
* @flags: Flags to modify the PTE batch semantics.
- * @any_writable: Optional pointer to indicate whether any entry except the
- * first one is writable.
- * @any_young: Optional pointer to indicate whether any entry except the
- * first one is young.
- * @any_dirty: Optional pointer to indicate whether any entry except the
- * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
* pages of the same large folio in a single VMA and a single page table.
@@ -242,28 +249,26 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
* must be limited by the caller so scanning cannot exceed a single VMA and
* a single page table.
*
+ * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
+ * be modified.
+ *
* This function will be inlined to optimize based on the input parameters;
* consider using folio_pte_batch() instead if applicable.
*
* Return: the number of table entries in the batch.
*/
static inline unsigned int folio_pte_batch_ext(struct folio *folio,
- pte_t *ptep, pte_t pte, unsigned int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young, bool *any_dirty)
+ struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
+ unsigned int max_nr, fpb_t flags)
{
+ bool any_writable = false, any_young = false, any_dirty = false;
+ pte_t expected_pte, pte = *ptentp;
unsigned int nr, cur_nr;
- pte_t expected_pte;
-
- if (any_writable)
- *any_writable = false;
- if (any_young)
- *any_young = false;
- if (any_dirty)
- *any_dirty = false;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
max_nr = min_t(unsigned long, max_nr,
@@ -279,12 +284,12 @@ static inline unsigned int folio_pte_batch_ext(struct folio *folio,
if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
break;
- if (any_writable)
- *any_writable |= pte_write(pte);
- if (any_young)
- *any_young |= pte_young(pte);
- if (any_dirty)
- *any_dirty |= pte_dirty(pte);
+ if (flags & FPB_MERGE_WRITE)
+ any_writable |= pte_write(pte);
+ if (flags & FPB_MERGE_YOUNG_DIRTY) {
+ any_young |= pte_young(pte);
+ any_dirty |= pte_dirty(pte);
+ }
cur_nr = pte_batch_hint(ptep, pte);
expected_pte = pte_advance_pfn(expected_pte, cur_nr);
@@ -292,6 +297,13 @@ static inline unsigned int folio_pte_batch_ext(struct folio *folio,
nr += cur_nr;
}
+ if (any_writable)
+ *ptentp = pte_mkwrite(*ptentp, vma);
+ if (any_young)
+ *ptentp = pte_mkyoung(*ptentp);
+ if (any_dirty)
+ *ptentp = pte_mkdirty(*ptentp);
+
return min(nr, max_nr);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 9b9c35a398ed0..dce8f5e8555cb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -344,13 +344,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
struct folio *folio, pte_t *ptep,
- pte_t pte, bool *any_young,
- bool *any_dirty)
+ pte_t *ptentp)
{
int max_nr = (end - addr) / PAGE_SIZE;
- return folio_pte_batch_ext(folio, ptep, pte, max_nr, 0, NULL,
- any_young, any_dirty);
+ return folio_pte_batch_ext(folio, NULL, ptep, ptentp, max_nr,
+ FPB_MERGE_YOUNG_DIRTY);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
@@ -488,13 +487,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, NULL);
- if (any_young)
- ptent = pte_mkyoung(ptent);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -724,11 +717,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young, any_dirty;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, &any_dirty);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -753,11 +742,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
nr = 0;
continue;
}
-
- if (any_young)
- ptent = pte_mkyoung(ptent);
- if (any_dirty)
- ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
diff --git a/mm/memory.c b/mm/memory.c
index 43d35d6675f2e..985d09bee44fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -972,10 +972,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
int max_nr, int *rss, struct folio **prealloc)
{
+ fpb_t flags = FPB_MERGE_WRITE;
struct page *page;
struct folio *folio;
- bool any_writable;
- fpb_t flags = 0;
int err, nr;
page = vm_normal_page(src_vma, addr, pte);
@@ -995,8 +994,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
if (vma_soft_dirty_enabled(src_vma))
flags |= FPB_HONOR_SOFT_DIRTY;
- nr = folio_pte_batch_ext(folio, src_pte, pte, max_nr, flags,
- &any_writable, NULL, NULL);
+ nr = folio_pte_batch_ext(folio, src_vma, src_pte, &pte, max_nr, flags);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1010,8 +1008,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
- if (any_writable)
- pte = pte_mkwrite(pte, src_vma);
__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
addr, nr);
return nr;
diff --git a/mm/util.c b/mm/util.c
index d29dcc135ad28..19d1a5814fac7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1197,6 +1197,6 @@ EXPORT_SYMBOL(compat_vma_mmap_prepare);
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
unsigned int max_nr)
{
- return folio_pte_batch_ext(folio, ptep, pte, max_nr, 0, NULL, NULL, NULL);
+ return folio_pte_batch_ext(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */
--
2.49.0
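
To make the calling-convention change concrete, here is a minimal before/after
sketch of a call site. This is illustrative, not taken from the patch: the
variable names mirror the madvise callers, and only the two signatures are the
real ones.

        /* Before: merge results reported through optional bool output pointers. */
        bool any_young;

        nr = folio_pte_batch_ext(folio, pte, ptent, max_nr, /* flags */ 0,
                                 /* any_writable */ NULL, &any_young,
                                 /* any_dirty */ NULL);
        if (any_young)
                ptent = pte_mkyoung(ptent);

        /*
         * After: ptent (a stack copy of the first entry, never a pointer into
         * a page table) is passed by pointer, and FPB_MERGE_YOUNG_DIRTY
         * requests that young/dirty bits seen anywhere in the batch be folded
         * back into that copy.
         */
        nr = folio_pte_batch_ext(folio, /* vma */ NULL, pte, &ptent, max_nr,
                                 FPB_MERGE_YOUNG_DIRTY);

The vma argument is only consulted when FPB_MERGE_WRITE is set, where it is
needed for pte_mkwrite(); that is why the madvise callers can pass NULL while
copy_present_ptes() passes src_vma.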
On 27 Jun 2025, at 7:55, David Hildenbrand wrote:

> Instead, let's just allow for specifying through flags whether we want
> to have bits merged into the original PTE.

[...]

> As we now have two pte_t * parameters, use PageTable() to make sure we
> are actually given a pointer to a copy of the PTE, not a pointer into
> an actual page table.

[...]

>  	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
>  	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
>  	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));

Why not just VM_WARN_ON(!pte_same(*ptep, *ptentp)) ?

ptep points to the first page table entry and ptentp points to the copy of it.
I assume ptep should point to a valid page table entry to begin with.

Best Regards,
Yan, Zi
On 30.06.25 19:59, Zi Yan wrote:
> On 27 Jun 2025, at 7:55, David Hildenbrand wrote:

[...]

>>  	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
>> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
>
> Why not just VM_WARN_ON(!pte_same(*ptep, *ptentp)) ?
>
> ptep points to the first page table entry and ptentp points to the copy of it.
> I assume ptep should point to a valid page table entry to begin with.

That would also work; it would only miss the case where someone would, by
accident, flip both pointers :)

--
Cheers,

David / dhildenb
On 02.07.25 11:08, David Hildenbrand wrote:
> On 30.06.25 19:59, Zi Yan wrote:

[...]

>>> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
>>
>> Why not just VM_WARN_ON(!pte_same(*ptep, *ptentp)) ?
>>
>> ptep points to the first page table entry and ptentp points to the copy of it.
>> I assume ptep should point to a valid page table entry to begin with.
>
> That would also work; it would only miss the case where someone would, by
> accident, flip both pointers :)

... or pass the same pointer twice.

--
Cheers,

David / dhildenb
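
To spell out the difference between the two checks discussed above, here is a
sketch; the call sites are hypothetical, and only the check expressions
themselves come from the thread.

        pte_t ptent = ptep_get(ptep);   /* stack copy of the first entry */

        /* Intended usage: the copy is passed via a pointer to the stack. */
        nr = folio_pte_batch_ext(folio, vma, ptep, &ptent, max_nr, flags);

        /* Buggy caller that flips the two pointers by accident: */
        nr = folio_pte_batch_ext(folio, vma, &ptent, ptep, max_nr, flags);

        /*
         * pte_same(*ptep, *ptentp) still passes for the buggy call (both
         * pointers reference equal PTE values), just as it does when the
         * same pointer is passed twice. PageTable(virt_to_page(ptentp))
         * fires instead, because ptentp then points into an actual page
         * table page.
         */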
On Fri, Jun 27, 2025 at 01:55:10PM +0200, David Hildenbrand wrote:
> Instead, let's just allow for specifying through flags whether we want
> to have bits merged into the original PTE.

[...]

> Signed-off-by: David Hildenbrand <david@redhat.com>

Overall a really nice cleanup! Just some comments below.

[...]

>  /**
>   * folio_pte_batch_ext - detect a PTE batch for a large folio
>   * @folio: The large folio to detect a PTE batch for.
> + * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
>   * @ptep: Page table pointer for the first entry.
> - * @pte: Page table entry for the first page.
> + * @ptentp: Pointer at a copy of the first page table entry.

This seems weird to me. I know it's a pointer to a copy of the PTE, essentially
replacing the pte param from before, but now it's also an output value?
Shouldn't this be made clear?

I know it's a pain and churn, but if this is now meant to be an output var we
should probably make it the last param too.

At least it needs an (output) or something here.

[...]

> + * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
> + * be modified.

This explains that you modify it, but it doesn't really stand out as an output
parameter.

[...]

>  	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
>  	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
>  	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));

Hm, so if !virt_addr_valid(ptentp) we're ok? :P

I also think a quick comment here would help; the commit message explains it,
but glancing at this I'd be confused. Something like:

/* Ensure this is a pointer to a copy not a pointer into a page table. */

[...]
On 27.06.25 21:04, Lorenzo Stoakes wrote:
> On Fri, Jun 27, 2025 at 01:55:10PM +0200, David Hildenbrand wrote:

[...]

>> + * @ptentp: Pointer at a copy of the first page table entry.
>
> This seems weird to me. I know it's a pointer to a copy of the PTE, essentially
> replacing the pte param from before, but now it's also an output value?
> Shouldn't this be made clear?

As you spotted, I make that clear below and for each and every flag that
someone would set that would affect it.

> I know it's a pain and churn, but if this is now meant to be an output var we
> should probably make it the last param too.
>
> At least it needs an (output) or something here.

Well, it's an input+output parameter.

"Pointer at a copy of the first page table entry that might be modified
depending on @flags." is a bit of a mouthful, but maybe clearer than just
"output".

[...]

>>  	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
>> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
>
> Hm, so if !virt_addr_valid(ptentp) we're ok? :P

I had the same question when writing that. Obviously,
PageTable(virt_to_page(ptentp)) faulted when called on something on the
stack (ran into that ... :) ).

Maybe "VM_WARN_ON(virt_addr_valid(ptentp));" would work as well, but I am
not sure how that function behaves on all architectures ...

> I also think a quick comment here would help; the commit message explains it,
> but glancing at this I'd be confused. Something like:
>
> /* Ensure this is a pointer to a copy not a pointer into a page table. */

Yes, makes sense.

--
Cheers,

David / dhildenb
On Mon, Jun 30, 2025 at 11:32:40AM +0200, David Hildenbrand wrote:
> On 27.06.25 21:04, Lorenzo Stoakes wrote:

[...]

>> This seems weird to me. I know it's a pointer to a copy of the PTE, essentially
>> replacing the pte param from before, but now it's also an output value?
>> Shouldn't this be made clear?
>
> As you spotted, I make that clear below and for each and every flag that
> someone would set that would affect it.

I still think this needs to be made clearer. As a reviewer I had no idea what
on earth this parameter was for, honestly, without having to think quite a bit
(and I try to avoid that these days :P).

And as a user of this function I'd be like 'weird it needs a copy'.

See below...

>> I know it's a pain and churn, but if this is now meant to be an output var we
>> should probably make it the last param too.
>>
>> At least it needs an (output) or something here.
>
> Well, it's an input+output parameter.
>
> "Pointer at a copy of the first page table entry that might be modified
> depending on @flags." is a bit of a mouthful, but maybe clearer than just
> "output".

Yeah, but even then it's not clear :) So yeah, it is both, and we get into
vague realms here; actually, normally 'output' means we never read it
either... ugh, god, haha.

So maybe:

@ptentp: Pointer to a COPY of the first page table entry whose flags we update
         if appropriate.

And then update the description of folio_pte_batch_flags(), both the brief one
(to add 'updates ptentp to set flags if appropriate') and maybe the larger
description bit.

Then we're as clear as we can be, I think.

> [...]
>
>>>> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
>>>
>>> Hm, so if !virt_addr_valid(ptentp) we're ok? :P
>>
>> I had the same question when writing that. Obviously,
>> PageTable(virt_to_page(ptentp)) faulted when called on something on the
>> stack (ran into that ... :) ).
>>
>> Maybe "VM_WARN_ON(virt_addr_valid(ptentp));" would work as well, but I am
>> not sure how that function behaves on all architectures ...

Well, you wouldn't want to limit callers to only working on stack values...

I guess just add a comment, or rather update the one below to something like:

/*
 * Ensure this is a pointer to a copy not a pointer into a page table.
 * If this is a stack value, it won't be a valid virtual address, but that's
 * fine because it also cannot be pointing into the page table.
 */

Which would clarify.

>> I also think a quick comment here would help; the commit message explains it,
>> but glancing at this I'd be confused. Something like:
>>
>> /* Ensure this is a pointer to a copy not a pointer into a page table. */
>
> Yes, makes sense.
> @ptentp: Pointer to a COPY of the first page table entry whose flags we update
>          if appropriate.

+ * @ptentp: Pointer to a COPY of the first page table entry whose flags this
+ *          function updates based on @flags if appropriate.

> And then update the description of folio_pte_batch_flags(), both the brief one
> (to add 'updates ptentp to set flags if appropriate') and maybe the larger
> description bit.

Not in the brief one; the other description, including the excessive parameter
doc, will be enough.

FWIW, I briefly thought about passing in an arg struct, or returning the pte
instead (and returning the nr_pages using a parameter), but hated both. More
than these two stupid pte*.

> [...]
>
>>>> +	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
>>>
>>> Hm, so if !virt_addr_valid(ptentp) we're ok? :P
>>
>> I had the same question when writing that. Obviously,
>> PageTable(virt_to_page(ptentp)) faulted when called on something on the
>> stack (ran into that ... :) ).
>>
>> Maybe "VM_WARN_ON(virt_addr_valid(ptentp));" would work as well, but I am
>> not sure how that function behaves on all architectures ...
>
> Well, you wouldn't want to limit callers to only working on stack values...
>
> I guess just add a comment, or rather update the one below to something like:
>
> /*
>  * Ensure this is a pointer to a copy not a pointer into a page table.
>  * If this is a stack value, it won't be a valid virtual address, but that's
>  * fine because it also cannot be pointing into the page table.
>  */
>
> Which would clarify.

Yes, I'll use that, thanks.

--
Cheers,

David / dhildenb
On Mon, Jun 30, 2025 at 01:16:52PM +0200, David Hildenbrand wrote:
>> @ptentp: Pointer to a COPY of the first page table entry whose flags we update
>>          if appropriate.
>
> + * @ptentp: Pointer to a COPY of the first page table entry whose flags this
> + *          function updates based on @flags if appropriate.
>
>> And then update the description of folio_pte_batch_flags(), both the brief one
>> (to add 'updates ptentp to set flags if appropriate') and maybe the larger
>> description bit.
>
> Not in the brief one; the other description, including the excessive parameter
> doc, will be enough.

That works for me! Let's not hold this up on trivia.

> FWIW, I briefly thought about passing in an arg struct, or returning the pte
> instead (and returning the nr_pages using a parameter), but hated both. More
> than these two stupid pte*.

Well, I love help structs, bro, so you know I'd love that ;)

This version is fine with the updated param doc, though!
On 30.06.25 13:18, Lorenzo Stoakes wrote:
> On Mon, Jun 30, 2025 at 01:16:52PM +0200, David Hildenbrand wrote:

[...]

>> FWIW, I briefly thought about passing in an arg struct, or returning the pte
>> instead (and returning the nr_pages using a parameter), but hated both. More
>> than these two stupid pte*.
>
> Well, I love help structs, bro, so you know I'd love that ;)

Yeah, I was more annoyed by a possible folio_pte_batch() vs.
folio_pte_batch_ext()/_flags() inconsistency.

I'll think about this once more ... :)

--
Cheers,

David / dhildenb
On Fri, Jun 27, 2025 at 7:55 PM David Hildenbrand <david@redhat.com> wrote:
>
> Instead, let's just allow for specifying through flags whether we want
> to have bits merged into the original PTE.
>
> For the madvise() case, simplify by having only a single parameter for
> merging young+dirty. For madvise_cold_or_pageout_pte_range() merging the
> dirty bit is not required, but also not harmful. After all, this code is not
> so performance-critical that we should force every micro-optimization.

IIRC, this is work you've wanted to do for a long time - maybe even a year ago?
Less conditional logic is always a good thing ;)

Thanks,
Lance

[...]
On 27.06.25 16:34, Lance Yang wrote:
> On Fri, Jun 27, 2025 at 7:55 PM David Hildenbrand <david@redhat.com> wrote:

[...]

> IIRC, this is work you've wanted to do for a long time - maybe even a year ago?

Heh, maybe, I don't remember.

For this user here, I realized that we might already check the existence
of any_dirty at runtime -- because the compiler will not necessarily
create two optimized functions.

So we already have runtime checks ... instead of checking whether
any_dirty == NULL, we now simply do the merging (checking for
pte_young() instead).

--
Cheers,

David / dhildenb
On 2025/6/27 23:11, David Hildenbrand wrote:
> On 27.06.25 16:34, Lance Yang wrote:

[...]

> For this user here, I realized that we might already check the existence
> of any_dirty at runtime -- because the compiler will not necessarily
> create two optimized functions.

Ah, I see! That's a very sharp observation about the compiler's behavior ;)

> So we already have runtime checks ... instead of checking whether
> any_dirty == NULL, we now simply do the merging (checking for
> pte_young() instead).

Thanks for the lesson!
Lance
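
A freestanding sketch of the compiler point David makes above: since
folio_pte_batch_ext() is inline, a compile-time-constant flags argument can be
folded away, whereas a bool * output argument keeps a runtime NULL check alive
unless the compiler happens to specialize the function. The names and the toy
loop below are hypothetical, in plain C:

        #include <stdbool.h>

        #define MERGE_YOUNG 0x1

        /*
         * Old shape: whether merging is wanted is a runtime pointer check.
         * Unless the compiler clones or inlines the function for
         * any_young == NULL, the per-iteration branch survives.
         */
        static inline int batch_ptr(const bool *young, int n, bool *any_young)
        {
                int i;

                if (any_young)
                        *any_young = false;
                for (i = 0; i < n; i++)
                        if (any_young)
                                *any_young |= young[i];
                return n;
        }

        /*
         * New shape: callers pass a compile-time-constant flag, so after
         * inlining, constant propagation folds the branch to "always" or
         * "never".
         */
        static inline int batch_flags(const bool *young, int n,
                                      unsigned int flags, bool *merged_young)
        {
                bool any = false;
                int i;

                for (i = 0; i < n; i++)
                        if (flags & MERGE_YOUNG)
                                any |= young[i];
                if (flags & MERGE_YOUNG)
                        *merged_young = any;
                return n;
        }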