Now that we no longer have a convenient flag in the cluster to determine
if a folio is large, free_swap_and_cache() will take a reference and
lock a large folio much more often, which could lead to contention and
(e.g.) failure to split large folios, etc.

Let's solve that problem by batch freeing swap and cache with a new
function, free_swap_and_cache_nr(), to free a contiguous range of swap
entries together. This allows us to first drop a reference to each swap
slot before we try to release the cache folio. This means we only try to
release the folio once, only taking the reference and lock once - much
better than the previous 512 times for the 2M THP case.

Contiguous swap entries are gathered in zap_pte_range() and
madvise_free_pte_range() in a similar way to how present ptes are
already gathered in zap_pte_range().

While we are at it, let's simplify by converting the return type of both
functions to void. The return value was used only by zap_pte_range() to
print a bad pte, and was ignored by everyone else, so the extra
reporting wasn't exactly guaranteed. We will still get the warning with
most of the information from get_swap_device(). With the batch version,
we wouldn't know which pte was bad anyway so could print the wrong one.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 include/linux/pgtable.h | 29 ++++++++++++
 include/linux/swap.h    | 12 +++--
 mm/internal.h           | 63 ++++++++++++++++++++++++++
 mm/madvise.c            | 12 +++--
 mm/memory.c             | 13 +++---
 mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
 6 files changed, 195 insertions(+), 31 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a3fc8150b047..75096025fe52 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
}
#endif
+#ifndef clear_not_present_full_ptes
+/**
+ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
+ * consecutive in the pgtable.
+ * @mm: Address space the ptes represent.
+ * @addr: Address of the first pte.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over pte_clear_not_present_full().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all not present.
+ * The PTEs are all in the same PMD.
+ */
+static inline void clear_not_present_full_ptes(struct mm_struct *mm,
+		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+	for (;;) {
+		pte_clear_not_present_full(mm, addr, ptep, full);
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6f78198f000..5737236dc3ce 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
@@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
#define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr));
-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+{
+}
static inline void free_swap_cache(struct folio *folio)
{
@@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
}
#endif /* CONFIG_SWAP */
+static inline void free_swap_and_cache(swp_entry_t entry)
+{
+ free_swap_and_cache_nr(entry, 1);
+}
+
#ifdef CONFIG_MEMCG
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
diff --git a/mm/internal.h b/mm/internal.h
index 3bdc8693b54f..de68705624b0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/tracepoint-defs.h>
struct folio_batch;
@@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
return min(ptep - start_ptep, max_nr);
}
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+	swp_entry_t entry = pte_to_swp_entry(pte);
+	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+						   swp_offset(entry) + 1));
+
+	if (pte_swp_soft_dirty(pte))
+		new = pte_swp_mksoft_dirty(new);
+	if (pte_swp_exclusive(pte))
+		new = pte_swp_mkexclusive(new);
+	if (pte_swp_uffd_wp(pte))
+		new = pte_swp_mkuffd_wp(new);
+
+	return new;
+}
+
+/**
+ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
+ * @start_ptep: Page table pointer for the first entry.
+ * @max_nr: The maximum number of table entries to consider.
+ * @pte: Page table entry for the first entry.
+ *
+ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
+ * containing swap entries all with consecutive offsets and targeting the same
+ * swap type, all with matching swp pte bits.
+ *
+ * max_nr must be at least one and must be limited by the caller so scanning
+ * cannot exceed a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+{
+	pte_t expected_pte = pte_next_swp_offset(pte);
+	const pte_t *end_ptep = start_ptep + max_nr;
+	pte_t *ptep = start_ptep + 1;
+
+	VM_WARN_ON(max_nr < 1);
+	VM_WARN_ON(!is_swap_pte(pte));
+	VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+
+	while (ptep < end_ptep) {
+		pte = ptep_get(ptep);
+
+		if (!pte_same(pte, expected_pte))
+			break;
+
+		expected_pte = pte_next_swp_offset(expected_pte);
+		ptep++;
+	}
+
+	return ptep - start_ptep;
+}
#endif /* CONFIG_MMU */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f77a51baaac..5011ecb24344 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
diff --git a/mm/memory.c b/mm/memory.c
index b98e4d907a14..0db2aa066a5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
} else if (!non_swap_entry(entry)) {
- /* Genuine swap entry, hence a private anon page */
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ /* Genuine swap entries, hence private anon pages */
if (!should_zap_cows(details))
continue;
- rss[MM_SWAPENTS]--;
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
} else if (is_migration_entry(entry)) {
folio = pfn_swap_entry_folio(entry);
if (!should_zap_folio(details, folio))
@@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
WARN_ON_ONCE(1);
}
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1ded6d1dcab4..20c45757f2b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
/* Reclaim the swap entry if swap is getting full*/
#define TTRS_FULL 0x4
-/* returns 1 if swap entry is freed */
+/*
+ * returns number of pages in the folio that backs the swap entry. If positive,
+ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
+ * folio was associated with the swap entry.
+ */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
ret = folio_free_swap(folio);
folio_unlock(folio);
}
+ ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
folio_put(folio);
return ret;
}
@@ -895,7 +900,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
- if (swap_was_freed)
+ if (swap_was_freed > 0)
goto checks;
goto scan; /* check next one */
}
@@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio)
return true;
}
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
+/**
+ * free_swap_and_cache_nr() - Release reference on range of swap entries and
+ * reclaim their cache if no more references remain.
+ * @entry: First entry of range.
+ * @nr: Number of entries in range.
+ *
+ * For each swap entry in the contiguous range, release a reference. If any swap
+ * entries become free, try to reclaim their underlying folios, if present. The
+ * offset range is defined by [entry.offset, entry.offset + nr).
*/
-int free_swap_and_cache(swp_entry_t entry)
+void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
- struct swap_info_struct *p;
+ const unsigned long start_offset = swp_offset(entry);
+ const unsigned long end_offset = start_offset + nr;
+ unsigned int type = swp_type(entry);
+ struct swap_info_struct *si;
+ bool any_only_cache = false;
+ unsigned long offset;
unsigned char count;
if (non_swap_entry(entry))
- return 1;
+ return;
- p = get_swap_device(entry);
- if (p) {
- if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
- put_swap_device(p);
- return 0;
+ si = get_swap_device(entry);
+ if (!si)
+ return;
+
+ if (WARN_ON(end_offset > si->max))
+ goto out;
+
+ /*
+ * First free all entries in the range.
+ */
+ for (offset = start_offset; offset < end_offset; offset++) {
+ if (data_race(si->swap_map[offset])) {
+ count = __swap_entry_free(si, swp_entry(type, offset));
+ if (count == SWAP_HAS_CACHE)
+ any_only_cache = true;
+ } else {
+ WARN_ON_ONCE(1);
}
+ }
+
+ /*
+ * Short-circuit the below loop if none of the entries had their
+ * reference drop to zero.
+ */
+ if (!any_only_cache)
+ goto out;
- count = __swap_entry_free(p, entry);
- if (count == SWAP_HAS_CACHE)
- __try_to_reclaim_swap(p, swp_offset(entry),
+ /*
+ * Now go back over the range trying to reclaim the swap cache. This is
+ * more efficient for large folios because we will only try to reclaim
+ * the swap once per folio in the common case. If we do
+ * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
+ * latter will get a reference and lock the folio for every individual
+ * page but will only succeed once the swap slot for every subpage is
+ * zero.
+ */
+ for (offset = start_offset; offset < end_offset; offset += nr) {
+ nr = 1;
+ if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ /*
+ * Folios are always naturally aligned in swap so
+ * advance forward to the next boundary. Zero means no
+ * folio was found for the swap entry, so advance by 1
+ * in this case. Negative value means folio was found
+ * but could not be reclaimed. Here we can still advance
+ * to the next boundary.
+ */
+ nr = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL);
- put_swap_device(p);
+ if (nr == 0)
+ nr = 1;
+ else if (nr < 0)
+ nr = -nr;
+ nr = ALIGN(offset + 1, nr) - offset;
+ }
}
- return p != NULL;
+
+out:
+ put_swap_device(si);
}
#ifdef CONFIG_HIBERNATION
--
2.25.1
On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> [...]
> +/**
> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> + *
> + * Increments the swap offset, while maintaining all other fields, including
> + * swap type, and any swp pte bits. The resulting pte is returned.
> + */
> +static inline pte_t pte_next_swp_offset(pte_t pte)
> +{
> + swp_entry_t entry = pte_to_swp_entry(pte);
> + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> + swp_offset(entry) + 1));
> +
> + if (pte_swp_soft_dirty(pte))
> + new = pte_swp_mksoft_dirty(new);
> + if (pte_swp_exclusive(pte))
> + new = pte_swp_mkexclusive(new);
> + if (pte_swp_uffd_wp(pte))
> + new = pte_swp_mkuffd_wp(new);
I don't quite understand this. If this page table entry is exclusive,
will its subsequent page table entry also be exclusive without
question?
in try_to_unmap_one, exclusive is per-subpage but not per-folio:
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
same questions also for dirty, wp etc.
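
For reference, this is roughly how try_to_unmap_one() builds each swap pte
(paraphrased from mm/rmap.c, not an exact quote) - the swp pte bits come from
the individual pte/subpage being unmapped:

	/* paraphrased sketch of the swap-out path in try_to_unmap_one() */
	swp_pte = swp_entry_to_pte(entry);
	if (anon_exclusive)
		swp_pte = pte_swp_mkexclusive(swp_pte);
	if (pte_soft_dirty(pteval))
		swp_pte = pte_swp_mksoft_dirty(swp_pte);
	if (pte_uffd_wp(pteval))
		swp_pte = pte_swp_mkuffd_wp(swp_pte);
	set_pte_at(mm, address, pvmw.pte, swp_pte);

So two neighbouring swap ptes of the same large folio can, in principle,
differ in those bits.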
> [...]
Thanks
Barry
On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >
> > [...]
>
> I don't quite understand this. If this page table entry is exclusive,
> will its subsequent page table entry also be exclusive without
> question?
> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>
> anon_exclusive = folio_test_anon(folio) &&
> PageAnonExclusive(subpage);
>
> same questions also for dirty, wp etc.

Sorry for the noise. You are right. Based on your new version, I think I should
entirely drop:
[PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
all swap entries are exclusive
https://lore.kernel.org/linux-mm/20240409082631.187483-4-21cnbao@gmail.com/
> Thanks
> Barry
On 09.04.24 11:22, Barry Song wrote:
> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> [...]
>>
>> I don't quite understand this. If this page table entry is exclusive,
>> will its subsequent page table entry also be exclusive without
>> question?
>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>
>> anon_exclusive = folio_test_anon(folio) &&
>> PageAnonExclusive(subpage);
>>
>> same questions also for dirty, wp etc.
>
> Sorry for the noise. You are right. Based on your new version, I think I should
> entirely drop:
>
> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> all swap entries are exclusive
Yes. If we ever want to ignore some bits, we should likely add flags to
change the behavior, like for folio_pte_batch().
For swapin, you really want the exclusive bits to match, though.
softdirty and uffd-wp as well at least initially for simplicity.
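
Just to illustrate the direction, a rough sketch of such a flag-controlled
variant (entirely hypothetical: the SPB_IGNORE_EXCLUSIVE flag and the
swap_pte_batch_flags() name are made up here, in the spirit of the FPB_*
flags taken by folio_pte_batch(); not something this series adds):

#define SPB_IGNORE_EXCLUSIVE	(1 << 0)

/*
 * Sketch only: batch like swap_pte_batch(), but optionally mask off the
 * exclusive swp pte bit before comparing consecutive entries.
 */
static inline int swap_pte_batch_flags(pte_t *start_ptep, int max_nr,
				       pte_t pte, unsigned int flags)
{
	bool ignore_excl = flags & SPB_IGNORE_EXCLUSIVE;
	pte_t expected_pte = pte_next_swp_offset(pte);
	const pte_t *end_ptep = start_ptep + max_nr;
	pte_t *ptep = start_ptep + 1;

	/* Normalise the bit we were asked to ignore. */
	if (ignore_excl)
		expected_pte = pte_swp_clear_exclusive(expected_pte);

	while (ptep < end_ptep) {
		pte_t cur = ptep_get(ptep);

		if (ignore_excl)
			cur = pte_swp_clear_exclusive(cur);
		if (!pte_same(cur, expected_pte))
			break;

		expected_pte = pte_next_swp_offset(expected_pte);
		ptep++;
	}

	return ptep - start_ptep;
}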
--
Cheers,
David / dhildenb
On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 09.04.24 11:22, Barry Song wrote:
> > On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> [...]
> >
> > Sorry for the noise. you are right. based on your new version, I think I should
> > entirely drop:
> >
> > [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> > all swap entries are exclusive
>
> Yes. If we ever want to ignore some bits, we should likely add flags to
> change the behavior, like for folio_pte_batch().
>
> For swapin, you really want the exclusive bits to match, though.
I am not quite sure I definitely need the exclusive bits to match. I can either
drop my 3/5 or ignore the exclusive bit as below (if any of the entries is not
exclusive, swpin won't reuse the large folio, but it can still map it entirely
read-only):
diff --git a/mm/internal.h b/mm/internal.h
index cae39c372bfc..5726e729c9ee 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte,
 		*any_shared |= !pte_swp_exclusive(pte);
 
 	while (ptep < end_ptep) {
+		pte_t ignore_exclusive_pte;
+		pte_t ignore_exclusive_expected_pte;
 		pte = ptep_get(ptep);
 
-		if (!pte_same(pte, expected_pte))
-			break;
+		if (any_shared) {
+			ignore_exclusive_pte = pte;
+			ignore_exclusive_expected_pte = expected_pte;
+			ignore_exclusive_pte = pte_swp_clear_exclusive(ignore_exclusive_pte);
+			ignore_exclusive_expected_pte = pte_swp_clear_exclusive(expected_pte);
+
+			if (!pte_same(ignore_exclusive_pte, ignore_exclusive_expected_pte))
+				break;
+		} else {
+			if (!pte_same(pte, expected_pte))
+				break;
+		}
 
 		if (any_shared)
 			*any_shared |= !pte_swp_exclusive(pte);
> softdirty and uffd-wp as well at least initially for simplicity.
Yes for this.
By the way, I wonder if you and Ryan have a moment to review swpin
refault patchset
v2 :-)
[PATCH v2 0/5] large folios swap-in: handle refault cases first
https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/
>
> --
> Cheers,
>
> David / dhildenb
>
Thanks
Barry
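For reference, a simplified sketch of the batching loop being discussed, under the
assumption that every swp pte bit (exclusive, soft-dirty, uffd-wp) has to match.
This is not the code from the patch; the real swap_pte_batch() also clamps the
batch to the PMD/folio boundary and rejects non-swap entries:

/*
 * Simplified sketch only: grow the batch while each successive pte equals the
 * expected pte built with pte_next_swp_offset(); any mismatch in offset, type
 * or swp pte bits ends the batch.
 */
static inline int swap_pte_batch_sketch(pte_t *start_ptep, int max_nr, pte_t pte)
{
	pte_t expected_pte = pte_next_swp_offset(pte);
	pte_t *ptep = start_ptep + 1;

	while (ptep < start_ptep + max_nr) {
		pte = ptep_get(ptep);

		/* Offset, type and all swp pte bits must be identical. */
		if (!pte_same(pte, expected_pte))
			break;

		expected_pte = pte_next_swp_offset(expected_pte);
		ptep++;
	}

	return ptep - start_ptep;	/* number of ptes in the batch */
}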
On 09/04/2024 10:41, Barry Song wrote:
> On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 09.04.24 11:22, Barry Song wrote:
>>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>
>>>>> Now that we no longer have a convenient flag in the cluster to determine
>>>>> if a folio is large, free_swap_and_cache() will take a reference and
>>>>> lock a large folio much more often, which could lead to contention and
>>>>> (e.g.) failure to split large folios, etc.
>>>>>
>>>>> Let's solve that problem by batch freeing swap and cache with a new
>>>>> function, free_swap_and_cache_nr(), to free a contiguous range of swap
>>>>> entries together. This allows us to first drop a reference to each swap
>>>>> slot before we try to release the cache folio. This means we only try to
>>>>> release the folio once, only taking the reference and lock once - much
>>>>> better than the previous 512 times for the 2M THP case.
>>>>>
>>>>> Contiguous swap entries are gathered in zap_pte_range() and
>>>>> madvise_free_pte_range() in a similar way to how present ptes are
>>>>> already gathered in zap_pte_range().
>>>>>
>>>>> While we are at it, let's simplify by converting the return type of both
>>>>> functions to void. The return value was used only by zap_pte_range() to
>>>>> print a bad pte, and was ignored by everyone else, so the extra
>>>>> reporting wasn't exactly guaranteed. We will still get the warning with
>>>>> most of the information from get_swap_device(). With the batch version,
>>>>> we wouldn't know which pte was bad anyway so could print the wrong one.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>>> ---
>>>>> include/linux/pgtable.h | 29 ++++++++++++
>>>>> include/linux/swap.h | 12 +++--
>>>>> mm/internal.h | 63 ++++++++++++++++++++++++++
>>>>> mm/madvise.c | 12 +++--
>>>>> mm/memory.c | 13 +++---
>>>>> mm/swapfile.c | 97 +++++++++++++++++++++++++++++++++--------
>>>>> 6 files changed, 195 insertions(+), 31 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>>> index a3fc8150b047..75096025fe52 100644
>>>>> --- a/include/linux/pgtable.h
>>>>> +++ b/include/linux/pgtable.h
>>>>> @@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
>>>>> }
>>>>> #endif
>>>>>
>>>>> +#ifndef clear_not_present_full_ptes
>>>>> +/**
>>>>> + * clear_not_present_full_ptes - Clear multiple not present PTEs which are
>>>>> + * consecutive in the pgtable.
>>>>> + * @mm: Address space the ptes represent.
>>>>> + * @addr: Address of the first pte.
>>>>> + * @ptep: Page table pointer for the first entry.
>>>>> + * @nr: Number of entries to clear.
>>>>> + * @full: Whether we are clearing a full mm.
>>>>> + *
>>>>> + * May be overridden by the architecture; otherwise, implemented as a simple
>>>>> + * loop over pte_clear_not_present_full().
>>>>> + *
>>>>> + * Context: The caller holds the page table lock. The PTEs are all not present.
>>>>> + * The PTEs are all in the same PMD.
>>>>> + */
>>>>> +static inline void clear_not_present_full_ptes(struct mm_struct *mm,
>>>>> + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
>>>>> +{
>>>>> + for (;;) {
>>>>> + pte_clear_not_present_full(mm, addr, ptep, full);
>>>>> + if (--nr == 0)
>>>>> + break;
>>>>> + ptep++;
>>>>> + addr += PAGE_SIZE;
>>>>> + }
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
>>>>> extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
>>>>> unsigned long address,
>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>> index f6f78198f000..5737236dc3ce 100644
>>>>> --- a/include/linux/swap.h
>>>>> +++ b/include/linux/swap.h
>>>>> @@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
>>>>> extern int swapcache_prepare(swp_entry_t);
>>>>> extern void swap_free(swp_entry_t);
>>>>> extern void swapcache_free_entries(swp_entry_t *entries, int n);
>>>>> -extern int free_swap_and_cache(swp_entry_t);
>>>>> +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
>>>>> int swap_type_of(dev_t device, sector_t offset);
>>>>> int find_first_swap(dev_t *device);
>>>>> extern unsigned int count_swap_pages(int, int);
>>>>> @@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
>>>>> #define free_pages_and_swap_cache(pages, nr) \
>>>>> release_pages((pages), (nr));
>>>>>
>>>>> -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
>>>>> -#define free_swap_and_cache(e) is_pfn_swap_entry(e)
>>>>> +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>>>>> +{
>>>>> +}
>>>>>
>>>>> static inline void free_swap_cache(struct folio *folio)
>>>>> {
>>>>> @@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>>>>> }
>>>>> #endif /* CONFIG_SWAP */
>>>>>
>>>>> +static inline void free_swap_and_cache(swp_entry_t entry)
>>>>> +{
>>>>> + free_swap_and_cache_nr(entry, 1);
>>>>> +}
>>>>> +
>>>>> #ifdef CONFIG_MEMCG
>>>>> static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
>>>>> {
>>>>> diff --git a/mm/internal.h b/mm/internal.h
>>>>> index 3bdc8693b54f..de68705624b0 100644
>>>>> --- a/mm/internal.h
>>>>> +++ b/mm/internal.h
>>>>> @@ -11,6 +11,8 @@
>>>>> #include <linux/mm.h>
>>>>> #include <linux/pagemap.h>
>>>>> #include <linux/rmap.h>
>>>>> +#include <linux/swap.h>
>>>>> +#include <linux/swapops.h>
>>>>> #include <linux/tracepoint-defs.h>
>>>>>
>>>>> struct folio_batch;
>>>>> @@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
>>>>>
>>>>> return min(ptep - start_ptep, max_nr);
>>>>> }
>>>>> +
>>>>> +/**
>>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
>>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
>>>>> + *
>>>>> + * Increments the swap offset, while maintaining all other fields, including
>>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
>>>>> + */
>>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
>>>>> +{
>>>>> + swp_entry_t entry = pte_to_swp_entry(pte);
>>>>> + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
>>>>> + swp_offset(entry) + 1));
>>>>> +
>>>>> + if (pte_swp_soft_dirty(pte))
>>>>> + new = pte_swp_mksoft_dirty(new);
>>>>> + if (pte_swp_exclusive(pte))
>>>>> + new = pte_swp_mkexclusive(new);
>>>>> + if (pte_swp_uffd_wp(pte))
>>>>> + new = pte_swp_mkuffd_wp(new);
>>>>
>>>> I don't quite understand this. If this page table entry is exclusive,
>>>> will its subsequent page table entry also be exclusive without
>>>> question?
>>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>>>
>>>> anon_exclusive = folio_test_anon(folio) &&
>>>> PageAnonExclusive(subpage);
>>>>
>>>> same questions also for diry, wp etc.
>>>
>>> Sorry for the noise. you are right. based on your new version, I think I should
>>> entirely drop:
>>>
>>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
>>> all swap entries are exclusive
>>
>> Yes. If we ever want to ignore some bits, we should likely add flags to
>> change the behavior, like for folio_pte_batch().
>>
>> For swapin, you really want the exclusive bits to match, though.
>
> I am not quite sure I definitely need exclusive bits to match. i can either
> drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
> swpin won't reuse the large folio, but it can still entirely map it read-only):
>
> diff --git a/mm/internal.h b/mm/internal.h
> index cae39c372bfc..5726e729c9ee 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
> *start_ptep, int max_nr, pte_t pte,
> *any_shared |= !pte_swp_exclusive(pte);
>
> while (ptep < end_ptep) {
> + pte_t ignore_exclusive_pte;
> + pte_t ignore_exclusive_expected_pte;
> pte = ptep_get(ptep);
>
> - if (!pte_same(pte, expected_pte))
> - break;
> + if (any_shared) {
> + ignore_exclusive_pte = pte;
> + ignore_exclusive_expected_pte = expected_pte;
> + ignore_exclusive_pte =
> pte_swp_clear_exclusive(ignore_exclusive_pte);
> + ignore_exclusive_expected_pte =
> pte_swp_clear_exclusive(expected_pte);
> +
> + if (!pte_same(ignore_exclusive_pte,
> ignore_exclusive_expected_pte))
> + break;
> + } else {
> + if (!pte_same(pte, expected_pte))
> + break;
> + }
>
> if (any_shared)
> *any_shared |= !pte_swp_exclusive(pte);
I'll leave David to comment on this proposal; I'm not sure I understand all the
details. The code change does look a bit "busy" though - sometimes that can be
an indicator :)
>
>> softdirty and uffd-wp as well at least initially for simplicity.
>
> yes for this.
>
> By the way, I wonder if you and Ryan have a moment to review swpin
> refault patchset
> v2 :-)
It's on my todo list! I'm very keen to get as much large swap-out and swap-in
support into v6.10 as we can. Hoping to get to it in the next couple of days.
>
> [PATCH v2 0/5] large folios swap-in: handle refault cases first
> https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/
>
>
>>
>> --
>> Cheers,
>>
>> David / dhildenb
>>
>
> Thanks
> Barry
On 09.04.24 11:55, Ryan Roberts wrote:
> On 09/04/2024 10:41, Barry Song wrote:
>> On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 09.04.24 11:22, Barry Song wrote:
>>>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>>
>>>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>> [patch quoted in full - snipped down to the hunk under discussion]
>>>>>> +
>>>>>> +/**
>>>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
>>>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
>>>>>> + *
>>>>>> + * Increments the swap offset, while maintaining all other fields, including
>>>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
>>>>>> + */
>>>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
>>>>>> +{
>>>>>> + swp_entry_t entry = pte_to_swp_entry(pte);
>>>>>> + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
>>>>>> + swp_offset(entry) + 1));
>>>>>> +
>>>>>> + if (pte_swp_soft_dirty(pte))
>>>>>> + new = pte_swp_mksoft_dirty(new);
>>>>>> + if (pte_swp_exclusive(pte))
>>>>>> + new = pte_swp_mkexclusive(new);
>>>>>> + if (pte_swp_uffd_wp(pte))
>>>>>> + new = pte_swp_mkuffd_wp(new);
>>>>>
>>>>> I don't quite understand this. If this page table entry is exclusive,
>>>>> will its subsequent page table entry also be exclusive without
>>>>> question?
>>>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
>>>>>
>>>>> anon_exclusive = folio_test_anon(folio) &&
>>>>> PageAnonExclusive(subpage);
>>>>>
>>>>> same questions also for diry, wp etc.
>>>>
>>>> Sorry for the noise. you are right. based on your new version, I think I should
>>>> entirely drop:
>>>>
>>>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
>>>> all swap entries are exclusive
>>>
>>> Yes. If we ever want to ignore some bits, we should likely add flags to
>>> change the behavior, like for folio_pte_batch().
>>>
>>> For swapin, you really want the exclusive bits to match, though.
>>
>> I am not quite sure I definitely need exclusive bits to match. i can either
>> drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
>> swpin won't reuse the large folio, but it can still entirely map it read-only):
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index cae39c372bfc..5726e729c9ee 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
>> *start_ptep, int max_nr, pte_t pte,
>> *any_shared |= !pte_swp_exclusive(pte);
>>
>> while (ptep < end_ptep) {
>> + pte_t ignore_exclusive_pte;
>> + pte_t ignore_exclusive_expected_pte;
>> pte = ptep_get(ptep);
>>
>> - if (!pte_same(pte, expected_pte))
>> - break;
>> + if (any_shared) {
>> + ignore_exclusive_pte = pte;
>> + ignore_exclusive_expected_pte = expected_pte;
>> + ignore_exclusive_pte =
>> pte_swp_clear_exclusive(ignore_exclusive_pte);
>> + ignore_exclusive_expected_pte =
>> pte_swp_clear_exclusive(expected_pte);
>> +
>> + if (!pte_same(ignore_exclusive_pte,
>> ignore_exclusive_expected_pte))
>> + break;
>> + } else {
>> + if (!pte_same(pte, expected_pte))
>> + break;
>> + }
>>
>> if (any_shared)
>> *any_shared |= !pte_swp_exclusive(pte);
>
> I'll leave David to comment on this proposal; I'm not sure I understand all the
> details. The code change does look a bit "busy" though - sometimes that can be
> an indicator :)
I'd prefer to keep it simple initially. Devil is in the detail for the
refault case: in the past, dropping an exclusive flag could have been
problematic with some O_DIRECT workloads that were not using FOLL_PIN.
IIUC, some still remain.
More details can be had from
https://lore.kernel.org/all/20230113171026.582290-1-david@redhat.com/
It might all change a bit if I manage to get folio_test_anon_exclusive()
flying. The current plan is that all individual swp PTEs would still
have pte_swp_exclusive() set, and we would *not* clear the
folio_test_anon_exclusive() flag while the folio is still in the
swapcache [which we do today to make fork() handling easier].
That will make refault significantly easier to handle regarding
exclusivity handling with large folios.
--
Cheers,
David / dhildenb
On Tue, Apr 9, 2024 at 9:55 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 09/04/2024 10:41, Barry Song wrote:
> > On Tue, Apr 9, 2024 at 9:24 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 09.04.24 11:22, Barry Song wrote:
> >>> On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@gmail.com> wrote:
> >>>>
> >>>> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>> [patch quoted in full - snipped down to the hunk under discussion]
> >>>>> +
> >>>>> +/**
> >>>>> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> >>>>> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
> >>>>> + *
> >>>>> + * Increments the swap offset, while maintaining all other fields, including
> >>>>> + * swap type, and any swp pte bits. The resulting pte is returned.
> >>>>> + */
> >>>>> +static inline pte_t pte_next_swp_offset(pte_t pte)
> >>>>> +{
> >>>>> + swp_entry_t entry = pte_to_swp_entry(pte);
> >>>>> + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> >>>>> + swp_offset(entry) + 1));
> >>>>> +
> >>>>> + if (pte_swp_soft_dirty(pte))
> >>>>> + new = pte_swp_mksoft_dirty(new);
> >>>>> + if (pte_swp_exclusive(pte))
> >>>>> + new = pte_swp_mkexclusive(new);
> >>>>> + if (pte_swp_uffd_wp(pte))
> >>>>> + new = pte_swp_mkuffd_wp(new);
> >>>>
> >>>> I don't quite understand this. If this page table entry is exclusive,
> >>>> will its subsequent page table entry also be exclusive without
> >>>> question?
> >>>> in try_to_unmap_one, exclusive is per-subpage but not per-folio:
> >>>>
> >>>> anon_exclusive = folio_test_anon(folio) &&
> >>>> PageAnonExclusive(subpage);
> >>>>
> >>>> same questions also for diry, wp etc.
> >>>
> >>> Sorry for the noise. you are right. based on your new version, I think I should
> >>> entirely drop:
> >>>
> >>> [PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
> >>> all swap entries are exclusive
> >>
> >> Yes. If we ever want to ignore some bits, we should likely add flags to
> >> change the behavior, like for folio_pte_batch().
> >>
> >> For swapin, you really want the exclusive bits to match, though.
> >
> > I am not quite sure I definitely need exclusive bits to match. i can either
> > drop my 3/5 or ignore the exclusive bit as below (if anyone is not shared,
> > swpin won't reuse the large folio, but it can still entirely map it read-only):
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index cae39c372bfc..5726e729c9ee 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -253,10 +253,22 @@ static inline int swap_pte_batch(pte_t
> > *start_ptep, int max_nr, pte_t pte,
> > *any_shared |= !pte_swp_exclusive(pte);
> >
> > while (ptep < end_ptep) {
> > + pte_t ignore_exclusive_pte;
> > + pte_t ignore_exclusive_expected_pte;
> > pte = ptep_get(ptep);
> >
> > - if (!pte_same(pte, expected_pte))
> > - break;
> > + if (any_shared) {
> > + ignore_exclusive_pte = pte;
> > + ignore_exclusive_expected_pte = expected_pte;
> > + ignore_exclusive_pte =
> > pte_swp_clear_exclusive(ignore_exclusive_pte);
> > + ignore_exclusive_expected_pte =
> > pte_swp_clear_exclusive(expected_pte);
> > +
> > + if (!pte_same(ignore_exclusive_pte,
> > ignore_exclusive_expected_pte))
> > + break;
> > + } else {
> > + if (!pte_same(pte, expected_pte))
> > + break;
> > + }
> >
> > if (any_shared)
> > *any_shared |= !pte_swp_exclusive(pte);
>
> I'll leave David to comment on this proposal; I'm not sure I understand all the
> details. The code change does look a bit "busy" though - sometimes that can be
> an indicator :)
Indeed, I wrote it in one minute.

I'm confident that the code can be written in a manner similar to
__pte_batch_clear_ignored(). I was only proposing the approach,
not selling the code :-)
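For illustration, that could look roughly like the sketch below. The helper name
is hypothetical, modelled on __pte_batch_clear_ignored(); it is a sketch of the
idea, not code from any posted patch:

/* Clear the swp pte bits the caller asked us to ignore before comparing. */
static inline pte_t __swap_pte_batch_clear_ignored(pte_t pte, bool ignore_exclusive)
{
	if (ignore_exclusive)
		pte = pte_swp_clear_exclusive(pte);
	return pte;
}

	/* ...then, inside the swap_pte_batch() loop: */
	if (!pte_same(__swap_pte_batch_clear_ignored(pte, ignore_exclusive),
		      __swap_pte_batch_clear_ignored(expected_pte, ignore_exclusive)))
		break;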
>
> >
> >> softdirty and uffd-wp as well at least initially for simplicity.
> >
> > yes for this.
> >
> > By the way, I wonder if you and Ryan have a moment to review swpin
> > refault patchset
> > v2 :-)
>
> It's on my todo list! I'm very keen to get as much large swap-out and swap-in
> support into v6.10 as we can. Hoping to get to it in the next couple of days.
>
> >
> > [PATCH v2 0/5] large folios swap-in: handle refault cases first
> > https://lore.kernel.org/linux-mm/20240409082631.187483-1-21cnbao@gmail.com/
> >
> >
> >>
> >> --
> >> Cheers,
> >>
> >> David / dhildenb
> >>
> >
Thanks
Barry
>
On 08.04.24 20:39, Ryan Roberts wrote:
> [patch quoted in full - snipped down to the pte_next_swp_offset() hunk commented on below]
> +
> +/**
> + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
> + * @pte: The initial pte state; is_swap_pte(pte) must be true.
Likely we also want non_swap_entry() to be false.
> + *
> + * Increments the swap offset, while maintaining all other fields, including
> + * swap type, and any swp pte bits. The resulting pte is returned.
> + */
> +static inline pte_t pte_next_swp_offset(pte_t pte)
> +{
> + swp_entry_t entry = pte_to_swp_entry(pte);
> + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
> + swp_offset(entry) + 1));
> +
> + if (pte_swp_soft_dirty(pte))
> + new = pte_swp_mksoft_dirty(new);
> + if (pte_swp_exclusive(pte))
> + new = pte_swp_mkexclusive(new);
> + if (pte_swp_uffd_wp(pte))
> + new = pte_swp_mkuffd_wp(new);
> +
> + return new;
> +}
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
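To illustrate the non_swap_entry() point above: pte_next_swp_offset() only makes
sense for genuine swap ptes, so a caller is expected to filter out migration,
hwpoison and device-private entries first. A minimal sketch of such a guard
(hypothetical helper, not from the patch):

static inline bool is_batchable_swap_pte(pte_t pte)
{
	swp_entry_t entry;

	if (!is_swap_pte(pte))
		return false;

	entry = pte_to_swp_entry(pte);

	/* Reject migration, hwpoison, device-private, etc. entries. */
	return !non_swap_entry(entry);
}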
Hi Andrew,
Could you please squash this into commit "mm: swap:
free_swap_and_cache_nr() as batched free_swap_and_cache()", which is
already in mm-unstable?
It fixes a build warning on parisc [1] due to their implementation of
__swp_entry_to_pte() not correctly putting the macro args in
parentheses. But it turns out that a bunch of other arches are also
faulty in this regard.
I'm also adding an extra statement to the documentation for
pte_next_swp_offset() as suggested by David.
[1] https://lore.kernel.org/all/202404091749.ScNPJ8j4-lkp@intel.com/
Thanks,
Ryan
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
mm/internal.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 9d3250b4a08a..22152e0c8494 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -202,7 +202,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 
 /**
  * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
- * @pte: The initial pte state; is_swap_pte(pte) must be true.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true and
+ * non_swap_entry() must be false.
  *
  * Increments the swap offset, while maintaining all other fields, including
  * swap type, and any swp pte bits. The resulting pte is returned.
@@ -211,7 +212,7 @@ static inline pte_t pte_next_swp_offset(pte_t pte)
 {
 	swp_entry_t entry = pte_to_swp_entry(pte);
 	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
-						   swp_offset(entry) + 1));
+						   (swp_offset(entry) + 1)));
 
 	if (pte_swp_soft_dirty(pte))
 		new = pte_swp_mksoft_dirty(new);
--
2.25.1
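For anyone wondering why the extra parentheses at the call site matter: '+'
binds tighter than '<<', so an arch __swp_entry() that shifts an unparenthesised
"offset" argument mis-parses "swp_offset(entry) + 1". A standalone userspace
illustration with made-up macros (not any real arch's definitions):

#include <assert.h>

/* Made-up swap-entry encodings; the first forgets to parenthesise "offset". */
#define SWP_ENTRY_UNPAREN(type, offset)	((unsigned long)(type) | offset << 5)
#define SWP_ENTRY_PAREN(type, offset)	((unsigned long)(type) | ((unsigned long)(offset) << 5))

int main(void)
{
	unsigned long off = 3;

	/* "off + 1 << 5" parses as "off + (1 << 5)", not "(off + 1) << 5". */
	assert(SWP_ENTRY_UNPAREN(0, off + 1) != (off + 1) << 5);
	assert(SWP_ENTRY_PAREN(0, off + 1) == (off + 1) << 5);

	/* Parenthesising at the call site, as the squashed fix above does,
	 * avoids the mis-parse even with the sloppy macro. */
	assert(SWP_ENTRY_UNPAREN(0, (off + 1)) == (off + 1) << 5);

	return 0;
}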