From: Kairui Song <kasong@tencent.com>
Slot cache is no longer needed now, removing it and all related code.
- vm-scalability with `usemem --init-time -O -y -x -R -31 1G`, in a 12G
  memory cgroup, using simulated pmem as SWAP (32G pmem, 32 CPUs),
  16 test runs for each case, measuring the total throughput:
                        Before (KB/s)   (stdev)       After (KB/s)    (stdev)
Random (4K):            424907.60       (24410.78)    414745.92       (34554.78)
Random (64K):           163308.82       (11635.72)    167314.50       (18434.99)
Sequential (4K, !-R):   6150056.79      (103205.90)   6321469.06      (115878.16)
The performance changes are below noise level.
- Building the Linux kernel with `make -j96`, using 4K folios with a 1.5G
  memory cgroup limit and 64K folios with a 2G memory cgroup limit, on top
  of tmpfs, 12 test runs, measuring the system time:
                    Before (s)   (stdev)    After (s)   (stdev)
make -j96 (4K):     6445.69      (61.95)    6408.80     (69.46)
make -j96 (64K):    6841.71      (409.04)   6437.99     (435.55)
Similar to the above, the 4K case is within noise level, while the 64K
mTHP case shows a slight improvement.
Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
---
include/linux/swap.h | 3 -
include/linux/swap_slots.h | 28 ----
mm/Makefile | 2 +-
mm/swap_slots.c | 295 -------------------------------------
mm/swap_state.c | 8 +-
mm/swapfile.c | 194 ++++++++----------------
6 files changed, 67 insertions(+), 463 deletions(-)
delete mode 100644 include/linux/swap_slots.h
delete mode 100644 mm/swap_slots.c
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 374bffc87427..c5856dcc263a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -465,7 +465,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int);
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
-extern bool has_usable_swap(void);
/* Swap 50% full? Release swapcache more aggressively.. */
static inline bool vm_swap_full(void)
@@ -483,13 +482,11 @@ swp_entry_t folio_alloc_swap(struct folio *folio);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
-extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
deleted file mode 100644
index 840aec3523b2..000000000000
--- a/include/linux/swap_slots.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SWAP_SLOTS_H
-#define _LINUX_SWAP_SLOTS_H
-
-#include <linux/swap.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
-
-#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH
-#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE)
-#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE)
-
-struct swap_slots_cache {
- bool lock_initialized;
- struct mutex alloc_lock; /* protects slots, nr, cur */
- swp_entry_t *slots;
- int nr;
- int cur;
- int n_ret;
-};
-
-void disable_swap_slots_cache_lock(void);
-void reenable_swap_slots_cache_unlock(void);
-void enable_swap_slots_cache(void);
-
-extern bool swap_slot_cache_enabled;
-
-#endif /* _LINUX_SWAP_SLOTS_H */
diff --git a/mm/Makefile b/mm/Makefile
index 4510a9869e77..e7f6bbf8ae5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,7 +75,7 @@ ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
deleted file mode 100644
index 9c7c171df7ba..000000000000
--- a/mm/swap_slots.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Manage cache of swap slots to be used for and returned from
- * swap.
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * We allocate the swap slots from the global pool and put
- * it into local per cpu caches. This has the advantage
- * of no needing to acquire the swap_info lock every time
- * we need a new slot.
- *
- * There is also opportunity to simply return the slot
- * to local caches without needing to acquire swap_info
- * lock. We do not reuse the returned slots directly but
- * move them back to the global pool in a batch. This
- * allows the slots to coalesce and reduce fragmentation.
- *
- * The swap entry allocated is marked with SWAP_HAS_CACHE
- * flag in map_count that prevents it from being allocated
- * again from the global pool.
- *
- * The swap slots cache is protected by a mutex instead of
- * a spin lock as when we search for slots with scan_swap_map,
- * we can possibly sleep.
- */
-
-#include <linux/swap_slots.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-
-static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
-static bool swap_slot_cache_active;
-bool swap_slot_cache_enabled;
-static bool swap_slot_cache_initialized;
-static DEFINE_MUTEX(swap_slots_cache_mutex);
-/* Serialize swap slots cache enable/disable operations */
-static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
-
-static void __drain_swap_slots_cache(void);
-
-#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
-
-static void deactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = false;
- __drain_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-static void reactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = true;
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-/* Must not be called with cpu hot plug lock */
-void disable_swap_slots_cache_lock(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- swap_slot_cache_enabled = false;
- if (swap_slot_cache_initialized) {
- /* serialize with cpu hotplug operations */
- cpus_read_lock();
- __drain_swap_slots_cache();
- cpus_read_unlock();
- }
-}
-
-static void __reenable_swap_slots_cache(void)
-{
- swap_slot_cache_enabled = has_usable_swap();
-}
-
-void reenable_swap_slots_cache_unlock(void)
-{
- __reenable_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-static bool check_cache_active(void)
-{
- long pages;
-
- if (!swap_slot_cache_enabled)
- return false;
-
- pages = get_nr_swap_pages();
- if (!swap_slot_cache_active) {
- if (pages > num_online_cpus() *
- THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
- reactivate_swap_slots_cache();
- goto out;
- }
-
- /* if global pool of slot caches too low, deactivate cache */
- if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
- deactivate_swap_slots_cache();
-out:
- return swap_slot_cache_active;
-}
-
-static int alloc_swap_slot_cache(unsigned int cpu)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots;
-
- /*
- * Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and folio_alloc_swap,
- * which can lock swap_slots_cache_mutex.
- */
- slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots)
- return -ENOMEM;
-
- mutex_lock(&swap_slots_cache_mutex);
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots) {
- /* cache already allocated */
- mutex_unlock(&swap_slots_cache_mutex);
-
- kvfree(slots);
-
- return 0;
- }
-
- if (!cache->lock_initialized) {
- mutex_init(&cache->alloc_lock);
- cache->lock_initialized = true;
- }
- cache->nr = 0;
- cache->cur = 0;
- cache->n_ret = 0;
- /*
- * We initialized alloc_lock and free_lock earlier. We use
- * !cache->slots or !cache->slots_ret to know if it is safe to acquire
- * the corresponding lock and use the cache. Memory barrier below
- * ensures the assumption.
- */
- mb();
- cache->slots = slots;
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots)
-{
- struct swap_slots_cache *cache;
-
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots) {
- mutex_lock(&cache->alloc_lock);
- swapcache_free_entries(cache->slots + cache->cur, cache->nr);
- cache->cur = 0;
- cache->nr = 0;
- if (free_slots && cache->slots) {
- kvfree(cache->slots);
- cache->slots = NULL;
- }
- mutex_unlock(&cache->alloc_lock);
- }
-}
-
-static void __drain_swap_slots_cache(void)
-{
- unsigned int cpu;
-
- /*
- * This function is called during
- * 1) swapoff, when we have to make sure no
- * left over slots are in cache when we remove
- * a swap device;
- * 2) disabling of swap slot cache, when we run low
- * on swap slots when allocating memory and need
- * to return swap slots to global pool.
- *
- * We cannot acquire cpu hot plug lock here as
- * this function can be invoked in the cpu
- * hot plug path:
- * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> folio_alloc_swap
- * -> drain_swap_slots_cache
- *
- * Hence the loop over current online cpu below could miss cpu that
- * is being brought online but not yet marked as online.
- * That is okay as we do not schedule and run anything on a
- * cpu before it has been marked online. Hence, we will not
- * fill any swap slots in slots cache of such cpu.
- * There are no slots on such cpu that need to be drained.
- */
- for_each_online_cpu(cpu)
- drain_slots_cache_cpu(cpu, false);
-}
-
-static int free_slot_cache(unsigned int cpu)
-{
- mutex_lock(&swap_slots_cache_mutex);
- drain_slots_cache_cpu(cpu, true);
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-void enable_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- if (!swap_slot_cache_initialized) {
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
-
- swap_slot_cache_initialized = true;
- }
-
- __reenable_swap_slots_cache();
-out_unlock:
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-/* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
-{
- if (!use_swap_slot_cache)
- return 0;
-
- cache->cur = 0;
- if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
- cache->slots, 0);
-
- return cache->nr;
-}
-
-swp_entry_t folio_alloc_swap(struct folio *folio)
-{
- swp_entry_t entry;
- struct swap_slots_cache *cache;
-
- entry.val = 0;
-
- if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, folio_order(folio));
- goto out;
- }
-
- /*
- * Preemption is allowed here, because we may sleep
- * in refill_swap_slots_cache(). But it is safe, because
- * accesses to the per-CPU data structure are protected by the
- * mutex cache->alloc_lock.
- *
- * The alloc path here does not touch cache->slots_ret
- * so cache->free_lock is not taken.
- */
- cache = raw_cpu_ptr(&swp_slots);
-
- if (likely(check_cache_active() && cache->slots)) {
- mutex_lock(&cache->alloc_lock);
- if (cache->slots) {
-repeat:
- if (cache->nr) {
- entry = cache->slots[cache->cur];
- cache->slots[cache->cur++].val = 0;
- cache->nr--;
- } else if (refill_swap_slots_cache(cache)) {
- goto repeat;
- }
- }
- mutex_unlock(&cache->alloc_lock);
- if (entry.val)
- goto out;
- }
-
- get_swap_pages(1, &entry, 0);
-out:
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_folio(folio, entry);
- entry.val = 0;
- }
- return entry;
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 50840a2887a5..2b5744e211cd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -20,7 +20,6 @@
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
-#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
@@ -447,13 +446,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
*/
- if (!swap_entry_swapped(si, entry) && swap_slot_cache_enabled)
+ if (!swap_entry_swapped(si, entry))
goto put_and_return;
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8b296c4c636b..9bd95173865d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,7 +37,6 @@
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
-#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
@@ -885,16 +884,20 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
struct swap_cluster_info *ci;
unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
- if (si->flags & SWP_SOLIDSTATE) {
- if (si == this_cpu_read(percpu_swap_cluster.si[order]))
- offset = this_cpu_read(percpu_swap_cluster.offset[order]);
- } else {
+ /*
+ * Swapfile is not block device so unable
+ * to allocate large entries.
+ */
+ if (order && !(si->flags & SWP_BLKDEV))
+ return 0;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- }
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
- if (offset) {
ci = lock_cluster(si, offset);
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
@@ -1153,43 +1156,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_usage_sub(si, nr_entries);
}
-static int scan_swap_map_slots(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
-{
- unsigned int nr_pages = 1 << order;
- int n_ret = 0;
-
- if (order > 0) {
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP) ||
- nr_pages > SWAPFILE_CLUSTER) {
- VM_WARN_ON_ONCE(1);
- return 0;
- }
-
- /*
- * Swapfile is not block device so unable
- * to allocate large entries.
- */
- if (!(si->flags & SWP_BLKDEV))
- return 0;
- }
-
- while (n_ret < nr) {
- unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
-
- if (!offset)
- break;
- slots[n_ret++] = swp_entry(si->type, offset);
- }
-
- return n_ret;
-}
-
static bool get_swap_device_info(struct swap_info_struct *si)
{
if (!percpu_ref_tryget_live(&si->users))
@@ -1210,16 +1176,13 @@ static bool get_swap_device_info(struct swap_info_struct *si)
* Fast path try to get swap entries with specified order from current
* CPU's swap entry pool (a cluster).
*/
-static int swap_alloc_fast(swp_entry_t entries[],
+static int swap_alloc_fast(swp_entry_t *entry,
unsigned char usage,
- int order, int n_goal)
+ int order)
{
struct swap_cluster_info *ci;
struct swap_info_struct *si;
- unsigned int offset, found;
- int n_ret = 0;
-
- n_goal = min(n_goal, SWAP_BATCH);
+ unsigned int offset, found = SWAP_ENTRY_INVALID;
/*
* Once allocated, swap_info_struct will never be completely freed,
@@ -1228,46 +1191,48 @@ static int swap_alloc_fast(swp_entry_t entries[],
si = this_cpu_read(percpu_swap_cluster.si[order]);
offset = this_cpu_read(percpu_swap_cluster.offset[order]);
if (!si || !offset || !get_swap_device_info(si))
- return 0;
+ return false;
- while (offset) {
- ci = lock_cluster(si, offset);
- if (!cluster_is_usable(ci, order)) {
- unlock_cluster(ci);
- break;
- }
+ ci = lock_cluster(si, offset);
+ if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
- if (!found)
- break;
- entries[n_ret++] = swp_entry(si->type, found);
- if (n_ret == n_goal)
- break;
- offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (found)
+ *entry = swp_entry(si->type, found);
+ } else {
+ unlock_cluster(ci);
}
put_swap_device(si);
- return n_ret;
+ return !!found;
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+swp_entry_t folio_alloc_swap(struct folio *folio)
{
- int order = swap_entry_order(entry_order);
- unsigned long size = 1 << order;
+ unsigned int order = folio_order(folio);
+ unsigned int size = 1 << order;
struct swap_info_struct *si, *next;
- int n_ret = 0;
+ swp_entry_t entry = {};
+ unsigned long offset;
int node;
+ if (order) {
+ /*
+ * Should not even be attempting large allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return entry;
+ }
+ }
+
/* Fast path using percpu cluster */
local_lock(&percpu_swap_cluster.lock);
- n_ret = swap_alloc_fast(swp_entries,
- SWAP_HAS_CACHE,
- order, n_goal);
- if (n_ret == n_goal)
+ if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
goto out;
- n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
/* Rotate the device and switch to a new cluster */
spin_lock(&swap_avail_lock);
start_over:
@@ -1276,18 +1241,13 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- /*
- * For order 0 allocation, try best to fill the request
- * as it's used by slot cache.
- *
- * For mTHP allocation, it always have n_goal == 1,
- * and falling a mTHP swapin will just make the caller
- * fallback to order 0 allocation, so just bail out.
- */
- n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
- swp_entries + n_ret, order);
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si);
- if (n_ret || size > 1)
+ if (offset) {
+ entry = swp_entry(si->type, offset);
+ goto out;
+ }
+ if (order)
goto out;
}
@@ -1309,8 +1269,14 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
spin_unlock(&swap_avail_lock);
out:
local_unlock(&percpu_swap_cluster.lock);
- atomic_long_sub(n_ret * size, &nr_swap_pages);
- return n_ret;
+ /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
+ if (mem_cgroup_try_charge_swap(folio, entry)) {
+ put_swap_folio(folio, entry);
+ entry.val = 0;
+ }
+ if (entry.val)
+ atomic_long_sub(size, &nr_swap_pages);
+ return entry;
}
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
@@ -1606,25 +1572,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
unlock_cluster(ci);
}
-void swapcache_free_entries(swp_entry_t *entries, int n)
-{
- int i;
- struct swap_cluster_info *ci;
- struct swap_info_struct *si = NULL;
-
- if (n <= 0)
- return;
-
- for (i = 0; i < n; ++i) {
- si = _swap_info_get(entries[i]);
- if (si) {
- ci = lock_cluster(si, swp_offset(entries[i]));
- swap_entry_range_free(si, ci, entries[i], 1);
- unlock_cluster(ci);
- }
- }
-}
-
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si = swp_swap_info(entry);
@@ -1865,6 +1812,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ unsigned long offset;
swp_entry_t entry = {0};
if (!si)
@@ -1872,8 +1820,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
- atomic_long_dec(&nr_swap_pages);
+ if (si->flags & SWP_WRITEOK) {
+ offset = cluster_alloc_swap_entry(si, 0, 1);
+ if (offset) {
+ entry = swp_entry(si->type, offset);
+ atomic_long_dec(&nr_swap_pages);
+ }
+ }
put_swap_device(si);
}
fail:
@@ -2634,21 +2587,6 @@ static void reinsert_swap_info(struct swap_info_struct *si)
spin_unlock(&swap_lock);
}
-static bool __has_usable_swap(void)
-{
- return !plist_head_empty(&swap_active_head);
-}
-
-bool has_usable_swap(void)
-{
- bool ret;
-
- spin_lock(&swap_lock);
- ret = __has_usable_swap();
- spin_unlock(&swap_lock);
- return ret;
-}
-
/*
* Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
* see the updated flags, so there will be no more allocations.
@@ -2761,8 +2699,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
wait_for_allocation(p);
- disable_swap_slots_cache_lock();
-
set_current_oom_origin();
err = try_to_unuse(p->type);
clear_current_oom_origin();
@@ -2770,12 +2706,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
- reenable_swap_slots_cache_unlock();
goto out_dput;
}
- reenable_swap_slots_cache_unlock();
-
/*
* Wait for swap operations protected by get/put_swap_device()
* to complete. Because of synchronize_rcu() here, all swap
@@ -3525,8 +3458,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
putname(name);
if (inode)
inode_unlock(inode);
- if (!error)
- enable_swap_slots_cache();
return error;
}
@@ -3922,6 +3853,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static bool __has_usable_swap(void)
+{
+ return !plist_head_empty(&swap_active_head);
+}
+
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
--
2.48.1

Hi Kairui,
On Fri, Mar 14, 2025 at 12:59:34AM +0800, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Slot cache is no longer needed now, removing it and all related code.
...
> Signed-off-by: Kairui Song <kasong@tencent.com>
> Reviewed-by: Baoquan He <bhe@redhat.com>
> ---
> include/linux/swap.h | 3 -
> include/linux/swap_slots.h | 28 ----
> mm/Makefile | 2 +-
> mm/swap_slots.c | 295 -------------------------------------
> mm/swap_state.c | 8 +-
> mm/swapfile.c | 194 ++++++++----------------
> 6 files changed, 67 insertions(+), 463 deletions(-)
> delete mode 100644 include/linux/swap_slots.h
> delete mode 100644 mm/swap_slots.c
...
> diff --git a/mm/swapfile.c b/mm/swapfile.c
...
> +swp_entry_t folio_alloc_swap(struct folio *folio)
> {
> - int order = swap_entry_order(entry_order);
> - unsigned long size = 1 << order;
> + unsigned int order = folio_order(folio);
> + unsigned int size = 1 << order;
> struct swap_info_struct *si, *next;
> - int n_ret = 0;
> + swp_entry_t entry = {};
> + unsigned long offset;
> int node;
>
> + if (order) {
> + /*
> + * Should not even be attempting large allocations when huge
> + * page swap is disabled. Warn and fail the allocation.
> + */
> + if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
> + VM_WARN_ON_ONCE(1);
> + return entry;
> + }
> + }
This warning triggers on s390. CONFIG_THP_SWAP is disabled and order
is 8 when this triggers (reproduced with ltp's swapon01 test case):
------------[ cut here ]------------
WARNING: CPU: 1 PID: 895 at mm/swapfile.c:1227 folio_alloc_swap+0x438/0x440
Modules linked in:
CPU: 1 UID: 0 PID: 895 Comm: swapon01 Not tainted 6.14.0-rc6-00227-g0ff67f990bd4-dirty #25
Hardware name: IBM 3931 A01 704 (z/VM 7.4.0)
Krnl PSW : 0704d00180000000 000003ffe051210c (folio_alloc_swap+0x43c/0x440)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
Krnl GPRS: 0000000080000000 0000000000000001 0000000000000013 0000000000070000
0000000000000006 fffffef40e9da000 0000000000000000 0000037202fc4000
0000037f00000100 0000000000000100 0000037fe2e4b770 0000037202fc4000
0000000000000000 0000000000000000 000003ffe0512108 0000037fe2e4b3c8
Krnl Code: 000003ffe05120fe: b9160044 llgfr %r4,%r4
000003ffe0512102: c0e5ffdf8c0b brasl %r14,000003ffe0103918
#000003ffe0512108: af000000 mc 0,0
>000003ffe051210c: a7f4fe94 brc 15,000003ffe0511e34
000003ffe0512110: c0040069ce74 brcl 0,000003ffe124bdf8
000003ffe0512116: eb8ff0580024 stmg %r8,%r15,88(%r15)
000003ffe051211c: b90400ef lgr %r14,%r15
000003ffe0512120: e3f0ffb8ff71 lay %r15,-72(%r15)
Call Trace:
[<000003ffe051210c>] folio_alloc_swap+0x43c/0x440
[<000003ffe050afa6>] add_to_swap+0x56/0xf0
[<000003ffe045fdc0>] shrink_folio_list+0xe80/0x13b0
[<000003ffe0461946>] shrink_inactive_list+0x1a6/0x550
[<000003ffe04624a2>] shrink_lruvec+0x2b2/0x410
[<000003ffe0462840>] shrink_node_memcgs+0x240/0x2d0
[<000003ffe0462986>] shrink_node+0xb6/0x3e0
[<000003ffe046302a>] do_try_to_free_pages+0xda/0x610
[<000003ffe0464d2c>] try_to_free_mem_cgroup_pages+0x14c/0x2a0
[<000003ffe0568270>] try_charge_memcg+0x220/0x5d0
[<000003ffe056867a>] charge_memcg+0x5a/0x270
[<000003ffe056a484>] __mem_cgroup_charge+0x44/0x80
[<000003ffe04acf20>] alloc_anon_folio+0x280/0x610
[<000003ffe04ad45a>] do_anonymous_page+0x1aa/0x5e0
[<000003ffe04af4c4>] __handle_mm_fault+0x244/0x500
[<000003ffe04af820>] handle_mm_fault+0xa0/0x170
[<000003ffe01533f8>] do_exception+0x1d8/0x4a0
[<000003ffe11fb92a>] __do_pgm_check+0x13a/0x220
[<000003ffe120c3ce>] pgm_check_handler+0x11e/0x170
---[ end trace 0000000000000000 ]---

On Mon, Apr 28, 2025 at 9:53 PM Heiko Carstens <hca@linux.ibm.com> wrote:
>
> Hi Kairui,
>
> On Fri, Mar 14, 2025 at 12:59:34AM +0800, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Slot cache is no longer needed now, removing it and all related code.
> ...
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > Reviewed-by: Baoquan He <bhe@redhat.com>
> > ---
> > include/linux/swap.h | 3 -
> > include/linux/swap_slots.h | 28 ----
> > mm/Makefile | 2 +-
> > mm/swap_slots.c | 295 -------------------------------------
> > mm/swap_state.c | 8 +-
> > mm/swapfile.c | 194 ++++++++----------------
> > 6 files changed, 67 insertions(+), 463 deletions(-)
> > delete mode 100644 include/linux/swap_slots.h
> > delete mode 100644 mm/swap_slots.c
> ...
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> ...
> > +swp_entry_t folio_alloc_swap(struct folio *folio)
> > {
> > - int order = swap_entry_order(entry_order);
> > - unsigned long size = 1 << order;
> > + unsigned int order = folio_order(folio);
> > + unsigned int size = 1 << order;
> > struct swap_info_struct *si, *next;
> > - int n_ret = 0;
> > + swp_entry_t entry = {};
> > + unsigned long offset;
> > int node;
> >
> > + if (order) {
> > + /*
> > + * Should not even be attempting large allocations when huge
> > + * page swap is disabled. Warn and fail the allocation.
> > + */
> > + if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
> > + VM_WARN_ON_ONCE(1);
> > + return entry;
> > + }
> > + }
>
> This warning triggers on s390. CONFIG_THP_SWAP is disabled and order
> is 8 when this triggers (reproduced with ltp's swapon01 test case):

Hi Heiko,

Thanks for the report.
>
> ------------[ cut here ]------------
> WARNING: CPU: 1 PID: 895 at mm/swapfile.c:1227 folio_alloc_swap+0x438/0x440
> Modules linked in:
> CPU: 1 UID: 0 PID: 895 Comm: swapon01 Not tainted 6.14.0-rc6-00227-g0ff67f990bd4-dirty #25
> Hardware name: IBM 3931 A01 704 (z/VM 7.4.0)
> Krnl PSW : 0704d00180000000 000003ffe051210c (folio_alloc_swap+0x43c/0x440)
> R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
> Krnl GPRS: 0000000080000000 0000000000000001 0000000000000013 0000000000070000
> 0000000000000006 fffffef40e9da000 0000000000000000 0000037202fc4000
> 0000037f00000100 0000000000000100 0000037fe2e4b770 0000037202fc4000
> 0000000000000000 0000000000000000 000003ffe0512108 0000037fe2e4b3c8
> Krnl Code: 000003ffe05120fe: b9160044 llgfr %r4,%r4
> 000003ffe0512102: c0e5ffdf8c0b brasl %r14,000003ffe0103918
> #000003ffe0512108: af000000 mc 0,0
> >000003ffe051210c: a7f4fe94 brc 15,000003ffe0511e34
> 000003ffe0512110: c0040069ce74 brcl 0,000003ffe124bdf8
> 000003ffe0512116: eb8ff0580024 stmg %r8,%r15,88(%r15)
> 000003ffe051211c: b90400ef lgr %r14,%r15
> 000003ffe0512120: e3f0ffb8ff71 lay %r15,-72(%r15)
> Call Trace:
> [<000003ffe051210c>] folio_alloc_swap+0x43c/0x440
> [<000003ffe050afa6>] add_to_swap+0x56/0xf0
> [<000003ffe045fdc0>] shrink_folio_list+0xe80/0x13b0
> [<000003ffe0461946>] shrink_inactive_list+0x1a6/0x550
> [<000003ffe04624a2>] shrink_lruvec+0x2b2/0x410
> [<000003ffe0462840>] shrink_node_memcgs+0x240/0x2d0
> [<000003ffe0462986>] shrink_node+0xb6/0x3e0
> [<000003ffe046302a>] do_try_to_free_pages+0xda/0x610
> [<000003ffe0464d2c>] try_to_free_mem_cgroup_pages+0x14c/0x2a0
> [<000003ffe0568270>] try_charge_memcg+0x220/0x5d0
> [<000003ffe056867a>] charge_memcg+0x5a/0x270
> [<000003ffe056a484>] __mem_cgroup_charge+0x44/0x80
> [<000003ffe04acf20>] alloc_anon_folio+0x280/0x610
> [<000003ffe04ad45a>] do_anonymous_page+0x1aa/0x5e0
> [<000003ffe04af4c4>] __handle_mm_fault+0x244/0x500
> [<000003ffe04af820>] handle_mm_fault+0xa0/0x170
> [<000003ffe01533f8>] do_exception+0x1d8/0x4a0
> [<000003ffe11fb92a>] __do_pgm_check+0x13a/0x220
> [<000003ffe120c3ce>] pgm_check_handler+0x11e/0x170
> ---[ end trace 0000000000000000 ]---
>

The !CONFIG_THP_SWAP check existed before because slot cache should
reject high order allocation. But slot cache is gone, so large
allocation will directly go to the allocator.

It was not a meaningful WARN in the first place, and now the allocator
should just fail silently for high order allocation, that's totally
fine and expected and will just inform the caller to split the folio.

I'll just change the WARN_ON condition to `if (order && size >
SWAPFILE_CLUSTER)` then, this should silence the WARN.

On Mon, Apr 28, 2025 at 11:31:59PM +0800, Kairui Song wrote:
> On Mon, Apr 28, 2025 at 9:53 PM Heiko Carstens <hca@linux.ibm.com> wrote:
> > > + if (order) {
> > > + /*
> > > + * Should not even be attempting large allocations when huge
> > > + * page swap is disabled. Warn and fail the allocation.
> > > + */
> > > + if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
> > > + VM_WARN_ON_ONCE(1);
> > > + return entry;
> > > + }
> > > + }
>
> The !CONFIG_THP_SWAP check existed before because slot cache should
> reject high order allocation. But slot cache is gone, so large
> allocation will directly go to the allocator.
>
> It was not a meaningful WARN in the first place, and now the allocator
> should just fail silently for high order allocation, that's totally
> fine and expected and will just inform the caller to split the folio.
>
> I'll just change the WARN_ON condition to `if (order && size >
> SWAPFILE_CLUSTER)` then, this should silence the WARN.
If I understand your suggestion correctly then this would be the
resulting code:
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2eff8b51a945..5a7797143948 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1276,7 +1276,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
* Should not even be attempting large allocations when huge
* page swap is disabled. Warn and fail the allocation.
*/
- if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
+ if (order && size > SWAPFILE_CLUSTER) {
VM_WARN_ON_ONCE(1);
return -EINVAL;
}
However, with that change I get this splat (and a few more) instead:
------------[ cut here ]------------
DEBUG_LOCKS_WARN_ON(l->owner != current)
WARNING: CPU: 4 PID: 934 at ./include/linux/local_lock_internal.h:52 folio_alloc_swap+0x22e/0x560
Modules linked in:
CPU: 4 UID: 0 PID: 934 Comm: swapon01 Not tainted 6.15.0-rc4-00021-gca91b9500108-dirty #2 PREEMPT
Hardware name: IBM 3931 A01 704 (z/VM 7.4.0)
Krnl PSW : 0704f00180000000 000003ffe05231b2 (folio_alloc_swap+0x232/0x560)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 RI:0 EA:3
Krnl GPRS: 000003ff80000002 0000000000000003 0000000000000028 000003ffe2fe6c90
0000000000000003 000003ff80000003 000003ffe19fc538 000003720322c000
0000000000000000 000002f2ab38f000 000002f200000100 000003720322c000
0000000000000000 000002f3f0eda278 000003ffe05231ae 0000037febbdb3f8
Krnl Code: 000003ffe05231a2: c020008e69bc larl %r2,000003ffe16f051a
000003ffe05231a8: c0e5ffe2f4dc brasl %r14,000003ffe0181b60
#000003ffe05231ae: af000000 mc 0,0
>000003ffe05231b2: a7f4ff92 brc 15,000003ffe05230d6
000003ffe05231b6: a7080001 lhi %r0,1
000003ffe05231ba: a7a80001 lhi %r10,1
000003ffe05231be: a7980000 lhi %r9,0
000003ffe05231c2: a7f4ff04 brc 15,000003ffe0522fca
Call Trace:
[<000003ffe05231b2>] folio_alloc_swap+0x232/0x560
[<000003ffe046d2d2>] shrink_folio_list+0xe02/0x12d0
[<000003ffe046edc8>] shrink_inactive_list+0x188/0x5a0
[<000003ffe046f7d4>] shrink_lruvec+0x104/0x400
[<000003ffe046fd0c>] shrink_node_memcgs+0x23c/0x2c0
[<000003ffe046fe3a>] shrink_node+0xaa/0x420
[<000003ffe0470500>] do_try_to_free_pages+0xd0/0x5c0
[<000003ffe0472144>] try_to_free_mem_cgroup_pages+0x144/0x290
[<000003ffe057b33a>] try_charge_memcg+0x1ca/0x420
[<000003ffe057b5ea>] charge_memcg+0x5a/0x1a0
[<000003ffe057d5d4>] __mem_cgroup_charge+0x44/0x160
[<000003ffe04bb6ac>] alloc_anon_folio+0x27c/0x5d0
[<000003ffe04bbbac>] do_anonymous_page+0x1ac/0x7b0
[<000003ffe04bdec2>] __handle_mm_fault+0x212/0x4c0
[<000003ffe04be20c>] handle_mm_fault+0x9c/0x230
[<000003ffe015422c>] do_exception+0x1dc/0x540
[<000003ffe125c560>] __do_pgm_check+0x130/0x220
[<000003ffe126e3ae>] pgm_check_handler+0x11e/0x170
INFO: lockdep is turned off.
Last Breaking-Event-Address:
[<000003ffe0181c96>] __warn_printk+0x136/0x140
irq event stamp: 4120751
hardirqs last enabled at (4120751): [<000003ffe126d0ee>] _raw_spin_unlock_irq+0x3e/0x80
hardirqs last disabled at (4120750): [<000003ffe126cca8>] _raw_spin_lock_irq+0x98/0xd0
softirqs last enabled at (4104986): [<000003ffe018c1fc>] handle_softirqs+0x2fc/0x550
softirqs last disabled at (4104969): [<000003ffe018b9c6>] __irq_exit_rcu+0x126/0x140
---[ end trace 0000000000000000 ]---

On Tue, Apr 29, 2025 at 3:31 PM Heiko Carstens <hca@linux.ibm.com> wrote:
>
> On Mon, Apr 28, 2025 at 11:31:59PM +0800, Kairui Song wrote:
> > On Mon, Apr 28, 2025 at 9:53 PM Heiko Carstens <hca@linux.ibm.com> wrote:
> > > > + if (order) {
> > > > + /*
> > > > + * Should not even be attempting large allocations when huge
> > > > + * page swap is disabled. Warn and fail the allocation.
> > > > + */
> > > > + if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
> > > > + VM_WARN_ON_ONCE(1);
> > > > + return entry;
> > > > + }
> > > > + }
> >
> > The !CONFIG_THP_SWAP check existed before because slot cache should
> > reject high order allocation. But slot cache is gone, so large
> > allocation will directly go to the allocator.
> >
> > It was not a meaningful WARN in the first place, and now the allocator
> > should just fail silently for high order allocation, that's totally
> > fine and expected and will just inform the caller to split the folio.
> >
> > I'll just change the WARN_ON condition to `if (order && size >
> > SWAPFILE_CLUSTER)` then, this should silence the WARN.
>
> If I understand your suggestion correctly then this would be the
> resulting code:
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 2eff8b51a945..5a7797143948 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1276,7 +1276,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
> * Should not even be attempting large allocations when huge
> * page swap is disabled. Warn and fail the allocation.
> */
> - if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
> + if (order && size > SWAPFILE_CLUSTER) {
> VM_WARN_ON_ONCE(1);
> return -EINVAL;
> }
>
> However, with that change I get this splat (and a few more) instead:

Sorry, my bad: the allocator needs to fail silently, not ignore the
failure and go on. So it should be:
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e727021b8e2c..b86637cfb17a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
- VM_WARN_ON_ONCE(1);
- return -EINVAL;
+ if (order) {
+ /*
+ * Reject large allocation when THP_SWAP is disabled,
+ * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
}
local_lock(&percpu_swap_cluster.lock);
---
I've tested this locally and it seems to work well; I'll send a patch to fix it.
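
For reference, a minimal sketch of how a reclaim-path caller is expected
to react to such a failure, i.e. split the large folio and retry at
order 0. This assumes the post-fix int-returning
folio_alloc_swap(folio, gfp) shown above (0 on success) and the existing
split_folio_to_list() helper; the function name and overall structure
below are illustrative only, not the actual mm/vmscan.c code:

#include <linux/swap.h>
#include <linux/huge_mm.h>
#include <linux/gfp.h>

/*
 * Illustrative sketch (not actual kernel code): handle folio_alloc_swap()
 * failing for a large folio (e.g. -EAGAIN when CONFIG_THP_SWAP is
 * disabled) by splitting the folio and retrying with order-0 entries.
 */
static int swap_out_or_split(struct folio *folio, struct list_head *folio_list,
			     gfp_t gfp)
{
	int err;

	err = folio_alloc_swap(folio, gfp);
	if (!err)
		return 0;		/* got an entry of folio_order(folio) */

	/* Order-0 allocation failed: swap space is genuinely full. */
	if (!folio_test_large(folio))
		return err;

	/* Large allocation was rejected: split and retry at order 0. */
	err = split_folio_to_list(folio, folio_list);
	if (err)
		return err;

	return folio_alloc_swap(folio, gfp);
}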