From: Kairui Song <kasong@tencent.com>
Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the
goal of the SWAP allocator is to avoid contention for clusters. It
therefore uses a per-CPU cluster design, and each CPU will be using a
different cluster as much as possible.
But HDDs are very sensitive to fragmentation; contention is trivial
compared to this. So just use one global cluster instead. This ensures
each order will be writing to the same cluster as much as possible,
which helps to make the IO more continuous.
This ensures the performance of the cluster allocator is as good as the
old allocator. Test results after this commit compared to before this
series:
make -j32 with tinyconfig, using 1G memcg limit and HDD swap:
Before this series:
114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
2901232inputs+0outputs (238877major+4227640minor)pagefaults
After this commit:
113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
2548728inputs+0outputs (235471major+4238110minor)pagefaults
Suggested-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
---
include/linux/swap.h | 2 ++
mm/swapfile.c | 48 ++++++++++++++++++++++++++++++++------------
2 files changed, 37 insertions(+), 13 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0e6c6bb385f0..9898b1881d4d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -319,6 +319,8 @@ struct swap_info_struct {
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+ struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f25d697f6736..6eb298a222c0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -798,7 +798,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- __this_cpu_write(si->percpu_cluster->next[order], next);
+ if (si->flags & SWP_SOLIDSTATE)
+ __this_cpu_write(si->percpu_cluster->next[order], next);
+ else
+ si->global_cluster->next[order] = next;
return found;
}
@@ -860,8 +863,14 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
unsigned int offset, found = 0;
/* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
+ if (si->flags & SWP_SOLIDSTATE) {
+ local_lock(&si->percpu_cluster->lock);
+ offset = __this_cpu_read(si->percpu_cluster->next[order]);
+ } else {
+ spin_lock(&si->global_cluster_lock);
+ offset = si->global_cluster->next[order];
+ }
+
if (offset) {
ci = lock_cluster(si, offset);
/* Cluster could have been used by another order */
@@ -960,8 +969,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
}
done:
- local_unlock(&si->percpu_cluster->lock);
-
+ if (si->flags & SWP_SOLIDSTATE)
+ local_unlock(&si->percpu_cluster->lock);
+ else
+ spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -2737,6 +2748,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ kfree(p->global_cluster);
+ p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
kvfree(cluster_info);
@@ -3142,17 +3155,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
+ if (si->flags & SWP_SOLIDSTATE) {
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ goto err_free;
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ local_lock_init(&cluster->lock);
+ }
+ } else {
+ si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
+ si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+ spin_lock_init(&si->global_cluster_lock);
}
/*
@@ -3426,6 +3446,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap:
free_percpu(si->percpu_cluster);
si->percpu_cluster = NULL;
+ kfree(si->global_cluster);
+ si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si);
swap_cgroup_swapoff(si->type);
--
2.47.0