From: Kairui Song <kasong@tencent.com>

This series removes the global swap cgroup lock. The critical section of
this lock is minimal, but it is still a bottleneck for massively parallel
swap workloads.

The improvement from this series is more significant on top of the si lock
rework:
https://lore.kernel.org/linux-mm/20241022192451.38138-1-ryncsn@gmail.com/
But this series works very well on its own.

Testing with a 64G brd device, building the kernel with make -j96 in a 1.5G
memory cgroup using 4k folios showed the improvement below (10 test runs):

Before this series:
Sys time: 10730.08 (stdev 49.030728)
Real time: 171.03 (stdev 0.850355)

After this series:
Sys time: 9612.24 (stdev 66.310789), -10.42%
Real time: 159.78 (stdev 0.577193), -6.57%

With 64k folios and a 2G memcg:

Before this series:
Sys time: 7626.77 (stdev 43.545517)
Real time: 136.22 (stdev 1.265544)

After this series:
Sys time: 6936.03 (stdev 39.996280), -9.06%
Real time: 129.65 (stdev 0.880039), -4.82%

Sequential swapout of 8G 4k zero folios (24 test runs):

Before this series: 5461409.12 us (stdev 183957.827084)
After this series:  5420447.26 us (stdev 196419.240317)

Kairui Song (4):
  mm, memcontrol: avoid duplicated memcg enable check
  mm/swap_cgroup: remove swap_cgroup_cmpxchg
  mm/swap_cgroup: simplify swap cgroup definitions
  mm, swap_cgroup: remove global swap cgroup lock

 include/linux/swap_cgroup.h |   2 -
 mm/memcontrol.c             |   2 +-
 mm/swap_cgroup.c            | 110 ++++++++++++++++--------------------
 3 files changed, 51 insertions(+), 63 deletions(-)

--
2.47.0
From: Kairui Song <kasong@tencent.com>

mem_cgroup_uncharge_swap() performs a mem_cgroup_disabled() check, which the
caller here has already done. Skip the duplicate check by calling
__mem_cgroup_uncharge_swap() directly.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -XXX,XX +XXX,XX @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 		 * let's not wait for it. The page already received a
 		 * memory+swap charge, drop the swap entry duplicate.
 		 */
-		mem_cgroup_uncharge_swap(entry, nr_pages);
+		__mem_cgroup_uncharge_swap(entry, nr_pages);
 	}
 }
--
2.47.0
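[Editor's note] For reference, the wrapper being bypassed looks roughly like
the sketch below (based on the mem_cgroup_uncharge_swap() helper in
include/linux/swap.h; shown only to make the reasoning concrete, it is not
part of this patch):

	static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
						    unsigned int nr_pages)
	{
		/* this is the check the caller has already performed */
		if (mem_cgroup_disabled())
			return;
		__mem_cgroup_uncharge_swap(entry, nr_pages);
	}

Calling the double-underscore variant directly simply avoids repeating the
mem_cgroup_disabled() test on this path.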
From: Kairui Song <kasong@tencent.com>

This function is never used after commit 6b611388b626 ("memcg-v1: remove
charge move code").

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 include/linux/swap_cgroup.h |  2 --
 mm/swap_cgroup.c            | 29 -----------------------------
 2 files changed, 31 deletions(-)

diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index XXXXXXX..XXXXXXX 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -XXX,XX +XXX,XX @@
 #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)

-extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-					unsigned short old, unsigned short new);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
					 unsigned int nr_ents);
 extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -XXX,XX +XXX,XX @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 	return &ctrl->map[offset];
 }

-/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
- *
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
- */
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-				   unsigned short old, unsigned short new)
-{
-	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned long flags;
-	unsigned short retval;
-
-	sc = lookup_swap_cgroup(ent, &ctrl);
-
-	spin_lock_irqsave(&ctrl->lock, flags);
-	retval = sc->id;
-	if (retval == old)
-		sc->id = new;
-	else
-		retval = 0;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-	return retval;
-}
-
 /**
  * swap_cgroup_record - record mem_cgroup for a set of swap entries
  * @ent: the first swap entry to be recorded into
--
2.47.0
From: Kairui Song <kasong@tencent.com>

Remove the intermediate struct swap_cgroup; it is just a wrapper around an
unsigned short, so drop it to simplify the code.

Also zero the map on initialization to prevent unexpected behaviour, as the
swap cgroup helpers are supposed to return 0 on error.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap_cgroup.c | 45 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -XXX,XX +XXX,XX @@ struct swap_cgroup {
 };

 struct swap_cgroup_ctrl {
-	struct swap_cgroup *map;
+	unsigned short *map;
 	spinlock_t lock;
 };

 static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

-#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
-
 /*
  * SwapCgroup implements "lookup" and "exchange" operations.
  * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
@@ -XXX,XX +XXX,XX @@ static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
  *
  * TODO: we can push these buffers out to HIGHMEM.
  */
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
-					struct swap_cgroup_ctrl **ctrlp)
-{
-	pgoff_t offset = swp_offset(ent);
-	struct swap_cgroup_ctrl *ctrl;
-
-	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
-	if (ctrlp)
-		*ctrlp = ctrl;
-	return &ctrl->map[offset];
-}
-
 /**
  * swap_cgroup_record - record mem_cgroup for a set of swap entries
  * @ent: the first swap entry to be recorded into
@@ -XXX,XX +XXX,XX @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 			    unsigned int nr_ents)
 {
 	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
+	unsigned short *map;
 	unsigned short old;
 	unsigned long flags;
 	pgoff_t offset = swp_offset(ent);
 	pgoff_t end = offset + nr_ents;

-	sc = lookup_swap_cgroup(ent, &ctrl);
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	map = ctrl->map;

 	spin_lock_irqsave(&ctrl->lock, flags);
-	old = sc->id;
-	for (; offset < end; offset++, sc++) {
-		VM_BUG_ON(sc->id != old);
-		sc->id = id;
-	}
+	old = map[offset];
+	do {
+		VM_BUG_ON(map[offset] != old);
+		map[offset] = id;
+	} while (++offset != end);
 	spin_unlock_irqrestore(&ctrl->lock, flags);

 	return old;
@@ -XXX,XX +XXX,XX @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
  */
 unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
+	struct swap_cgroup_ctrl *ctrl;
+
 	if (mem_cgroup_disabled())
 		return 0;
-	return lookup_swap_cgroup(ent, NULL)->id;
+
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	pgoff_t offset = swp_offset(ent);
+
+	return READ_ONCE(ctrl->map[offset]);
 }

 int swap_cgroup_swapon(int type, unsigned long max_pages)
 {
-	struct swap_cgroup *map;
+	void *map;
 	struct swap_cgroup_ctrl *ctrl;

 	if (mem_cgroup_disabled())
 		return 0;

-	map = vcalloc(max_pages, sizeof(struct swap_cgroup));
+	map = vzalloc(max_pages * sizeof(unsigned short));
 	if (!map)
 		goto nomem;

@@ -XXX,XX +XXX,XX @@ int swap_cgroup_swapon(int type, unsigned long max_pages)

 void swap_cgroup_swapoff(int type)
 {
-	struct swap_cgroup *map;
+	void *map;
 	struct swap_cgroup_ctrl *ctrl;

 	if (mem_cgroup_disabled())
--
2.47.0
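[Editor's note] With the wrapper gone, a lookup is conceptually just a flat
array index per swap device, roughly as sketched here using the identifiers
from the diff above (id 0 means no cgroup was recorded for that entry):

	unsigned short *map = swap_cgroup_ctrl[swp_type(ent)].map;
	unsigned short id = READ_ONCE(map[swp_offset(ent)]);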
From: Kairui Song <kasong@tencent.com>

commit e9e58a4ec3b1 ("memcg: avoid use cmpxchg in swap cgroup maintainance")
replaced the cmpxchg/xchg with a global irq spinlock because some
architectures don't support 2-byte cmpxchg/xchg. Clearly this won't scale
well, and as commented in swap_cgroup.c, this lock is not needed to
synchronize the map.

Emulating a 2-byte cmpxchg/xchg with a word-sized atomic isn't hard, so
implement it to get rid of this lock.

Testing with a 64G brd device, building the kernel with make -j96 in a 1.5G
memory cgroup using 4k folios showed the improvement below (10 test runs):

Before this series:
Sys time: 10730.08 (stdev 49.030728)
Real time: 171.03 (stdev 0.850355)

After this commit:
Sys time: 9612.24 (stdev 66.310789), -10.42%
Real time: 159.78 (stdev 0.577193), -6.57%

With 64k folios and a 2G memcg:

Before this series:
Sys time: 7626.77 (stdev 43.545517)
Real time: 136.22 (stdev 1.265544)

After this commit:
Sys time: 6936.03 (stdev 39.996280), -9.06%
Real time: 129.65 (stdev 0.880039), -4.82%

Sequential swapout of 8G 4k zero folios (24 test runs):

Before this series: 5461409.12 us (stdev 183957.827084)
After this commit:  5420447.26 us (stdev 196419.240317)

Sequential swapin of 8G 4k zero folios (24 test runs):

Before this series: 19736958.916667 us (stdev 189027.246676)
After this commit:  19662182.629630 us (stdev 172717.640614)

Performance is better or at least not worse for all tests above.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap_cgroup.c | 56 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -XXX,XX +XXX,XX @@
 #include <linux/swapops.h> /* depends on mm.h include */

+#define ID_PER_UNIT (sizeof(atomic_t) / sizeof(unsigned short))
+struct swap_cgroup_unit {
+	union {
+		int raw;
+		atomic_t val;
+		unsigned short __id[ID_PER_UNIT];
+	};
+};
+
 static DEFINE_MUTEX(swap_cgroup_mutex);

 struct swap_cgroup {
@@ -XXX,XX +XXX,XX @@ struct swap_cgroup {
 };

 struct swap_cgroup_ctrl {
-	unsigned short *map;
-	spinlock_t lock;
+	union {
+		struct swap_cgroup_unit *units;
+		unsigned short *map;
+	};
 };

 static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

@@ -XXX,XX +XXX,XX @@ static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
  *
  * TODO: we can push these buffers out to HIGHMEM.
  */
+static unsigned short __swap_cgroup_xchg(void *map,
+					 pgoff_t offset,
+					 unsigned int new_id)
+{
+	unsigned int old_id;
+	struct swap_cgroup_unit *units = map;
+	struct swap_cgroup_unit *unit = &units[offset / ID_PER_UNIT];
+	struct swap_cgroup_unit new, old = { .raw = atomic_read(&unit->val) };
+
+	do {
+		new.raw = old.raw;
+		old_id = old.__id[offset % ID_PER_UNIT];
+		new.__id[offset % ID_PER_UNIT] = new_id;
+	} while (!atomic_try_cmpxchg(&unit->val, &old.raw, new.raw));
+
+	return old_id;
+}
+
 /**
  * swap_cgroup_record - record mem_cgroup for a set of swap entries
  * @ent: the first swap entry to be recorded into
@@ -XXX,XX +XXX,XX @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 			    unsigned int nr_ents)
 {
 	struct swap_cgroup_ctrl *ctrl;
-	unsigned short *map;
-	unsigned short old;
-	unsigned long flags;
 	pgoff_t offset = swp_offset(ent);
 	pgoff_t end = offset + nr_ents;
+	unsigned short old, iter;
+	unsigned short *map;

 	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
 	map = ctrl->map;

-	spin_lock_irqsave(&ctrl->lock, flags);
-	old = map[offset];
+	old = READ_ONCE(map[offset]);
 	do {
-		VM_BUG_ON(map[offset] != old);
-		map[offset] = id;
+		iter = __swap_cgroup_xchg(map, offset, id);
+		VM_BUG_ON(iter != old);
 	} while (++offset != end);
-	spin_unlock_irqrestore(&ctrl->lock, flags);

 	return old;
 }
@@ -XXX,XX +XXX,XX @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)

 int swap_cgroup_swapon(int type, unsigned long max_pages)
 {
-	void *map;
+	struct swap_cgroup_unit *units;
 	struct swap_cgroup_ctrl *ctrl;

 	if (mem_cgroup_disabled())
 		return 0;

-	map = vzalloc(max_pages * sizeof(unsigned short));
-	if (!map)
+	units = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_UNIT) *
+			sizeof(struct swap_cgroup_unit));
+	if (!units)
 		goto nomem;

 	ctrl = &swap_cgroup_ctrl[type];
 	mutex_lock(&swap_cgroup_mutex);
-	ctrl->map = map;
-	spin_lock_init(&ctrl->lock);
+	ctrl->units = units;
 	mutex_unlock(&swap_cgroup_mutex);

 	return 0;
--
2.47.0
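[Editor's note] The idea behind this patch can be demonstrated outside the
kernel. Below is a minimal userspace sketch of the same union-based
emulation, using C11 atomics instead of the kernel's atomic_t API; the names
id_xchg and id_unit are invented for illustration only:

	/* Emulate a 16-bit xchg with a 32-bit compare-exchange loop. */
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	#define IDS_PER_UNIT (sizeof(uint32_t) / sizeof(uint16_t))

	union id_unit {
		uint32_t raw;
		uint16_t id[IDS_PER_UNIT];
	};

	static uint16_t id_xchg(_Atomic uint32_t *units, size_t offset,
				uint16_t new_id)
	{
		_Atomic uint32_t *unit = &units[offset / IDS_PER_UNIT];
		union id_unit old, new;
		uint16_t old_id;

		old.raw = atomic_load(unit);
		do {
			new.raw = old.raw;
			old_id = old.id[offset % IDS_PER_UNIT];
			new.id[offset % IDS_PER_UNIT] = new_id;
			/* retry only if a neighbouring id changed under us */
		} while (!atomic_compare_exchange_weak(unit, &old.raw, new.raw));

		return old_id;
	}

	int main(void)
	{
		_Atomic uint32_t units[2] = { 0 };

		id_xchg(units, 3, 42);
		printf("%u\n", id_xchg(units, 3, 7)); /* prints 42 */
		return 0;
	}

The compare-exchange loop only retries when the other id sharing the same
word changed concurrently, so no lock is required.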
From: Kairui Song <kasong@tencent.com>

This series removes the global swap cgroup lock. The critical section of
this lock is very short, but it is still a bottleneck for massively parallel
swap workloads.

Up to 10% performance gain for a tmpfs kernel build test on a 48c96t system,
and no regression for other cases:

Testing with a 64G brd device, building the kernel with make -j96 in a 1.5G
memory cgroup using 4k folios showed the improvement below (10 test runs):

Before this series:
Sys time: 10809.46 (stdev 80.831491)
Real time: 171.41 (stdev 1.239894)

After this series:
Sys time: 9621.26 (stdev 34.620000), -10.42%
Real time: 160.00 (stdev 0.497814), -6.57%

With 64k folios and a 2G memcg:

Before this series:
Sys time: 8231.99 (stdev 30.030994)
Real time: 143.57 (stdev 0.577394)

After this series:
Sys time: 7403.47 (stdev 6.270000), -10.06%
Real time: 135.18 (stdev 0.605000), -5.84%

Sequential swapout of 8G 64k zero folios (24 test runs):

Before this series: 5461409.12 us (stdev 183957.827084)
After this series:  5420447.26 us (stdev 196419.240317)

Sequential swapin of 8G 4k zero folios (24 test runs):

Before this series: 19736958.916667 us (stdev 189027.246676)
After this series:  19662182.629630 us (stdev 172717.640614)

V1: https://lore.kernel.org/linux-mm/20241202184154.19321-1-ryncsn@gmail.com/

Updates:
- Collect Review and Ack tags.
- Use bit shifts instead of a mixed usage of short and atomic for emulating
  the 2-byte xchg [Chris Li].
- Merge patch 3 into patch 4 for simplicity [Roman Gushchin].
- Drop the mem_cgroup_disabled() call in patch 1 instead, and fix a
  bot-reported build error [Yosry Ahmed].
- Wrap access to the atomic_t map with proper helpers, so the emulation can
  be dropped in favour of a native 2-byte xchg once available.

Kairui Song (3):
  mm, memcontrol: avoid duplicated memcg enable check
  mm/swap_cgroup: remove swap_cgroup_cmpxchg
  mm, swap_cgroup: remove global swap cgroup lock

 include/linux/swap_cgroup.h |  2 -
 mm/memcontrol.c             |  2 +-
 mm/swap_cgroup.c            | 96 ++++++++++++++++---------------------
 3 files changed, 43 insertions(+), 57 deletions(-)

--
2.47.1
From: Kairui Song <kasong@tencent.com>

mem_cgroup_uncharge_swap() includes a mem_cgroup_disabled() check, so the
caller doesn't need to check that.

Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Chris Li <chrisl@kernel.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -XXX,XX +XXX,XX @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
 	 * page to memory here, and uncharge swap when the slot is freed.
 	 */
-	if (!mem_cgroup_disabled() && do_memsw_account()) {
+	if (do_memsw_account()) {
 		/*
 		 * The swap entry might not get freed for a long time,
 		 * let's not wait for it. The page already received a
--
2.47.1
From: Kairui Song <kasong@tencent.com>

This function is never used after commit 6b611388b626 ("memcg-v1: remove
charge move code").

Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Chris Li <chrisl@kernel.org>
---
 include/linux/swap_cgroup.h |  2 --
 mm/swap_cgroup.c            | 29 -----------------------------
 2 files changed, 31 deletions(-)

diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index XXXXXXX..XXXXXXX 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -XXX,XX +XXX,XX @@
 #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)

-extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-					unsigned short old, unsigned short new);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
					 unsigned int nr_ents);
 extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -XXX,XX +XXX,XX @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 	return &ctrl->map[offset];
 }

-/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
- *
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
- */
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-				   unsigned short old, unsigned short new)
-{
-	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned long flags;
-	unsigned short retval;
-
-	sc = lookup_swap_cgroup(ent, &ctrl);
-
-	spin_lock_irqsave(&ctrl->lock, flags);
-	retval = sc->id;
-	if (retval == old)
-		sc->id = new;
-	else
-		retval = 0;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-	return retval;
-}
-
 /**
  * swap_cgroup_record - record mem_cgroup for a set of swap entries
  * @ent: the first swap entry to be recorded into
--
2.47.1
From: Kairui Song <kasong@tencent.com>

commit e9e58a4ec3b1 ("memcg: avoid use cmpxchg in swap cgroup maintainance")
replaced the cmpxchg/xchg with a global irq spinlock because some
architectures don't support 2-byte cmpxchg/xchg. Clearly this won't scale
well, and as commented in swap_cgroup.c, this lock is not needed to
synchronize the map.

Emulating a 2-byte xchg with an atomic cmpxchg isn't hard, so implement it
to get rid of this lock. Two helpers are introduced for doing so, and they
can easily be dropped once a generic 2-byte xchg becomes available.

Testing with a 64G brd device, building the kernel with make -j96 in a 1.5G
memory cgroup using 4k folios showed the improvement below (10 test runs):

Before this series:
Sys time: 10809.46 (stdev 80.831491)
Real time: 171.41 (stdev 1.239894)

After this commit:
Sys time: 9621.26 (stdev 34.620000), -10.42%
Real time: 160.00 (stdev 0.497814), -6.57%

With 64k folios and a 2G memcg:

Before this series:
Sys time: 8231.99 (stdev 30.030994)
Real time: 143.57 (stdev 0.577394)

After this commit:
Sys time: 7403.47 (stdev 6.270000), -10.06%
Real time: 135.18 (stdev 0.605000), -5.84%

Sequential swapout of 8G 64k zero folios with madvise (24 test runs):

Before this series: 5461409.12 us (stdev 183957.827084)
After this commit:  5420447.26 us (stdev 196419.240317)

Sequential swapin of 8G 4k zero folios (24 test runs):

Before this series: 19736958.916667 us (stdev 189027.246676)
After this commit:  19662182.629630 us (stdev 172717.640614)

Performance is better or at least not worse for all tests above.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap_cgroup.c | 73 +++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 28 deletions(-)

diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -XXX,XX +XXX,XX @@
 static DEFINE_MUTEX(swap_cgroup_mutex);

+/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */
+#define ID_PER_SC (sizeof(atomic_t) / sizeof(unsigned short))
+#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
+#define ID_MASK (BIT(ID_SHIFT) - 1)
 struct swap_cgroup {
-	unsigned short id;
+	atomic_t ids;
 };

 struct swap_cgroup_ctrl {
 	struct swap_cgroup *map;
-	spinlock_t lock;
 };

 static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

-#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
-
 /*
  * SwapCgroup implements "lookup" and "exchange" operations.
  * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
@@ -XXX,XX +XXX,XX @@ static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
  *   SwapCache(and its swp_entry) is under lock.
  * - When called via swap_free(), there is no user of this entry and no race.
  * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
  */
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
-					struct swap_cgroup_ctrl **ctrlp)
+static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
+					      pgoff_t offset)
 {
-	pgoff_t offset = swp_offset(ent);
-	struct swap_cgroup_ctrl *ctrl;
+	unsigned int shift = (offset & 1) ? 0 : ID_SHIFT;
+	unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);

-	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
-	if (ctrlp)
-		*ctrlp = ctrl;
-	return &ctrl->map[offset];
+	return (old_ids & (ID_MASK << shift)) >> shift;
+}
+
+static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
+					    pgoff_t offset,
+					    unsigned short new_id)
+{
+	unsigned short old_id;
+	unsigned int shift = (offset & 1) ? 0 : ID_SHIFT;
+	struct swap_cgroup *sc = &map[offset / ID_PER_SC];
+	unsigned int new_ids, old_ids = atomic_read(&sc->ids);
+
+	do {
+		old_id = (old_ids & (ID_MASK << shift)) >> shift;
+		new_ids = (old_ids & ~(ID_MASK << shift));
+		new_ids |= ((unsigned int)new_id) << shift;
+	} while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
+
+	return old_id;
 }

 /**
@@ -XXX,XX +XXX,XX @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 			    unsigned int nr_ents)
 {
 	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned short old;
-	unsigned long flags;
 	pgoff_t offset = swp_offset(ent);
 	pgoff_t end = offset + nr_ents;
+	unsigned short old, iter;
+	struct swap_cgroup *map;

-	sc = lookup_swap_cgroup(ent, &ctrl);
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	map = ctrl->map;

-	spin_lock_irqsave(&ctrl->lock, flags);
-	old = sc->id;
-	for (; offset < end; offset++, sc++) {
-		VM_BUG_ON(sc->id != old);
-		sc->id = id;
-	}
-	spin_unlock_irqrestore(&ctrl->lock, flags);
+	old = __swap_cgroup_id_lookup(map, offset);
+	do {
+		iter = __swap_cgroup_id_xchg(map, offset, id);
+		VM_BUG_ON(iter != old);
+	} while (++offset != end);

 	return old;
 }
@@ -XXX,XX +XXX,XX @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
  */
 unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
+	struct swap_cgroup_ctrl *ctrl;
+
 	if (mem_cgroup_disabled())
 		return 0;
-	return lookup_swap_cgroup(ent, NULL)->id;
+
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
 }

 int swap_cgroup_swapon(int type, unsigned long max_pages)
@@ -XXX,XX +XXX,XX @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
 	if (mem_cgroup_disabled())
 		return 0;

-	map = vcalloc(max_pages, sizeof(struct swap_cgroup));
+	BUILD_BUG_ON(!ID_PER_SC);
+	map = vcalloc(DIV_ROUND_UP(max_pages, ID_PER_SC),
+		      sizeof(struct swap_cgroup));
 	if (!map)
 		goto nomem;

 	ctrl = &swap_cgroup_ctrl[type];
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl->map = map;
-	spin_lock_init(&ctrl->lock);
 	mutex_unlock(&swap_cgroup_mutex);

 	return 0;
--
2.47.1
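[Editor's note] The shift/mask variant adopted in this v2 can also be shown
in isolation. Below is a minimal userspace sketch using C11 atomics rather
than the kernel's atomic_t/atomic_try_cmpxchg API; the name id_xchg() is
invented for the example:

	/* Swap one 16-bit id packed inside a 32-bit word with a CAS loop. */
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	#define ID_SHIFT 16
	#define ID_MASK  0xffffu

	static uint16_t id_xchg(_Atomic uint32_t *ids, unsigned long offset,
				uint16_t new_id)
	{
		unsigned int shift = (offset & 1) ? 0 : ID_SHIFT;
		_Atomic uint32_t *word = &ids[offset / 2];
		uint32_t old_ids = atomic_load(word);
		uint32_t new_ids;

		do {
			new_ids = old_ids & ~(ID_MASK << shift);
			new_ids |= (uint32_t)new_id << shift;
			/* on failure old_ids is reloaded and the masking redone */
		} while (!atomic_compare_exchange_weak(word, &old_ids, new_ids));

		return (old_ids >> shift) & ID_MASK;
	}

	int main(void)
	{
		_Atomic uint32_t ids[4] = { 0 };

		id_xchg(ids, 5, 123);
		printf("%u\n", id_xchg(ids, 5, 45)); /* prints 123 */
		return 0;
	}

This mirrors the __swap_cgroup_id_lookup()/__swap_cgroup_id_xchg() helpers
above; because the map access is wrapped in helpers, the emulation could
later be replaced by a native 2-byte xchg without touching the callers, as
noted in the cover letter.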