From: Muchun Song <songmuchun@bytedance.com>
Pagecache pages are charged at allocation time and hold a reference
to the original memory cgroup until reclaimed. Depending on memory
pressure, page sharing patterns between different cgroups and cgroup
creation/destruction rates, many dying memory cgroups can be pinned
by pagecache pages, reducing page reclaim efficiency and wasting
memory. Converting LRU folios and most other raw memory cgroup pins
to object cgroups fixes this long-standing problem.
Finally, folio->memcg_data of LRU folios and kmem folios will always
point to an object cgroup pointer. The folio->memcg_data of slab
folios will point to a vector of object cgroups.
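For illustration (not part of the diff below): resolving a folio's memcg now
always goes through the objcg indirection, so a folio charged to a cgroup
that has since been removed transparently resolves to a live ancestor after
reparenting, e.g.:

	struct obj_cgroup *objcg;
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/* stable under folio lock, LRU isolation or an exclusive reference */
	objcg = folio_objcg(folio);
	/* may already be an ancestor if the original memcg was reparented */
	memcg = obj_cgroup_memcg(objcg);
	rcu_read_unlock();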
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/memcontrol.h | 77 +++++----------
mm/memcontrol-v1.c | 15 +--
mm/memcontrol.c | 189 +++++++++++++++++++++++--------------
3 files changed, 150 insertions(+), 131 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85265b28c5d18..9be52ce72f2c5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -369,9 +369,6 @@ enum objext_flags {
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
#ifdef CONFIG_MEMCG
-
-static inline bool folio_memcg_kmem(struct folio *folio);
-
/*
* After the initialization objcg->memcg is always pointing at
* a valid memcg, but can be atomically swapped to the parent memcg.
@@ -385,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
}
/*
- * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
- * @folio: Pointer to the folio.
- *
- * Returns a pointer to the memory cgroup associated with the folio,
- * or NULL. This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * kmem folios.
- */
-static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
-{
- unsigned long memcg_data = folio->memcg_data;
-
- VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
-/*
- * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * folio_objcg - get the object cgroup associated with a folio.
* @folio: Pointer to the folio.
*
* Returns a pointer to the object cgroup associated with the folio,
* or NULL. This function assumes that the folio is known to have a
- * proper object cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * LRU folios.
+ * proper object cgroup pointer.
*/
-static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
+static inline struct obj_cgroup *folio_objcg(struct folio *folio)
{
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
@@ -435,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* proper memory cgroup pointer. It's not safe to call this function
* against some type of folios, e.g. slab folios or ex-slab folios.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * For a folio any of the following ensures folio and objcg binding stability:
*
* - the folio lock
* - LRU isolation
* - exclusive reference
*
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * Based on the stable binding of folio and objcg, for a folio any of the
+ * following ensures folio and memcg binding stability:
+ *
+ * - cgroup_mutex
+ * - the lruvec lock
+ *
+ * If the caller only wants to ensure that the page counters of the memcg
+ * are updated correctly, the binding stability of folio and objcg is
+ * sufficient.
+ *
+ * Note: The caller should hold an rcu read lock or cgroup_mutex to protect
+ * memcg associated with a folio from being released.
*/
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
- if (folio_memcg_kmem(folio))
- return obj_cgroup_memcg(__folio_objcg(folio));
- return __folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
+
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
/*
@@ -473,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio)
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * The page and objcg or memcg binding rules can refer to folio_memcg().
*
- * - the folio lock
- * - LRU isolation
- * - exclusive reference
- *
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * A caller should hold an rcu read lock to protect memcg associated with a
+ * page from being released.
*/
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
@@ -490,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
* for slabs, READ_ONCE() should be used here.
*/
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
+ struct obj_cgroup *objcg;
if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
-
- objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 6eed14bff7426..23c07df2063c8 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -591,6 +591,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_entries;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -602,12 +603,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
if (!do_memsw_account())
return;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
@@ -625,7 +627,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
@@ -646,7 +648,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
- css_put(&memcg->css);
+ rcu_read_unlock();
+ obj_cgroup_put(objcg);
}
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3daa99a0c65fe..cd2f0f0c0f5ce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -223,22 +223,55 @@ static inline void __memcg_reparent_objcgs(struct mem_cgroup *src,
static inline void reparent_locks(struct mem_cgroup *src, struct mem_cgroup *dst)
{
+ int nid, nest = 0;
+
spin_lock_irq(&objcg_lock);
+ for_each_node(nid) {
+ spin_lock_nested(&mem_cgroup_lruvec(src,
+ NODE_DATA(nid))->lru_lock, nest++);
+ spin_lock_nested(&mem_cgroup_lruvec(dst,
+ NODE_DATA(nid))->lru_lock, nest++);
+ }
}
static inline void reparent_unlocks(struct mem_cgroup *src, struct mem_cgroup *dst)
{
+ int nid;
+
+ for_each_node(nid) {
+ spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock);
+ spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock);
+ }
spin_unlock_irq(&objcg_lock);
}
+static void memcg_reparent_lru_folios(struct mem_cgroup *src,
+ struct mem_cgroup *dst)
+{
+ if (lru_gen_enabled())
+ lru_gen_reparent_memcg(src, dst);
+ else
+ lru_reparent_memcg(src, dst);
+}
+
static void memcg_reparent_objcgs(struct mem_cgroup *src)
{
struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
struct mem_cgroup *dst = parent_mem_cgroup(src);
+retry:
+ if (lru_gen_enabled())
+ max_lru_gen_memcg(dst);
+
reparent_locks(src, dst);
+ if (lru_gen_enabled() && !recheck_lru_gen_max_memcg(dst)) {
+ reparent_unlocks(src, dst);
+ cond_resched();
+ goto retry;
+ }
__memcg_reparent_objcgs(src, dst);
+ memcg_reparent_lru_folios(src, dst);
reparent_unlocks(src, dst);
@@ -989,6 +1022,8 @@ struct mem_cgroup *get_mem_cgroup_from_current(void)
/**
* get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
* @folio: folio from which memcg should be extracted.
+ *
+ * The folio and objcg or memcg binding rules can refer to folio_memcg().
*/
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
@@ -2557,17 +2592,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
-static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
{
VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
/*
- * Any of the following ensures page's memcg stability:
+ * Any of the following ensures folio's objcg stability:
*
* - the page lock
* - LRU isolation
* - exclusive reference
*/
- folio->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)objcg;
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
@@ -2671,6 +2706,17 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
return NULL;
}
+static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+
+ rcu_read_lock();
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ rcu_read_unlock();
+
+ return objcg;
+}
+
static struct obj_cgroup *current_objcg_update(void)
{
struct mem_cgroup *memcg;
@@ -2771,17 +2817,10 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_online())
- return NULL;
-
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
+ objcg = folio_objcg(folio);
+ if (objcg)
obj_cgroup_get(objcg);
- } else {
- rcu_read_lock();
- objcg = __get_obj_cgroup_from_memcg(__folio_memcg(folio));
- rcu_read_unlock();
- }
+
return objcg;
}
@@ -3288,7 +3327,7 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
return;
new_refs = (1 << (old_order - new_order)) - 1;
- css_get_many(&__folio_memcg(folio)->css, new_refs);
+ obj_cgroup_get_many(folio_objcg(folio), new_refs);
}
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -4737,16 +4776,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
- int ret;
-
- ret = try_charge(memcg, gfp, folio_nr_pages(folio));
- if (ret)
- goto out;
+ int ret = 0;
+ struct obj_cgroup *objcg;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ /* Do not account at the root objcg level. */
+ if (!obj_cgroup_is_root(objcg))
+ ret = try_charge(memcg, gfp, folio_nr_pages(folio));
+ if (ret) {
+ obj_cgroup_put(objcg);
+ return ret;
+ }
+ commit_charge(folio, objcg);
memcg1_commit_charge(folio, memcg);
-out:
+
return ret;
}
@@ -4832,7 +4875,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
}
struct uncharge_gather {
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
@@ -4846,58 +4889,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(ug->objcg);
if (ug->nr_memory) {
- memcg_uncharge(ug->memcg, ug->nr_memory);
+ memcg_uncharge(memcg, ug->nr_memory);
if (ug->nr_kmem) {
- mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
- memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
+ mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
+ memcg1_account_kmem(memcg, -ug->nr_kmem);
}
- memcg1_oom_recover(ug->memcg);
+ memcg1_oom_recover(memcg);
}
- memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ rcu_read_unlock();
/* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
+ obj_cgroup_put(ug->objcg);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
long nr_pages;
- struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * folio memcg or objcg at this point, we have fully
- * exclusive access to the folio.
+ * folio objcg at this point, we have fully exclusive
+ * access to the folio.
*/
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
- /*
- * This get matches the put at the end of the function and
- * kmem pages do not hold memcg references anymore.
- */
- memcg = get_mem_cgroup_from_objcg(objcg);
- } else {
- memcg = __folio_memcg(folio);
- }
-
- if (!memcg)
+ objcg = folio_objcg(folio);
+ if (!objcg)
return;
- if (ug->memcg != memcg) {
- if (ug->memcg) {
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = memcg;
+ ug->objcg = objcg;
ug->nid = folio_nid(folio);
- /* pairs with css_put in uncharge_batch */
- css_get(&memcg->css);
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(objcg);
}
nr_pages = folio_nr_pages(folio);
@@ -4905,20 +4942,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
-
- folio->memcg_data = 0;
- obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
ug->nr_memory += nr_pages;
ug->pgpgout++;
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
- folio->memcg_data = 0;
}
- css_put(&memcg->css);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
}
void __mem_cgroup_uncharge(struct folio *folio)
@@ -4942,7 +4976,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
uncharge_gather_clear(&ug);
for (i = 0; i < folios->nr; i++)
uncharge_folio(folios->folios[i], &ug);
- if (ug.memcg)
+ if (ug.objcg)
uncharge_batch(&ug);
}
@@ -4959,6 +4993,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(new);
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
@@ -4973,21 +5008,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
if (folio_memcg_charged(new))
return;
- memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
- if (!memcg)
+ objcg = folio_objcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, old);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/* Force-charge the new page. The old one will be freed soon */
- if (!mem_cgroup_is_root(memcg)) {
+ if (!obj_cgroup_is_root(objcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
}
- css_get(&memcg->css);
- commit_charge(new, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(new, objcg);
memcg1_commit_charge(new, memcg);
+ rcu_read_unlock();
}
/**
@@ -5003,7 +5041,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
*/
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
@@ -5014,18 +5052,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
if (mem_cgroup_disabled())
return;
- memcg = folio_memcg(old);
+ objcg = folio_objcg(old);
/*
- * Note that it is normal to see !memcg for a hugetlb folio.
+ * Note that it is normal to see !objcg for a hugetlb folio.
* For e.g, it could have been allocated when memory_hugetlb_accounting
* was not selected.
*/
- VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
- if (!memcg)
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old);
+ if (!objcg)
return;
- /* Transfer the charge and the css ref */
- commit_charge(new, memcg);
+ /* Transfer the charge and the objcg ref */
+ commit_charge(new, objcg);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(old));
@@ -5200,22 +5238,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
if (do_memsw_account())
return 0;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return 0;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ rcu_read_unlock();
return 0;
}
memcg = mem_cgroup_id_get_online(memcg);
+ /* memcg is pinned by memcg ID. */
+ rcu_read_unlock();
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
--
2.20.1
Hi Qi.
On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
> @@ -5200,22 +5238,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
> unsigned int nr_pages = folio_nr_pages(folio);
> struct page_counter *counter;
> struct mem_cgroup *memcg;
> + struct obj_cgroup *objcg;
>
> if (do_memsw_account())
> return 0;
>
> - memcg = folio_memcg(folio);
> -
> - VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> - if (!memcg)
> + objcg = folio_objcg(folio);
> + VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
> + if (!objcg)
> return 0;
>
> + rcu_read_lock();
> + memcg = obj_cgroup_memcg(objcg);
> if (!entry.val) {
> memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> + rcu_read_unlock();
> return 0;
> }
>
> memcg = mem_cgroup_id_get_online(memcg);
> + /* memcg is pined by memcg ID. */
> + rcu_read_unlock();
>
> if (!mem_cgroup_is_root(memcg) &&
> !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
Later there is:
swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
As per the comment, the memcg remains pinned by the ID, which is associated
with a swap slot, i.e. potentially for an unbounded time (shmem).
(This was actually brought up by Yosry in the stats subthread [1].)
I think that should be tackled too to eliminate the problem completely.
As I look at the code, these memcg IDs (private [2]) could be converted
to objcg IDs so that reparenting applies also to folios that are
currently swapped out. (Or convert to swap_cgroup_ctrl from the vector
of IDs to a vector of objcg pointers, depending on space.)
Thanks,
Michal
[1] https://lore.kernel.org/r/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
[2] https://lore.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev
On Mon, Jan 05, 2026 at 11:41:46AM +0100, Michal Koutný wrote:
> Hi Qi.
>
> On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
>
> > @@ -5200,22 +5238,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
> > unsigned int nr_pages = folio_nr_pages(folio);
> > struct page_counter *counter;
> > struct mem_cgroup *memcg;
> > + struct obj_cgroup *objcg;
> >
> > if (do_memsw_account())
> > return 0;
> >
> > - memcg = folio_memcg(folio);
> > -
> > - VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> > - if (!memcg)
> > + objcg = folio_objcg(folio);
> > + VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
> > + if (!objcg)
> > return 0;
> >
> > + rcu_read_lock();
> > + memcg = obj_cgroup_memcg(objcg);
> > if (!entry.val) {
> > memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> > + rcu_read_unlock();
> > return 0;
> > }
> >
> > memcg = mem_cgroup_id_get_online(memcg);
> > + /* memcg is pined by memcg ID. */
> > + rcu_read_unlock();
> >
> > if (!mem_cgroup_is_root(memcg) &&
> > !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
>
> Later there is:
> swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
>
> As per the comment memcg remains pinned by the ID which is associated
> with a swap slot, i.e. theoretically time unbound (shmem).
> (This was actually brought up by Yosry in stats subthread [1])
>
> I think that should be tackled too to eliminate the problem completely.
FWIW, I am not sure that swap entries are the last cause of pinning memcgs;
I am pretty sure there will be others that we haven't found yet. This is
why I think we shouldn't assume that the time between offlining and
releasing a memcg is short or bounded when fixing the stats problem.
>
> As I look at the code, these memcg IDs (private [2]) could be converted
> to objcg IDs so that reparenting applies also to folios that are
> currently swapped out. (Or convert to swap_cgroup_ctrl from the vector
> of IDs to a vector of objcg pointers, depending on space.)
I think we can do objcg IDs, but be careful to keep the same behavior as
today and avoid exhausting the 16-bit ID space. So we need to also
drop the ref to the objcg ID when the memcg is offlined and the objcg is
reparented, such that the objcg ID is deleted unless there are swapped
out entries.
I think this can be done on top of this series, not necessarily as part
of it.
>
> Thanks,
> Michal
>
> [1] https://lore.kernel.org/r/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
> [2] https://lore.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev
On 1/6/26 12:14 AM, Yosry Ahmed wrote:
> On Mon, Jan 05, 2026 at 11:41:46AM +0100, Michal Koutný wrote:
>> Hi Qi.
>>
>> On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
>>
>>> @@ -5200,22 +5238,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
>>> unsigned int nr_pages = folio_nr_pages(folio);
>>> struct page_counter *counter;
>>> struct mem_cgroup *memcg;
>>> + struct obj_cgroup *objcg;
>>>
>>> if (do_memsw_account())
>>> return 0;
>>>
>>> - memcg = folio_memcg(folio);
>>> -
>>> - VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
>>> - if (!memcg)
>>> + objcg = folio_objcg(folio);
>>> + VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
>>> + if (!objcg)
>>> return 0;
>>>
>>> + rcu_read_lock();
>>> + memcg = obj_cgroup_memcg(objcg);
>>> if (!entry.val) {
>>> memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
>>> + rcu_read_unlock();
>>> return 0;
>>> }
>>>
>>> memcg = mem_cgroup_id_get_online(memcg);
>>> + /* memcg is pined by memcg ID. */
>>> + rcu_read_unlock();
>>>
>>> if (!mem_cgroup_is_root(memcg) &&
>>> !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
>>
>> Later there is:
>> swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
>>
>> As per the comment memcg remains pinned by the ID which is associated
>> with a swap slot, i.e. theoretically time unbound (shmem).
>> (This was actually brought up by Yosry in stats subthread [1])
>>
>> I think that should be tackled too to eliminate the problem completely.
>
> FWIW, I am not sure if swap entries is the last cause of pinning memcgs,
> I am pretty sure there will be others that we haven't found yet. This is
Agree.
> why I think we shouldn't assume that the time between offlining and
> releasing a memcg is short or bounded when fixing the stats problem.
If I have not misunderstood your suggestion in the other thread, I plan
to do the following in v3:
1. define a memcgv1-only function:
void memcg1_reparent_state_local(struct mem_cgroup *memcg,
				 struct mem_cgroup *parent)
{
	int i;

	synchronize_rcu();

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		int idx = memcg1_stats[i];
		unsigned long value = memcg_page_state_local(memcg, idx);

		mod_memcg_page_state_local(parent, idx, value);
	}
}
2. call it after reparent_unlocks():
memcg_reparent_objcgs
  --> objcg = __memcg_reparent_objcgs(memcg, parent);
      reparent_unlocks(memcg, parent);
      reparent_state_local(memcg, parent);
        --> memcg1_reparent_state_local()
>
>>
>> As I look at the code, these memcg IDs (private [2]) could be converted
>> to objcg IDs so that reparenting applies also to folios that are
>> currently swapped out. (Or convert to swap_cgroup_ctrl from the vector
>> of IDs to a vector of objcg pointers, depending on space.)
>
> I think we can do objcg IDs, but be careful to keep the same behavior as
> today and avoid overexhausting the 16 bit ID space. So we need to also
> drop the ref to the objcg ID when the memcg is offlined and the objcg is
> reparented, such that the objcg ID is deleted unless there are swapped
> out entries.
>
> I think this can be done on top of this series, not necessarily as part
> of it.
Agree, I prefer to address this issue in a separate patchset.
Thanks,
Qi
>
>>
>> Thanks,
>> Michal
>>
>> [1] https://lore.kernel.org/r/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
>> [2] https://lore.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev
>
>
On Tue, Jan 06, 2026 at 03:08:57PM +0800, Qi Zheng wrote:
>
>
> On 1/6/26 12:14 AM, Yosry Ahmed wrote:
> > On Mon, Jan 05, 2026 at 11:41:46AM +0100, Michal Koutný wrote:
> > > Hi Qi.
> > >
> > > On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
> > >
> > > > @@ -5200,22 +5238,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
> > > > unsigned int nr_pages = folio_nr_pages(folio);
> > > > struct page_counter *counter;
> > > > struct mem_cgroup *memcg;
> > > > + struct obj_cgroup *objcg;
> > > > if (do_memsw_account())
> > > > return 0;
> > > > - memcg = folio_memcg(folio);
> > > > -
> > > > - VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> > > > - if (!memcg)
> > > > + objcg = folio_objcg(folio);
> > > > + VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
> > > > + if (!objcg)
> > > > return 0;
> > > > + rcu_read_lock();
> > > > + memcg = obj_cgroup_memcg(objcg);
> > > > if (!entry.val) {
> > > > memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> > > > + rcu_read_unlock();
> > > > return 0;
> > > > }
> > > > memcg = mem_cgroup_id_get_online(memcg);
> > > > + /* memcg is pined by memcg ID. */
> > > > + rcu_read_unlock();
> > > > if (!mem_cgroup_is_root(memcg) &&
> > > > !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
> > >
> > > Later there is:
> > > swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
> > >
> > > As per the comment memcg remains pinned by the ID which is associated
> > > with a swap slot, i.e. theoretically time unbound (shmem).
> > > (This was actually brought up by Yosry in stats subthread [1])
> > >
> > > I think that should be tackled too to eliminate the problem completely.
> >
> > FWIW, I am not sure if swap entries is the last cause of pinning memcgs,
> > I am pretty sure there will be others that we haven't found yet. This is
>
> Agree.
>
> > why I think we shouldn't assume that the time between offlining and
> > releasing a memcg is short or bounded when fixing the stats problem.
>
> If I have not misunderstood your suggestion in the other thread, I plan
> to do the following in v3:
>
> 1. define a memcgv1-only function:
>
> void memcg1_reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup
> *parent)
> {
> int i;
>
> synchronize_rcu();
>
> for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> int idx = memcg1_stats[i];
> unsigned long value = memcg_page_state_local(memcg, idx);
>
> mod_memcg_page_state_local(parent, idx, value);
> }
> }
>
> 2. call it after reparent_unlocks():
>
> memcg_reparent_objcgs
> --> objcg = __memcg_reparent_objcgs(memcg, parent);
> reparent_unlocks(memcg, parent);
> reparent_state_local(memcg, parent);
> --> memcg1_reparent_state_local()
Something like that, yeah. I think we can avoid introducing
mod_memcg_page_state_local() if we just use mod_memcg_state() to
subtract the stat from the child then add it to the parent.
We should probably also flush the stats before reading them to
aggregate all per-CPU counters.
I think we also need to ensure that all stat updates happen within the
same RCU read section where we read the memcg pointer from the page,
ideally with safeguards to prevent misuse.
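Something like this untested sketch (the flush helper and the exact idx
handling may need adjusting; keeping the synchronize_rcu() from your
proposal to wait out updaters that saw the old memcg under RCU):

static void memcg1_reparent_state_local(struct mem_cgroup *memcg,
					struct mem_cgroup *parent)
{
	int i;

	/* wait for in-flight updaters that read the old memcg under RCU */
	synchronize_rcu();
	/* fold in the per-CPU deltas before reading the totals */
	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		int idx = memcg1_stats[i];
		long val = memcg_page_state_local(memcg, idx);

		/* move the counter from the child to the parent */
		mod_memcg_state(memcg, idx, -val);
		mod_memcg_state(parent, idx, val);
	}
}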
>
> >
> > >
> > > As I look at the code, these memcg IDs (private [2]) could be converted
> > > to objcg IDs so that reparenting applies also to folios that are
> > > currently swapped out. (Or convert to swap_cgroup_ctrl from the vector
> > > of IDs to a vector of objcg pointers, depending on space.)
> >
> > I think we can do objcg IDs, but be careful to keep the same behavior as
> > today and avoid overexhausting the 16 bit ID space. So we need to also
> > drop the ref to the objcg ID when the memcg is offlined and the objcg is
> > reparented, such that the objcg ID is deleted unless there are swapped
> > out entries.
> >
> > I think this can be done on top of this series, not necessarily as part
> > of it.
>
> Agree, I prefer to address this issue in a separate patchset.
>
> Thanks,
> Qi
>
> >
> > >
> > > Thanks,
> > > Michal
> > >
> > > [1] https://lore.kernel.org/r/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
> > > [2] https://lore.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev
> >
> >
>
On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng wrote:
> From: Muchun Song <songmuchun@bytedance.com>
>
> Pagecache pages are charged at allocation time and hold a reference
> to the original memory cgroup until reclaimed. Depending on memory
> pressure, page sharing patterns between different cgroups and cgroup
> creation/destruction rates, many dying memory cgroups can be pinned
> by pagecache pages, reducing page reclaim efficiency and wasting
> memory. Converting LRU folios and most other raw memory cgroup pins
> to the object cgroup direction can fix this long-living problem.
This is already in the coverletter. Please describe here what the
patch itself does. IOW, now that everything is set up, switch
folio->memcg_data pointers to objcgs, update the accessors, and
execute reparenting on cgroup death.
> Finally, folio->memcg_data of LRU folios and kmem folios will always
> point to an object cgroup pointer. The folio->memcg_data of slab
> folios will point to an vector of object cgroups.
> @@ -223,22 +223,55 @@ static inline void __memcg_reparent_objcgs(struct mem_cgroup *src,
>
> static inline void reparent_locks(struct mem_cgroup *src, struct mem_cgroup *dst)
> {
> + int nid, nest = 0;
> +
> spin_lock_irq(&objcg_lock);
> + for_each_node(nid) {
> + spin_lock_nested(&mem_cgroup_lruvec(src,
> + NODE_DATA(nid))->lru_lock, nest++);
> + spin_lock_nested(&mem_cgroup_lruvec(dst,
> + NODE_DATA(nid))->lru_lock, nest++);
> + }
> }
Looks okay to me. If this should turn out to be a scalability problem
in practice, we can make objcgs per-node, and then reparent lru/objcg
pairs on a per-node basis without nesting locks.
> static inline void reparent_unlocks(struct mem_cgroup *src, struct mem_cgroup *dst)
> {
> + int nid;
> +
> + for_each_node(nid) {
> + spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock);
> + spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock);
> + }
> spin_unlock_irq(&objcg_lock);
> }
>
> +static void memcg_reparent_lru_folios(struct mem_cgroup *src,
> + struct mem_cgroup *dst)
> +{
> + if (lru_gen_enabled())
> + lru_gen_reparent_memcg(src, dst);
> + else
> + lru_reparent_memcg(src, dst);
> +}
> +
> static void memcg_reparent_objcgs(struct mem_cgroup *src)
> {
> struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
> struct mem_cgroup *dst = parent_mem_cgroup(src);
>
> +retry:
> + if (lru_gen_enabled())
> + max_lru_gen_memcg(dst);
> +
> reparent_locks(src, dst);
> + if (lru_gen_enabled() && !recheck_lru_gen_max_memcg(dst)) {
> + reparent_unlocks(src, dst);
> + cond_resched();
> + goto retry;
> + }
>
> __memcg_reparent_objcgs(src, dst);
> + memcg_reparent_lru_folios(src, dst);
Please inline memcg_reparent_lru_folios() here, to keep the lru vs
lrugen switching as "flat" as possible:
if (lru_gen_enabled()) {
	if (!recheck_lru_gen_max_memcgs(parent)) {
		reparent_unlocks(memcg, parent);
		cond_resched();
		goto retry;
	}
	lru_gen_reparent_memcg(memcg, parent);
} else {
	lru_reparent_memcg(memcg, parent);
}
> @@ -989,6 +1022,8 @@ struct mem_cgroup *get_mem_cgroup_from_current(void)
> /**
> * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
> * @folio: folio from which memcg should be extracted.
> + *
> + * The folio and objcg or memcg binding rules can refer to folio_memcg().
See folio_memcg() for folio->objcg/memcg binding rules.
On 12/18/25 10:06 PM, Johannes Weiner wrote:
> On Wed, Dec 17, 2025 at 03:27:51PM +0800, Qi Zheng wrote:
>> From: Muchun Song <songmuchun@bytedance.com>
>>
>> Pagecache pages are charged at allocation time and hold a reference
>> to the original memory cgroup until reclaimed. Depending on memory
>> pressure, page sharing patterns between different cgroups and cgroup
>> creation/destruction rates, many dying memory cgroups can be pinned
>> by pagecache pages, reducing page reclaim efficiency and wasting
>> memory. Converting LRU folios and most other raw memory cgroup pins
>> to the object cgroup direction can fix this long-living problem.
>
> This is already in the coverletter. Please describe here what the
> patch itself does. IOW, now that everything is set up, switch
> folio->memcg_data pointers to objcgs, update the accessors, and
> execute reparenting on cgroup death.
Got it, will do.
>
>> Finally, folio->memcg_data of LRU folios and kmem folios will always
>> point to an object cgroup pointer. The folio->memcg_data of slab
>> folios will point to an vector of object cgroups.
>
>> @@ -223,22 +223,55 @@ static inline void __memcg_reparent_objcgs(struct mem_cgroup *src,
>>
>> static inline void reparent_locks(struct mem_cgroup *src, struct mem_cgroup *dst)
>> {
>> + int nid, nest = 0;
>> +
>> spin_lock_irq(&objcg_lock);
>> + for_each_node(nid) {
>> + spin_lock_nested(&mem_cgroup_lruvec(src,
>> + NODE_DATA(nid))->lru_lock, nest++);
>> + spin_lock_nested(&mem_cgroup_lruvec(dst,
>> + NODE_DATA(nid))->lru_lock, nest++);
>> + }
>> }
>
> Looks okay to me. If this should turn out to be a scalability problem
> in practice, we can make objcgs per-node, and then reparent lru/objcg
> pairs on a per-node basis without nesting locks.
>
>> static inline void reparent_unlocks(struct mem_cgroup *src, struct mem_cgroup *dst)
>> {
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock);
>> + spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock);
>> + }
>> spin_unlock_irq(&objcg_lock);
>> }
>>
>> +static void memcg_reparent_lru_folios(struct mem_cgroup *src,
>> + struct mem_cgroup *dst)
>> +{
>> + if (lru_gen_enabled())
>> + lru_gen_reparent_memcg(src, dst);
>> + else
>> + lru_reparent_memcg(src, dst);
>> +}
>> +
>> static void memcg_reparent_objcgs(struct mem_cgroup *src)
>> {
>> struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
>> struct mem_cgroup *dst = parent_mem_cgroup(src);
>>
>> +retry:
>> + if (lru_gen_enabled())
>> + max_lru_gen_memcg(dst);
>> +
>> reparent_locks(src, dst);
>> + if (lru_gen_enabled() && !recheck_lru_gen_max_memcg(dst)) {
>> + reparent_unlocks(src, dst);
>> + cond_resched();
>> + goto retry;
>> + }
>>
>> __memcg_reparent_objcgs(src, dst);
>> + memcg_reparent_lru_folios(src, dst);
>
> Please inline memcg_reparent_lru_folios() here, to keep the lru vs
> lrugen switching as "flat" as possible:
>
> if (lru_gen_enabled()) {
> if (!recheck_lru_gen_max_memcgs(parent)) {
> reparent_unlocks(memcg, parent);
> cond_resched();
> goto retry;
> }
> lru_gen_reparent_memcg(memcg, parent);
> } else {
> lru_reparent_memcg(memcg, parent);
> }
Looks better, will change to this style.
>
>> @@ -989,6 +1022,8 @@ struct mem_cgroup *get_mem_cgroup_from_current(void)
>> /**
>> * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
>> * @folio: folio from which memcg should be extracted.
>> + *
>> + * The folio and objcg or memcg binding rules can refer to folio_memcg().
>
> See folio_memcg() for folio->objcg/memcg binding rules.
OK, will do.