In the future, we will reparent LRU folios during memcg offline to
eliminate dying memory cgroups, which requires reparenting the split
queue to its parent.

Similar to list_lru, the split queue is relatively independent and does
not need to be reparented together with the objcg and LRU folios (which
would require holding the objcg lock and lru lock). So let's apply the
same mechanism as list_lru and reparent the split queue separately when
the memcg is offlined.
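
In short, the split queue lock helpers now walk up the memcg hierarchy
when they observe a dying split queue, roughly:

	memcg = folio_memcg(folio);
retry:
	queue = memcg ? &memcg->deferred_split_queue :
			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
	spin_lock(&queue->split_queue_lock);
	if (unlikely(queue->is_dying)) {
		/* the queue was reparented during memcg offline, walk up */
		spin_unlock(&queue->split_queue_lock);
		memcg = parent_mem_cgroup(memcg);
		goto retry;
	}

and reparent_deferred_split_queue() splices the child's queue onto the
parent's queue under both split_queue_lock's before marking the child
queue dying.
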
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/huge_mm.h | 2 ++
include/linux/mmzone.h | 1 +
mm/huge_memory.c | 39 +++++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 1 +
mm/mm_init.c | 1 +
5 files changed, 44 insertions(+)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc9852..a0d4b751974d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -417,6 +417,7 @@ static inline int split_huge_page(struct page *page)
return split_huge_page_to_list_to_order(page, NULL, ret);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
+void reparent_deferred_split_queue(struct mem_cgroup *memcg);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
@@ -611,6 +612,7 @@ static inline int try_folio_split(struct folio *folio, struct page *page,
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
+static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7fb7331c57250..f3eb81fee056a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1346,6 +1346,7 @@ struct deferred_split {
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
+ bool is_dying;
};
#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 48b51e6230a67..de7806f759cba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1094,9 +1094,15 @@ static struct deferred_split *folio_split_queue_lock(struct folio *folio)
struct deferred_split *queue;
memcg = folio_memcg(folio);
+retry:
queue = memcg ? &memcg->deferred_split_queue :
&NODE_DATA(folio_nid(folio))->deferred_split_queue;
spin_lock(&queue->split_queue_lock);
+ if (unlikely(queue->is_dying == true)) {
+ spin_unlock(&queue->split_queue_lock);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
return queue;
}
@@ -1108,9 +1114,15 @@ folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
struct deferred_split *queue;
memcg = folio_memcg(folio);
+retry:
queue = memcg ? &memcg->deferred_split_queue :
&NODE_DATA(folio_nid(folio))->deferred_split_queue;
spin_lock_irqsave(&queue->split_queue_lock, *flags);
+ if (unlikely(queue->is_dying == true)) {
+ spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
return queue;
}
@@ -4284,6 +4296,33 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
return split;
}
+void reparent_deferred_split_queue(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct deferred_split *ds_queue = &memcg->deferred_split_queue;
+ struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
+ int nid;
+
+ spin_lock_irq(&ds_queue->split_queue_lock);
+ spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
+
+ if (!ds_queue->split_queue_len)
+ goto unlock;
+
+ list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
+ parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
+ ds_queue->split_queue_len = 0;
+ /* Mark the ds_queue dead */
+ ds_queue->is_dying = true;
+
+ for_each_node(nid)
+ set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
+
+unlock:
+ spin_unlock(&parent_ds_queue->split_queue_lock);
+ spin_unlock_irq(&ds_queue->split_queue_lock);
+}
+
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e090f29eb03bd..d03da72e7585d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3887,6 +3887,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
zswap_memcg_offline_cleanup(memcg);
memcg_offline_kmem(memcg);
+ reparent_deferred_split_queue(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3db2dea7db4c5..cbda5c2ee3241 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1387,6 +1387,7 @@ static void pgdat_init_split_queue(struct pglist_data *pgdat)
spin_lock_init(&ds_queue->split_queue_lock);
INIT_LIST_HEAD(&ds_queue->split_queue);
ds_queue->split_queue_len = 0;
+ ds_queue->is_dying = false;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
--
2.20.1
On Tue, Sep 23, 2025 at 05:16:25PM +0800, Qi Zheng wrote:
> In the future, we will reparent LRU folios during memcg offline to
> eliminate dying memory cgroups, which requires reparenting the split queue
> to its parent.
>
[...]
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7fb7331c57250..f3eb81fee056a 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1346,6 +1346,7 @@ struct deferred_split {
>  	spinlock_t split_queue_lock;
>  	struct list_head split_queue;
>  	unsigned long split_queue_len;
> +	bool is_dying;
>  };
>  #endif

The scheme in Muchun's version was:

retry:
	queue = folio_split_queue(folio);
	spin_lock(&queue->split_queue_lock);
	if (folio_memcg(folio) != folio_split_queue_memcg(folio, queue)) {
		/* split queue was reparented, retry */
		spin_unlock(&queue->split_queue_lock);
		goto retry;
	}
	/* now we have a stable mapping between the folio and the split queue */
	spin_unlock(&queue->split_queue_lock);

Oh, I see. We can't use this scheme yet because we don't reparent LRU
folios. (I was wondering why we're adding the is_dying property)

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 48b51e6230a67..de7806f759cba 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1094,9 +1094,15 @@ static struct deferred_split *folio_split_queue_lock(struct folio *folio)
>  	struct deferred_split *queue;

For now it's safe to not call rcu_read_lock() here because memcgs won't
disappear under us as long as there are folios to split (we don't
reparent LRU folios), right?

>  	memcg = folio_memcg(folio);
> +retry:
>  	queue = memcg ? &memcg->deferred_split_queue :
>  			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>  	spin_lock(&queue->split_queue_lock);
> +	if (unlikely(queue->is_dying == true)) {
> +		spin_unlock(&queue->split_queue_lock);
> +		memcg = parent_mem_cgroup(memcg);
> +		goto retry;
> +	}
>
>  	return queue;
>  }

--
Cheers,
Harry / Hyeonggon
Hi Qi,

kernel test robot noticed the following build errors:

[auto build test ERROR on next-20250922]
[also build test ERROR on v6.17-rc7]
[cannot apply to akpm-mm/mm-everything rppt-memblock/for-next rppt-memblock/fixes linus/master v6.17-rc7 v6.17-rc6 v6.17-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Qi-Zheng/mm-thp-replace-folio_memcg-with-folio_memcg_charged/20250923-171935
base:   next-20250922
patch link:    https://lore.kernel.org/r/55370bda7b2df617033ac12116c1712144bb7591.1758618527.git.zhengqi.arch%40bytedance.com
patch subject: [PATCH v2 4/4] mm: thp: reparent the split queue during memcg offline
config: riscv-randconfig-001-20250924 (https://download.01.org/0day-ci/archive/20250924/202509242123.QwwFy7gc-lkp@intel.com/config)
compiler: riscv64-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250924/202509242123.QwwFy7gc-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509242123.QwwFy7gc-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/huge_memory.c: In function 'reparent_deferred_split_queue':
>> mm/huge_memory.c:4302:42: error: dereferencing pointer to incomplete type 'struct mem_cgroup'
     struct deferred_split *ds_queue = &memcg->deferred_split_queue;
                                             ^~

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for ARCH_HAS_ELF_CORE_EFLAGS
   Depends on [n]: BINFMT_ELF [=n] && ELF_CORE [=n]
   Selected by [y]:
   - RISCV [=y]

vim +4302 mm/huge_memory.c

  4298	
  4299	void reparent_deferred_split_queue(struct mem_cgroup *memcg)
  4300	{
  4301		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> 4302		struct deferred_split *ds_queue = &memcg->deferred_split_queue;
  4303		struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
  4304		int nid;
[...]

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 23.09.25 11:16, Qi Zheng wrote:
> In the future, we will reparent LRU folios during memcg offline to
> eliminate dying memory cgroups, which requires reparenting the split queue
> to its parent.
>
[...]
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7fb7331c57250..f3eb81fee056a 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1346,6 +1346,7 @@ struct deferred_split {
>  	spinlock_t split_queue_lock;
>  	struct list_head split_queue;
>  	unsigned long split_queue_len;
> +	bool is_dying;

It's a bit weird to query whether the "struct deferred_split" is dying.

Shouldn't this be a memcg property? (and in particular, not exist for
the pglist_data part where it might not make sense at all?).

>  };
>  #endif
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 48b51e6230a67..de7806f759cba 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1094,9 +1094,15 @@ static struct deferred_split *folio_split_queue_lock(struct folio *folio)
>  	struct deferred_split *queue;
>
>  	memcg = folio_memcg(folio);
> +retry:
>  	queue = memcg ? &memcg->deferred_split_queue :
>  			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>  	spin_lock(&queue->split_queue_lock);
> +	if (unlikely(queue->is_dying == true)) {

if (unlikely(queue->is_dying))

> +		spin_unlock(&queue->split_queue_lock);
> +		memcg = parent_mem_cgroup(memcg);
> +		goto retry;
> +	}
>
>  	return queue;
>  }
> @@ -1108,9 +1114,15 @@ folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
>  	struct deferred_split *queue;
>
>  	memcg = folio_memcg(folio);
> +retry:
>  	queue = memcg ? &memcg->deferred_split_queue :
>  			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>  	spin_lock_irqsave(&queue->split_queue_lock, *flags);
> +	if (unlikely(queue->is_dying == true)) {

if (unlikely(queue->is_dying))

> +		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> +		memcg = parent_mem_cgroup(memcg);
> +		goto retry;
> +	}
>
>  	return queue;
>  }

Nothing else jumped at me, but I am not a memcg expert :)

--
Cheers

David / dhildenb
Qi Zheng <zhengqi.arch@bytedance.com> writes:

> In the future, we will reparent LRU folios during memcg offline to
> eliminate dying memory cgroups, which requires reparenting the split queue
> to its parent.

Nit: commit logs should really focus on the actual change, not the future
plans.

[...]

> +void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> +	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> +	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> +	int nid;
> +
> +	spin_lock_irq(&ds_queue->split_queue_lock);
> +	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> +
> +	if (!ds_queue->split_queue_len)
> +		goto unlock;
> +
> +	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> +	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> +	ds_queue->split_queue_len = 0;
> +	/* Mark the ds_queue dead */
> +	ds_queue->is_dying = true;
> +
> +	for_each_node(nid)
> +		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));

Does this loop need to be under locks?

> +
> +unlock:
> +	spin_unlock(&parent_ds_queue->split_queue_lock);
> +	spin_unlock_irq(&ds_queue->split_queue_lock);
> +}
> +
[...]
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e090f29eb03bd..d03da72e7585d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3887,6 +3887,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	zswap_memcg_offline_cleanup(memcg);
>
>  	memcg_offline_kmem(memcg);
> +	reparent_deferred_split_queue(memcg);
>  	reparent_shrinker_deferred(memcg);

I guess the naming can be a bit more consistent here :)

Thanks!
Hi Roman,

On 9/24/25 5:23 PM, Roman Gushchin wrote:
> Qi Zheng <zhengqi.arch@bytedance.com> writes:
>
>> In the future, we will reparent LRU folios during memcg offline to
>> eliminate dying memory cgroups, which requires reparenting the split queue
>> to its parent.
>
> Nit: commit logs should really focus on the actual change, not the future
> plans.

Got it.

[...]

>> +	for_each_node(nid)
>> +		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
>
> Does this loop need to be under locks?

I think it is not necessary, but the loop overhead should not be high.

>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
[...]
>>  	memcg_offline_kmem(memcg);
>> +	reparent_deferred_split_queue(memcg);
>>  	reparent_shrinker_deferred(memcg);
>
> I guess the naming can be a bit more consistent here :)

Do you mean to change them all to memcg_offline_xxx() or reparent_xxx()?

Thanks,
Qi

> Thanks!
On 23 Sep 2025, at 5:16, Qi Zheng wrote:

> In the future, we will reparent LRU folios during memcg offline to
> eliminate dying memory cgroups, which requires reparenting the split queue
> to its parent.
>
[...]
>
> +void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> +	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> +	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> +	int nid;
> +
> +	spin_lock_irq(&ds_queue->split_queue_lock);
> +	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> +
> +	if (!ds_queue->split_queue_len)
> +		goto unlock;

Should ds_queue still be marked as dying even if it is empty?
Otherwise, new folios still can be added to it, based on my
understanding of the changes to folio_split_queue_lock*().

> +
> +	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> +	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> +	ds_queue->split_queue_len = 0;
> +	/* Mark the ds_queue dead */
> +	ds_queue->is_dying = true;
> +
> +	for_each_node(nid)
> +		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> +
> +unlock:
> +	spin_unlock(&parent_ds_queue->split_queue_lock);
> +	spin_unlock_irq(&ds_queue->split_queue_lock);
> +}
[...]

Best Regards,
Yan, Zi
On 9/23/25 11:44 PM, Zi Yan wrote:
> On 23 Sep 2025, at 5:16, Qi Zheng wrote:
>
[...]
>
>> +	spin_lock_irq(&ds_queue->split_queue_lock);
>> +	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
>> +
>> +	if (!ds_queue->split_queue_len)
>> +		goto unlock;
>
> Should ds_queue still be marked as dying even if it is empty?
> Otherwise, new folios still can be added to it, based on my
> understanding of the changes to folio_split_queue_lock*().

I think you are right, will do in the next version.
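
Perhaps something like the following (untested), marking the queue dying
before the length check so that an empty queue can no longer accept new
folios after the memcg goes offline:

void reparent_deferred_split_queue(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
	int nid;

	spin_lock_irq(&ds_queue->split_queue_lock);
	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);

	/*
	 * Mark the ds_queue dead unconditionally, so that
	 * folio_split_queue_lock*() redirects to the parent from now on,
	 * even if the queue happens to be empty here.
	 */
	ds_queue->is_dying = true;

	if (!ds_queue->split_queue_len)
		goto unlock;

	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
	ds_queue->split_queue_len = 0;

	for_each_node(nid)
		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));

unlock:
	spin_unlock(&parent_ds_queue->split_queue_lock);
	spin_unlock_irq(&ds_queue->split_queue_lock);
}
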
Thanks,
Qi