From: Qi Zheng <zhengqi.arch@bytedance.com>

Similar to list_lru, the split queue is relatively independent and does
not need to be reparented together with the objcg and LRU folios (which
would require holding the objcg lock and the lru lock). So let's apply
the same mechanism as list_lru and reparent the split queue separately
when the memcg is offlined.

This is also a preparation for reparenting LRU folios.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/huge_mm.h |  4 ++++
 mm/huge_memory.c        | 46 +++++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c         |  1 +
 3 files changed, 51 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc9852..0c211dcbb0ec1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -417,6 +417,9 @@ static inline int split_huge_page(struct page *page)
 	return split_huge_page_to_list_to_order(page, NULL, ret);
 }
 void deferred_split_folio(struct folio *folio, bool partially_mapped);
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg);
+#endif
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze);
 
@@ -611,6 +614,7 @@ static inline int try_folio_split(struct folio *folio, struct page *page,
 }
 
 static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
+static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bb32091e3133e..5fc0caca71de0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1094,9 +1094,22 @@ static struct deferred_split *folio_split_queue_lock(struct folio *folio)
 	struct deferred_split *queue;
 
 	memcg = folio_memcg(folio);
+retry:
 	queue = memcg ? &memcg->deferred_split_queue :
 			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
 	spin_lock(&queue->split_queue_lock);
+	/*
+	 * Notice:
+	 * 1. The memcg could be NULL if cgroup_disable=memory is set.
+	 * 2. There is a period between setting CSS_DYING and reparenting
+	 *    deferred split queue, and during this period the THPs in the
+	 *    deferred split queue will be hidden from the shrinker side.
+	 */
+	if (unlikely(memcg && css_is_dying(&memcg->css))) {
+		spin_unlock(&queue->split_queue_lock);
+		memcg = parent_mem_cgroup(memcg);
+		goto retry;
+	}
 
 	return queue;
 }
@@ -1108,9 +1121,15 @@ folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
 	struct deferred_split *queue;
 
 	memcg = folio_memcg(folio);
+retry:
 	queue = memcg ? &memcg->deferred_split_queue :
 			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
 	spin_lock_irqsave(&queue->split_queue_lock, *flags);
+	if (unlikely(memcg && css_is_dying(&memcg->css))) {
+		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+		memcg = parent_mem_cgroup(memcg);
+		goto retry;
+	}
 
 	return queue;
 }
@@ -4275,6 +4294,33 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 	return split;
 }
 
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
+	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
+	int nid;
+
+	spin_lock_irq(&ds_queue->split_queue_lock);
+	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
+
+	if (!ds_queue->split_queue_len)
+		goto unlock;
+
+	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
+	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
+	ds_queue->split_queue_len = 0;
+
+	for_each_node(nid)
+		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
+
+unlock:
+	spin_unlock(&parent_ds_queue->split_queue_lock);
+	spin_unlock_irq(&ds_queue->split_queue_lock);
+}
+#endif
+
 #ifdef CONFIG_DEBUG_FS
 static void split_huge_pages_all(void)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e090f29eb03bd..d03da72e7585d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3887,6 +3887,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	zswap_memcg_offline_cleanup(memcg);
 
 	memcg_offline_kmem(memcg);
+	reparent_deferred_split_queue(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
 	lru_gen_offline_memcg(memcg);
--
2.20.1
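
For illustration, a minimal sketch of the caller pattern the retrying
lock helpers above imply; the unlock helper name is an assumption, and
this snippet is not part of the posted patch:

	unsigned long flags;
	struct deferred_split *queue;

	/*
	 * The retry loop in folio_split_queue_lock_irqsave() may hand back
	 * an ancestor memcg's queue, so the caller must unlock the queue it
	 * was given rather than recomputing it from the folio.
	 */
	queue = folio_split_queue_lock_irqsave(folio, &flags);
	if (list_empty(&folio->_deferred_list)) {
		list_add_tail(&folio->_deferred_list, &queue->split_queue);
		queue->split_queue_len++;
	}
	split_queue_unlock_irqrestore(queue, flags);	/* unlock helper name assumed */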
> On Sep 28, 2025, at 19:45, Qi Zheng <qi.zheng@linux.dev> wrote:
>
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> Similar to list_lru, the split queue is relatively independent and does
> not need to be reparented together with the objcg and LRU folios (which
> would require holding the objcg lock and the lru lock). So let's apply
> the same mechanism as list_lru and reparent the split queue separately
> when the memcg is offlined.
>
> This is also a preparation for reparenting LRU folios.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

[...]

> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e090f29eb03bd..d03da72e7585d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3887,6 +3887,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	zswap_memcg_offline_cleanup(memcg);
>
>  	memcg_offline_kmem(memcg);
> +	reparent_deferred_split_queue(memcg);

Since the dying flag of a memcg is not set under split_queue_lock,
two threads holding different split_queue_locks (e.g., one for the
parent memcg and one for the child) can concurrently manipulate the
same split-queue list of a folio. I think we should adopt the same
solution as list_lru to fix this.

Thanks,
Muchun

>  	reparent_shrinker_deferred(memcg);
>  	wb_memcg_offline(memcg);
>  	lru_gen_offline_memcg(memcg);
> --
> 2.20.1
On 9/29/25 2:20 PM, Muchun Song wrote:
>> On Sep 28, 2025, at 19:45, Qi Zheng <qi.zheng@linux.dev> wrote:

[...]

>> @@ -1094,9 +1094,22 @@ static struct deferred_split *folio_split_queue_lock(struct folio *folio)
>>  	struct deferred_split *queue;
>>
>>  	memcg = folio_memcg(folio);
>> +retry:
>>  	queue = memcg ? &memcg->deferred_split_queue :
>>  			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>>  	spin_lock(&queue->split_queue_lock);
>> +	/*
>> +	 * Notice:
>> +	 * 1. The memcg could be NULL if cgroup_disable=memory is set.
>> +	 * 2. There is a period between setting CSS_DYING and reparenting
>> +	 *    deferred split queue, and during this period the THPs in the
>> +	 *    deferred split queue will be hidden from the shrinker side.

The shrinker side can find this deferred split queue by traversing
memcgs, so we should check CSS_DYING after we acquire the child
split_queue_lock in:

deferred_split_scan
--> spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
    if (css_is_dying(&memcg->css))
    --> retry to get the parent split_queue_lock

So during this period, we use the parent split_queue_lock to protect
the child deferred split queue. It's a little weird, but it's safe.

[...]

> Since the dying flag of a memcg is not set under split_queue_lock,
> two threads holding different split_queue_locks (e.g., one for the
> parent memcg and one for the child) can concurrently manipulate the
> same split-queue list of a folio. I think we should adopt the same
> solution as list_lru to fix this.

If we ensure that we check CSS_DYING every time we take the
split_queue_lock, then the lock protecting a given deferred split
queue must always be the same lock.

To be more clear, consider the following case:

CPU0                            CPU1             CPU2

folio_split_queue_lock
--> get child queue and lock
                                set CSS_DYING
                                                 deferred_split_scan
unlock child queue lock
                                                 --> acquire child queue lock
                                                 ***WE SHOULD CHECK CSS_DYING HERE***
                                reparent split queue

The deferred_split_scan() is problematic now, I will fix it as follows:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5fc0caca71de0..9f1f61e7e0c8e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4208,6 +4208,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 	struct folio *folio, *next;
 	int split = 0, i;
 	struct folio_batch fbatch;
+	struct mem_cgroup *memcg;
 
 #ifdef CONFIG_MEMCG
 	if (sc->memcg)
@@ -4217,6 +4218,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 	folio_batch_init(&fbatch);
 retry:
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	if (sc->memcg && css_is_dying(&sc->memcg->css)) {
+		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+		memcg = parent_mem_cgroup(sc->memcg);
+		spin_lock_irqsave(&memcg->deferred_split_queue.split_queue_lock, flags);
+	}
 	/* Take pin on all head pages to avoid freeing them under us */
 	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
 				 _deferred_list) {

Of course I'll add helper functions and do some cleanup.

Thanks,
Qi
> On Sep 29, 2025, at 15:22, Qi Zheng <qi.zheng@linux.dev> wrote:

[...]

> The deferred_split_scan() is problematic now, I will fix it as follows:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5fc0caca71de0..9f1f61e7e0c8e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
[...]
> @@ -4217,6 +4218,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  	folio_batch_init(&fbatch);
>  retry:
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +	if (sc->memcg && css_is_dying(&sc->memcg->css)) {

There is more than one place where we check whether a memcg is dying,
so it is better to introduce a helper like mem_cgroup_is_dying in
memcontrol.h to do this.

> +		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

Yes, we could fix it this way. But I suggest we introduce another
helper like folio_split_queue_lock to do the same retry logic. Every
user of split_queue_lock is supposed to use this new helper or
folio_split_queue_lock to take the lock.

> +		memcg = parent_mem_cgroup(sc->memcg);
> +		spin_lock_irqsave(&memcg->deferred_split_queue.split_queue_lock, flags);
> +	}
>  	/* Take pin on all head pages to avoid freeing them under us */
>  	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
>  				 _deferred_list) {
>
> Of course I'll add helper functions and do some cleanup.

Yes.

> Thanks,
> Qi
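
A minimal sketch of the helper suggested above, assuming the name
mem_cgroup_is_dying and a NULL-tolerant check (this is not code from
the posted series):

static inline bool mem_cgroup_is_dying(struct mem_cgroup *memcg)
{
	/* True once CSS_DYING has been set during cgroup offlining. */
	return memcg && css_is_dying(&memcg->css);
}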
On 9/29/25 3:38 PM, Muchun Song wrote:
>> On Sep 29, 2025, at 15:22, Qi Zheng <qi.zheng@linux.dev> wrote:

[...]

>>  retry:
>>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> +	if (sc->memcg && css_is_dying(&sc->memcg->css)) {
>
> There is more than one place where we check whether a memcg is dying,
> so it is better to introduce a helper like mem_cgroup_is_dying in
> memcontrol.h to do this.

OK. I will try to add a cleanup patch to do this.

>> +		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>
> Yes, we could fix it this way. But I suggest we introduce another
> helper like folio_split_queue_lock to do the same retry logic. Every
> user of split_queue_lock is supposed to use this new helper or
> folio_split_queue_lock to take the lock.

Yes, will do.

Thanks,
Qi
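
Putting the two suggestions together, one possible shape for the shared
retry logic, shown only as a sketch (the helper name and its exact home
are assumptions, not code from the posted series):

static struct deferred_split *
memcg_split_queue_lock_irqsave(struct mem_cgroup *memcg, unsigned long *flags)
{
	struct deferred_split *queue;

retry:
	queue = &memcg->deferred_split_queue;
	spin_lock_irqsave(&queue->split_queue_lock, *flags);
	if (unlikely(mem_cgroup_is_dying(memcg))) {
		/*
		 * Reparenting may be in flight; once CSS_DYING is set, the
		 * parent's lock is the one that protects this queue.
		 */
		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
		memcg = parent_mem_cgroup(memcg);
		goto retry;
	}
	return queue;
}

deferred_split_scan() would then take its lock through a helper like
this (or through folio_split_queue_lock_irqsave()) instead of
open-coding the CSS_DYING check.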