[PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants

Qi Zheng posted 4 patches 1 week, 6 days ago
There is a newer version of this series
[PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants
Posted by Qi Zheng 1 week, 6 days ago
From: Muchun Song <songmuchun@bytedance.com>

In future memcg removal, the binding between a folio and a memcg may
change, making the split lock within the memcg unstable when held.

A new approach is required to reparent the split queue to its parent. This
patch starts introducing a unified way to acquire the split lock for
future work.

It's a code-only refactoring with no functional changes.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/memcontrol.h | 10 +++++
 mm/huge_memory.c           | 89 ++++++++++++++++++++++++++------------
 2 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 16fe0306e50ea..99876af13c315 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1662,6 +1662,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 void reparent_shrinker_deferred(struct mem_cgroup *memcg);
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return shrinker->id;
+}
 #else
 #define mem_cgroup_sockets_enabled 0
 
@@ -1693,6 +1698,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
 				    int nid, int shrinker_id)
 {
 }
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return -1;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 582628ddf3f33..d34516a22f5bb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1078,26 +1078,62 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 
 #ifdef CONFIG_MEMCG
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
 {
-	struct mem_cgroup *memcg = folio_memcg(folio);
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
-
-	if (memcg)
-		return &memcg->deferred_split_queue;
-	else
-		return &pgdat->deferred_split_queue;
+	if (mem_cgroup_disabled())
+		return NULL;
+	if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue)
+		return NULL;
+	return container_of(queue, struct mem_cgroup, deferred_split_queue);
 }
 #else
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
 {
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
-
-	return &pgdat->deferred_split_queue;
+	return NULL;
 }
 #endif
 
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+	struct mem_cgroup *memcg;
+	struct deferred_split *queue;
+
+	memcg = folio_memcg(folio);
+	queue = memcg ? &memcg->deferred_split_queue :
+			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
+	spin_lock(&queue->split_queue_lock);
+
+	return queue;
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+	struct mem_cgroup *memcg;
+	struct deferred_split *queue;
+
+	memcg = folio_memcg(folio);
+	queue = memcg ? &memcg->deferred_split_queue :
+			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
+	spin_lock_irqsave(&queue->split_queue_lock, *flags);
+
+	return queue;
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+	spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+						 unsigned long flags)
+{
+	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -3579,7 +3615,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
 		struct list_head *list, bool uniform_split)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
+	struct deferred_split *ds_queue;
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
 	struct folio *end_folio = folio_next(folio);
 	bool is_anon = folio_test_anon(folio);
@@ -3718,7 +3754,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock(&ds_queue->split_queue_lock);
+	ds_queue = folio_split_queue_lock(folio);
 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
 		struct swap_cluster_info *ci = NULL;
 		struct lruvec *lruvec;
@@ -3740,7 +3776,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 			 */
 			list_del_init(&folio->_deferred_list);
 		}
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -3835,7 +3871,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		if (ci)
 			swap_cluster_unlock(ci);
 	} else {
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 		ret = -EAGAIN;
 	}
 fail:
@@ -4016,8 +4052,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 	WARN_ON_ONCE(folio_ref_count(folio));
 	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-	ds_queue = get_deferred_split_queue(folio);
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		if (folio_test_partially_mapped(folio)) {
@@ -4028,7 +4063,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 		list_del_init(&folio->_deferred_list);
 		unqueued = true;
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	return unqueued;	/* useful for debug warnings */
 }
@@ -4036,10 +4071,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-#ifdef CONFIG_MEMCG
-	struct mem_cgroup *memcg = folio_memcg(folio);
-#endif
+	struct deferred_split *ds_queue;
 	unsigned long flags;
 
 	/*
@@ -4062,7 +4094,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	if (folio_test_swapcache(folio))
 		return;
 
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
 			folio_set_partially_mapped(folio);
@@ -4077,15 +4109,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
 	}
 	if (list_empty(&folio->_deferred_list)) {
+		struct mem_cgroup *memcg;
+
+		memcg = folio_split_queue_memcg(folio, ds_queue);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
 		if (memcg)
 			set_shrinker_bit(memcg, folio_nid(folio),
-					 deferred_split_shrinker->id);
-#endif
+					 shrinker_id(deferred_split_shrinker));
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
-- 
2.20.1
Re: [PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants
Posted by David Hildenbrand 1 week, 2 days ago
On 19.09.25 05:46, Qi Zheng wrote:
> From: Muchun Song <songmuchun@bytedance.com>
> 
> In future memcg removal, the binding between a folio and a memcg may
> change, making the split lock within the memcg unstable when held.
> 
> A new approach is required to reparent the split queue to its parent. This
> patch starts introducing a unified way to acquire the split lock for
> future work.
> 
> It's a code-only refactoring with no functional changes.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---

Looks sane, I assume the build issue is easily fixed

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Cheers

David / dhildenb
Re: [PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants
Posted by Shakeel Butt 1 week, 5 days ago
On Fri, Sep 19, 2025 at 11:46:33AM +0800, Qi Zheng wrote:
> From: Muchun Song <songmuchun@bytedance.com>
> 
> In future memcg removal, the binding between a folio and a memcg may
> change, making the split lock within the memcg unstable when held.
> 
> A new approach is required to reparent the split queue to its parent. This
> patch starts introducing a unified way to acquire the split lock for
> future work.
> 
> It's a code-only refactoring with no functional changes.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Re: [PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants
Posted by Zi Yan 1 week, 5 days ago
On 18 Sep 2025, at 23:46, Qi Zheng wrote:

> From: Muchun Song <songmuchun@bytedance.com>
>
> In future memcg removal, the binding between a folio and a memcg may
> change, making the split lock within the memcg unstable when held.
>
> A new approach is required to reparent the split queue to its parent. This
> patch starts introducing a unified way to acquire the split lock for
> future work.
>
> It's a code-only refactoring with no functional changes.
>
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  include/linux/memcontrol.h | 10 +++++
>  mm/huge_memory.c           | 89 ++++++++++++++++++++++++++------------
>  2 files changed, 71 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 16fe0306e50ea..99876af13c315 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1662,6 +1662,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
>  void free_shrinker_info(struct mem_cgroup *memcg);
>  void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
>  void reparent_shrinker_deferred(struct mem_cgroup *memcg);
> +
> +static inline int shrinker_id(struct shrinker *shrinker)
> +{
> +	return shrinker->id;
> +}
>  #else
>  #define mem_cgroup_sockets_enabled 0
>
> @@ -1693,6 +1698,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
>  				    int nid, int shrinker_id)
>  {
>  }
> +
> +static inline int shrinker_id(struct shrinker *shrinker)
> +{
> +	return -1;
> +}
>  #endif
>
>  #ifdef CONFIG_MEMCG
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 582628ddf3f33..d34516a22f5bb 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1078,26 +1078,62 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>
>  #ifdef CONFIG_MEMCG
>  static inline
> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
> +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> +					   struct deferred_split *queue)
>  {
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> -
> -	if (memcg)
> -		return &memcg->deferred_split_queue;
> -	else
> -		return &pgdat->deferred_split_queue;
> +	if (mem_cgroup_disabled())
> +		return NULL;
> +	if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue)
> +		return NULL;
> +	return container_of(queue, struct mem_cgroup, deferred_split_queue);
>  }
>  #else
>  static inline
> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
> +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> +					   struct deferred_split *queue)
>  {
> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> -
> -	return &pgdat->deferred_split_queue;
> +	return NULL;
>  }
>  #endif
>
> +static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> +{
> +	struct mem_cgroup *memcg;
> +	struct deferred_split *queue;
> +
> +	memcg = folio_memcg(folio);
> +	queue = memcg ? &memcg->deferred_split_queue :
> +			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
> +	spin_lock(&queue->split_queue_lock);
> +
> +	return queue;
> +}
> +
> +static struct deferred_split *
> +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> +{
> +	struct mem_cgroup *memcg;
> +	struct deferred_split *queue;
> +
> +	memcg = folio_memcg(folio);
> +	queue = memcg ? &memcg->deferred_split_queue :
> +			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
> +	spin_lock_irqsave(&queue->split_queue_lock, *flags);
> +
> +	return queue;
> +}

A helper function to get queue from a folio would get rid of duplicated
code in the two functions above. Hmm, that is the deleted
get_deferred_split_queue(). So probably retain it.

Otherwise, LGTM. Reviewed-by: Zi Yan <ziy@nvidia.com>

Best Regards,
Yan, Zi
Re: [PATCH 2/4] mm: thp: introduce folio_split_queue_lock and its variants
Posted by Qi Zheng 1 week, 2 days ago
Hi Zi,

On 9/19/25 11:39 PM, Zi Yan wrote:
> On 18 Sep 2025, at 23:46, Qi Zheng wrote:
> 
>> From: Muchun Song <songmuchun@bytedance.com>
>>
>> In future memcg removal, the binding between a folio and a memcg may
>> change, making the split lock within the memcg unstable when held.
>>
>> A new approach is required to reparent the split queue to its parent. This
>> patch starts introducing a unified way to acquire the split lock for
>> future work.
>>
>> It's a code-only refactoring with no functional changes.
>>
>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   include/linux/memcontrol.h | 10 +++++
>>   mm/huge_memory.c           | 89 ++++++++++++++++++++++++++------------
>>   2 files changed, 71 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 16fe0306e50ea..99876af13c315 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -1662,6 +1662,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
>>   void free_shrinker_info(struct mem_cgroup *memcg);
>>   void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
>>   void reparent_shrinker_deferred(struct mem_cgroup *memcg);
>> +
>> +static inline int shrinker_id(struct shrinker *shrinker)
>> +{
>> +	return shrinker->id;
>> +}
>>   #else
>>   #define mem_cgroup_sockets_enabled 0
>>
>> @@ -1693,6 +1698,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
>>   				    int nid, int shrinker_id)
>>   {
>>   }
>> +
>> +static inline int shrinker_id(struct shrinker *shrinker)
>> +{
>> +	return -1;
>> +}
>>   #endif
>>
>>   #ifdef CONFIG_MEMCG
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 582628ddf3f33..d34516a22f5bb 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1078,26 +1078,62 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>
>>   #ifdef CONFIG_MEMCG
>>   static inline
>> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
>> +					   struct deferred_split *queue)
>>   {
>> -	struct mem_cgroup *memcg = folio_memcg(folio);
>> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> -
>> -	if (memcg)
>> -		return &memcg->deferred_split_queue;
>> -	else
>> -		return &pgdat->deferred_split_queue;
>> +	if (mem_cgroup_disabled())
>> +		return NULL;
>> +	if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue)
>> +		return NULL;
>> +	return container_of(queue, struct mem_cgroup, deferred_split_queue);
>>   }
>>   #else
>>   static inline
>> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
>> +					   struct deferred_split *queue)
>>   {
>> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> -
>> -	return &pgdat->deferred_split_queue;
>> +	return NULL;
>>   }
>>   #endif
>>
>> +static struct deferred_split *folio_split_queue_lock(struct folio *folio)
>> +{
>> +	struct mem_cgroup *memcg;
>> +	struct deferred_split *queue;
>> +
>> +	memcg = folio_memcg(folio);
>> +	queue = memcg ? &memcg->deferred_split_queue :
>> +			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>> +	spin_lock(&queue->split_queue_lock);
>> +
>> +	return queue;
>> +}
>> +
>> +static struct deferred_split *
>> +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
>> +{
>> +	struct mem_cgroup *memcg;
>> +	struct deferred_split *queue;
>> +
>> +	memcg = folio_memcg(folio);
>> +	queue = memcg ? &memcg->deferred_split_queue :
>> +			&NODE_DATA(folio_nid(folio))->deferred_split_queue;
>> +	spin_lock_irqsave(&queue->split_queue_lock, *flags);
>> +
>> +	return queue;
>> +}
> 
> A helper function to get queue from a folio would get rid of duplicated
> code in the two functions above. Hmm, that is the deleted
> get_deferred_split_queue(). So probably retain it.

After PATCH #4, we may retry after getting parent memcg in the above
functions. so we may not need to retrieve get_deferred_split_queue().

> 
> Otherwise, LGTM. Reviewed-by: Zi Yan <ziy@nvidia.com>

Thanks!

> 
> Best Regards,
> Yan, Zi