From: Muchun Song <songmuchun@bytedance.com>
The maintenance of the folio->_deferred_list is intricate because it's
reused in a local list.
Here are some peculiarities:
1) When a folio is removed from its split queue and added to a local
on-stack list in deferred_split_scan(), the ->split_queue_len isn't
updated, leading to an inconsistency between it and the actual
number of folios in the split queue.
2) When the folio is split via split_folio() later, it's removed from
the local list while holding the split queue lock. At this time,
this lock protects the local list, not the split queue.
3) To handle the race condition with a third-party freeing or migrating
the preceding folio, we must ensure there's always one safe (with
raised refcount) folio before by delaying its folio_put(). More
details can be found in commit e66f3185fa04 ("mm/thp: fix deferred
split queue not partially_mapped"). It's rather tricky.
We can use the folio_batch infrastructure to handle this clearly. In this
case, ->split_queue_len will be consistent with the real number of folios
in the split queue. If list_empty(&folio->_deferred_list) returns false,
it's clear the folio must be in its split queue (not in a local list
anymore).
In the future, we will reparent LRU folios during memcg offline to
eliminate dying memory cgroups, which requires reparenting the split queue
to its parent first. So this patch prepares for using
folio_split_queue_lock_irqsave() as the memcg may change then.
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
---
mm/huge_memory.c | 85 ++++++++++++++++++++++--------------------------
1 file changed, 39 insertions(+), 46 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 134666503440d..59ddebc9f3232 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3782,21 +3782,22 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
struct lruvec *lruvec;
int expected_refs;
- if (folio_order(folio) > 1 &&
- !list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
+ if (folio_order(folio) > 1) {
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ /*
+ * Reinitialize page_deferred_list after removing the
+ * page from the split_queue, otherwise a subsequent
+ * split will see list corruption when checking the
+ * page_deferred_list.
+ */
+ list_del_init(&folio->_deferred_list);
+ }
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
- /*
- * Reinitialize page_deferred_list after removing the
- * page from the split_queue, otherwise a subsequent
- * split will see list corruption when checking the
- * page_deferred_list.
- */
- list_del_init(&folio->_deferred_list);
}
split_queue_unlock(ds_queue);
if (mapping) {
@@ -4185,35 +4186,40 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
{
struct deferred_split *ds_queue;
unsigned long flags;
- LIST_HEAD(list);
- struct folio *folio, *next, *prev = NULL;
- int split = 0, removed = 0;
+ struct folio *folio, *next;
+ int split = 0, i;
+ struct folio_batch fbatch;
+ folio_batch_init(&fbatch);
+
+retry:
ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
_deferred_list) {
if (folio_try_get(folio)) {
- list_move(&folio->_deferred_list, &list);
- } else {
+ folio_batch_add(&fbatch, folio);
+ } else if (folio_test_partially_mapped(folio)) {
/* We lost race with folio_put() */
- if (folio_test_partially_mapped(folio)) {
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
+ list_del_init(&folio->_deferred_list);
+ ds_queue->split_queue_len--;
if (!--sc->nr_to_scan)
break;
+ if (!folio_batch_space(&fbatch))
+ break;
}
split_queue_unlock_irqrestore(ds_queue, flags);
- list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
bool did_split = false;
bool underused = false;
+ struct deferred_split *fqueue;
+ folio = fbatch.folios[i];
if (!folio_test_partially_mapped(folio)) {
/*
* See try_to_map_unused_to_zeropage(): we cannot
@@ -4236,38 +4242,25 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
folio_unlock(folio);
next:
+ if (did_split || !folio_test_partially_mapped(folio))
+ continue;
/*
- * split_folio() removes folio from list on success.
* Only add back to the queue if folio is partially mapped.
* If thp_underused returns false, or if split_folio fails
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (did_split) {
- ; /* folio already removed from list */
- } else if (!folio_test_partially_mapped(folio)) {
- list_del_init(&folio->_deferred_list);
- removed++;
- } else {
- /*
- * That unlocked list_del_init() above would be unsafe,
- * unless its folio is separated from any earlier folios
- * left on the list (which may be concurrently unqueued)
- * by one safe folio with refcount still raised.
- */
- swap(folio, prev);
+ fqueue = folio_split_queue_lock_irqsave(folio, &flags);
+ if (list_empty(&folio->_deferred_list)) {
+ list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
+ fqueue->split_queue_len++;
}
- if (folio)
- folio_put(folio);
+ split_queue_unlock_irqrestore(fqueue, flags);
}
+ folios_put(&fbatch);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- list_splice_tail(&list, &ds_queue->split_queue);
- ds_queue->split_queue_len -= removed;
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
-
- if (prev)
- folio_put(prev);
+ if (sc->nr_to_scan)
+ goto retry;
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
--
2.20.1
Hello,
kernel test robot noticed "BUG:soft_lockup-CPU##stuck_for#s![#:#]" on:
commit: 9d46e7734bc76dcb83ab0591de7f7ba94234140a ("[PATCH v4 3/4] mm: thp: use folio_batch to handle THP splitting in deferred_split_scan()")
url: https://github.com/intel-lab-lkp/linux/commits/Qi-Zheng/mm-thp-replace-folio_memcg-with-folio_memcg_charged/20251004-005605
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git e406d57be7bd2a4e73ea512c1ae36a40a44e499e
patch link: https://lore.kernel.org/all/304df1ad1e8180e102c4d6931733bcc77774eb9e.1759510072.git.zhengqi.arch@bytedance.com/
patch subject: [PATCH v4 3/4] mm: thp: use folio_batch to handle THP splitting in deferred_split_scan()
in testcase: xfstests
version: xfstests-x86_64-5a9cd3ef-1_20250910
with following parameters:
disk: 4HDD
fs: f2fs
test: generic-group-39
config: x86_64-rhel-9.4-func
compiler: gcc-14
test machine: 8 threads Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz (Skylake) with 16G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202510142245.b857a693-lkp@intel.com
[ 102.427367][ C6] watchdog: BUG: soft lockup - CPU#6 stuck for 26s! [391:4643]
[ 102.427373][ C6] Modules linked in: dm_mod f2fs binfmt_misc btrfs snd_hda_codec_intelhdmi blake2b_generic snd_hda_codec_hdmi xor zstd_compress intel_rapl_msr intel_rapl_common snd_ctl_led raid6_pq snd_hda_codec_alc269 snd_hda_scodec_component x86_pkg_temp_thermal snd_hda_codec_realtek_lib intel_powerclamp snd_hda_codec_generic coretemp sd_mod snd_hda_intel kvm_intel sg snd_soc_avs snd_soc_hda_codec snd_hda_ext_core i915 snd_hda_codec kvm intel_gtt snd_hda_core drm_buddy snd_intel_dspcfg ttm snd_intel_sdw_acpi snd_hwdep drm_display_helper snd_soc_core irqbypass cec ghash_clmulni_intel drm_client_lib snd_compress mei_wdt drm_kms_helper snd_pcm ahci rapl wmi_bmof mei_me libahci snd_timer intel_cstate i2c_i801 video snd libata soundcore pcspkr intel_uncore mei serio_raw i2c_smbus intel_pmc_core intel_pch_thermal pmt_telemetry pmt_discovery pmt_class wmi intel_pmc_ssram_telemetry acpi_pad intel_vsec drm fuse nfnetlink
[ 102.427441][ C6] CPU: 6 UID: 0 PID: 4643 Comm: 391 Tainted: G S 6.17.0-09102-g9d46e7734bc7 #1 PREEMPT(voluntary)
[ 102.427446][ C6] Tainted: [S]=CPU_OUT_OF_SPEC
[ 102.427448][ C6] Hardware name: HP HP Z240 SFF Workstation/802E, BIOS N51 Ver. 01.63 10/05/2017
[ 102.427449][ C6] RIP: 0010:_raw_spin_unlock_irqrestore (include/linux/spinlock_api_smp.h:152 (discriminator 2) kernel/locking/spinlock.c:194 (discriminator 2))
[ 102.427456][ C6] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 c6 07 00 0f 1f 00 f7 c6 00 02 00 00 74 01 fb 65 ff 0d 65 de 2d 03 <74> 05 c3 cc cc cc cc 0f 1f 44 00 00 c3 cc cc cc cc 0f 1f 40 00 90
All code
========
0: 90 nop
1: 90 nop
2: 90 nop
3: 90 nop
4: 90 nop
5: 90 nop
6: 90 nop
7: 90 nop
8: 90 nop
9: 90 nop
a: 90 nop
b: 90 nop
c: 90 nop
d: 90 nop
e: 90 nop
f: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
14: c6 07 00 movb $0x0,(%rdi)
17: 0f 1f 00 nopl (%rax)
1a: f7 c6 00 02 00 00 test $0x200,%esi
20: 74 01 je 0x23
22: fb sti
23: 65 ff 0d 65 de 2d 03 decl %gs:0x32dde65(%rip) # 0x32dde8f
2a:* 74 05 je 0x31 <-- trapping instruction
2c: c3 ret
2d: cc int3
2e: cc int3
2f: cc int3
30: cc int3
31: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
36: c3 ret
37: cc int3
38: cc int3
39: cc int3
3a: cc int3
3b: 0f 1f 40 00 nopl 0x0(%rax)
3f: 90 nop
Code starting with the faulting instruction
===========================================
0: 74 05 je 0x7
2: c3 ret
3: cc int3
4: cc int3
5: cc int3
6: cc int3
7: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
c: c3 ret
d: cc int3
e: cc int3
f: cc int3
10: cc int3
11: 0f 1f 40 00 nopl 0x0(%rax)
15: 90 nop
[ 102.427458][ C6] RSP: 0018:ffffc900028cf6c8 EFLAGS: 00000246
[ 102.427461][ C6] RAX: ffff88810cb32900 RBX: ffffc900028cfa38 RCX: ffff88810cb32910
[ 102.427463][ C6] RDX: ffff88810cb32900 RSI: 0000000000000246 RDI: ffff88810cb328f8
[ 102.427465][ C6] RBP: ffff88810cb32870 R08: 0000000000000001 R09: fffff52000519ece
[ 102.427467][ C6] R10: 0000000000000003 R11: ffffc900028cf770 R12: ffffea0008400034
[ 102.427469][ C6] R13: dffffc0000000000 R14: ffff88810cb32900 R15: ffff88810cb32870
[ 102.427470][ C6] FS: 00007f2303266740(0000) GS:ffff8883f6bb8000(0000) knlGS:0000000000000000
[ 102.427473][ C6] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 102.427475][ C6] CR2: 00005651624c9c70 CR3: 00000001e42c6002 CR4: 00000000003726f0
[ 102.427477][ C6] Call Trace:
[ 102.427478][ C6] <TASK>
[ 102.427480][ C6] deferred_split_scan (mm/huge_memory.c:4166 mm/huge_memory.c:4240)
[ 102.427485][ C6] ? __list_lru_walk_one (include/linux/spinlock.h:392 mm/list_lru.c:110 mm/list_lru.c:331)
[ 102.427490][ C6] ? __pfx_deferred_split_scan (mm/huge_memory.c:4195)
[ 102.427495][ C6] ? list_lru_count_one (mm/list_lru.c:263 (discriminator 1))
[ 102.427498][ C6] ? super_cache_scan (fs/super.c:232)
[ 102.427501][ C6] ? super_cache_count (fs/super.c:265 (discriminator 1))
[ 102.427504][ C6] do_shrink_slab (mm/shrinker.c:438)
[ 102.427509][ C6] shrink_slab_memcg (mm/shrinker.c:551)
[ 102.427513][ C6] ? __pfx_shrink_slab_memcg (mm/shrinker.c:471)
[ 102.427517][ C6] ? do_shrink_slab (mm/shrinker.c:384)
[ 102.427521][ C6] shrink_slab (mm/shrinker.c:628)
[ 102.427524][ C6] ? __pfx_shrink_slab (mm/shrinker.c:616)
[ 102.427528][ C6] ? mem_cgroup_iter (include/linux/percpu-refcount.h:338 include/linux/percpu-refcount.h:351 include/linux/cgroup_refcnt.h:79 include/linux/cgroup_refcnt.h:76 mm/memcontrol.c:1084)
[ 102.427531][ C6] drop_slab (mm/vmscan.c:435 (discriminator 1) mm/vmscan.c:452 (discriminator 1))
[ 102.427534][ C6] drop_caches_sysctl_handler (include/linux/vmstat.h:68 (discriminator 1) fs/drop_caches.c:69 (discriminator 1) fs/drop_caches.c:51 (discriminator 1))
[ 102.427538][ C6] proc_sys_call_handler (fs/proc/proc_sysctl.c:606)
[ 102.427542][ C6] ? __pfx_proc_sys_call_handler (fs/proc/proc_sysctl.c:555)
[ 102.427545][ C6] ? __pfx_handle_pte_fault (mm/memory.c:6134)
[ 102.427548][ C6] ? rw_verify_area (fs/read_write.c:473)
[ 102.427552][ C6] vfs_write (fs/read_write.c:594 (discriminator 1) fs/read_write.c:686 (discriminator 1))
[ 102.427556][ C6] ? __pfx_vfs_write (fs/read_write.c:667)
[ 102.427559][ C6] ? __pfx_css_rstat_updated (kernel/cgroup/rstat.c:71)
[ 102.427563][ C6] ? fdget_pos (arch/x86/include/asm/atomic64_64.h:15 include/linux/atomic/atomic-arch-fallback.h:2583 include/linux/atomic/atomic-long.h:38 include/linux/atomic/atomic-instrumented.h:3189 include/linux/file_ref.h:215 fs/file.c:1204 fs/file.c:1230)
[ 102.427566][ C6] ? count_memcg_events (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/atomic/atomic-instrumented.h:33 mm/memcontrol.c:560 mm/memcontrol.c:583 mm/memcontrol.c:564 mm/memcontrol.c:846)
[ 102.427568][ C6] ksys_write (fs/read_write.c:738)
[ 102.427572][ C6] ? __pfx_ksys_write (fs/read_write.c:728)
[ 102.427575][ C6] ? handle_mm_fault (mm/memory.c:6360 mm/memory.c:6513)
[ 102.427579][ C6] do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1))
[ 102.427583][ C6] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 102.427586][ C6] RIP: 0033:0x7f23032f8687
[ 102.427589][ C6] Code: 48 89 fa 4c 89 df e8 58 b3 00 00 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 1a 5b c3 0f 1f 84 00 00 00 00 00 48 8b 44 24 10 0f 05 <5b> c3 0f 1f 80 00 00 00 00 83 e2 39 83 fa 08 75 de e8 23 ff ff ff
All code
========
0: 48 89 fa mov %rdi,%rdx
3: 4c 89 df mov %r11,%rdi
6: e8 58 b3 00 00 call 0xb363
b: 8b 93 08 03 00 00 mov 0x308(%rbx),%edx
11: 59 pop %rcx
12: 5e pop %rsi
13: 48 83 f8 fc cmp $0xfffffffffffffffc,%rax
17: 74 1a je 0x33
19: 5b pop %rbx
1a: c3 ret
1b: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
22: 00
23: 48 8b 44 24 10 mov 0x10(%rsp),%rax
28: 0f 05 syscall
2a:* 5b pop %rbx <-- trapping instruction
2b: c3 ret
2c: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
33: 83 e2 39 and $0x39,%edx
36: 83 fa 08 cmp $0x8,%edx
39: 75 de jne 0x19
3b: e8 23 ff ff ff call 0xffffffffffffff63
Code starting with the faulting instruction
===========================================
0: 5b pop %rbx
1: c3 ret
2: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
9: 83 e2 39 and $0x39,%edx
c: 83 fa 08 cmp $0x8,%edx
f: 75 de jne 0xffffffffffffffef
11: e8 23 ff ff ff call 0xffffffffffffff39
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20251014/202510142245.b857a693-lkp@intel.com
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Sat, Oct 04, 2025 at 12:53:17AM +0800, Qi Zheng wrote:
> From: Muchun Song <songmuchun@bytedance.com>
>
> The maintenance of the folio->_deferred_list is intricate because it's
> reused in a local list.
>
> Here are some peculiarities:
>
> 1) When a folio is removed from its split queue and added to a local
> on-stack list in deferred_split_scan(), the ->split_queue_len isn't
> updated, leading to an inconsistency between it and the actual
> number of folios in the split queue.
>
> 2) When the folio is split via split_folio() later, it's removed from
> the local list while holding the split queue lock. At this time,
> this lock protects the local list, not the split queue.
I think the above text needs some massaging. Rather than saying the lock
protects the local list, I think it would be better to say that the
lock is not needed, as it is not protecting anything.
>
> 3) To handle the race condition with a third-party freeing or migrating
> the preceding folio, we must ensure there's always one safe (with
> raised refcount) folio before by delaying its folio_put(). More
> details can be found in commit e66f3185fa04 ("mm/thp: fix deferred
> split queue not partially_mapped"). It's rather tricky.
>
> We can use the folio_batch infrastructure to handle this clearly. In this
> case, ->split_queue_len will be consistent with the real number of folios
> in the split queue. If list_empty(&folio->_deferred_list) returns false,
> it's clear the folio must be in its split queue (not in a local list
> anymore).
>
> In the future, we will reparent LRU folios during memcg offline to
> eliminate dying memory cgroups, which requires reparenting the split queue
> to its parent first. So this patch prepares for using
> folio_split_queue_lock_irqsave() as the memcg may change then.
>
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand <david@redhat.com>
One nit below.
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
> mm/huge_memory.c | 85 ++++++++++++++++++++++--------------------------
> 1 file changed, 39 insertions(+), 46 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 134666503440d..59ddebc9f3232 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3782,21 +3782,22 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> struct lruvec *lruvec;
> int expected_refs;
>
> - if (folio_order(folio) > 1 &&
> - !list_empty(&folio->_deferred_list)) {
> - ds_queue->split_queue_len--;
> + if (folio_order(folio) > 1) {
> + if (!list_empty(&folio->_deferred_list)) {
> + ds_queue->split_queue_len--;
> + /*
> + * Reinitialize page_deferred_list after removing the
> + * page from the split_queue, otherwise a subsequent
> + * split will see list corruption when checking the
> + * page_deferred_list.
> + */
> + list_del_init(&folio->_deferred_list);
> + }
> if (folio_test_partially_mapped(folio)) {
> folio_clear_partially_mapped(folio);
> mod_mthp_stat(folio_order(folio),
> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> }
> - /*
> - * Reinitialize page_deferred_list after removing the
> - * page from the split_queue, otherwise a subsequent
> - * split will see list corruption when checking the
> - * page_deferred_list.
> - */
> - list_del_init(&folio->_deferred_list);
> }
> split_queue_unlock(ds_queue);
> if (mapping) {
> @@ -4185,35 +4186,40 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> {
> struct deferred_split *ds_queue;
> unsigned long flags;
> - LIST_HEAD(list);
> - struct folio *folio, *next, *prev = NULL;
> - int split = 0, removed = 0;
> + struct folio *folio, *next;
> + int split = 0, i;
> + struct folio_batch fbatch;
>
> + folio_batch_init(&fbatch);
> +
> +retry:
> ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> /* Take pin on all head pages to avoid freeing them under us */
> list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> _deferred_list) {
> if (folio_try_get(folio)) {
> - list_move(&folio->_deferred_list, &list);
> - } else {
> + folio_batch_add(&fbatch, folio);
> + } else if (folio_test_partially_mapped(folio)) {
> /* We lost race with folio_put() */
> - if (folio_test_partially_mapped(folio)) {
> - folio_clear_partially_mapped(folio);
> - mod_mthp_stat(folio_order(folio),
> - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> - }
> - list_del_init(&folio->_deferred_list);
> - ds_queue->split_queue_len--;
> + folio_clear_partially_mapped(folio);
> + mod_mthp_stat(folio_order(folio),
> + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> }
> + list_del_init(&folio->_deferred_list);
> + ds_queue->split_queue_len--;
> if (!--sc->nr_to_scan)
> break;
> + if (!folio_batch_space(&fbatch))
> + break;
> }
> split_queue_unlock_irqrestore(ds_queue, flags);
>
> - list_for_each_entry_safe(folio, next, &list, _deferred_list) {
> + for (i = 0; i < folio_batch_count(&fbatch); i++) {
> bool did_split = false;
> bool underused = false;
> + struct deferred_split *fqueue;
>
> + folio = fbatch.folios[i];
> if (!folio_test_partially_mapped(folio)) {
> /*
> * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4236,38 +4242,25 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> }
> folio_unlock(folio);
> next:
> + if (did_split || !folio_test_partially_mapped(folio))
> + continue;
> /*
> - * split_folio() removes folio from list on success.
> * Only add back to the queue if folio is partially mapped.
> * If thp_underused returns false, or if split_folio fails
> * in the case it was underused, then consider it used and
> * don't add it back to split_queue.
> */
> - if (did_split) {
> - ; /* folio already removed from list */
> - } else if (!folio_test_partially_mapped(folio)) {
> - list_del_init(&folio->_deferred_list);
> - removed++;
> - } else {
> - /*
> - * That unlocked list_del_init() above would be unsafe,
> - * unless its folio is separated from any earlier folios
> - * left on the list (which may be concurrently unqueued)
> - * by one safe folio with refcount still raised.
> - */
> - swap(folio, prev);
> + fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> + if (list_empty(&folio->_deferred_list)) {
> + list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> + fqueue->split_queue_len++;
> }
> - if (folio)
> - folio_put(folio);
> + split_queue_unlock_irqrestore(fqueue, flags);
Is it possible to move this lock/list_add/unlock code chunk out of the loop
and before the folios_put()? I think it would be possible if you tag the
corresponding index or have a separate bool array. It is also reasonable
to claim that contention on this lock is not a concern for now.
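Purely to illustrate (untested, and the helper array name is made up), with
a bool array sized like the batch it could look something like:

	bool requeue[PAGEVEC_SIZE] = { false };

	for (i = 0; i < folio_batch_count(&fbatch); i++) {
		...
		requeue[i] = !did_split && folio_test_partially_mapped(folio);
	}

	/* Single requeue pass, still before folios_put(). */
	for (i = 0; i < folio_batch_count(&fbatch); i++) {
		if (!requeue[i])
			continue;
		folio = fbatch.folios[i];
		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
		if (list_empty(&folio->_deferred_list)) {
			list_add_tail(&folio->_deferred_list,
				      &fqueue->split_queue);
			fqueue->split_queue_len++;
		}
		split_queue_unlock_irqrestore(fqueue, flags);
	}
	folios_put(&fbatch);

(The per-folio queue lock is still taken in the second pass, since the
folios may belong to different queues, so this is mostly structural.)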
> }
> + folios_put(&fbatch);
>
> - spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> - list_splice_tail(&list, &ds_queue->split_queue);
> - ds_queue->split_queue_len -= removed;
> - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> -
> - if (prev)
> - folio_put(prev);
> + if (sc->nr_to_scan)
> + goto retry;
>
> /*
> * Stop shrinker if we didn't split any page, but the queue is empty.
> --
> 2.20.1
>
Hi Shakeel,
On 10/7/25 7:16 AM, Shakeel Butt wrote:
> On Sat, Oct 04, 2025 at 12:53:17AM +0800, Qi Zheng wrote:
>> From: Muchun Song <songmuchun@bytedance.com>
>>
>> The maintenance of the folio->_deferred_list is intricate because it's
>> reused in a local list.
>>
>> Here are some peculiarities:
>>
>> 1) When a folio is removed from its split queue and added to a local
>> on-stack list in deferred_split_scan(), the ->split_queue_len isn't
>> updated, leading to an inconsistency between it and the actual
>> number of folios in the split queue.
>>
>> 2) When the folio is split via split_folio() later, it's removed from
>> the local list while holding the split queue lock. At this time,
>> this lock protects the local list, not the split queue.
>
> I think the above text needs some massaging. Rather than saying the lock
> protects the local list, I think it would be better to say that the
> lock is not needed, as it is not protecting anything.
Makes sense, will do.
>
>>
>> 3) To handle the race condition with a third-party freeing or migrating
>> the preceding folio, we must ensure there's always one safe (with
>> raised refcount) folio before by delaying its folio_put(). More
>> details can be found in commit e66f3185fa04 ("mm/thp: fix deferred
>> split queue not partially_mapped"). It's rather tricky.
>>
>> We can use the folio_batch infrastructure to handle this clearly. In this
>> case, ->split_queue_len will be consistent with the real number of folios
>> in the split queue. If list_empty(&folio->_deferred_list) returns false,
>> it's clear the folio must be in its split queue (not in a local list
>> anymore).
>>
>> In the future, we will reparent LRU folios during memcg offline to
>> eliminate dying memory cgroups, which requires reparenting the split queue
>> to its parent first. So this patch prepares for using
>> folio_split_queue_lock_irqsave() as the memcg may change then.
>>
>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> Reviewed-by: Zi Yan <ziy@nvidia.com>
>> Acked-by: David Hildenbrand <david@redhat.com>
>
> One nit below.
>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Thanks!
>
>> ---
>> mm/huge_memory.c | 85 ++++++++++++++++++++++--------------------------
>> 1 file changed, 39 insertions(+), 46 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 134666503440d..59ddebc9f3232 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3782,21 +3782,22 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>> struct lruvec *lruvec;
>> int expected_refs;
>>
>> - if (folio_order(folio) > 1 &&
>> - !list_empty(&folio->_deferred_list)) {
>> - ds_queue->split_queue_len--;
>> + if (folio_order(folio) > 1) {
>> + if (!list_empty(&folio->_deferred_list)) {
>> + ds_queue->split_queue_len--;
>> + /*
>> + * Reinitialize page_deferred_list after removing the
>> + * page from the split_queue, otherwise a subsequent
>> + * split will see list corruption when checking the
>> + * page_deferred_list.
>> + */
>> + list_del_init(&folio->_deferred_list);
>> + }
>> if (folio_test_partially_mapped(folio)) {
>> folio_clear_partially_mapped(folio);
>> mod_mthp_stat(folio_order(folio),
>> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>> }
>> - /*
>> - * Reinitialize page_deferred_list after removing the
>> - * page from the split_queue, otherwise a subsequent
>> - * split will see list corruption when checking the
>> - * page_deferred_list.
>> - */
>> - list_del_init(&folio->_deferred_list);
>> }
>> split_queue_unlock(ds_queue);
>> if (mapping) {
>> @@ -4185,35 +4186,40 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>> {
>> struct deferred_split *ds_queue;
>> unsigned long flags;
>> - LIST_HEAD(list);
>> - struct folio *folio, *next, *prev = NULL;
>> - int split = 0, removed = 0;
>> + struct folio *folio, *next;
>> + int split = 0, i;
>> + struct folio_batch fbatch;
>>
>> + folio_batch_init(&fbatch);
>> +
>> +retry:
>> ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
>> /* Take pin on all head pages to avoid freeing them under us */
>> list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
>> _deferred_list) {
>> if (folio_try_get(folio)) {
>> - list_move(&folio->_deferred_list, &list);
>> - } else {
>> + folio_batch_add(&fbatch, folio);
>> + } else if (folio_test_partially_mapped(folio)) {
>> /* We lost race with folio_put() */
>> - if (folio_test_partially_mapped(folio)) {
>> - folio_clear_partially_mapped(folio);
>> - mod_mthp_stat(folio_order(folio),
>> - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>> - }
>> - list_del_init(&folio->_deferred_list);
>> - ds_queue->split_queue_len--;
>> + folio_clear_partially_mapped(folio);
>> + mod_mthp_stat(folio_order(folio),
>> + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>> }
>> + list_del_init(&folio->_deferred_list);
>> + ds_queue->split_queue_len--;
>> if (!--sc->nr_to_scan)
>> break;
>> + if (!folio_batch_space(&fbatch))
>> + break;
>> }
>> split_queue_unlock_irqrestore(ds_queue, flags);
>>
>> - list_for_each_entry_safe(folio, next, &list, _deferred_list) {
>> + for (i = 0; i < folio_batch_count(&fbatch); i++) {
>> bool did_split = false;
>> bool underused = false;
>> + struct deferred_split *fqueue;
>>
>> + folio = fbatch.folios[i];
>> if (!folio_test_partially_mapped(folio)) {
>> /*
>> * See try_to_map_unused_to_zeropage(): we cannot
>> @@ -4236,38 +4242,25 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>> }
>> folio_unlock(folio);
>> next:
>> + if (did_split || !folio_test_partially_mapped(folio))
>> + continue;
>> /*
>> - * split_folio() removes folio from list on success.
>> * Only add back to the queue if folio is partially mapped.
>> * If thp_underused returns false, or if split_folio fails
>> * in the case it was underused, then consider it used and
>> * don't add it back to split_queue.
>> */
>> - if (did_split) {
>> - ; /* folio already removed from list */
>> - } else if (!folio_test_partially_mapped(folio)) {
>> - list_del_init(&folio->_deferred_list);
>> - removed++;
>> - } else {
>> - /*
>> - * That unlocked list_del_init() above would be unsafe,
>> - * unless its folio is separated from any earlier folios
>> - * left on the list (which may be concurrently unqueued)
>> - * by one safe folio with refcount still raised.
>> - */
>> - swap(folio, prev);
>> + fqueue = folio_split_queue_lock_irqsave(folio, &flags);
>> + if (list_empty(&folio->_deferred_list)) {
>> + list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
>> + fqueue->split_queue_len++;
>> }
>> - if (folio)
>> - folio_put(folio);
>> + split_queue_unlock_irqrestore(fqueue, flags);
>
> Is it possible to move this lock/list_add/unlock code chunk out of the loop
> and before the folios_put()? I think it would be possible if you tag the
> corresponding index or have a separate bool array. It is also reasonable
> to claim that contention on this lock is not a concern for now.
Considering the code complexity, perhaps we could wait until contention
on this lock becomes a problem?
Thanks,
Qi
>
>> }
>> + folios_put(&fbatch);
>>
>> - spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> - list_splice_tail(&list, &ds_queue->split_queue);
>> - ds_queue->split_queue_len -= removed;
>> - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> -
>> - if (prev)
>> - folio_put(prev);
>> + if (sc->nr_to_scan)
>> + goto retry;
>>
>> /*
>> * Stop shrinker if we didn't split any page, but the queue is empty.
>> --
>> 2.20.1
>>