A report [1] was uploaded by syzbot.

Commit 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache")
made __try_to_reclaim_swap() read folio->swap and the offset derived from
it without folio_lock protection.

For the call trace shown in the reported KCSAN log, the race is assumed
to be harmless in practice, because the WRITE side shown there takes
folio_lock before writing.

However, __try_to_reclaim_swap() previously performed these reads under
folio_lock protection, and a data race could still occur through a writer
other than the one shown in the KCSAN log.

Therefore, move the reads of folio->swap and the derived offset so that
they are performed under folio_lock.
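
To make the interleaving concrete, below is a reduced sketch of the two
racing paths. This is my own simplification for illustration, not the
verbatim kernel code: the helper names writer_side_sketch() and
reader_side_sketch() are made up, and the writer body reflects my reading
of __delete_from_swap_cache(), which clears folio->swap while holding
folio_lock.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/* CPU 0 (__delete_from_swap_cache): always runs with folio_lock held. */
static void writer_side_sketch(struct folio *folio)
{
	/* The swap entry is cleared only under folio_lock. */
	folio->swap.val = 0;
	folio_clear_swapcache(folio);
}

/* CPU 1 (__try_to_reclaim_swap before this patch): reads precede the lock. */
static int reader_side_sketch(struct folio *folio)
{
	swp_entry_t entry;
	unsigned long offset;

	entry = folio->swap;		/* read with only a folio reference */
	offset = swp_offset(entry);	/* may observe the concurrent clear */

	if (!folio_trylock(folio))	/* the lock is only taken afterwards */
		return 0;

	/* reclaim work consuming 'offset' would follow here */
	folio_unlock(folio);
	return offset ? 1 : 0;
}

Moving the reads below folio_trylock(), as the diff below does, serializes
them against any writer that clears folio->swap under folio_lock.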
[1]
==================================================================
BUG: KCSAN: data-race in __delete_from_swap_cache / __try_to_reclaim_swap
write to 0xffffea0004c90328 of 8 bytes by task 5186 on cpu 0:
__delete_from_swap_cache+0x1f0/0x290 mm/swap_state.c:163
delete_from_swap_cache+0x72/0xe0 mm/swap_state.c:243
folio_free_swap+0x1d8/0x1f0 mm/swapfile.c:1850
free_swap_cache mm/swap_state.c:293 [inline]
free_pages_and_swap_cache+0x1fc/0x410 mm/swap_state.c:325
__tlb_batch_free_encoded_pages mm/mmu_gather.c:136 [inline]
tlb_batch_pages_flush mm/mmu_gather.c:149 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:366 [inline]
tlb_flush_mmu+0x2cf/0x440 mm/mmu_gather.c:373
zap_pte_range mm/memory.c:1700 [inline]
zap_pmd_range mm/memory.c:1739 [inline]
zap_pud_range mm/memory.c:1768 [inline]
zap_p4d_range mm/memory.c:1789 [inline]
unmap_page_range+0x1f3c/0x22d0 mm/memory.c:1810
unmap_single_vma+0x142/0x1d0 mm/memory.c:1856
unmap_vmas+0x18d/0x2b0 mm/memory.c:1900
exit_mmap+0x18a/0x690 mm/mmap.c:1864
__mmput+0x28/0x1b0 kernel/fork.c:1347
mmput+0x4c/0x60 kernel/fork.c:1369
exit_mm+0xe4/0x190 kernel/exit.c:571
do_exit+0x55e/0x17f0 kernel/exit.c:926
do_group_exit+0x102/0x150 kernel/exit.c:1088
get_signal+0xf2a/0x1070 kernel/signal.c:2917
arch_do_signal_or_restart+0x95/0x4b0 arch/x86/kernel/signal.c:337
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
syscall_exit_to_user_mode+0x59/0x130 kernel/entry/common.c:218
do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89
entry_SYSCALL_64_after_hwframe+0x77/0x7f
read to 0xffffea0004c90328 of 8 bytes by task 5189 on cpu 1:
__try_to_reclaim_swap+0x9d/0x510 mm/swapfile.c:198
free_swap_and_cache_nr+0x45d/0x8a0 mm/swapfile.c:1915
zap_pte_range mm/memory.c:1656 [inline]
zap_pmd_range mm/memory.c:1739 [inline]
zap_pud_range mm/memory.c:1768 [inline]
zap_p4d_range mm/memory.c:1789 [inline]
unmap_page_range+0xcf8/0x22d0 mm/memory.c:1810
unmap_single_vma+0x142/0x1d0 mm/memory.c:1856
unmap_vmas+0x18d/0x2b0 mm/memory.c:1900
exit_mmap+0x18a/0x690 mm/mmap.c:1864
__mmput+0x28/0x1b0 kernel/fork.c:1347
mmput+0x4c/0x60 kernel/fork.c:1369
exit_mm+0xe4/0x190 kernel/exit.c:571
do_exit+0x55e/0x17f0 kernel/exit.c:926
__do_sys_exit kernel/exit.c:1055 [inline]
__se_sys_exit kernel/exit.c:1053 [inline]
__x64_sys_exit+0x1f/0x20 kernel/exit.c:1053
x64_sys_call+0x2d46/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:61
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f
value changed: 0x0000000000000242 -> 0x0000000000000000
Reported-by: syzbot+fa43f1b63e3aa6f66329@syzkaller.appspotmail.com
Fixes: 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
---
mm/swapfile.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0cded32414a1..eb782fcd5627 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -194,9 +194,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	if (IS_ERR(folio))
 		return 0;
 
-	/* offset could point to the middle of a large folio */
-	entry = folio->swap;
-	offset = swp_offset(entry);
 	nr_pages = folio_nr_pages(folio);
 	ret = -nr_pages;
 
@@ -210,6 +207,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	if (!folio_trylock(folio))
 		goto out;
 
+	/* offset could point to the middle of a large folio */
+	entry = folio->swap;
+	offset = swp_offset(entry);
+
 	need_reclaim = ((flags & TTRS_ANYWAY) ||
 			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
 			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
--
On Mon, Oct 7, 2024 at 3:06 PM Jeongjun Park <aha310510@gmail.com> wrote:
> [...]

Reviewed-by: Kairui Song <kasong@tencent.com>

Hi Andrew,

Will this be added to stable and 6.12? 862590ac3708 is already in 6.12
and this fixes a potential issue of it.
Kairui Song <ryncsn@gmail.com> wrote:
> Reviewed-by: Kairui Song <kasong@tencent.com>
>
> Hi Andrew,
>
> Will this be added to stable and 6.12? 862590ac3708 is already in 6.12
> and this fixes a potential issue of it.

As far as I can see, commit 862590ac3708 was applied starting
from 6.12-rc1, so it looks like no additional commits are needed
for the stable version.

Regards,

Jeongjun Park
On Mon, Oct 14, 2024 at 10:17 AM Jeongjun Park <aha310510@gmail.com> wrote:
> > Will this be added to stable and 6.12? 862590ac3708 is already in 6.12
> > and this fixes a potential issue of it.
>
> As far as I can see, commit 862590ac3708 was applied starting
> from 6.12-rc1, so it looks like no additional commits are needed
> for the stable version.

Hi, sorry for the confusion, I meant mm-stable, not the stable branch.
It's better to merge this in 6.12.
Kairui Song <ryncsn@gmail.com> wrote:
> > As far as I can see, commit 862590ac3708 was applied starting
> > from 6.12-rc1, so it looks like no additional commits are needed
> > for the stable version.
>
> Hi, sorry for the confusion, I meant mm-stable, not the stable branch.
> It's better to merge this in 6.12.

I agree with you. I think this vulnerability should be fixed quickly,
so it should be applied directly to the next rc version, not the
next tree. However, this vulnerability does not affect the stable
version, so I think it is appropriate to move this patch to the
mm-hotfixes-unstable tree.

What do you think, Andrew?

Regards,

Jeongjun Park
On Mon, 14 Oct 2024 13:08:29 +0900 Jeongjun Park <aha310510@gmail.com> wrote:
> I agree with you. I think this vulnerability should be fixed quickly,
> so it should be applied directly to the next rc version, not the
> next tree. However, this vulnerability does not affect the stable
> version, so I think it is appropriate to move this patch to the
> mm-hotfixes-unstable tree.
>
> What do you think, Andrew?

I moved it into the mm-hotfixes branch for a 6.12 merge.
On Mon, Oct 7, 2024 at 12:06 AM Jeongjun Park <aha310510@gmail.com> wrote:
> [...]
>
> @@ -210,6 +207,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> 	if (!folio_trylock(folio))
> 		goto out;
>
> +	/* offset could point to the middle of a large folio */
> +	entry = folio->swap;
> +	offset = swp_offset(entry);
> +

Looks good to me, we do need to get the folio->swap after the folio is
locked.

Acked-by: Chris Li <chrisl@kernel.org>

Chris