kernel/events/uprobes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
We triggered the following error logs in syzkaller test:
BUG: Bad page state in process syz.7.38 pfn:1eff3
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1eff3
flags: 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff)
raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000fffffffe 0000000000000000
page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x32/0x50
bad_page+0x69/0xf0
free_unref_page_prepare+0x401/0x500
free_unref_page+0x6d/0x1b0
uprobe_write_opcode+0x460/0x8e0
install_breakpoint.part.0+0x51/0x80
register_for_each_vma+0x1d9/0x2b0
__uprobe_register+0x245/0x300
bpf_uprobe_multi_link_attach+0x29b/0x4f0
link_create+0x1e2/0x280
__sys_bpf+0x75f/0xac0
__x64_sys_bpf+0x1a/0x30
do_syscall_64+0x56/0x100
entry_SYSCALL_64_after_hwframe+0x78/0xe2
BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES val:-1
The following syzkaller test case can be used to reproduce:
r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8)
write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10)
r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x42, 0x0)
mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r4, 0x0)
r5 = userfaultfd(0x80801)
ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20})
r6 = userfaultfd(0x80801)
ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140))
ioctl$UFFDIO_REGISTER(r6, 0xc020aa00, &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2})
ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04, &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}})
r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3, &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00', 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, @void, @value}, 0x94)
bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30, 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40)
The cause is that zero pfn is set to the pte without increasing the rss
count in mfill_atomic_pte_zeropage() and the refcount of zero folio does
not increase accordingly. Then, the operation on the same pfn is performed
in uprobe_write_opcode()->__replace_page() to unconditional decrease the
rss count and old_folio's refcount.
Therefore, two bugs are introduced:
1. The rss count is incorrect, when process exit, the check_mm() report
error "Bad rss-count".
2. The reserved folio (zero folio) is freed when folio->refcount is zero,
then free_pages_prepare->free_page_is_bad() report error "Bad page state".
To fix it, add zero folio check before rss counter and refcount decrease.
Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters")
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
---
kernel/events/uprobes.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 46ddf3a2334d..ff5694acfa68 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
dec_mm_counter(mm, MM_ANONPAGES);
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_folio));
+ if (!is_zero_folio(old_folio))
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (!folio_mapped(old_folio))
folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
- folio_put(old_folio);
+ if (!is_zero_folio(old_folio))
+ folio_put(old_folio);
err = 0;
unlock:
--
2.25.1
在 2025/2/17 20:38, Tong Tiangen 写道:
> We triggered the following error logs in syzkaller test:
>
> BUG: Bad page state in process syz.7.38 pfn:1eff3
> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1eff3
> flags: 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff)
> raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8 0000000000000000
> raw: 0000000000000000 0000000000000000 00000000fffffffe 0000000000000000
> page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x32/0x50
> bad_page+0x69/0xf0
> free_unref_page_prepare+0x401/0x500
> free_unref_page+0x6d/0x1b0
> uprobe_write_opcode+0x460/0x8e0
> install_breakpoint.part.0+0x51/0x80
> register_for_each_vma+0x1d9/0x2b0
> __uprobe_register+0x245/0x300
> bpf_uprobe_multi_link_attach+0x29b/0x4f0
> link_create+0x1e2/0x280
> __sys_bpf+0x75f/0xac0
> __x64_sys_bpf+0x1a/0x30
> do_syscall_64+0x56/0x100
> entry_SYSCALL_64_after_hwframe+0x78/0xe2
>
> BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES val:-1
>
> The following syzkaller test case can be used to reproduce:
>
> r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8)
> write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10)
> r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x42, 0x0)
> mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r4, 0x0)
> r5 = userfaultfd(0x80801)
> ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20})
> r6 = userfaultfd(0x80801)
> ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140))
> ioctl$UFFDIO_REGISTER(r6, 0xc020aa00, &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2})
> ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04, &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}})
> r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3, &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00', 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, @void, @value}, 0x94)
> bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30, 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40)
>
> The cause is that zero pfn is set to the pte without increasing the rss
> count in mfill_atomic_pte_zeropage() and the refcount of zero folio does
> not increase accordingly. Then, the operation on the same pfn is performed
> in uprobe_write_opcode()->__replace_page() to unconditional decrease the
> rss count and old_folio's refcount.
>
> Therefore, two bugs are introduced:
> 1. The rss count is incorrect, when process exit, the check_mm() report
> error "Bad rss-count".
> 2. The reserved folio (zero folio) is freed when folio->refcount is zero,
> then free_pages_prepare->free_page_is_bad() report error "Bad page state".
>
> To fix it, add zero folio check before rss counter and refcount decrease.
>
> Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters")
> Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> ---
> kernel/events/uprobes.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 46ddf3a2334d..ff5694acfa68 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> dec_mm_counter(mm, MM_ANONPAGES);
>
> if (!folio_test_anon(old_folio)) {
> - dec_mm_counter(mm, mm_counter_file(old_folio));
> + if (!is_zero_folio(old_folio))
> + dec_mm_counter(mm, mm_counter_file(old_folio));
> inc_mm_counter(mm, MM_ANONPAGES);
> }
>
> @@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> if (!folio_mapped(old_folio))
> folio_free_swap(old_folio);
> page_vma_mapped_walk_done(&pvmw);
> - folio_put(old_folio);
> + if (!is_zero_folio(old_folio))
> + folio_put(old_folio);
>
> err = 0;
> unlock:
On 18.02.25 03:47, Tong Tiangen wrote:
>
>
> 在 2025/2/17 20:38, Tong Tiangen 写道:
>> We triggered the following error logs in syzkaller test:
>>
>> BUG: Bad page state in process syz.7.38 pfn:1eff3
>> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1eff3
>> flags: 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff)
>> raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8 0000000000000000
>> raw: 0000000000000000 0000000000000000 00000000fffffffe 0000000000000000
>> page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
>> Call Trace:
>> <TASK>
>> dump_stack_lvl+0x32/0x50
>> bad_page+0x69/0xf0
>> free_unref_page_prepare+0x401/0x500
>> free_unref_page+0x6d/0x1b0
>> uprobe_write_opcode+0x460/0x8e0
>> install_breakpoint.part.0+0x51/0x80
>> register_for_each_vma+0x1d9/0x2b0
>> __uprobe_register+0x245/0x300
>> bpf_uprobe_multi_link_attach+0x29b/0x4f0
>> link_create+0x1e2/0x280
>> __sys_bpf+0x75f/0xac0
>> __x64_sys_bpf+0x1a/0x30
>> do_syscall_64+0x56/0x100
>> entry_SYSCALL_64_after_hwframe+0x78/0xe2
>>
>> BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES val:-1
>>
>> The following syzkaller test case can be used to reproduce:
>>
>> r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8)
>> write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10)
>> r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x42, 0x0)
>> mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r4, 0x0)
>> r5 = userfaultfd(0x80801)
>> ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20})
>> r6 = userfaultfd(0x80801)
>> ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140))
>> ioctl$UFFDIO_REGISTER(r6, 0xc020aa00, &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2})
>> ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04, &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}})
>> r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3, &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00', 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, @void, @value}, 0x94)
>> bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30, 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40)
>>
>> The cause is that zero pfn is set to the pte without increasing the rss
>> count in mfill_atomic_pte_zeropage() and the refcount of zero folio does
>> not increase accordingly. Then, the operation on the same pfn is performed
>> in uprobe_write_opcode()->__replace_page() to unconditional decrease the
>> rss count and old_folio's refcount.
>>
>> Therefore, two bugs are introduced:
>> 1. The rss count is incorrect, when process exit, the check_mm() report
>> error "Bad rss-count".
>> 2. The reserved folio (zero folio) is freed when folio->refcount is zero,
>> then free_pages_prepare->free_page_is_bad() report error "Bad page state".
>>
>> To fix it, add zero folio check before rss counter and refcount decrease.
>>
>> Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters")
>> Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
>> ---
>> kernel/events/uprobes.c | 6 ++++--
>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
>> index 46ddf3a2334d..ff5694acfa68 100644
>> --- a/kernel/events/uprobes.c
>> +++ b/kernel/events/uprobes.c
>> @@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>> dec_mm_counter(mm, MM_ANONPAGES);
>>
>> if (!folio_test_anon(old_folio)) {
>> - dec_mm_counter(mm, mm_counter_file(old_folio));
>> + if (!is_zero_folio(old_folio))
>> + dec_mm_counter(mm, mm_counter_file(old_folio));
>> inc_mm_counter(mm, MM_ANONPAGES);
>> }
>>
>> @@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>> if (!folio_mapped(old_folio))
>> folio_free_swap(old_folio);
>> page_vma_mapped_walk_done(&pvmw);
>> - folio_put(old_folio);
>> + if (!is_zero_folio(old_folio))
>> + folio_put(old_folio);
>> >> err = 0;
>> unlock:
>
The whole "manually replace pages" logic is fragile. I tried to rewrite
it a while back:
https://lkml.kernel.org/r/20240604122548.359952-1-david@redhat.com
But didn't get to follow-up yet.
I'm not sure if the page_vma_mapped_walk() really does what we would
expect here.
The folio_remove_rmap_pte(old_folio, old_page, vma); is certainly wrong
as well for ero folios.
I don't think there is a sane use case right now where we would hit the
shared zeropage.
So for the time being, I think we should just reject them immediately
after get_user_page_vma_remote().
At some point I'll follow up with my rewrite that will clean this
nastiness here up a bit.
--
Cheers,
David / dhildenb
在 2025/2/18 16:23, David Hildenbrand 写道:
> On 18.02.25 03:47, Tong Tiangen wrote:
>>
>>
>> 在 2025/2/17 20:38, Tong Tiangen 写道:
>>> We triggered the following error logs in syzkaller test:
>>>
>>> BUG: Bad page state in process syz.7.38 pfn:1eff3
>>> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0
>>> pfn:0x1eff3
>>> flags:
>>> 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff)
>>> raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8
>>> 0000000000000000
>>> raw: 0000000000000000 0000000000000000 00000000fffffffe
>>> 0000000000000000
>>> page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
>>> 1.13.0-1ubuntu1.1 04/01/2014
>>> Call Trace:
>>> <TASK>
>>> dump_stack_lvl+0x32/0x50
>>> bad_page+0x69/0xf0
>>> free_unref_page_prepare+0x401/0x500
>>> free_unref_page+0x6d/0x1b0
>>> uprobe_write_opcode+0x460/0x8e0
>>> install_breakpoint.part.0+0x51/0x80
>>> register_for_each_vma+0x1d9/0x2b0
>>> __uprobe_register+0x245/0x300
>>> bpf_uprobe_multi_link_attach+0x29b/0x4f0
>>> link_create+0x1e2/0x280
>>> __sys_bpf+0x75f/0xac0
>>> __x64_sys_bpf+0x1a/0x30
>>> do_syscall_64+0x56/0x100
>>> entry_SYSCALL_64_after_hwframe+0x78/0xe2
>>>
>>> BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES
>>> val:-1
>>>
>>> The following syzkaller test case can be used to reproduce:
>>>
>>> r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8)
>>> write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10)
>>> r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00',
>>> 0x42, 0x0)
>>> mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000,
>>> 0x0, 0x12, r4, 0x0)
>>> r5 = userfaultfd(0x80801)
>>> ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20})
>>> r6 = userfaultfd(0x80801)
>>> ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140))
>>> ioctl$UFFDIO_REGISTER(r6, 0xc020aa00,
>>> &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2})
>>> ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04,
>>> &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}})
>>> r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3,
>>> &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00',
>>> 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30,
>>> 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
>>> 0x0, 0x10, 0x0, @void, @value}, 0x94)
>>> bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30,
>>> 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00',
>>> &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40)
>>>
>>> The cause is that zero pfn is set to the pte without increasing the rss
>>> count in mfill_atomic_pte_zeropage() and the refcount of zero folio does
>>> not increase accordingly. Then, the operation on the same pfn is
>>> performed
>>> in uprobe_write_opcode()->__replace_page() to unconditional decrease the
>>> rss count and old_folio's refcount.
>>>
>>> Therefore, two bugs are introduced:
>>> 1. The rss count is incorrect, when process exit, the check_mm() report
>>> error "Bad rss-count".
>>> 2. The reserved folio (zero folio) is freed when folio->refcount is
>>> zero,
>>> then free_pages_prepare->free_page_is_bad() report error "Bad
>>> page state".
>>>
>>> To fix it, add zero folio check before rss counter and refcount
>>> decrease.
>>>
>>> Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement
>>> logic account for rss_stat counters")
>>> Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install
>>> and remove uprobes breakpoints")
>>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
>>> ---
>>> kernel/events/uprobes.c | 6 ++++--
>>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
>>> index 46ddf3a2334d..ff5694acfa68 100644
>>> --- a/kernel/events/uprobes.c
>>> +++ b/kernel/events/uprobes.c
>>> @@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct
>>> *vma, unsigned long addr,
>>> dec_mm_counter(mm, MM_ANONPAGES);
>>> if (!folio_test_anon(old_folio)) {
>>> - dec_mm_counter(mm, mm_counter_file(old_folio));
>>> + if (!is_zero_folio(old_folio))
>>> + dec_mm_counter(mm, mm_counter_file(old_folio));
>>> inc_mm_counter(mm, MM_ANONPAGES);
>>> }
>>> @@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct
>>> *vma, unsigned long addr,
>>> if (!folio_mapped(old_folio))
>>> folio_free_swap(old_folio);
>>> page_vma_mapped_walk_done(&pvmw);
>>> - folio_put(old_folio);
>>> + if (!is_zero_folio(old_folio))
>>> + folio_put(old_folio);
> >> >> err = 0;
>>> unlock:
>>
>
> The whole "manually replace pages" logic is fragile. I tried to rewrite
> it a while back:
>
> https://lkml.kernel.org/r/20240604122548.359952-1-david@redhat.com
>
> But didn't get to follow-up yet.
>
> I'm not sure if the page_vma_mapped_walk() really does what we would
> expect here.
>
> The folio_remove_rmap_pte(old_folio, old_page, vma); is certainly wrong
> as well for ero folios.
>
>
> I don't think there is a sane use case right now where we would hit the
> shared zeropage.
>
> So for the time being, I think we should just reject them immediately
> after get_user_page_vma_remote().
>
> At some point I'll follow up with my rewrite that will clean this
> nastiness here up a bit.
OK, Before your rewrite last merged, How about i change the solution to
just reject them immediately after get_user_page_vma_remote()?
Thanks,
Tong.
>
On 02/18, Tong Tiangen wrote: > > OK, Before your rewrite last merged, How about i change the solution to > just reject them immediately after get_user_page_vma_remote()? I agree, uprobe_write_opcode() should simply fail if is_zero_page(old_page). Oleg.
On 19.02.25 16:22, Oleg Nesterov wrote: > On 02/18, Tong Tiangen wrote: >> >> OK, Before your rewrite last merged, How about i change the solution to >> just reject them immediately after get_user_page_vma_remote()? > > I agree, uprobe_write_opcode() should simply fail if is_zero_page(old_page). Yes. That's currently only syzkaller that triggers it, not some sane use case. -- Cheers, David / dhildenb
在 2025/2/20 0:12, David Hildenbrand 写道:
> On 19.02.25 16:22, Oleg Nesterov wrote:
>> On 02/18, Tong Tiangen wrote:
>>>
>>> OK, Before your rewrite last merged, How about i change the solution to
>>> just reject them immediately after get_user_page_vma_remote()?
>>
>> I agree, uprobe_write_opcode() should simply fail if
>> is_zero_page(old_page).
>
> Yes. That's currently only syzkaller that triggers it, not some sane use
> case.
OK, change as follows:
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -506,6 +506,12 @@ int uprobe_write_opcode(struct arch_uprobe
*auprobe, struct mm_struct *mm,
if (ret <= 0)
goto put_old;
+ if (WARN(is_zero_page(old_page),
+ "uprobe should never work on zero page\n")) {
+ ret = -EINVAL;
+ goto put_old;
+ }
+
if (WARN(!is_register && PageCompound(old_page),
"uprobe unregister should never work on compound
page\n")) {
ret = -EINVAL;
If ok, i will send v2 soon.
Thanks,
Tong.
>
On 20.02.25 03:31, Tong Tiangen wrote: > > > 在 2025/2/20 0:12, David Hildenbrand 写道: >> On 19.02.25 16:22, Oleg Nesterov wrote: >>> On 02/18, Tong Tiangen wrote: >>>> >>>> OK, Before your rewrite last merged, How about i change the solution to >>>> just reject them immediately after get_user_page_vma_remote()? >>> >>> I agree, uprobe_write_opcode() should simply fail if >>> is_zero_page(old_page). >> >> Yes. That's currently only syzkaller that triggers it, not some sane use >> case. > > OK, change as follows: > > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -506,6 +506,12 @@ int uprobe_write_opcode(struct arch_uprobe > *auprobe, struct mm_struct *mm, > if (ret <= 0) > goto put_old; > > + if (WARN(is_zero_page(old_page), This can likely be triggered by user space, so do not use WARN. -- Cheers, David / dhildenb
在 2025/2/20 16:38, David Hildenbrand 写道: > On 20.02.25 03:31, Tong Tiangen wrote: >> >> >> 在 2025/2/20 0:12, David Hildenbrand 写道: >>> On 19.02.25 16:22, Oleg Nesterov wrote: >>>> On 02/18, Tong Tiangen wrote: >>>>> >>>>> OK, Before your rewrite last merged, How about i change the >>>>> solution to >>>>> just reject them immediately after get_user_page_vma_remote()? >>>> >>>> I agree, uprobe_write_opcode() should simply fail if >>>> is_zero_page(old_page). >>> >>> Yes. That's currently only syzkaller that triggers it, not some sane use >>> case. >> >> OK, change as follows: >> >> --- a/kernel/events/uprobes.c >> +++ b/kernel/events/uprobes.c >> @@ -506,6 +506,12 @@ int uprobe_write_opcode(struct arch_uprobe >> *auprobe, struct mm_struct *mm, >> if (ret <= 0) >> goto put_old; >> >> + if (WARN(is_zero_page(old_page), > > This can likely be triggered by user space, so do not use WARN. OK,thanks. Hi Oleg, is that all right? Thanks, Tong. >
On 02/20, Tong Tiangen wrote: > > 在 2025/2/20 16:38, David Hildenbrand 写道: > >On 20.02.25 03:31, Tong Tiangen wrote: > >> > >>@@ -506,6 +506,12 @@ int uprobe_write_opcode(struct arch_uprobe > >>*auprobe, struct mm_struct *mm, > >> if (ret <= 0) > >> goto put_old; > >> > >>+ if (WARN(is_zero_page(old_page), > > > >This can likely be triggered by user space, so do not use WARN. > > OK,thanks. > > Hi Oleg, is that all right? Thanks, LGTM. Oleg.
Can't comment, my understanding of mm/ is not enough these days.
Just one question...
On 02/17, Tong Tiangen wrote:
>
> Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters")
> Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
Are you sure this logic was wrong from the very beginning? Just curious.
Oleg.
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> ---
> kernel/events/uprobes.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 46ddf3a2334d..ff5694acfa68 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> dec_mm_counter(mm, MM_ANONPAGES);
>
> if (!folio_test_anon(old_folio)) {
> - dec_mm_counter(mm, mm_counter_file(old_folio));
> + if (!is_zero_folio(old_folio))
> + dec_mm_counter(mm, mm_counter_file(old_folio));
> inc_mm_counter(mm, MM_ANONPAGES);
> }
>
> @@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> if (!folio_mapped(old_folio))
> folio_free_swap(old_folio);
> page_vma_mapped_walk_done(&pvmw);
> - folio_put(old_folio);
> + if (!is_zero_folio(old_folio))
> + folio_put(old_folio);
>
> err = 0;
> unlock:
> --
> 2.25.1
>
在 2025/2/18 0:12, Oleg Nesterov 写道:
> Can't comment, my understanding of mm/ is not enough these days.
>
> Just one question...
>
> On 02/17, Tong Tiangen wrote:
>>
>> Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters")
>> Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
>
> Are you sure this logic was wrong from the very beginning? Just curious.
>
> Oleg.
Yes, i checked the original code logic, and the put_page() didn't take
zero pages into account.
Add Morton,Peter and David for discussion.
Thanks,
Tong.
>
>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
>> ---
>> kernel/events/uprobes.c | 6 ++++--
>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
>> index 46ddf3a2334d..ff5694acfa68 100644
>> --- a/kernel/events/uprobes.c
>> +++ b/kernel/events/uprobes.c
>> @@ -213,7 +213,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>> dec_mm_counter(mm, MM_ANONPAGES);
>>
>> if (!folio_test_anon(old_folio)) {
>> - dec_mm_counter(mm, mm_counter_file(old_folio));
>> + if (!is_zero_folio(old_folio))
>> + dec_mm_counter(mm, mm_counter_file(old_folio));
>> inc_mm_counter(mm, MM_ANONPAGES);
>> }
>>
>> @@ -227,7 +228,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>> if (!folio_mapped(old_folio))
>> folio_free_swap(old_folio);
>> page_vma_mapped_walk_done(&pvmw);
>> - folio_put(old_folio);
>> + if (!is_zero_folio(old_folio))
>> + folio_put(old_folio);
>>
>> err = 0;
>> unlock:
>> --
>> 2.25.1
>>
>
>
> .
© 2016 - 2025 Red Hat, Inc.