include/linux/uprobes.h | 1 - kernel/events/uprobes.c | 39 ++++++++++++++++++++------------------- kernel/fork.c | 1 - 3 files changed, 20 insertions(+), 21 deletions(-)
The following KASAN splat was shown:
[ 44.505448] ================================================================== 20:37:27 [3421/145075]
[ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8
[ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384
[ 44.505479]
[ 44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496
[ 44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
[ 44.505508] Call Trace:
[ 44.505511] [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108
[ 44.505521] [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0
[ 44.505529] [<000b0324d2f5464c>] print_report+0x44/0x138
[ 44.505536] [<000b0324d1383192>] kasan_report+0xc2/0x140
[ 44.505543] [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8
[ 44.505550] [<000b0324d12c7978>] remove_vma+0x78/0x120
[ 44.505557] [<000b0324d128a2c6>] exit_mmap+0x326/0x750
[ 44.505563] [<000b0324d0ba655a>] __mmput+0x9a/0x370
[ 44.505570] [<000b0324d0bbfbe0>] exit_mm+0x240/0x340
[ 44.505575] [<000b0324d0bc0228>] do_exit+0x548/0xd70
[ 44.505580] [<000b0324d0bc1102>] do_group_exit+0x132/0x390
[ 44.505586] [<000b0324d0bc13b6>] __s390x_sys_exit_group+0x56/0x60
[ 44.505592] [<000b0324d0adcbd6>] do_syscall+0x2f6/0x430
[ 44.505599] [<000b0324d2f78434>] __do_syscall+0xa4/0x170
[ 44.505606] [<000b0324d2f9454c>] system_call+0x74/0x98
[ 44.505614]
[ 44.505616] Allocated by task 1384:
[ 44.505621] kasan_save_stack+0x40/0x70
[ 44.505630] kasan_save_track+0x28/0x40
[ 44.505636] __kasan_kmalloc+0xa0/0xc0
[ 44.505642] __create_xol_area+0xfa/0x410
[ 44.505648] get_xol_area+0xb0/0xf0
[ 44.505652] uprobe_notify_resume+0x27a/0x470
[ 44.505657] irqentry_exit_to_user_mode+0x15e/0x1d0
[ 44.505664] pgm_check_handler+0x122/0x170
[ 44.505670]
[ 44.505672] Freed by task 1384:
[ 44.505676] kasan_save_stack+0x40/0x70
[ 44.505682] kasan_save_track+0x28/0x40
[ 44.505687] kasan_save_free_info+0x4a/0x70
[ 44.505693] __kasan_slab_free+0x5a/0x70
[ 44.505698] kfree+0xe8/0x3f0
[ 44.505704] __mmput+0x20/0x370
[ 44.505709] exit_mm+0x240/0x340
[ 44.505713] do_exit+0x548/0xd70
[ 44.505718] do_group_exit+0x132/0x390
[ 44.505722] __s390x_sys_exit_group+0x56/0x60
[ 44.505727] do_syscall+0x2f6/0x430
[ 44.505732] __do_syscall+0xa4/0x170
[ 44.505738] system_call+0x74/0x98
The problem is that uprobe_clear_state() kfree's struct xol_area, which
contains struct vm_special_mapping *xol_mapping. This one is passed to
_install_special_mapping() in xol_add_vma().
__mmput() reads:
static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
...
}
So uprobe_clear_state(), called at the beginning, frees the memory area
containing the vm_special_mapping data, but exit_mmap() uses this
address later via vma->vm_private_data (which was set in
_install_special_mapping()).
Fix this by moving uprobe_clear_state() to uprobes.c and use it as
close() callback.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
---
include/linux/uprobes.h | 1 -
kernel/events/uprobes.c | 39 ++++++++++++++++++++-------------------
kernel/fork.c | 1 -
3 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f50df1fa93e7..2ae96c98d287 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -128,7 +128,6 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
extern void uprobe_notify_resume(struct pt_regs *regs);
extern bool uprobe_deny_signal(void);
extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
-extern void uprobe_clear_state(struct mm_struct *mm);
extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 30348f13d4a7..ab19a43a4dfc 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1463,6 +1463,25 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
return &insn;
}
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
+{
+ struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
+
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(NULL, vma->vm_mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ if (!area)
+ return;
+
+ put_page(area->pages[0]);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
@@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
area->xol_mapping.name = "[uprobes]";
area->xol_mapping.fault = NULL;
+ area->xol_mapping.close = uprobe_clear_state;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
@@ -1526,25 +1546,6 @@ static struct xol_area *get_xol_area(void)
return area;
}
-/*
- * uprobe_clear_state - Free the area allocated for slots.
- */
-void uprobe_clear_state(struct mm_struct *mm)
-{
- struct xol_area *area = mm->uprobes_state.xol_area;
-
- mutex_lock(&delayed_uprobe_lock);
- delayed_uprobe_remove(NULL, mm);
- mutex_unlock(&delayed_uprobe_lock);
-
- if (!area)
- return;
-
- put_page(area->pages[0]);
- kfree(area->bitmap);
- kfree(area);
-}
-
void uprobe_start_dup_mmap(void)
{
percpu_down_read(&dup_mmap_sem);
diff --git a/kernel/fork.c b/kernel/fork.c
index df8e4575ff01..ad0e16cf528b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1340,7 +1340,6 @@ static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
- uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
--
2.43.0
On 09/03, Sven Schnelle wrote:
>
> +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
> +{
> + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
> +
> + mutex_lock(&delayed_uprobe_lock);
> + delayed_uprobe_remove(NULL, vma->vm_mm);
> + mutex_unlock(&delayed_uprobe_lock);
> +
> + if (!area)
> + return;
> +
> + put_page(area->pages[0]);
> + kfree(area->bitmap);
> + kfree(area);
> +}
> +
> static struct xol_area *__create_xol_area(unsigned long vaddr)
> {
> struct mm_struct *mm = current->mm;
> @@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>
> area->xol_mapping.name = "[uprobes]";
> area->xol_mapping.fault = NULL;
> + area->xol_mapping.close = uprobe_clear_state;
Ah, no, we can't do this :/
A malicious application can munmap() its "[uprobes]" vma and free
area/pages/bitmap. If this application hits the uprobe breakpoint after
that it will use the freed memory.
And no, "mm->uprobes_state.xol_area = NULL" in uprobe_clear_state() won't
help. Say, another thread can sleep on area.wq when munmap() is called.
Sorry, I should have realized that immediately, but I didn't :/
Andrew, this is uprobes-use-vm_special_mapping-close-functionality.patch
in mm-stable
Oleg.
I guess VM_SEALED could help, but it depends on CONFIG_64BIT
On 09/11, Oleg Nesterov wrote:
>
> On 09/03, Sven Schnelle wrote:
> >
> > +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
> > +{
> > + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
> > +
> > + mutex_lock(&delayed_uprobe_lock);
> > + delayed_uprobe_remove(NULL, vma->vm_mm);
> > + mutex_unlock(&delayed_uprobe_lock);
> > +
> > + if (!area)
> > + return;
> > +
> > + put_page(area->pages[0]);
> > + kfree(area->bitmap);
> > + kfree(area);
> > +}
> > +
> > static struct xol_area *__create_xol_area(unsigned long vaddr)
> > {
> > struct mm_struct *mm = current->mm;
> > @@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
> >
> > area->xol_mapping.name = "[uprobes]";
> > area->xol_mapping.fault = NULL;
> > + area->xol_mapping.close = uprobe_clear_state;
>
> Ah, no, we can't do this :/
>
> A malicious application can munmap() its "[uprobes]" vma and free
> area/pages/bitmap. If this application hits the uprobe breakpoint after
> that it will use the freed memory.
>
> And no, "mm->uprobes_state.xol_area = NULL" in uprobe_clear_state() won't
> help. Say, another thread can sleep on area.wq when munmap() is called.
>
> Sorry, I should have realized that immediately, but I didn't :/
>
> Andrew, this is uprobes-use-vm_special_mapping-close-functionality.patch
> in mm-stable
>
> Oleg.
Damn, sorry for the spam.
So I am going to send the patch from
https://lore.kernel.org/all/20240904095449.GA28781@redhat.com/
To me it looks like a good cleanup regardless, and unless I am totally
confused it should fix the problem with use-after-free.
Oleg.
On 09/11, Oleg Nesterov wrote:
>
> I guess VM_SEALED could help, but it depends on CONFIG_64BIT
>
>
> On 09/11, Oleg Nesterov wrote:
> >
> > On 09/03, Sven Schnelle wrote:
> > >
> > > +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
> > > +{
> > > + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
> > > +
> > > + mutex_lock(&delayed_uprobe_lock);
> > > + delayed_uprobe_remove(NULL, vma->vm_mm);
> > > + mutex_unlock(&delayed_uprobe_lock);
> > > +
> > > + if (!area)
> > > + return;
> > > +
> > > + put_page(area->pages[0]);
> > > + kfree(area->bitmap);
> > > + kfree(area);
> > > +}
> > > +
> > > static struct xol_area *__create_xol_area(unsigned long vaddr)
> > > {
> > > struct mm_struct *mm = current->mm;
> > > @@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
> > >
> > > area->xol_mapping.name = "[uprobes]";
> > > area->xol_mapping.fault = NULL;
> > > + area->xol_mapping.close = uprobe_clear_state;
> >
> > Ah, no, we can't do this :/
> >
> > A malicious application can munmap() its "[uprobes]" vma and free
> > area/pages/bitmap. If this application hits the uprobe breakpoint after
> > that it will use the freed memory.
> >
> > And no, "mm->uprobes_state.xol_area = NULL" in uprobe_clear_state() won't
> > help. Say, another thread can sleep on area.wq when munmap() is called.
> >
> > Sorry, I should have realized that immediately, but I didn't :/
> >
> > Andrew, this is uprobes-use-vm_special_mapping-close-functionality.patch
> > in mm-stable
> >
> > Oleg.
On 09/03, Sven Schnelle wrote:
>
> [ 44.505448] ================================================================== 20:37:27 [3421/145075]
> [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8
> [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384
> [ 44.505479]
> [ 44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496
> [ 44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
> [ 44.505508] Call Trace:
> [ 44.505511] [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108
> [ 44.505521] [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0
> [ 44.505529] [<000b0324d2f5464c>] print_report+0x44/0x138
> [ 44.505536] [<000b0324d1383192>] kasan_report+0xc2/0x140
> [ 44.505543] [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8
^^^^^^^^^^^^^^^^^^^^^
Caused by
[PATCH v2 1/4] mm: Add optional close() to struct vm_special_mapping
https://lore.kernel.org/all/20240812082605.743814-1-mpe@ellerman.id.au/
?
> +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
> +{
> + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
> +
> + mutex_lock(&delayed_uprobe_lock);
> + delayed_uprobe_remove(NULL, vma->vm_mm);
> + mutex_unlock(&delayed_uprobe_lock);
> +
> + if (!area)
> + return;
> +
> + put_page(area->pages[0]);
> + kfree(area->bitmap);
> + kfree(area);
> +}
> +
> static struct xol_area *__create_xol_area(unsigned long vaddr)
> {
> struct mm_struct *mm = current->mm;
> @@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>
> area->xol_mapping.name = "[uprobes]";
> area->xol_mapping.fault = NULL;
> + area->xol_mapping.close = uprobe_clear_state;
LGTM.
but with or without this fix __create_xol_area() also needs
area->xol_mapping.mremap = NULL;
?
And in the longer term I think we should probably add a single instance
of "struct vm_special_mapping uprobe_special_mapping" with ->fault != NULL,
but this is another issue.
Oleg.
On Tue, 3 Sept 2024 at 02:09, Oleg Nesterov <oleg@redhat.com> wrote:
>
> but with or without this fix __create_xol_area() also needs
>
> area->xol_mapping.mremap = NULL;
I think the whole thing needs to be zeroed out.
It was always horribly buggy. The close thing just made it more
*obviously* buggy, because closing a vma is a lot more common than
mremap'ing it.
Either use kzalloc(), or do a proper initializer something like this:
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
+ area->xol_mapping = (struct vm_special_mapping) {
+ .name = "[uprobes]",
+ .pages = area->pages,
+ .close = uprobe_clear_state,
+ };
which should initialize the struct vm_special_mapping fully.
Linus
I didn't have time to write a full reply yesterday.
On 09/03, Linus Torvalds wrote:
>
> On Tue, 3 Sept 2024 at 02:09, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > but with or without this fix __create_xol_area() also needs
> >
> > area->xol_mapping.mremap = NULL;
>
> I think the whole thing needs to be zeroed out.
Again, this is what Sven did and I agree with this fix for now.
> It was always horribly buggy. The close thing just made it more
> *obviously* buggy, because closing a vma is a lot more common than
> mremap'ing it.
Well, this code is very old, I don't think it was "always" buggy.
But at least today it is horribly ugly, and
> Either use kzalloc(), or do a proper initializer something like this:
>
> - area->xol_mapping.name = "[uprobes]";
> - area->xol_mapping.fault = NULL;
> - area->xol_mapping.pages = area->pages;
> + area->xol_mapping = (struct vm_special_mapping) {
> + .name = "[uprobes]",
> + .pages = area->pages,
> + .close = uprobe_clear_state,
> + };
either way the code is still ugly, imo.
How about the (untested) patch below?
I am not going to send this patch right now, it conflicts with the ongoing
xol_mapping.close changes, but what do you think?
We do not need to add the instance of vm_special_mapping into mm_struct,
a single instance (xol_mapping below) with .fault = xol_fault() is enough.
And, with this change xol_area no longer needs "struct page *pages[2]", it
can be turned into "struct page *page".
Oleg.
---
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -101,7 +101,6 @@ struct xol_area {
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
@@ -1453,6 +1452,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->pages[0];
+ get_page(vmf->page);
+ return 0;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1479,7 +1493,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1518,9 +1532,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
goto free_bitmap;
I didn't have time to write a full reply yesterday.
On 09/03, Linus Torvalds wrote:
>
> On Tue, 3 Sept 2024 at 02:09, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > but with or without this fix __create_xol_area() also needs
> >
> > area->xol_mapping.mremap = NULL;
>
> I think the whole thing needs to be zeroed out.
Again, this is what Sven did and I agree with this fix for now.
> It was always horribly buggy. The close thing just made it more
> *obviously* buggy, because closing a vma is a lot more common than
> mremap'ing it.
Well, this code is very old, I don't think it was "always" buggy.
But at least today it is horribly ugly, and
> Either use kzalloc(), or do a proper initializer something like this:
>
> - area->xol_mapping.name = "[uprobes]";
> - area->xol_mapping.fault = NULL;
> - area->xol_mapping.pages = area->pages;
> + area->xol_mapping = (struct vm_special_mapping) {
> + .name = "[uprobes]",
> + .pages = area->pages,
> + .close = uprobe_clear_state,
> + };
either way the code is still ugly, imo.
How about the (untested) patch below?
I am not going to send this patch right now, it conflicts with the ongoing
close/mremap changes, but what do you think?
We do not need to add the instance of vm_special_mapping into mm_struct,
a single instance (xol_mapping below) with .fault = xol_fault() is enough.
And, with this change xol_area no longer needs "struct page *pages[2]", it
can be turned into "struct page *page".
Oleg.
---
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -101,7 +101,6 @@ struct xol_area {
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
@@ -1453,6 +1452,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->pages[0];
+ get_page(vmf->page);
+ return 0;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1479,7 +1493,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1518,9 +1532,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
goto free_bitmap;
On 09/03, Linus Torvalds wrote: > > On Tue, 3 Sept 2024 at 02:09, Oleg Nesterov <oleg@redhat.com> wrote: > > > > but with or without this fix __create_xol_area() also needs > > > > area->xol_mapping.mremap = NULL; > > I think the whole thing needs to be zeroed out. Agreed, > Either use kzalloc(), This is what Sven did in [PATCH] uprobes: use kzalloc to allocate xol area https://lore.kernel.org/all/20240903102313.3402529-1-svens@linux.ibm.com/ Oleg.
Hi Linus, Linus Torvalds <torvalds@linux-foundation.org> writes: > On Tue, 3 Sept 2024 at 02:09, Oleg Nesterov <oleg@redhat.com> wrote: >> >> but with or without this fix __create_xol_area() also needs >> >> area->xol_mapping.mremap = NULL; > > I think the whole thing needs to be zeroed out. > > It was always horribly buggy. The close thing just made it more > *obviously* buggy, because closing a vma is a lot more common than > mremap'ing it. > > Either use kzalloc(), or do a proper initializer something like this: I sent a patch which does this today: https://lore.kernel.org/all/20240903102313.3402529-1-svens@linux.ibm.com/
On Tue, 3 Sept 2024 at 12:31, Sven Schnelle <svens@linux.ibm.com> wrote:
>
> > Either use kzalloc(), or do a proper initializer something like this:
>
> I sent a patch which does this today:
Ack, looks good.
This should probably be backported independently of the new close()
series, since the mremap issue seems to be a longstanding issue.
Linus
Oleg Nesterov <oleg@redhat.com> writes:
> On 09/03, Sven Schnelle wrote:
>>
>> [ 44.505448] ================================================================== 20:37:27 [3421/145075]
>> [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8
>> [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384
>> [ 44.505479]
>> [ 44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496
>> [ 44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
>> [ 44.505508] Call Trace:
>> [ 44.505511] [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108
>> [ 44.505521] [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0
>> [ 44.505529] [<000b0324d2f5464c>] print_report+0x44/0x138
>> [ 44.505536] [<000b0324d1383192>] kasan_report+0xc2/0x140
>> [ 44.505543] [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8
> ^^^^^^^^^^^^^^^^^^^^^
> Caused by
>
> [PATCH v2 1/4] mm: Add optional close() to struct vm_special_mapping
> https://lore.kernel.org/all/20240812082605.743814-1-mpe@ellerman.id.au/
>
> ?
>
>> +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
>> +{
>> + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
>> +
>> + mutex_lock(&delayed_uprobe_lock);
>> + delayed_uprobe_remove(NULL, vma->vm_mm);
>> + mutex_unlock(&delayed_uprobe_lock);
>> +
>> + if (!area)
>> + return;
>> +
>> + put_page(area->pages[0]);
>> + kfree(area->bitmap);
>> + kfree(area);
>> +}
>> +
>> static struct xol_area *__create_xol_area(unsigned long vaddr)
>> {
>> struct mm_struct *mm = current->mm;
>> @@ -1481,6 +1500,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>>
>> area->xol_mapping.name = "[uprobes]";
>> area->xol_mapping.fault = NULL;
>> + area->xol_mapping.close = uprobe_clear_state;
>
> LGTM.
>
> but with or without this fix __create_xol_area() also needs
>
> area->xol_mapping.mremap = NULL;
>
> ?
I think the code should just use kzalloc() instead of kmalloc() so all
members are cleared. I noticed that when looking into the issue because
the new close member of xol_mapping wasn't initialized. My plan was to
send that change as a separate patch - because it's not related to this
issue.
Hi Michael, Sven Schnelle <svens@linux.ibm.com> writes: > The following KASAN splat was shown: > > [ 44.505448] ================================================================== 20:37:27 [3421/145075] > [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8 > [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384 > [ 44.505479] > [ 44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496 > [ 44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0) > [ 44.505508] Call Trace: > [ 44.505511] [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108 > [ 44.505521] [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0 > [ 44.505529] [<000b0324d2f5464c>] print_report+0x44/0x138 > [ 44.505536] [<000b0324d1383192>] kasan_report+0xc2/0x140 > [ 44.505543] [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8 > [ 44.505550] [<000b0324d12c7978>] remove_vma+0x78/0x120 > [ 44.505557] [<000b0324d128a2c6>] exit_mmap+0x326/0x750 > [ 44.505563] [<000b0324d0ba655a>] __mmput+0x9a/0x370 > [ 44.505570] [<000b0324d0bbfbe0>] exit_mm+0x240/0x340 > [ 44.505575] [<000b0324d0bc0228>] do_exit+0x548/0xd70 > [ 44.505580] [<000b0324d0bc1102>] do_group_exit+0x132/0x390 > [ 44.505586] [<000b0324d0bc13b6>] __s390x_sys_exit_group+0x56/0x60 > [ 44.505592] [<000b0324d0adcbd6>] do_syscall+0x2f6/0x430 > [ 44.505599] [<000b0324d2f78434>] __do_syscall+0xa4/0x170 > [ 44.505606] [<000b0324d2f9454c>] system_call+0x74/0x98 > [ 44.505614] > [ 44.505616] Allocated by task 1384: > [ 44.505621] kasan_save_stack+0x40/0x70 > [ 44.505630] kasan_save_track+0x28/0x40 > [ 44.505636] __kasan_kmalloc+0xa0/0xc0 > [ 44.505642] __create_xol_area+0xfa/0x410 > [ 44.505648] get_xol_area+0xb0/0xf0 > [ 44.505652] uprobe_notify_resume+0x27a/0x470 > [ 44.505657] irqentry_exit_to_user_mode+0x15e/0x1d0 > [ 44.505664] pgm_check_handler+0x122/0x170 > [ 44.505670] > [ 44.505672] Freed by task 1384: > [ 44.505676] kasan_save_stack+0x40/0x70 > [ 44.505682] 
kasan_save_track+0x28/0x40 > [ 44.505687] kasan_save_free_info+0x4a/0x70 > [ 44.505693] __kasan_slab_free+0x5a/0x70 > [ 44.505698] kfree+0xe8/0x3f0 > [ 44.505704] __mmput+0x20/0x370 > [ 44.505709] exit_mm+0x240/0x340 > [ 44.505713] do_exit+0x548/0xd70 > [ 44.505718] do_group_exit+0x132/0x390 > [ 44.505722] __s390x_sys_exit_group+0x56/0x60 > [ 44.505727] do_syscall+0x2f6/0x430 > [ 44.505732] __do_syscall+0xa4/0x170 > [ 44.505738] system_call+0x74/0x98 > [..] As this has a dependency on your special mapping close series, do you want to carry that with your patches?
Sven Schnelle <svens@linux.ibm.com> writes: > Hi Michael, Hi Sven, > Sven Schnelle <svens@linux.ibm.com> writes: > >> The following KASAN splat was shown: >> >> [ 44.505448] ================================================================== 20:37:27 [3421/145075] >> [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8 >> [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384 ... >> [..] > > As this has a dependency on your special mapping close series, do you > want to carry that with your patches? Andrew has my series in mm-stable, so I think this should go into mm as well. I assume he will pick it up. cheers
On Wed, 04 Sep 2024 13:57:13 +1000 Michael Ellerman <mpe@ellerman.id.au> wrote:
> Sven Schnelle <svens@linux.ibm.com> writes:
> > Hi Michael,
>
> Hi Sven,
>
> > Sven Schnelle <svens@linux.ibm.com> writes:
> >
> >> The following KASAN splat was shown:
> >>
> >> [ 44.505448] ================================================================== 20:37:27 [3421/145075]
> >> [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8
> >> [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384
> ...
> >> [..]
> >
> > As this has a dependency on your special mapping close series, do you
> > want to carry that with your patches?
>
> Andrew has my series in mm-stable, so I think this should go into mm as
> well. I assume he will pick it up.
yup, thanks. Added, with
Fixes: 223febc6e557 ("mm: add optional close() to struct vm_special_mapping")
It appears that peterz is scooping up Sven's "uprobes: use kzalloc to
allocate xol area".
This reverts commit 08e28de1160a712724268fd33d77b32f1bc84d1c.
A malicious application can munmap() its "[uprobes]" vma and in this case
xol_mapping.close == uprobe_clear_state() will free the memory which can
be used by another thread, or the same thread when it hits the uprobe bp
afterwards.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/uprobes.h | 1 +
kernel/events/uprobes.c | 36 +++++++++++++++++++-----------------
kernel/fork.c | 1 +
3 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 493dc95d912c..b503fafb7fb3 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -126,6 +126,7 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
extern void uprobe_notify_resume(struct pt_regs *regs);
extern bool uprobe_deny_signal(void);
extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
+extern void uprobe_clear_state(struct mm_struct *mm);
extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c2d6b2d64de2..73cc47708679 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1482,22 +1482,6 @@ void * __weak arch_uprobe_trampoline(unsigned long *psize)
return &insn;
}
-/*
- * uprobe_clear_state - Free the area allocated for slots.
- */
-static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
-{
- struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping);
-
- mutex_lock(&delayed_uprobe_lock);
- delayed_uprobe_remove(NULL, vma->vm_mm);
- mutex_unlock(&delayed_uprobe_lock);
-
- put_page(area->pages[0]);
- kfree(area->bitmap);
- kfree(area);
-}
-
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
@@ -1516,7 +1500,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
area->xol_mapping.name = "[uprobes]";
area->xol_mapping.fault = NULL;
- area->xol_mapping.close = uprobe_clear_state;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
@@ -1562,6 +1545,25 @@ static struct xol_area *get_xol_area(void)
return area;
}
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(NULL, mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ if (!area)
+ return;
+
+ put_page(area->pages[0]);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
void uprobe_start_dup_mmap(void)
{
percpu_down_read(&dup_mmap_sem);
diff --git a/kernel/fork.c b/kernel/fork.c
index 61070248a7d3..3d590e51ce84 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1338,6 +1338,7 @@ static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
+ uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
--
2.25.1.362.g51ebf55
Currently each xol_area has its own instance of vm_special_mapping, this
is suboptimal and ugly. Kill xol_area->xol_mapping and add a single global
instance of vm_special_mapping, the ->fault() method can use area->pages
rather than xol_mapping->pages.
As a side effect this fixes the problem introduced by the recent commit
223febc6e557 ("mm: add optional close() to struct vm_special_mapping"), if
special_mapping_close() is called from the __mmput() paths, it will use
vma->vm_private_data = &area->xol_mapping freed by uprobe_clear_state().
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Closes: https://lore.kernel.org/all/yt9dy149vprr.fsf@linux.ibm.com/
Fixes: 223febc6e557 ("mm: add optional close() to struct vm_special_mapping")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/events/uprobes.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 73cc47708679..a478e028043f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -99,7 +99,6 @@ struct xol_area {
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
@@ -1433,6 +1432,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->pages[0];
+ get_page(vmf->page);
+ return 0;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1459,7 +1473,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1498,9 +1512,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
goto free_bitmap;
--
2.25.1.362.g51ebf55
Now that xol_mapping has its own ->fault() method we no longer need
xol_area->pages[1] == NULL, we need a single page.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/events/uprobes.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a478e028043f..94a17435cfe6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -99,7 +99,7 @@ struct xol_area {
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct page *pages[2];
+ struct page *page;
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
@@ -1437,7 +1437,7 @@ static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
{
struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
- vmf->page = area->pages[0];
+ vmf->page = area->page;
get_page(vmf->page);
return 0;
}
@@ -1512,10 +1512,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->pages[0] = alloc_page(GFP_HIGHUSER);
- if (!area->pages[0])
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
goto free_bitmap;
- area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
@@ -1523,12 +1522,12 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
insns = arch_uprobe_trampoline(&insns_size);
- arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
+ arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->pages[0]);
+ __free_page(area->page);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1570,7 +1569,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->pages[0]);
+ put_page(area->page);
kfree(area->bitmap);
kfree(area);
}
@@ -1637,7 +1636,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
--
2.25.1.362.g51ebf55
© 2016 - 2025 Red Hat, Inc.