mm/vma.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-)
vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs
have already been torn down halfway (in a way we can't undo) but are
still present in the maple tree.
At this point, we *must* remove the VMAs from the VMA tree, otherwise
we get UAF.
Because removing VMA tree nodes can require memory allocation, the
existing code has an error path which tries to handle this by
reattaching the VMAs; but that can't be done safely.
A nicer way to fix it would probably be to preallocate enough maple
tree nodes for the removal before the point of no return, or something
like that; but for now, fix it the easy and kinda ugly way, by marking
this allocation __GFP_NOFAIL.
Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure")
Signed-off-by: Jann Horn <jannh@google.com>
---
This can be tested with the following reproducer (on a kernel built with
CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y,
with the reproducer running as root):
```
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
static void write_file(char *name, char *buf) {
int fd = open(name, O_WRONLY);
if (fd == -1)
err(1, "unable to open for writing: %s", name);
if (SYSCHK(write(fd, buf, strlen(buf))) != strlen(buf))
errx(1, "write %s", name);
SYSCHK(close(fd));
}
int main(void) {
// make a large area with a bunch of VMAs
char *area = SYSCHK(mmap(NULL, AREA_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
for (int off=0; off<AREA_SIZE; off += 0x2000)
SYSCHK(mmap(area+off, 0x1000, PROT_READ, MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
// open a file whose mappings use usbdev_vm_ops, and map it in part of this area
int map_fd = SYSCHK(open("/dev/bus/usb/001/001", O_RDONLY));
void *map = SYSCHK(mmap(area, 0x1000, PROT_READ, MAP_SHARED|MAP_FIXED, map_fd, 0));
close(map_fd);
// make RWX mapping request fail late
SYSCHK(prctl(65/*PR_SET_MDWE*/, 1/*PR_MDWE_REFUSE_EXEC_GAIN*/, 0, 0, 0));
// some random other file
int fd = SYSCHK(open("/etc/passwd", O_RDONLY));
/* disarm for now */
write_file("/sys/kernel/debug/failslab/probability", "0");
/* fail allocations of maple_node... */
write_file("/sys/kernel/debug/failslab/cache-filter", "Y");
write_file("/sys/kernel/slab/maple_node/failslab", "1");
/* ... even though it's a sleepable allocation... */
write_file("/sys/kernel/debug/failslab/ignore-gfp-wait", "N");
/* ... in this task... */
write_file("/sys/kernel/debug/failslab/task-filter", "Y");
write_file("/proc/self/make-it-fail", "1");
/* ... every time... */
write_file("/sys/kernel/debug/failslab/times", "-1");
/* ... after 8 successful allocations (value chosen experimentally)... */
write_file("/sys/kernel/debug/failslab/space", "2048"); // one object is 256
/* ... and print where the allocations actually failed... */
write_file("/sys/kernel/debug/failslab/verbose", "2");
/* ... and that's it, arm it */
write_file("/sys/kernel/debug/failslab/probability", "100");
// This mmap request will fail late due to MDWE.
// The error recovery path will attempt to clear out the VMA pointers with a
// spanning_store (which will be blocked by error injection).
mmap(area, AREA_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd, 0);
/* disarm */
write_file("/sys/kernel/debug/failslab/probability", "0");
SYSCHK(munmap(map, 0x1000)); // UAF expected here
system("cat /proc/$PPID/maps");
}
```
Result:
```
FAULT_INJECTION: forcing a failure.
name failslab, interval 1, probability 100, space 256, times -1
CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518
[...]
Call Trace:
<TASK>
dump_stack_lvl+0x80/0xa0
should_fail_ex+0x4d3/0x5c0
[...]
should_failslab+0xc7/0x130
kmem_cache_alloc_noprof+0x73/0x3a0
[...]
mas_alloc_nodes+0x3a3/0x690
mas_nomem+0xaa/0x1d0
mas_store_gfp+0x515/0xa80
[...]
mmap_region+0xa96/0x2590
[...]
do_mmap+0x71e/0xfe0
[...]
vm_mmap_pgoff+0x17a/0x2f0
[...]
ksys_mmap_pgoff+0x2ee/0x460
do_syscall_64+0x68/0x140
entry_SYSCALL_64_after_hwframe+0x76/0x7e
[...]
</TASK>
mmap: unmap-fail: (607) Unable to abort munmap() operation
==================================================================
BUG: KASAN: slab-use-after-free in dec_usb_memory_use_count+0x365/0x430
Read of size 8 at addr ffff88810e9ba8b8 by task unmap-fail/607
CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518
[...]
Call Trace:
<TASK>
dump_stack_lvl+0x66/0xa0
print_report+0xce/0x670
[...]
kasan_report+0xf7/0x130
[...]
dec_usb_memory_use_count+0x365/0x430
remove_vma+0x76/0x120
vms_complete_munmap_vmas+0x447/0x750
do_vmi_align_munmap+0x4b9/0x700
[...]
do_vmi_munmap+0x164/0x2e0
__vm_munmap+0x128/0x2a0
[...]
__x64_sys_munmap+0x59/0x80
do_syscall_64+0x68/0x140
entry_SYSCALL_64_after_hwframe+0x76/0x7e
[...]
</TASK>
Allocated by task 607:
kasan_save_stack+0x33/0x60
kasan_save_track+0x14/0x30
__kasan_kmalloc+0xaa/0xb0
usbdev_mmap+0x1a0/0xaf0
mmap_region+0xf6e/0x2590
do_mmap+0x71e/0xfe0
vm_mmap_pgoff+0x17a/0x2f0
ksys_mmap_pgoff+0x2ee/0x460
do_syscall_64+0x68/0x140
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Freed by task 607:
kasan_save_stack+0x33/0x60
kasan_save_track+0x14/0x30
kasan_save_free_info+0x3b/0x60
__kasan_slab_free+0x4f/0x70
kfree+0x148/0x450
vms_clean_up_area+0x188/0x220
mmap_region+0xf1b/0x2590
do_mmap+0x71e/0xfe0
vm_mmap_pgoff+0x17a/0x2f0
ksys_mmap_pgoff+0x2ee/0x460
do_syscall_64+0x68/0x140
entry_SYSCALL_64_after_hwframe+0x76/0x7e
[...]
==================================================================
```
---
mm/vma.h | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/mm/vma.h b/mm/vma.h
index 819f994cf727..ebd78f1577f3 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -241,15 +241,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
* failure method of leaving a gap where the MAP_FIXED mapping failed.
*/
mas_set_range(mas, vms->start, vms->end - 1);
- if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
- pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
- current->comm, current->pid);
- /* Leaving vmas detached and in-tree may hamper recovery */
- reattach_vmas(mas_detach);
- } else {
- /* Clean up the insertion of the unfortunate gap */
- vms_complete_munmap_vmas(vms, mas_detach);
- }
+ mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+ /* Clean up the insertion of the unfortunate gap */
+ vms_complete_munmap_vmas(vms, mas_detach);
}
int
---
base-commit: eca631b8fe808748d7585059c4307005ca5c5820
change-id: 20241016-fix-munmap-abort-2330b61332aa
--
Jann Horn <jannh@google.com>
On Wed, Oct 16, 2024 at 05:07:53PM +0200, Jann Horn wrote: > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > have already been torn down halfway (in a way we can't undo) but are > still present in the maple tree. > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > we get UAF. > > Because removing VMA tree nodes can require memory allocation, the > existing code has an error path which tries to handle this by > reattaching the VMAs; but that can't be done safely. > > A nicer way to fix it would probably be to preallocate enough maple > tree nodes for the removal before the point of no return, or something > like that; but for now, fix it the easy and kinda ugly way, by marking > this allocation __GFP_NOFAIL. > > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > Signed-off-by: Jann Horn <jannh@google.com> I kind of question whether this is real-world achievable (yes I realise you included a repro, but one prodding /sys/kernel/debug bits :>) but to be honest at this point I think I feel a lot safer just clearing this here for sure. So: Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > This can be tested with the following reproducer (on a kernel built with > CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y, > with the reproducer running as root): > > ``` > > typeof(x) __res = (x); \ > if (__res == (typeof(x))-1) \ > err(1, "SYSCHK(" #x ")"); \ > __res; \ > }) > > static void write_file(char *name, char *buf) { > int fd = open(name, O_WRONLY); > if (fd == -1) > err(1, "unable to open for writing: %s", name); > if (SYSCHK(write(fd, buf, strlen(buf))) != strlen(buf)) > errx(1, "write %s", name); > SYSCHK(close(fd)); > } > > int main(void) { > // make a large area with a bunch of VMAs > char *area = SYSCHK(mmap(NULL, AREA_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > for (int off=0; off<AREA_SIZE; off += 0x2000) > SYSCHK(mmap(area+off, 0x1000, PROT_READ, MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > // open a file whose mappings use usbdev_vm_ops, and map it in part of this area > int map_fd = SYSCHK(open("/dev/bus/usb/001/001", O_RDONLY)); > void *map = SYSCHK(mmap(area, 0x1000, PROT_READ, MAP_SHARED|MAP_FIXED, map_fd, 0)); > close(map_fd); > > // make RWX mapping request fail late > SYSCHK(prctl(65/*PR_SET_MDWE*/, 1/*PR_MDWE_REFUSE_EXEC_GAIN*/, 0, 0, 0)); > > // some random other file > int fd = SYSCHK(open("/etc/passwd", O_RDONLY)); > > /* disarm for now */ > write_file("/sys/kernel/debug/failslab/probability", "0"); > > /* fail allocations of maple_node... */ > write_file("/sys/kernel/debug/failslab/cache-filter", "Y"); > write_file("/sys/kernel/slab/maple_node/failslab", "1"); > /* ... even though it's a sleepable allocation... */ > write_file("/sys/kernel/debug/failslab/ignore-gfp-wait", "N"); > /* ... in this task... */ > write_file("/sys/kernel/debug/failslab/task-filter", "Y"); > write_file("/proc/self/make-it-fail", "1"); > /* ... every time... */ > write_file("/sys/kernel/debug/failslab/times", "-1"); > /* ... after 8 successful allocations (value chosen experimentally)... */ > write_file("/sys/kernel/debug/failslab/space", "2048"); // one object is 256 > /* ... and print where the allocations actually failed... */ > write_file("/sys/kernel/debug/failslab/verbose", "2"); > /* ... and that's it, arm it */ > write_file("/sys/kernel/debug/failslab/probability", "100"); > > // This mmap request will fail late due to MDWE. > // The error recovery path will attempt to clear out the VMA pointers with a > // spanning_store (which will be blocked by error injection). > mmap(area, AREA_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd, 0); > > /* disarm */ > write_file("/sys/kernel/debug/failslab/probability", "0"); > > SYSCHK(munmap(map, 0x1000)); // UAF expected here > system("cat /proc/$PPID/maps"); > } > ``` > > Result: > ``` > FAULT_INJECTION: forcing a failure. > name failslab, interval 1, probability 100, space 256, times -1 > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > [...] > Call Trace: > <TASK> > dump_stack_lvl+0x80/0xa0 > should_fail_ex+0x4d3/0x5c0 > [...] > should_failslab+0xc7/0x130 > kmem_cache_alloc_noprof+0x73/0x3a0 > [...] > mas_alloc_nodes+0x3a3/0x690 > mas_nomem+0xaa/0x1d0 > mas_store_gfp+0x515/0xa80 > [...] > mmap_region+0xa96/0x2590 > [...] > do_mmap+0x71e/0xfe0 > [...] > vm_mmap_pgoff+0x17a/0x2f0 > [...] > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > </TASK> > mmap: unmap-fail: (607) Unable to abort munmap() operation > ================================================================== > BUG: KASAN: slab-use-after-free in dec_usb_memory_use_count+0x365/0x430 > Read of size 8 at addr ffff88810e9ba8b8 by task unmap-fail/607 > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > [...] > Call Trace: > <TASK> > dump_stack_lvl+0x66/0xa0 > print_report+0xce/0x670 > [...] > kasan_report+0xf7/0x130 > [...] > dec_usb_memory_use_count+0x365/0x430 > remove_vma+0x76/0x120 > vms_complete_munmap_vmas+0x447/0x750 > do_vmi_align_munmap+0x4b9/0x700 > [...] > do_vmi_munmap+0x164/0x2e0 > __vm_munmap+0x128/0x2a0 > [...] > __x64_sys_munmap+0x59/0x80 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > </TASK> > > Allocated by task 607: > kasan_save_stack+0x33/0x60 > kasan_save_track+0x14/0x30 > __kasan_kmalloc+0xaa/0xb0 > usbdev_mmap+0x1a0/0xaf0 > mmap_region+0xf6e/0x2590 > do_mmap+0x71e/0xfe0 > vm_mmap_pgoff+0x17a/0x2f0 > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > Freed by task 607: > kasan_save_stack+0x33/0x60 > kasan_save_track+0x14/0x30 > kasan_save_free_info+0x3b/0x60 > __kasan_slab_free+0x4f/0x70 > kfree+0x148/0x450 > vms_clean_up_area+0x188/0x220 > mmap_region+0xf1b/0x2590 > do_mmap+0x71e/0xfe0 > vm_mmap_pgoff+0x17a/0x2f0 > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > ================================================================== > ``` > --- > mm/vma.h | 12 +++--------- > 1 file changed, 3 insertions(+), 9 deletions(-) > > diff --git a/mm/vma.h b/mm/vma.h > index 819f994cf727..ebd78f1577f3 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -241,15 +241,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, > * failure method of leaving a gap where the MAP_FIXED mapping failed. > */ > mas_set_range(mas, vms->start, vms->end - 1); > - if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { > - pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", > - current->comm, current->pid); > - /* Leaving vmas detached and in-tree may hamper recovery */ > - reattach_vmas(mas_detach); > - } else { > - /* Clean up the insertion of the unfortunate gap */ > - vms_complete_munmap_vmas(vms, mas_detach); > - } > + mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); > + /* Clean up the insertion of the unfortunate gap */ > + vms_complete_munmap_vmas(vms, mas_detach); > } > > int > > --- > base-commit: eca631b8fe808748d7585059c4307005ca5c5820 > change-id: 20241016-fix-munmap-abort-2330b61332aa > -- > Jann Horn <jannh@google.com> >
On Thu, Oct 17, 2024 at 11:47 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > On Wed, Oct 16, 2024 at 05:07:53PM +0200, Jann Horn wrote: > > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > > have already been torn down halfway (in a way we can't undo) but are > > still present in the maple tree. > > > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > > we get UAF. > > > > Because removing VMA tree nodes can require memory allocation, the > > existing code has an error path which tries to handle this by > > reattaching the VMAs; but that can't be done safely. > > > > A nicer way to fix it would probably be to preallocate enough maple > > tree nodes for the removal before the point of no return, or something > > like that; but for now, fix it the easy and kinda ugly way, by marking > > this allocation __GFP_NOFAIL. > > > > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > > Signed-off-by: Jann Horn <jannh@google.com> > > I kind of question whether this is real-world achievable (yes I realise you > included a repro, but one prodding /sys/kernel/debug bits :>) but to be > honest at this point I think I feel a lot safer just clearing this here for > sure. So: I mean, there is a reason why we have __GFP_NOFAIL, and if you don't set it, my understanding is that you *can* end up failing allocations when the page allocator sees no other way to make progress... I think as a rough sketch, what you'd have to do to hit this issue without cheating using fault injection might be something like this, for simplicity assume all of this happens on the same CPU core: - make processes A, B, C, D; with A having threads A1 and A2 - let process A consume most of the available RAM+swap (so that process A will be killed first by the OOM killer) - let thread A2 enter some syscall that will allocate a lot of order-0 pages without fatal_signal_pending() checks, then block/preempt it somehow - let thread A1 enter an mmap() syscall, then block/preempt it somehow - let process B consume remaining available RAM, until B blocks and the OOM killer decides to reap process A. Note that the OOM killer starts by basically just setting a flag on the target process and sending it a fatal signal; only if the target process doesn't exit for some time after that (OOM_REAPER_DELAY = 2 seconds), the OOM killer starts actively reaping the target's memory - let process C allocate as many maple tree nodes as possible (to drain the slab cache's freelists), until C blocks on memory allocation - maybe let process D free one maple tree node or such, so that the first maple node allocation in mmap() for constructing the detached tree works? - let thread A2 continue - it will have access to ALLOC_OOM memory reserves, and AFAIU will be able to completely empty out the memory reserves, and will then hit a __GFP_KERNEL allocation failure - once A2 has hit an allocation failure, let thread A1 continue execution - it, too, should hit a __GFP_KERNEL allocation failure But I haven't actually tested that.
On 10/16/24 17:07, Jann Horn wrote: > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > have already been torn down halfway (in a way we can't undo) but are > still present in the maple tree. > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > we get UAF. > > Because removing VMA tree nodes can require memory allocation, the > existing code has an error path which tries to handle this by > reattaching the VMAs; but that can't be done safely. > > A nicer way to fix it would probably be to preallocate enough maple > tree nodes for the removal before the point of no return, or something > like that; but for now, fix it the easy and kinda ugly way, by marking > this allocation __GFP_NOFAIL. Yes that should be acceptable. > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > Signed-off-by: Jann Horn <jannh@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz>
* Jann Horn <jannh@google.com> [241016 11:08]: > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > have already been torn down halfway (in a way we can't undo) but are > still present in the maple tree. > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > we get UAF. Wait, the vma should be re-attached without PTEs and files closed in this case. I don't see how we are hitting the UAF in your example - we shouldn't unless there is something with the driver itself and the file close? My concern is that I am missing an error scenario. > > Because removing VMA tree nodes can require memory allocation, the > existing code has an error path which tries to handle this by > reattaching the VMAs; but that can't be done safely. > > A nicer way to fix it would probably be to preallocate enough maple > tree nodes for the removal before the point of no return, or something > like that; but for now, fix it the easy and kinda ugly way, by marking > this allocation __GFP_NOFAIL. I don't think that's any better than what happens now or what you have below. We have a way to do what you are saying, but it would slow down everyone for the sake of having enough memory around for the very rare error path, and it is certainly better to dip into the reserves at that point. Your patch is better than this alternative. But since this is under a lock that allows sleeping, shouldn't the shrinker kick in? Should we just use __GFP_NOFAIL for the first store instead of the error path? > > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > Signed-off-by: Jann Horn <jannh@google.com> > --- > This can be tested with the following reproducer (on a kernel built with > CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y, > with the reproducer running as root): > > ``` > > typeof(x) __res = (x); \ > if (__res == (typeof(x))-1) \ > err(1, "SYSCHK(" #x ")"); \ > __res; \ > }) > > static void write_file(char *name, char *buf) { > int fd = open(name, O_WRONLY); > if (fd == -1) > err(1, "unable to open for writing: %s", name); > if (SYSCHK(write(fd, buf, strlen(buf))) != strlen(buf)) > errx(1, "write %s", name); > SYSCHK(close(fd)); > } > > int main(void) { > // make a large area with a bunch of VMAs > char *area = SYSCHK(mmap(NULL, AREA_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > for (int off=0; off<AREA_SIZE; off += 0x2000) > SYSCHK(mmap(area+off, 0x1000, PROT_READ, MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > // open a file whose mappings use usbdev_vm_ops, and map it in part of this area > int map_fd = SYSCHK(open("/dev/bus/usb/001/001", O_RDONLY)); > void *map = SYSCHK(mmap(area, 0x1000, PROT_READ, MAP_SHARED|MAP_FIXED, map_fd, 0)); > close(map_fd); > > // make RWX mapping request fail late > SYSCHK(prctl(65/*PR_SET_MDWE*/, 1/*PR_MDWE_REFUSE_EXEC_GAIN*/, 0, 0, 0)); > > // some random other file > int fd = SYSCHK(open("/etc/passwd", O_RDONLY)); > > /* disarm for now */ > write_file("/sys/kernel/debug/failslab/probability", "0"); > > /* fail allocations of maple_node... */ > write_file("/sys/kernel/debug/failslab/cache-filter", "Y"); > write_file("/sys/kernel/slab/maple_node/failslab", "1"); > /* ... even though it's a sleepable allocation... */ > write_file("/sys/kernel/debug/failslab/ignore-gfp-wait", "N"); > /* ... in this task... */ > write_file("/sys/kernel/debug/failslab/task-filter", "Y"); > write_file("/proc/self/make-it-fail", "1"); > /* ... every time... */ > write_file("/sys/kernel/debug/failslab/times", "-1"); > /* ... after 8 successful allocations (value chosen experimentally)... */ > write_file("/sys/kernel/debug/failslab/space", "2048"); // one object is 256 > /* ... and print where the allocations actually failed... */ > write_file("/sys/kernel/debug/failslab/verbose", "2"); > /* ... and that's it, arm it */ > write_file("/sys/kernel/debug/failslab/probability", "100"); > > // This mmap request will fail late due to MDWE. > // The error recovery path will attempt to clear out the VMA pointers with a > // spanning_store (which will be blocked by error injection). > mmap(area, AREA_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd, 0); > > /* disarm */ > write_file("/sys/kernel/debug/failslab/probability", "0"); > > SYSCHK(munmap(map, 0x1000)); // UAF expected here > system("cat /proc/$PPID/maps"); > } > ``` > > Result: > ``` > FAULT_INJECTION: forcing a failure. > name failslab, interval 1, probability 100, space 256, times -1 > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > [...] > Call Trace: > <TASK> > dump_stack_lvl+0x80/0xa0 > should_fail_ex+0x4d3/0x5c0 > [...] > should_failslab+0xc7/0x130 > kmem_cache_alloc_noprof+0x73/0x3a0 > [...] > mas_alloc_nodes+0x3a3/0x690 > mas_nomem+0xaa/0x1d0 > mas_store_gfp+0x515/0xa80 > [...] > mmap_region+0xa96/0x2590 > [...] > do_mmap+0x71e/0xfe0 > [...] > vm_mmap_pgoff+0x17a/0x2f0 > [...] > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > </TASK> > mmap: unmap-fail: (607) Unable to abort munmap() operation > ================================================================== > BUG: KASAN: slab-use-after-free in dec_usb_memory_use_count+0x365/0x430 > Read of size 8 at addr ffff88810e9ba8b8 by task unmap-fail/607 What was this pointer? > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > [...] > Call Trace: > <TASK> > dump_stack_lvl+0x66/0xa0 > print_report+0xce/0x670 > [...] > kasan_report+0xf7/0x130 > [...] > dec_usb_memory_use_count+0x365/0x430 > remove_vma+0x76/0x120 > vms_complete_munmap_vmas+0x447/0x750 > do_vmi_align_munmap+0x4b9/0x700 > [...] > do_vmi_munmap+0x164/0x2e0 > __vm_munmap+0x128/0x2a0 > [...] > __x64_sys_munmap+0x59/0x80 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > </TASK> > > Allocated by task 607: > kasan_save_stack+0x33/0x60 > kasan_save_track+0x14/0x30 > __kasan_kmalloc+0xaa/0xb0 > usbdev_mmap+0x1a0/0xaf0 > mmap_region+0xf6e/0x2590 > do_mmap+0x71e/0xfe0 > vm_mmap_pgoff+0x17a/0x2f0 > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > Freed by task 607: > kasan_save_stack+0x33/0x60 > kasan_save_track+0x14/0x30 > kasan_save_free_info+0x3b/0x60 > __kasan_slab_free+0x4f/0x70 > kfree+0x148/0x450 > vms_clean_up_area+0x188/0x220 What line is this? > mmap_region+0xf1b/0x2590 > do_mmap+0x71e/0xfe0 > vm_mmap_pgoff+0x17a/0x2f0 > ksys_mmap_pgoff+0x2ee/0x460 > do_syscall_64+0x68/0x140 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > [...] > ================================================================== > ``` > --- > mm/vma.h | 12 +++--------- > 1 file changed, 3 insertions(+), 9 deletions(-) > > diff --git a/mm/vma.h b/mm/vma.h > index 819f994cf727..ebd78f1577f3 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -241,15 +241,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, > * failure method of leaving a gap where the MAP_FIXED mapping failed. > */ > mas_set_range(mas, vms->start, vms->end - 1); > - if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { > - pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", > - current->comm, current->pid); > - /* Leaving vmas detached and in-tree may hamper recovery */ > - reattach_vmas(mas_detach); > - } else { > - /* Clean up the insertion of the unfortunate gap */ > - vms_complete_munmap_vmas(vms, mas_detach); > - } > + mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); > + /* Clean up the insertion of the unfortunate gap */ > + vms_complete_munmap_vmas(vms, mas_detach); > } > > int > > --- > base-commit: eca631b8fe808748d7585059c4307005ca5c5820 > change-id: 20241016-fix-munmap-abort-2330b61332aa > -- > Jann Horn <jannh@google.com> >
On Wed, Oct 16, 2024 at 5:40 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > * Jann Horn <jannh@google.com> [241016 11:08]: > > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > > have already been torn down halfway (in a way we can't undo) but are > > still present in the maple tree. > > > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > > we get UAF. > > Wait, the vma should be re-attached without PTEs and files closed in > this case. I don't see how we are hitting the UAF in your example - we > shouldn't unless there is something with the driver itself and the file > close? > > My concern is that I am missing an error scenario. Where are the files supposed to be closed? vms_clean_up_area() unlinks the VMA from the file and calls ->close() but doesn't unlink the file, right? FWIW, I tested on commit eca631b8fe80 (current mainline head), I didn't check whether anything in the MM tree already addresses this. (I probably should have...) > But since this is under a lock that allows sleeping, shouldn't the > shrinker kick in? Yeah, I think without error injection, you'd basically only fail this allocation if the OOM killer has already decided to kill your task and the system is entirely out of memory. OOM kills IIRC have two effects on the page allocator: 1. they allow allocating with no watermarks (ALLOC_OOM) (based on the theory that you might need to allocate some memory in order to be able to exit and free more memory) 2. they allow giving up on GFP_KERNEL allocations (see the "/* Avoid allocations with no watermarks from looping endlessly */" part of __alloc_pages_slowpath()) > Should we just use __GFP_NOFAIL for the first store > instead of the error path? Which first store? Do you mean get rid of vms_abort_munmap_vmas() entirely somehow? > > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > > Signed-off-by: Jann Horn <jannh@google.com> > > --- > > This can be tested with the following reproducer (on a kernel built with > > CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y, > > with the reproducer running as root): > > > > ``` > > > > typeof(x) __res = (x); \ > > if (__res == (typeof(x))-1) \ > > err(1, "SYSCHK(" #x ")"); \ > > __res; \ > > }) > > > > static void write_file(char *name, char *buf) { > > int fd = open(name, O_WRONLY); > > if (fd == -1) > > err(1, "unable to open for writing: %s", name); > > if (SYSCHK(write(fd, buf, strlen(buf))) != strlen(buf)) > > errx(1, "write %s", name); > > SYSCHK(close(fd)); > > } > > > > int main(void) { > > // make a large area with a bunch of VMAs > > char *area = SYSCHK(mmap(NULL, AREA_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > for (int off=0; off<AREA_SIZE; off += 0x2000) > > SYSCHK(mmap(area+off, 0x1000, PROT_READ, MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > > > // open a file whose mappings use usbdev_vm_ops, and map it in part of this area > > int map_fd = SYSCHK(open("/dev/bus/usb/001/001", O_RDONLY)); > > void *map = SYSCHK(mmap(area, 0x1000, PROT_READ, MAP_SHARED|MAP_FIXED, map_fd, 0)); > > close(map_fd); > > > > // make RWX mapping request fail late > > SYSCHK(prctl(65/*PR_SET_MDWE*/, 1/*PR_MDWE_REFUSE_EXEC_GAIN*/, 0, 0, 0)); > > > > // some random other file > > int fd = SYSCHK(open("/etc/passwd", O_RDONLY)); > > > > /* disarm for now */ > > write_file("/sys/kernel/debug/failslab/probability", "0"); > > > > /* fail allocations of maple_node... */ > > write_file("/sys/kernel/debug/failslab/cache-filter", "Y"); > > write_file("/sys/kernel/slab/maple_node/failslab", "1"); > > /* ... even though it's a sleepable allocation... */ > > write_file("/sys/kernel/debug/failslab/ignore-gfp-wait", "N"); > > /* ... in this task... */ > > write_file("/sys/kernel/debug/failslab/task-filter", "Y"); > > write_file("/proc/self/make-it-fail", "1"); > > /* ... every time... */ > > write_file("/sys/kernel/debug/failslab/times", "-1"); > > /* ... after 8 successful allocations (value chosen experimentally)... */ > > write_file("/sys/kernel/debug/failslab/space", "2048"); // one object is 256 > > /* ... and print where the allocations actually failed... */ > > write_file("/sys/kernel/debug/failslab/verbose", "2"); > > /* ... and that's it, arm it */ > > write_file("/sys/kernel/debug/failslab/probability", "100"); > > > > // This mmap request will fail late due to MDWE. > > // The error recovery path will attempt to clear out the VMA pointers with a > > // spanning_store (which will be blocked by error injection). > > mmap(area, AREA_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd, 0); > > > > /* disarm */ > > write_file("/sys/kernel/debug/failslab/probability", "0"); > > > > SYSCHK(munmap(map, 0x1000)); // UAF expected here > > system("cat /proc/$PPID/maps"); > > } > > ``` > > > > Result: > > ``` > > FAULT_INJECTION: forcing a failure. > > name failslab, interval 1, probability 100, space 256, times -1 > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > > [...] > > Call Trace: > > <TASK> > > dump_stack_lvl+0x80/0xa0 > > should_fail_ex+0x4d3/0x5c0 > > [...] > > should_failslab+0xc7/0x130 > > kmem_cache_alloc_noprof+0x73/0x3a0 > > [...] > > mas_alloc_nodes+0x3a3/0x690 > > mas_nomem+0xaa/0x1d0 > > mas_store_gfp+0x515/0xa80 > > [...] > > mmap_region+0xa96/0x2590 > > [...] > > do_mmap+0x71e/0xfe0 > > [...] > > vm_mmap_pgoff+0x17a/0x2f0 > > [...] > > ksys_mmap_pgoff+0x2ee/0x460 > > do_syscall_64+0x68/0x140 > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > [...] > > </TASK> > > mmap: unmap-fail: (607) Unable to abort munmap() operation > > ================================================================== > > BUG: KASAN: slab-use-after-free in dec_usb_memory_use_count+0x365/0x430 > > Read of size 8 at addr ffff88810e9ba8b8 by task unmap-fail/607 > > What was this pointer? Should be the "struct usb_memory *usbm". > > > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > > [...] > > Call Trace: > > <TASK> > > dump_stack_lvl+0x66/0xa0 > > print_report+0xce/0x670 > > [...] > > kasan_report+0xf7/0x130 > > [...] > > dec_usb_memory_use_count+0x365/0x430 > > remove_vma+0x76/0x120 > > vms_complete_munmap_vmas+0x447/0x750 > > do_vmi_align_munmap+0x4b9/0x700 > > [...] > > do_vmi_munmap+0x164/0x2e0 > > __vm_munmap+0x128/0x2a0 > > [...] > > __x64_sys_munmap+0x59/0x80 > > do_syscall_64+0x68/0x140 > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > [...] > > </TASK> > > > > Allocated by task 607: > > kasan_save_stack+0x33/0x60 > > kasan_save_track+0x14/0x30 > > __kasan_kmalloc+0xaa/0xb0 > > usbdev_mmap+0x1a0/0xaf0 > > mmap_region+0xf6e/0x2590 > > do_mmap+0x71e/0xfe0 > > vm_mmap_pgoff+0x17a/0x2f0 > > ksys_mmap_pgoff+0x2ee/0x460 > > do_syscall_64+0x68/0x140 > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > > Freed by task 607: > > kasan_save_stack+0x33/0x60 > > kasan_save_track+0x14/0x30 > > kasan_save_free_info+0x3b/0x60 > > __kasan_slab_free+0x4f/0x70 > > kfree+0x148/0x450 > > vms_clean_up_area+0x188/0x220 > > What line is this? Should be the vma->vm_ops->close(vma) call. (That would call dec_usb_memory_use_count(), which tail-calls kfree(), so the dec_usb_memory_use_count() wouldn't show up in a stack trace.) I don't have this kernel build anymore though, sorry. If you want I'll rebuild and get a properly symbolized trace.
* Jann Horn <jannh@google.com> [241016 12:04]: > On Wed, Oct 16, 2024 at 5:40 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Jann Horn <jannh@google.com> [241016 11:08]: > > > vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs > > > have already been torn down halfway (in a way we can't undo) but are > > > still present in the maple tree. > > > > > > At this point, we *must* remove the VMAs from the VMA tree, otherwise > > > we get UAF. > > > > Wait, the vma should be re-attached without PTEs and files closed in > > this case. I don't see how we are hitting the UAF in your example - we > > shouldn't unless there is something with the driver itself and the file > > close? > > > > My concern is that I am missing an error scenario. > > Where are the files supposed to be closed? vms_clean_up_area() unlinks > the VMA from the file and calls ->close() but doesn't unlink the file, > right? Correct. > > FWIW, I tested on commit eca631b8fe80 (current mainline head), I > didn't check whether anything in the MM tree already addresses this. > (I probably should have...) I have not addressed this. > > > But since this is under a lock that allows sleeping, shouldn't the > > shrinker kick in? > > Yeah, I think without error injection, you'd basically only fail this > allocation if the OOM killer has already decided to kill your task and > the system is entirely out of memory. > > OOM kills IIRC have two effects on the page allocator: > > 1. they allow allocating with no watermarks (ALLOC_OOM) (based on the > theory that you might need to allocate some memory in order to be able > to exit and free more memory) > 2. they allow giving up on GFP_KERNEL allocations (see the "/* Avoid > allocations with no watermarks from looping endlessly */" part of > __alloc_pages_slowpath()) > > > Should we just use __GFP_NOFAIL for the first store > > instead of the error path? > > Which first store? Do you mean get rid of vms_abort_munmap_vmas() > entirely somehow? Yes.. but there are other failures throughout the function so I think this is the best plan. > > > > Fixes: 4f87153e82c4 ("mm: change failure of MAP_FIXED to restoring the gap on failure") > > > Signed-off-by: Jann Horn <jannh@google.com> Thanks for fixing this. Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> > > > --- > > > This can be tested with the following reproducer (on a kernel built with > > > CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y, > > > with the reproducer running as root): > > > > > > ``` > > > > > > typeof(x) __res = (x); \ > > > if (__res == (typeof(x))-1) \ > > > err(1, "SYSCHK(" #x ")"); \ > > > __res; \ > > > }) > > > > > > static void write_file(char *name, char *buf) { > > > int fd = open(name, O_WRONLY); > > > if (fd == -1) > > > err(1, "unable to open for writing: %s", name); > > > if (SYSCHK(write(fd, buf, strlen(buf))) != strlen(buf)) > > > errx(1, "write %s", name); > > > SYSCHK(close(fd)); > > > } > > > > > > int main(void) { > > > // make a large area with a bunch of VMAs > > > char *area = SYSCHK(mmap(NULL, AREA_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > > for (int off=0; off<AREA_SIZE; off += 0x2000) > > > SYSCHK(mmap(area+off, 0x1000, PROT_READ, MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > > > > > > // open a file whose mappings use usbdev_vm_ops, and map it in part of this area > > > int map_fd = SYSCHK(open("/dev/bus/usb/001/001", O_RDONLY)); > > > void *map = SYSCHK(mmap(area, 0x1000, PROT_READ, MAP_SHARED|MAP_FIXED, map_fd, 0)); > > > close(map_fd); > > > > > > // make RWX mapping request fail late > > > SYSCHK(prctl(65/*PR_SET_MDWE*/, 1/*PR_MDWE_REFUSE_EXEC_GAIN*/, 0, 0, 0)); > > > > > > // some random other file > > > int fd = SYSCHK(open("/etc/passwd", O_RDONLY)); > > > > > > /* disarm for now */ > > > write_file("/sys/kernel/debug/failslab/probability", "0"); > > > > > > /* fail allocations of maple_node... */ > > > write_file("/sys/kernel/debug/failslab/cache-filter", "Y"); > > > write_file("/sys/kernel/slab/maple_node/failslab", "1"); > > > /* ... even though it's a sleepable allocation... */ > > > write_file("/sys/kernel/debug/failslab/ignore-gfp-wait", "N"); > > > /* ... in this task... */ > > > write_file("/sys/kernel/debug/failslab/task-filter", "Y"); > > > write_file("/proc/self/make-it-fail", "1"); > > > /* ... every time... */ > > > write_file("/sys/kernel/debug/failslab/times", "-1"); > > > /* ... after 8 successful allocations (value chosen experimentally)... */ > > > write_file("/sys/kernel/debug/failslab/space", "2048"); // one object is 256 > > > /* ... and print where the allocations actually failed... */ > > > write_file("/sys/kernel/debug/failslab/verbose", "2"); > > > /* ... and that's it, arm it */ > > > write_file("/sys/kernel/debug/failslab/probability", "100"); > > > > > > // This mmap request will fail late due to MDWE. > > > // The error recovery path will attempt to clear out the VMA pointers with a > > > // spanning_store (which will be blocked by error injection). > > > mmap(area, AREA_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd, 0); > > > > > > /* disarm */ > > > write_file("/sys/kernel/debug/failslab/probability", "0"); > > > > > > SYSCHK(munmap(map, 0x1000)); // UAF expected here > > > system("cat /proc/$PPID/maps"); > > > } > > > ``` > > > > > > Result: > > > ``` > > > FAULT_INJECTION: forcing a failure. > > > name failslab, interval 1, probability 100, space 256, times -1 > > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > > > [...] > > > Call Trace: > > > <TASK> > > > dump_stack_lvl+0x80/0xa0 > > > should_fail_ex+0x4d3/0x5c0 > > > [...] > > > should_failslab+0xc7/0x130 > > > kmem_cache_alloc_noprof+0x73/0x3a0 > > > [...] > > > mas_alloc_nodes+0x3a3/0x690 > > > mas_nomem+0xaa/0x1d0 > > > mas_store_gfp+0x515/0xa80 > > > [...] > > > mmap_region+0xa96/0x2590 > > > [...] > > > do_mmap+0x71e/0xfe0 > > > [...] > > > vm_mmap_pgoff+0x17a/0x2f0 > > > [...] > > > ksys_mmap_pgoff+0x2ee/0x460 > > > do_syscall_64+0x68/0x140 > > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > [...] > > > </TASK> > > > mmap: unmap-fail: (607) Unable to abort munmap() operation > > > ================================================================== > > > BUG: KASAN: slab-use-after-free in dec_usb_memory_use_count+0x365/0x430 > > > Read of size 8 at addr ffff88810e9ba8b8 by task unmap-fail/607 > > > > What was this pointer? > > Should be the "struct usb_memory *usbm". > > > > > > > CPU: 3 UID: 0 PID: 607 Comm: unmap-fail Not tainted 6.12.0-rc3-00013-geca631b8fe80 #518 > > > [...] > > > Call Trace: > > > <TASK> > > > dump_stack_lvl+0x66/0xa0 > > > print_report+0xce/0x670 > > > [...] > > > kasan_report+0xf7/0x130 > > > [...] > > > dec_usb_memory_use_count+0x365/0x430 > > > remove_vma+0x76/0x120 > > > vms_complete_munmap_vmas+0x447/0x750 > > > do_vmi_align_munmap+0x4b9/0x700 > > > [...] > > > do_vmi_munmap+0x164/0x2e0 > > > __vm_munmap+0x128/0x2a0 > > > [...] > > > __x64_sys_munmap+0x59/0x80 > > > do_syscall_64+0x68/0x140 > > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > [...] > > > </TASK> > > > > > > Allocated by task 607: > > > kasan_save_stack+0x33/0x60 > > > kasan_save_track+0x14/0x30 > > > __kasan_kmalloc+0xaa/0xb0 > > > usbdev_mmap+0x1a0/0xaf0 > > > mmap_region+0xf6e/0x2590 > > > do_mmap+0x71e/0xfe0 > > > vm_mmap_pgoff+0x17a/0x2f0 > > > ksys_mmap_pgoff+0x2ee/0x460 > > > do_syscall_64+0x68/0x140 > > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > > > > Freed by task 607: > > > kasan_save_stack+0x33/0x60 > > > kasan_save_track+0x14/0x30 > > > kasan_save_free_info+0x3b/0x60 > > > __kasan_slab_free+0x4f/0x70 > > > kfree+0x148/0x450 > > > vms_clean_up_area+0x188/0x220 > > > > What line is this? > > Should be the vma->vm_ops->close(vma) call. (That would call > dec_usb_memory_use_count(), which tail-calls kfree(), so the > dec_usb_memory_use_count() wouldn't show up in a stack trace.) > > I don't have this kernel build anymore though, sorry. If you want I'll > rebuild and get a properly symbolized trace. Right, so it's the driver. That makes sense. Your patch is the best plan.
© 2016 - 2024 Red Hat, Inc.