To get rid of setjmp()/longjmp(), the teardown logic needs to be usable
from __bail(). To access the atomic teardown conditional from there,
move it into the test metadata.
This also allows the removal of "setup_completed".
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
---
tools/testing/selftests/kselftest_harness.h | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 905986debbfb0ce8c9659dbd52b6c67c6759cae7..895821af3e5c5752065561d0a108210d79e9eeee 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -411,9 +411,9 @@
pid_t child = 1; \
int status = 0; \
/* Makes sure there is only one teardown, even when child forks again. */ \
- bool *teardown = mmap(NULL, sizeof(*teardown), \
+ _metadata->no_teardown = mmap(NULL, sizeof(*_metadata->no_teardown), \
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
- *teardown = false; \
+ *_metadata->no_teardown = true; \
if (sizeof(*self) > 0) { \
if (fixture_name##_teardown_parent) { \
self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \
@@ -431,7 +431,7 @@
/* Let setup failure terminate early. */ \
if (_metadata->exit_code) \
_exit(0); \
- _metadata->setup_completed = true; \
+ *_metadata->no_teardown = false; \
fixture_name##_##test_name(_metadata, self, variant->data); \
} else if (child < 0 || child != waitpid(child, &status, 0)) { \
ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
@@ -439,15 +439,16 @@
} \
} \
if (child == 0) { \
- if (_metadata->setup_completed && !fixture_name##_teardown_parent && \
- !__atomic_test_and_set(teardown, __ATOMIC_RELAXED)) \
+ if (!fixture_name##_teardown_parent && \
+ !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
fixture_name##_teardown(_metadata, self, variant->data); \
_exit(0); \
} \
- if (_metadata->setup_completed && fixture_name##_teardown_parent && \
- !__atomic_test_and_set(teardown, __ATOMIC_RELAXED)) \
+ if (fixture_name##_teardown_parent && \
+ !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
fixture_name##_teardown(_metadata, self, variant->data); \
- munmap(teardown, sizeof(*teardown)); \
+ munmap(_metadata->no_teardown, sizeof(*_metadata->no_teardown)); \
+ _metadata->no_teardown = NULL; \
if (self && fixture_name##_teardown_parent) \
munmap(self, sizeof(*self)); \
if (WIFEXITED(status)) { \
@@ -916,7 +917,7 @@ struct __test_metadata {
int trigger; /* extra handler after the evaluation */
int timeout; /* seconds to wait for test timeout */
bool aborted; /* stopped test due to failed ASSERT */
- bool setup_completed; /* did setup finish? */
+ bool *no_teardown; /* fixture needs teardown */
jmp_buf env; /* for exiting out of test early */
struct __test_results *results;
struct __test_metadata *prev, *next;
@@ -1195,7 +1196,7 @@ static void __run_test(struct __fixture_metadata *f,
t->exit_code = KSFT_PASS;
t->trigger = 0;
t->aborted = false;
- t->setup_completed = false;
+ t->no_teardown = NULL;
memset(t->env, 0, sizeof(t->env));
memset(t->results->reason, 0, sizeof(t->results->reason));
--
2.49.0
Hi Thomas, CC += Jason On Mon, May 05, 2025 at 05:15:27PM +0200, Thomas Weißschuh wrote: > To get rid of setjmp()/longjmp(), the teardown logic needs to be usable > from __bail(). To access the atomic teardown conditional from there, > move it into the test metadata. > This also allows the removal of "setup_completed". > > Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de> > Acked-by: Shuah Khan <skhan@linuxfoundation.org> Certain hugepage tests in iommufd selftest (CONFIG_IOMMUFD_TEST) start to fail since v6.16-rc1, though the test functions weren't changed during last cycle: ------------------------------------------------------------------ # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... # enforce_dirty: Test terminated unexpectedly by signal 11 # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty not ok 193 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty # RUN iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking ... # set_dirty_tracking: Test terminated unexpectedly by signal 11 # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking not ok 194 iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking # RUN iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability ... # device_dirty_capability: Test terminated unexpectedly by signal 11 # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability not ok 195 iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap ... # get_dirty_bitmap: Test terminated unexpectedly by signal 11 # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap not ok 196 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear ... # get_dirty_bitmap_no_clear: Test terminated unexpectedly by signal 11 # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear not ok 197 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear ------------------------------------------------------------------ Git bisect points to this patch, and reverting it and its following patches fixes these. I haven't debugged it, hoping you might have a quick thought. Lemme know if you need some details to figure out what's going on. Thanks Nicolin
Hi Nicolin, On Mon, Jun 09, 2025 at 11:49:05PM -0700, Nicolin Chen wrote: > CC += Jason > > On Mon, May 05, 2025 at 05:15:27PM +0200, Thomas Weißschuh wrote: > > To get rid of setjmp()/longjmp(), the teardown logic needs to be usable > > from __bail(). To access the atomic teardown conditional from there, > > move it into the test metadata. > > This also allows the removal of "setup_completed". > > > > Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de> > > Acked-by: Shuah Khan <skhan@linuxfoundation.org> > > Certain hugepage tests in iommufd selftest (CONFIG_IOMMUFD_TEST) > start to fail since v6.16-rc1, though the test functions weren't > changed during last cycle: Thanks for the report. > ------------------------------------------------------------------ > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > # enforce_dirty: Test terminated unexpectedly by signal 11 > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty > not ok 193 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking ... > # set_dirty_tracking: Test terminated unexpectedly by signal 11 > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking > not ok 194 iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability ... > # device_dirty_capability: Test terminated unexpectedly by signal 11 > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability > not ok 195 iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap ... > # get_dirty_bitmap: Test terminated unexpectedly by signal 11 > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap > not ok 196 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear ... > # get_dirty_bitmap_no_clear: Test terminated unexpectedly by signal 11 > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear > not ok 197 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear > ------------------------------------------------------------------ > > Git bisect points to this patch, and reverting it and its following > patches fixes these. > > I haven't debugged it, hoping you might have a quick thought. Lemme > know if you need some details to figure out what's going on. I can't reproduce this report. On my development machine or a virtme-ng VM I get the following failure: # ./iommufd -r iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... iommufd: iommufd.c:2042: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed. # enforce_dirty: Test terminated by assertion # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty not ok 1 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty # FAILED: 0 / 1 tests passed. # Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 Specifically the mmap() fails with ENOMEM. When booting the VM with "hugepages=100" the test succeeds. The same result happens when running all the subtests. Could you give more specific reproduction steps? On another note, the selftest should use the kselftest_harness' ASSERT_*() macros instead of plain assert(). Thomas
On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote: > Hi Nicolin, > > On Mon, Jun 09, 2025 at 11:49:05PM -0700, Nicolin Chen wrote: > > CC += Jason > > > > On Mon, May 05, 2025 at 05:15:27PM +0200, Thomas Weißschuh wrote: > > > To get rid of setjmp()/longjmp(), the teardown logic needs to be usable > > > from __bail(). To access the atomic teardown conditional from there, > > > move it into the test metadata. > > > This also allows the removal of "setup_completed". > > > > > > Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de> > > > Acked-by: Shuah Khan <skhan@linuxfoundation.org> > > > > Certain hugepage tests in iommufd selftest (CONFIG_IOMMUFD_TEST) > > start to fail since v6.16-rc1, though the test functions weren't > > changed during last cycle: > > Thanks for the report. > > > ------------------------------------------------------------------ > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > > # enforce_dirty: Test terminated unexpectedly by signal 11 Sig 11 is weird.. > I can't reproduce this report. > On my development machine or a virtme-ng VM I get the following failure: > > # ./iommufd -r iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty > TAP version 13 > 1..1 > # Starting 1 tests from 1 test cases. > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > iommufd: iommufd.c:2042: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed. > # enforce_dirty: Test terminated by assertion > # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty > not ok 1 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty > # FAILED: 0 / 1 tests passed. > # Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 > > Specifically the mmap() fails with ENOMEM. > > When booting the VM with "hugepages=100" the test succeeds. Yes, that is required > On another note, the selftest should use the kselftest_harness' ASSERT_*() > macros instead of plain assert(). IIRC the kselftest stuff explodes if you try to use it's assert functions within a fixture setup/teardown context. I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe it is ARM specific, I think Nicolin is running on ARM.. I did see the other warnings that Nicolin reported. Jason
On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote: > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote: > > > ------------------------------------------------------------------ > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > > > # enforce_dirty: Test terminated unexpectedly by signal 11 > > Sig 11 is weird.. > > On another note, the selftest should use the kselftest_harness' ASSERT_*() > > macros instead of plain assert(). > > IIRC the kselftest stuff explodes if you try to use it's assert > functions within a fixture setup/teardown context. > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe > it is ARM specific, I think Nicolin is running on ARM.. Yes. And I was running with 64KB page size. I just quickly retried with 4KB page size (matching x86), and all failed tests pass now. Thanks Nicolin
On Tue, Jun 10, 2025 at 11:48:44AM -0700, Nicolin Chen wrote: > On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote: > > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote: > > > > ------------------------------------------------------------------ > > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > > > > # enforce_dirty: Test terminated unexpectedly by signal 11 > > > > Sig 11 is weird.. > > > > On another note, the selftest should use the kselftest_harness' ASSERT_*() > > > macros instead of plain assert(). > > > > IIRC the kselftest stuff explodes if you try to use it's assert > > functions within a fixture setup/teardown context. > > > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe > > it is ARM specific, I think Nicolin is running on ARM.. > > Yes. And I was running with 64KB page size. I just quickly retried > with 4KB page size (matching x86), and all failed tests pass now. That is very important to know. It should be mentioned in the report. So I tried to reproduce it. To get even the mmap() in the test to succeed I needed to also pass default_hugepagesz=2MiB. Also 1GiB of memory was not enough. 30GiB was however. But then the tests succeeds fine for me. So I'll need reproduction steps. Thomas
On Tue, Jun 10, 2025 at 11:48:44AM -0700, Nicolin Chen wrote: > On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote: > > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote: > > > > ------------------------------------------------------------------ > > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ... > > > > # enforce_dirty: Test terminated unexpectedly by signal 11 > > > > Sig 11 is weird.. > > > > On another note, the selftest should use the kselftest_harness' ASSERT_*() > > > macros instead of plain assert(). > > > > IIRC the kselftest stuff explodes if you try to use it's assert > > functions within a fixture setup/teardown context. > > > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe > > it is ARM specific, I think Nicolin is running on ARM.. > > Yes. And I was running with 64KB page size. I just quickly retried > with 4KB page size (matching x86), and all failed tests pass now. That's a weird thing to be sensitive too. Can you get a backtrace from the crash, what function/line is crashing? Jason
On Tue, Jun 10, 2025 at 08:46:57PM -0300, Jason Gunthorpe wrote:
> On Tue, Jun 10, 2025 at 11:48:44AM -0700, Nicolin Chen wrote:
> > On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote:
> > > > > ------------------------------------------------------------------
> > > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ...
> > > > > # enforce_dirty: Test terminated unexpectedly by signal 11
> > >
> > > Sig 11 is weird..
> >
> > > > On another note, the selftest should use the kselftest_harness' ASSERT_*()
> > > > macros instead of plain assert().
> > >
> > > IIRC the kselftest stuff explodes if you try to use it's assert
> > > functions within a fixture setup/teardown context.
> > >
> > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe
> > > it is ARM specific, I think Nicolin is running on ARM..
> >
> > Yes. And I was running with 64KB page size. I just quickly retried
> > with 4KB page size (matching x86), and all failed tests pass now.
>
> That's a weird thing to be sensitive too. Can you get a backtrace from
> the crash, what function/line is crashing?
I think I am getting what's going on. Here the harness code has a
parent process and a child process:
--------------------------------------------------------------
428- /* _metadata and potentially self are shared with all forks. */ \
429: child = fork(); \
430: if (child == 0) { \
431- fixture_name##_setup(_metadata, self, variant->data); \
432- /* Let setup failure terminate early. */ \
433- if (_metadata->exit_code) \
434- _exit(0); \
435- *_metadata->no_teardown = false; \
436- fixture_name##_##test_name(_metadata, self, variant->data); \
437- _metadata->teardown_fn(false, _metadata, self, variant->data); \
438- _exit(0); \
439: } else if (child < 0 || child != waitpid(child, &status, 0)) { \
440- ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
441- _metadata->exit_code = KSFT_FAIL; \
442- } \
443- _metadata->teardown_fn(true, _metadata, self, variant->data); \
444- munmap(_metadata->no_teardown, sizeof(*_metadata->no_teardown)); \
445- _metadata->no_teardown = NULL; \
446- if (self && fixture_name##_teardown_parent) \
447- munmap(self, sizeof(*self)); \
448- if (WIFEXITED(status)) { \
449- if (WEXITSTATUS(status)) \
450- _metadata->exit_code = WEXITSTATUS(status); \
451- } else if (WIFSIGNALED(status)) { \
452- /* Forward signal to __wait_for_test(). */ \
453- kill(getpid(), WTERMSIG(status)); \
454- } \
....
456- static void wrapper_##fixture_name##_##test_name##_teardown( \
457- bool in_parent, struct __test_metadata *_metadata, \
458- void *self, const void *variant) \
459- { \
460- if (fixture_name##_teardown_parent == in_parent && \
461- !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
462- fixture_name##_teardown(_metadata, self, variant); \
463- } \
--------------------------------------------------------------
I found there is a race between those two processes, resulting in
the teardown() not getting invoked: I added some ksft_print_msg()
calls in-between the lines to debug, those tests can pass mostly,
as teardown() got invoked.
I think the reason why those huge page cases fail is just because
the huge version of setup() takes longer time..
I haven't figured out a proper fix yet, but something smells bad:
1) *no_teardown is set non-atomically, while both processes calls
__atomic_test_and_set()
2) parent doesn't seem to wait for the setup() to complete..
3) when parent runs faster than the child that is still running
setup(), the parent unmaps the no_teardown and set it to NULL,
then UAF in the child, i.e. signal 11?
I think Thomas should have an idea with these info :)
Thanks
Nicolin
On Wed, Jun 11, 2025 at 12:05:25AM -0700, Nicolin Chen wrote:
> On Tue, Jun 10, 2025 at 08:46:57PM -0300, Jason Gunthorpe wrote:
> > On Tue, Jun 10, 2025 at 11:48:44AM -0700, Nicolin Chen wrote:
> > > On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote:
> > > > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote:
> > > > > > ------------------------------------------------------------------
> > > > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ...
> > > > > > # enforce_dirty: Test terminated unexpectedly by signal 11
> > > >
> > > > Sig 11 is weird..
> > >
> > > > > On another note, the selftest should use the kselftest_harness' ASSERT_*()
> > > > > macros instead of plain assert().
> > > >
> > > > IIRC the kselftest stuff explodes if you try to use it's assert
> > > > functions within a fixture setup/teardown context.
> > > >
> > > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe
> > > > it is ARM specific, I think Nicolin is running on ARM..
> > >
> > > Yes. And I was running with 64KB page size. I just quickly retried
> > > with 4KB page size (matching x86), and all failed tests pass now.
> >
> > That's a weird thing to be sensitive too. Can you get a backtrace from
> > the crash, what function/line is crashing?
>
> I think I am getting what's going on. Here the harness code has a
> parent process and a child process:
>
> --------------------------------------------------------------
> 428- /* _metadata and potentially self are shared with all forks. */ \
> 429: child = fork(); \
> 430: if (child == 0) { \
> 431- fixture_name##_setup(_metadata, self, variant->data); \
> 432- /* Let setup failure terminate early. */ \
> 433- if (_metadata->exit_code) \
> 434- _exit(0); \
> 435- *_metadata->no_teardown = false; \
> 436- fixture_name##_##test_name(_metadata, self, variant->data); \
> 437- _metadata->teardown_fn(false, _metadata, self, variant->data); \
> 438- _exit(0); \
> 439: } else if (child < 0 || child != waitpid(child, &status, 0)) { \
> 440- ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
> 441- _metadata->exit_code = KSFT_FAIL; \
> 442- } \
> 443- _metadata->teardown_fn(true, _metadata, self, variant->data); \
> 444- munmap(_metadata->no_teardown, sizeof(*_metadata->no_teardown)); \
> 445- _metadata->no_teardown = NULL; \
> 446- if (self && fixture_name##_teardown_parent) \
> 447- munmap(self, sizeof(*self)); \
> 448- if (WIFEXITED(status)) { \
> 449- if (WEXITSTATUS(status)) \
> 450- _metadata->exit_code = WEXITSTATUS(status); \
> 451- } else if (WIFSIGNALED(status)) { \
> 452- /* Forward signal to __wait_for_test(). */ \
> 453- kill(getpid(), WTERMSIG(status)); \
> 454- } \
> ....
> 456- static void wrapper_##fixture_name##_##test_name##_teardown( \
> 457- bool in_parent, struct __test_metadata *_metadata, \
> 458- void *self, const void *variant) \
> 459- { \
> 460- if (fixture_name##_teardown_parent == in_parent && \
> 461- !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
> 462- fixture_name##_teardown(_metadata, self, variant); \
> 463- } \
> --------------------------------------------------------------
>
> I found there is a race between those two processes, resulting in
> the teardown() not getting invoked: I added some ksft_print_msg()
> calls in-between the lines to debug, those tests can pass mostly,
> as teardown() got invoked.
>
> I think the reason why those huge page cases fail is just because
> the huge version of setup() takes longer time..
Can you try to recreate this issue with changes to
tools/testing/selftests/kselftest_harness/harness-selftest.c ?
> I haven't figured out a proper fix yet, but something smells bad:
> 1) *no_teardown is set non-atomically, while both processes calls
> __atomic_test_and_set()
Does this make a difference?
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 2925e47db995..89fb37a21d9d 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -410,7 +410,7 @@
/* Makes sure there is only one teardown, even when child forks again. */ \
_metadata->no_teardown = mmap(NULL, sizeof(*_metadata->no_teardown), \
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
- *_metadata->no_teardown = true; \
+ __atomic_store_n(_metadata->no_teardown, true, __ATOMIC_SEQ_CST); \
if (sizeof(*self) > 0) { \
if (fixture_name##_teardown_parent) { \
self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \
@@ -429,7 +429,7 @@
/* Let setup failure terminate early. */ \
if (_metadata->exit_code) \
_exit(0); \
- *_metadata->no_teardown = false; \
+ __atomic_store_n(_metadata->no_teardown, false, __ATOMIC_SEQ_CST); \
fixture_name##_##test_name(_metadata, self, variant->data); \
_metadata->teardown_fn(false, _metadata, self, variant->data); \
_exit(0); \
@@ -455,7 +455,7 @@
void *self, const void *variant) \
{ \
if (fixture_name##_teardown_parent == in_parent && \
- !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
+ !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_SEQ_CST)) \
fixture_name##_teardown(_metadata, self, variant); \
} \
static struct __test_metadata *_##fixture_name##_##test_name##_object; \
> 2) parent doesn't seem to wait for the setup() to complete..
setup() is called in the child (L431) right before the testcase itself is
called (L436). The parent waits for the child to exit (L439) before unmapping.
> 3) when parent runs faster than the child that is still running
> setup(), the parent unmaps the no_teardown and set it to NULL,
> then UAF in the child, i.e. signal 11?
That should never happen as the waitpid() will block until the child running
setup() and the testcase itself have exited.
Does the issue also happen when you only execute a single testcase?
$ ./iommufd -r iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
Thomas
On Wed, Jun 11, 2025 at 10:04:35AM +0200, Thomas Weißschuh wrote:
> On Wed, Jun 11, 2025 at 12:05:25AM -0700, Nicolin Chen wrote:
> > On Tue, Jun 10, 2025 at 08:46:57PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Jun 10, 2025 at 11:48:44AM -0700, Nicolin Chen wrote:
> > > > On Tue, Jun 10, 2025 at 09:09:02AM -0300, Jason Gunthorpe wrote:
> > > > > On Tue, Jun 10, 2025 at 01:38:22PM +0200, Thomas Weißschuh wrote:
> > > > > > > ------------------------------------------------------------------
> > > > > > > # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ...
> > > > > > > # enforce_dirty: Test terminated unexpectedly by signal 11
> > > > >
> > > > > Sig 11 is weird..
> > > >
> > > > > > On another note, the selftest should use the kselftest_harness' ASSERT_*()
> > > > > > macros instead of plain assert().
> > > > >
> > > > > IIRC the kselftest stuff explodes if you try to use it's assert
> > > > > functions within a fixture setup/teardown context.
> > > > >
> > > > > I also wasn't able to reproduce this (x86 ubuntu 24 LTS OS) Maybe
> > > > > it is ARM specific, I think Nicolin is running on ARM..
> > > >
> > > > Yes. And I was running with 64KB page size. I just quickly retried
> > > > with 4KB page size (matching x86), and all failed tests pass now.
> > >
> > > That's a weird thing to be sensitive too. Can you get a backtrace from
> > > the crash, what function/line is crashing?
> >
> > I think I am getting what's going on. Here the harness code has a
> > parent process and a child process:
> >
> > --------------------------------------------------------------
> > 428- /* _metadata and potentially self are shared with all forks. */ \
> > 429: child = fork(); \
> > 430: if (child == 0) { \
> > 431- fixture_name##_setup(_metadata, self, variant->data); \
> > 432- /* Let setup failure terminate early. */ \
> > 433- if (_metadata->exit_code) \
> > 434- _exit(0); \
> > 435- *_metadata->no_teardown = false; \
> > 436- fixture_name##_##test_name(_metadata, self, variant->data); \
> > 437- _metadata->teardown_fn(false, _metadata, self, variant->data); \
> > 438- _exit(0); \
> > 439: } else if (child < 0 || child != waitpid(child, &status, 0)) { \
> > 440- ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
> > 441- _metadata->exit_code = KSFT_FAIL; \
> > 442- } \
> > 443- _metadata->teardown_fn(true, _metadata, self, variant->data); \
> > 444- munmap(_metadata->no_teardown, sizeof(*_metadata->no_teardown)); \
> > 445- _metadata->no_teardown = NULL; \
> > 446- if (self && fixture_name##_teardown_parent) \
> > 447- munmap(self, sizeof(*self)); \
> > 448- if (WIFEXITED(status)) { \
> > 449- if (WEXITSTATUS(status)) \
> > 450- _metadata->exit_code = WEXITSTATUS(status); \
> > 451- } else if (WIFSIGNALED(status)) { \
> > 452- /* Forward signal to __wait_for_test(). */ \
> > 453- kill(getpid(), WTERMSIG(status)); \
> > 454- } \
> > ....
> > 456- static void wrapper_##fixture_name##_##test_name##_teardown( \
> > 457- bool in_parent, struct __test_metadata *_metadata, \
> > 458- void *self, const void *variant) \
> > 459- { \
> > 460- if (fixture_name##_teardown_parent == in_parent && \
> > 461- !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
> > 462- fixture_name##_teardown(_metadata, self, variant); \
> > 463- } \
> > --------------------------------------------------------------
> >
> > I found there is a race between those two processes, resulting in
> > the teardown() not getting invoked: I added some ksft_print_msg()
> > calls in-between the lines to debug, those tests can pass mostly,
> > as teardown() got invoked.
> >
> > I think the reason why those huge page cases fail is just because
> > the huge version of setup() takes longer time..
>
> Can you try to recreate this issue with changes to
> tools/testing/selftests/kselftest_harness/harness-selftest.c ?
Hmm, I assume all 9 cases should pass? Mine only passes 4 on rc1,
without any change (perhaps we should start from here?):
TAP version 13
1..9
# Starting 9 tests from 4 test cases.
# RUN global.standalone_pass ...
# harness-selftest.c:19:standalone_pass:before
# harness-selftest.c:23:standalone_pass:after
# OK global.standalone_pass
ok 1 global.standalone_pass
# RUN global.standalone_fail ...
# harness-selftest.c:27:standalone_fail:before
# harness-selftest.c:29:standalone_fail:Expected 0 (0) == 1 (1)
# harness-selftest.c:30:standalone_fail:Expected 0 (0) == 1 (1)
# standalone_fail: Test terminated by assertion
# FAIL global.standalone_fail
not ok 2 global.standalone_fail
# RUN global.signal_pass ...
# harness-selftest.c:35:signal_pass:before
# harness-selftest.c:37:signal_pass:after
# OK global.signal_pass
ok 3 global.signal_pass
# RUN global.signal_fail ...
# harness-selftest.c:42:signal_fail:before
# harness-selftest.c:43:signal_fail:Expected 0 (0) == 1 (1)
# signal_fail: Test terminated by assertion
# FAIL global.signal_fail
not ok 4 global.signal_fail
# RUN fixture.pass ...
# harness-selftest.c:53:pass:setup
# harness-selftest.c:62:pass:before
# harness-selftest.c:19:pass:before
# harness-selftest.c:23:pass:after
# harness-selftest.c:66:pass:after
# harness-selftest.c:58:pass:teardown same-process=1
# OK fixture.pass
ok 5 fixture.pass
# RUN fixture.fail ...
# harness-selftest.c:53:fail:setup
# harness-selftest.c:70:fail:before
# harness-selftest.c:71:fail:Expected 0 (0) == 1 (1)
# harness-selftest.c:58:fail:teardown same-process=1
# fail: Test terminated by assertion
# FAIL fixture.fail
not ok 6 fixture.fail
# RUN fixture.timeout ...
# harness-selftest.c:53:timeout:setup
# harness-selftest.c:77:timeout:before
# timeout: Test terminated by timeout
# FAIL fixture.timeout
not ok 7 fixture.timeout
# RUN fixture_parent.pass ...
# harness-selftest.c:87:pass:setup
# harness-selftest.c:96:pass:before
# harness-selftest.c:98:pass:after
# harness-selftest.c:92:pass:teardown same-process=0
# OK fixture_parent.pass
ok 8 fixture_parent.pass
# RUN fixture_setup_failure.pass ...
# harness-selftest.c:106:pass:setup
# harness-selftest.c:108:pass:Expected 0 (0) == 1 (1)
# pass: Test terminated by assertion
# FAIL fixture_setup_failure.pass
not ok 9 fixture_setup_failure.pass
# FAILED: 4 / 9 tests passed.
# Totals: pass:4 fail:5 xfail:0 xpass:0 skip:0 error:0
> > I haven't figured out a proper fix yet, but something smells bad:
> > 1) *no_teardown is set non-atomically, while both processes calls
> > __atomic_test_and_set()
>
> Does this make a difference?
>
> diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
> index 2925e47db995..89fb37a21d9d 100644
> --- a/tools/testing/selftests/kselftest_harness.h
> +++ b/tools/testing/selftests/kselftest_harness.h
> @@ -410,7 +410,7 @@
> /* Makes sure there is only one teardown, even when child forks again. */ \
> _metadata->no_teardown = mmap(NULL, sizeof(*_metadata->no_teardown), \
> PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
> - *_metadata->no_teardown = true; \
> + __atomic_store_n(_metadata->no_teardown, true, __ATOMIC_SEQ_CST); \
> if (sizeof(*self) > 0) { \
> if (fixture_name##_teardown_parent) { \
> self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \
> @@ -429,7 +429,7 @@
> /* Let setup failure terminate early. */ \
> if (_metadata->exit_code) \
> _exit(0); \
> - *_metadata->no_teardown = false; \
> + __atomic_store_n(_metadata->no_teardown, false, __ATOMIC_SEQ_CST); \
> fixture_name##_##test_name(_metadata, self, variant->data); \
> _metadata->teardown_fn(false, _metadata, self, variant->data); \
> _exit(0); \
> @@ -455,7 +455,7 @@
> void *self, const void *variant) \
> { \
> if (fixture_name##_teardown_parent == in_parent && \
> - !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_RELAXED)) \
> + !__atomic_test_and_set(_metadata->no_teardown, __ATOMIC_SEQ_CST)) \
> fixture_name##_teardown(_metadata, self, variant); \
> } \
> static struct __test_metadata *_##fixture_name##_##test_name##_object; \
Unfortunately, no :(
>
> > 2) parent doesn't seem to wait for the setup() to complete..
>
> setup() is called in the child (L431) right before the testcase itself is
> called (L436). The parent waits for the child to exit (L439) before unmapping.
>
> > 3) when parent runs faster than the child that is still running
> > setup(), the parent unmaps the no_teardown and set it to NULL,
> > then UAF in the child, i.e. signal 11?
>
> That should never happen as the waitpid() will block until the child running
> setup() and the testcase itself have exited.
Ah, maybe I was wrong about these narratives. But the results show
that iommufd_dirty_tracking_teardown() was not called in the failed
cases:
// I added a huge.c file to run only 4 cases on one variant
// And I added two fprintf to its FIXTURE_SETUP/TEARDOWN().
TAP version 13
1..4
# Starting 4 tests from 1 test cases.
# RUN iommufd_dirty_tracking.domain_dirty64M_huge.set_dirty_tracking ...
---------iommufd_dirty_tracking_setup
---------iommufd_dirty_tracking_teardown
# OK iommufd_dirty_tracking.domain_dirty64M_huge.set_dirty_tracking
ok 1 iommufd_dirty_tracking.domain_dirty64M_huge.set_dirty_tracking
# RUN iommufd_dirty_tracking.domain_dirty64M_huge.device_dirty_capability ...
---------iommufd_dirty_tracking_setup
# device_dirty_capability: Test terminated unexpectedly by signal 11
# FAIL iommufd_dirty_tracking.domain_dirty64M_huge.device_dirty_capability
not ok 2 iommufd_dirty_tracking.domain_dirty64M_huge.device_dirty_capability
# RUN iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap ...
---------iommufd_dirty_tracking_setup
# get_dirty_bitmap: Test terminated unexpectedly by signal 11
# FAIL iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap
not ok 3 iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap
# RUN iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap_no_clear ...
---------iommufd_dirty_tracking_setup
# get_dirty_bitmap_no_clear: Test terminated unexpectedly by signal 11
# FAIL iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap_no_clear
not ok 4 iommufd_dirty_tracking.domain_dirty64M_huge.get_dirty_bitmap_no_clear
# FAILED: 1 / 4 tests passed.
>
> Does the issue also happen when you only execute a single testcase?
> $ ./iommufd -r iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
$ sudo tools/testing/selftests/iommu/iommufd -r iommufd_dirty_tracking.domain_dirty128M.enforce_dirty
TAP version 13
1..1
# Starting 1 tests from 1 test cases.
# RUN iommufd_dirty_tracking.domain_dirty128M.enforce_dirty ...
# OK iommufd_dirty_tracking.domain_dirty128M.enforce_dirty
ok 1 iommufd_dirty_tracking.domain_dirty128M.enforce_dirty
# PASSED: 1 / 1 tests passed.
# Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
This one passes. Looks like the first hugepage case would pass but
the following ones would fail if running them sequentially..
Thanks
Nicolin
On Wed, Jun 11, 2025 at 10:19:56AM -0700, Nicolin Chen wrote:
> On Wed, Jun 11, 2025 at 10:04:35AM +0200, Thomas Weißschuh wrote:
> > On Wed, Jun 11, 2025 at 12:05:25AM -0700, Nicolin Chen wrote:
> > > 2) parent doesn't seem to wait for the setup() to complete..
> >
> > setup() is called in the child (L431) right before the testcase itself is
> > called (L436). The parent waits for the child to exit (L439) before unmapping.
> >
> > > 3) when parent runs faster than the child that is still running
> > > setup(), the parent unmaps the no_teardown and set it to NULL,
> > > then UAF in the child, i.e. signal 11?
> >
> > That should never happen as the waitpid() will block until the child running
> > setup() and the testcase itself have exited.
>
> Ah, maybe I was wrong about these narratives. But the results show
> that iommufd_dirty_tracking_teardown() was not called in the failed
> cases:
Here is a new finding...
As you replied that I was wrong about the race between the parent
and the child processes, the parent does wait for the completion
of the child. But the child exited with status=139 i.e. signal 11
due to UAF, which however is resulted from the iommufd test code:
FIXTURE_SETUP(iommufd_dirty_tracking)
{
....
vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
^
|
after this line, the _metadata->no_teardown is set to NULL.
So, the child process accessing this NULL pointer crashed with the
signal 11..
And I did a further experiment by turning "bool *no_teardown" to a
"bool no_teardown". Then, the mmap() in iommufd_dirty_tracking will
set _metadata->teardown_fn function pointer to NULL..
Thanks
Nicolin
On Wed, Jun 11, 2025 at 02:48:16PM -0700, Nicolin Chen wrote:
> On Wed, Jun 11, 2025 at 10:19:56AM -0700, Nicolin Chen wrote:
> > On Wed, Jun 11, 2025 at 10:04:35AM +0200, Thomas Weißschuh wrote:
> > > On Wed, Jun 11, 2025 at 12:05:25AM -0700, Nicolin Chen wrote:
> > > > 2) parent doesn't seem to wait for the setup() to complete..
> > >
> > > setup() is called in the child (L431) right before the testcase itself is
> > > called (L436). The parent waits for the child to exit (L439) before unmapping.
> > >
> > > > 3) when parent runs faster than the child that is still running
> > > > setup(), the parent unmaps the no_teardown and set it to NULL,
> > > > then UAF in the child, i.e. signal 11?
> > >
> > > That should never happen as the waitpid() will block until the child running
> > > setup() and the testcase itself have exited.
> >
> > Ah, maybe I was wrong about these narratives. But the results show
> > that iommufd_dirty_tracking_teardown() was not called in the failed
> > cases:
>
> Here is a new finding...
>
> As you replied that I was wrong about the race between the parent
> and the child processes, the parent does wait for the completion
> of the child. But the child exited with status=139 i.e. signal 11
> due to UAF, which however is resulted from the iommufd test code:
>
> FIXTURE_SETUP(iommufd_dirty_tracking)
> {
> ....
> vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
> ^
> |
> after this line, the _metadata->no_teardown is set to NULL.
>
> So, the child process accessing this NULL pointer crashed with the
> signal 11..
>
> And I did a further experiment by turning "bool *no_teardown" to a
> "bool no_teardown". Then, the mmap() in iommufd_dirty_tracking will
> set _metadata->teardown_fn function pointer to NULL..
So, the test case sets an alignment with HUGEPAGE_SIZE=512MB while
allocating buffer_size=64MB:
rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
this gives the self->buffer a location that is 512MB aligned, but
only mmap part of one 512MB huge page.
On the other hand, _metadata->no_teardown was mmap() outside the
range of the [self->buffer, self->buffer + 64MB), but within the
range of [self->buffer, self->buffer + 512MB).
E.g.
_metadata->no_teardown = 0xfffbfc610000 // inside range2 below
buffer=[0xfffbe0000000, fffbe4000000) // range1
buffer=[0xfffbe0000000, fffc00000000) // range2
Then ,the "vrc = mmap(..." overwrites the _metadata->no_teardown
location to NULL..
The following change can fix, though it feels odd that the buffer
has to be preserved with the entire huge page:
---------------------------------------------------------------
@@ -2024,3 +2027,4 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
- rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
+ rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE,
+ __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE));
if (rc || !self->buffer) {
---------------------------------------------------------------
Any thought?
Thanks
Nicolin
On Wed, Jun 11, 2025 at 04:43:00PM -0700, Nicolin Chen wrote:
> So, the test case sets an alignment with HUGEPAGE_SIZE=512MB while
> allocating buffer_size=64MB:
> rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
> this gives the self->buffer a location that is 512MB aligned, but
> only mmap part of one 512MB huge page.
>
> On the other hand, _metadata->no_teardown was mmap() outside the
> range of the [self->buffer, self->buffer + 64MB), but within the
> range of [self->buffer, self->buffer + 512MB).
>
> E.g.
> _metadata->no_teardown = 0xfffbfc610000 // inside range2 below
> buffer=[0xfffbe0000000, fffbe4000000) // range1
> buffer=[0xfffbe0000000, fffc00000000) // range2
>
> Then ,the "vrc = mmap(..." overwrites the _metadata->no_teardown
> location to NULL..
>
> The following change can fix, though it feels odd that the buffer
> has to be preserved with the entire huge page:
> ---------------------------------------------------------------
> @@ -2024,3 +2027,4 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
>
> - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE,
> + __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE));
> if (rc || !self->buffer) {
> ---------------------------------------------------------------
>
> Any thought?
This seems like something, variant->buffer_size should not
be less than HUGEPAGE_SIZE I guess that is possible on 64K ARM64
But I still don't quite get it..
rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
Should allocate buffer_size
mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED;
mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
mmap_flags, -1, 0);
Should fail if buffer_size is not a multiple of HUGEPAGE_SIZE?
It certainly shouldn't mmap past the provided buffer_size!!!
Are you seeing the above mmap succeed and also map beyond buffer -> buffer + buffer_size?
I think that would be a kernel bug in MAP_HUGETLB!
Jason
On Wed, Jun 11, 2025 at 08:51:17PM -0300, Jason Gunthorpe wrote:
> On Wed, Jun 11, 2025 at 04:43:00PM -0700, Nicolin Chen wrote:
> > So, the test case sets an alignment with HUGEPAGE_SIZE=512MB while
> > allocating buffer_size=64MB:
> > rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> > vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
> > this gives the self->buffer a location that is 512MB aligned, but
> > only mmap part of one 512MB huge page.
> >
> > On the other hand, _metadata->no_teardown was mmap() outside the
> > range of the [self->buffer, self->buffer + 64MB), but within the
> > range of [self->buffer, self->buffer + 512MB).
> >
> > E.g.
> > _metadata->no_teardown = 0xfffbfc610000 // inside range2 below
> > buffer=[0xfffbe0000000, fffbe4000000) // range1
> > buffer=[0xfffbe0000000, fffc00000000) // range2
> >
> > Then ,the "vrc = mmap(..." overwrites the _metadata->no_teardown
> > location to NULL..
> >
> > The following change can fix, though it feels odd that the buffer
> > has to be preserved with the entire huge page:
> > ---------------------------------------------------------------
> > @@ -2024,3 +2027,4 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
> >
> > - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> > + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE,
> > + __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE));
> > if (rc || !self->buffer) {
> > ---------------------------------------------------------------
> >
> > Any thought?
>
> This seems like something, variant->buffer_size should not
> be less than HUGEPAGE_SIZE I guess that is possible on 64K ARM64
>
> But I still don't quite get it..
>
> rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
>
> Should allocate buffer_size
>
> mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED;
> mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
> vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
> mmap_flags, -1, 0);
>
> Should fail if buffer_size is not a multiple of HUGEPAGE_SIZE?
Yea, I think you are right. But..
> It certainly shouldn't mmap past the provided buffer_size!!!
>
> Are you seeing the above mmap succeed and also map beyond buffer -> buffer + buffer_size?
>
> I think that would be a kernel bug in MAP_HUGETLB!
..I did some bpftrace:
ksys_mmap_pgoff() addr=ffff80000000, len=4000000
hugetlb_file_setup(): size=0x20000000
hugetlb_reserve_pages() from=0, to=1
hugetlb_reserve_pages() returned: ret=1
hugetlb_file_setup() returned: size=0x20000000 ret=-281471746619776
vm_mmap_pgoff() addr=ffff80000000, len=20000000
do_mmap() addr=ffff80000000, len=20000000
hugetlb_reserve_pages() from=0, to=1
hugetlb_reserve_pages() returned: ret=1
do_mmap() returned: addr=0xffff80000000 ret=ffff80000000, pop=20000000
vm_mmap_pgoff() returned: addr=0xffff80000000 ret=ffff80000000
ksys_mmap_pgoff() returned: addr=0xffff80000000 ret=ffff80000000
We can see the 64MB was rounded up to 512MB by ksys_mmap_pgoff()
when being passed in to hugetlb_file_setup() at:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mmap.c?h=v6.16-rc1#n594
" len = ALIGN(len, huge_page_size(hs)); "
By looking at the comments here..:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/hugetlbfs/inode.c#n1521
"
/*
* Note that size should be aligned to proper hugepage size in caller side,
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
"
..I guess this function was supposed to fail the not-a-multiple
case as you remarked? But it certainly can't do that, when that
size passed in is already hugepage-aligned..
It feels like a kernel bug as you suspect :-/
And I just found one more weird thing...
In iommufd.c selftest code, we have:
"static __attribute__((constructor)) void setup_sizes(void)"
where it does another pair of posix_memalign/mmap, although this
one doesn't flag MAP_HUGETLB and shouldn't impact what is coming
to the next...
If I keep this code, the first hugepage test case can pass (64MB
buffer_size; 512MB THP), but all the following cases will fail,
as I reported here:
https://lore.kernel.org/all/aEm6tuzy7WK12sMh@nvidia.com/
If I remove this code, the hugepage test case will fail from the
first case with signal 11. But this time, it is not because the
mmap() overwrites the _metadata->no_teardown, it's because mmap()
call itself crashed...
And, in either a failed case (crashed) or a passed case, the top
kernel function ksys_mmap_pgoff() returned successfully, which
means it seemingly crashed inside the libc?
Thanks
Nicolin
On Wed, Jun 11, 2025 at 11:59:00PM -0700, Nicolin Chen wrote: > We can see the 64MB was rounded up to 512MB by ksys_mmap_pgoff() > when being passed in to hugetlb_file_setup() at: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mmap.c?h=v6.16-rc1#n594 > " len = ALIGN(len, huge_page_size(hs)); " > > By looking at the comments here..: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/hugetlbfs/inode.c#n1521 > " > /* > * Note that size should be aligned to proper hugepage size in caller side, > * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. > */ > struct file *hugetlb_file_setup(const char *name, size_t size, > " > > ..I guess this function was supposed to fail the not-a-multiple > case as you remarked? But it certainly can't do that, when that > size passed in is already hugepage-aligned.. > > It feels like a kernel bug as you suspect :-/ Certainly is > And I just found one more weird thing... > > In iommufd.c selftest code, we have: > "static __attribute__((constructor)) void setup_sizes(void)" > where it does another pair of posix_memalign/mmap, although this > one doesn't flag MAP_HUGETLB and shouldn't impact what is coming > to the next... This could all just be more weirdness from the above, it doesn't really make alot of sense. I think change things so the MAP_HUGETLB test all skip if HUGEPAGE_SIZE < buffer_size and move on.. Can't run those tests on ARM64 64k which is unfortunate.. I thought there were patches to give that config a 2M huge page size option based on the new contiguous page support though? Maybe it was only THPS.. Jason
On Thu, Jun 12, 2025 at 10:58:02AM -0300, Jason Gunthorpe wrote: > On Wed, Jun 11, 2025 at 11:59:00PM -0700, Nicolin Chen wrote: > > > We can see the 64MB was rounded up to 512MB by ksys_mmap_pgoff() > > when being passed in to hugetlb_file_setup() at: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mmap.c?h=v6.16-rc1#n594 > > " len = ALIGN(len, huge_page_size(hs)); " > > > > By looking at the comments here..: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/hugetlbfs/inode.c#n1521 > > " > > /* > > * Note that size should be aligned to proper hugepage size in caller side, > > * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. > > */ > > struct file *hugetlb_file_setup(const char *name, size_t size, > > " > > > > ..I guess this function was supposed to fail the not-a-multiple > > case as you remarked? But it certainly can't do that, when that > > size passed in is already hugepage-aligned.. > > > > It feels like a kernel bug as you suspect :-/ > > Certainly is > > > And I just found one more weird thing... > > > > In iommufd.c selftest code, we have: > > "static __attribute__((constructor)) void setup_sizes(void)" > > where it does another pair of posix_memalign/mmap, although this > > one doesn't flag MAP_HUGETLB and shouldn't impact what is coming > > to the next... > > This could all just be more weirdness from the above, it doesn't > really make alot of sense. > > I think change things so the MAP_HUGETLB test all skip if > HUGEPAGE_SIZE < buffer_size and move on.. > > Can't run those tests on ARM64 64k which is unfortunate.. I thought > there were patches to give that config a 2M huge page size option > based on the new contiguous page support though? Maybe it was only THPS.. If the assumption is that this is most likely a kernel bug, shouldn't it be fixed properly rather than worked around? After all the job of a selftest is to detect bugs to be fixed. But I wasn't able to follow all of your discussions, so I may be missing something. If the test is broken on ARM64 64k in general then I am also wondering how it didn't fail before my change to the selftest harness. Thomas
On Thu, Jun 12, 2025 at 04:27:41PM +0200, Thomas Weißschuh wrote: > If the assumption is that this is most likely a kernel bug, > shouldn't it be fixed properly rather than worked around? > After all the job of a selftest is to detect bugs to be fixed. I investigated the history for a bit and it seems likely we cannot change the kernel here. Call it an undocumented "feature". MAP_HUGETLBFS rounds up the length to some value, userspace has to figure that out and not pass incorrect lengths. The selftest is doing that wrong. > If the test is broken on ARM64 64k in general then I am also wondering how > it didn't fail before my change to the selftest harness. It got lucky and didn't overmap something important. Jason
On Thu, Jun 12, 2025 at 11:58:01AM -0300, Jason Gunthorpe wrote: > On Thu, Jun 12, 2025 at 04:27:41PM +0200, Thomas Weißschuh wrote: > > > If the assumption is that this is most likely a kernel bug, > > shouldn't it be fixed properly rather than worked around? > > After all the job of a selftest is to detect bugs to be fixed. > > I investigated the history for a bit and it seems likely we cannot > change the kernel here. Call it an undocumented "feature". I looked a bit and it seems to be mentioned in mmap(2): For mmap(), offset must be a multiple of the underlying huge page size. The system automatically aligns length to be a multiple of the underlying huge page size. And MAP_FIXED is documented to wipe away whichever mapping was there before. > MAP_HUGETLBFS rounds up the length to some value, userspace has to > figure that out and not pass incorrect lengths. The selftest is doing > that wrong. The selftest would be more robust if MAP_FIXED is replaced by MAP_FIXED_NOREPLACE. Even with the new explicit skip logic it should make debugging easier if something goes wrong. > > If the test is broken on ARM64 64k in general then I am also wondering how > > it didn't fail before my change to the selftest harness. > > It got lucky and didn't overmap something important. Oh, okay.
On Thu, Jun 12, 2025 at 05:23:01PM +0200, Thomas Weißschuh wrote: > On Thu, Jun 12, 2025 at 11:58:01AM -0300, Jason Gunthorpe wrote: > > On Thu, Jun 12, 2025 at 04:27:41PM +0200, Thomas Weißschuh wrote: > > > > > If the assumption is that this is most likely a kernel bug, > > > shouldn't it be fixed properly rather than worked around? > > > After all the job of a selftest is to detect bugs to be fixed. > > > > I investigated the history for a bit and it seems likely we cannot > > change the kernel here. Call it an undocumented "feature". > > I looked a bit and it seems to be mentioned in mmap(2): > > For mmap(), offset must be a multiple of the underlying huge page size. > The system automatically aligns length to be a multiple of the underlying huge page size. Oh there you go then :) Horrible design. No way for userspace to know what the rounded up length actually was and thus no way for userspace to unmap it. > > MAP_HUGETLBFS rounds up the length to some value, userspace has to > > figure that out and not pass incorrect lengths. The selftest is doing > > that wrong. > > The selftest would be more robust if MAP_FIXED is replaced by > MAP_FIXED_NOREPLACE. Even with the new explicit skip logic it should > make debugging easier if something goes wrong. The point is to replace something that is already mapped there, though I no longer remember why it is working like this. Jason
On Thu, Jun 12, 2025 at 12:42:42PM -0300, Jason Gunthorpe wrote: > On Thu, Jun 12, 2025 at 05:23:01PM +0200, Thomas Weißschuh wrote: > > On Thu, Jun 12, 2025 at 11:58:01AM -0300, Jason Gunthorpe wrote: > > > On Thu, Jun 12, 2025 at 04:27:41PM +0200, Thomas Weißschuh wrote: > > > > > > > If the assumption is that this is most likely a kernel bug, > > > > shouldn't it be fixed properly rather than worked around? > > > > After all the job of a selftest is to detect bugs to be fixed. > > > > > > I investigated the history for a bit and it seems likely we cannot > > > change the kernel here. Call it an undocumented "feature". > > > > I looked a bit and it seems to be mentioned in mmap(2): > > > > For mmap(), offset must be a multiple of the underlying huge page size. > > The system automatically aligns length to be a multiple of the underlying huge page size. > > Oh there you go then :) Horrible design. No way for userspace to know > what the rounded up length actually was and thus no way for > userspace to unmap it. OK. I think we would have to skip those cases then. > > > MAP_HUGETLBFS rounds up the length to some value, userspace has to > > > figure that out and not pass incorrect lengths. The selftest is doing > > > that wrong. > > > > The selftest would be more robust if MAP_FIXED is replaced by > > MAP_FIXED_NOREPLACE. Even with the new explicit skip logic it should > > make debugging easier if something goes wrong. > > The point is to replace something that is already mapped there, though > I no longer remember why it is working like this. By replacing MAP_FIXED with MAP_FIXED_NOREPLACE, at the existing two places, the selftest crashed at early setup_sizes...: iommufd: iommufd.c:53: setup_sizes: Assertion `vrc == buffer' failed. /nicolinc/iommufd_selftest.sh: line 19: 21487 Aborted (core dumped) tools/testing/selftests/iommu/iommufd strace: mmap(0xffff80000000, 1048576, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0) = -1 EEXIST (File exists) This one doesn't MAP_HUGETLBFS btw... Thanks Nicolin
On Thu, Jun 12, 2025 at 10:53:34AM -0700, Nicolin Chen wrote:
> On Thu, Jun 12, 2025 at 12:42:42PM -0300, Jason Gunthorpe wrote:
> > On Thu, Jun 12, 2025 at 05:23:01PM +0200, Thomas Weißschuh wrote:
> > > On Thu, Jun 12, 2025 at 11:58:01AM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Jun 12, 2025 at 04:27:41PM +0200, Thomas Weißschuh wrote:
> > > >
> > > > > If the assumption is that this is most likely a kernel bug,
> > > > > shouldn't it be fixed properly rather than worked around?
> > > > > After all the job of a selftest is to detect bugs to be fixed.
> > > >
> > > > I investigated the history for a bit and it seems likely we cannot
> > > > change the kernel here. Call it an undocumented "feature".
> > >
> > > I looked a bit and it seems to be mentioned in mmap(2):
> > >
> > > For mmap(), offset must be a multiple of the underlying huge page size.
> > > The system automatically aligns length to be a multiple of the underlying huge page size.
> >
> > Oh there you go then :) Horrible design. No way for userspace to know
> > what the rounded up length actually was and thus no way for
> > userspace to unmap it.
>
> OK. I think we would have to skip those cases then.
Or.. maybe we could just allocate a huge page:
@@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
self->fd = open("/dev/iommu", O_RDWR);
ASSERT_NE(-1, self->fd);
- rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
+ if (variant->hugepages) {
+ /*
+ * Allocation must be aligned to the HUGEPAGE_SIZE, because the
+ * following mmap() will automatically align the length to be a
+ * multiple of the underlying huge page size. Failing to do the
+ * same at this allocation will result in a memory overwrite by
+ * the mmap().
+ */
+ size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
+ } else {
+ size = variant->buffer_size;
+ }
+ rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
if (rc || !self->buffer) {
SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
variant->buffer_size, rc);
It can just upsize the allocation, i.e. the test case will only
use the first 64M or 128MB out of the reserved 512MB huge page.
Thanks
Nicolin
On Thu, Jun 12, 2025 at 11:53:24AM -0700, Nicolin Chen wrote:
> @@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
> self->fd = open("/dev/iommu", O_RDWR);
> ASSERT_NE(-1, self->fd);
>
> - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> + if (variant->hugepages) {
> + /*
> + * Allocation must be aligned to the HUGEPAGE_SIZE, because the
> + * following mmap() will automatically align the length to be a
> + * multiple of the underlying huge page size. Failing to do the
> + * same at this allocation will result in a memory overwrite by
> + * the mmap().
> + */
> + size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> + } else {
> + size = variant->buffer_size;
> + }
> + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
> if (rc || !self->buffer) {
> SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
> variant->buffer_size, rc);
>
> It can just upsize the allocation, i.e. the test case will only
> use the first 64M or 128MB out of the reserved 512MB huge page.
The MAP_HUGETLBFS is required that is the whole point of what it is
doing..
Jason
On Thu, Jun 12, 2025 at 03:56:13PM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 12, 2025 at 11:53:24AM -0700, Nicolin Chen wrote:
> > @@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
> > self->fd = open("/dev/iommu", O_RDWR);
> > ASSERT_NE(-1, self->fd);
> >
> > - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> > + if (variant->hugepages) {
> > + /*
> > + * Allocation must be aligned to the HUGEPAGE_SIZE, because the
> > + * following mmap() will automatically align the length to be a
> > + * multiple of the underlying huge page size. Failing to do the
> > + * same at this allocation will result in a memory overwrite by
> > + * the mmap().
> > + */
> > + size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> > + } else {
> > + size = variant->buffer_size;
> > + }
> > + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
> > if (rc || !self->buffer) {
> > SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
> > variant->buffer_size, rc);
> >
> > It can just upsize the allocation, i.e. the test case will only
> > use the first 64M or 128MB out of the reserved 512MB huge page.
>
> The MAP_HUGETLBFS is required that is the whole point of what it is
> doing..
I am not quite following this.. MAP_HUGETLB will be still set.
And the underlying selftest case is using:
MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE
Does it matter if the underlying allocation has an overshot?
Thanks
Nicolin
On Thu, Jun 12, 2025 at 12:03:14PM -0700, Nicolin Chen wrote:
> On Thu, Jun 12, 2025 at 03:56:13PM -0300, Jason Gunthorpe wrote:
> > On Thu, Jun 12, 2025 at 11:53:24AM -0700, Nicolin Chen wrote:
> > > @@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
> > > self->fd = open("/dev/iommu", O_RDWR);
> > > ASSERT_NE(-1, self->fd);
> > >
> > > - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> > > + if (variant->hugepages) {
> > > + /*
> > > + * Allocation must be aligned to the HUGEPAGE_SIZE, because the
> > > + * following mmap() will automatically align the length to be a
> > > + * multiple of the underlying huge page size. Failing to do the
> > > + * same at this allocation will result in a memory overwrite by
> > > + * the mmap().
> > > + */
> > > + size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> > > + } else {
> > > + size = variant->buffer_size;
> > > + }
> > > + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
> > > if (rc || !self->buffer) {
> > > SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
> > > variant->buffer_size, rc);
> > >
> > > It can just upsize the allocation, i.e. the test case will only
> > > use the first 64M or 128MB out of the reserved 512MB huge page.
> >
> > The MAP_HUGETLBFS is required that is the whole point of what it is
> > doing..
>
> I am not quite following this.. MAP_HUGETLB will be still set.
>
> And the underlying selftest case is using:
> MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE
>
> Does it matter if the underlying allocation has an overshot?
I expect munmap won't work with the wrong size and the test will OOM?
You'd be better to correct the actual variant->buffer_size..
Jason
On Thu, Jun 12, 2025 at 08:31:38PM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 12, 2025 at 12:03:14PM -0700, Nicolin Chen wrote:
> > On Thu, Jun 12, 2025 at 03:56:13PM -0300, Jason Gunthorpe wrote:
> > > On Thu, Jun 12, 2025 at 11:53:24AM -0700, Nicolin Chen wrote:
> > > > @@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
> > > > self->fd = open("/dev/iommu", O_RDWR);
> > > > ASSERT_NE(-1, self->fd);
> > > >
> > > > - rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
> > > > + if (variant->hugepages) {
> > > > + /*
> > > > + * Allocation must be aligned to the HUGEPAGE_SIZE, because the
> > > > + * following mmap() will automatically align the length to be a
> > > > + * multiple of the underlying huge page size. Failing to do the
> > > > + * same at this allocation will result in a memory overwrite by
> > > > + * the mmap().
> > > > + */
> > > > + size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> > > > + } else {
> > > > + size = variant->buffer_size;
> > > > + }
> > > > + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
> > > > if (rc || !self->buffer) {
> > > > SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
> > > > variant->buffer_size, rc);
> > > >
> > > > It can just upsize the allocation, i.e. the test case will only
> > > > use the first 64M or 128MB out of the reserved 512MB huge page.
> > >
> > > The MAP_HUGETLBFS is required that is the whole point of what it is
> > > doing..
> >
> > I am not quite following this.. MAP_HUGETLB will be still set.
> >
> > And the underlying selftest case is using:
> > MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE
> >
> > Does it matter if the underlying allocation has an overshot?
>
> I expect munmap won't work with the wrong size and the test will OOM?
>
> You'd be better to correct the actual variant->buffer_size..
I saw test passing, before I posted that.
But you are certainly right: while mmap() handling MAP_HUGETLB will
align up the size, the munmap() doesn't. So, passing in to them the
same variant->buffer_size will result in a size mismatch.
I don't think we should change the variant->buffer_size, because it
affects the bitmap sizes in those dirty_tracking test cases. And if
we align up every single variant->buffer_size, the variants of 64MB
and 128Mb will be two duplicated 512MB cases, right?
I think we can just add this on top of that:
FIXTURE_TEARDOWN(iommufd_dirty_tracking)
{
- munmap(self->buffer, variant->buffer_size);
- munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
+ unsigned long size = variant->buffer_size;
+
+ if (variant->hugepages)
+ size = __ALIGN_KERNEL(size, HUGEPAGE_SIZE);
+ munmap(self->buffer, size);
+ free(self->buffer);
+ free(self->bitmap);
teardown_iommufd(self->fd, _metadata);
}
This FIXTURE_TEARDOWN() didn't free the memory allocated by the two
posix_memalign calls in the FIXTURE_SETUP()..
Thanks
Nicolin
© 2016 - 2025 Red Hat, Inc.