Hi! New version of the futex2 patches. Futex2 is a new interface to the same 'old' futex core. An attempt to get away from the multiplex syscall and add a little room for extensions. Changes since v2: - Rebased to v6.6-rc - New FUTEX_STRICT flag (Andre) - Reordered futex_size() helper (tglx) - Updated some comments (tglx) - Folded some tags My plan is to push the first 10 patches (all the syscalls) into tip/locking/core this afternoon. All those patches have plenty of review tags including from Thomas who is the actual maintainer of this lot :-) This should be plenty for Jens to get a move on with the io-uring stuff. I'm holding off on the NUMA bits for now, because I want to write some userspace for it since there is some confusion on that -- but I seem to keep getting side-tracked :/ Patches also available at: git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/core git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/futex Where the locking/core thing is the first 10 patches only, and barring Link tags (which I'll harvest from this posting), will be what I'll push out to tip.
Hi! Updated version of patch 15/15 and a few extra patches for testing the FUTEX2_NUMA bits. The last patch (17/15) should never be applied for anything you care about and exists purely because I'm too lazy to generate actual hash-bucket contention. On my 2 node IVB-EP: $ echo FUTEX_SQUASH > /debug/sched/features Effectively reducing each node to 1 bucket. $ numactl -m0 -N0 ./futex_numa -c10 -t2 -n0 -N0 & numactl -m1 -N1 ./futex_numa -c10 -t2 -n0 -N0 ... contenders: 16154935 contenders: 16202472 $ numactl -m0 -N0 ./futex_numa -c10 -t2 -n0 -N0 & numactl -m1 -N1 ./futex_numa -c10 -t2 -n0 -N1 contenders: 48584991 contenders: 48680560 (loop counts, higher is better) Clearly showing how separating the hashes works. The first one runs 10 contenders on each node but forces the (numa) futex to hash to node 0 for both. This ensures all 20 contenders hash to the same bucket and *ouch*. The second one does the same, except now fully separates the nodes. Performance is much improved. Proving the per-node hashing actually works as advertised. Further: $ ./futex_numa -t2 -n50000 -s1 -N ... node: -1 node: -1 node: 0 node: 0 node: -1 node: -1 node: 1 node: 1 ... total: 8980 Shows how a FUTEX2_NUMA lock can bounce around the nodes. The test has some trivial asserts trying to show critical section integrity, but otherwise does lock+unlock cycles with a nanosleep. This both illustrates how to build a (trivial) lock using FUTEX2_NUMA and proves the functionality works.
On Fri, 22 Sep 2023, Peter Zijlstra wrote: >Hi! > >Updated version of patch 15/15 and a few extra patches for testing the >FUTEX2_NUMA bits. The last patch (17/15) should never be applied for anything >you care about and exists purely because I'm too lazy to generate actual >hash-bucket contention. > >On my 2 node IVB-EP: > > $ echo FUTEX_SQUASH > /debug/sched/features > >Effectively reducing each node to 1 bucket. > > $ numactl -m0 -N0 ./futex_numa -c10 -t2 -n0 -N0 & > numactl -m1 -N1 ./futex_numa -c10 -t2 -n0 -N0 > > ... > contenders: 16154935 > contenders: 16202472 > > $ numactl -m0 -N0 ./futex_numa -c10 -t2 -n0 -N0 & > numactl -m1 -N1 ./futex_numa -c10 -t2 -n0 -N1 > > contenders: 48584991 > contenders: 48680560 > >(loop counts, higher is better) > >Clearly showing how separating the hashes works. > >The first one runs 10 contenders on each node but forces the (numa) futex to >hash to node 0 for both. This ensures all 20 contenders hash to the same >bucket and *ouch*. > >The second one does the same, except now fully separates the nodes. Performance >is much improved. > >Proving the per-node hashing actually works as advertised. Very nice.
Extend the wait/requeue selftests to also cover the futex2 syscalls.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
tools/testing/selftests/futex/functional/futex_requeue.c | 100 +++++++++-
tools/testing/selftests/futex/functional/futex_wait.c | 56 ++++-
tools/testing/selftests/futex/functional/futex_wait_timeout.c | 16 +
tools/testing/selftests/futex/functional/futex_wait_wouldblock.c | 28 ++
tools/testing/selftests/futex/functional/futex_waitv.c | 15 -
tools/testing/selftests/futex/functional/run.sh | 6
tools/testing/selftests/futex/include/futex2test.h | 52 +++++
7 files changed, 243 insertions(+), 30 deletions(-)
--- a/tools/testing/selftests/futex/functional/futex_requeue.c
+++ b/tools/testing/selftests/futex/functional/futex_requeue.c
@@ -7,8 +7,10 @@
#include <pthread.h>
#include <limits.h>
+#include <stdbool.h>
#include "logging.h"
#include "futextest.h"
+#include "futex2test.h"
#define TEST_NAME "futex-requeue"
#define timeout_ns 30000000
@@ -16,24 +18,58 @@
volatile futex_t *f1;
+bool futex2 = 0;
+bool mixed = 0;
+
+/*
+ * Print the command-line options of this test to stdout.
+ * Note that -x also enables the futex2 interface: its case in main() falls
+ * through to the -n case.
+ */
void usage(char *prog)
{
printf("Usage: %s\n", prog);
printf(" -c Use color\n");
+ printf(" -n Use futex2 interface\n");
+ printf(" -x Use mixed size futex\n");
printf(" -h Display this help message\n");
printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
VQUIET, VCRITICAL, VINFO);
}
-void *waiterfn(void *arg)
+/*
+ * Waiter thread: block on f1 until woken or until the timeout expires.
+ *
+ * With -n/-x the futex2_wait() syscall is used and is given an absolute
+ * CLOCK_MONOTONIC deadline (now + timeout_ns); otherwise the legacy
+ * futex_wait() is called with timeout_ns as-is. A failing wait is only
+ * reported on stdout; the thread always returns NULL.
+ */
+static void *waiterfn(void *arg)
{
+ unsigned int flags = 0;
struct timespec to;
- to.tv_sec = 0;
- to.tv_nsec = timeout_ns;
+ if (futex2) {
+ unsigned long mask;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &to)) {
+ printf("clock_gettime() failed errno %d\n", errno);
+ return NULL;
+ }
+
+ /* turn "now" into the deadline, keeping tv_nsec below one second */
+ to.tv_nsec += timeout_ns;
+ if (to.tv_nsec >= 1000000000) {
+ to.tv_sec++;
+ to.tv_nsec -= 1000000000;
+ }
+
+ /* pick the futex size and an all-ones mask of that same width */
+ if (mixed) {
+ flags |= FUTEX2_SIZE_U16;
+ mask = (unsigned short)(~0U);
+ } else {
+ flags |= FUTEX2_SIZE_U32;
+ mask = (unsigned int)(~0U);
+ }
+
+ if (futex2_wait(f1, *f1, mask, flags,
+ &to, CLOCK_MONOTONIC))
+ printf("waiter failed errno %d\n", errno);
+ } else {
+
+ to.tv_sec = 0;
+ to.tv_nsec = timeout_ns;
- if (futex_wait(f1, *f1, &to, 0))
- printf("waiter failed errno %d\n", errno);
+ if (futex_wait(f1, *f1, &to, flags))
+ printf("waiter failed errno %d\n", errno);
+ }
return NULL;
}
@@ -48,7 +84,7 @@ int main(int argc, char *argv[])
f1 = &_f1;
- while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+ while ((c = getopt(argc, argv, "xncht:v:")) != -1) {
switch (c) {
case 'c':
log_color(1);
@@ -59,6 +95,12 @@ int main(int argc, char *argv[])
case 'v':
log_verbosity(atoi(optarg));
break;
+ case 'x':
+ mixed=1;
+ /* fallthrough */
+ case 'n':
+ futex2=1;
+ break;
default:
usage(basename(argv[0]));
exit(1);
@@ -79,7 +121,22 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
info("Requeuing 1 futex from f1 to f2\n");
- res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0);
+ if (futex2) {
+ struct futex_waitv futexes[2] = {
+ {
+ .val = 0,
+ .uaddr = (unsigned long)f1,
+ .flags = mixed ? FUTEX2_SIZE_U16 : FUTEX2_SIZE_U32,
+ },
+ {
+ .uaddr = (unsigned long)&f2,
+ .flags = FUTEX2_SIZE_U32,
+ },
+ };
+ res = futex2_requeue(futexes, 0, 0, 1);
+ } else {
+ res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0);
+ }
if (res != 1) {
ksft_test_result_fail("futex_requeue simple returned: %d %s\n",
res ? errno : res,
@@ -89,7 +146,11 @@ int main(int argc, char *argv[])
info("Waking 1 futex at f2\n");
- res = futex_wake(&f2, 1, 0);
+ if (futex2) {
+ res = futex2_wake(&f2, ~0U, 1, FUTEX2_SIZE_U32);
+ } else {
+ res = futex_wake(&f2, 1, 0);
+ }
if (res != 1) {
ksft_test_result_fail("futex_requeue simple returned: %d %s\n",
res ? errno : res,
@@ -112,7 +173,22 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n");
- res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0);
+ if (futex2) {
+ struct futex_waitv futexes[2] = {
+ {
+ .val = 0,
+ .uaddr = (unsigned long)f1,
+ .flags = mixed ? FUTEX2_SIZE_U16 : FUTEX2_SIZE_U32,
+ },
+ {
+ .uaddr = (unsigned long)&f2,
+ .flags = FUTEX2_SIZE_U32,
+ },
+ };
+ res = futex2_requeue(futexes, 0, 3, 7);
+ } else {
+ res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0);
+ }
if (res != 10) {
ksft_test_result_fail("futex_requeue many returned: %d %s\n",
res ? errno : res,
@@ -121,7 +197,11 @@ int main(int argc, char *argv[])
}
info("Waking INT_MAX futexes at f2\n");
- res = futex_wake(&f2, INT_MAX, 0);
+ if (futex2) {
+ res = futex2_wake(&f2, ~0U, INT_MAX, FUTEX2_SIZE_U32);
+ } else {
+ res = futex_wake(&f2, INT_MAX, 0);
+ }
if (res != 7) {
ksft_test_result_fail("futex_requeue many returned: %d %s\n",
res ? errno : res,
--- a/tools/testing/selftests/futex/functional/futex_wait.c
+++ b/tools/testing/selftests/futex/functional/futex_wait.c
@@ -9,8 +9,10 @@
#include <sys/shm.h>
#include <sys/mman.h>
#include <fcntl.h>
+#include <stdbool.h>
#include "logging.h"
#include "futextest.h"
+#include "futex2test.h"
#define TEST_NAME "futex-wait"
#define timeout_ns 30000000
@@ -19,10 +21,13 @@
void *futex;
+bool futex2 = 0;
+
void usage(char *prog)
{
printf("Usage: %s\n", prog);
printf(" -c Use color\n");
+ printf(" -n Use futex2 interface\n");
printf(" -h Display this help message\n");
printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
VQUIET, VCRITICAL, VINFO);
@@ -30,17 +35,35 @@ void usage(char *prog)
+/*
+ * Waiter thread: block on the global futex (expected value 0) until woken or
+ * until the timeout expires. @arg, when non-NULL, points to the futex flags
+ * to use.
+ *
+ * With -n the futex2_wait() syscall is used and is given an absolute
+ * CLOCK_MONOTONIC deadline (now + timeout_ns); otherwise the legacy
+ * futex_wait() is called with timeout_ns as-is. A failing wait is only
+ * reported on stdout; the thread always returns NULL.
+ */
static void *waiterfn(void *arg)
{
- struct timespec to;
unsigned int flags = 0;
+ struct timespec to;
if (arg)
flags = *((unsigned int *) arg);
- to.tv_sec = 0;
- to.tv_nsec = timeout_ns;
+ if (futex2) {
+ if (clock_gettime(CLOCK_MONOTONIC, &to)) {
+ printf("clock_gettime() failed errno %d\n", errno);
+ return NULL;
+ }
- if (futex_wait(futex, 0, &to, flags))
- printf("waiter failed errno %d\n", errno);
+ /* turn "now" into the deadline, keeping tv_nsec below one second */
+ to.tv_nsec += timeout_ns;
+ if (to.tv_nsec >= 1000000000) {
+ to.tv_sec++;
+ to.tv_nsec -= 1000000000;
+ }
+
+ if (futex2_wait(futex, 0, ~0U, flags | FUTEX2_SIZE_U32,
+ &to, CLOCK_MONOTONIC))
+ printf("waiter failed errno %d\n", errno);
+ } else {
+
+ to.tv_sec = 0;
+ to.tv_nsec = timeout_ns;
+
+ if (futex_wait(futex, 0, &to, flags))
+ printf("waiter failed errno %d\n", errno);
+ }
return NULL;
}
@@ -55,7 +78,7 @@ int main(int argc, char *argv[])
futex = &f_private;
- while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+ while ((c = getopt(argc, argv, "ncht:v:")) != -1) {
switch (c) {
case 'c':
log_color(1);
@@ -66,6 +89,9 @@ int main(int argc, char *argv[])
case 'v':
log_verbosity(atoi(optarg));
break;
+ case 'n':
+ futex2=1;
+ break;
default:
usage(basename(argv[0]));
exit(1);
@@ -84,7 +110,11 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
info("Calling private futex_wake on futex: %p\n", futex);
- res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG);
+ if (futex2) {
+ res = futex2_wake(futex, ~0U, 1, FUTEX2_SIZE_U32 | FUTEX2_PRIVATE);
+ } else {
+ res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG);
+ }
if (res != 1) {
ksft_test_result_fail("futex_wake private returned: %d %s\n",
errno, strerror(errno));
@@ -112,7 +142,11 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
info("Calling shared (page anon) futex_wake on futex: %p\n", futex);
- res = futex_wake(futex, 1, 0);
+ if (futex2) {
+ res = futex2_wake(futex, ~0U, 1, FUTEX2_SIZE_U32);
+ } else {
+ res = futex_wake(futex, 1, 0);
+ }
if (res != 1) {
ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n",
errno, strerror(errno));
@@ -151,7 +185,11 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
info("Calling shared (file backed) futex_wake on futex: %p\n", futex);
- res = futex_wake(shm, 1, 0);
+ if (futex2) {
+ res = futex2_wake(shm, ~0U, 1, FUTEX2_SIZE_U32);
+ } else {
+ res = futex_wake(shm, 1, 0);
+ }
if (res != 1) {
ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n",
errno, strerror(errno));
--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c
+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
struct futex_waitv waitv = {
.uaddr = (uintptr_t)&f1,
.val = f1,
- .flags = FUTEX_32,
+ .flags = FUTEX2_SIZE_U32,
.__reserved = 0
};
@@ -128,7 +128,7 @@ int main(int argc, char *argv[])
}
ksft_print_header();
- ksft_set_plan(9);
+ ksft_set_plan(11);
ksft_print_msg("%s: Block on a futex and wait for timeout\n",
basename(argv[0]));
ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
@@ -201,6 +201,18 @@ int main(int argc, char *argv[])
res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME);
test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT);
+ /* futex2_wait with CLOCK_MONOTONIC */
+ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns))
+ return RET_FAIL;
+ res = futex2_wait(&f1, f1, 1, FUTEX2_SIZE_U32, &to, CLOCK_MONOTONIC);
+ test_timeout(res, &ret, "futex2_wait monotonic", ETIMEDOUT);
+
+ /* futex2_wait with CLOCK_REALTIME */
+ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns))
+ return RET_FAIL;
+ res = futex2_wait(&f1, f1, 1, FUTEX2_SIZE_U32, &to, CLOCK_REALTIME);
+ test_timeout(res, &ret, "futex2_wait realtime", ETIMEDOUT);
+
ksft_print_cnts();
return ret;
}
--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
struct futex_waitv waitv = {
.uaddr = (uintptr_t)&f1,
.val = f1+1,
- .flags = FUTEX_32,
+ .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE,
.__reserved = 0
};
@@ -68,7 +68,7 @@ int main(int argc, char *argv[])
}
ksft_print_header();
- ksft_set_plan(2);
+ ksft_set_plan(3);
ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n",
basename(argv[0]));
@@ -106,6 +106,30 @@ int main(int argc, char *argv[])
ksft_test_result_pass("futex_waitv\n");
}
+ if (clock_gettime(CLOCK_MONOTONIC, &to)) {
+ error("clock_gettime failed\n", errno);
+ return errno;
+ }
+
+ to.tv_nsec += timeout_ns;
+
+ if (to.tv_nsec >= 1000000000) {
+ to.tv_sec++;
+ to.tv_nsec -= 1000000000;
+ }
+
+ /*
+ * val is deliberately f1+1, so the kernel must refuse to queue and the
+ * call must fail with EWOULDBLOCK; anything else is a test failure.
+ */
+ info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
+ res = futex2_wait(&f1, f1+1, ~0U, FUTEX2_SIZE_U32 | FUTEX2_PRIVATE,
+ &to, CLOCK_MONOTONIC);
+ if (!res || errno != EWOULDBLOCK) {
+ ksft_test_result_fail("futex2_wait returned: %d %s\n",
+ res ? errno : res,
+ res ? strerror(errno) : "");
+ ret = RET_FAIL;
+ } else {
+ ksft_test_result_pass("futex2_wait\n");
+ }
+
ksft_print_cnts();
return ret;
}
--- a/tools/testing/selftests/futex/functional/futex_waitv.c
+++ b/tools/testing/selftests/futex/functional/futex_waitv.c
@@ -88,7 +88,7 @@ int main(int argc, char *argv[])
for (i = 0; i < NR_FUTEXES; i++) {
waitv[i].uaddr = (uintptr_t)&futexes[i];
- waitv[i].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;
+ waitv[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;
waitv[i].val = 0;
waitv[i].__reserved = 0;
}
@@ -99,7 +99,8 @@ int main(int argc, char *argv[])
usleep(WAKE_WAIT_US);
- res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, FUTEX_PRIVATE_FLAG);
+ res = futex2_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), ~0U, 1,
+ FUTEX2_PRIVATE | FUTEX2_SIZE_U32);
if (res != 1) {
ksft_test_result_fail("futex_wake private returned: %d %s\n",
res ? errno : res,
@@ -122,7 +123,7 @@ int main(int argc, char *argv[])
*shared_data = 0;
waitv[i].uaddr = (uintptr_t)shared_data;
- waitv[i].flags = FUTEX_32;
+ waitv[i].flags = FUTEX2_SIZE_U32;
waitv[i].val = 0;
waitv[i].__reserved = 0;
}
@@ -145,8 +146,8 @@ int main(int argc, char *argv[])
for (i = 0; i < NR_FUTEXES; i++)
shmdt(u64_to_ptr(waitv[i].uaddr));
- /* Testing a waiter without FUTEX_32 flag */
- waitv[0].flags = FUTEX_PRIVATE_FLAG;
+ /* Testing a waiter without FUTEX2_SIZE_U32 flag */
+ waitv[0].flags = FUTEX2_PRIVATE;
if (clock_gettime(CLOCK_MONOTONIC, &to))
error("gettime64 failed\n", errno);
@@ -160,11 +161,11 @@ int main(int argc, char *argv[])
res ? strerror(errno) : "");
ret = RET_FAIL;
} else {
- ksft_test_result_pass("futex_waitv without FUTEX_32\n");
+ ksft_test_result_pass("futex_waitv without FUTEX2_SIZE_U32\n");
}
/* Testing a waiter with an unaligned address */
- waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32;
+ waitv[0].flags = FUTEX2_PRIVATE | FUTEX2_SIZE_U32;
waitv[0].uaddr = 1;
if (clock_gettime(CLOCK_MONOTONIC, &to))
--- a/tools/testing/selftests/futex/functional/run.sh
+++ b/tools/testing/selftests/futex/functional/run.sh
@@ -76,9 +76,15 @@ echo
echo
./futex_wait $COLOR
+echo
+./futex_wait -n $COLOR
echo
./futex_requeue $COLOR
+echo
+./futex_requeue -n $COLOR
+echo
+./futex_requeue -x $COLOR
echo
./futex_waitv $COLOR
--- a/tools/testing/selftests/futex/include/futex2test.h
+++ b/tools/testing/selftests/futex/include/futex2test.h
@@ -8,6 +8,41 @@
#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
+#ifndef __NR_futex_waitv
+#define __NR_futex_waitv 449
+
+/*
+ * Fallback definition, only compiled when the system headers do not provide
+ * __NR_futex_waitv. One entry describes one futex for futex_waitv() and
+ * futex2_requeue().
+ */
+struct futex_waitv {
+ __u64 val; /* value the tests expect the futex word to hold */
+ __u64 uaddr; /* futex address, stored as an integer */
+ __u32 flags; /* FUTEX2_* flags for this futex */
+ __u32 __reserved; /* the tests always set this to 0 */
+};
+#endif
+
+#ifndef __NR_futex_wake
+#define __NR_futex_wake 454
+#define __NR_futex_wait 455
+#define __NR_futex_requeue 456
+#endif
+
+#ifndef FUTEX2_SIZE_U8
+/*
+ * Flags for futex2 syscalls.
+ */
+#define FUTEX2_SIZE_U8 0x00
+#define FUTEX2_SIZE_U16 0x01
+#define FUTEX2_SIZE_U32 0x02
+#define FUTEX2_SIZE_U64 0x03
+#define FUTEX2_NUMA 0x04
+ /* 0x08 */
+ /* 0x10 */
+ /* 0x20 */
+ /* 0x40 */
+#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG
+#endif
+
+#define FUTEX_NO_NODE (-1)
+
/**
* futex_waitv - Wait at multiple futexes, wake on any
* @waiters: Array of waiters
@@ -20,3 +55,20 @@ static inline int futex_waitv(volatile s
{
return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid);
}
+
+/**
+ * futex2_wake - Thin wrapper around the futex_wake syscall
+ * @uaddr: Address of the futex whose waiters should be woken
+ * @mask: Bitmask handed to the kernel; every caller in these tests passes ~0U
+ * @nr: Upper bound on the number of waiters to wake
+ * @flags: FUTEX2_* flags (futex size, optionally FUTEX2_PRIVATE / FUTEX2_NUMA)
+ *
+ * Returns the raw syscall() result; the tests compare it against the number
+ * of waiters they expect to have been woken.
+ */
+static inline int futex2_wake(volatile void *uaddr, unsigned long mask, int nr, unsigned int flags)
+{
+ return syscall(__NR_futex_wake, uaddr, mask, nr, flags);
+}
+
+/**
+ * futex2_wait - Thin wrapper around the futex_wait syscall
+ * @uaddr: Address of the futex to wait on
+ * @val: Value the futex word is expected to hold; the wouldblock test passes
+ * a mismatching value and expects EWOULDBLOCK
+ * @mask: Bitmask handed to the kernel; most callers in these tests pass ~0U
+ * @flags: FUTEX2_* flags (futex size, optionally FUTEX2_PRIVATE / FUTEX2_NUMA)
+ * @timo: Timeout, or NULL for none; the tests build it as an absolute time
+ * (current time on @clockid plus the desired delay)
+ * @clockid: Clock that @timo is expressed in
+ *
+ * Returns the raw syscall() result; callers treat non-zero as failure and
+ * inspect errno.
+ */
+static inline int futex2_wait(volatile void *uaddr, unsigned long val, unsigned long mask,
+ unsigned int flags, struct timespec *timo, clockid_t clockid)
+{
+ return syscall(__NR_futex_wait, uaddr, val, mask, flags, timo, clockid);
+}
+
+/**
+ * futex2_requeue - Thin wrapper around the futex_requeue syscall
+ * @futexes: Array of two entries as used by futex_requeue.c: [0] is the futex
+ * waiters are woken on / moved away from (with its expected value),
+ * [1] is the futex waiters are moved to
+ * @flags: Syscall flags; the tests pass 0
+ * @nr_wake: Number of waiters to wake on futexes[0]
+ * @nr_requeue: Number of waiters to move from futexes[0] to futexes[1]
+ *
+ * Returns the raw syscall() result; futex_requeue.c expects it to equal the
+ * number of woken plus requeued waiters.
+ */
+static inline int futex2_requeue(struct futex_waitv *futexes, unsigned int flags,
+ int nr_wake, int nr_requeue)
+{
+ return syscall(__NR_futex_requeue, futexes, flags, nr_wake, nr_requeue);
+}
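Add a futex_numa selftest: a trivial test-and-set lock built on futex2_wait()/futex2_wake() that can optionally use FUTEX2_NUMA (-N), plus optional contender threads (-c) that generate hash-bucket lock contention.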
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
tools/testing/selftests/futex/functional/Makefile | 3
tools/testing/selftests/futex/functional/futex_numa.c | 262 ++++++++++++++++++
2 files changed, 264 insertions(+), 1 deletion(-)
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -17,7 +17,8 @@ TEST_GEN_PROGS := \
futex_wait_private_mapped_file \
futex_wait \
futex_requeue \
- futex_waitv
+ futex_waitv \
+ futex_numa
TEST_PROGS := run.sh
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_numa.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <pthread.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <time.h>
+#include <assert.h>
+#include "logging.h"
+#include "futextest.h"
+#include "futex2test.h"
+
+typedef u_int32_t u32;
+typedef int32_t s32;
+typedef u_int64_t u64;
+
+static int fflags = (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE);
+static int fnode = FUTEX_NO_NODE;
+
+/* fairly stupid test-and-set lock with a waiter flag */
+
+#define N_LOCK 0x0000001
+#define N_WAITERS 0x0001000
+
+/*
+ * Lock word of the test lock: a 32-bit futex value immediately followed by a
+ * 32-bit node word, overlaid with a u64 so that both can be read and updated
+ * together by a single 64-bit atomic operation.
+ */
+struct futex_numa_32 {
+ union {
+ u64 full; /* val and node viewed as one 64-bit quantity */
+ struct {
+ u32 val; /* N_LOCK / N_WAITERS state bits */
+ u32 node; /* set to fnode on first lock; NOTE(review): presumably rewritten by the kernel under FUTEX2_NUMA -- confirm against the ABI */
+ };
+ };
+};
+
+/*
+ * Acquire @lock, sleeping in futex2_wait() while somebody else holds it.
+ *
+ * Every pass of the outer loop takes a fresh snapshot of the 64-bit lock
+ * word. The inner loop derives the wanted new word from that snapshot
+ * (N_LOCK always set, N_WAITERS added when the lock was already held, node
+ * reset to fnode when the word was completely idle) and tries to install it
+ * with a 64-bit compare-and-exchange, retrying with the refreshed snapshot
+ * when the exchange fails. The function returns only after this thread was
+ * the one that set N_LOCK.
+ */
+void futex_numa_32_lock(struct futex_numa_32 *lock)
+{
+ for (;;) {
+ struct futex_numa_32 new, old = {
+ .full = __atomic_load_n(&lock->full, __ATOMIC_RELAXED),
+ };
+
+ for (;;) {
+ new = old;
+ if (old.val == 0) {
+ /* no waiter, no lock -> first lock, set no-node */
+ new.node = fnode;
+ }
+ if (old.val & N_LOCK) {
+ /* contention, set waiter */
+ new.val |= N_WAITERS;
+ }
+ new.val |= N_LOCK;
+
+ /* nothing changed, ready to block */
+ if (old.full == new.full)
+ break;
+
+ /*
+ * Use u64 cmpxchg to set the futex value and node in a
+ * consistent manner.
+ */
+ if (__atomic_compare_exchange_n(&lock->full,
+ &old.full, new.full,
+ /* .weak */ false,
+ __ATOMIC_ACQUIRE,
+ __ATOMIC_RELAXED)) {
+
+ /* if we just set N_LOCK, we own it */
+ if (!(old.val & N_LOCK))
+ return;
+
+ /* go block */
+ break;
+ }
+ }
+
+ /* wait with new.val as the expected value, then start over with a fresh snapshot */
+ futex2_wait(lock, new.val, ~0U, fflags, NULL, 0);
+ }
+}
+
+/*
+ * Release @lock: drop N_LOCK with release ordering and, if the waiter flag
+ * is set, wake one waiter. When the wake reports zero woken waiters, try to
+ * clear the stale N_WAITERS flag; that compare-and-exchange only succeeds if
+ * the word is still exactly what was observed, and its outcome is ignored.
+ *
+ * NOTE(review): a negative (error) return from futex2_wake() is non-zero and
+ * is therefore treated the same as a successful wake.
+ */
+void futex_numa_32_unlock(struct futex_numa_32 *lock)
+{
+ u32 val = __atomic_sub_fetch(&lock->val, N_LOCK, __ATOMIC_RELEASE);
+ /* the lock must have been held: the subtraction may not underflow */
+ assert((s32)val >= 0);
+ if (val & N_WAITERS) {
+ int woken = futex2_wake(lock, ~0U, 1, fflags);
+ /* after dropping N_LOCK only the waiter flag may remain */
+ assert(val == N_WAITERS);
+ if (!woken) {
+ __atomic_compare_exchange_n(&lock->val, &val, 0U,
+ false, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED);
+ }
+ }
+}
+
+static long nanos = 50000;
+
+/* Per-thread state handed to threadfn() / contendfn() by main(). */
+struct thread_args {
+ pthread_t tid; /* thread handle, filled in by pthread_create() */
+ volatile int * done; /* stop flag polled by the thread loops */
+ struct futex_numa_32 *lock; /* the lock under test, shared by all threads */
+ int val; /* per-thread loop counter, summed up by main() */
+ int *val1, *val2; /* shared counters checked inside the critical section */
+ int node; /* last lock->node value this thread printed */
+};
+
+/*
+ * Locker thread: until *args->done is set, take the lock, count the
+ * iteration, check and advance the shared val1/val2 pair around a
+ * nanosleep() of 'nanos' nanoseconds, unlock, print the lock's node word
+ * whenever it differs from the last one this thread saw, and sleep again.
+ * Always returns NULL.
+ */
+static void *threadfn(void *_arg)
+{
+ struct thread_args *args = _arg;
+ struct timespec ts = {
+ .tv_nsec = nanos,
+ };
+ int node;
+
+ while (!*args->done) {
+
+ futex_numa_32_lock(args->lock);
+ args->val++;
+
+ /*
+ * val1 is bumped before the sleep and val2 after it, so another
+ * thread entering the critical section in between would see
+ * them differ and trip this assert.
+ */
+ assert(*args->val1 == *args->val2);
+ (*args->val1)++;
+ nanosleep(&ts, NULL);
+ (*args->val2)++;
+
+ /* sample the node word while the lock is still held */
+ node = args->lock->node;
+ futex_numa_32_unlock(args->lock);
+
+ if (node != args->node) {
+ args->node = node;
+ printf("node: %d\n", node);
+ }
+
+ nanosleep(&ts, NULL);
+ }
+
+ return NULL;
+}
+
+/*
+ * Contender thread: until *args->done is set, call futex2_wait() on the lock
+ * word with a deliberately wrong expected value and count the calls in
+ * args->val. It never takes the lock itself. Always returns NULL.
+ */
+static void *contendfn(void *_arg)
+{
+ struct thread_args *args = _arg;
+
+ while (!*args->done) {
+ /*
+ * futex2_wait() will take hb-lock, verify *var == val and
+ * queue/abort. By knowingly setting val 'wrong' this will
+ * abort and thereby generate hb-lock contention.
+ */
+ futex2_wait(&args->lock->val, ~0U, ~0U, fflags, NULL, 0);
+ args->val++;
+ }
+
+ return NULL;
+}
+
+static volatile int done = 0;
+static struct futex_numa_32 lock = { .val = 0, };
+static int val1, val2;
+
+/*
+ * futex_numa [-c contenders] [-t threads] [-s seconds] [-n nanos] [-N[node]]
+ *
+ * -c number of contendfn() threads (default 0)
+ * -t number of threadfn() locker threads (default 2)
+ * -s how long to let the threads run, in seconds (default 10)
+ * -n nanosleep() length used by the locker threads (default 50000)
+ * -N use FUTEX2_NUMA; an optional node number must be attached (-N0)
+ *
+ * Starts the threads, sleeps, sets the stop flag, joins everything and
+ * prints the summed per-thread loop counts.
+ */
+int main(int argc, char *argv[])
+{
+ struct thread_args *tas[512], *cas[512];
+ int c, t, threads = 2, contenders = 0;
+ int sleeps = 10;
+ int total = 0;
+
+ while ((c = getopt(argc, argv, "c:t:s:n:N::")) != -1) {
+ switch (c) {
+ case 'c':
+ contenders = atoi(optarg);
+ break;
+ case 't':
+ threads = atoi(optarg);
+ break;
+ case 's':
+ sleeps = atoi(optarg);
+ break;
+ case 'n':
+ nanos = atoi(optarg);
+ break;
+ case 'N':
+ fflags |= FUTEX2_NUMA;
+ if (optarg)
+ fnode = atoi(optarg);
+ break;
+ default:
+ exit(1);
+ break;
+ }
+ }
+
+ /* tas[] and cas[] are fixed size; refuse counts that would overrun them */
+ if (threads > (int)(sizeof(tas) / sizeof(tas[0])) ||
+ contenders > (int)(sizeof(cas) / sizeof(cas[0]))) {
+ fprintf(stderr, "too many threads/contenders (max %zu each)\n",
+ sizeof(tas) / sizeof(tas[0]));
+ exit(1);
+ }
+
+ for (t = 0; t < contenders; t++) {
+ struct thread_args *args = calloc(1, sizeof(*args));
+ if (!args) {
+ perror("thread_args");
+ exit(-1);
+ }
+
+ args->done = &done;
+ args->lock = &lock;
+ args->val1 = &val1;
+ args->val2 = &val2;
+ args->node = -1;
+
+ if (pthread_create(&args->tid, NULL, contendfn, args)) {
+ perror("pthread_create");
+ exit(-1);
+ }
+
+ cas[t] = args;
+ }
+
+ for (t = 0; t < threads; t++) {
+ struct thread_args *args = calloc(1, sizeof(*args));
+ if (!args) {
+ perror("thread_args");
+ exit(-1);
+ }
+
+ args->done = &done;
+ args->lock = &lock;
+ args->val1 = &val1;
+ args->val2 = &val2;
+ args->node = -1;
+
+ if (pthread_create(&args->tid, NULL, threadfn, args)) {
+ perror("pthread_create");
+ exit(-1);
+ }
+
+ tas[t] = args;
+ }
+
+ sleep(sleeps);
+
+ /* 'done' is an int flag polled by every thread loop */
+ done = 1;
+
+ for (t = 0; t < threads; t++) {
+ struct thread_args *args = tas[t];
+
+ pthread_join(args->tid, NULL);
+ total += args->val;
+ }
+ printf("total: %d\n", total);
+
+ if (contenders) {
+ total = 0;
+ for (t = 0; t < contenders; t++) {
+ struct thread_args *args = cas[t];
+
+ pthread_join(args->tid, NULL);
+ total += args->val;
+ }
+ printf("contenders: %d\n", total);
+ }
+
+ return 0;
+}
+
If you hate performance -- use this.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/futex/core.c | 6 ++++++
kernel/sched/features.h | 2 ++
2 files changed, 8 insertions(+)
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -128,6 +128,9 @@ static int futex_put_value(u32 val, u32
}
}
+#include <linux/sched/cputime.h>
+#include "../sched/sched.h"
+
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
@@ -159,6 +162,9 @@ struct futex_hash_bucket *futex_hash(uni
}
}
+ if (sched_feat(FUTEX_SQUASH))
+ hash = 0;
+
return &futex_queues[node][hash & (futex_hashsize - 1)];
}
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,5 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(HZ_BW, true)
+
+SCHED_FEAT(FUTEX_SQUASH, false)
© 2016 - 2025 Red Hat, Inc.