From nobody Mon Apr 6 13:27:47 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 36E94C433F5 for ; Thu, 29 Sep 2022 22:56:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230199AbiI2W4M (ORCPT ); Thu, 29 Sep 2022 18:56:12 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48724 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229562AbiI2Wzf (ORCPT ); Thu, 29 Sep 2022 18:55:35 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 8B450E11A6 for ; Thu, 29 Sep 2022 15:55:28 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 433CC621DF for ; Thu, 29 Sep 2022 22:55:28 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 82A13C433C1; Thu, 29 Sep 2022 22:55:27 +0000 (UTC) Received: from rostedt by gandalf.local.home with local (Exim 4.96) (envelope-from ) id 1oe2SP-000cso-1U; Thu, 29 Sep 2022 18:56:41 -0400 Message-ID: <20220929225641.003313801@goodmis.org> User-Agent: quilt/0.66 Date: Thu, 29 Sep 2022 18:55:56 -0400 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Ingo Molnar , Andrew Morton , Mathieu Desnoyers , Beau Belgrave Subject: [for-next][PATCH 14/15] tracing/user_events: Use bits vs bytes for enabled status page data References: <20220929225542.784716766@goodmis.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Beau Belgrave User processes may require many events and when they do the cache performance of a byte index status check is less ideal than a bit index. The previous event limit per-page was 4096, the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check that the bit at status_bit has a bit set within the status page(s). Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft= .com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.= zimbra@efficios.com/ Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 15 +--- kernel/trace/trace_events_user.c | 75 +++++++++++++++++-- samples/user_events/example.c | 25 +++++-- .../selftests/user_events/ftrace_test.c | 47 ++++++++++-- .../testing/selftests/user_events/perf_test.c | 11 ++- 5 files changed, 135 insertions(+), 38 deletions(-) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 736e05603463..592a3fbed98e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,15 +20,6 @@ #define USER_EVENTS_SYSTEM "user_events" #define USER_EVENTS_PREFIX "u:" =20 -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) =20 @@ -45,12 +36,12 @@ struct user_reg { /* Input: Pointer to string with event name, description and flags */ __u64 name_args; =20 - /* Output: Byte index of the event within the status page */ - __u32 status_index; + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; =20 /* Output: Index of the event to use when writing data */ __u32 write_index; -}; +} __attribute__((__packed__)); =20 #define DIAG_IOC_MAGIC '*' =20 diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_u= ser.c index 2bcae7abfa81..2c0a6ec75548 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -40,17 +40,44 @@ */ #define MAX_PAGE_ORDER 0 #define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) +#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) +#define MAX_EVENTS (MAX_BYTES * 8) =20 /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 =20 +/* + * The MAP_STATUS_* macros are used for taking a index and determining the + * appropriate byte and the bit in the byte to set/reset for an event. + * + * The lower 3 bits of the index decide which bit to set. + * The remaining upper bits of the index decide which byte to use for the = bit. + * + * This is used when an event has a probe attached/removed to reflect live + * status of the event wanting tracing or not to user-programs via shared + * memory maps. + */ +#define MAP_STATUS_BYTE(index) ((index) >> 3) +#define MAP_STATUS_MASK(index) BIT((index) & 7) + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. Th= ese + * bits are compared against an internal byte on the event to determine wh= ich + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. + */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + static char *register_page_data; =20 static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 4); +static DEFINE_HASHTABLE(register_table, 8); static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); =20 /* @@ -72,6 +99,7 @@ struct user_event { int index; int flags; int min_size; + char status; }; =20 /* @@ -106,6 +134,22 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } =20 +static __always_inline +void user_event_register_set(struct user_event *user) +{ + int i =3D user->index; + + register_page_data[MAP_STATUS_BYTE(i)] |=3D MAP_STATUS_MASK(i); +} + +static __always_inline +void user_event_register_clear(struct user_event *user) +{ + int i =3D user->index; + + register_page_data[MAP_STATUS_BYTE(i)] &=3D ~MAP_STATUS_MASK(i); +} + static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { @@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user) =20 dyn_event_remove(&user->devent); =20 - register_page_data[user->index] =3D 0; + user_event_register_clear(user); clear_bit(user->index, page_bitmap); hash_del(&user->node); =20 @@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *use= r) rcu_read_unlock_sched(); } =20 - register_page_data[user->index] =3D status; + if (status) + user_event_register_set(user); + else + user_event_register_clear(user); + + user->status =3D status; } =20 /* @@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ure= g, struct user_reg *kreg) if (size > PAGE_SIZE) return -E2BIG; =20 - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret =3D copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + kreg->size =3D size; + + return 0; } =20 /* @@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, = unsigned long uarg) return ret; =20 put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_index); + put_user(user->index, &ureg->status_bit); =20 return 0; } @@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct= vm_area_struct *vma) { unsigned long size =3D vma->vm_end - vma->vm_start; =20 - if (size !=3D MAX_EVENTS) + if (size !=3D MAX_BYTES) return -EINVAL; =20 return remap_pfn_range(vma, vma->vm_start, @@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p) mutex_lock(®_mutex); =20 hash_for_each(register_table, i, user, node) { - status =3D register_page_data[user->index]; + status =3D user->status; flags =3D user->flags; =20 seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); diff --git a/samples/user_events/example.c b/samples/user_events/example.c index 4f5778e441c0..d06dc24156ec 100644 --- a/samples/user_events/example.c +++ b/samples/user_events/example.c @@ -12,13 +12,21 @@ #include #include #include +#include +#include #include =20 +#if __BITS_PER_LONG =3D=3D 64 +#define endian_swap(x) htole64(x) +#else +#define endian_swap(x) htole32(x) +#endif + /* Assumes debugfs is mounted */ const char *data_file =3D "/sys/kernel/debug/tracing/user_events_data"; const char *status_file =3D "/sys/kernel/debug/tracing/user_events_status"; =20 -static int event_status(char **status) +static int event_status(long **status) { int fd =3D open(status_file, O_RDONLY); =20 @@ -33,7 +41,8 @@ static int event_status(char **status) return 0; } =20 -static int event_reg(int fd, const char *command, int *status, int *write) +static int event_reg(int fd, const char *command, long *index, long *mask, + int *write) { struct user_reg reg =3D {0}; =20 @@ -43,7 +52,8 @@ static int event_reg(int fd, const char *command, int *st= atus, int *write) if (ioctl(fd, DIAG_IOCSREG, ®) =3D=3D -1) return -1; =20 - *status =3D reg.status_index; + *index =3D reg.status_bit / __BITS_PER_LONG; + *mask =3D endian_swap(1L << (reg.status_bit % __BITS_PER_LONG)); *write =3D reg.write_index; =20 return 0; @@ -51,8 +61,9 @@ static int event_reg(int fd, const char *command, int *st= atus, int *write) =20 int main(int argc, char **argv) { - int data_fd, status, write; - char *status_page; + int data_fd, write; + long index, mask; + long *status_page; struct iovec io[2]; __u32 count =3D 0; =20 @@ -61,7 +72,7 @@ int main(int argc, char **argv) =20 data_fd =3D open(data_file, O_RDWR); =20 - if (event_reg(data_fd, "test u32 count", &status, &write) =3D=3D -1) + if (event_reg(data_fd, "test u32 count", &index, &mask, &write) =3D=3D -1) return errno; =20 /* Setup iovec */ @@ -75,7 +86,7 @@ int main(int argc, char **argv) getchar(); =20 /* Check if anyone is listening */ - if (status_page[status]) { + if (status_page[index] & mask) { /* Yep, trace out our data */ writev(data_fd, (const struct iovec *)io, 2); =20 diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/test= ing/selftests/user_events/ftrace_test.c index a80fb5ef61d5..404a2713dcae 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -22,6 +22,11 @@ const char *enable_file =3D "/sys/kernel/debug/tracing/e= vents/user_events/__test_e const char *trace_file =3D "/sys/kernel/debug/tracing/trace"; const char *fmt_file =3D "/sys/kernel/debug/tracing/events/user_events/__t= est_event/format"; =20 +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int trace_bytes(void) { int fd =3D open(trace_file, O_RDONLY); @@ -197,12 +202,12 @@ TEST_F(user, register_events) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); =20 /* Multiple registers should result in same index */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); =20 /* Ensure disabled */ self->enable_fd =3D open(enable_file, O_RDWR); @@ -212,15 +217,15 @@ TEST_F(user, register_events) { /* MMAP should work and be zero'd */ ASSERT_NE(MAP_FAILED, status_page); ASSERT_NE(NULL, status_page); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); =20 /* Enable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) - ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); =20 /* Disable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); =20 /* File still open should return -EBUSY for delete */ ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); @@ -240,6 +245,8 @@ TEST_F(user, write_events) { struct iovec io[3]; __u32 field1, field2; int before =3D 0, after =3D 0; + int page_size =3D sysconf(_SC_PAGESIZE); + char *status_page; =20 reg.size =3D sizeof(reg); reg.name_args =3D (__u64)"__test_event u32 field1; u32 field2"; @@ -254,10 +261,18 @@ TEST_F(user, write_events) { io[2].iov_base =3D &field2; io[2].iov_len =3D sizeof(field2); =20 + status_page =3D mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); + /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); =20 /* Write should fail on invalid slot with ENOENT */ io[0].iov_base =3D &field2; @@ -271,6 +286,9 @@ TEST_F(user, write_events) { self->enable_fd =3D open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) =20 + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Write should make it out to ftrace buffers */ before =3D trace_bytes(); ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3)); @@ -298,7 +316,7 @@ TEST_F(user, write_fault) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); =20 /* Write should work normally */ ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); @@ -315,6 +333,11 @@ TEST_F(user, write_validator) { int loc, bytes; char data[8]; int before =3D 0, after =3D 0; + int page_size =3D sysconf(_SC_PAGESIZE); + char *status_page; + + status_page =3D mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); =20 reg.size =3D sizeof(reg); reg.name_args =3D (__u64)"__test_event __rel_loc char[] data"; @@ -322,7 +345,12 @@ TEST_F(user, write_validator) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); =20 io[0].iov_base =3D ®.write_index; io[0].iov_len =3D sizeof(reg.write_index); @@ -340,6 +368,9 @@ TEST_F(user, write_validator) { self->enable_fd =3D open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) =20 + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Full in-bounds write should work */ before =3D trace_bytes(); loc =3D DYN_LOC(0, bytes); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testin= g/selftests/user_events/perf_test.c index 26851d51d6bb..8b4c7879d5a7 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, = pid_t pid, return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); } =20 +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int get_id(void) { FILE *fp =3D fopen(id_file, "r"); @@ -120,8 +125,8 @@ TEST_F(user, perf_write) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_NE(0, reg.status_bit); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); =20 /* Id should be there */ id =3D get_id(); @@ -144,7 +149,7 @@ TEST_F(user, perf_write) { ASSERT_NE(MAP_FAILED, perf_page); =20 /* Status should be updated */ - ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); =20 event.index =3D reg.write_index; event.field1 =3D 0xc001; --=20 2.35.1