From: Beau Belgrave <beaub@linux.microsoft.com>
To: rostedt@goodmis.org, mhiramat@kernel.org, mathieu.desnoyers@efficios.com, brauner@kernel.org, stefanb@linux.ibm.com
Cc: linux-trace-devel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/1] tracing/user_events: Move pages/locks into groups to prepare for namespaces
Date: Fri, 30 Sep 2022 17:10:16 -0700
Message-Id: <20221001001016.2832-2-beaub@linux.microsoft.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20221001001016.2832-1-beaub@linux.microsoft.com>
References: <20221001001016.2832-1-beaub@linux.microsoft.com>

In order to enable namespaces or any other form of isolation within
user_events, the register lock and pages need to be broken up into
groups. Each event and each file now has a group pointer that stores
the actual pages to map, the lookup data, and the synchronization
objects.

This patch only enables a single group, mapped to init_user_ns, as the
IMA namespace work has done. It lets user_events start the work of
supporting namespaces by walking the user namespaces up to
init_user_ns. Future patches will address other user namespaces and
will align with the approach the IMA namespace work uses.

Link: https://lore.kernel.org/linux-kernel/20220915193221.1728029-15-stefanb@linux.ibm.com/#t

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
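A note below the fold (text in this area is not part of the commit):
the register_page_data that moves into struct user_event_group here is
the page user programs mmap read-only to check whether an event is
enabled before writing it. A minimal userspace sketch of that check
follows; the mount point, the single 4096-byte page, and the bit-packed
layout (byte = index >> 3, mask = 1 << (index & 7)) are assumptions,
and real code takes the event index from the register ioctl reply
rather than hard-coding it:

/* Hypothetical sketch, not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define STATUS_BYTES 4096 /* assumed to match the kernel's MAX_BYTES */

static int event_enabled(const char *status, int index)
{
	return status[index >> 3] & (1 << (index & 7));
}

int main(void)
{
	int fd = open("/sys/kernel/tracing/user_events_status", O_RDONLY);
	const char *status;

	if (fd < 0)
		return 1;

	/* user_status_mmap() rejects any length other than MAX_BYTES */
	status = mmap(NULL, STATUS_BYTES, PROT_READ, MAP_SHARED, fd, 0);

	if (status == MAP_FAILED)
		return 1;

	/* index 1 is a placeholder; bit 0 is reserved for failures */
	printf("enabled: %d\n", event_enabled(status, 1));

	close(fd);
	return 0;
}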
 kernel/trace/trace_events_user.c | 346 ++++++++++++++++++++++++-------
 1 file changed, 274 insertions(+), 72 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 2c0a6ec75548..ae78c2d53c8a 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -74,11 +74,25 @@
 #define EVENT_STATUS_PERF BIT(1)
 #define EVENT_STATUS_OTHER BIT(7)
 
-static char *register_page_data;
+/*
+ * Stores the pages, tables, and locks for a group of events.
+ * Each logical grouping of events has its own group, with a
+ * matching page for status checks within user programs. This
+ * allows for isolation of events to user programs by various
+ * means.
+ */
+struct user_event_group {
+	struct page *pages;
+	char *register_page_data;
+	char *system_name;
+	struct hlist_node node;
+	struct mutex reg_mutex;
+	DECLARE_HASHTABLE(register_table, 8);
+	DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+};
 
-static DEFINE_MUTEX(reg_mutex);
-static DEFINE_HASHTABLE(register_table, 8);
-static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
 
 /*
  * Stores per-event properties, as users register events
@@ -88,6 +102,7 @@ static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
  * refcnt reaches one.
  */
 struct user_event {
+	struct user_event_group *group;
 	struct tracepoint tracepoint;
 	struct trace_event_call call;
 	struct trace_event_class class;
@@ -114,6 +129,11 @@ struct user_event_refs {
 	struct user_event *events[];
 };
 
+struct user_event_file_info {
+	struct user_event_group *group;
+	struct user_event_refs *refs;
+};
+
 #define VALIDATOR_ENSURE_NULL (1 << 0)
 #define VALIDATOR_REL (1 << 1)
 
@@ -126,7 +146,8 @@ struct user_event_validator {
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
 				   void *tpdata, bool *faulted);
 
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+			    char *args, char *flags,
 			    struct user_event **newuser);
 
 static u32 user_event_key(char *name)
@@ -134,12 +155,128 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static void set_page_reservations(char *pages, bool set)
+{
+	int page;
+
+	for (page = 0; page < MAX_PAGES; ++page) {
+		void *addr = pages + (PAGE_SIZE * page);
+
+		if (set)
+			SetPageReserved(virt_to_page(addr));
+		else
+			ClearPageReserved(virt_to_page(addr));
+	}
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+	if (group->register_page_data)
+		set_page_reservations(group->register_page_data, false);
+
+	if (group->pages)
+		__free_pages(group->pages, MAX_PAGE_ORDER);
+
+	kfree(group->system_name);
+	kfree(group);
+}
+
+static char *user_event_group_system_name(struct user_namespace *user_ns)
+{
+	char *system_name;
+	int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+	if (user_ns != &init_user_ns) {
+		/*
+		 * Unexpected at this point:
+		 * We only currently support init_user_ns.
+		 * When we enable more, this will trigger a failure so log.
+		 */
+		pr_warn("user_events: Namespace other than init_user_ns!\n");
+		return NULL;
+	}
+
+	system_name = kmalloc(len, GFP_KERNEL);
+
+	if (!system_name)
+		return NULL;
+
+	snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+	return system_name;
+}
+
+static inline struct user_event_group
+*user_event_group_from_user_ns(struct user_namespace *user_ns)
+{
+	if (user_ns == &init_user_ns)
+		return init_group;
+
+	return NULL;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct user_event_group *group = NULL;
+
+	while (user_ns) {
+		group = user_event_group_from_user_ns(user_ns);
+
+		if (group)
+			break;
+
+		user_ns = user_ns->parent;
+	}
+
+	return group;
+}
+
+static struct user_event_group
+*user_event_group_create(struct user_namespace *user_ns)
+{
+	struct user_event_group *group;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+	if (!group)
+		return NULL;
+
+	group->system_name = user_event_group_system_name(user_ns);
+
+	if (!group->system_name)
+		goto error;
+
+	group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+
+	if (!group->pages)
+		goto error;
+
+	group->register_page_data = page_address(group->pages);
+
+	set_page_reservations(group->register_page_data, true);
+
+	/* Zero all bits beside 0 (which is reserved for failures) */
+	bitmap_zero(group->page_bitmap, MAX_EVENTS);
+	set_bit(0, group->page_bitmap);
+
+	mutex_init(&group->reg_mutex);
+	hash_init(group->register_table);
+
+	return group;
+error:
+	if (group)
+		user_event_group_destroy(group);
+
+	return NULL;
+};
+
 static __always_inline
 void user_event_register_set(struct user_event *user)
 {
 	int i = user->index;
 
-	register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+	user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
 }
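One observation on the hunk above: group->node is not used anywhere in
this patch; together with the commit message it suggests non-init
groups will later be hashed by namespace. A purely hypothetical sketch
of where that could go (the group_table and the group->user_ns
back-pointer are invented here, not part of this patch); today every
task in a nested user namespace simply walks up to init_user_ns and
resolves to init_group:

/* Hypothetical only, not part of this patch. */
static DEFINE_HASHTABLE(group_table, 8);

static inline struct user_event_group
*user_event_group_from_user_ns(struct user_namespace *user_ns)
{
	struct user_event_group *group;

	if (user_ns == &init_user_ns)
		return init_group;

	/* Assumes a future group->user_ns field set at creation time. */
	hash_for_each_possible(group_table, group, node,
			       (unsigned long)user_ns)
		if (group->user_ns == user_ns)
			return group;

	return NULL;
}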
@@ -147,7 +284,7 @@ void user_event_register_clear(struct user_event *user)
 {
 	int i = user->index;
 
-	register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+	user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
 }
 
 static __always_inline __must_check
@@ -191,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
  *
  * Upon success user_event has its ref count increased by 1.
  */
-static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+static int user_event_parse_cmd(struct user_event_group *group,
+				char *raw_command, struct user_event **newuser)
 {
 	char *name = raw_command;
 	char *args = strpbrk(name, " ");
@@ -205,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
 	if (flags)
 		*flags++ = '\0';
 
-	return user_event_parse(name, args, flags, newuser);
+	return user_event_parse(group, name, args, flags, newuser);
 }
 
 static int user_field_array_size(const char *type)
@@ -693,7 +831,7 @@ static int destroy_user_event(struct user_event *user)
 	dyn_event_remove(&user->devent);
 
 	user_event_register_clear(user);
-	clear_bit(user->index, page_bitmap);
+	clear_bit(user->index, user->group->page_bitmap);
 	hash_del(&user->node);
 
 	user_event_destroy_validators(user);
@@ -704,14 +842,15 @@ static int destroy_user_event(struct user_event *user)
 	return ret;
 }
 
-static struct user_event *find_user_event(char *name, u32 *outkey)
+static struct user_event *find_user_event(struct user_event_group *group,
+					  char *name, u32 *outkey)
 {
 	struct user_event *user;
 	u32 key = user_event_key(name);
 
 	*outkey = key;
 
-	hash_for_each_possible(register_table, user, node, key)
+	hash_for_each_possible(group->register_table, user, node, key)
 		if (!strcmp(EVENT_NAME(user), name)) {
 			refcount_inc(&user->refcnt);
 			return user;
@@ -943,6 +1082,7 @@ static int user_event_reg(struct trace_event_call *call,
 
 static int user_event_create(const char *raw_command)
 {
+	struct user_event_group *group;
 	struct user_event *user;
 	char *name;
 	int ret;
@@ -958,14 +1098,19 @@ static int user_event_create(const char *raw_command)
 	if (!name)
 		return -ENOMEM;
 
-	mutex_lock(&reg_mutex);
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	mutex_lock(&group->reg_mutex);
 
-	ret = user_event_parse_cmd(name, &user);
+	ret = user_event_parse_cmd(group, name, &user);
 
 	if (!ret)
 		refcount_dec(&user->refcnt);
 
-	mutex_unlock(&group->reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	if (ret)
 		kfree(name);
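Two remarks on the user_event_create() hunk above. First, if
current_user_event_group() returns NULL, the name buffer allocated just
above (see the !name check in the context) is leaked; the -ENOENT path
should free it before returning. Second, for context, this path is
driven through the dynamic_events file; a minimal sketch of exercising
it from userspace, assuming the default tracefs mount point and the
"u:" prefix user_events registers with dyn_event:

/* Hypothetical sketch, not part of this patch. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/tracing/dynamic_events", "a");

	if (!f)
		return 1;

	/* Lands in user_event_create(), which now resolves the caller's
	 * group via current_user_event_group() before parsing. */
	fprintf(f, "u:test_event u32 count\n");

	return fclose(f) ? 1 : 0;
}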
@@ -1119,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user)
  * The name buffer lifetime is owned by this method for success cases only.
  * Upon success the returned user_event has its ref count increased by 1.
  */
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+			    char *args, char *flags,
 			    struct user_event **newuser)
 {
 	int ret;
@@ -1129,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	/* Prevent dyn_event from racing */
 	mutex_lock(&event_mutex);
-	user = find_user_event(name, &key);
+	user = find_user_event(group, name, &key);
 	mutex_unlock(&event_mutex);
 
 	if (user) {
@@ -1142,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 		return 0;
 	}
 
-	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+	index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
 
 	if (index == MAX_EVENTS)
 		return -EMFILE;
@@ -1156,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 	INIT_LIST_HEAD(&user->fields);
 	INIT_LIST_HEAD(&user->validators);
 
+	user->group = group;
 	user->tracepoint.name = name;
 
 	ret = user_event_parse_fields(user, args);
@@ -1174,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags,
 	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
 	user->call.tp = &user->tracepoint;
 	user->call.event.funcs = &user_event_funcs;
+	user->class.system = group->system_name;
 
-	user->class.system = USER_EVENTS_SYSTEM;
 	user->class.fields_array = user_event_fields_array;
 	user->class.get_fields = user_event_get_fields;
 	user->class.reg = user_event_reg;
@@ -1198,8 +1345,8 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	dyn_event_init(&user->devent, &user_event_dops);
 	dyn_event_add(&user->devent, &user->call);
-	set_bit(user->index, page_bitmap);
-	hash_add(register_table, &user->node, key);
+	set_bit(user->index, group->page_bitmap);
+	hash_add(group->register_table, &user->node, key);
 
 	mutex_unlock(&event_mutex);
 
@@ -1217,10 +1364,10 @@ static int user_event_parse(char *name, char *args, char *flags,
 /*
  * Deletes a previously created event if it is no longer being used.
  */
-static int delete_user_event(char *name)
+static int delete_user_event(struct user_event_group *group, char *name)
 {
 	u32 key;
-	struct user_event *user = find_user_event(name, &key);
+	struct user_event *user = find_user_event(group, name, &key);
 
 	if (!user)
 		return -ENOENT;
@@ -1238,6 +1385,7 @@ static int delete_user_event(char *name)
  */
 static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 {
+	struct user_event_file_info *info = file->private_data;
 	struct user_event_refs *refs;
 	struct user_event *user = NULL;
 	struct tracepoint *tp;
@@ -1249,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 
 	rcu_read_lock_sched();
 
-	refs = rcu_dereference_sched(file->private_data);
+	refs = rcu_dereference_sched(info->refs);
 
 	/*
 	 * The refs->events array is protected by RCU, and new items may be
@@ -1307,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 	return ret;
 }
 
+static int user_events_open(struct inode *node, struct file *file)
+{
+	struct user_event_group *group;
+	struct user_event_file_info *info;
+
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return -ENOMEM;
+
+	info->group = group;
+
+	file->private_data = info;
+
+	return 0;
+}
+
 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 				 size_t count, loff_t *ppos)
 {
@@ -1328,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
 	return user_events_write_core(kp->ki_filp, i);
 }
 
-static int user_events_ref_add(struct file *file, struct user_event *user)
+static int user_events_ref_add(struct user_event_file_info *info,
+			       struct user_event *user)
 {
+	struct user_event_group *group = info->group;
 	struct user_event_refs *refs, *new_refs;
 	int i, size, count = 0;
 
-	refs = rcu_dereference_protected(file->private_data,
-					 lockdep_is_held(&reg_mutex));
+	refs = rcu_dereference_protected(info->refs,
+					 lockdep_is_held(&group->reg_mutex));
 
 	if (refs) {
 		count = refs->count;
@@ -1360,7 +1532,7 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
 
 	refcount_inc(&user->refcnt);
 
-	rcu_assign_pointer(file->private_data, new_refs);
+	rcu_assign_pointer(info->refs, new_refs);
 
 	if (refs)
 		kfree_rcu(refs, rcu);
@@ -1397,7 +1569,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
 /*
  * Registers a user_event on behalf of a user process.
  */
-static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+				  unsigned long uarg)
 {
 	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
 	struct user_reg reg;
@@ -1418,14 +1591,14 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 		return ret;
 	}
 
-	ret = user_event_parse_cmd(name, &user);
+	ret = user_event_parse_cmd(info->group, name, &user);
 
 	if (ret) {
 		kfree(name);
 		return ret;
 	}
 
-	ret = user_events_ref_add(file, user);
+	ret = user_events_ref_add(info, user);
 
 	/* No longer need parse ref, ref_add either worked or not */
 	refcount_dec(&user->refcnt);
@@ -1443,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 /*
  * Deletes a user_event on behalf of a user process.
  */
-static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+static long user_events_ioctl_del(struct user_event_file_info *info,
+				  unsigned long uarg)
 {
 	void __user *ubuf = (void __user *)uarg;
 	char *name;
@@ -1456,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 
 	/* event_mutex prevents dyn_event from racing */
 	mutex_lock(&event_mutex);
-	ret = delete_user_event(name);
+	ret = delete_user_event(info->group, name);
 	mutex_unlock(&event_mutex);
 
 	kfree(name);
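For the userspace view of the two ioctls above, a hedged sketch. The
struct user_reg field names used here (size, name_args, write_index)
and the header location are assumptions based on the uapi header of
this era; verify against your tree before use:

/* Hypothetical sketch, not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/user_events.h>	/* assumed uapi header location */

int main(void)
{
	/* user_events_open() binds this fd to the caller's group. */
	int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
	struct user_reg reg = { 0 };

	if (fd < 0)
		return 1;

	reg.size = sizeof(reg);
	reg.name_args = (__u64)(unsigned long)"test_event u32 count";

	/* Serialized by info->group->reg_mutex, not a global lock. */
	if (ioctl(fd, DIAG_IOCSREG, &reg) < 0)
		return 1;

	printf("write_index: %u\n", reg.write_index);

	close(fd);
	return 0;
}

DIAG_IOCSDEL takes the event name string instead of a struct, and per
the comment above delete_user_event(), deletion only succeeds once the
event is no longer being used, so deleting the event just registered on
this fd would fail until its references are released.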
@@ -1470,19 +1644,21 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
 			      unsigned long uarg)
 {
+	struct user_event_file_info *info = file->private_data;
+	struct user_event_group *group = info->group;
 	long ret = -ENOTTY;
 
 	switch (cmd) {
 	case DIAG_IOCSREG:
-		mutex_lock(&reg_mutex);
-		ret = user_events_ioctl_reg(file, uarg);
-		mutex_unlock(&reg_mutex);
+		mutex_lock(&group->reg_mutex);
+		ret = user_events_ioctl_reg(info, uarg);
+		mutex_unlock(&group->reg_mutex);
 		break;
 
 	case DIAG_IOCSDEL:
-		mutex_lock(&reg_mutex);
-		ret = user_events_ioctl_del(file, uarg);
-		mutex_unlock(&reg_mutex);
+		mutex_lock(&group->reg_mutex);
+		ret = user_events_ioctl_del(info, uarg);
+		mutex_unlock(&group->reg_mutex);
 		break;
 	}
 
@@ -1494,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
  */
 static int user_events_release(struct inode *node, struct file *file)
 {
+	struct user_event_file_info *info = file->private_data;
+	struct user_event_group *group;
 	struct user_event_refs *refs;
 	struct user_event *user;
 	int i;
 
+	if (!info)
+		return -EINVAL;
+
+	group = info->group;
+
 	/*
 	 * Ensure refs cannot change under any situation by taking the
 	 * register mutex during the final freeing of the references.
 	 */
-	mutex_lock(&reg_mutex);
+	mutex_lock(&group->reg_mutex);
 
-	refs = file->private_data;
+	refs = info->refs;
 
 	if (!refs)
 		goto out;
@@ -1523,32 +1706,51 @@ static int user_events_release(struct inode *node, struct file *file)
 out:
 	file->private_data = NULL;
 
-	mutex_unlock(&reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	kfree(refs);
+	kfree(info);
 
 	return 0;
 }
 
 static const struct file_operations user_data_fops = {
+	.open = user_events_open,
 	.write = user_events_write,
 	.write_iter = user_events_write_iter,
 	.unlocked_ioctl = user_events_ioctl,
 	.release = user_events_release,
 };
 
+static struct user_event_group *user_status_group(struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	if (!m)
+		return NULL;
+
+	return m->private;
+}
+
 /*
  * Maps the shared page into the user process for checking if event is enabled.
  */
 static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	char *pages;
+	struct user_event_group *group = user_status_group(file);
 	unsigned long size = vma->vm_end - vma->vm_start;
 
 	if (size != MAX_BYTES)
 		return -EINVAL;
 
+	if (!group)
+		return -EINVAL;
+
+	pages = group->register_page_data;
+
 	return remap_pfn_range(vma, vma->vm_start,
-			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
+			       virt_to_phys(pages) >> PAGE_SHIFT,
 			       size, vm_get_page_prot(VM_READ));
 }
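Finally, note that the text status view below follows the same
per-group resolution: user_status_open() chains the caller's group onto
the seq_file, so user_seq_show() walks only that group's
register_table. A trivial reader sketch, assuming the default tracefs
mount point; the output ends with the summary counters (Active etc.)
printed by the seq hooks:

/* Hypothetical sketch, not part of this patch. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/tracing/user_events_status", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	return fclose(f) ? 1 : 0;
}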
@@ -1572,13 +1774,17 @@ static void user_seq_stop(struct seq_file *m, void *p)
 
 static int user_seq_show(struct seq_file *m, void *p)
 {
+	struct user_event_group *group = m->private;
 	struct user_event *user;
 	char status;
 	int i, active = 0, busy = 0, flags;
 
-	mutex_lock(&reg_mutex);
+	if (!group)
+		return -EINVAL;
+
+	mutex_lock(&group->reg_mutex);
 
-	hash_for_each(register_table, i, user, node) {
+	hash_for_each(group->register_table, i, user, node) {
 		status = user->status;
 		flags = user->flags;
 
@@ -1602,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p)
 			active++;
 	}
 
-	mutex_unlock(&reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	seq_puts(m, "\n");
 	seq_printf(m, "Active: %d\n", active);
@@ -1621,7 +1827,24 @@ static const struct seq_operations user_seq_ops = {
 
 static int user_status_open(struct inode *node, struct file *file)
 {
-	return seq_open(file, &user_seq_ops);
+	struct user_event_group *group;
+	int ret;
+
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	ret = seq_open(file, &user_seq_ops);
+
+	if (!ret) {
+		/* Chain group to seq_file */
+		struct seq_file *m = file->private_data;
+
+		m->private = group;
+	}
+
+	return ret;
 }
 
 static const struct file_operations user_status_fops = {
@@ -1662,42 +1885,21 @@ static int create_user_tracefs(void)
 	return -ENODEV;
 }
 
-static void set_page_reservations(bool set)
-{
-	int page;
-
-	for (page = 0; page < MAX_PAGES; ++page) {
-		void *addr = register_page_data + (PAGE_SIZE * page);
-
-		if (set)
-			SetPageReserved(virt_to_page(addr));
-		else
-			ClearPageReserved(virt_to_page(addr));
-	}
-}
-
 static int __init trace_events_user_init(void)
 {
-	struct page *pages;
 	int ret;
 
-	/* Zero all bits beside 0 (which is reserved for failures) */
-	bitmap_zero(page_bitmap, MAX_EVENTS);
-	set_bit(0, page_bitmap);
+	init_group = user_event_group_create(&init_user_ns);
 
-	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-	if (!pages)
+	if (!init_group)
 		return -ENOMEM;
-	register_page_data = page_address(pages);
-
-	set_page_reservations(true);
 
 	ret = create_user_tracefs();
 
 	if (ret) {
 		pr_warn("user_events could not register with tracefs\n");
-		set_page_reservations(false);
-		__free_pages(pages, MAX_PAGE_ORDER);
+		user_event_group_destroy(init_group);
+		init_group = NULL;
 		return ret;
 	}

-- 
2.25.1