Adding eventfs_file structure which will hold properties of file or dir.
Adding following functions to add dir in eventfs:
eventfs_create_events_dir() directly creates events dir with-in
tracing folder.
eventfs_add_subsystem_dir() adds the information of subsystem_dir to
eventfs and dynamically creates subsystem_dir as and when requires.
eventfs_add_dir() adds the information of dir (which is with-in
subsystem_dir) to eventfs and dynamically creates these dir as
and when requires.
Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
---
fs/tracefs/Makefile | 1 +
fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
include/linux/tracefs.h | 29 +++++
kernel/trace/trace.h | 1 +
4 files changed, 303 insertions(+)
create mode 100644 fs/tracefs/event_inode.c
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
index 7c35a282b..73c56da8e 100644
--- a/fs/tracefs/Makefile
+++ b/fs/tracefs/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
tracefs-objs := inode.o
+tracefs-objs += event_inode.o
obj-$(CONFIG_TRACING) += tracefs.o
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
new file mode 100644
index 000000000..a48ce23c0
--- /dev/null
+++ b/fs/tracefs/event_inode.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * event_inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ *
+ * eventfs is used to show trace events with one set of dentries
+ *
+ * eventfs stores meta-data of files/dirs and skip to create object of
+ * inodes/dentries. As and when requires, eventfs will create the
+ * inodes/dentries for only required files/directories. Also eventfs
+ * would delete the inodes/dentries once no more requires but preserve
+ * the meta data.
+ */
+#include <linux/fsnotify.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/tracefs.h>
+#include <linux/kref.h>
+#include <linux/delay.h>
+#include "internal.h"
+
+/**
+ * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
+ * @dentry: a pointer to dentry
+ *
+ * helper function to return crossponding eventfs_rwsem for given dentry
+ */
+static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
+{
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ return (struct rw_semaphore *)dentry->d_inode->i_private;
+ else
+ return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
+}
+
+/**
+ * eventfs_down_read - acquire read lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * helper function to perform read lock. Nested locking requires because
+ * lookup(), release() requires read lock, these could be called directly
+ * or from open(), remove() which already hold the read/write lock.
+ */
+static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
+{
+ down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * eventfs_up_read - release read lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * helper function to release eventfs_rwsem lock if locked
+ */
+static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
+{
+ up_read(eventfs_rwsem);
+}
+
+/**
+ * eventfs_down_write - acquire write lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * helper function to perform write lock on eventfs_rwsem
+ */
+static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
+{
+ while (!down_write_trylock(eventfs_rwsem))
+ msleep(10);
+}
+
+/**
+ * eventfs_up_write - release write lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * helper function to perform write lock on eventfs_rwsem
+ */
+static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
+{
+ up_write(eventfs_rwsem);
+}
+
+static const struct file_operations eventfs_file_operations = {
+};
+
+static const struct inode_operations eventfs_root_dir_inode_operations = {
+};
+
+/**
+ * eventfs_prepare_ef - helper function to prepare eventfs_file
+ * @name: a pointer to a string containing the name of the file/directory
+ * to create.
+ * @mode: the permission that the file should have.
+ * @fop: a pointer to a struct file_operations that should be used for
+ * this file/directory.
+ * @iop: a pointer to a struct inode_operations that should be used for
+ * this file/directory.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ *
+ * This function allocate the fill eventfs_file structure.
+ */
+static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
+ const struct file_operations *fop,
+ const struct inode_operations *iop,
+ void *data)
+{
+ struct eventfs_file *ef;
+
+ ef = kzalloc(sizeof(*ef), GFP_KERNEL);
+ if (!ef)
+ return ERR_PTR(-ENOMEM);
+
+ ef->name = kstrdup(name, GFP_KERNEL);
+ if (!ef->name) {
+ kfree(ef);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (S_ISDIR(mode)) {
+ ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
+ if (!ef->ei) {
+ kfree(ef->name);
+ kfree(ef);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ef->ei->e_top_files);
+ } else {
+ ef->ei = NULL;
+ }
+
+ ef->iop = iop;
+ ef->fop = fop;
+ ef->mode = mode;
+ ef->data = data;
+ ef->dentry = NULL;
+ ef->d_parent = NULL;
+ ef->created = false;
+ return ef;
+}
+
+/**
+ * eventfs_create_events_dir - create the trace event structure
+ * @name: a pointer to a string containing the name of the directory to
+ * create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * directory will be created in the root of the tracefs filesystem.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function creates the top of the trace event directory.
+ */
+struct dentry *eventfs_create_events_dir(const char *name,
+ struct dentry *parent,
+ struct rw_semaphore *eventfs_rwsem)
+{
+ struct dentry *dentry = tracefs_start_creating(name, parent);
+ struct eventfs_inode *ei;
+ struct tracefs_inode *ti;
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return dentry;
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
+ return ERR_PTR(-ENOMEM);
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode)) {
+ kfree(ei);
+ tracefs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ init_rwsem(eventfs_rwsem);
+ INIT_LIST_HEAD(&ei->e_top_files);
+
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_EVENT_INODE;
+ ti->private = ei;
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_fop = &eventfs_file_operations;
+ inode->i_private = eventfs_rwsem;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return tracefs_end_creating(dentry);
+}
+
+/**
+ * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this dir.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function adds eventfs subsystem dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
+ struct dentry *parent,
+ struct rw_semaphore *eventfs_rwsem)
+{
+ struct tracefs_inode *ti_parent;
+ struct eventfs_inode *ei_parent;
+ struct eventfs_file *ef;
+
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+
+ ti_parent = get_tracefs(parent->d_inode);
+ ei_parent = ti_parent->private;
+
+ ef = eventfs_prepare_ef(name,
+ S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+ &eventfs_file_operations,
+ &eventfs_root_dir_inode_operations,
+ (void *) eventfs_rwsem);
+
+ if (IS_ERR(ef))
+ return ef;
+
+ eventfs_down_write(eventfs_rwsem);
+ list_add_tail(&ef->list, &ei_parent->e_top_files);
+ ef->d_parent = parent;
+ eventfs_up_write(eventfs_rwsem);
+ return ef;
+}
+
+/**
+ * eventfs_add_dir - add eventfs dir to list to create later
+ * @name: a pointer to a string containing the name of the file to create.
+ * @ef_parent: a pointer to the parent eventfs_file for this dir.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function adds eventfs dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_dir(const char *name,
+ struct eventfs_file *ef_parent,
+ struct rw_semaphore *eventfs_rwsem)
+{
+ struct eventfs_file *ef;
+
+ if (!ef_parent)
+ return ERR_PTR(-EINVAL);
+
+ ef = eventfs_prepare_ef(name,
+ S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+ &eventfs_file_operations,
+ &eventfs_root_dir_inode_operations,
+ (void *) eventfs_rwsem);
+
+ if (IS_ERR(ef))
+ return ef;
+
+ eventfs_down_write(eventfs_rwsem);
+ list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
+ ef->d_parent = ef_parent->dentry;
+ eventfs_up_write(eventfs_rwsem);
+ return ef;
+}
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 999124459..aeca6761f 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -21,6 +21,35 @@ struct file_operations;
#ifdef CONFIG_TRACING
+struct eventfs_inode {
+ struct list_head e_top_files;
+};
+
+struct eventfs_file {
+ const char *name;
+ struct dentry *d_parent;
+ struct dentry *dentry;
+ struct list_head list;
+ struct eventfs_inode *ei;
+ const struct file_operations *fop;
+ const struct inode_operations *iop;
+ void *data;
+ umode_t mode;
+ bool created;
+};
+
+struct dentry *eventfs_create_events_dir(const char *name,
+ struct dentry *parent,
+ struct rw_semaphore *eventfs_rwsem);
+
+struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
+ struct dentry *parent,
+ struct rw_semaphore *eventfs_rwsem);
+
+struct eventfs_file *eventfs_add_dir(const char *name,
+ struct eventfs_file *ef_parent,
+ struct rw_semaphore *eventfs_rwsem);
+
struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79bdefe92..b895c3346 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -359,6 +359,7 @@ struct trace_array {
struct dentry *options;
struct dentry *percpu_dir;
struct dentry *event_dir;
+ struct rw_semaphore eventfs_rwsem;
struct trace_options *topts;
struct list_head systems;
struct list_head events;
--
2.40.0
FYI, all subjects should start with a capital letter:
"eventfs: Implement eventfs dir creation functions"
On Thu, 1 Jun 2023 14:30:06 +0530
Ajay Kaher <akaher@vmware.com> wrote:
> Adding eventfs_file structure which will hold properties of file or dir.
>
> Adding following functions to add dir in eventfs:
>
> eventfs_create_events_dir() directly creates events dir with-in
"within" is a proper word.
> tracing folder.
>
> eventfs_add_subsystem_dir() adds the information of subsystem_dir to
> eventfs and dynamically creates subsystem_dir as and when requires.
"as and when requires" does not make sense.
>
> eventfs_add_dir() adds the information of dir (which is with-in
"within"
> subsystem_dir) to eventfs and dynamically creates these dir as
> and when requires.
I'm guessing you want to say:
eventfs_add_dir() adds the information of the dir, within a
subsystem_dir, to eventfs and dynamically creates these
directories when they are accessed.
>
> Signed-off-by: Ajay Kaher <akaher@vmware.com>
> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Tested-by: Ching-lin Yu <chinglinyu@google.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
> ---
> fs/tracefs/Makefile | 1 +
> fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
> include/linux/tracefs.h | 29 +++++
> kernel/trace/trace.h | 1 +
> 4 files changed, 303 insertions(+)
> create mode 100644 fs/tracefs/event_inode.c
>
> diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
> index 7c35a282b..73c56da8e 100644
> --- a/fs/tracefs/Makefile
> +++ b/fs/tracefs/Makefile
> @@ -1,5 +1,6 @@
> # SPDX-License-Identifier: GPL-2.0-only
> tracefs-objs := inode.o
> +tracefs-objs += event_inode.o
>
> obj-$(CONFIG_TRACING) += tracefs.o
>
> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> new file mode 100644
> index 000000000..a48ce23c0
> --- /dev/null
> +++ b/fs/tracefs/event_inode.c
> @@ -0,0 +1,272 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * event_inode.c - part of tracefs, a pseudo file system for activating tracing
> + *
> + * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> + * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
> + *
> + * eventfs is used to show trace events with one set of dentries
> + *
> + * eventfs stores meta-data of files/dirs and skip to create object of
> + * inodes/dentries. As and when requires, eventfs will create the
> + * inodes/dentries for only required files/directories. Also eventfs
> + * would delete the inodes/dentries once no more requires but preserve
> + * the meta data.
> + */
> +#include <linux/fsnotify.h>
> +#include <linux/fs.h>
> +#include <linux/namei.h>
> +#include <linux/security.h>
> +#include <linux/tracefs.h>
> +#include <linux/kref.h>
> +#include <linux/delay.h>
> +#include "internal.h"
> +
> +/**
> + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
> + * @dentry: a pointer to dentry
> + *
> + * helper function to return crossponding eventfs_rwsem for given dentry
> + */
> +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
> +{
> + if (S_ISDIR(dentry->d_inode->i_mode))
> + return (struct rw_semaphore *)dentry->d_inode->i_private;
> + else
> + return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
> +}
> +
> +/**
> + * eventfs_down_read - acquire read lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform read lock. Nested locking requires because
> + * lookup(), release() requires read lock, these could be called directly
> + * or from open(), remove() which already hold the read/write lock.
> + */
> +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
> +{
> + down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
> +}
> +
> +/**
> + * eventfs_up_read - release read lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to release eventfs_rwsem lock if locked
> + */
> +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
> +{
> + up_read(eventfs_rwsem);
> +}
> +
> +/**
> + * eventfs_down_write - acquire write lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform write lock on eventfs_rwsem
> + */
> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
> +{
> + while (!down_write_trylock(eventfs_rwsem))
> + msleep(10);
What's this loop for? Something like that needs a very good explanation
in a comment. Loops like these are usually a sign of a workaround for a
bug in the design, or worse, simply hides an existing bug.
> +}
> +
> +/**
> + * eventfs_up_write - release write lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform write lock on eventfs_rwsem
> + */
> +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
> +{
> + up_write(eventfs_rwsem);
> +}
> +
> +static const struct file_operations eventfs_file_operations = {
> +};
> +
> +static const struct inode_operations eventfs_root_dir_inode_operations = {
> +};
> +
> +/**
> + * eventfs_prepare_ef - helper function to prepare eventfs_file
> + * @name: a pointer to a string containing the name of the file/directory
> + * to create.
> + * @mode: the permission that the file should have.
> + * @fop: a pointer to a struct file_operations that should be used for
> + * this file/directory.
> + * @iop: a pointer to a struct inode_operations that should be used for
> + * this file/directory.
> + * @data: a pointer to something that the caller will want to get to later
> + * on. The inode.i_private pointer will point to this value on
> + * the open() call.
> + *
> + * This function allocate the fill eventfs_file structure.
"allocates and fills the" ?
> + */
> +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
> + const struct file_operations *fop,
> + const struct inode_operations *iop,
> + void *data)
> +{
> + struct eventfs_file *ef;
> +
> + ef = kzalloc(sizeof(*ef), GFP_KERNEL);
> + if (!ef)
> + return ERR_PTR(-ENOMEM);
> +
> + ef->name = kstrdup(name, GFP_KERNEL);
> + if (!ef->name) {
> + kfree(ef);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + if (S_ISDIR(mode)) {
> + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
> + if (!ef->ei) {
> + kfree(ef->name);
> + kfree(ef);
> + return ERR_PTR(-ENOMEM);
> + }
> + INIT_LIST_HEAD(&ef->ei->e_top_files);
> + } else {
> + ef->ei = NULL;
> + }
> +
> + ef->iop = iop;
> + ef->fop = fop;
> + ef->mode = mode;
> + ef->data = data;
> + ef->dentry = NULL;
> + ef->d_parent = NULL;
> + ef->created = false;
No need for the initialization to NULL or even the false, as the
kzalloc() already did that.
> + return ef;
> +}
> +
> +/**
> + * eventfs_create_events_dir - create the trace event structure
> + * @name: a pointer to a string containing the name of the directory to
> + * create.
You don't need to add "a pointer" we can see it's a pointer. Just say:
* @name: The name of the directory to create
Adding more makes it confusing to read.
> + * @parent: a pointer to the parent dentry for this file. This should be a
> + * directory dentry if set. If this parameter is NULL, then the
> + * directory will be created in the root of the tracefs filesystem.
> + * @eventfs_rwsem: a pointer to rw_semaphore
Same with all the descriptions.
> + *
> + * This function creates the top of the trace event directory.
> + */
> +struct dentry *eventfs_create_events_dir(const char *name,
> + struct dentry *parent,
> + struct rw_semaphore *eventfs_rwsem)
OK, I'm going to have to really look at this. Passing in a lock to the
API is just broken. We need to find a way to solve this another way.
I'm about to board a plane to JFK shortly, I'm hoping to play with this
while flying back.
-- Steve
> +{
> + struct dentry *dentry = tracefs_start_creating(name, parent);
> + struct eventfs_inode *ei;
> + struct tracefs_inode *ti;
> + struct inode *inode;
> +
> + if (IS_ERR(dentry))
> + return dentry;
> +
> + ei = kzalloc(sizeof(*ei), GFP_KERNEL);
> + if (!ei)
> + return ERR_PTR(-ENOMEM);
> + inode = tracefs_get_inode(dentry->d_sb);
> + if (unlikely(!inode)) {
> + kfree(ei);
> + tracefs_failed_creating(dentry);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + init_rwsem(eventfs_rwsem);
> + INIT_LIST_HEAD(&ei->e_top_files);
> +
> + ti = get_tracefs(inode);
> + ti->flags |= TRACEFS_EVENT_INODE;
> + ti->private = ei;
> +
> + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
> + inode->i_op = &eventfs_root_dir_inode_operations;
> + inode->i_fop = &eventfs_file_operations;
> + inode->i_private = eventfs_rwsem;
> +
> + /* directory inodes start off with i_nlink == 2 (for "." entry) */
> + inc_nlink(inode);
> + d_instantiate(dentry, inode);
> + inc_nlink(dentry->d_parent->d_inode);
> + fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
> + return tracefs_end_creating(dentry);
> +}
> On 01-Jul-2023, at 7:24 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> !! External Email
>
> FYI, all subjects should start with a capital letter:
>
> "eventfs: Implement eventfs dir creation functions"
>
> On Thu, 1 Jun 2023 14:30:06 +0530
> Ajay Kaher <akaher@vmware.com> wrote:
>
>> Adding eventfs_file structure which will hold properties of file or dir.
>>
>> Adding following functions to add dir in eventfs:
>>
>> eventfs_create_events_dir() directly creates events dir with-in
>
> "within" is a proper word.
>
>> tracing folder.
>>
>> eventfs_add_subsystem_dir() adds the information of subsystem_dir to
>> eventfs and dynamically creates subsystem_dir as and when requires.
>
> "as and when requires" does not make sense.
>
>>
>> eventfs_add_dir() adds the information of dir (which is with-in
>
> "within"
>
>> subsystem_dir) to eventfs and dynamically creates these dir as
>> and when requires.
>
> I'm guessing you want to say:
>
> eventfs_add_dir() adds the information of the dir, within a
> subsystem_dir, to eventfs and dynamically creates these
> directories when they are accessed.
>
>>
>> Signed-off-by: Ajay Kaher <akaher@vmware.com>
>> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> Tested-by: Ching-lin Yu <chinglinyu@google.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
>> ---
>> fs/tracefs/Makefile | 1 +
>> fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
>> include/linux/tracefs.h | 29 +++++
>> kernel/trace/trace.h | 1 +
>> 4 files changed, 303 insertions(+)
>> create mode 100644 fs/tracefs/event_inode.c
>>
>> diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
>> index 7c35a282b..73c56da8e 100644
>> --- a/fs/tracefs/Makefile
>> +++ b/fs/tracefs/Makefile
>> @@ -1,5 +1,6 @@
>> # SPDX-License-Identifier: GPL-2.0-only
>> tracefs-objs := inode.o
>> +tracefs-objs += event_inode.o
>>
>> obj-$(CONFIG_TRACING) += tracefs.o
>>
>> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
>> new file mode 100644
>> index 000000000..a48ce23c0
>> --- /dev/null
>> +++ b/fs/tracefs/event_inode.c
>> @@ -0,0 +1,272 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * event_inode.c - part of tracefs, a pseudo file system for activating tracing
>> + *
>> + * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> + * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
>> + *
>> + * eventfs is used to show trace events with one set of dentries
>> + *
>> + * eventfs stores meta-data of files/dirs and skip to create object of
>> + * inodes/dentries. As and when requires, eventfs will create the
>> + * inodes/dentries for only required files/directories. Also eventfs
>> + * would delete the inodes/dentries once no more requires but preserve
>> + * the meta data.
>> + */
>> +#include <linux/fsnotify.h>
>> +#include <linux/fs.h>
>> +#include <linux/namei.h>
>> +#include <linux/security.h>
>> +#include <linux/tracefs.h>
>> +#include <linux/kref.h>
>> +#include <linux/delay.h>
>> +#include "internal.h"
>> +
>> +/**
>> + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
>> + * @dentry: a pointer to dentry
>> + *
>> + * helper function to return crossponding eventfs_rwsem for given dentry
>> + */
>> +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
>> +{
>> + if (S_ISDIR(dentry->d_inode->i_mode))
>> + return (struct rw_semaphore *)dentry->d_inode->i_private;
>> + else
>> + return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
>> +}
>> +
>> +/**
>> + * eventfs_down_read - acquire read lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform read lock. Nested locking requires because
>> + * lookup(), release() requires read lock, these could be called directly
>> + * or from open(), remove() which already hold the read/write lock.
>> + */
>> +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
>> +{
>> + down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
>> +}
>> +
>> +/**
>> + * eventfs_up_read - release read lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to release eventfs_rwsem lock if locked
>> + */
>> +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
>> +{
>> + up_read(eventfs_rwsem);
>> +}
>> +
>> +/**
>> + * eventfs_down_write - acquire write lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform write lock on eventfs_rwsem
>> + */
>> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
>> +{
>> + while (!down_write_trylock(eventfs_rwsem))
>> + msleep(10);
>
> What's this loop for? Something like that needs a very good explanation
> in a comment. Loops like these are usually a sign of a workaround for a
> bug in the design, or worse, simply hides an existing bug.
>
Yes correct, this logic is to solve deadlock:
Thread 1 Thread 2
down_read_nested() - read lock acquired
down_write() - waiting for write lock to acquire
down_read_nested() - deadlock
Deadlock is because rwlock wouldn’t allow read lock to be acquired if write lock is waiting.
down_write_trylock() wouldn’t add the write lock in waiting queue, hence helps to prevent
deadlock scenario.
I was stuck with this Deadlock, tried few methods and finally borrowed from cifs, as it’s
upstreamed, tested and working in cifs, please refer:
https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438
Looking further for your input. I will add explanation in v4.
>> +}
>> +
>> +/**
>> + * eventfs_up_write - release write lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform write lock on eventfs_rwsem
>> + */
>> +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
>> +{
>> + up_write(eventfs_rwsem);
>> +}
>> +
>> +static const struct file_operations eventfs_file_operations = {
>> +};
>> +
>> +static const struct inode_operations eventfs_root_dir_inode_operations = {
>> +};
>> +
>> +/**
>> + * eventfs_prepare_ef - helper function to prepare eventfs_file
>> + * @name: a pointer to a string containing the name of the file/directory
>> + * to create.
>> + * @mode: the permission that the file should have.
>> + * @fop: a pointer to a struct file_operations that should be used for
>> + * this file/directory.
>> + * @iop: a pointer to a struct inode_operations that should be used for
>> + * this file/directory.
>> + * @data: a pointer to something that the caller will want to get to later
>> + * on. The inode.i_private pointer will point to this value on
>> + * the open() call.
>> + *
>> + * This function allocate the fill eventfs_file structure.
>
> "allocates and fills the" ?
>
>> + */
>> +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
>> + const struct file_operations *fop,
>> + const struct inode_operations *iop,
>> + void *data)
>> +{
>> + struct eventfs_file *ef;
>> +
>> + ef = kzalloc(sizeof(*ef), GFP_KERNEL);
>> + if (!ef)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + ef->name = kstrdup(name, GFP_KERNEL);
>> + if (!ef->name) {
>> + kfree(ef);
>> + return ERR_PTR(-ENOMEM);
>> + }
>> +
>> + if (S_ISDIR(mode)) {
>> + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
>> + if (!ef->ei) {
>> + kfree(ef->name);
>> + kfree(ef);
>> + return ERR_PTR(-ENOMEM);
>> + }
>> + INIT_LIST_HEAD(&ef->ei->e_top_files);
>> + } else {
>> + ef->ei = NULL;
>> + }
>> +
>> + ef->iop = iop;
>> + ef->fop = fop;
>> + ef->mode = mode;
>> + ef->data = data;
>> + ef->dentry = NULL;
>> + ef->d_parent = NULL;
>> + ef->created = false;
>
> No need for the initialization to NULL or even the false, as the
> kzalloc() already did that.
>
>> + return ef;
>> +}
>> +
>> +/**
>> + * eventfs_create_events_dir - create the trace event structure
>> + * @name: a pointer to a string containing the name of the directory to
>> + * create.
>
> You don't need to add "a pointer" we can see it's a pointer. Just say:
>
> * @name: The name of the directory to create
>
> Adding more makes it confusing to read.
>
>> + * @parent: a pointer to the parent dentry for this file. This should be a
>> + * directory dentry if set. If this parameter is NULL, then the
>> + * directory will be created in the root of the tracefs filesystem.
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>
> Same with all the descriptions.
>
>
>> + *
>> + * This function creates the top of the trace event directory.
>> + */
>> +struct dentry *eventfs_create_events_dir(const char *name,
>> + struct dentry *parent,
>> + struct rw_semaphore *eventfs_rwsem)
>
> OK, I'm going to have to really look at this. Passing in a lock to the
> API is just broken. We need to find a way to solve this another way.
eventfs_rwsem is a member of struct trace_array, I guess we should
pass pointer to trace_array.
> I'm about to board a plane to JFK shortly, I'm hoping to play with this
> while flying back.
>
I have replied for major concerns. All other minor I will take care in v4.
Thanks a lot for giving time to eventfs patches.
- Ajay
> -- Steve
>
>
>> +{
>> + struct dentry *dentry = tracefs_start_creating(name, parent);
>> + struct eventfs_inode *ei;
>> + struct tracefs_inode *ti;
>> + struct inode *inode;
>> +
>> + if (IS_ERR(dentry))
>> + return dentry;
>> +
>> + ei = kzalloc(sizeof(*ei), GFP_KERNEL);
>> + if (!ei)
>> + return ERR_PTR(-ENOMEM);
>> + inode = tracefs_get_inode(dentry->d_sb);
>> + if (unlikely(!inode)) {
>> + kfree(ei);
>> + tracefs_failed_creating(dentry);
>> + return ERR_PTR(-ENOMEM);
>> + }
>> +
>> + init_rwsem(eventfs_rwsem);
>> + INIT_LIST_HEAD(&ei->e_top_files);
>> +
>> + ti = get_tracefs(inode);
>> + ti->flags |= TRACEFS_EVENT_INODE;
>> + ti->private = ei;
>> +
>> + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
>> + inode->i_op = &eventfs_root_dir_inode_operations;
>> + inode->i_fop = &eventfs_file_operations;
>> + inode->i_private = eventfs_rwsem;
>> +
>> + /* directory inodes start off with i_nlink == 2 (for "." entry) */
>> + inc_nlink(inode);
>> + d_instantiate(dentry, inode);
>> + inc_nlink(dentry->d_parent->d_inode);
>> + fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
>> + return tracefs_end_creating(dentry);
>> +}
>
> !! External Email: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender.
On Mon, 3 Jul 2023 10:13:22 +0000
Ajay Kaher <akaher@vmware.com> wrote:
> >> +/**
> >> + * eventfs_down_write - acquire write lock function
> >> + * @eventfs_rwsem: a pointer to rw_semaphore
> >> + *
> >> + * helper function to perform write lock on eventfs_rwsem
> >> + */
> >> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
> >> +{
> >> + while (!down_write_trylock(eventfs_rwsem))
> >> + msleep(10);
> >
> > What's this loop for? Something like that needs a very good explanation
> > in a comment. Loops like these are usually a sign of a workaround for a
> > bug in the design, or worse, simply hides an existing bug.
> >
>
> Yes correct, this logic is to solve deadlock:
>
> Thread 1 Thread 2
> down_read_nested() - read lock acquired
> down_write() - waiting for write lock to acquire
> down_read_nested() - deadlock
>
> Deadlock is because rwlock wouldn’t allow read lock to be acquired if write lock is waiting.
> down_write_trylock() wouldn’t add the write lock in waiting queue, hence helps to prevent
> deadlock scenario.
>
> I was stuck with this Deadlock, tried few methods and finally borrowed from cifs, as it’s
> upstreamed, tested and working in cifs, please refer:
> https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438
I just looked at that code and the commit, and I honestly believe that
is a horrible hack, and very fragile. It's in the smb code, so it was
unlikely reviewed by anyone outside that subsystem. I really do not
want to prolificate that solution around the kernel. We need to come up
with something else.
I also think it's buggy (yes the cifs code is buggy!) because in the
comment above the down_read_nested() it says:
/*
* nested locking. NOTE: rwsems are not allowed to recurse
* (which occurs if the same task tries to acquire the same
* lock instance multiple times), but multiple locks of the
* same lock class might be taken, if the order of the locks
* is always the same. This ordering rule can be expressed
* to lockdep via the _nested() APIs, but enumerating the
* subclasses that are used. (If the nesting relationship is
* static then another method for expressing nested locking is
* the explicit definition of lock class keys and the use of
* lockdep_set_class() at lock initialization time.
* See Documentation/locking/lockdep-design.rst for more details.)
*/
So this is NOT a solution (and the cifs code should be fixed too!)
Can you show me the exact backtrace where the reader lock gets taken
again? We will have to come up with a way to not take the same lock
twice.
We can also look to see if we can implement this with RCU. What exactly
is this rwsem protecting?
>
> Looking further for your input. I will add explanation in v4.
>
>
> >> +}
> >> +
[..]
> >> + *
> >> + * This function creates the top of the trace event directory.
> >> + */
> >> +struct dentry *eventfs_create_events_dir(const char *name,
> >> + struct dentry *parent,
> >> + struct rw_semaphore *eventfs_rwsem)
> >
> > OK, I'm going to have to really look at this. Passing in a lock to the
> > API is just broken. We need to find a way to solve this another way.
>
> eventfs_rwsem is a member of struct trace_array, I guess we should
> pass pointer to trace_array.
No, it should not be part of the trace_array. If we can't do this with
RCU, then we need to add a descriptor that contains the dentry that is
returned above, and have the lock held there. The caller of the
eventfs_create_events_dir() should not care about locking. That's an
implementation detail that should *not* be part of the API.
That is, if you need a lock:
struct eventfs_dentry {
struct dentry *dentry;
struct rwsem *rwsem;
};
And then get to that lock by using the container_of() macro. All
created eventfs dentry's could have this structure, where the rwsem
points to the top one. Again, that's only if we can't do this with RCU.
-- Steve
>
>
> > I'm about to board a plane to JFK shortly, I'm hoping to play with this
> > while flying back.
> >
>
> I have replied for major concerns. All other minor I will take care in v4.
>
> Thanks a lot for giving time to eventfs patches.
>
> - Ajay
>
> On 03-Jul-2023, at 8:38 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> I just looked at that code and the commit, and I honestly believe that
> is a horrible hack, and very fragile. It's in the smb code, so it was
> unlikely reviewed by anyone outside that subsystem. I really do not
> want to prolificate that solution around the kernel. We need to come up
> with something else.
>
> I also think it's buggy (yes the cifs code is buggy!) because in the
> comment above the down_read_nested() it says:
>
> /*
> * nested locking. NOTE: rwsems are not allowed to recurse
> * (which occurs if the same task tries to acquire the same
> * lock instance multiple times), but multiple locks of the
> * same lock class might be taken, if the order of the locks
> * is always the same. This ordering rule can be expressed
> * to lockdep via the _nested() APIs, but enumerating the
> * subclasses that are used. (If the nesting relationship is
> * static then another method for expressing nested locking is
> * the explicit definition of lock class keys and the use of
> * lockdep_set_class() at lock initialization time.
> * See Documentation/locking/lockdep-design.rst for more details.)
> */
>
> So this is NOT a solution (and the cifs code should be fixed too!)
>
> Can you show me the exact backtrace where the reader lock gets taken
> again? We will have to come up with a way to not take the same lock
> twice.
[ 244.185505] eventfs_root_lookup+0x37/0x1f0 <--- require read lock
[ 244.185509] __lookup_slow+0x72/0x100
[ 244.185511] lookup_one_len+0x6a/0x70
[ 244.185513] eventfs_start_creating+0x58/0xd0
[ 244.185515] ? security_locked_down+0x2e/0x50
[ 244.185518] eventfs_create_file+0x57/0x150
[ 244.185521] dcache_dir_open_wrapper+0x1c6/0x260 <--- require read lock
[ 244.185524] ? __pfx_dcache_dir_open_wrapper+0x10/0x10
[ 244.185526] do_dentry_open+0x1ed/0x420
[ 244.185529] vfs_open+0x2d/0x40
>
> We can also look to see if we can implement this with RCU. What exactly
> is this rwsem protecting?
>
- struct eventfs_file holds the meta-data for file or dir.
https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28
- eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file
' and elements of struct eventfs_file.
I tried one more solution i.e by checking owner of lock:
static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
{
return (struct task_struct *)
(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
}
But rwsem_owner() is static.
>
>>
>> Looking further for your input. I will add explanation in v4.
>>
>>
>>>> +}
>>>> +
>
> [..]
>
>>>> + *
>>>> + * This function creates the top of the trace event directory.
>>>> + */
>>>> +struct dentry *eventfs_create_events_dir(const char *name,
>>>> + struct dentry *parent,
>>>> + struct rw_semaphore *eventfs_rwsem)
>>>
>>> OK, I'm going to have to really look at this. Passing in a lock to the
>>> API is just broken. We need to find a way to solve this another way.
>>
>> eventfs_rwsem is a member of struct trace_array, I guess we should
>> pass pointer to trace_array.
>
> No, it should not be part of the trace_array. If we can't do this with
> RCU, then we need to add a descriptor that contains the dentry that is
> returned above, and have the lock held there. The caller of the
> eventfs_create_events_dir() should not care about locking. That's an
> implementation detail that should *not* be part of the API.
>
> That is, if you need a lock:
>
> struct eventfs_dentry {
> struct dentry *dentry;
> struct rwsem *rwsem;
> };
>
> And then get to that lock by using the container_of() macro. All
> created eventfs dentry's could have this structure, where the rwsem
> points to the top one. Again, that's only if we can't do this with RCU.
Ok. Let’s first fix locking issue.
-Ajay
On Mon, 3 Jul 2023 18:51:22 +0000 Ajay Kaher <akaher@vmware.com> wrote: > > > > We can also look to see if we can implement this with RCU. What exactly > > is this rwsem protecting? > > > > - struct eventfs_file holds the meta-data for file or dir. > https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28 > - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file > ' and elements of struct eventfs_file. RCU is usually the perfect solution for protecting link lists though. I'll take a look at this when I get back to work. -- Steve
© 2016 - 2026 Red Hat, Inc.