kernfs_rename_lock is used to obtain stable kernfs_node::{name|parent}
pointer. This is a preparation to access kernfs_node::parent under RCU
and ensure that the pointer remains stable under the RCU lifetime
guarantees.
For a complete path, as it is done in kernfs_path_from_node(), the
kernfs_rename_lock is still required in order to obtain a stable parent
relationship while computing the relevant node depth. This must not
change while the nodes are inspected in order to build the path.
If the kernfs user never moves the nodes (changes the parent) then the
kernfs_rename_lock is not required and the RCU guarantees are
sufficient. This "restriction" can be set with
KERNFS_ROOT_INVARIANT_PARENT. Otherwise the lock is required.
Rename kernfs_node::parent to kernfs_node::__parent to denote the RCU
access and use RCU accessor while accessing the node.
Make cgroup use KERNFS_ROOT_INVARIANT_PARENT since the parent here can
not change.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 52 ++++++++-------
fs/kernfs/dir.c | 87 +++++++++++++++-----------
fs/kernfs/kernfs-internal.h | 19 +++++-
fs/kernfs/mount.c | 10 +--
fs/kernfs/symlink.c | 23 +++----
fs/sysfs/file.c | 24 ++++---
include/linux/kernfs.h | 7 ++-
kernel/cgroup/cgroup-v1.c | 2 +-
kernel/cgroup/cgroup.c | 16 +++--
9 files changed, 149 insertions(+), 91 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index d906a1cd84917..3d5d03f22c10d 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -947,10 +947,16 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
return 0;
}
+static void *rdt_get_kn_parent_priv(struct kernfs_node *kn)
+{
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
seq_printf(seq, "%u\n", s->num_closid);
return 0;
@@ -959,7 +965,7 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->default_ctrl);
@@ -969,7 +975,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of,
static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
@@ -979,7 +985,7 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
static int rdt_shareable_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->cache.shareable_bits);
@@ -1003,7 +1009,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
/*
* Use unsigned long even though only 32 bits are used to ensure
* test_bit() is used safely.
@@ -1085,7 +1091,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.min_bw);
@@ -1095,7 +1101,7 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
static int rdt_num_rmids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
seq_printf(seq, "%d\n", r->num_rmid);
@@ -1105,7 +1111,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
static int rdt_mon_features_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
struct mon_evt *mevt;
list_for_each_entry(mevt, &r->evt_list, list) {
@@ -1120,7 +1126,7 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.bw_gran);
@@ -1130,7 +1136,7 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of,
static int rdt_delay_linear_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.delay_linear);
@@ -1148,7 +1154,7 @@ static int max_threshold_occ_show(struct kernfs_open_file *of,
static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
@@ -1213,7 +1219,7 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_get_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
@@ -1625,7 +1631,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
@@ -1635,7 +1641,7 @@ static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
@@ -1741,7 +1747,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -1767,7 +1773,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_get_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -2429,12 +2435,13 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
* resource. "info" and its subdirectories don't
* have rdtgroup structures, so return NULL here.
*/
- if (kn == kn_info || kn->parent == kn_info)
+ if (kn == kn_info ||
+ rcu_dereference_check(kn->__parent, true) == kn_info)
return NULL;
else
return kn->priv;
} else {
- return kn->parent->priv;
+ return rdt_get_kn_parent_priv(kn);
}
}
@@ -3760,7 +3767,7 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
static int rdtgroup_rmdir(struct kernfs_node *kn)
{
- struct kernfs_node *parent_kn = kn->parent;
+ struct kernfs_node *parent_kn;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
int ret = 0;
@@ -3773,6 +3780,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
ret = -EPERM;
goto out;
}
+ parent_kn = rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
/*
* If the rdtgroup is a ctrl_mon group and parent directory
@@ -3841,6 +3849,7 @@ static void mongrp_reparent(struct rdtgroup *rdtgrp,
static int rdtgroup_rename(struct kernfs_node *kn,
struct kernfs_node *new_parent, const char *new_name)
{
+ struct kernfs_node *kn_parent;
struct rdtgroup *new_prdtgrp;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
@@ -3875,8 +3884,9 @@ static int rdtgroup_rename(struct kernfs_node *kn,
goto out;
}
- if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
- !is_mon_groups(kn->parent, kn->name)) {
+ kn_parent = rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
+ if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
+ !is_mon_groups(kn_parent, kn->name)) {
rdt_last_cmd_puts("Source must be a MON group\n");
ret = -EPERM;
goto out;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a1fea414996e..8e92928c6bca6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -56,7 +56,7 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
if (!kn)
return strscpy(buf, "(null)", buflen);
- return strscpy(buf, kn->parent ? kn->name : "/", buflen);
+ return strscpy(buf, rcu_access_pointer(kn->__parent) ? kn->name : "/", buflen);
}
/* kernfs_node_depth - compute depth from @from to @to */
@@ -64,9 +64,9 @@ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
size_t depth = 0;
- while (to->parent && to != from) {
+ while (rcu_dereference(to->__parent) && to != from) {
depth++;
- to = to->parent;
+ to = rcu_dereference(to->__parent);
}
return depth;
}
@@ -84,18 +84,18 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
db = kernfs_depth(rb->kn, b);
while (da > db) {
- a = a->parent;
+ a = rcu_dereference(a->__parent);
da--;
}
while (db > da) {
- b = b->parent;
+ b = rcu_dereference(b->__parent);
db--;
}
/* worst case b and a will be the same at root */
while (b != a) {
- b = b->parent;
- a = a->parent;
+ b = rcu_dereference(b->__parent);
+ a = rcu_dereference(a->__parent);
}
return a;
@@ -168,8 +168,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
+
for (kn = kn_to, j = 0; j < i; j++)
- kn = kn->parent;
+ kn = rcu_dereference(kn->__parent);
len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
}
@@ -295,7 +296,7 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
unsigned long flags;
read_lock_irqsave(&kernfs_rename_lock, flags);
- parent = kn->parent;
+ parent = rcu_dereference_check(kn->__parent, lockdep_is_held(&kernfs_rename_lock));
kernfs_get(parent);
read_unlock_irqrestore(&kernfs_rename_lock, flags);
@@ -360,8 +361,12 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
- struct rb_node **node = &kn->parent->dir.children.rb_node;
struct rb_node *parent = NULL;
+ struct kernfs_node *kn_parent;
+ struct rb_node **node;
+
+ kn_parent = kernfs_parent(kn);
+ node = &kn_parent->dir.children.rb_node;
while (*node) {
struct kernfs_node *pos;
@@ -380,13 +385,13 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
- rb_insert_color(&kn->rb, &kn->parent->dir.children);
+ rb_insert_color(&kn->rb, &kn_parent->dir.children);
/* successfully added, account subdir number */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs++;
- kernfs_inc_rev(kn->parent);
+ kn_parent->dir.subdirs++;
+ kernfs_inc_rev(kn_parent);
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
return 0;
@@ -407,16 +412,19 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
*/
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
+ struct kernfs_node *kn_parent;
+
if (RB_EMPTY_NODE(&kn->rb))
return false;
+ kn_parent = kernfs_parent(kn);
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs--;
- kernfs_inc_rev(kn->parent);
+ kn_parent->dir.subdirs--;
+ kernfs_inc_rev(kn_parent);
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
- rb_erase(&kn->rb, &kn->parent->dir.children);
+ rb_erase(&kn->rb, &kn_parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
return true;
}
@@ -562,7 +570,7 @@ void kernfs_put(struct kernfs_node *kn)
* Moving/renaming is always done while holding reference.
* kn->parent won't change beneath us.
*/
- parent = kn->parent;
+ parent = rcu_dereference_check(kn->__parent, !atomic_read(&kn->count));
WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
"kernfs_put: %s/%s: released with incorrect active_ref %d\n",
@@ -701,7 +709,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
name, mode, uid, gid, flags);
if (kn) {
kernfs_get(parent);
- kn->parent = parent;
+ rcu_assign_pointer(kn->__parent, parent);
}
return kn;
}
@@ -769,13 +777,14 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
*/
int kernfs_add_one(struct kernfs_node *kn)
{
- struct kernfs_node *parent = kn->parent;
- struct kernfs_root *root = kernfs_root(parent);
+ struct kernfs_root *root = kernfs_root(kn);
struct kernfs_iattrs *ps_iattr;
+ struct kernfs_node *parent;
bool has_ns;
int ret;
down_write(&root->kernfs_rwsem);
+ parent = kernfs_parent(kn);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
@@ -1111,7 +1120,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct kernfs_node *kn;
+ struct kernfs_node *kn, *parent;
struct kernfs_root *root;
if (flags & LOOKUP_RCU)
@@ -1162,8 +1171,9 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
if (!kernfs_active(kn))
goto out_bad;
+ parent = kernfs_parent(kn);
/* The kernfs node has been moved? */
- if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
+ if (kernfs_dentry_node(dentry->d_parent) != parent)
goto out_bad;
/* The kernfs node has been renamed */
@@ -1171,7 +1181,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
goto out_bad;
/* The kernfs node has been moved to a different namespace */
- if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+ if (parent && kernfs_ns_enabled(parent) &&
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;
@@ -1364,7 +1374,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return kernfs_leftmost_descendant(rb_to_kn(rbn));
/* no sibling left, visit parent */
- return pos->parent;
+ return kernfs_parent(pos);
}
static void kernfs_activate_one(struct kernfs_node *kn)
@@ -1376,7 +1386,7 @@ static void kernfs_activate_one(struct kernfs_node *kn)
if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
return;
- WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb));
WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
@@ -1446,7 +1456,7 @@ void kernfs_show(struct kernfs_node *kn, bool show)
static void __kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_node *pos;
+ struct kernfs_node *pos, *parent;
/* Short-circuit if non-root @kn has already finished removal. */
if (!kn)
@@ -1458,7 +1468,7 @@ static void __kernfs_remove(struct kernfs_node *kn)
* This is for kernfs_remove_self() which plays with active ref
* after removal.
*/
- if (kn->parent && RB_EMPTY_NODE(&kn->rb))
+ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb))
return;
pr_debug("kernfs %s: removing\n", kn->name);
@@ -1484,14 +1494,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
kernfs_get(pos);
kernfs_drain(pos);
-
+ parent = kernfs_parent(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
* to decide who's responsible for cleanups.
*/
- if (!pos->parent || kernfs_unlink_sibling(pos)) {
+ if (!parent || kernfs_unlink_sibling(pos)) {
struct kernfs_iattrs *ps_iattr =
- pos->parent ? pos->parent->iattr : NULL;
+ parent ? parent->iattr : NULL;
/* update timestamps on the parent */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
@@ -1721,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
int error;
/* can't move or rename root */
- if (!kn->parent)
+ if (!rcu_access_pointer(kn->__parent))
return -EINVAL;
root = kernfs_root(kn);
@@ -1732,8 +1742,15 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
(new_parent->flags & KERNFS_EMPTY_DIR))
goto out;
+ old_parent = kernfs_parent(kn);
+ if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) {
+ error = -EINVAL;
+ if (WARN_ON_ONCE(old_parent != new_parent))
+ goto out;
+ }
+
error = 0;
- if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
+ if ((old_parent == new_parent) && (kn->ns == new_ns) &&
(strcmp(kn->name, new_name) == 0))
goto out; /* nothing to rename */
@@ -1760,8 +1777,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
/* rename_lock protects ->parent and ->name accessors */
write_lock_irq(&kernfs_rename_lock);
- old_parent = kn->parent;
- kn->parent = new_parent;
+ old_parent = rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
+ rcu_assign_pointer(kn->__parent, new_parent);
kn->ns = new_ns;
if (new_name) {
@@ -1794,7 +1811,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
{
if (pos) {
int valid = kernfs_active(pos) &&
- pos->parent == parent && hash == pos->hash;
+ kernfs_parent(pos) == parent && hash == pos->hash;
kernfs_put(pos);
if (!valid)
pos = NULL;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index b42ee6547cdc1..da3a933b9cac0 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -64,11 +64,14 @@ struct kernfs_root {
*
* Return: the kernfs_root @kn belongs to.
*/
-static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn)
{
+ const struct kernfs_node *knp;
/* if parent exists, it's always a dir; otherwise, @sd is a dir */
- if (kn->parent)
- kn = kn->parent;
+ guard(rcu)();
+ knp = rcu_dereference(kn->__parent);
+ if (knp)
+ kn = knp;
return kn->dir.root;
}
@@ -97,6 +100,16 @@ struct kernfs_super_info {
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
+static inline bool kernfs_root_is_locked(const struct kernfs_node *kn)
+{
+ return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem);
+}
+
+static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
+}
+
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
if (d_really_is_negative(dentry))
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 4a0ff08d589ca..2252b16e6ef0b 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -148,7 +148,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
struct kernfs_root *root = kernfs_root(kn);
guard(rwsem_read)(&root->kernfs_rwsem);
- return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent));
+ return d_obtain_alias(kernfs_get_inode(child->d_sb, kernfs_parent(kn)));
}
static const struct export_operations kernfs_export_ops = {
@@ -188,10 +188,10 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
return NULL;
}
- while (child->parent != parent) {
- if (!child->parent)
+ while (kernfs_parent(child) != parent) {
+ child = kernfs_parent(child);
+ if (!child)
return NULL;
- child = child->parent;
}
return child;
@@ -216,7 +216,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
dentry = dget(sb->s_root);
/* Check if this is the root kernfs_node */
- if (!kn->parent)
+ if (!rcu_access_pointer(kn->__parent))
return dentry;
root = kernfs_root(kn);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 45371a70caa71..05c62ca93c53d 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -62,10 +62,10 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
/* go up to the root, stop at the base */
base = parent;
- while (base->parent) {
- kn = target->parent;
- while (kn->parent && base != kn)
- kn = kn->parent;
+ while (kernfs_parent(base)) {
+ kn = kernfs_parent(target);
+ while (kernfs_parent(kn) && base != kn)
+ kn = kernfs_parent(kn);
if (base == kn)
break;
@@ -75,14 +75,14 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
strcpy(s, "../");
s += 3;
- base = base->parent;
+ base = kernfs_parent(base);
}
/* determine end of target string for reverse fillup */
kn = target;
- while (kn->parent && kn != base) {
+ while (kernfs_parent(kn) && kn != base) {
len += strlen(kn->name) + 1;
- kn = kn->parent;
+ kn = kernfs_parent(kn);
}
/* check limits */
@@ -94,7 +94,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
/* reverse fillup of target string from target to base */
kn = target;
- while (kn->parent && kn != base) {
+ while (kernfs_parent(kn) && kn != base) {
int slen = strlen(kn->name);
len -= slen;
@@ -102,7 +102,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
if (len)
s[--len] = '/';
- kn = kn->parent;
+ kn = kernfs_parent(kn);
}
return 0;
@@ -111,12 +111,13 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
static int kernfs_getlink(struct inode *inode, char *path)
{
struct kernfs_node *kn = inode->i_private;
- struct kernfs_node *parent = kn->parent;
+ struct kernfs_node *parent;
struct kernfs_node *target = kn->symlink.target_kn;
- struct kernfs_root *root = kernfs_root(parent);
+ struct kernfs_root *root = kernfs_root(kn);
int error;
down_read(&root->kernfs_rwsem);
+ parent = kernfs_parent(kn);
error = kernfs_get_target_path(parent, target, path);
up_read(&root->kernfs_rwsem);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 785408861c01c..1d16b4a386aa9 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -19,13 +19,19 @@
#include "sysfs.h"
+static struct kobject *sysfs_file_kobj(struct kernfs_node *kn)
+{
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
* must be called while holding an active reference.
*/
static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
{
- struct kobject *kobj = kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(kn);
if (kn->flags & KERNFS_LOCKDEP)
lockdep_assert_held(kn);
@@ -40,7 +46,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
ssize_t count;
char *buf;
@@ -78,7 +84,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
if (!count)
@@ -105,7 +111,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
ssize_t len;
/*
@@ -131,7 +137,7 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
if (!count)
return 0;
@@ -144,7 +150,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
if (size) {
@@ -168,7 +174,7 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
struct vm_area_struct *vma)
{
struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
return battr->mmap(of->file, kobj, battr, vma);
}
@@ -177,7 +183,7 @@ static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
int whence)
{
struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
if (battr->llseek)
return battr->llseek(of->file, kobj, battr, offset, whence);
@@ -494,7 +500,7 @@ EXPORT_SYMBOL_GPL(sysfs_break_active_protection);
*/
void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
- struct kobject *kobj = kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(kn);
kernfs_unbreak_active_protection(kn);
kernfs_put(kn);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 87c79d076d6d7..9d4f09711cd73 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -147,6 +147,11 @@ enum kernfs_root_flag {
* Support user xattrs to be written to nodes rooted at this root.
*/
KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008,
+
+ /*
+ * Renames must not change the parent node.
+ */
+ KERNFS_ROOT_INVARIANT_PARENT = 0x0010,
};
/* type-specific structures for kernfs_node union members */
@@ -199,8 +204,8 @@ struct kernfs_node {
* never moved to a different parent, it is safe to access the
* parent directly.
*/
- struct kernfs_node *parent;
const char *name;
+ struct kernfs_node __rcu *__parent;
struct rb_node rb;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e28d5f0d20ed0..c9752eb607ec9 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -844,7 +844,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (kn->parent != new_parent)
+ if (rcu_access_pointer(kn->__parent) != new_parent)
return -EIO;
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9061bd55436b..02c4b5da2a6c3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -633,9 +633,14 @@ int cgroup_task_count(const struct cgroup *cgrp)
return count;
}
+static struct cgroup *kn_get_priv(struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->__parent, kn->flags & KERNFS_ROOT_INVARIANT_PARENT)->priv;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_get_priv(of->kn);
struct cftype *cft = of_cft(of);
/*
@@ -1612,7 +1617,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_get_priv(kn);
cgroup_unlock();
@@ -1644,7 +1649,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_get_priv(kn);
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
@@ -2118,7 +2123,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
- KERNFS_ROOT_SUPPORT_USER_XATTR,
+ KERNFS_ROOT_SUPPORT_USER_XATTR |
+ KERNFS_ROOT_INVARIANT_PARENT,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -4119,7 +4125,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_get_priv(of->kn);
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
--
2.47.2
On Fri, Jan 24, 2025 at 06:46:13PM +0100, Sebastian Andrzej Siewior wrote:
...
> +static void *rdt_get_kn_parent_priv(struct kernfs_node *kn)
> +{
> + guard(rcu)();
> + return rcu_dereference(kn->__parent)->priv;
> +}
...
> @@ -2429,12 +2435,13 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
> * resource. "info" and its subdirectories don't
> * have rdtgroup structures, so return NULL here.
> */
> - if (kn == kn_info || kn->parent == kn_info)
> + if (kn == kn_info ||
> + rcu_dereference_check(kn->__parent, true) == kn_info)
Why is this safe? What's protecting ->__parent?
...
> @@ -3773,6 +3780,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> ret = -EPERM;
> goto out;
> }
> + parent_kn = rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
Can you please encapsulate the rule in a helper? e.g.
static rdt_kn_parent(struct kernfs_node *kn)
{
return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex) + /* whatever other conditions that make accesses safe */);
}
and then you can use that everywhere e.g.:
static void *rdt_get_kn_parent_priv(struct krenfs_node *kn)
{
guard(rcu)();
return rdt_kn_parent(kn)->priv;
}
This way, the rule to access kn->__parent is documented and enforced in a
single place. If the access rules can't be described like this, open coding
exceptions is fine but some documentation would be great.
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 5a1fea414996e..8e92928c6bca6 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -56,7 +56,7 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
> if (!kn)
> return strscpy(buf, "(null)", buflen);
>
> - return strscpy(buf, kn->parent ? kn->name : "/", buflen);
> + return strscpy(buf, rcu_access_pointer(kn->__parent) ? kn->name : "/", buflen);
rcu_access_pointer() is for when only the pointer value is used without
dereferencing it. Here, the poiner is being dereferenced.
> @@ -295,7 +296,7 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
> unsigned long flags;
>
> read_lock_irqsave(&kernfs_rename_lock, flags);
> - parent = kn->parent;
> + parent = rcu_dereference_check(kn->__parent, lockdep_is_held(&kernfs_rename_lock));
Ditto, it'd be better to encapsulate the access rules in a helper so that
these aren't open coded differently in different places.
...
> @@ -562,7 +570,7 @@ void kernfs_put(struct kernfs_node *kn)
> * Moving/renaming is always done while holding reference.
> * kn->parent won't change beneath us.
> */
> - parent = kn->parent;
> + parent = rcu_dereference_check(kn->__parent, !atomic_read(&kn->count));
And this rule can be encoded in the same accessor function so that the rules
can be documented and enforced from (if possible) a single place.
> @@ -1760,8 +1777,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
> /* rename_lock protects ->parent and ->name accessors */
> write_lock_irq(&kernfs_rename_lock);
>
> - old_parent = kn->parent;
> - kn->parent = new_parent;
> + old_parent = rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
Another rule here.
> +static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
> +{
> + return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
> +}
AFAICS, all rules can be put into this helper, no?
...
> +static struct cgroup *kn_get_priv(struct kernfs_node *kn)
> +{
> + return rcu_dereference_check(kn->__parent, kn->flags & KERNFS_ROOT_INVARIANT_PARENT)->priv;
> +}
The flag is a root flag but being tested against a node flags field.
Thanks.
--
tejun
On 2025-01-24 13:35:07 [-1000], Tejun Heo wrote:
> On Fri, Jan 24, 2025 at 06:46:13PM +0100, Sebastian Andrzej Siewior wrote:
> ...
> > +static void *rdt_get_kn_parent_priv(struct kernfs_node *kn)
> > +{
> > + guard(rcu)();
> > + return rcu_dereference(kn->__parent)->priv;
> > +}
> ...
> > @@ -2429,12 +2435,13 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
> > * resource. "info" and its subdirectories don't
> > * have rdtgroup structures, so return NULL here.
> > */
> > - if (kn == kn_info || kn->parent == kn_info)
> > + if (kn == kn_info ||
> > + rcu_dereference_check(kn->__parent, true) == kn_info)
>
> Why is this safe? What's protecting ->__parent?
rcu_access_pointer() is what I was looking for. The __parent pointer is
not dereferenced only compared.
> ...
> > @@ -3773,6 +3780,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> > ret = -EPERM;
> > goto out;
> > }
> > + parent_kn = rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
>
> Can you please encapsulate the rule in a helper? e.g.
>
> static rdt_kn_parent(struct kernfs_node *kn)
> {
> return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex) + /* whatever other conditions that make accesses safe */);
> }
>
> and then you can use that everywhere e.g.:
>
> static void *rdt_get_kn_parent_priv(struct krenfs_node *kn)
> {
> guard(rcu)();
> return rdt_kn_parent(kn)->priv;
> }
>
> This way, the rule to access kn->__parent is documented and enforced in a
> single place. If the access rules can't be described like this, open coding
> exceptions is fine but some documentation would be great.
Okay.
> > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> > index 5a1fea414996e..8e92928c6bca6 100644
> > --- a/fs/kernfs/dir.c
> > +++ b/fs/kernfs/dir.c
> > @@ -56,7 +56,7 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
> > if (!kn)
> > return strscpy(buf, "(null)", buflen);
> >
> > - return strscpy(buf, kn->parent ? kn->name : "/", buflen);
> > + return strscpy(buf, rcu_access_pointer(kn->__parent) ? kn->name : "/", buflen);
>
> rcu_access_pointer() is for when only the pointer value is used without
> dereferencing it. Here, the poiner is being dereferenced.
Is it? It checks if the pointer NULL and if so "/" is used otherwise
"kn->name". The __parent pointer itself is not dereferenced.
> > @@ -295,7 +296,7 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
> > unsigned long flags;
> >
> > read_lock_irqsave(&kernfs_rename_lock, flags);
> > - parent = kn->parent;
> > + parent = rcu_dereference_check(kn->__parent, lockdep_is_held(&kernfs_rename_lock));
>
> Ditto, it'd be better to encapsulate the access rules in a helper so that
> these aren't open coded differently in different places.
>
> ...
> > @@ -562,7 +570,7 @@ void kernfs_put(struct kernfs_node *kn)
> > * Moving/renaming is always done while holding reference.
> > * kn->parent won't change beneath us.
> > */
> > - parent = kn->parent;
> > + parent = rcu_dereference_check(kn->__parent, !atomic_read(&kn->count));
>
> And this rule can be encoded in the same accessor function so that the rules
> can be documented and enforced from (if possible) a single place.
>
> > @@ -1760,8 +1777,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
> > /* rename_lock protects ->parent and ->name accessors */
> > write_lock_irq(&kernfs_rename_lock);
> >
> > - old_parent = kn->parent;
> > - kn->parent = new_parent;
> > + old_parent = rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
>
> Another rule here.
>
> > +static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
> > +{
> > + return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
> > +}
>
> AFAICS, all rules can be put into this helper, no?
This would work. kernfs_parent() is the "general purpose" access. It is
used in most places (the kernfs_rename_ns() usage is moved to
kernfs_parent() in the following patch, ended here open coded during the
split, fixed now).
The "!atomic_read(&kn->count)" rule is a special case used only in
kernfs_put() after the counter went to 0 and should not be used (used as
in be valid) anywhere else. This is special because is going away and
__parent can not be renamed/ replaced at this point. One user in total.
The "lockdep_is_held(&kernfs_rename_lock)" rule is only used in
kernfs_get_parent(). One user in total.
Adding these two cases to kernfs_parent() will bloat the code a
little in the debug case (where the check is expanded). Also it will
require to make kernfs_rename_lock global so it be accessed outside of
dir.c.
All in all I don't think it is worth it. If you however prefer it that
way, I sure can update it.
> ...
> > +static struct cgroup *kn_get_priv(struct kernfs_node *kn)
> > +{
> > + return rcu_dereference_check(kn->__parent, kn->flags & KERNFS_ROOT_INVARIANT_PARENT)->priv;
> > +}
>
> The flag is a root flag but being tested against a node flags field.
Right you are. I've seen this flag set and that the root node's flags
were ORed into the child node but I can't find where this does happen.
It does not. I must have seen KERNFS_ACTIVATED then.
> Thanks.
Sebastian
Hello,
On Mon, Jan 27, 2025 at 05:25:43PM +0100, Sebastian Andrzej Siewior wrote:
> > > - return strscpy(buf, kn->parent ? kn->name : "/", buflen);
> > > + return strscpy(buf, rcu_access_pointer(kn->__parent) ? kn->name : "/", buflen);
> >
> > rcu_access_pointer() is for when only the pointer value is used without
> > dereferencing it. Here, the poiner is being dereferenced.
>
> Is it? It checks if the pointer NULL and if so "/" is used otherwise
> "kn->name". The __parent pointer itself is not dereferenced.
Ah, ignore me. I was misreading.
> > > +static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
> > > +{
> > > + return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn));
> > > +}
> >
> > AFAICS, all rules can be put into this helper, no?
>
> This would work. kernfs_parent() is the "general purpose" access. It is
> used in most places (the kernfs_rename_ns() usage is moved to
> kernfs_parent() in the following patch, ended here open coded during the
> split, fixed now).
>
> The "!atomic_read(&kn->count)" rule is a special case used only in
> kernfs_put() after the counter went to 0 and should not be used (used as
> in be valid) anywhere else. This is special because is going away and
> __parent can not be renamed/ replaced at this point. One user in total.
>
> The "lockdep_is_held(&kernfs_rename_lock)" rule is only used in
> kernfs_get_parent(). One user in total.
>
> Adding these two cases to kernfs_parent() will bloat the code a
> little in the debug case (where the check is expanded). Also it will
> require to make kernfs_rename_lock global so it be accessed outside of
> dir.c.
> All in all I don't think it is worth it. If you however prefer it that
> way, I sure can update it.
Hmm... maybe other people have different preferences here but I much prefer
documenting and enforcing RCU deref rules in a single place. It only adds
debug annotations that go away in prod builds while clarifying subtleties.
The trade-off seems pretty one-sided to me.
Thanks.
--
tejun
© 2016 - 2026 Red Hat, Inc.