fs/bpf_fs_kfuncs.c | 73 +++++++++ fs/namei.c | 51 ++++++ include/linux/namei.h | 2 + kernel/bpf/verifier.c | 5 + security/landlock/fs.c | 31 ++-- .../testing/selftests/bpf/bpf_experimental.h | 6 + .../selftests/bpf/prog_tests/path_iter.c | 111 ++++++++++++++ tools/testing/selftests/bpf/progs/path_iter.c | 145 ++++++++++++++++++ tools/testing/selftests/bpf/progs/path_walk.c | 59 +++++++ 9 files changed, 462 insertions(+), 21 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/path_iter.c create mode 100644 tools/testing/selftests/bpf/progs/path_iter.c create mode 100644 tools/testing/selftests/bpf/progs/path_walk.c
In security use cases, it is common to apply rules to VFS subtrees. However, filtering files in a subtree is not straightforward [1]. One solution to this problem is to start from a path and walk up the VFS tree (towards the root). Among in-tree LSMs, Landlock uses this solution. BPF LSM solutions, such like Tetragon [2], also use similar approaches. However, due to lack of proper helper/kfunc support, BPF LSM solutions usually do the path walk with probe read, which is racy. This patchset introduces a new helper path_walk_parent, which walks path to its VFS parent. The helper is used in Landlock. A new BPF iterator, path iterator, is introduced to do the path walking. The BPF path iterator uses the new path_walk_parent help to walk the VFS tree. Changes v2 => v3: 1. Fix an issue with path_walk_parent. 2. Move bpf path iterator to fs/bpf_fs_kfuncs.c 3. Optimize bpf path iterator (less memory). 4. Add more selftests. 5. Add more comments. v2: https://lore.kernel.org/bpf/20250603065920.3404510-1-song@kernel.org/ Changes v1 => v2: 1. Rename path_parent => path_walk_parent. 2. Remove path_connected check in path_walk_parent. 3. Fix is_access_to_paths_allowed(). 4. Remove mode for path iterator, add a flag instead. v1: https://lore.kernel.org/bpf/20250528222623.1373000-1-song@kernel.org/ [1] https://lpc.events/event/18/contributions/1940/ [2] https://github.com/cilium/tetragon/ Song Liu (5): namei: Introduce new helper function path_walk_parent() landlock: Use path_walk_parent() bpf: Introduce path iterator selftests/bpf: Add tests for bpf path iterator selftests/bpf: Path walk test fs/bpf_fs_kfuncs.c | 73 +++++++++ fs/namei.c | 51 ++++++ include/linux/namei.h | 2 + kernel/bpf/verifier.c | 5 + security/landlock/fs.c | 31 ++-- .../testing/selftests/bpf/bpf_experimental.h | 6 + .../selftests/bpf/prog_tests/path_iter.c | 111 ++++++++++++++ tools/testing/selftests/bpf/progs/path_iter.c | 145 ++++++++++++++++++ tools/testing/selftests/bpf/progs/path_walk.c | 59 +++++++ 9 files changed, 462 insertions(+), 21 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/path_iter.c create mode 100644 tools/testing/selftests/bpf/progs/path_iter.c create mode 100644 tools/testing/selftests/bpf/progs/path_walk.c -- 2.47.1
On 6/6/25 22:30, Song Liu wrote:
> In security use cases, it is common to apply rules to VFS subtrees.
> However, filtering files in a subtree is not straightforward [1].
>
> One solution to this problem is to start from a path and walk up the VFS
> tree (towards the root). Among in-tree LSMs, Landlock uses this solution.
>
> BPF LSM solutions, such like Tetragon [2], also use similar approaches.
> However, due to lack of proper helper/kfunc support, BPF LSM solutions
> usually do the path walk with probe read, which is racy.
>
> This patchset introduces a new helper path_walk_parent, which walks
> path to its VFS parent. The helper is used in Landlock.
>
> A new BPF iterator, path iterator, is introduced to do the path walking.
> The BPF path iterator uses the new path_walk_parent help to walk the VFS
> tree.
Hi Song, Christian, Al and others,
Previously I proposed in [1] to add ability to do a reference-less parent
walk for Landlock. However, as Christian pointed out and I do agree in
hindsight, it is not a good idea to do things like this in non-VFS code.
However, I still think this is valuable to consider given the performance
improvement, and after some discussion with Mickaël, I would like to
propose extending Song's helper to support such usage. While I recognize
that this patch series is already in its v3, and I do not want to delay it
by too much, putting this proposal out now is still better than after this
has merged, so that we may consider signature changes.
I've created a proof-of-concept and did some brief testing. The
performance improvement attained here is the same as in [1] (with a "git
status" workload, median landlock overhead 35% -> 28%, median time in
landlock decreases by 26.6%).
If this idea is accepted, I'm happy to work on it further, split out this
patch, update the comments and do more testing etc, potentially in
collaboration with Song.
An alternative to this is perhaps to add a new helper
path_walk_parent_rcu, also living in namei.c, that will be used directly
by Landlock. I'm happy to do it either way, but with some experimentation
I personally think that the code in this patch is still clean enough, and
can avoid some duplication.
Patch title: path_walk_parent: support reference-less walk
A later commit will update the BPF path iterator to use this.
Signed-off-by: Tingmao Wang <m@maowtm.org>
---
fs/namei.c | 107 ++++++++++++++++++++++++++++++++++++-----
include/linux/namei.h | 19 +++++++-
security/landlock/fs.c | 49 +++++++++++++++++--
3 files changed, 157 insertions(+), 18 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 732b8fd02451..351ebe957db8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1424,6 +1424,30 @@ static bool choose_mountpoint(struct mount *m, const struct path *root,
return found;
}
+/**
+ * acquires rcu read lock if rcu == true.
+ */
+void path_walk_parent_start(struct parent_iterator *pit, const struct path *path,
+ const struct path *root, bool ref_less)
+{
+ pit->path = *path;
+
+ pit->root.mnt = NULL;
+ pit->root.dentry = NULL;
+ if (root)
+ pit->root = *root;
+
+ pit->rcu = ref_less;
+ if (ref_less) {
+ pit->m_seq = read_seqbegin(&mount_lock);
+ pit->r_seq = read_seqbegin(&rename_lock);
+ pit->next_seq = read_seqcount_begin(&pit->path.dentry->d_seq);
+ rcu_read_lock();
+ } else {
+ path_get(&pit->path);
+ }
+}
+
/**
* path_walk_parent - Walk to the parent of path
* @path: input and output path.
@@ -1446,35 +1470,92 @@ static bool choose_mountpoint(struct mount *m, const struct path *root,
* // stop walking
*
* Returns:
- * true - if @path is updated to its parent.
- * false - if @path is already the root (real root or @root).
+ * PATH_WALK_PARENT_UPDATED - if @path is updated to its parent.
+ * PATH_WALK_PARENT_ALREADY_ROOT - if @path is already the root (real root or @root).
+ * PATH_WALK_PARENT_RETRY - reference-less path walk failed. Caller should restart with rcu == false.
*/
-bool path_walk_parent(struct path *path, const struct path *root)
+int path_walk_parent(struct parent_iterator *pit, struct path *next_parent)
{
struct dentry *parent;
+ struct path *path = &pit->path;
+ struct path *root = &pit->root;
+ unsigned mountpoint_d_seq;
if (path_equal(path, root))
return false;
if (unlikely(path->dentry == path->mnt->mnt_root)) {
- struct path p;
+ struct path upper_mountpoint;
- if (!choose_mountpoint(real_mount(path->mnt), root, &p))
- return false;
- path_put(path);
- *path = p;
+ if (pit->rcu) {
+ if (!choose_mountpoint_rcu(real_mount(path->mnt), root,
+ &upper_mountpoint,
+ &mountpoint_d_seq)) {
+ return PATH_WALK_PARENT_ALREADY_ROOT;
+ }
+ if (read_seqcount_retry(&path->dentry->d_seq,
+ pit->next_seq)) {
+ return PATH_WALK_PARENT_RETRY;
+ }
+ *path = upper_mountpoint;
+ pit->next_seq = mountpoint_d_seq;
+ } else {
+ if (!choose_mountpoint(real_mount(path->mnt), root,
+ &upper_mountpoint))
+ return PATH_WALK_PARENT_ALREADY_ROOT;
+ path_put(path);
+ *path = upper_mountpoint;
+ }
}
if (unlikely(IS_ROOT(path->dentry)))
- return false;
+ return PATH_WALK_PARENT_ALREADY_ROOT;
- parent = dget_parent(path->dentry);
- dput(path->dentry);
- path->dentry = parent;
- return true;
+ if (pit->rcu) {
+ parent = READ_ONCE(path->dentry->d_parent);
+ if (read_seqcount_retry(&path->dentry->d_seq, pit->next_seq)) {
+ return PATH_WALK_PARENT_RETRY;
+ }
+ path->dentry = parent;
+ pit->next_seq = read_seqcount_begin(&parent->d_seq);
+ } else {
+ parent = dget_parent(path->dentry);
+ dput(path->dentry);
+ path->dentry = parent;
+ }
+
+ if (next_parent)
+ *next_parent = *path;
+
+ return PATH_WALK_PARENT_UPDATED;
}
EXPORT_SYMBOL_GPL(path_walk_parent);
+/**
+ * releases rcu read lock if rcu == true.
+ * Returns -EAGAIN if rcu path walk failed.
+ */
+int path_walk_parent_end(struct parent_iterator *pit)
+{
+ bool need_restart = false;
+
+ if (pit->rcu) {
+ rcu_read_unlock();
+ /* do we need these if we're checking d_seq throughout? */
+ if (read_seqretry(&mount_lock, pit->m_seq) ||
+ read_seqretry(&rename_lock, pit->r_seq)) {
+ need_restart = true;
+ }
+ } else {
+ path_put(&pit->path);
+ }
+
+ if (need_restart)
+ return -EAGAIN;
+
+ return 0;
+}
+
/*
* Perform an automount
* - return -EISDIR to tell follow_managed() to stop and return the path we
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9d220b1e823c..e7fdfae12bd5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -86,7 +86,24 @@ extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);
-bool path_walk_parent(struct path *path, const struct path *root);
+struct parent_iterator {
+ struct path path;
+ struct path root;
+ bool rcu;
+ /* expected seq of path->dentry */
+ unsigned next_seq;
+ unsigned m_seq, r_seq;
+};
+
+#define PATH_WALK_PARENT_UPDATED 0
+#define PATH_WALK_PARENT_ALREADY_ROOT -1
+#define PATH_WALK_PARENT_RETRY -2
+
+void path_walk_parent_start(struct parent_iterator *pit,
+ const struct path *path, const struct path *root,
+ bool ref_less);
+int path_walk_parent(struct parent_iterator *pit, struct path *next_parent);
+int path_walk_parent_end(struct parent_iterator *pit);
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 3adac544dc9e..522ac617d192 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -375,6 +375,9 @@ find_rule(const struct landlock_ruleset *const domain,
return NULL;
inode = d_backing_inode(dentry);
+ if (unlikely(!inode))
+ /* this can happen in reference-less path walk. Let outside retry. */
+ return NULL;
rcu_read_lock();
id.key.object = rcu_dereference(landlock_inode(inode)->object);
rule = landlock_find_rule(domain, id);
@@ -767,10 +770,15 @@ static bool is_access_to_paths_allowed(
child1_is_directory = true, child2_is_directory = true;
struct path walker_path;
access_mask_t access_masked_parent1, access_masked_parent2;
+ layer_mask_t _layer_mask_parent_1_init[LANDLOCK_NUM_ACCESS_FS],
+ _layer_mask_parent_2_init[LANDLOCK_NUM_ACCESS_FS];
layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
_layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
(*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;
+ struct parent_iterator iter;
+ bool restart_pathwalk = false;
+ int err;
if (!access_request_parent1 && !access_request_parent2)
return true;
@@ -784,6 +792,15 @@ static bool is_access_to_paths_allowed(
if (WARN_ON_ONCE(!layer_masks_parent1))
return false;
+ memcpy(_layer_mask_parent_1_init, layer_masks_parent1,
+ sizeof(*layer_masks_parent1));
+ if (unlikely(layer_masks_parent2)) {
+ memcpy(_layer_mask_parent_2_init, layer_masks_parent2,
+ sizeof(*layer_masks_parent2));
+ }
+
+restart_pathwalk:
+
allowed_parent1 = is_layer_masks_allowed(layer_masks_parent1);
if (unlikely(layer_masks_parent2)) {
@@ -830,15 +847,15 @@ static bool is_access_to_paths_allowed(
child2_is_directory = d_is_dir(dentry_child2);
}
+ path_walk_parent_start(&iter, path, NULL, !restart_pathwalk);
walker_path = *path;
- path_get(&walker_path);
+
/*
* We need to walk through all the hierarchy to not miss any relevant
* restriction.
*/
while (true) {
const struct landlock_rule *rule;
- struct path root = {};
/*
* If at least all accesses allowed on the destination are
@@ -896,8 +913,22 @@ static bool is_access_to_paths_allowed(
if (allowed_parent1 && allowed_parent2)
break;
- if (path_walk_parent(&walker_path, &root))
+ switch (path_walk_parent(&iter, &walker_path)) {
+ case PATH_WALK_PARENT_UPDATED:
continue;
+ case PATH_WALK_PARENT_RETRY:
+ path_walk_parent_end(&iter);
+ memcpy(layer_masks_parent1, _layer_mask_parent_1_init,
+ sizeof(*layer_masks_parent1));
+ if (layer_masks_parent2)
+ memcpy(layer_masks_parent2,
+ _layer_mask_parent_2_init,
+ sizeof(*layer_masks_parent2));
+ restart_pathwalk = true;
+ goto restart_pathwalk;
+ case PATH_WALK_PARENT_ALREADY_ROOT:
+ break;
+ }
if (unlikely(IS_ROOT(walker_path.dentry))) {
/*
@@ -913,7 +944,17 @@ static bool is_access_to_paths_allowed(
}
break;
}
- path_put(&walker_path);
+
+ err = path_walk_parent_end(&iter);
+ if (err == -EAGAIN) {
+ memcpy(layer_masks_parent1, _layer_mask_parent_1_init,
+ sizeof(*layer_masks_parent1));
+ if (layer_masks_parent2)
+ memcpy(layer_masks_parent2, _layer_mask_parent_2_init,
+ sizeof(*layer_masks_parent2));
+ restart_pathwalk = true;
+ goto restart_pathwalk;
+ }
if (!allowed_parent1) {
log_request_parent1->type = LANDLOCK_REQUEST_FS_ACCESS;
base-commit: 0ff41df1cb268fc69e703a08a57ee14ae967d0ca
prerequisite-patch-id: 3a08c744682d5b01f98d196b3d3320b862d189c8
prerequisite-patch-id: 37586287398318c9896395b186f0809da1b0b81d
prerequisite-patch-id: 990fee8b55dae8d26bcf05a953e37988dd83d563
prerequisite-patch-id: 7f95cfaeaed0b5b30206b81691367fa520244526
prerequisite-patch-id: e10506d21bc71ff99db81bf5ab46ddfec98d1fca
prerequisite-patch-id: 8785acb37f7cf6fb62dcbdbf0a14338c04fe342f
--
2.49.0
On Sun, Jun 8, 2025 at 10:34 AM Tingmao Wang <m@maowtm.org> wrote:
[...]
> Hi Song, Christian, Al and others,
>
> Previously I proposed in [1] to add ability to do a reference-less parent
> walk for Landlock. However, as Christian pointed out and I do agree in
> hindsight, it is not a good idea to do things like this in non-VFS code.
>
> However, I still think this is valuable to consider given the performance
> improvement, and after some discussion with Mickaël, I would like to
> propose extending Song's helper to support such usage. While I recognize
> that this patch series is already in its v3, and I do not want to delay it
> by too much, putting this proposal out now is still better than after this
> has merged, so that we may consider signature changes.
>
> I've created a proof-of-concept and did some brief testing. The
> performance improvement attained here is the same as in [1] (with a "git
> status" workload, median landlock overhead 35% -> 28%, median time in
> landlock decreases by 26.6%).
>
> If this idea is accepted, I'm happy to work on it further, split out this
> patch, update the comments and do more testing etc, potentially in
> collaboration with Song.
>
> An alternative to this is perhaps to add a new helper
> path_walk_parent_rcu, also living in namei.c, that will be used directly
> by Landlock. I'm happy to do it either way, but with some experimentation
> I personally think that the code in this patch is still clean enough, and
> can avoid some duplication.
>
> Patch title: path_walk_parent: support reference-less walk
>
> A later commit will update the BPF path iterator to use this.
>
> Signed-off-by: Tingmao Wang <m@maowtm.org>
[...]
>
> -bool path_walk_parent(struct path *path, const struct path *root);
> +struct parent_iterator {
> + struct path path;
> + struct path root;
> + bool rcu;
> + /* expected seq of path->dentry */
> + unsigned next_seq;
> + unsigned m_seq, r_seq;
Most of parent_iterator is not really used by reference walk.
So it is probably just separate the two APIs?
Also, is it ok to make m_seq and r_seq available out of fs/?
> +};
> +
> +#define PATH_WALK_PARENT_UPDATED 0
> +#define PATH_WALK_PARENT_ALREADY_ROOT -1
> +#define PATH_WALK_PARENT_RETRY -2
> +
> +void path_walk_parent_start(struct parent_iterator *pit,
> + const struct path *path, const struct path *root,
> + bool ref_less);
> +int path_walk_parent(struct parent_iterator *pit, struct path *next_parent);
> +int path_walk_parent_end(struct parent_iterator *pit);
I think it is better to make this rcu walk a separate set of APIs.
IOW, we will have:
int path_walk_parent(struct path *path, struct path *root);
and
void path_walk_parent_rcu_start(struct parent_iterator *pit,
const struct path *path, const struct path *root);
int path_walk_parent_rcu_next(struct parent_iterator *pit, struct path
*next_parent);
int path_walk_parent_rcu_end(struct parent_iterator *pit);
Thanks,
Song
[...]
On 6/9/25 07:23, Song Liu wrote:
> On Sun, Jun 8, 2025 at 10:34 AM Tingmao Wang <m@maowtm.org> wrote:
> [...]
>> Hi Song, Christian, Al and others,
>>
>> Previously I proposed in [1] to add ability to do a reference-less parent
>> walk for Landlock. However, as Christian pointed out and I do agree in
>> hindsight, it is not a good idea to do things like this in non-VFS code.
>>
>> However, I still think this is valuable to consider given the performance
>> improvement, and after some discussion with Mickaël, I would like to
>> propose extending Song's helper to support such usage. While I recognize
>> that this patch series is already in its v3, and I do not want to delay it
>> by too much, putting this proposal out now is still better than after this
>> has merged, so that we may consider signature changes.
>>
>> I've created a proof-of-concept and did some brief testing. The
>> performance improvement attained here is the same as in [1] (with a "git
>> status" workload, median landlock overhead 35% -> 28%, median time in
>> landlock decreases by 26.6%).
>>
>> If this idea is accepted, I'm happy to work on it further, split out this
>> patch, update the comments and do more testing etc, potentially in
>> collaboration with Song.
>>
>> An alternative to this is perhaps to add a new helper
>> path_walk_parent_rcu, also living in namei.c, that will be used directly
>> by Landlock. I'm happy to do it either way, but with some experimentation
>> I personally think that the code in this patch is still clean enough, and
>> can avoid some duplication.
>>
>> Patch title: path_walk_parent: support reference-less walk
>>
>> A later commit will update the BPF path iterator to use this.
>>
>> Signed-off-by: Tingmao Wang <m@maowtm.org>
> [...]
>>
>> -bool path_walk_parent(struct path *path, const struct path *root);
>> +struct parent_iterator {
>> + struct path path;
>> + struct path root;
>> + bool rcu;
>> + /* expected seq of path->dentry */
>> + unsigned next_seq;
>> + unsigned m_seq, r_seq;
>
> Most of parent_iterator is not really used by reference walk.
> So it is probably just separate the two APIs?
I don't mind either way, but I feel like it might be nice to just have one
style of APIs (i.e. an iterator with start / end / next vs just one
function), even though this is not totally necessary for the ref-taking
walk. After all, the BPF use case is iterator-based. This also means
that the code at the user's side (mostly thinking of Landlock here) is
slightly simpler.
But I've not experimented with the other way. I'm open to both, and I'm
happy to send a patch later for a separate API (in that case that would
not depend on this and I might just start a new series).
Would like to hear what VFS folks thinks of this first tho, and whether
there's any preference in one or two APIs.
>
> Also, is it ok to make m_seq and r_seq available out of fs/?
The struct is not intended to be used directly by code outside. Not sure
what is the standard way to do this but we can make it private by e.g.
putting the seq values in another struct, if needed. Alternatively I
think we can hide the entire struct behind an opaque pointer by doing the
allocation ourselves.
>
>> +};
>> +
>> +#define PATH_WALK_PARENT_UPDATED 0
>> +#define PATH_WALK_PARENT_ALREADY_ROOT -1
>> +#define PATH_WALK_PARENT_RETRY -2
>> +
>> +void path_walk_parent_start(struct parent_iterator *pit,
>> + const struct path *path, const struct path *root,
>> + bool ref_less);
>> +int path_walk_parent(struct parent_iterator *pit, struct path *next_parent);
>> +int path_walk_parent_end(struct parent_iterator *pit);
>
> I think it is better to make this rcu walk a separate set of APIs.
> IOW, we will have:
>
> int path_walk_parent(struct path *path, struct path *root);
>
> and
>
> void path_walk_parent_rcu_start(struct parent_iterator *pit,
> const struct path *path, const struct path *root);
> int path_walk_parent_rcu_next(struct parent_iterator *pit, struct path
> *next_parent);
> int path_walk_parent_rcu_end(struct parent_iterator *pit);
(replied above)
>
> Thanks,
> Song
>
> [...]
On Mon, Jun 09, 2025 at 09:08:34AM +0100, Tingmao Wang wrote:
> On 6/9/25 07:23, Song Liu wrote:
> > On Sun, Jun 8, 2025 at 10:34 AM Tingmao Wang <m@maowtm.org> wrote:
> > [...]
> >> Hi Song, Christian, Al and others,
> >>
> >> Previously I proposed in [1] to add ability to do a reference-less parent
> >> walk for Landlock. However, as Christian pointed out and I do agree in
> >> hindsight, it is not a good idea to do things like this in non-VFS code.
> >>
> >> However, I still think this is valuable to consider given the performance
> >> improvement, and after some discussion with Mickaël, I would like to
> >> propose extending Song's helper to support such usage. While I recognize
> >> that this patch series is already in its v3, and I do not want to delay it
> >> by too much, putting this proposal out now is still better than after this
> >> has merged, so that we may consider signature changes.
> >>
> >> I've created a proof-of-concept and did some brief testing. The
> >> performance improvement attained here is the same as in [1] (with a "git
> >> status" workload, median landlock overhead 35% -> 28%, median time in
> >> landlock decreases by 26.6%).
> >>
> >> If this idea is accepted, I'm happy to work on it further, split out this
> >> patch, update the comments and do more testing etc, potentially in
> >> collaboration with Song.
> >>
> >> An alternative to this is perhaps to add a new helper
> >> path_walk_parent_rcu, also living in namei.c, that will be used directly
> >> by Landlock. I'm happy to do it either way, but with some experimentation
> >> I personally think that the code in this patch is still clean enough, and
> >> can avoid some duplication.
> >>
> >> Patch title: path_walk_parent: support reference-less walk
> >>
> >> A later commit will update the BPF path iterator to use this.
> >>
> >> Signed-off-by: Tingmao Wang <m@maowtm.org>
> > [...]
> >>
> >> -bool path_walk_parent(struct path *path, const struct path *root);
> >> +struct parent_iterator {
> >> + struct path path;
> >> + struct path root;
> >> + bool rcu;
> >> + /* expected seq of path->dentry */
> >> + unsigned next_seq;
> >> + unsigned m_seq, r_seq;
> >
> > Most of parent_iterator is not really used by reference walk.
> > So it is probably just separate the two APIs?
>
> I don't mind either way, but I feel like it might be nice to just have one
> style of APIs (i.e. an iterator with start / end / next vs just one
> function), even though this is not totally necessary for the ref-taking
> walk. After all, the BPF use case is iterator-based. This also means
> that the code at the user's side (mostly thinking of Landlock here) is
> slightly simpler.
>
> But I've not experimented with the other way. I'm open to both, and I'm
> happy to send a patch later for a separate API (in that case that would
> not depend on this and I might just start a new series).
>
> Would like to hear what VFS folks thinks of this first tho, and whether
> there's any preference in one or two APIs.
I really dislike exposing the sequence number for mounts an for
dentries. That's just nonsense and a non-VFS low-level consumer of this
API has zero business caring about any of that. It's easy to
misunderstand, it's easy to abuse so that's not a good way of doing
this. It's the wrong API.
>
> >
> > Also, is it ok to make m_seq and r_seq available out of fs/?
No, it's not.
>
> The struct is not intended to be used directly by code outside. Not sure
That doesn't mean anything. It's simply the wrong API if it has to spill
so much of its bowels.
> what is the standard way to do this but we can make it private by e.g.
> putting the seq values in another struct, if needed. Alternatively I
> think we can hide the entire struct behind an opaque pointer by doing the
> allocation ourselves.
>
> >
> >> +};
> >> +
> >> +#define PATH_WALK_PARENT_UPDATED 0
> >> +#define PATH_WALK_PARENT_ALREADY_ROOT -1
> >> +#define PATH_WALK_PARENT_RETRY -2
> >> +
> >> +void path_walk_parent_start(struct parent_iterator *pit,
> >> + const struct path *path, const struct path *root,
> >> + bool ref_less);
> >> +int path_walk_parent(struct parent_iterator *pit, struct path *next_parent);
> >> +int path_walk_parent_end(struct parent_iterator *pit);
> >
> > I think it is better to make this rcu walk a separate set of APIs.
> > IOW, we will have:
> >
> > int path_walk_parent(struct path *path, struct path *root);
> >
> > and
> >
> > void path_walk_parent_rcu_start(struct parent_iterator *pit,
> > const struct path *path, const struct path *root);
> > int path_walk_parent_rcu_next(struct parent_iterator *pit, struct path
> > *next_parent);
> > int path_walk_parent_rcu_end(struct parent_iterator *pit);
>
> (replied above)
Exposing two sets of different APIs for essentially the same things is
not going to happen.
The VFS doesn't expose a rcu variant and a non-rcu variant for itself so
we are absolutely not going to do that for outside stuff.
It always does the try RCU first, then try to continue the walk by
falling back to REF walk (e.g., via try_to_unlazy()). If that doesn't
work then let the caller know and require them to decide whether to
abort or redo everything in ref-walk.
There's zero need in that scheme for the caller to see any of the
internals of the VFS and that's what you should aim for.
On Wed, Jun 11, 2025 at 01:36:46PM +0200, Christian Brauner wrote:
> On Mon, Jun 09, 2025 at 09:08:34AM +0100, Tingmao Wang wrote:
> > On 6/9/25 07:23, Song Liu wrote:
> > > On Sun, Jun 8, 2025 at 10:34 AM Tingmao Wang <m@maowtm.org> wrote:
> > > [...]
> > >> Hi Song, Christian, Al and others,
> > >>
> > >> Previously I proposed in [1] to add ability to do a reference-less parent
> > >> walk for Landlock. However, as Christian pointed out and I do agree in
> > >> hindsight, it is not a good idea to do things like this in non-VFS code.
> > >>
> > >> However, I still think this is valuable to consider given the performance
> > >> improvement, and after some discussion with Mickaël, I would like to
> > >> propose extending Song's helper to support such usage. While I recognize
> > >> that this patch series is already in its v3, and I do not want to delay it
> > >> by too much, putting this proposal out now is still better than after this
> > >> has merged, so that we may consider signature changes.
> > >>
> > >> I've created a proof-of-concept and did some brief testing. The
> > >> performance improvement attained here is the same as in [1] (with a "git
> > >> status" workload, median landlock overhead 35% -> 28%, median time in
> > >> landlock decreases by 26.6%).
> > >>
> > >> If this idea is accepted, I'm happy to work on it further, split out this
> > >> patch, update the comments and do more testing etc, potentially in
> > >> collaboration with Song.
> > >>
> > >> An alternative to this is perhaps to add a new helper
> > >> path_walk_parent_rcu, also living in namei.c, that will be used directly
> > >> by Landlock. I'm happy to do it either way, but with some experimentation
> > >> I personally think that the code in this patch is still clean enough, and
> > >> can avoid some duplication.
> > >>
> > >> Patch title: path_walk_parent: support reference-less walk
> > >>
> > >> A later commit will update the BPF path iterator to use this.
> > >>
> > >> Signed-off-by: Tingmao Wang <m@maowtm.org>
> > > [...]
> > >>
> > >> -bool path_walk_parent(struct path *path, const struct path *root);
> > >> +struct parent_iterator {
> > >> + struct path path;
> > >> + struct path root;
> > >> + bool rcu;
> > >> + /* expected seq of path->dentry */
> > >> + unsigned next_seq;
> > >> + unsigned m_seq, r_seq;
> > >
> > > Most of parent_iterator is not really used by reference walk.
> > > So it is probably just separate the two APIs?
> >
> > I don't mind either way, but I feel like it might be nice to just have one
> > style of APIs (i.e. an iterator with start / end / next vs just one
> > function), even though this is not totally necessary for the ref-taking
> > walk. After all, the BPF use case is iterator-based. This also means
> > that the code at the user's side (mostly thinking of Landlock here) is
> > slightly simpler.
> >
> > But I've not experimented with the other way. I'm open to both, and I'm
> > happy to send a patch later for a separate API (in that case that would
> > not depend on this and I might just start a new series).
> >
> > Would like to hear what VFS folks thinks of this first tho, and whether
> > there's any preference in one or two APIs.
>
> I really dislike exposing the sequence number for mounts an for
> dentries. That's just nonsense and a non-VFS low-level consumer of this
> API has zero business caring about any of that. It's easy to
> misunderstand, it's easy to abuse so that's not a good way of doing
> this. It's the wrong API.
>
> >
> > >
> > > Also, is it ok to make m_seq and r_seq available out of fs/?
>
> No, it's not.
>
> >
> > The struct is not intended to be used directly by code outside. Not sure
>
> That doesn't mean anything. It's simply the wrong API if it has to spill
> so much of its bowels.
>
> > what is the standard way to do this but we can make it private by e.g.
> > putting the seq values in another struct, if needed. Alternatively I
> > think we can hide the entire struct behind an opaque pointer by doing the
> > allocation ourselves.
> >
> > >
> > >> +};
> > >> +
> > >> +#define PATH_WALK_PARENT_UPDATED 0
> > >> +#define PATH_WALK_PARENT_ALREADY_ROOT -1
> > >> +#define PATH_WALK_PARENT_RETRY -2
> > >> +
> > >> +void path_walk_parent_start(struct parent_iterator *pit,
> > >> + const struct path *path, const struct path *root,
> > >> + bool ref_less);
> > >> +int path_walk_parent(struct parent_iterator *pit, struct path *next_parent);
> > >> +int path_walk_parent_end(struct parent_iterator *pit);
> > >
> > > I think it is better to make this rcu walk a separate set of APIs.
> > > IOW, we will have:
> > >
> > > int path_walk_parent(struct path *path, struct path *root);
> > >
> > > and
> > >
> > > void path_walk_parent_rcu_start(struct parent_iterator *pit,
> > > const struct path *path, const struct path *root);
> > > int path_walk_parent_rcu_next(struct parent_iterator *pit, struct path
> > > *next_parent);
> > > int path_walk_parent_rcu_end(struct parent_iterator *pit);
> >
> > (replied above)
>
> Exposing two sets of different APIs for essentially the same things is
> not going to happen.
>
> The VFS doesn't expose a rcu variant and a non-rcu variant for itself so
> we are absolutely not going to do that for outside stuff.
>
> It always does the try RCU first, then try to continue the walk by
> falling back to REF walk (e.g., via try_to_unlazy()). If that doesn't
> work then let the caller know and require them to decide whether to
> abort or redo everything in ref-walk.
That's indeed what is done by choose_mountpoint() (relying on
choose_mountpoint_rcu() when possible), but this proposal is about doing
a full path walk (i.e. multiple calls to path_walk_parent) within RCU.
>
> There's zero need in that scheme for the caller to see any of the
> internals of the VFS and that's what you should aim for.
Yes, but how could we detect if a full path walk is invalid (because of
a rename or mount change)?
Update bpf_fs_kfuncs to match path_walk_parent changes.
It compiles, but I've not tested this yet.
Signed-off-by: Tingmao Wang <m@maowtm.org>
---
fs/bpf_fs_kfuncs.c | 55 +++++++++++++++++++++++-----------------------
1 file changed, 28 insertions(+), 27 deletions(-)
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 8c618154df0a..6599342dd0de 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -327,23 +327,18 @@ __bpf_kfunc_end_defs();
/* open-coded path iterator */
struct bpf_iter_path {
- __u64 __opaque[2];
-} __aligned(8);
-
-struct bpf_iter_path_kern {
- struct path path;
+ __u64 __opaque[sizeof(struct parent_iterator) / sizeof(__u64)];
} __aligned(8);
__bpf_kfunc_start_defs();
-__bpf_kfunc int bpf_iter_path_new(struct bpf_iter_path *it,
- struct path *start,
+__bpf_kfunc int bpf_iter_path_new(struct bpf_iter_path *it, struct path *start,
__u64 flags)
{
- struct bpf_iter_path_kern *kit = (void *)it;
+ struct parent_iterator *pit = (void *)it;
- BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
- BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+ BUILD_BUG_ON(sizeof(*pit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*pit) != __alignof__(*it));
if (flags) {
/*
@@ -351,45 +346,51 @@ __bpf_kfunc int bpf_iter_path_new(struct bpf_iter_path *it,
* kit->path so that it be passed to path_put() safely.
* Note: path_put() is no-op for zero'ed path.
*/
- memset(&kit->path, 0, sizeof(struct path));
+ memset(pit, 0, sizeof(*pit));
return -EINVAL;
}
- kit->path = *start;
- path_get(&kit->path);
-
- return 0;
-}
-
-__bpf_kfunc struct path *bpf_iter_path_next(struct bpf_iter_path *it)
-{
- struct bpf_iter_path_kern *kit = (void *)it;
- struct path root = {};
-
/*
- * "root" is zero'ed. Therefore, unless the loop is explicitly
+ * "root" is NULL. Therefore, unless the loop is explicitly
* terminated, bpf_iter_path_next() will continue looping until
* we've reached the global root of the VFS.
*
* If a root of walk is needed, the user can check "path" against
* that root on each iteration.
*/
- if (!path_walk_parent(&kit->path, &root)) {
+ path_walk_parent_start(pit, start, NULL, false);
+
+ return 0;
+}
+
+__bpf_kfunc struct path *bpf_iter_path_next(struct bpf_iter_path *it)
+{
+ struct parent_iterator *pit = (void *)it;
+ struct path p;
+
+ switch (path_walk_parent(pit, &p)) {
+ case PATH_WALK_PARENT_UPDATED:
+ return &pit->path;
+ case PATH_WALK_PARENT_ALREADY_ROOT:
/*
* Return NULL, but keep valid kit->path. _destroy() will
* always path_put(&kit->path).
*/
return NULL;
+ default:
+ WARN_ONCE(
+ 1,
+ "did not expect any other return from path_walk_parent");
}
- return &kit->path;
+ return &pit->path;
}
__bpf_kfunc void bpf_iter_path_destroy(struct bpf_iter_path *it)
{
- struct bpf_iter_path_kern *kit = (void *)it;
+ struct parent_iterator *pit = (void *)it;
- path_put(&kit->path);
+ path_walk_parent_end(pit);
}
__bpf_kfunc_end_defs();
--
2.49.0
© 2016 - 2025 Red Hat, Inc.