cgroup::ancestors includes self, i.e. a root cgroup has one ancestor but
its level is 0. Store the number of ancestors (nr_ancestors = level + 1)
in struct cgroup instead of the level, and use the inlined helper
cgroup_level() where we need to know the level. This way we preserve the
concept of 0-based levels and can use the __counted_by constraint to guard
ancestors[] access. (We could've used the level value as a counter for
_low_ancestors, but that would have no benefit since we never access data
through this flexible array alias.)
Cc: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
include/linux/cgroup-defs.h | 19 ++++++++-----------
include/linux/cgroup.h | 2 +-
kernel/cgroup/cgroup.c | 3 ++-
3 files changed, 11 insertions(+), 13 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 9247e437da5ce..8ce1ae9bea909 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -475,14 +475,6 @@ struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */
- /*
- * The depth this cgroup is at. The root is at depth zero and each
- * step down the hierarchy increments the level. This along with
- * ancestors[] can determine whether a given cgroup is a
- * descendant of another without traversing the hierarchy.
- */
- int level;
-
/* Maximum allowed descent tree depth */
int max_depth;
@@ -625,13 +617,18 @@ struct cgroup {
struct bpf_local_storage __rcu *bpf_cgrp_storage;
#endif
- /* All ancestors including self */
union {
struct {
- void *_sentinel[0]; /* XXX to avoid 'flexible array member in a struct with no named members' */
- struct cgroup *ancestors[];
+ int nr_ancestors; /* do not use directly but via cgroup_level() */
+ /*
+ * All ancestors including self.
+ * ancestors[] can determine whether a given cgroup is a
+ * descendant of another without traversing the hierarchy.
+ */
+ struct cgroup *ancestors[] __counted_by(nr_ancestors);
};
struct {
+ int _nr_ancestors; /* auxiliary padding, see nr_ancestors above */
struct cgroup *_root_ancestor;
struct cgroup *_low_ancestors[];
};
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0290878ebad26..45f720b9ecedd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -534,7 +534,7 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
*/
static inline int cgroup_level(struct cgroup *cgrp)
{
- return cgrp->level;
+ return cgrp->nr_ancestors - 1;
}
/**
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e011f1dd6d87f..5110d3e13d125 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2197,6 +2197,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
+ root_cgrp->nr_ancestors = 1; /* stored in _root_ancestor */
root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
@@ -5869,7 +5870,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->self.parent = &parent->self;
cgrp->root = root;
- cgrp->level = level;
+ cgrp->nr_ancestors = parent->nr_ancestors + 1;
/*
* Now that init_cgroup_housekeeping() has been called and cgrp->self
--
2.52.0
On 2025/12/18 0:27, Michal Koutný wrote:
> cgroup::ancestors includes self, i.e. root cgroups have one ancestor but
> their level is 0. Change the value that we store inside struct cgroup
> and use an inlined helper where we need to know the level. This way we
> preserve the concept of 0-based levels and we can utilize __counted_by
> constraint to guard ancestors access. (We could've used level value as a
> counter for _low_ancestors but that would have no benefit since we never
> access data through this flexible array alias.)
>
> Cc: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
> include/linux/cgroup-defs.h | 19 ++++++++-----------
> include/linux/cgroup.h | 2 +-
> kernel/cgroup/cgroup.c | 3 ++-
> 3 files changed, 11 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 9247e437da5ce..8ce1ae9bea909 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -475,14 +475,6 @@ struct cgroup {
>
> unsigned long flags; /* "unsigned long" so bitops work */
>
> - /*
> - * The depth this cgroup is at. The root is at depth zero and each
> - * step down the hierarchy increments the level. This along with
> - * ancestors[] can determine whether a given cgroup is a
> - * descendant of another without traversing the hierarchy.
> - */
> - int level;
> -
Note that this level may already be used in existing BPF programs (e.g.,
tools/testing/selftests/bpf/progs/task_ls_uptr.c). Do we need to consider compatibility here?
> /* Maximum allowed descent tree depth */
> int max_depth;
>
> @@ -625,13 +617,18 @@ struct cgroup {
> struct bpf_local_storage __rcu *bpf_cgrp_storage;
> #endif
>
> - /* All ancestors including self */
> union {
> struct {
> - void *_sentinel[0]; /* XXX to avoid 'flexible array member in a struct with no named members' */
> - struct cgroup *ancestors[];
> + int nr_ancestors; /* do not use directly but via cgroup_level() */
> + /*
> + * All ancestors including self.
> + * ancestors[] can determine whether a given cgroup is a
> + * descendant of another without traversing the hierarchy.
> + */
> + struct cgroup *ancestors[] __counted_by(nr_ancestors);
> };
> struct {
> + int _nr_ancestors; /* auxiliary padding, see nr_ancestors above */
> struct cgroup *_root_ancestor;
> struct cgroup *_low_ancestors[];
> };
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 0290878ebad26..45f720b9ecedd 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -534,7 +534,7 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
> */
> static inline int cgroup_level(struct cgroup *cgrp)
> {
> - return cgrp->level;
> + return cgrp->nr_ancestors - 1;
> }
>
> /**
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index e011f1dd6d87f..5110d3e13d125 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2197,6 +2197,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
> }
> root_cgrp->kn = kernfs_root_to_node(root->kf_root);
> WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
> + root_cgrp->nr_ancestors = 1; /* stored in _root_ancestor */
> root_cgrp->ancestors[0] = root_cgrp;
>
> ret = css_populate_dir(&root_cgrp->self);
> @@ -5869,7 +5870,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
>
> cgrp->self.parent = &parent->self;
> cgrp->root = root;
> - cgrp->level = level;
> + cgrp->nr_ancestors = parent->nr_ancestors + 1;
>
> /*
> * Now that init_cgroup_housekeeping() has been called and cgrp->self
--
Best regards,
Ridong
On Thu, Dec 18, 2025 at 03:09:32PM +0800, Chen Ridong wrote:
> Note that this level may already be used in existing BPF programs (e.g.,
> tools/testing/selftests/bpf/progs/task_ls_uptr.c). Do we need to consider compatibility here?
That's a good point. Is __counted_by instrumentation tied to some compiler
flag? If so, might as well make it an optional extra field specifically for
the annotation rather than changing the meaning of an existing field.
Thanks.
--
tejun
On December 19, 2025 1:09:42 AM GMT+09:00, Tejun Heo <tj@kernel.org> wrote:
>On Thu, Dec 18, 2025 at 03:09:32PM +0800, Chen Ridong wrote:
>> Note that this level may already be used in existing BPF programs (e.g.,
>> tools/testing/selftests/bpf/progs/task_ls_uptr.c). Do we need to consider compatibility here?
>
>That's a good point. Is __counted_by instrumentation tied to some compiler
>flag? If so, might as well make it an optional extra field specifically for
>the annotation rather than changing the meaning of an existing field.
>
>Thanks.
>
CONFIG_FORTIFY_SOURCE and CONFIG_UBSAN_BOUNDS use the information for instrumentation.
--
Kees Cook
On Thu, Dec 18, 2025 at 06:09:42AM -1000, Tejun Heo <tj@kernel.org> wrote:
> On Thu, Dec 18, 2025 at 03:09:32PM +0800, Chen Ridong wrote:
> > Note that this level may already be used in existing BPF programs (e.g.,
> > tools/testing/selftests/bpf/progs/task_ls_uptr.c). Do we need to consider compatibility here?
>
> That's a good point.
I wouldn't be concerned about this particular aspect. The commit
e6ac2450d6dee ("bpf: Support bpf program calling kernel function")
excludes ABIs, and the example program uses ksyms (not kfuncs), so
Documentation/process/stable-api-nonsense.rst could even apply here.
OTOH, the semantics of level are unchanged for BPF helpers (which are the
official API).
> Is __counted_by instrumentation tied to some compiler flag? If so,
> might as well make it an optional extra field specifically for the
> annotation rather than changing the meaning of an existing field.
Honestly, I can see benefit mainly in the first patch of the series
(posted the rest for discussion).
I'd like to ask Gustavo whether __counted_by here buys us anything or
whether it's more useful in other parts of kernel (e.g. flexible
allocations in networking code with outer sources of data).
Thanks,
Michal
On 12/19/25 01:32, Michal Koutný wrote:
> On Thu, Dec 18, 2025 at 06:09:42AM -1000, Tejun Heo <tj@kernel.org> wrote:
>> On Thu, Dec 18, 2025 at 03:09:32PM +0800, Chen Ridong wrote:
>>> Note that this level may already be used in existing BPF programs (e.g.,
>>> tools/testing/selftests/bpf/progs/task_ls_uptr.c). Do we need to consider compatibility here?
>>
>> That's a good point.
>
> I wouldn't be concerned about this particular aspect. The commit
> e6ac2450d6dee ("bpf: Support bpf program calling kernel function")
> excludes ABIs, the example program uses ksyms (not kfuncs), so there
> could even apply Documentation/process/stable-api-nonsense.rst.
> OTOH, the semantics of level is unchanged for BPF helpers (that are the
> official API).
>
>
>> Is __counted_by instrumentation tied to some compiler flag? If so,
>> might as well make it an optional extra field specifically for the
>> annotation rather than changing the meaning of an existing field.
>
> Honestly, I can see benefit mainly in the first patch of the series
> (posted the rest for discussion).
>
> I'd like to ask Gustavo whether __counted_by here buys us anything or
> whether it's more useful in other parts of kernel (e.g. flexible
> allocations in networking code with outer sources of data).
Ideally, all structures containing a flexible-array member (FAM) should
be annotated. However, if this is too much of a hassle right now, I'd
say the priority is to first avoid the -Wflex-array-member-not-at-end
warnings.
Thanks
-Gustavo
© 2016 - 2026 Red Hat, Inc.