Honor cgroup namespace when resolving cgroup id

[PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Michal Koutný 3 years, 7 months ago

The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a
cgroup namespace level, which may be surprising within a non-init cgroup
namespace.

Introduce and use a new cgroup_parent_ns() helper that stops according
to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use
the cgroup namespace of the iterator caller, not that one of the creator
(might be different, the former is relevant).

Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter")
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/cgroup.h   |  3 +++
 kernel/bpf/cgroup_iter.c |  9 ++++++---
 kernel/cgroup/cgroup.c   | 32 +++++++++++++++++++++++---------
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b6a9528374a8..b63a80e03fae 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -858,6 +858,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 		   struct cgroup_namespace *ns);
 
+struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
+				struct cgroup_namespace *ns);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index c69bce2f4403..06ee4a0c5870 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -104,6 +104,7 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
 	struct cgroup_iter_priv *p = seq->private;
+	struct cgroup *parent;
 
 	++*pos;
 	if (p->terminate)
@@ -113,9 +114,11 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return css_next_descendant_pre(curr, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
 		return css_next_descendant_post(curr, p->start_css);
-	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
-		return curr->parent;
-	else  /* BPF_CGROUP_ITER_SELF_ONLY */
+	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) {
+		parent = cgroup_parent_ns(curr->cgroup,
+					  current->nsproxy->cgroup_ns);
+		return parent ? &parent->self : NULL;
+	} else  /* BPF_CGROUP_ITER_SELF_ONLY */
 		return NULL;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c0377726031f..d60b5dfbbbc9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1417,11 +1417,11 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
 }
 
 /*
- * look up cgroup associated with current task's cgroup namespace on the
+ * look up cgroup associated with given cgroup namespace on the
  * specified hierarchy
  */
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root,
+					    struct cgroup_namespace *ns)
 {
 	struct cgroup *res = NULL;
 	struct css_set *cset;
@@ -1430,7 +1430,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
 
 	rcu_read_lock();
 
-	cset = current->nsproxy->cgroup_ns->root_cset;
+	cset = ns->root_cset;
 	res = __cset_cgroup_from_root(cset, root);
 
 	rcu_read_unlock();
@@ -1852,15 +1852,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	int len = 0;
 	char *buf = NULL;
 	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
-	struct cgroup *ns_cgroup;
+	struct cgroup *root_cgroup;
 
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
 	spin_lock_irq(&css_set_lock);
-	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
-	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+	root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns);
+	len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX);
 	spin_unlock_irq(&css_set_lock);
 
 	if (len >= PATH_MAX)
@@ -2330,6 +2330,18 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 }
 EXPORT_SYMBOL_GPL(cgroup_path_ns);
 
+struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
+				   struct cgroup_namespace *ns)
+{
+	struct cgroup *root_cgrp;
+
+	spin_lock_irq(&css_set_lock);
+	root_cgrp = cgns_cgroup_from_root(cgrp->root, ns);
+	spin_unlock_irq(&css_set_lock);
+
+	return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp);
+}
+
 /**
  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
  * @task: target task
@@ -6031,7 +6043,8 @@ struct cgroup *cgroup_get_from_id(u64 id)
 		goto out;
 
 	spin_lock_irq(&css_set_lock);
-	root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+	root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
+					  current->nsproxy->cgroup_ns);
 	spin_unlock_irq(&css_set_lock);
 	if (!cgroup_is_descendant(cgrp, root_cgrp)) {
 		cgroup_put(cgrp);
@@ -6612,7 +6625,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
 	struct cgroup *root_cgrp;
 
 	spin_lock_irq(&css_set_lock);
-	root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+	root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
+					  current->nsproxy->cgroup_ns);
 	kn = kernfs_walk_and_get(root_cgrp->kn, path);
 	spin_unlock_irq(&css_set_lock);
 	if (!kn)
-- 
2.37.0

Re: [PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Yosry Ahmed 3 years, 7 months ago

Hi there!

Thanks for following up with this series!

On Fri, Aug 26, 2022 at 9:53 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a
> cgroup namespace level, which may be surprising within a non-init cgroup
> namespace.
>
> Introduce and use a new cgroup_parent_ns() helper that stops according
> to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use
> the cgroup namespace of the iterator caller, not that one of the creator
> (might be different, the former is relevant).
>
> Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter")
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
>  include/linux/cgroup.h   |  3 +++
>  kernel/bpf/cgroup_iter.c |  9 ++++++---
>  kernel/cgroup/cgroup.c   | 32 +++++++++++++++++++++++---------
>  3 files changed, 32 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b6a9528374a8..b63a80e03fae 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -858,6 +858,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
>  int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
>                    struct cgroup_namespace *ns);
>
> +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
> +                               struct cgroup_namespace *ns);
> +
>  #else /* !CONFIG_CGROUPS */
>
>  static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> index c69bce2f4403..06ee4a0c5870 100644
> --- a/kernel/bpf/cgroup_iter.c
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -104,6 +104,7 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  {
>         struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
>         struct cgroup_iter_priv *p = seq->private;
> +       struct cgroup *parent;
>
>         ++*pos;
>         if (p->terminate)
> @@ -113,9 +114,11 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>                 return css_next_descendant_pre(curr, p->start_css);
>         else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
>                 return css_next_descendant_post(curr, p->start_css);
> -       else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
> -               return curr->parent;
> -       else  /* BPF_CGROUP_ITER_SELF_ONLY */
> +       else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) {
> +               parent = cgroup_parent_ns(curr->cgroup,
> +                                         current->nsproxy->cgroup_ns);
> +               return parent ? &parent->self : NULL;
> +       } else  /* BPF_CGROUP_ITER_SELF_ONLY */
>                 return NULL;
>  }
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index c0377726031f..d60b5dfbbbc9 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1417,11 +1417,11 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
>  }
>
>  /*
> - * look up cgroup associated with current task's cgroup namespace on the
> + * look up cgroup associated with given cgroup namespace on the
>   * specified hierarchy
>   */
> -static struct cgroup *
> -current_cgns_cgroup_from_root(struct cgroup_root *root)
> +static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root,
> +                                           struct cgroup_namespace *ns)
>  {
>         struct cgroup *res = NULL;
>         struct css_set *cset;
> @@ -1430,7 +1430,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
>
>         rcu_read_lock();
>
> -       cset = current->nsproxy->cgroup_ns->root_cset;
> +       cset = ns->root_cset;
>         res = __cset_cgroup_from_root(cset, root);
>
>         rcu_read_unlock();
> @@ -1852,15 +1852,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
>         int len = 0;
>         char *buf = NULL;
>         struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
> -       struct cgroup *ns_cgroup;
> +       struct cgroup *root_cgroup;
>
>         buf = kmalloc(PATH_MAX, GFP_KERNEL);
>         if (!buf)
>                 return -ENOMEM;
>
>         spin_lock_irq(&css_set_lock);
> -       ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
> -       len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
> +       root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns);
> +       len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX);
>         spin_unlock_irq(&css_set_lock);
>
>         if (len >= PATH_MAX)
> @@ -2330,6 +2330,18 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
>  }
>  EXPORT_SYMBOL_GPL(cgroup_path_ns);
>
> +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
> +                                  struct cgroup_namespace *ns)
> +{
> +       struct cgroup *root_cgrp;
> +
> +       spin_lock_irq(&css_set_lock);
> +       root_cgrp = cgns_cgroup_from_root(cgrp->root, ns);
> +       spin_unlock_irq(&css_set_lock);
> +
> +       return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp);

I understand that currently cgroup_iter is the only user of this, but
for future use cases, is it safe to assume that cgrp will always be
inside ns? Would it be safer to do something like:

struct cgroup *parent = cgroup_parent(cgrp);

if (!parent)
    return NULL;

return cgroup_is_descendant(parent, root_cgrp) ? parent : NULL;


> +}
> +
>  /**
>   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
>   * @task: target task
> @@ -6031,7 +6043,8 @@ struct cgroup *cgroup_get_from_id(u64 id)
>                 goto out;
>
>         spin_lock_irq(&css_set_lock);
> -       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
> +       root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
> +                                         current->nsproxy->cgroup_ns);
>         spin_unlock_irq(&css_set_lock);
>         if (!cgroup_is_descendant(cgrp, root_cgrp)) {
>                 cgroup_put(cgrp);
> @@ -6612,7 +6625,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
>         struct cgroup *root_cgrp;
>
>         spin_lock_irq(&css_set_lock);
> -       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
> +       root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
> +                                         current->nsproxy->cgroup_ns);
>         kn = kernfs_walk_and_get(root_cgrp->kn, path);
>         spin_unlock_irq(&css_set_lock);
>         if (!kn)
> --
> 2.37.0
>

Re: [PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Michal Koutný 3 years, 7 months ago

On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote:
> I understand that currently cgroup_iter is the only user of this, but
> for future use cases, is it safe to assume that cgrp will always be
> inside ns? Would it be safer to do something like:

I preferred the simpler root_cgrp comparison to avoid pointer
arithmetics in cgroup_is_descendant. But I also made the assumption of
cgrp in ns.

Thanks, I'll likely adjust cgroup_path_ns to make it more robust for
an external cgrp.

I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
cgroup iterator, exposes it via bpffs and than a process B in a narrowed
cgroup ns (which excludes the origin cgroup) wants to traverse the
iterator, should it fail straight ahead (regardless of iter order)?
The alternative would be to allow self-dereference but prohibit any
iterator moves (regardless of order).

Thanks,
Michal

Re: [PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Yosry Ahmed 3 years, 7 months ago

On Mon, Aug 29, 2022 at 6:00 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote:
> > I understand that currently cgroup_iter is the only user of this, but
> > for future use cases, is it safe to assume that cgrp will always be
> > inside ns? Would it be safer to do something like:
>
> I preferred the simpler root_cgrp comparison to avoid pointer
> arithmetics in cgroup_is_descendant. But I also made the assumption of
> cgrp in ns.
>
> Thanks, I'll likely adjust cgroup_path_ns to make it more robust for
> an external cgrp.
>

Great, thanks!

>
> I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> cgroup ns (which excludes the origin cgroup) wants to traverse the
> iterator, should it fail straight ahead (regardless of iter order)?
> The alternative would be to allow self-dereference but prohibit any
> iterator moves (regardless of order).
>

imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
other opinions here.

>
> Thanks,
> Michal

Re: [PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Tejun Heo 3 years, 7 months ago

On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote:
> > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> > cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> > cgroup ns (which excludes the origin cgroup) wants to traverse the
> > iterator, should it fail straight ahead (regardless of iter order)?
> > The alternative would be to allow self-dereference but prohibit any
> > iterator moves (regardless of order).
> >
> 
> imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
> other opinions here.

Yeah, I'd prefer it to fail right away as that's simple and gives us the
most choices for the future.

Thanks.

-- 
tejun

Re: [PATCH 4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Posted by Hao Luo 3 years, 7 months ago

On Mon, Aug 29, 2022 at 10:49 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote:
> > > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> > > cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> > > cgroup ns (which excludes the origin cgroup) wants to traverse the
> > > iterator, should it fail straight ahead (regardless of iter order)?
> > > The alternative would be to allow self-dereference but prohibit any
> > > iterator moves (regardless of order).
> > >
> >
> > imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
> > other opinions here.
>
> Yeah, I'd prefer it to fail right away as that's simple and gives us the
> most choices for the future.
>

Thanks Michal for fixing the cgroup iter use case! I agree that
failing straight ahead is better. I don't envision a use case that
wants the alternative.

> Thanks.
>
> --
> tejun