include/linux/memcontrol.h | 18 ++++++++++++++++++ kernel/cgroup/cgroup.c | 22 ++++++++++++++++------ mm/memcontrol.c | 4 +--- 3 files changed, 35 insertions(+), 9 deletions(-)
This is first of patches announced here:
https://lore.kernel.org/all/62188f37-f816-08e9-cdd5-8df23131746d@openvz.org/
---
Usually, accounted allocations are charged to the memory cgroup of the
current process. In the case of cgroups this looks incorrect: a memory
cgroup can be created both from the host and from inside a container.
At the same time, it is intuitively expected that both operations
should lead to the same result.
This issue was addressed by Roman Gushchin in commit 3e38e0aaca9e
("mm: memcg: charge memcg percpu memory to the parent cgroup").
He adjusted the memcg for allocations made inside mem_cgroup_css_alloc().
However, now we want to enable accounting for some other cgroup-related
resources called from cgroup_mkdir. We would like to guarantee that
all new accounted allocation will be charged to the same memory cgroup.
This patch moves the memcg adjustment from mem_cgroup_css_alloc() to its
callers: cgroup_mkdir() and cgroup_apply_control(). In turn, this
requires creating a new proxy function, mem_cgroup_from_cgroup().
Signed-off-by: Vasily Averin <vvs@openvz.org>
---
include/linux/memcontrol.h | 18 ++++++++++++++++++
kernel/cgroup/cgroup.c | 22 ++++++++++++++++------
mm/memcontrol.c | 4 +---
3 files changed, 35 insertions(+), 9 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4d31ce55b1c0..342426d1edbf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1813,6 +1813,19 @@ static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
{
return memcg ? memcg : root_mem_cgroup;
}
+
+static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg = NULL;
+
+ css = cgroup_get_e_css(cgroup, &memory_cgrp_subsys);
+
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+
+ return mem_cgroup_or_root(memcg);
+}
#else
static inline bool mem_cgroup_kmem_disabled(void)
{
@@ -1878,6 +1891,11 @@ static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
{
return NULL;
}
+
+static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG_KMEM */
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ffaccd6373f1..544d93a8878f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3247,13 +3247,16 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
*/
static int cgroup_apply_control(struct cgroup *cgrp)
{
+ struct mem_cgroup *memcg, *old_memcg;
int ret;
cgroup_propagate_control(cgrp);
+ memcg = mem_cgroup_from_cgroup(cgrp);
+ old_memcg = set_active_memcg(memcg);
ret = cgroup_apply_control_enable(cgrp);
if (ret)
- return ret;
+ goto out_memcg;
/*
* At this point, cgroup_e_css_by_mask() results reflect the new csses
@@ -3261,10 +3264,11 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* css associations of all tasks in the subtree.
*/
ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
- return 0;
+out_memcg:
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ return ret;
}
/**
@@ -5532,6 +5536,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
+ struct mem_cgroup *memcg, *old_memcg;
int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
@@ -5547,10 +5552,12 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
goto out_unlock;
}
+ memcg = mem_cgroup_from_cgroup(parent);
+ old_memcg = set_active_memcg(memcg);
cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
- goto out_unlock;
+ goto out_memcg;
}
/*
@@ -5577,10 +5584,13 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
kernfs_activate(cgrp->kn);
ret = 0;
- goto out_unlock;
+ goto out_memcg;
out_destroy:
cgroup_destroy_locked(cgrp);
+out_memcg:
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..e170c64e66e2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5239,11 +5239,9 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg, *old_memcg;
+ struct mem_cgroup *memcg;
- old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
--
2.34.1
Hello. On Wed, Aug 17, 2022 at 10:42:40AM +0300, Vasily Averin <vvs@openvz.org> wrote: > However, now we want to enable accounting for some other cgroup-related > resources called from cgroup_mkdir. We would like to guarantee that > all new accounted allocation will be charged to the same memory cgroup. Here's my point -- the change in the referenced patch applied to memory controller hierarchies. This extension applies to any hierarchy that can create groups, namely, a hierarchy without memory controller too. There mem_cgroup_from_cgroup falls back to the root memcg (on a different hierarchy). If the purpose is to prevent unlimited creation of cgroup objects, the root memcg is by principle unlimited, so it's just for accounting. But I understand the purpose is to have everything under one roof, unless the object lifetime is not bound to that owning memcg. Should memory-less hierarchies be treated specially? > +static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup) [...] > + css = cgroup_get_e_css(cgroup, &memory_cgrp_subsys); > + > + if (css) > + memcg = container_of(css, struct mem_cgroup, css); Nit: mem_cgroup_from_css Regards, Michal
Hello, On Wed, Aug 17, 2022 at 11:17:28AM +0200, Michal Koutný wrote: > On Wed, Aug 17, 2022 at 10:42:40AM +0300, Vasily Averin <vvs@openvz.org> wrote: > > However, now we want to enable accounting for some other cgroup-related > > resources called from cgroup_mkdir. We would like to guarantee that > > all new accounted allocation will be charged to the same memory cgroup. > > Here's my point -- the change in the referenced patch applied to memory > controller hierarchies. This extension applies to any hierarchy that can > create groups, namely, a hierarchy without memory controller too. There > mem_cgroup_from_cgroup falls back to the root memcg (on a different > hierarchy). > > If the purpose is to prevent unlimited creation of cgroup objects, the > root memcg is by principle unlimited, so it's just for accounting. > > But I understand the purpose is to have everything under one roof, > unless the object lifetime is not bound to that owning memcg. Should > memory-less hierarchies be treated specially? At least from my POV, as long as cgroup1 is not being regressed, we want to make decisions which make the best long term sense. We surely can accommodate cgroup1 as long as the added complexity is minimal but the bar is pretty high there. cgroup1 has been in maintenance mode for years now and even the basic delegation model isn't well established in cgroup1, so if we end up accounting everything in the root cgroup for most of cgroup1 hierarchies, that sounds fine to me. Thanks. -- tejun
On 8/17/22 19:41, Tejun Heo wrote: > Hello, > > On Wed, Aug 17, 2022 at 11:17:28AM +0200, Michal Koutný wrote: >> On Wed, Aug 17, 2022 at 10:42:40AM +0300, Vasily Averin <vvs@openvz.org> wrote: >>> However, now we want to enable accounting for some other cgroup-related >>> resources called from cgroup_mkdir. We would like to guarantee that >>> all new accounted allocation will be charged to the same memory cgroup. >> >> Here's my point -- the change in the referenced patch applied to memory >> controller hierarchies. This extension applies to any hierarchy that can >> create groups, namely, a hierarchy without memory controller too. There >> mem_cgroup_from_cgroup falls back to the root memcg (on a different >> hierarchy). My goal was to properly account kernfs and simple_xattr entries only, however I missed that it does not work in cgroup1 case. >> If the purpose is to prevent unlimited creation of cgroup objects, the >> root memcg is by principle unlimited, so it's just for accounting. No, the goal is not to prevent unlimited creation of cgroup objects. As Michal Hocko pointed it can be done via cgroup.max.descendants limits. >> But I understand the purpose is to have everything under one roof, >> unless the object lifetime is not bound to that owning memcg. Should >> memory-less hierarchies be treated specially? > > At least from my POV, as long as cgroup1 is not being regressed, we want to > make decisions which make the best long term sense. We surely can > accommodate cgroup1 as long as the added complexity is minimal but the bar > is pretty high there. cgroup1 has been in maintenance mode for years now and > even the basic delegation model isn't well established in cgroup1, so if we > end up accounting everything in the root cgroup for most of cgroup1 > hierarchies, that sounds fine to me. I would like to properly handle cgroup1 case too. To do it we can enable accounting for new 'struct cgroup' objects, and bind them to memcg of creator task. 
Thank you, Vasily Averin
Hello, On Tue, Aug 23, 2022 at 03:04:31PM +0300, Vasily Averin wrote: > I would like to properly handle cgroup1 case too. > To do it we can enable accounting for new 'struct cgroup' objects, > and bind them to memcg of creator task. I'm not sure it'd be a good idea to introduce two different behaviors for handling the same thing. I'd just leave cgroup1 as-is. Thanks. -- tejun
On 8/17/22 12:17, Michal Koutný wrote: >> +static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup) > [...] >> + css = cgroup_get_e_css(cgroup, &memory_cgrp_subsys); >> + >> + if (css) >> + memcg = container_of(css, struct mem_cgroup, css); > Nit: mem_cgroup_from_css Yes, my fault, you are right. it was planned initially, but then I lost it somehow. Thank you very much! Vasily Averin
© 2016 - 2026 Red Hat, Inc.