[PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc

Joshua Hahn posted 8 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Joshua Hahn 1 month, 1 week ago
Now that zswap_entries do not directly track obj_cgroups of the entries,
handle the lifetime management and charging of these entries into the
zsmalloc layer.

One functional change is that zswap entries are now no longer accounted
by the size of the compressed object, but by the size of the size_class
slot they occupy.

This brings the charging one step closer to an accurate representation
of the memory consumed in the zpdesc; even if a compressed object
doesn't consume the entirety of an obj slot, the hole it creates between
the objects is dead space the obj is accountable for.

Thus, account the memory each object makes unusable, not the amount of
memory each object takes up.

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 include/linux/memcontrol.h | 10 -------
 mm/memcontrol.c            | 51 ----------------------------------
 mm/zsmalloc.c              | 57 ++++++++++++++++++++++++++++++++++++--
 mm/zswap.c                 |  8 ------
 4 files changed, 55 insertions(+), 71 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6c82c8f73e1..dd4278b1ca35 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1824,22 +1824,12 @@ static inline bool memcg_is_dying(struct mem_cgroup *memcg)
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
-void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
-void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
 #else
 static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
 {
 	return true;
 }
-static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
-					   size_t size)
-{
-}
-static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
-					     size_t size)
-{
-}
 static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
 {
 	/* if zswap is disabled, do not block pages going to the swapping device */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 007413a53b45..3432e1afc037 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5433,57 +5433,6 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
 	return ret;
 }
 
-/**
- * obj_cgroup_charge_zswap - charge compression backend memory
- * @objcg: the object cgroup
- * @size: size of compressed object
- *
- * This forces the charge after obj_cgroup_may_zswap() allowed
- * compression and storage in zswap for this cgroup to go ahead.
- */
-void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
-{
-	struct mem_cgroup *memcg;
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return;
-
-	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
-
-	/* PF_MEMALLOC context, charging must succeed */
-	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
-		VM_WARN_ON_ONCE(1);
-
-	rcu_read_lock();
-	memcg = obj_cgroup_memcg(objcg);
-	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
-	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
-	rcu_read_unlock();
-}
-
-/**
- * obj_cgroup_uncharge_zswap - uncharge compression backend memory
- * @objcg: the object cgroup
- * @size: size of compressed object
- *
- * Uncharges zswap memory on page in.
- */
-void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
-{
-	struct mem_cgroup *memcg;
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return;
-
-	obj_cgroup_uncharge(objcg, size);
-
-	rcu_read_lock();
-	memcg = obj_cgroup_memcg(objcg);
-	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
-	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
-	rcu_read_unlock();
-}
-
 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
 {
 	/* if zswap is disabled, do not block pages going to the swapping device */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 067215a6ddcc..88c7cd399261 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
 	return true;
 }
 
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+			    int size, unsigned long offset)
+{
+	struct mem_cgroup *memcg;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+	/* PF_MEMALLOC context, charging must succeed */
+	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+		VM_WARN_ON_ONCE(1);
+
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
+	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+	rcu_read_unlock();
+}
+
+static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+			      int size, unsigned long offset)
+{
+	struct mem_cgroup *memcg;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	obj_cgroup_uncharge(objcg, size);
+
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
+	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+	rcu_read_unlock();
+}
+
 static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
 			      int size)
 {
@@ -1018,6 +1056,12 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
 	return true;
 }
 
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+		            int size, unsigned long offset) {}
+
+static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+			      int size, unsigned long offset) {}
+
 static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
 			      int size) {}
 
@@ -1334,8 +1378,11 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
 	class = zspage_class(pool, zspage);
 	off = offset_in_page(class->size * obj_idx);
 
-	if (objcg)
+	if (objcg) {
+		obj_cgroup_get(objcg);
+		zs_charge_objcg(zpdesc, objcg, class->size, off);
 		zpdesc_set_obj_cgroup(zpdesc, obj_idx, class->size, objcg);
+	}
 
 	if (!ZsHugePage(zspage))
 		off += ZS_HANDLE_SIZE;
@@ -1501,6 +1548,7 @@ static void obj_free(int class_size, unsigned long obj)
 	struct link_free *link;
 	struct zspage *zspage;
 	struct zpdesc *f_zpdesc;
+	struct obj_cgroup *objcg;
 	unsigned long f_offset;
 	unsigned int f_objidx;
 	void *vaddr;
@@ -1510,7 +1558,12 @@ static void obj_free(int class_size, unsigned long obj)
 	f_offset = offset_in_page(class_size * f_objidx);
 	zspage = get_zspage(f_zpdesc);
 
-	zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL);
+	objcg = zpdesc_obj_cgroup(f_zpdesc, f_objidx, class_size);
+	if (objcg) {
+		zs_uncharge_objcg(f_zpdesc, objcg, class_size, f_offset);
+		obj_cgroup_put(objcg);
+		zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL);
+	}
 
 	vaddr = kmap_local_zpdesc(f_zpdesc);
 	link = (struct link_free *)(vaddr + f_offset);
diff --git a/mm/zswap.c b/mm/zswap.c
index 55161a5c9d4c..77d3c6516ed3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -711,10 +711,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
 	zs_free(entry->pool->zs_pool, entry->handle);
 	zswap_pool_put(entry->pool);
 
-	if (objcg) {
-		obj_cgroup_uncharge_zswap(objcg, entry->length);
-		obj_cgroup_put(objcg);
-	}
 	if (entry->length == PAGE_SIZE)
 		atomic_long_dec(&zswap_stored_incompressible_pages);
 	zswap_entry_cache_free(entry);
@@ -1437,10 +1433,6 @@ static bool zswap_store_page(struct page *page,
 	 * when the entry is removed from the tree.
 	 */
 	zswap_pool_get(pool);
-	if (objcg) {
-		obj_cgroup_get(objcg);
-		obj_cgroup_charge_zswap(objcg, entry->length);
-	}
 	atomic_long_inc(&zswap_stored_pages);
 	if (entry->length == PAGE_SIZE)
 		atomic_long_inc(&zswap_stored_incompressible_pages);
-- 
2.47.3
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Yosry Ahmed 1 month ago
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 067215a6ddcc..88c7cd399261 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
>         return true;
>  }
>
> +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> +                           int size, unsigned long offset)
> +{
> +       struct mem_cgroup *memcg;
> +
> +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +               return;
> +
> +       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> +
> +       /* PF_MEMALLOC context, charging must succeed */
> +       if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> +               VM_WARN_ON_ONCE(1);
> +
> +       rcu_read_lock();
> +       memcg = obj_cgroup_memcg(objcg);
> +       mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> +       mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);

Zsmalloc should not be updating zswap stats (e.g. in case zram starts
supporting memcg charging).  How about moving the stat updates to
zswap?
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Joshua Hahn 1 month ago
On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:

> > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > index 067215a6ddcc..88c7cd399261 100644
> > --- a/mm/zsmalloc.c
> > +++ b/mm/zsmalloc.c
> > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> >         return true;
> >  }
> >
> > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > +                           int size, unsigned long offset)
> > +{
> > +       struct mem_cgroup *memcg;
> > +
> > +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > +               return;
> > +
> > +       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > +
> > +       /* PF_MEMALLOC context, charging must succeed */
> > +       if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > +               VM_WARN_ON_ONCE(1);
> > +
> > +       rcu_read_lock();
> > +       memcg = obj_cgroup_memcg(objcg);
> > +       mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > +       mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);

Hello Yosry, I hope you are doing well!
Thank you for your feedback : -)

> Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> supporting memcg charging).  How about moving the stat updates to
> zswap?

Yeah... I think this was also a big point of concern for me. While reading
the code, I was really amazed by how clean the logical divide between
zsmalloc and zswap / zram were, and I wanted to preserve it as much as
possible.

There are a few problems, though. Probably the biggest is that migration
of zpdescs and compressed objects within them are invisible to zswap.
Of course, this is by design, but this leads to two problems.

zswap's ignorance of compressed objects' movements across physical nodes
makes it impossible to accurately charge and uncharge from the correct
memcg-lruvec.

Conversely, zsmalloc's ignorance of memcg association makes it impossible
to correctly restrict cpusets.mems during migration.

So the clean logical divide makes a lot of sense for separating the
high-level cgroup association, compression, etc. from the physical
location of the memory and migration / zpdesc compaction, but it would
appear that this comes at a cost of oversimplifying the logic and missing
out on accurate memory charging and a unified source of truth for the
counters.

The last thing I wanted to note was that I agree that zsmalloc doing
explicit zswap stat updates feels a bit awkward. The reason I chose to do
this right now is because when enlightening zsmalloc about the compressed
objs' objcgs, zswap is the only one that does this memory accounting.
So having an objcg is a bit of a proxy to understand that the consumer
is zswap (as opposed to zram). Of course, if zram starts to do memcg
accounting as well, we'll have to start doing some other checks to
see if the compressed object should be accounted as zram or zswap.

OK. That's all the defense I have for my design : -) Now for thinking
about other designs:

I also explored whether it makes sense to make zsmalloc call a hook into
zswap code during and after migrations. The problem is that there isn't
a good way to do the compressed object --> zswap entry lookup, and this
still doesn't solve the issue of zsmalloc migrating compressed objects
without checking whether that object can live on another node.

Maybe one possible approach is to turn the array of objcgs into an array
of backpointers from compressed objects to their corresponding zswap_entries?
One concern is that this does add 8 bytes of additional overhead per 
zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
on whether there's a creative way to save some memory here, though...

Of course the other concern is what this will look like for zram users.
I guess it can be done similarly to what is done here, and only allocate
the array of pointers when called in from zswap.

Anyways, thank you for bringing this up. What do you think about the
options we have here? I hope that I've motivated why we want
per-memcg-lruvec accounting as well. Please let me know if there is anything
I can provide additional context for : -)

Have a great day!
Joshua
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Yosry Ahmed 1 month ago
On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
>
> > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > index 067215a6ddcc..88c7cd399261 100644
> > > --- a/mm/zsmalloc.c
> > > +++ b/mm/zsmalloc.c
> > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > >         return true;
> > >  }
> > >
> > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > +                           int size, unsigned long offset)
> > > +{
> > > +       struct mem_cgroup *memcg;
> > > +
> > > +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > +               return;
> > > +
> > > +       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > +
> > > +       /* PF_MEMALLOC context, charging must succeed */
> > > +       if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > +               VM_WARN_ON_ONCE(1);
> > > +
> > > +       rcu_read_lock();
> > > +       memcg = obj_cgroup_memcg(objcg);
> > > +       mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > +       mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
>
> Hello Yosry, I hope you are doing well!
> Thank you for your feedback : -)
>
> > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > supporting memcg charging).  How about moving the stat updates to
> > zswap?
>
> Yeah... I think this was also a big point of concern for me. While reading
> the code, I was really amazed by how clean the logical divide between
> zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> possible.
>
> There are a few problems, though. Probably the biggest is that migration
> of zpdescs and compressed objects within them are invisible to zswap.
> Of course, this is by design, but this leads to two problems.
>
> zswap's ignorance of compressed objects' movements across physical nodes
> makes it impossible to accurately charge and uncharge from the correct
> memcg-lruvec.
>
> Conversely, zsmalloc's ignorance of memcg association makes it impossible
> to correctly restrict cpusets.mems during migration.
>
> So the clean logical divide makes a lot of sense for separating the
> high-level cgroup association, compression, etc. from the physical
> location of the memory and migration / zpdesc compaction, but it would
> appear that this comes at a cost of oversimplifying the logic and missing
> out on accurate memory charging and a unified source of truth for the
> counters.
>
> The last thing I wanted to note was that I agree that zsmalloc doing
> explicit zswap stat updates feels a bit awkward. The reason I chose to do
> this right now is because when enlightening zsmalloc about the compressed
> objs' objcgs, zswap is the only one that does this memory accounting.
> So having an objcg is a bit of a proxy to understand that the consumer
> is zswap (as opposed to zram). Of course, if zram starts to do memcg
> accounting as well, we'll have to start doing some other checks to
> see if the compresed object should be accounted as zram or zswap.
>
> OK. That's all the defense I have for my design : -) Now for thinking
> about other designs:
>
> I also explored whether it makes sense to make zsmalloc call a hook into
> zswap code during and after migrations. The problem is that there isn't
> a good way to do the compressed object --> zswap entry lookup, and this
> still doesn't solve the issue of zsmalloc migrating compressed objects
> without checking whether that object can live on another node.
>
> Maybe one possible approach is to turn the array of objcgs into an array
> of backpointers from compressed objects to their corresponding zswap_entries?
> One concern is that this does add 8 bytes of additional overhead per
> zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> on whether there's a creative way to save some memory here, though...
>
> Of course the other concern is what this will look like for zram users.
> I guess it can be done similarly to what is done here, and only allocate
> the array of pointers when called in from zswap.
>
> Anyways, thank you for bringing this up. What do you think about the
> options we have here? I hope that I've motivated why we want
> per-memcg-lruvec accounting as well. Please let me know if there is anything
> I can provide additional context for : -)

Thanks for the detailed elaboration.

AFAICT the only zswap-specific part is the actual stat indexes, what
if these are parameterized at the zsmalloc pool level? AFAICT zswap
and zram will never share a pool.
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Nhat Pham 1 month ago
On Wed, Mar 4, 2026 at 7:47 AM Yosry Ahmed <yosry@kernel.org> wrote:
>
> On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> >
> > On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > > index 067215a6ddcc..88c7cd399261 100644
> > > > --- a/mm/zsmalloc.c
> > > > +++ b/mm/zsmalloc.c
> > > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > > >         return true;
> > > >  }
> > > >
> > > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > > +                           int size, unsigned long offset)
> > > > +{
> > > > +       struct mem_cgroup *memcg;
> > > > +
> > > > +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > +               return;
> > > > +
> > > > +       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > > +
> > > > +       /* PF_MEMALLOC context, charging must succeed */
> > > > +       if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > > +               VM_WARN_ON_ONCE(1);
> > > > +
> > > > +       rcu_read_lock();
> > > > +       memcg = obj_cgroup_memcg(objcg);
> > > > +       mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > > +       mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
> >
> > Hello Yosry, I hope you are doing well!
> > Thank you for your feedback : -)
> >
> > > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > > supporting memcg charging).  How about moving the stat updates to
> > > zswap?
> >
> > Yeah... I think this was also a big point of concern for me. While reading
> > the code, I was really amazed by how clean the logical divide between
> > zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> > possible.
> >
> > There are a few problems, though. Probably the biggest is that migration
> > of zpdescs and compressed objects within them are invisible to zswap.
> > Of course, this is by design, but this leads to two problems.
> >
> > zswap's ignorance of compressed objects' movements across physical nodes
> > makes it impossible to accurately charge and uncharge from the correct
> > memcg-lruvec.
> >
> > Conversely, zsmalloc's ignorance of memcg association makes it impossible
> > to correctly restrict cpusets.mems during migration.
> >
> > So the clean logical divide makes a lot of sense for separating the
> > high-level cgroup association, compression, etc. from the physical
> > location of the memory and migration / zpdesc compaction, but it would
> > appear that this comes at a cost of oversimplifying the logic and missing
> > out on accurate memory charging and a unified source of truth for the
> > counters.
> >
> > The last thing I wanted to note was that I agree that zsmalloc doing
> > explicit zswap stat updates feels a bit awkward. The reason I chose to do
> > this right now is because when enlightening zsmalloc about the compressed
> > objs' objcgs, zswap is the only one that does this memory accounting.
> > So having an objcg is a bit of a proxy to understand that the consumer
> > is zswap (as opposed to zram). Of course, if zram starts to do memcg
> > accounting as well, we'll have to start doing some other checks to
> > see if the compresed object should be accounted as zram or zswap.
> >
> > OK. That's all the defense I have for my design : -) Now for thinking
> > about other designs:
> >
> > I also explored whether it makes sense to make zsmalloc call a hook into
> > zswap code during and after migrations. The problem is that there isn't
> > a good way to do the compressed object --> zswap entry lookup, and this
> > still doesn't solve the issue of zsmalloc migrating compressed objects
> > without checking whether that object can live on another node.
> >
> > Maybe one possible approach is to turn the array of objcgs into an array
> > of backpointers from compressed objects to their corresponding zswap_entries?
> > One concern is that this does add 8 bytes of additional overhead per
> > zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> > on whether there's a creative way to save some memory here, though...
> >
> > Of course the other concern is what this will look like for zram users.
> > I guess it can be done similarly to what is done here, and only allocate
> > the array of pointers when called in from zswap.
> >
> > Anyways, thank you for bringing this up. What do you think about the
> > options we have here? I hope that I've motivated why we want
> > per-memcg-lruvec accounting as well. Please let me know if there is anything
> > I can provide additional context for : -)
>
> Thanks for the detailed elaboration.
>
> AFAICT the only zswap-specific part is the actual stat indexes, what
> if these are parameterized at the zsmalloc pool level? AFAICT zswap
> and zram will never share a pool.

TBH, if we were to start from scratch, these should be zsmalloc
counters not zswap counters. Only zsmalloc knows about the memory
placement and real memory consumption (i.e taking into account
intra-slot wasted space) - this information is abstracted away from
all of the callers. And if/when zram supports cgroup tracking, memory
used by zswap and memory used by zram is indistinguishable, no?

Anyway, Joshua, do you think this is doable? Seems promising to me,
but idk if it will be clean to implement or not.
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Yosry Ahmed 1 month ago
> > AFAICT the only zswap-specific part is the actual stat indexes, what
> > if these are parameterized at the zsmalloc pool level? AFAICT zswap
> > and zram will never share a pool.
>
> TBH, if we were to start from scratch, these should be zsmalloc
> counters not zswap counters. Only zsmalloc knows about the memory
> placement and real memory consumption (i.e taking into account
> intra-slot wasted space) - this information is abstracted away from
> all of the callers.

I agree, but we cannot change the zswap stats now that we added them.
Keep in mind that when they were added zsmalloc was not the only
backend.

> And if/when zram supports cgroup tracking, memory
> used by zswap and memory used by zram is indistinguishable, no?

It is distinguishable as long as they use different zsmalloc pools, I
don't see why we'd need to share a pool.

> Anyway, Joshua, do you think this is doable? Seems promising to me,
> but idk if it will be clean to implement or not.

Not sure what you mean here? Changing the stats to be zsmalloc-based?
IIUC we can't do this without breaking userspace.
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Nhat Pham 1 month ago
On Wed, Mar 4, 2026 at 8:46 AM Yosry Ahmed <yosry@kernel.org> wrote:
>
> > > AFAICT the only zswap-specific part is the actual stat indexes, what
> > > if these are parameterized at the zsmalloc pool level? AFAICT zswap
> > > and zram will never share a pool.
> >
> > TBH, if we were to start from scratch, these should be zsmalloc
> > counters not zswap counters. Only zsmalloc knows about the memory
> > placement and real memory consumption (i.e taking into account
> > intra-slot wasted space) - this information is abstracted away from
> > all of the callers.
>
> I agree, but we cannot change the zswap stats now that we added them.
> Keep in mind that when they were added zsmalloc was not the only
> backend.
>
> > And if/when zram supports cgroup tracking, memory
> > used by zswap and memory used by zram is indistinguishable, no?
>
> It is distinguishable as long as they use different zsmalloc pools, I
> don't see why we'd need to share a pool.
>
> > Anyway, Joshua, do you think this is doable? Seems promising to me,
> > but idk if it will be clean to implement or not.
>
> Not sure what you mean here? Changing the stats to be zsmalloc-based?
> IIUC we can't do this without breaking userspace.

No I meant your proposal haha. Sorry for being confusing - probably
need more caffeine :)
Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
Posted by Joshua Hahn 1 month ago
On Wed, 4 Mar 2026 07:46:48 -0800 Yosry Ahmed <yosry@kernel.org> wrote:

> On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> >
> > On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > > index 067215a6ddcc..88c7cd399261 100644
> > > > --- a/mm/zsmalloc.c
> > > > +++ b/mm/zsmalloc.c
> > > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > > >         return true;
> > > >  }
> > > >
> > > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > > +                           int size, unsigned long offset)
> > > > +{
> > > > +       struct mem_cgroup *memcg;
> > > > +
> > > > +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > +               return;
> > > > +
> > > > +       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > > +
> > > > +       /* PF_MEMALLOC context, charging must succeed */
> > > > +       if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > > +               VM_WARN_ON_ONCE(1);
> > > > +
> > > > +       rcu_read_lock();
> > > > +       memcg = obj_cgroup_memcg(objcg);
> > > > +       mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > > +       mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
> >
> > Hello Yosry, I hope you are doing well!
> > Thank you for your feedback : -)
> >
> > > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > > supporting memcg charging).  How about moving the stat updates to
> > > zswap?
> >
> > Yeah... I think this was also a big point of concern for me. While reading
> > the code, I was really amazed by how clean the logical divide between
> > zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> > possible.
> >
> > There are a few problems, though. Probably the biggest is that migration
> > of zpdescs and compressed objects within them are invisible to zswap.
> > Of course, this is by design, but this leads to two problems.
> >
> > zswap's ignorance of compressed objects' movements across physical nodes
> > makes it impossible to accurately charge and uncharge from the correct
> > memcg-lruvec.
> >
> > Conversely, zsmalloc's ignorance of memcg association makes it impossible
> > to correctly restrict cpusets.mems during migration.
> >
> > So the clean logical divide makes a lot of sense for separating the
> > high-level cgroup association, compression, etc. from the physical
> > location of the memory and migration / zpdesc compaction, but it would
> > appear that this comes at a cost of oversimplifying the logic and missing
> > out on accurate memory charging and a unified source of truth for the
> > counters.
> >
> > The last thing I wanted to note was that I agree that zsmalloc doing
> > explicit zswap stat updates feels a bit awkward. The reason I chose to do
> > this right now is because when enlightening zsmalloc about the compressed
> > objs' objcgs, zswap is the only one that does this memory accounting.
> > So having an objcg is a bit of a proxy to understand that the consumer
> > is zswap (as opposed to zram). Of course, if zram starts to do memcg
> > accounting as well, we'll have to start doing some other checks to
> > see if the compresed object should be accounted as zram or zswap.
> >
> > OK. That's all the defense I have for my design : -) Now for thinking
> > about other designs:
> >
> > I also explored whether it makes sense to make zsmalloc call a hook into
> > zswap code during and after migrations. The problem is that there isn't
> > a good way to do the compressed object --> zswap entry lookup, and this
> > still doesn't solve the issue of zsmalloc migrating compressed objects
> > without checking whether that object can live on another node.
> >
> > Maybe one possible approach is to turn the array of objcgs into an array
> > of backpointers from compressed objects to their corresponding zswap_entries?
> > One concern is that this does add 8 bytes of additional overhead per
> > zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> > on whether there's a creative way to save some memory here, though...
> >
> > Of course the other concern is what this will look like for zram users.
> > I guess it can be done similarly to what is done here, and only allocate
> > the array of pointers when called in from zswap.
> >
> > Anyways, thank you for bringing this up. What do you think about the
> > options we have here? I hope that I've motivated why we want
> > per-memcg-lruvec accounting as well. Please let me know if there is anything
> > I can provide additional context for : -)
> 
> Thanks for the detailed elaboration.
> 
> AFAICT the only zswap-specific part is the actual stat indexes, what
> if these are parameterized at the zsmalloc pool level? AFAICT zswap
> and zram will never share a pool.

That's a great idea, we can abstract the ZSWAP and ZSWAPPED idxs as
"compressed" and "uncompressed" and leave the flexibility for zram
to do similar accounting in the future if they wish to.

Thanks for the suggestion, Yosry. I'll include this in the v2 and
send it out! I hope you have a great day!!
Joshua