[RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration

Gregory Price posted 8 patches 1 month ago
[RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 1 month ago
If a private zswap-node is available, skip the entire software
compression process and memcpy directly to a compressed memory
folio, and store the newly allocated compressed memory page as
the zswap entry->handle.

On decompress we do the opposite: copy directly from the stored
page to the destination, and free the compressed memory page.

The driver callback is responsible for preventing run-away
compression ratio failures by checking that the allocated page is
safe to use (i.e. a compression ratio limit hasn't been crossed).

Signed-off-by: Gregory Price <gourry@gourry.net>
---
 include/linux/zswap.h |   5 ++
 mm/zswap.c            | 106 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 30c193a1207e..4b52fe447e7e 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -35,6 +35,8 @@ void zswap_lruvec_state_init(struct lruvec *lruvec);
 void zswap_folio_swapin(struct folio *folio);
 bool zswap_is_enabled(void);
 bool zswap_never_enabled(void);
+void zswap_add_direct_node(int nid);
+void zswap_remove_direct_node(int nid);
 #else
 
 struct zswap_lruvec_state {};
@@ -69,6 +71,9 @@ static inline bool zswap_never_enabled(void)
 	return true;
 }
 
+static inline void zswap_add_direct_node(int nid) {}
+static inline void zswap_remove_direct_node(int nid) {}
+
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
diff --git a/mm/zswap.c b/mm/zswap.c
index de8858ff1521..aada588c957e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -35,6 +35,7 @@
 #include <linux/workqueue.h>
 #include <linux/list_lru.h>
 #include <linux/zsmalloc.h>
+#include <linux/node.h>
 
 #include "swap.h"
 #include "internal.h"
@@ -190,6 +191,7 @@ struct zswap_entry {
 	swp_entry_t swpentry;
 	unsigned int length;
 	bool referenced;
+	bool direct;
 	struct zswap_pool *pool;
 	unsigned long handle;
 	struct obj_cgroup *objcg;
@@ -199,6 +201,20 @@ struct zswap_entry {
 static struct xarray *zswap_trees[MAX_SWAPFILES];
 static unsigned int nr_zswap_trees[MAX_SWAPFILES];
 
+/* Nodemask for compressed RAM nodes used by zswap_compress_direct */
+static nodemask_t zswap_direct_nodes = NODE_MASK_NONE;
+
+void zswap_add_direct_node(int nid)
+{
+	node_set(nid, zswap_direct_nodes);
+}
+
+void zswap_remove_direct_node(int nid)
+{
+	if (!node_online(nid))
+		node_clear(nid, zswap_direct_nodes);
+}
+
 /* RCU-protected iteration */
 static LIST_HEAD(zswap_pools);
 /* protects zswap_pools list modification */
@@ -716,7 +732,13 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 static void zswap_entry_free(struct zswap_entry *entry)
 {
 	zswap_lru_del(&zswap_list_lru, entry);
-	zs_free(entry->pool->zs_pool, entry->handle);
+	if (entry->direct) {
+		struct page *page = (struct page *)entry->handle;
+
+		node_private_freed(page);
+		__free_page(page);
+	} else
+		zs_free(entry->pool->zs_pool, entry->handle);
 	zswap_pool_put(entry->pool);
 	if (entry->objcg) {
 		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
@@ -849,6 +871,58 @@ static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
 	mutex_unlock(&acomp_ctx->mutex);
 }
 
+static struct page *zswap_compress_direct(struct page *src,
+					  struct zswap_entry *entry)
+{
+	int nid;
+	struct page *dst;
+	gfp_t gfp;
+	nodemask_t tried_nodes = NODE_MASK_NONE;
+
+	if (nodes_empty(zswap_direct_nodes))
+		return NULL;
+
+	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE |
+	      __GFP_THISNODE;
+
+	for_each_node_mask(nid, zswap_direct_nodes) {
+		int ret;
+
+		/* Skip nodes we've already tried and failed */
+		if (node_isset(nid, tried_nodes))
+			continue;
+
+		dst = __alloc_pages(gfp, 0, nid, &zswap_direct_nodes);
+		if (!dst)
+			continue;
+
+		/*
+		 * Check with the device driver that this page is safe to use.
+		 * If the device reports an error (e.g., compression ratio is
+		 * too low and the page can't safely store data), free the page
+		 * and try another node.
+		 */
+		ret = node_private_allocated(dst);
+		if (ret) {
+			__free_page(dst);
+			node_set(nid, tried_nodes);
+			continue;
+		}
+
+		goto found;
+	}
+
+	return NULL;
+
+found:
+	/* If we fail to copy at this point just fallback */
+	if (copy_mc_highpage(dst, src)) {
+		__free_page(dst);
+		dst = NULL;
+	}
+	return dst;
+}
+
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -860,6 +934,17 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	gfp_t gfp;
 	u8 *dst;
 	bool mapped = false;
+	struct page *zpage;
+
+	/* Try to shunt directly to compressed ram */
+	zpage = zswap_compress_direct(page, entry);
+	if (zpage) {
+		entry->handle = (unsigned long)zpage;
+		entry->length = PAGE_SIZE;
+		entry->direct = true;
+		return true;
+	}
+	/* otherwise fallback to normal zswap */
 
 	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
 	dst = acomp_ctx->buffer;
@@ -913,6 +998,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	zs_obj_write(pool->zs_pool, handle, dst, dlen);
 	entry->handle = handle;
 	entry->length = dlen;
+	entry->direct = false;
 
 unlock:
 	if (mapped)
@@ -936,6 +1022,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	int decomp_ret = 0, dlen = PAGE_SIZE;
 	u8 *src, *obj;
 
+	/* compressed ram page */
+	if (entry->direct) {
+		struct page *src = (struct page *)entry->handle;
+		struct folio *zfolio = page_folio(src);
+
+		memcpy_folio(folio, 0, zfolio, 0, PAGE_SIZE);
+		goto direct_done;
+	}
+
 	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
 	obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
 
@@ -969,6 +1064,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	zs_obj_read_end(pool->zs_pool, entry->handle, obj);
 	acomp_ctx_put_unlock(acomp_ctx);
 
+direct_done:
 	if (!decomp_ret && dlen == PAGE_SIZE)
 		return true;
 
@@ -1483,7 +1579,13 @@ static bool zswap_store_page(struct page *page,
 	return true;
 
 store_failed:
-	zs_free(pool->zs_pool, entry->handle);
+	if (entry->direct) {
+		struct page *freepage = (struct page *)entry->handle;
+
+		node_private_freed(freepage);
+		__free_page(freepage);
+	} else
+		zs_free(pool->zs_pool, entry->handle);
 compress_failed:
 	zswap_entry_cache_free(entry);
 	return false;
-- 
2.52.0
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Yosry Ahmed 1 month ago
On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> If a private zswap-node is available, skip the entire software
> compression process and memcpy directly to a compressed memory
> folio, and store the newly allocated compressed memory page as
> the zswap entry->handle.
> 
> On decompress we do the opposite: copy directly from the stored
> page to the destination, and free the compressed memory page.
> 
> The driver callback is responsible for preventing run-away
> compression ratio failures by checking that the allocated page is
> safe to use (i.e. a compression ratio limit hasn't been crossed).
> 
> Signed-off-by: Gregory Price <gourry@gourry.net>

Hi Gregory,

Thanks for sending this, I have a lot of questions/comments below, but
from a high-level I am trying to understand the benefit of using a
compressed node for zswap rather than as a second tier.

If the memory is byte-addressable, using it as a second tier makes it
directly accessible without page faults, so the access latency is much
better than a swapped out page in zswap.

Are there some HW limitations that allow a node to be used as a backend
for zswap but not a second tier?

Or is the idea to make promotions from compressed memory to normal
memory fault-driven instead of relying on page hotness?

I also think there are some design decisions that need to be made before
we commit to this, see the comments below for more.

> ---
>  include/linux/zswap.h |   5 ++
>  mm/zswap.c            | 106 +++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 109 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 30c193a1207e..4b52fe447e7e 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -35,6 +35,8 @@ void zswap_lruvec_state_init(struct lruvec *lruvec);
>  void zswap_folio_swapin(struct folio *folio);
>  bool zswap_is_enabled(void);
>  bool zswap_never_enabled(void);
> +void zswap_add_direct_node(int nid);
> +void zswap_remove_direct_node(int nid);
>  #else
>  
>  struct zswap_lruvec_state {};
> @@ -69,6 +71,9 @@ static inline bool zswap_never_enabled(void)
>  	return true;
>  }
>  
> +static inline void zswap_add_direct_node(int nid) {}
> +static inline void zswap_remove_direct_node(int nid) {}
> +
>  #endif
>  
>  #endif /* _LINUX_ZSWAP_H */
> diff --git a/mm/zswap.c b/mm/zswap.c
> index de8858ff1521..aada588c957e 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -35,6 +35,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/list_lru.h>
>  #include <linux/zsmalloc.h>
> +#include <linux/node.h>
>  
>  #include "swap.h"
>  #include "internal.h"
> @@ -190,6 +191,7 @@ struct zswap_entry {
>  	swp_entry_t swpentry;
>  	unsigned int length;
>  	bool referenced;
> +	bool direct;
>  	struct zswap_pool *pool;
>  	unsigned long handle;
>  	struct obj_cgroup *objcg;
> @@ -199,6 +201,20 @@ struct zswap_entry {
>  static struct xarray *zswap_trees[MAX_SWAPFILES];
>  static unsigned int nr_zswap_trees[MAX_SWAPFILES];
>  
> +/* Nodemask for compressed RAM nodes used by zswap_compress_direct */
> +static nodemask_t zswap_direct_nodes = NODE_MASK_NONE;
> +
> +void zswap_add_direct_node(int nid)
> +{
> +	node_set(nid, zswap_direct_nodes);
> +}
> +
> +void zswap_remove_direct_node(int nid)
> +{
> +	if (!node_online(nid))
> +		node_clear(nid, zswap_direct_nodes);
> +}
> +
>  /* RCU-protected iteration */
>  static LIST_HEAD(zswap_pools);
>  /* protects zswap_pools list modification */
> @@ -716,7 +732,13 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
>  static void zswap_entry_free(struct zswap_entry *entry)
>  {
>  	zswap_lru_del(&zswap_list_lru, entry);
> -	zs_free(entry->pool->zs_pool, entry->handle);
> +	if (entry->direct) {
> +		struct page *page = (struct page *)entry->handle;

Would it be cleaner to add a union in zswap_entry that has entry->handle
and entry->page?

> +
> +		node_private_freed(page);
> +		__free_page(page);
> +	} else
> +		zs_free(entry->pool->zs_pool, entry->handle);
>  	zswap_pool_put(entry->pool);
>  	if (entry->objcg) {
>  		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
> @@ -849,6 +871,58 @@ static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
>  	mutex_unlock(&acomp_ctx->mutex);
>  }
>  
> +static struct page *zswap_compress_direct(struct page *src,
> +					  struct zswap_entry *entry)
> +{
> +	int nid;
> +	struct page *dst;
> +	gfp_t gfp;
> +	nodemask_t tried_nodes = NODE_MASK_NONE;
> +
> +	if (nodes_empty(zswap_direct_nodes))
> +		return NULL;
> +
> +	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE |
> +	      __GFP_THISNODE;
> +
> +	for_each_node_mask(nid, zswap_direct_nodes) {
> +		int ret;
> +
> +		/* Skip nodes we've already tried and failed */
> +		if (node_isset(nid, tried_nodes))
> +			continue;

Why do we need this? Does for_each_node_mask() iterate each node more
than once?

> +
> +		dst = __alloc_pages(gfp, 0, nid, &zswap_direct_nodes);
> +		if (!dst)
> +			continue;
> +
> +		/*
> +		 * Check with the device driver that this page is safe to use.
> +		 * If the device reports an error (e.g., compression ratio is
> +		 * too low and the page can't safely store data), free the page
> +		 * and try another node.
> +		 */
> +		ret = node_private_allocated(dst);
> +		if (ret) {
> +			__free_page(dst);
> +			node_set(nid, tried_nodes);
> +			continue;
> +		}

I think we can drop the 'found' label by moving things around, would
this be simpler?

	for_each_node_mask(..) {
		...
		ret = node_private_allocated(dst);
		if (!ret)
			break;

		__free_page(dst);
		dst = NULL;
	}

	if (!dst)
		return NULL;

	if (copy_mc_highpage(..) {
		..
	}
	return dst;
		

> +
> +		goto found;
> +	}
> +
> +	return NULL;
> +
> +found:
> +	/* If we fail to copy at this point just fallback */
> +	if (copy_mc_highpage(dst, src)) {
> +		__free_page(dst);
> +		dst = NULL;
> +	}
> +	return dst;
> +}
> +

So the CXL code tells zswap what nodes are usable, then zswap tries
getting a page from these nodes and checking them using APIs provided by
the CXL code.

Wouldn't it be a better abstraction if the nodemask lived in the CXL
code and an API was exposed to zswap just to allocate a page to copy to?
Or we can abstract the copy as well and provide an API that directly
tries to copy the page to the compressible node.

IOW move zswap_compress_direct() (probably under a different name?) and
zswap_direct_nodes into CXL code since it's not really zswap logic.

Also, I am not sure if the zswap_compress_direct() call and check would
introduce any latency, since almost all existing callers will pay for it
without benefiting.

If we move the function into CXL code, we could probably have an inline
wrapper in a header with a static key guarding it to make sure there is no
overhead for existing users.

>  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>  			   struct zswap_pool *pool)
>  {
> @@ -860,6 +934,17 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>  	gfp_t gfp;
>  	u8 *dst;
>  	bool mapped = false;
> +	struct page *zpage;
> +
> +	/* Try to shunt directly to compressed ram */
> +	zpage = zswap_compress_direct(page, entry);
> +	if (zpage) {
> +		entry->handle = (unsigned long)zpage;
> +		entry->length = PAGE_SIZE;
> +		entry->direct = true;
> +		return true;
> +	}

I don't think this works. Setting entry->length = PAGE_SIZE will cause a
few problems, off the top of my head:

1. An entire page of memory will be charged to the memcg, so swapping
out the page won't reduce the memcg usage, which will cause thrashing
(reclaim with no progress when hitting the limit).

Ideally we'd get the compressed length from HW and record it here to
charge it appropriately, but I am not sure how we actually want to
charge memory on a compressed node. Do we charge the compressed size as
normal memory? Does it need separate charging and a separate limit?

There are design discussions to be had before we commit to something.

2. The page will be incorrectly counted in
zswap_stored_incompressible_pages.

Aside from that, zswap_total_pages() will be wrong now, as it gets the
pool size from zsmalloc and these pages are not allocated from zsmalloc.
This is used when checking the pool limits and is exposed in stats.

> +	/* otherwise fallback to normal zswap */
>  
>  	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
>  	dst = acomp_ctx->buffer;
> @@ -913,6 +998,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>  	zs_obj_write(pool->zs_pool, handle, dst, dlen);
>  	entry->handle = handle;
>  	entry->length = dlen;
> +	entry->direct = false;
>  
>  unlock:
>  	if (mapped)
> @@ -936,6 +1022,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>  	int decomp_ret = 0, dlen = PAGE_SIZE;
>  	u8 *src, *obj;
>  
> +	/* compressed ram page */
> +	if (entry->direct) {
> +		struct page *src = (struct page *)entry->handle;
> +		struct folio *zfolio = page_folio(src);
> +
> +		memcpy_folio(folio, 0, zfolio, 0, PAGE_SIZE);

Why are we using memcpy_folio() here but copy_mc_highpage() on the
compression path? Are they equivalent?

> +		goto direct_done;
> +	}
> +
>  	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
>  	obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
>  
> @@ -969,6 +1064,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>  	zs_obj_read_end(pool->zs_pool, entry->handle, obj);
>  	acomp_ctx_put_unlock(acomp_ctx);
>  
> +direct_done:
>  	if (!decomp_ret && dlen == PAGE_SIZE)
>  		return true;
>  
> @@ -1483,7 +1579,13 @@ static bool zswap_store_page(struct page *page,
>  	return true;
>  
>  store_failed:
> -	zs_free(pool->zs_pool, entry->handle);
> +	if (entry->direct) {
> +		struct page *freepage = (struct page *)entry->handle;
> +
> +		node_private_freed(freepage);
> +		__free_page(freepage);
> +	} else
> +		zs_free(pool->zs_pool, entry->handle);

This code is repeated in zswap_entry_free(), we should probably wrap it
in a helper that frees the private page or the zsmalloc entry based on
entry->direct.

>  compress_failed:
>  	zswap_entry_cache_free(entry);
>  	return false;
> -- 
> 2.52.0
>
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 1 month ago
On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> 
> If the memory is byte-addressable, using it as a second tier makes it
> directly accessible without page faults, so the access latency is much
> better than a swapped out page in zswap.
> 
> Are there some HW limitations that allow a node to be used as a backend
> for zswap but not a second tier?
>

Coming back around - presumably any compressed node capable of hosting a
proper tier would be compatible with zswap, but you might have hardware
which is sufficiently slow(er than dram, faster than storage) that using
it as a proper tier may be less efficient than incurring faults.

The standard I've been using is 500ns+ cacheline fetches, but this is
somewhat arbitrary.  Even 500ns might be better than accessing multi-us
storage, but then when you add compression you might hit 600ns-1us.

This is besides the point, and apologies for the wall of text below,
feel free to skip this next section - writing out what hardware-specific
details I can share for the sake of completeness.


Some hardware details
=====================
The way every proposed piece of compressed memory hardware I have seen
would operate is essentially by lying about its capacity to the
operating system - and then providing mechanisms to determine when the
compression ratio is dropping to dangerous levels.

Hardware Says : 8GB
Hardware Has  : 1GB
Node Capacity : 8GB

The capacity numbers are static.  Even with hotplug, they must be
considered static - because the runtime compression ratio can change.

If the device fails to achieve a 4:1 compression ratio, and real usage
starts to exceed real capacity - the system will fail.
(dropped writes, poisons, machine checks, etc).

We can mitigate this with strong write-controls and querying the device
for compression ratio data prior to actually migrating a page. 

Why Zswap to start
==================
ZSwap is an existing, clean read and write control path.
   - We fault on all accesses.
   - It otherwise uses system memory under the hood (kmalloc)

I decided to use zswap as a proving ground for the concept.  While the
design in this patch is simplistic (and as you suggest below, can
clearly be improved), it demonstrates the entire concept:

on demotion:
- allocate a page from private memory
- ask the driver if it's safe to use
- if safe -> migrate
  if unsafe -> fallback

on memory access:
- "promote" to a real page
- inform the driver the page has been released (zero or discard)

As you point out, the real value in byte-accessible memory is leaving
the memory mapped, the only difference on cram.c and zswap.c in the
above pattern would be:

on demotion:
- allocate a page from private memory
- ask the driver if it's safe to use
- if safe -> migrate and remap the page as RO in page tables
  if unsafe
     -> trigger reclaim on cram node
     -> fallback to another demotion

on *write* access:
- promote to real page
- clean up the compressed page

> Or is the idea to make promotions from compressed memory to normal
> memory fault-driver instead of relying on page hotness?
> 
> I also think there are some design decisions that need to be made before
> we commit to this, see the comments below for more.
>

100% agreed, i'm absolutely not locked into a design, this just gets the
ball rolling :].

> >  /* RCU-protected iteration */
> >  static LIST_HEAD(zswap_pools);
> >  /* protects zswap_pools list modification */
> > @@ -716,7 +732,13 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> >  static void zswap_entry_free(struct zswap_entry *entry)
> >  {
> >  	zswap_lru_del(&zswap_list_lru, entry);
> > -	zs_free(entry->pool->zs_pool, entry->handle);
> > +	if (entry->direct) {
> > +		struct page *page = (struct page *)entry->handle;
> 
> Would it be cleaner to add a union in zswap_entry that has entry->handle
> and entry->page?
> 

Absolutely. Ack.

> > +		/* Skip nodes we've already tried and failed */
> > +		if (node_isset(nid, tried_nodes))
> > +			continue;
> 
> Why do we need this? Does for_each_node_mask() iterate each node more
> than once?
>

This is just me being stupid, I will clean this up.  I think I wrote
this when I was using a _next nodemask variant that can loop around and
just left this in when I got it working.

> I think we can drop the 'found' label by moving things around, would
> this be simpler?
> 	for_each_node_mask(..) {
> 		...
> 		ret = node_private_allocated(dst);
> 		if (!ret)
> 			break;
> 
> 		__free_page(dst);
> 		dst = NULL;
> 	}
> 

ack, thank you.

> So the CXL code tells zswap what nodes are usable, then zswap tries
> getting a page from these nodes and checking them using APIs provided by
> the CXL code.
> 
> Wouldn't it be a better abstraction if the nodemask lived in the CXL
> code and an API was exposed to zswap just to allocate a page to copy to?
> Or we can abstract the copy as well and provide an API that directly
> tries to copy the page to the compressible node.
>
> IOW move zswap_compress_direct() (probably under a different name?) and
> zswap_direct_nodes into CXL code since it's not really zswap logic.
> 
> Also, I am not sure if the zswap_compress_direct() call and check would
> introduce any latency, since almost all existing callers will pay for it
> without benefiting.
> 
> If we move the function into CXL code, we could probably have an inline
> wrapper in a header with a static key guarding it to make there is no
> overhead for existing users.
> 


CXL is also the wrong place to put it - cxl is just one potential
source of such a node.  We'd want that abstracted...

So this looks like a good use of memor-tiers.c - do dispatch there and
have it set static branches for various features on node registration.

struct page* mt_migrate_page_to(NODE_TYPE, src, &size);
-> on success return dst page and the size of the page on hardware
   (target_size would address your accounting notes below)

Then have the migrate function in mt do all the node_private callbacks.

So that would limit the zswap internal change to

if (zswap_node_check()) { /* static branch check */
    cpage = mt_migrate_page_to(NODE_PRIVATE_ZSWAP, src, &size);
    if (compressed_page) {
        entry->page_handle = cpage;
        entry->length = size;
        entry->direct = true;
	return true;
    }
}
/* Fallthrough */

ack. this is all great, thank you.

... snip ...
> > entry->length = size
>
> I don't think this works. Setting entry->length = PAGE_SIZE will cause a
> few problems, off the top of my head:
> 
> 1. An entire page of memory will be charged to the memcg, so swapping
> out the page won't reduce the memcg usage, which will cause thrashing
> (reclaim with no progress when hitting the limit).
>
> Ideally we'd get the compressed length from HW and record it here to
> charge it appropriately, but I am not sure how we actually want to
> charge memory on a compressed node. Do we charge the compressed size as
> normal memory? Does it need separate charging and a separate limit?
> 
> There are design discussions to be had before we commit to something.

I have a feeling tracking individual page usage would be way too
granular / inefficient, but I will consult with some folks on whether
this can be queried.  If so, we can add a way to get that info.

node_private_page_size(page) -> returns device reported page size.

or work it directly into the migrate() call like above

--- assuming there isn't a way and we have to deal with fuzzy math ---

The goal should definitely be to leave the charging statistics the same
from the perspective of services - i.e zswap should charge a whole page,
because according to the OS it just used a whole page.

What this would mean is memcg would have to work with fuzzy data.
If 1GB is charged and the compression ratio is 4:1, reclaim should
operate (by way of callback) like it has used 256MB.

I think this is the best you can do without tracking individual pages.

> 
> 2. The page will be incorrectly counted in
> zswap_stored_incompressible_pages.
> 

If we can track individual page size, then we can fix that.

If we can't, then we'd need zswap_stored_direct_pages and to do the
accounting a bit differently.  Probably want direct_pages accounting
anyway, so i might just add that.

> Aside from that, zswap_total_pages() will be wrong now, as it gets the
> pool size from zsmalloc and these pages are not allocated from zsmalloc.
> This is used when checking the pool limits and is exposed in stats.
>

This is ignorance of zswap on my part, and yeah good point.  Will look
into this accounting a little more.

> > +		memcpy_folio(folio, 0, zfolio, 0, PAGE_SIZE);
> 
> Why are we using memcpy_folio() here but copy_mc_highpage() on the
> compression path? Are they equivalent?
> 

both are in include/linux/highmem.h

I was avoiding page->folio conversions in the compression path because
I had a struct page already.

tl;dr: I'm still looking for the "right" way to do this.  I originally
had a "HACK:" tag here previously but seems I definitely dropped it
prematurely.

(I also think this code can be pushed into mt_ or callbacks)

> > +	if (entry->direct) {
> > +		struct page *freepage = (struct page *)entry->handle;
> > +
> > +		node_private_freed(freepage);
> > +		__free_page(freepage);
> > +	} else
> > +		zs_free(pool->zs_pool, entry->handle);
> 
> This code is repeated in zswap_entry_free(), we should probably wrap it
> in a helper that frees the private page or the zsmalloc entry based on
> entry->direct.
>

ack.

Thank you again for taking a look, this has been enlightening.  Good
takeaways for the rest of the N_PRIVATE design.

I think we can minimize zswap changes even further given this.

~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Yosry Ahmed 4 weeks ago
On Fri, Jan 09, 2026 at 04:40:08PM -0500, Gregory Price wrote:
> On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> > On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> > 
> > If the memory is byte-addressable, using it as a second tier makes it
> > directly accessible without page faults, so the access latency is much
> > better than a swapped out page in zswap.
> > 
> > Are there some HW limitations that allow a node to be used as a backend
> > for zswap but not a second tier?
> >
> 
> Coming back around - presumably any compressed node capable of hosting a
> proper tier would be compatible with zswap, but you might have hardware
> which is sufficiently slow(er than dram, faster than storage) that using
> it as a proper tier may be less efficient than incurring faults.
> 
> The standard I've been using is 500ns+ cacheline fetches, but this is
> somewhat arbitrary.  Even 500ns might be better than accessing multi-us
> storage, but then when you add compression you might hit 600ns-1us.
> 
> This is besides the point, and apologies for the wall of text below,
> feel free to skip this next section - writing out what hardware-specific
> details I can share for the sake of completeness.

The wall of text is very helpful :)

> 
> 
> Some hardware details
> =====================
> The way every proposed piece of compressed memory hardware I have seen
> would operate is essentially by lying about its capacity to the
> operating system - and then providing mechanisms to determine when the
> compression ratio becomes is dropping to dangerous levels.
> 
> Hardware Says : 8GB
> Hardware Has  : 1GB
> Node Capacity : 8GB
> 
> The capacity numbers are static.  Even with hotplug, they must be
> considered static - because the runtime compression ratio can change.
> 
> If the device fails to achieve a 4:1 compression ratio, and real usage
> starts to exceed real capacity - the system will fail.
> (dropped writes, poisons, machine checks, etc).
> 
> We can mitigate this with strong write-controls and querying the device
> for compression ratio data prior to actually migrating a page. 

I am a little bit confused about this. Why do we only need to query the
device before migrating the page?

Are we checking if the device has enough memory for the worst case
scenario (i.e. PAGE_SIZE)?

Or are we checking whether the device can compress this specific page
and store it? This seems like it could be
racy and there might be some throwaway work.

I guess my question is: why not just give the page to the device and get
either: successfully compressed and stored OR failed?

Another question, can the device or driver be configured such that we
reject pages that compress poorly to avoid wasting memory and BW on the
device for little savings?

> 
> Why Zswap to start
> ==================
> ZSwap is an existing, clean read and write control path control.
>    - We fault on all accesses.
>    - It otherwise uses system memory under the hood (kmalloc)
> 
> I decided to use zswap as a proving ground for the concept.  While the
> design in this patch is simplistic (and as you suggest below, can
> clearly be improved), it demonstrates the entire concept:
> 
> on demotion:
> - allocate a page from private memory
> - ask the driver if it's safe to use
> - if safe -> migrate
>   if unsafe -> fallback
> 
> on memory access:
> - "promote" to a real page
> - inform the driver the page has been released (zero or discard)
> 
> As you point out, the real value in byte-accessible memory is leaving
> the memory mapped, the only difference on cram.c and zswap.c in the
> above pattern would be:
> 
> on demotion:
> - allocate a page from private memory
> - ask the driver if it's safe to use
> - if safe -> migrate and remap the page as RO in page tables
>   if unsafe
>      -> trigger reclaim on cram node
>      -> fallback to another demotion
> 
> on *write* access:
> - promote to real page
> - clean up the compressed page

This makes sense. I am assuming the main benefit of zswap.c over cram.c
in this scenario is limiting read accesses as well.

[..]
> > So the CXL code tells zswap what nodes are usable, then zswap tries
> > getting a page from these nodes and checking them using APIs provided by
> > the CXL code.
> > 
> > Wouldn't it be a better abstraction if the nodemask lived in the CXL
> > code and an API was exposed to zswap just to allocate a page to copy to?
> > Or we can abstract the copy as well and provide an API that directly
> > tries to copy the page to the compressible node.
> >
> > IOW move zswap_compress_direct() (probably under a different name?) and
> > zswap_direct_nodes into CXL code since it's not really zswap logic.
> > 
> > Also, I am not sure if the zswap_compress_direct() call and check would
> > introduce any latency, since almost all existing callers will pay for it
> > without benefiting.
> > 
> > If we move the function into CXL code, we could probably have an inline
> > wrapper in a header with a static key guarding it to make there is no
> > overhead for existing users.
> > 
> 
> 
> CXL is also the wrong place to put it - cxl is just one potential
> source of such a node.  We'd want that abstracted...
> 
> So this looks like a good use of memor-tiers.c - do dispatch there and
> have it set static branches for various features on node registration.
> 
> struct page* mt_migrate_page_to(NODE_TYPE, src, &size);
> -> on success return dst page and the size of the page on hardware
>    (target_size would address your accounting notes below)
> 
> Then have the migrate function in mt do all the node_private callbacks.
> 
> So that would limit the zswap internal change to
> 
> if (zswap_node_check()) { /* static branch check */
>     cpage = mt_migrate_page_to(NODE_PRIVATE_ZSWAP, src, &size);
>     if (compressed_page) {
>         entry->page_handle = cpage;
>         entry->length = size;
>         entry->direct = true;
> 	return true;
>     }
> }
> /* Fallthrough */

Yeah I didn't necessarily mean CXL code, but whatever layer is
responsible for keeping track of which nodes can be used for what.

> 
> ack. this is all great, thank you.
> 
> ... snip ...
> > > entry->length = size
> >
> > I don't think this works. Setting entry->length = PAGE_SIZE will cause a
> > few problems, off the top of my head:
> > 
> > 1. An entire page of memory will be charged to the memcg, so swapping
> > out the page won't reduce the memcg usage, which will cause thrashing
> > (reclaim with no progress when hitting the limit).
> >
> > Ideally we'd get the compressed length from HW and record it here to
> > charge it appropriately, but I am not sure how we actually want to
> > charge memory on a compressed node. Do we charge the compressed size as
> > normal memory? Does it need separate charging and a separate limit?
> > 
> > There are design discussions to be had before we commit to something.
> 
> I have a feeling tracking individual page usage would be way too
> granular / inefficient, but I will consult with some folks on whether
> this can be queried.  If so, we can add a way to get that info.
> 
> node_private_page_size(page) -> returns device reported page size.
> 
> or work it directly into the migrate() call like above
> 
> --- assuming there isn't a way and we have to deal with fuzzy math ---
> 
> The goal should definitely be to leave the charging statistics the same
> from the perspective of services - i.e zswap should charge a whole page,
> because according to the OS it just used a whole page.
> 
> What this would mean is memcg would have to work with fuzzy data.
> If 1GB is charged and the compression ratio is 4:1, reclaim should
> operate (by way of callback) like it has used 256MB.
> 
> I think this is the best you can do without tracking individual pages.

This part needs more thought. Zswap cannot charge a full page because
then from the memcg perspective reclaim is not making any progress.
OTOH, as you mention, from the system perspective we just consumed a
full page, so not charging that would be inconsistent.

This is not a zswap-specific thing though, even with cram.c we have to
figure out how to charge memory on the compressed node to the memcg.
It's perhaps not as much of a problem as with zswap because we are not
dealing with reclaim not making progress.

Maybe the memcg limits need to be "enlightened" about different tiers?
We did have such discussions in the past outside the context of
compressed memory, for memory tiering in general.

Not sure if this is the right place to discuss this, but I see the memcg
folks CC'd so maybe it is :)

> 
> > 
> > 2. The page will be incorrectly counted in
> > zswap_stored_incompressible_pages.
> > 
> 
> If we can track individual page size, then we can fix that.
> 
> If we can't, then we'd need zswap_stored_direct_pages and to do the
> accounting a bit differently.  Probably want direct_pages accounting
> anyway, so i might just add that.

Yeah probably the easiest way to deal with this, assuming we keep
entry->length as PAGE_SIZE.

> 
> > Aside from that, zswap_total_pages() will be wrong now, as it gets the
> > pool size from zsmalloc and these pages are not allocated from zsmalloc.
> > This is used when checking the pool limits and is exposed in stats.
> >
> 
> This is ignorance of zswap on my part, and yeah good point.  Will look
> into this accounting a little more.

This is similar-ish to the memcg charging problem, how do we count the
compressed memory usage toward the global zswap limit? Do we keep this
limit for the top-tier? If not, do we charge full size for pages in
c.zswap or compressed size?

Do we need a separate limit for c.zswap? Probably not if the whole node
is dedicated for zswap usage.

> 
> > > +		memcpy_folio(folio, 0, zfolio, 0, PAGE_SIZE);
> > 
> > Why are we using memcpy_folio() here but copy_mc_highpage() on the
> > compression path? Are they equivalent?
> > 
> 
> both are in include/linux/highmem.h
> 
> I was avoiding page->folio conversions in the compression path because
> I had a struct page already.
> 
> tl;dr: I'm still looking for the "right" way to do this.  I originally
> had a "HACK:" tag here previously but seems I definitely dropped it
> prematurely.

Not a big deal. An RFC or HACK or whatever tag just usually helps signal
to everyone (and more importantly, to Andrew) that this should not be
merged as-is.

> 
> (I also think this code can be pushed into mt_ or callbacks)

Agreed.

> 
> > > +	if (entry->direct) {
> > > +		struct page *freepage = (struct page *)entry->handle;
> > > +
> > > +		node_private_freed(freepage);
> > > +		__free_page(freepage);
> > > +	} else
> > > +		zs_free(pool->zs_pool, entry->handle);
> > 
> > This code is repeated in zswap_entry_free(), we should probably wrap it
> > in a helper that frees the private page or the zsmalloc entry based on
> > entry->direct.
> >
> 
> ack.
> 
> Thank you again for taking a look, this has been enlightening.  Good
> takeaways for the rest of the N_PRIVATE design.

Thanks for kicking off the discussion here, an interesting problem to
solve for sure :)

> 
> I think we can minimize zswap changes even further given this.
> 
> ~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Nhat Pham 3 weeks, 6 days ago
On Tue, Jan 13, 2026 at 6:13 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Jan 09, 2026 at 04:40:08PM -0500, Gregory Price wrote:
> > On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> > > On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> > >
> > > If the memory is byte-addressable, using it as a second tier makes it
> > > directly accessible without page faults, so the access latency is much
> > > better than a swapped out page in zswap.
> > >
> > > Are there some HW limitations that allow a node to be used as a backend
> > > for zswap but not a second tier?
> > >
> >
> > Coming back around - presumably any compressed node capable of hosting a
> > proper tier would be compatible with zswap, but you might have hardware
> > which is sufficiently slow(er than dram, faster than storage) that using
> > it as a proper tier may be less efficient than incurring faults.
> >
> > The standard I've been using is 500ns+ cacheline fetches, but this is
> > somewhat arbitrary.  Even 500ns might be better than accessing multi-us
> > storage, but then when you add compression you might hit 600ns-1us.
> >
> > This is besides the point, and apologies for the wall of text below,
> > feel free to skip this next section - writing out what hardware-specific
> > details I can share for the sake of completeness.
>
> The wall of text is very helpful :)
>
> >
> >
> > Some hardware details
> > =====================
> > The way every proposed piece of compressed memory hardware I have seen
> > would operate is essentially by lying about its capacity to the
> > operating system - and then providing mechanisms to determine when the
> > compression ratio is dropping to dangerous levels.
> >
> > Hardware Says : 8GB
> > Hardware Has  : 1GB
> > Node Capacity : 8GB
> >
> > The capacity numbers are static.  Even with hotplug, they must be
> > considered static - because the runtime compression ratio can change.
> >
> > If the device fails to achieve a 4:1 compression ratio, and real usage
> > starts to exceed real capacity - the system will fail.
> > (dropped writes, poisons, machine checks, etc).
> >
> > We can mitigate this with strong write-controls and querying the device
> > for compression ratio data prior to actually migrating a page.
>
> I am a little bit confused about this. Why do we only need to query the
> device before migrating the page?
>
> Are we checking if the device has enough memory for the worst case
> scenario (i.e. PAGE_SIZE)?
>
> Or are we checking if the device can compress this specific page and
> checking if it can compress it and store it? This seems like it could be
> racy and there might be some throwaway work.
>
> I guess my question is: why not just give the page to the device and get
> either: successfully compressed and stored OR failed?
>
> Another question, can the device or driver be configured such that we
> reject pages that compress poorly to avoid wasting memory and BW on the
> device for little savings?
>
> >
> > Why Zswap to start
> > ==================
> > ZSwap is an existing, clean read and write control path control.
> >    - We fault on all accesses.
> >    - It otherwise uses system memory under the hood (kmalloc)
> >
> > I decided to use zswap as a proving ground for the concept.  While the
> > design in this patch is simplistic (and as you suggest below, can
> > clearly be improved), it demonstrates the entire concept:
> >
> > on demotion:
> > - allocate a page from private memory
> > - ask the driver if it's safe to use
> > - if safe -> migrate
> >   if unsafe -> fallback
> >
> > on memory access:
> > - "promote" to a real page
> > - inform the driver the page has been released (zero or discard)
> >
> > As you point out, the real value in byte-accessible memory is leaving
> > the memory mapped, the only difference on cram.c and zswap.c in the
> > above pattern would be:
> >
> > on demotion:
> > - allocate a page from private memory
> > - ask the driver if it's safe to use
> > - if safe -> migrate and remap the page as RO in page tables
> >   if unsafe
> >      -> trigger reclaim on cram node
> >      -> fallback to another demotion
> >
> > on *write* access:
> > - promote to real page
> > - clean up the compressed page
>
> This makes sense. I am assuming the main benefit of zswap.c over cram.c
> in this scenario is limiting read accesses as well.
>
> [..]
> > > So the CXL code tells zswap what nodes are usable, then zswap tries
> > > getting a page from these nodes and checking them using APIs provided by
> > > the CXL code.
> > >
> > > Wouldn't it be a better abstraction if the nodemask lived in the CXL
> > > code and an API was exposed to zswap just to allocate a page to copy to?
> > > Or we can abstract the copy as well and provide an API that directly
> > > tries to copy the page to the compressible node.
> > >
> > > IOW move zswap_compress_direct() (probably under a different name?) and
> > > zswap_direct_nodes into CXL code since it's not really zswap logic.
> > >
> > > Also, I am not sure if the zswap_compress_direct() call and check would
> > > introduce any latency, since almost all existing callers will pay for it
> > > without benefiting.
> > >
> > > If we move the function into CXL code, we could probably have an inline
> > > wrapper in a header with a static key guarding it to make there is no
> > > overhead for existing users.
> > >
> >
> >
> > CXL is also the wrong place to put it - cxl is just one potential
> > source of such a node.  We'd want that abstracted...
> >
> > So this looks like a good use of memory-tiers.c - do dispatch there and
> > have it set static branches for various features on node registration.
> >
> > struct page* mt_migrate_page_to(NODE_TYPE, src, &size);
> > -> on success return dst page and the size of the page on hardware
> >    (target_size would address your accounting notes below)
> >
> > Then have the migrate function in mt do all the node_private callbacks.
> >
> > So that would limit the zswap internal change to
> >
> > if (zswap_node_check()) { /* static branch check */
> >     cpage = mt_migrate_page_to(NODE_PRIVATE_ZSWAP, src, &size);
> >     if (compressed_page) {
> >         entry->page_handle = cpage;
> >         entry->length = size;
> >         entry->direct = true;
> >       return true;
> >     }
> > }
> > /* Fallthrough */
>
> Yeah I didn't necessarily mean CXL code, but whatever layer is
> responsible for keeping track of which nodes can be used for what.
>
> >
> > ack. this is all great, thank you.
> >
> > ... snip ...
> > > > entry->length = size
> > >
> > > I don't think this works. Setting entry->length = PAGE_SIZE will cause a
> > > few problems, off the top of my head:
> > >
> > > 1. An entire page of memory will be charged to the memcg, so swapping
> > > out the page won't reduce the memcg usage, which will cause thrashing
> > > (reclaim with no progress when hitting the limit).
> > >
> > > Ideally we'd get the compressed length from HW and record it here to
> > > charge it appropriately, but I am not sure how we actually want to
> > > charge memory on a compressed node. Do we charge the compressed size as
> > > normal memory? Does it need separate charging and a separate limit?
> > >
> > > There are design discussions to be had before we commit to something.
> >
> > I have a feeling tracking individual page usage would be way too
> > granular / inefficient, but I will consult with some folks on whether
> > this can be queried.  If so, we can add a way to get that info.
> >
> > node_private_page_size(page) -> returns device reported page size.
> >
> > or work it directly into the migrate() call like above
> >
> > --- assuming there isn't a way and we have to deal with fuzzy math ---
> >
> > The goal should definitely be to leave the charging statistics the same
> > from the perspective of services - i.e zswap should charge a whole page,
> > because according to the OS it just used a whole page.
> >
> > What this would mean is memcg would have to work with fuzzy data.
> > If 1GB is charged and the compression ratio is 4:1, reclaim should
> > operate (by way of callback) like it has used 256MB.
> >
> > I think this is the best you can do without tracking individual pages.
>
> This part needs more thought. Zswap cannot charge a full page because
> then from the memcg perspective reclaim is not making any progress.
> OTOH, as you mention, from the system perspective we just consumed a
> full page, so not charging that would be inconsistent.
>
> This is not a zswap-specific thing though, even with cram.c we have to
> figure out how to charge memory on the compressed node to the memcg.
> It's perhaps not as much of a problem as with zswap because we are not
> dealing with reclaim not making progress.
>
> Maybe the memcg limits need to be "enlightened" about different tiers?
> We did have such discussions in the past outside the context of
> compressed memory, for memory tiering in general.

What if we add a reclaim flag that says "hey, we are hitting actual
memory limit and need to make memory reclaim forward progress".

Then, we can have zswap skip compressed cxl backend and fall back to
real compression.

(Maybe also demotion, which only move memory from one node to another,
as well as the new cram.c stuff? This will technically also save some
wasted work, as in the status quo we will need to do a demotion pass
first, before having to reclaim memory from the bottom tier anyway?
But not sure if we want this).

>
> Not sure if this is the right place to discuss this, but I see the memcg
> folks CC'd so maybe it is :)
>
> >
> > >
> > > 2. The page will be incorrectly counted in
> > > zswap_stored_incompressible_pages.
> > >
> >
> > If we can track individual page size, then we can fix that.
> >
> > If we can't, then we'd need zswap_stored_direct_pages and to do the
> > accounting a bit differently.  Probably want direct_pages accounting
> > anyway, so i might just add that.
>
> Yeah probably the easiest way to deal with this, assuming we keep
> entry->length as PAGE_SIZE.

Yeah this one is no big deal. I like a new informative counter :)

>
> >
> > > Aside from that, zswap_total_pages() will be wrong now, as it gets the
> > > pool size from zsmalloc and these pages are not allocated from zsmalloc.
> > > This is used when checking the pool limits and is exposed in stats.
> > >
> >
> > This is ignorance of zswap on my part, and yeah good point.  Will look
> > into this accounting a little more.
>
> This is similar-ish to the memcg charging problem, how do we count the
> compressed memory usage toward the global zswap limit? Do we keep this
> limit for the top-tier? If not, do we charge full size for pages in
> c.zswap or compressed size?
>
> Do we need a separate limit for c.zswap? Probably not if the whole node
> is dedicated for zswap usage.
>
> >
> > > > +         memcpy_folio(folio, 0, zfolio, 0, PAGE_SIZE);
> > >
> > > Why are we using memcpy_folio() here but copy_mc_highpage() on the
> > > compression path? Are they equivalent?
> > >
> >
> > both are in include/linux/highmem.h
> >
> > I was avoiding page->folio conversions in the compression path because
> > I had a struct page already.
> >
> > tl;dr: I'm still looking for the "right" way to do this.  I originally
> > had a "HACK:" tag here previously but seems I definitely dropped it
> > prematurely.
>
> Not a big deal. An RFC or HACK or whatever tag just usually helps signal
> to everyone (and more importantly, to Andrew) that this should not be
> merged as-is.
>
> >
> > (I also think this code can be pushed into mt_ or callbacks)
>
> Agreed.
>
> >
> > > > + if (entry->direct) {
> > > > +         struct page *freepage = (struct page *)entry->handle;
> > > > +
> > > > +         node_private_freed(freepage);
> > > > +         __free_page(freepage);
> > > > + } else
> > > > +         zs_free(pool->zs_pool, entry->handle);
> > >
> > > This code is repeated in zswap_entry_free(), we should probably wrap it
> > > in a helper that frees the private page or the zsmalloc entry based on
> > > entry->direct.
> > >
> >
> > ack.
> >
> > Thank you again for taking a look, this has been enlightening.  Good
> > takeaways for the rest of the N_PRIVATE design.
>
> Thanks for kicking off the discussion here, an interesting problem to
> solve for sure :)
>
> >
> > I think we can minimize zswap changes even further given this.
> >
> > ~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Nhat Pham 3 weeks, 6 days ago
On Tue, Jan 13, 2026 at 4:35 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> > This part needs more thought. Zswap cannot charge a full page because
> > then from the memcg perspective reclaim is not making any progress.
> > OTOH, as you mention, from the system perspective we just consumed a
> > full page, so not charging that would be inconsistent.
> >
> > This is not a zswap-specific thing though, even with cram.c we have to
> > figure out how to charge memory on the compressed node to the memcg.
> > It's perhaps not as much of a problem as with zswap because we are not
> > dealing with reclaim not making progress.
> >
> > Maybe the memcg limits need to be "enlightened" about different tiers?
> > We did have such discussions in the past outside the context of
> > compressed memory, for memory tiering in general.
>
> What if we add a reclaim flag that says "hey, we are hitting actual
> memory limit and need to make memory reclaim forward progress".
>
> Then, we can have zswap skip compressed cxl backend and fall back to
> real compression.
>
> (Maybe also demotion, which only move memory from one node to another,
> as well as the new cram.c stuff? This will technically also save some
> wasted work, as in the status quo we will need to do a demotion pass
> first, before having to reclaim memory from the bottom tier anyway?
> But not sure if we want this).

Some more thoughts - right now demotion is kinda similar, right? We
move pages from one node (fast tier) to another (slow tier). This
frees up space in the fast tier, but it actually doesn't change the
memcg memory usage. So we are not making "forward progress" with this
either.

I suppose this is fine-ish, because reclaim subsystem can then proceed
by reclaiming from the bottom tier, which will now go to disk swap,
zswap, etc.

Can we achieve the same effect by making pages in
zswap-backed-by-compressed-cxl reclaimable:

1. Recompression - take them off compressed cxl and store them in
zswap proper (i.e in-memory compression).

2. Just enable zswap shrinker and have memory reclaim move these pages
into disk swap. This will have a much more drastic performance
implications though :)
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Yosry Ahmed 3 weeks, 4 days ago
On Tue, Jan 13, 2026 at 04:49:20PM +0900, Nhat Pham wrote:
> On Tue, Jan 13, 2026 at 4:35 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > > This part needs more thought. Zswap cannot charge a full page because
> > > then from the memcg perspective reclaim is not making any progress.
> > > OTOH, as you mention, from the system perspective we just consumed a
> > > full page, so not charging that would be inconsistent.
> > >
> > > This is not a zswap-specific thing though, even with cram.c we have to
> > > figure out how to charge memory on the compressed node to the memcg.
> > > It's perhaps not as much of a problem as with zswap because we are not
> > > dealing with reclaim not making progress.
> > >
> > > Maybe the memcg limits need to be "enlightened" about different tiers?
> > > We did have such discussions in the past outside the context of
> > > compressed memory, for memory tiering in general.
> >
> > What if we add a reclaim flag that says "hey, we are hitting actual
> > memory limit and need to make memory reclaim forward progress".
> >
> > Then, we can have zswap skip compressed cxl backend and fall back to
> > real compression.
> >
> > (Maybe also demotion, which only move memory from one node to another,
> > as well as the new cram.c stuff? This will technically also save some
> > wasted work, as in the status quo we will need to do a demotion pass
> > first, before having to reclaiom memory from the bottom tier anyway?
> > But not sure if we want this).
> 
> Some more thoughts - right now demotion is kinda similar, right? We
> move pages from one node (fast tier) to another (slow tier). This
> frees up space in the fast tier, but it actually doesn't change the
> memcg memory usage. So we are not making "forward progress" with this
> either.
> 
> I suppose this is fine-ish, because reclaim subsystem can then proceed
> by reclaiming from the bottom tier, which will now go to disk swap,
> zswap, etc.
> 
> Can we achieve the same effect by making pages in
> zswap-backed-by-compressed-cxl reclaimable:
> 
> 1. Recompression - take them off compressed cxl and store them in
> zswap proper (i.e in-memory compression).

I think the whole point of using compressed cxl with zswap is saving
memory in the top-tier, so this would be counter-productive (probably
even if we use slightly less memory in the top-tier).

> 
> 2. Just enable zswap shrinker and have memory reclaim move these pages
> into disk swap. This will have a much more drastic performance
> implications though :)

I think what you're getting at is that we can still make forward
progress after memory lands in compressed cxl. But moving memory to
compressed cxl is already forward progress that reclaim won't capture if
we charge memory as a full page. I think this is the crux of the issue.

We need to figure out how to make accounting work such that moving
memory to compressed cxl is forward progress, but make sure we don't
break the overall accounting consistency. If we only charge the actual
compressed size, then from the system perspective there is a page that
is only partially charged and the rest of it is more-or-less leaked.
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 3 weeks, 4 days ago
On Thu, Jan 15, 2026 at 05:00:04PM +0000, Yosry Ahmed wrote:
> > 
> > 2. Just enable zswap shrinker and have memory reclaim move these pages
> > into disk swap. This will have a much more drastic performance
> > implications though :)
> 
> I think what you're getting at is that we can still make forward
> progress after memory lands in compressed cxl. But moving memory to
> compressed cxl is already forward progress that reclaim won't capture if
> we charge memory as a full page. I think this is the crux of the issue.
> 
> We need to figure out how to make accounting work such that moving
> memory to compressed cxl is forward progress, but make sure we don't
> break the overall accounting consistency. If we only charge the actual
> compressed size, then from the system perspective there is a page that
> is only partially charged and the rest of it is more-or-less leaked.

Which is comically fine - because the actual capacity of the node is
functionally a lie anyway :D

~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 3 weeks, 6 days ago
On Mon, Jan 12, 2026 at 09:13:26PM +0000, Yosry Ahmed wrote:
> On Fri, Jan 09, 2026 at 04:40:08PM -0500, Gregory Price wrote:
> > On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> > > On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> > 
> > Hardware Says : 8GB
> > Hardware Has  : 1GB
> > Node Capacity : 8GB
> > 
> > The capacity numbers are static.  Even with hotplug, they must be
> > considered static - because the runtime compression ratio can change.
> > 
> > If the device fails to achieve a 4:1 compression ratio, and real usage
> > starts to exceed real capacity - the system will fail.
> > (dropped writes, poisons, machine checks, etc).
> > 
> > We can mitigate this with strong write-controls and querying the device
> > for compression ratio data prior to actually migrating a page. 
> 
> I am a little bit confused about this. Why do we only need to query the
> device before migrating the page?
>

Because there is no other interposition point at which we could.
Everything is memory semantic - it reduces to memcpy().

The actual question you're asking is "What happens if we write the page
and we're out of memory?"

The answer is:  The page gets poisoned and the write gets dropped.

That's it.  The writer does not get notified.  The next reader of that
memory will hit POISON and the failure process will happen (MCE or
SIGBUS, essentially).

> Are we checking if the device has enough memory for the worst case
> scenario (i.e. PAGE_SIZE)?
> 
> Or are we checking if the device can compress this specific page and
> checking if it can compress it and store it? This seems like it could be
> racy and there might be some throwaway work.
> 

We essentially need to capture the current compression ratio and
real-usage to determine whether there's another page available.

It is definitely racey, and the best we can do is set reasonable
real-memory-usage limits to prevent ever finding ourselves in that
scenario.  That most likely means requiring the hardware send an
interrupt when usage and/or ratio hit some threshold and setting a
"NO ALLOCATION ALLOWED" bit.

But in software we can also try to query/track this as well, but we may
not be able to query the device at allocation time (or at least that
would be horribly non-performant).

So yeah, it's racy.

> I guess my question is: why not just give the page to the device and get
> either: successfully compressed and stored OR failed?
> 

Yeah this is what I meant by this whole thing being sunk into the
callback.  I think that's reasonable.

> Another question, can the device or driver be configured such that we
> reject pages that compress poorly to avoid wasting memory and BW on the
> device for little savings?
> 

Memory semantics :]

memcpy(dst, src) -> no indication of compression ratio

> > on *write* access:
> > - promote to real page
> > - clean up the compressed page
> 
> This makes sense. I am assuming the main benefit of zswap.c over cram.c
> in this scenario is limiting read accesses as well.
>

For the first go, yeah.  A cram.c would need special page table handling
bits that will take a while to get right.  We can make use of the
hardware differently in the meantime.

> > --- assuming there isn't a way and we have to deal with fuzzy math ---
> > 
> > The goal should definitely be to leave the charging statistics the same
> > from the perspective of services - i.e zswap should charge a whole page,
> > because according to the OS it just used a whole page.
> > 
> > What this would mean is memcg would have to work with fuzzy data.
> > If 1GB is charged and the compression ratio is 4:1, reclaim should
> > operate (by way of callback) like it has used 256MB.
> > 
> > I think this is the best you can do without tracking individual pages.
> 
> This part needs more thought. Zswap cannot charge a full page because
> then from the memcg perspective reclaim is not making any progress.
> OTOH, as you mention, from the system perspective we just consumed a
> full page, so not charging that would be inconsistent.
> 
> This is not a zswap-specific thing though, even with cram.c we have to
> figure out how to charge memory on the compressed node to the memcg.
> It's perhaps not as much of a problem as with zswap because we are not
> dealing with reclaim not making progress.
>
> Maybe the memcg limits need to be "enlightened" about different tiers?
> We did have such discussions in the past outside the context of
> compressed memory, for memory tiering in general.
> 
> Not sure if this is the right place to discuss this, but I see the memcg
> folks CC'd so maybe it is :)
>

I will probably need some help to get the accounting right if I'm being
honest.  I can't say I fully understand the implications here, but
what you describe makes sense.

One of the assumptions you have in zswap is that there's some known
REAL chunk of memory X-GB, and the compression ratio dictates that you
get to cram more than X-GB of data in there.

This device flips that on its head.  It lies to the system and says
there's X-GB, and you can only actually use a fraction of it in the
worst case - and in the best case you use all of it.

So in that sense, zswap has "infinite upside" (if you're infinitely
compressible), whereas this device has "limited upside" (node capacity).

That changes how you account for things entirely, and that's why
entry->length always has to be PAGE_SIZE.  Even if the device can tell
us the real size, i'm not sure how useful that is - you still have to
charge for an entire `struct page`.

Time for a good long :think:

> > 
> > This is ignorance of zswap on my part, and yeah good point.  Will look
> > into this accounting a little more.
> 
> This is similar-ish to the memcg charging problem, how do we count the
> compressed memory usage toward the global zswap limit? Do we keep this
> limit for the top-tier? If not, do we charge full size for pages in
> c.zswap or compressed size?
> 
> Do we need a separate limit for c.zswap? Probably not if the whole node
> is dedicated for zswap usage.
>

Since we're accounting for entire `struct page` usage vs the hard cap of
(device_capacity / PAGE_SIZE) - then this might actually be the answer.

> > 
> > Thank you again for taking a look, this has been enlightening.  Good
> > takeaways for the rest of the N_PRIVATE design.
> 
> Thanks for kicking off the discussion here, an interesting problem to
> solve for sure :)
> 

One of the more interesting ones i've had in a few years :]

Cheers,
~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Yosry Ahmed 3 weeks, 4 days ago
On Mon, Jan 12, 2026 at 06:33:16PM -0500, Gregory Price wrote:
> On Mon, Jan 12, 2026 at 09:13:26PM +0000, Yosry Ahmed wrote:
> > On Fri, Jan 09, 2026 at 04:40:08PM -0500, Gregory Price wrote:
> > > On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> > > > On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> > > 
> > > Hardware Says : 8GB
> > > Hardware Has  : 1GB
> > > Node Capacity : 8GB
> > > 
> > > The capacity numbers are static.  Even with hotplug, they must be
> > > considered static - because the runtime compression ratio can change.
> > > 
> > > If the device fails to achieve a 4:1 compression ratio, and real usage
> > > starts to exceed real capacity - the system will fail.
> > > (dropped writes, poisons, machine checks, etc).
> > > 
> > > We can mitigate this with strong write-controls and querying the device
> > > for compression ratio data prior to actually migrating a page. 
> > 
> > I am a little bit confused about this. Why do we only need to query the
> > device before migrating the page?
> >
> 
> Because there is no other interposition point at which we could.
> Everything is memory semantic - it reduces to memcpy().
> 
> The actual question you're asking is "What happens if we write the page
> and we're out of memory?"
> 
> The answer is:  The page gets poisoned and the write gets dropped.
> 
> That's it.  The writer does not get notified.  The next reader of that
> memory will hit POISON and the failure process will happen (MCE or
> SIGBUS, essentially).
> 
> > Are we checking if the device has enough memory for the worst case
> > scenario (i.e. PAGE_SIZE)?
> > 
> > Or are we checking if the device can compress this specific page and
> > checking if it can compress it and store it? This seems like it could be
> > racy and there might be some throwaway work.
> > 
> 
> We essentially need to capture the current compression ratio and
> real-usage to determine whether there's another page available.
> 
> It is definitely racy, and the best we can do is set reasonable
> real-memory-usage limits to prevent ever finding ourselves in that
> scenario.  That most likely means requiring the hardware to send an
> interrupt when usage and/or ratio hit some threshold and setting a
> "NO ALLOCATION ALLOWED" bit.
> 
> But in software we can also try to query/track this as well, but we may
> not be able to query the device at allocation time (or at least that
> would be horribly non-performant).
> 
> So yeah, it's racy.

Yeah I think we should track it in software if possible to completely
avoid the poison scenario you describe above. Relying on setting a
reasonable limit and a certain compression ratio doesn't sound too
comforting.

> 
> > I guess my question is: why not just give the page to the device and get
> > either: successfully compressed and stored OR failed?
> > 
> 
> Yeah this is what I meant by this whole thing being sunk into the
> callback.  I think that's reasonable.
> 
> > Another question, can the device or driver be configured such that we
> > reject pages that compress poorly to avoid wasting memory and BW on the
> > device for little savings?
> > 
> 
> Memory semantics :]
> 
> memcpy(dst, src) -> no indication of compression ratio

Right..

> 
> > > on *write* access:
> > > - promote to real page
> > > - clean up the compressed page
> > 
> > This makes sense. I am assuming the main benefit of zswap.c over cram.c
> > in this scenario is limiting read accesses as well.
> >
> 
> For the first go, yeah.  A cram.c would need special page table handling
> bits that will take a while to get right.  We can make use of the
> hardware differently in the meantime.

Makes sense.

I just want to point out that using compressed memory with zswap doesn't
buy us much in terms of reclaim latency, so the main goal here is just
saving memory on the top tier, not improving performance, right?

> 
> > > --- assuming there isn't a way and we have to deal with fuzzy math ---
> > > 
> > > The goal should definitely be to leave the charging statistics the same
> > > from the perspective of services - i.e zswap should charge a whole page,
> > > because according to the OS it just used a whole page.
> > > 
> > > What this would mean is memcg would have to work with fuzzy data.
> > > If 1GB is charged and the compression ratio is 4:1, reclaim should
> > > operate (by way of callback) like it has used 256MB.
> > > 
> > > I think this is the best you can do without tracking individual pages.
> > 
> > This part needs more thought. Zswap cannot charge a full page because
> > then from the memcg perspective reclaim is not making any progress.
> > OTOH, as you mention, from the system perspective we just consumed a
> > full page, so not charging that would be inconsistent.
> > 
> > This is not a zswap-specific thing though, even with cram.c we have to
> > figure out how to charge memory on the compressed node to the memcg.
> > It's perhaps not as much of a problem as with zswap because we are not
> > dealing with reclaim not making progress.
> >
> > Maybe the memcg limits need to be "enlightened" about different tiers?
> > We did have such discussions in the past outside the context of
> > compressed memory, for memory tiering in general.
> > 
> > Not sure if this is the right place to discuss this, but I see the memcg
> > folks CC'd so maybe it is :)
> >
> 
> I will probably need some help to get the accounting right if I'm being
> honest.  I can't say I fully understand the implications here, but
> what you describe makes sense.
> 
> One of the assumptions you have in zswap is that there's some known
> REAL chunk of memory X-GB, and the compression ratio dictates that you
> get to cram more than X-GB of data in there.
> 
> This device flips that on its head.  It lies to the system and says
> there's X-GB, and you can only actually use a fraction of it in the
> worst case - and in the best case you use all of it.
> 
> So in that sense, zswap has "infinite upside" (if you're infinitely
> compressible), whereas this device has "limited upside" (node capacity).
> 
> That changes how you account for things entirely, and that's why
> entry->length always has to be PAGE_SIZE.  Even if the device can tell
> us the real size, i'm not sure how useful that is - you still have to
> charge for an entire `struct page`.
> 
> Time for a good long :think:

Yeah it's counter-intuitive. Zswap needs to charge less than PAGE_SIZE
so that memcg tracking continues to make sense with reclaim (i.e. usage
goes down), but if zswap consumed a full page from the system
perspective, the math won't math.

Separate limits *could* be the answer, but it's harder to configure and
existing configuration won't "just work" with compressed memory.

> 
> > > 
> > > This is ignorance of zswap on my part, and yeah good point.  Will look
> > > into this accounting a little more.
> > 
> > This is similar-ish to the memcg charging problem, how do we count the
> > compressed memory usage toward the global zswap limit? Do we keep this
> > limit for the top-tier? If not, do we charge full size for pages in
> > c.zswap or compressed size?
> > 
> > Do we need a separate limit for c.zswap? Probably not if the whole node
> > is dedicated for zswap usage.
> >
> 
> Since we're accounting for entire `struct page` usage vs the hard cap of
> (device_capacity / PAGE_SIZE) - then this might actually be the answer.
> 
> > > 
> > > Thank you again for taking a look, this has been enlightening.  Good
> > > takeaways for the rest of the N_PRIVATE design.
> > 
> > Thanks for kicking off the discussion here, an interesting problem to
> > solve for sure :)
> > 
> 
> One of the more interesting ones i've had in a few years :]
> 
> Cheers,
> ~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 3 weeks, 4 days ago
> > For the first go, yeah.  A cram.c would need special page table handling
> > bits that will take a while to get right.  We can make use of the
> > hardware differently in the meantime.
> 
> Makes sense.
> 
> I just want to point out that using compressed memory with zswap doesn't
> buy us much in terms of reclaim latency, so the main goal here is just
> saving memory on the top tier, not improving performance, right?
>

Yeah first goal is to just demonstrate such an accelerator can even work
as a top-tier memory saving mechanism.  But hard to say whether reclaim
latency will be affected appreciably - won't know until we get there :]

I'm totally prepared for this to be a science experiment that gets
thrown away.

> > 
> > I will probably need some help to get the accounting right if I'm being
> > honest.  I can't say I fully understanding the implications here, but
> > what you describe makes sense.
> > 
> 
> Yeah it's counter-intuitive. Zswap needs to charge less than PAGE_SIZE
> so that memcg tracking continues to make sense with reclaim (i.e. usage
> goes down), but if zswap consumed a full page from the system
> perspective, the math won't math.
> 
> Separate limits *could* be the answer, but it's harder to configure and
> existing configuration won't "just work" with compressed memory.
>

I think you are right. I am also inquiring whether individual page
compression data is retrievable.  If so, then this actually should be a
trivial integration.

If not then this is probably ending up on the cutting room floor and
going straight to a full cram.c implementation.

~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Yosry Ahmed 3 weeks, 3 days ago
On Thu, Jan 15, 2026 at 12:26:41PM -0500, Gregory Price wrote:
> > > For the first go, yeah.  A cram.c would need special page table handling
> > > bits that will take a while to get right.  We can make use of the
> > > hardware differently in the meantime.
> > 
> > Makes sense.
> > 
> > I just want to point out that using compressed memory with zswap doesn't
> > buy us much in terms of reclaim latency, so the main goal here is just
> > saving memory on the top tier, not improving performance, right?
> >
> 
> Yeah first goal is to just demonstrate such an accelerator can even work
> as a top-tier memory saving mechanism.  But hard to say whether reclaim
> latency will be affected appreciably - won't know until we get there :]
> 
> I'm totally prepared for this to be a science experiment that gets
> thrown away.

If that's the case I would put the zswap stuff under an experimental
config option that's not enabled by default, so that we can rip it out
later if needed. 

> 
> > > 
> > > I will probably need some help to get the accounting right if I'm being
> > > honest.  I can't say I fully understanding the implications here, but
> > > what you describe makes sense.
> > > 
> > 
> > Yeah it's counter-intuitive. Zswap needs to charge less than PAGE_SIZE
> > so that memcg tracking continues to make sense with reclaim (i.e. usage
> > goes down), but if zswap consumed a full page from the system
> > perspective, the math won't math.
> > 
> > Separate limits *could* be the answer, but it's harder to configure and
> > existing configuration won't "just work" with compressed memory.
> >
> 
> I think you are right. I am also inquiring whether individual page
> compression data is retrievable.  If so, then this actually should be a
> trivial integration.
> 
> If not then this is probably ending up on the cutting room floor and
> going straight to a full cram.c implementation.
> 
> ~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Jonathan Cameron 3 weeks, 6 days ago
...

> > Are we checking if the device has enough memory for the worst case
> > scenario (i.e. PAGE_SIZE)?
> > 
> > Or are we checking if the device can compress this specific page and
> > checking if it can compress it and store it? This seems like it could be
> > racy and there might be some throwaway work.
> >   
> 
> We essentially need to capture the current compression ratio and
> real-usage to determine whether there's another page available.
> 
> It is definitely racy, and the best we can do is set reasonable
> real-memory-usage limits to prevent ever finding ourselves in that
> scenario.  That most likely means requiring the hardware to send an
> interrupt when usage and/or ratio hit some threshold and setting a
> "NO ALLOCATION ALLOWED" bit.

I believe we could do some dance to close the race.

What we need is some upper bounds on usage at any point in time,
if that estimate is too high stop allocating until we get a better bound.

Can do that by starting an allocation counter before reading capacity.
As long as it only counts allocations (and not frees) then it will
always be an upper bound. 

Any frees will be dealt with when we reread current allocation (having
started a new counter of allocations just before that). Once we have
that new upper bound, can ignore the previous one as being less accurate.

If we see the interrupt, all bets are off. That's a fatal error in capacity
tracking.

> 
> But in software we can also try to query/track this as well, but we may
> not be able to query the device at allocation time (or at least that
> would be horribly non-performant).
> 
> So yeah, it's racy.

 
> > > 
> > > Thank you again for taking a look, this has been enlightening.  Good
> > > takeaways for the rest of the N_PRIVATE design.  
> > 
> > Thanks for kicking off the discussion here, an interesting problem to
> > solve for sure :)
> >   
> 
> One of the more interesting ones i've had in a few years :]

Agreed. Compressed memory is fun ;)
> 
> Cheers,
> ~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 3 weeks, 6 days ago
On Mon, Jan 12, 2026 at 06:33:16PM -0500, Gregory Price wrote:
> One of the assumptions you have in zswap is that there's some known
> REAL chunk of memory X-GB, and the compression ratio dictates that you
> get to cram more than X-GB of data in there.
> 
> This device flips that on its head.  It lies to the system and says
> there's X-GB, and you can only actually use a fraction of it in the
> worst case - and in the best case you use all of it.
> 
> So in that sense, zswap has "infinite upside" (if you're infinitely
> compressible), whereas this device has "limited upside" (node capacity).
> 
> That changes how you account for things entirely, and that's why
> entry->length always has to be PAGE_SIZE.  Even if the device can tell
> us the real size, i'm not sure how useful that is - you still have to
> charge for an entire `struct page`.
> 
> Time for a good long :think:
> 


hmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm

now that i have written this out, I wonder if the answer here is for the
zswap_node controller (cxl driver or whatever) to detect high memory
usage and online a new memory block if there is additional capacity
available.

This would look like the swap file increasing in size dynamically,
which is *also* problematic, but it's at least in the same ballpark.

From a CXL perspective, this would look like a dynamic capacity device.

And the catch would be that we would need the opposite interface:

  zswap.c or cram.c would need an explicit evict interface to allow
  capacity to be offlined if the device needs to shrink the "fake"
  capacity in response to shrinking compression ratios.

Time for a much, much longer :think:

~Gregory
Re: [RFC PATCH v3 7/8] mm/zswap: compressed ram direct integration
Posted by Gregory Price 1 month ago
On Fri, Jan 09, 2026 at 04:00:00PM +0000, Yosry Ahmed wrote:
> On Thu, Jan 08, 2026 at 03:37:54PM -0500, Gregory Price wrote:
> > If a private zswap-node is available, skip the entire software
> > compression process and memcpy directly to a compressed memory
> > folio, and store the newly allocated compressed memory page as
> > the zswap entry->handle.
> > 
> > On decompress we do the opposite: copy directly from the stored
> > page to the destination, and free the compressed memory page.
> > 
> > The driver callback is responsible for preventing run-away
> > compression ratio failures by checking that the allocated page is
> > safe to use (i.e. a compression ratio limit hasn't been crossed).
> > 
> > Signed-off-by: Gregory Price <gourry@gourry.net>
> 
> Hi Gregory,
> 
> Thanks for sending this, I have a lot of questions/comments below, but
> from a high-level I am trying to understand the benefit of using a
> compressed node for zswap rather than as a second tier.
>

Don't think too hard about it - this is a stepping stone until we figure
out the cram.c usage pattern.

Unrestricted write access to compress-ram is a reliability issue, so:
  - zswap restricts both read and write.
  - a cram.c service would restrict write but leave pages mapped read

Have to step away, will come back to the rest of the feedback a bit later,
thank you for the review.

~Gregory