mm: ghost swapfile support for zswap

[PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

The current zswap requires a backing swapfile. The swap slot used
by zswap is not able to be used by the swapfile. That waste swapfile
space.

The ghost swapfile is a swapfile that only contains the swapfile header
for zswap. The swapfile header indicate the size of the swapfile. There
is no swap data section in the ghost swapfile, therefore, no waste of
swapfile space.  As such, any write to a ghost swapfile will fail. To
prevents accidental read or write of ghost swapfile, bdev of
swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
flag because there is no rotation disk access when using zswap.

The zswap write back has been disabled if all swapfiles in the system
are ghost swap files.

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 include/linux/swap.h |  2 ++
 mm/page_io.c         | 18 +++++++++++++++---
 mm/swap.h            |  2 +-
 mm/swap_state.c      |  7 +++++++
 mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
 mm/zswap.c           | 17 +++++++++++------
 6 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -216,6 +216,7 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
+	SWP_GHOST	= (1 << 13),	/* not backed by anything */
 					/* add others here before... */
 };
 
@@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
 void free_pages_and_swap_cache(struct encoded_page **, int);
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
+extern atomic_t nr_real_swapfiles;
 extern long total_swap_pages;
 extern atomic_t nr_rotate_swap;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
 
-	__swap_writepage(folio, swap_plug);
-	return 0;
+	return __swap_writepage(folio, swap_plug);
 out_unlock:
 	folio_unlock(folio);
 	return ret;
@@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+	if (sis->flags & SWP_GHOST) {
+		/* Prevent the page from getting reclaimed. */
+		folio_set_dirty(folio);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
 	/*
 	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
 	 * but that will never affect SWP_FS_OPS, so the data_race
@@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 		swap_writepage_bdev_sync(folio, sis);
 	else
 		swap_writepage_bdev_async(folio, sis);
+	return 0;
 }
 
 void swap_write_unplug(struct swap_iocb *sio)
@@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	if (zswap_load(folio) != -ENOENT)
 		goto finish;
 
+	if (unlikely(sis->flags & SWP_GHOST)) {
+		folio_unlock(folio);
+		goto finish;
+	}
+
 	/* We have to read from slower devices. Increase zswap protection. */
 	zswap_folio_swapin(folio);
 
diff --git a/mm/swap.h b/mm/swap.h
index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 }
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swap_space __ro_after_init;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
 
+	/*
+	 * The entry may have been freed by another task. Avoid swap_info_get()
+	 * which will print error message if the race happens.
+	 */
+	if (si->flags & SWP_GHOST)
+		goto skip;
+
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 atomic_long_t nr_swap_pages;
+atomic_t nr_real_swapfiles;
 /*
  * Some modules use swappable objects and may try to swap them out under
  * memory pressure (via the shrinker). Before doing so, they may wish to
@@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 			goto skip;
 	}
 
+	if (!(si->flags & SWP_GHOST))
+		atomic_sub(1, &nr_real_swapfiles);
 	plist_del(&si->avail_list, &swap_avail_head);
 
 skip:
@@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 	}
 
 	plist_add(&si->avail_list, &swap_avail_head);
+	if (!(si->flags & SWP_GHOST))
+		atomic_add(1, &nr_real_swapfiles);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	struct inode *inode = mapping->host;
 	int ret;
 
+	if (sis->flags & SWP_GHOST) {
+		*span = 0;
+		return 0;
+	}
+
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
@@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
 
-	if (!p->bdev || !bdev_nonrot(p->bdev))
+	if (!(p->flags & SWP_GHOST) &&
+	    (!p->bdev || !bdev_nonrot(p->bdev)))
 		atomic_dec(&nr_rotate_swap);
 
 	mutex_lock(&swapon_mutex);
@@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
 	mutex_unlock(&swapon_mutex);
 }
 
+static const char *swap_type_str(struct swap_info_struct *si)
+{
+	struct file *file = si->swap_file;
+
+	if (si->flags & SWP_GHOST)
+		return "ghost\t";
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		return "partition";
+
+	return "file\t";
+}
+
 static int swap_show(struct seq_file *swap, void *v)
 {
 	struct swap_info_struct *si = v;
@@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	len = seq_file_path(swap, file, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
 			len < 40 ? 40 - len : 1, " ",
-			S_ISBLK(file_inode(file)->i_mode) ?
-				"partition" : "file\t",
+			swap_type_str(si),
 			bytes, bytes < 10000000 ? "\t" : "",
 			inuse, inuse < 10000000 ? "\t" : "",
 			si->prio);
@@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
 	return 0;
 }
 
-
 /*
  * Find out how many pages are allowed for a single swap device. There
  * are two limiting factors:
@@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 	unsigned long maxpages;
 	unsigned long swapfilepages;
 	unsigned long last_page;
+	loff_t size;
 
 	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
 		pr_err("Unable to find swap-space signature\n");
@@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 
 	if (!maxpages)
 		return 0;
-	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+
+	size = i_size_read(inode);
+	if (size == PAGE_SIZE) {
+		/* Ghost swapfile */
+		si->bdev = NULL;
+		si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
+		return maxpages;
+	}
+
+	swapfilepages = size >> PAGE_SHIFT;
 	if (swapfilepages && maxpages > swapfilepages) {
 		pr_warn("Swap area shorter than signature indicates\n");
 		return 0;
diff --git a/mm/zswap.c b/mm/zswap.c
index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct folio *folio;
 	struct mempolicy *mpol;
 	bool folio_was_allocated;
-	struct swap_info_struct *si;
+	struct swap_info_struct *si = get_swap_device(swpentry);
 	int ret = 0;
 
-	/* try to allocate swap cache folio */
-	si = get_swap_device(swpentry);
 	if (!si)
-		return -EEXIST;
+		return -ENOENT;
+
+	if (si->flags & SWP_GHOST) {
+		put_swap_device(si);
+		return -EINVAL;
+	}
 
+	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
 	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
 			NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
@@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	folio_set_reclaim(folio);
 
 	/* start writeback */
-	__swap_writepage(folio, NULL);
+	ret = __swap_writepage(folio, NULL);
+	WARN_ON_ONCE(ret);
 
 out:
 	if (ret && ret != -EEXIST) {
@@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
 	zswap_pool_put(pool);
 put_objcg:
 	obj_cgroup_put(objcg);
-	if (!ret && zswap_pool_reached_full)
+	if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
 		queue_work(shrink_wq, &zswap_shrink_work);
 check_old:
 	/*

---
base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
change-id: 20251121-ghost-56e3948a7a17

Best regards,
-- 
Chris Li <chrisl@kernel.org>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Yosry Ahmed 2 months ago

On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
> 
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
> 
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.
> 
> Signed-off-by: Chris Li <chrisl@kernel.org>

I did not know which subthread to reply to at this point, so I am just
replying to the main thread. I have been trying to stay out of this for
various reasons, but I was mentioned a few times and I also think this
is getting out of hand tbh.

First of all, I want to clarify that I am not "representing" any entity
here, I am speaking as an upstream zswap maintainer. Obviously I have
Google's interests in mind, but I am not representing Google here.

Second, Chris keeps bringing up that the community picked and/or
strongly favored the swap table approach over virtual swap back in 2023.
I just want to make it absolutely clear that this was NOT my read of the
room, and I do not think that the community really made a decision or
favored any approach back then.

Third, Chris, please stop trying to force this into a company vs company
situation. You keep mentioning personal attacks, but you are making this
personal more than anyone in this thread by taking this approach.

Now with all of that out of the way, I want to try to salvage the
technical discussion here. Taking several steps back, and
oversimplifying a bit: Chris mentioned having a frontend and backend and
an optional redirection when a page is moved between swap backends. This
is conceptually the same as the virtual swap proposal.

I think the key difference here is:
- In Chris's proposal, we start with a swap entry that represents a swap
  slot in swapfile A. If we do writeback (or swap tiering), we create
  another swap entry in swapfile B, and have the first swap entry point
  to it instead of the slot in swapfile A. If we want to reuse the swap
  slot in swapfile A, we create a new swap entry that points to it.

  So we start with a swap entry that directly maps to a swap slot, and
  optionally put a redirection there to point to another swap slot for
  writeback/tiering.

  Everything is a swapfile, even zswap will need to be represented by a
  separate (ghost) swapfile.

- In the virtual swap proposal, swap entries are in a completely
  different space than swap slots. A swap entry points to an arbitrary
  swap slot (or zswap entry) from the beginning, and writeback (or
  tiering) does not change that, it only changes what is being pointed
  to.

Regarding memory overhead (assuming x86_64), Chris's proposal has 8
bytes per entry in the swap table that is used to hold both the swap
count as well as the swapcache or shadow entry. Nhat's RFC for virtual
swap had 48 bytes of overhead, but that's a PoC of a specific
implementaiton.

Disregarding any specific implementation, any space optimizations that
can be applied to the swap table (e.g. combining swap count and
swapcache in an 8 byte field) can also be applied to virtual swap. The
only *real* difference is that with virtual swap we need to store the
swap slot (or zswap entry), while for the current swap table proposal it
is implied by the index of the entry. That's an additional 8 bytes.

So I think a fully optimized implementation of virtual swap could end up
with an overhead of 16 bytes per-entry. Everything else (locks,
rcu_head, etc) can probably be optimized away by using similar
optimizations as the swap table (e.g. do locking and alloc/freeing in
batches). In fact, I think we can use the swap table as the allocator in
the virtual swap space, reusing all the locking and allocation
optimizations. The difference would be that the swap table is indexed by
the virtual swap ID rather than the swap slot index.

Another important aspect here, in the simple case the swap table does
have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
the difference isn't large to begin with, I don't think it's always the
case. I think this is only true for the simple case of having a swapped
out page on a disk swapfile or in a zswap (ghost) swapfile.

Once a page is written back from zswap to disk swapfile, in the swap
table approach we'll have two swap table entries. One in the ghost
swapfile (with a redirection), and one in the disk swapfile. That's 16
bytes, equal to the overhead of virtual swap.

Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with
tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with
3 swap table entries for a single swapped out page. That's 24 bytes. So
the memory overhead is not really constant, it scales with the number of
tiers (as opposed to virtual swap).

Another scenario is where we have SSD and HDD swapfiles with tiering. If
a page starts in SSD and goes to HDD, we'll have to swap table entries
for it (as above). The SSD entry would be wasted (has a redirection),
but Chris mentioned that we can fix this by allocating another frontend
cluster that points at the same SSD slot. How does this fit in the
8-byte swap table entry tho? The 8-bytes can only hold the swapcache or
shadow (and swapcount), but not the swap slot. For the current
implementation, the slot is implied by the swap table index, but if we
have separate front end swap tables, then we'll also need to store the
actual slot.

We can workaround this by having different types of clusters and swap
tables, where "virtual" clusters have 16 bytes instead of 8 bytes per
entry for that, sure.. but at that point we're at significantly more
complexity to end up where virtual swap would have put us.

Chris, Johannes, Nhat -- please correct me if I am wrong here or if I
missed something. I think the current swap table work by Kairui is
great, and we can reuse it for virtual swap (as I mentioned above). But
I don't think forcing everything to use a swapfile and extending swap
tables to support indirections and frontend/backend split is the way to
go (for the reasons described above).

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months ago

On Wed, Dec 3, 2025 at 12:37 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> I did not know which subthread to reply to at this point, so I am just
> replying to the main thread. I have been trying to stay out of this for
> various reasons, but I was mentioned a few times and I also think this
> is getting out of hand tbh.

Thanks for saving the discussion.

>
> First of all, I want to clarify that I am not "representing" any entity
> here, I am speaking as an upstream zswap maintainer. Obviously I have
> Google's interests in mind, but I am not representing Google here.

Ack, same here.

> Second, Chris keeps bringing up that the community picked and/or
> strongly favored the swap table approach over virtual swap back in 2023.
> I just want to make it absolutely clear that this was NOT my read of the
> room, and I do not think that the community really made a decision or
> favored any approach back then.

OK. Let's move on from that to our current discussion.

> Third, Chris, please stop trying to force this into a company vs company
> situation. You keep mentioning personal attacks, but you are making this
> personal more than anyone in this thread by taking this approach.

Let me clarify, it is absolutely not my intention to make it company
vs company, that does not fit the description either. Please accept my
apology for that. My original intention is that it is a group of
people sharing the same idea. More like I am against a whole group
(team VS). It is not about which company at all. Round robin N -> 1
intense arguing put me in an uncomfortable situation, feeling
excluded.

On one hand I wish there was someone representing  the group as the
main speaker, that would make the discussion feel more equal, more
inclusive. On the other hand, any perspective is important, it is hard
to require the voice to route through the main speaker. It is hard to
execute in practice. So I give up suggesting that.  I am open for
suggestions on how to make the discussion more inclusive for newcomers
to the existing established group.

> Now with all of that out of the way, I want to try to salvage the
> technical discussion here. Taking several steps back, and

Thank you for driving the discussion back to the technical side. I
really appreciate it.

> oversimplifying a bit: Chris mentioned having a frontend and backend and
> an optional redirection when a page is moved between swap backends. This
> is conceptually the same as the virtual swap proposal.

In my perspective, it is not the same as a virtual swap proposal.
There is some overlap, they both can do redirection.

But they originally aim to solve two different problems. One of the
important goals of the swap table is  to allow continuing mTHP swap
entry when all the space left is not continues. For the rest of the
discusion we call it "continuous mTHP allocator". It allocate
continuous swap entry out of non continues file location.

Let's say you have a 1G swapfile, all full not available slots.
1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces
add up to 16K.
2) Now allocate one mTHP order 2, 16K in size.
Previous allocator can not be satisfied with this requirement. Because
the 4 empty slots are not contiguous.
Here the redirection and growth of the front swap entry comes in, it
is all part of the consideration all alone, not an afterthought.
This following step will allow allocating 16K continuous swap entries
out of offset [1,3,5,7]
3) We grow the front end part of the swapfile, effectively bump up the
max size and add a new cluster of order 2, with a swap table.
That is where the front end of the swap and back end file store comes in.

BTW, Please don't accuse me copy cat the name "virtual swapfile". I
introduce it here 1/8/2025 before Nhat does:
https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
==============quote==============
I think we need to have a separation of the swap cache and the backing
of IO of the swap file. I call it the "virtual swapfile".
It is virtual in two aspect:
1) There is an up front size at swap on, but no up front allocation of
the vmalloc array. The array grows as needed.
2) There is a virtual to physical swap entry mapping. The cost is 4
bytes per swap entry. But it will solve a lot of problems all
together.
==============quote ends =========
Side story:
I want to pass the "virtual swapfile"  for Kairui to propose as LSF
topic. Coincidentally  Nhat proposes the virtual swap as a LSF topic
at 1/16/2025, a few days after I mention "virtual swapfile" in the lsf
topic related discussion. It is right before Kairui purpose "virtual
swapfile". Kairui renamed our version as "swap table". That is the
history behind the name of "swap table".
https://lore.kernel.org/linux-mm/20250116092254.204549-1-nphamcs@gmail.com/

I am sure Nhat did not see that email and come up with it
independently, coincidentally. I just want to establish that I have
prior art introducing the name "virtual swapfile" before Nhat's LSF
"virtual swap" topic. After all, it is just a name. I am just as happy
using "swap table".

To avoid confuse the reader I will call my version of "virtual swap"
the "front end".

The front end owns the cluster and swap table (swap cache). 8 bytes.
The back end only contain file position pointer. 4 bytes.

4) The back end will need different allocate because the allocating
assumption is different, it does not have alignment requirement. It
just need to track which block location is available.
It will need to have a back end specific allocator.  It only manage
the location of the swapfile cannot allocate from fronted. e.g.
redirection entry create a hole. or the new cluster added from step 3.

5) the backend location pointer is optional of the cluster. For the
cluster new allocated from step, It must have location pointer,
because its offset is out of the backing file range.
That is a 4 byte just like a swap entry.
This backend location pointer can be used by solution like VS as well.
That is part of the consideration as well, so not a after thought.
The allocator mention here is more like a file system design rather
than pure memory location, because it need to consider block location
for combining block level IO.

So the mTHP allocator can do swapfile location redirection. But that
is a side benefit of a different design goal (mTHP allocation). This
physical location pointer description match my 2024 LSF pony talk
slide. I just did not put text in the slide there. So it is not some
thing after thought, it pre-dates back to 2024 talks.

> I think the key difference here is:
> - In Chris's proposal, we start with a swap entry that represents a swap
>   slot in swapfile A. If we do writeback (or swap tiering), we create
>   another swap entry in swapfile B, and have the first swap entry point

Correction. Instead of swapfile B, Backend location in swapfile B. in
step 5). It only 4 byte. The back end does not have swap cache. The
swap cache belong to front end A (8 bytes).

>   to it instead of the slot in swapfile A. If we want to reuse the swap
>   slot in swapfile A, we create a new swap entry that points to it.
>
>   So we start with a swap entry that directly maps to a swap slot, and

Again, in my description swap slot A has a file backend location
pointer points to swapfile B.
It is only the bottom half the swap slot B, not the full swap slot. It
does not have 8 byte swap entry overhead of B.

>   optionally put a redirection there to point to another swap slot for
>   writeback/tiering.

Point to another swapfile location backend, not swap entry.(4 bytes)

>   Everything is a swapfile, even zswap will need to be represented by a
>   separate (ghost) swapfile.

Allow ghost swapfile. I wouldn't go as far saying ban the current
zswap writeback, that part is TBD. My description is enable memory
swap tiers without actual physical file backing. Enable ghost
swapfile.

>
> - In the virtual swap proposal, swap entries are in a completely
>   different space than swap slots. A swap entry points to an arbitrary
>   swap slot (or zswap entry) from the beginning, and writeback (or
>   tiering) does not change that, it only changes what is being pointed
>   to.
>
> Regarding memory overhead (assuming x86_64), Chris's proposal has 8
> bytes per entry in the swap table that is used to hold both the swap
> count as well as the swapcache or shadow entry. Nhat's RFC for virtual
Ack

> swap had 48 bytes of overhead, but that's a PoC of a specific
> implementaiton.

Ack.

> Disregarding any specific implementation, any space optimizations that
> can be applied to the swap table (e.g. combining swap count and
> swapcache in an 8 byte field) can also be applied to virtual swap. The
> only *real* difference is that with virtual swap we need to store the
> swap slot (or zswap entry), while for the current swap table proposal it
> is implied by the index of the entry. That's an additional 8 bytes.

No, the VS has a smaller design scope. VS does not enable "continous
mTHP allocation" . At least that is not mention in any previous VS
material.

> So I think a fully optimized implementation of virtual swap could end up
> with an overhead of 16 bytes per-entry. Everything else (locks,
> rcu_head, etc) can probably be optimized away by using similar
> optimizations as the swap table (e.g. do locking and alloc/freeing in

With the continues mTHP allocator mention above, it already has the
all things VS needed.
I am not sure we still need VS if we have "continues mTHP allocator",
that is TBD.

Yes, VS can reuse the physical location pointer by "continues mTHP allocator".

The overhead is for above swap table of redirection is 12 bytes not 16 bytes.

> batches). In fact, I think we can use the swap table as the allocator in
> the virtual swap space, reusing all the locking and allocation

That is my feel all alone. Let swap table manage that.

> optimizations. The difference would be that the swap table is indexed by
> the virtual swap ID rather than the swap slot index.

 In the "continous mTHP allocator" it is just physical location pointer,

> Another important aspect here, in the simple case the swap table does
> have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
> the difference isn't large to begin with, I don't think it's always the
> case. I think this is only true for the simple case of having a swapped
> out page on a disk swapfile or in a zswap (ghost) swapfile.

Please redo your evaluation after reading the above "continuous mTHP alloctor".

> Once a page is written back from zswap to disk swapfile, in the swap
> table approach we'll have two swap table entries. One in the ghost

One one entry with back end location pointer. (12 byte)

> swapfile (with a redirection), and one in the disk swapfile. That's 16
> bytes, equal to the overhead of virtual swap.

Again 12 bytes using "continues mTHP allocator" frame work.

> Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with
> tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with
> 3 swap table entries for a single swapped out page. That's 24 bytes. So
> the memory overhead is not really constant, it scales with the number of
> tiers (as opposed to virtual swap).

Nope, Only one front swap entry remain the same, every time it write
to a different tier, it only update the back end physical location
pointer.
It always points to the finial physical location. Only 12 bytes total.

You are paying 24 bytes because you don't have the front end vs back end split.
Your redirection includes the front end 8 byte as well. Because you
include the front end, now you need to do the relay forward.
That is the benefit to have front end and back end split of the swap
file. Make it more like a file system design.

> Another scenario is where we have SSD and HDD swapfiles with tiering. If
> a page starts in SSD and goes to HDD, we'll have to swap table entries
> for it (as above). The SSD entry would be wasted (has a redirection),
> but Chris mentioned that we can fix this by allocating another frontend
> cluster that points at the same SSD slot. How does this fit in the

No a fix. It is in the design consideration all alone. When the
redirection happen, that underlying physical block location pointer
will add to the backend allocator. The backend don't overlap with swap
entry location can be allocated from front end.

> 8-byte swap table entry tho? The 8-bytes can only hold the swapcache or
> shadow (and swapcount), but not the swap slot. For the current
> implementation, the slot is implied by the swap table index, but if we
> have separate front end swap tables, then we'll also need to store the
> actual slot.

Please read the above description regarding the front end and back end
split then ask your question again. The "continuous mTHP allocator"
above should answer your question.

> We can workaround this by having different types of clusters and swap
> tables, where "virtual" clusters have 16 bytes instead of 8 bytes per
> entry for that, sure.. but at that point we're at significantly more
> complexity to end up where virtual swap would have put us.

No, that further complicating things. Please don't go there. The front
end and back end location split is design to simplify situation like
this. It is conceptual much cleaner as well.

>
> Chris, Johannes, Nhat -- please correct me if I am wrong here or if I
> missed something. I think the current swap table work by Kairui is

Yes, see the above explanation of the "continuous mTHP allocator".

> great, and we can reuse it for virtual swap (as I mentioned above). But
> I don't think forcing everything to use a swapfile and extending swap
> tables to support indirections and frontend/backend split is the way to
> go (for the reasons described above).

IMHO, it is the way to go if consider mTHP allocating. You have
different assumption than mine in my design, I correct your
description as much as I can above. I am interested in your opinion
after read the above description of "continuous mTHP allocator", which
is match the 2024 LSF talk slide regarding swap cache redirecting
physical locations.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Yosry Ahmed 2 months ago

[..] 
> > Third, Chris, please stop trying to force this into a company vs company
> > situation. You keep mentioning personal attacks, but you are making this
> > personal more than anyone in this thread by taking this approach.
> 
> Let me clarify, it is absolutely not my intention to make it company
> vs company, that does not fit the description either. Please accept my
> apology for that. My original intention is that it is a group of
> people sharing the same idea. More like I am against a whole group
> (team VS). It is not about which company at all. Round robin N -> 1
> intense arguing put me in an uncomfortable situation, feeling
> excluded.
> 
> On one hand I wish there was someone representing  the group as the
> main speaker, that would make the discussion feel more equal, more
> inclusive. On the other hand, any perspective is important, it is hard
> to require the voice to route through the main speaker. It is hard to
> execute in practice. So I give up suggesting that.  I am open for
> suggestions on how to make the discussion more inclusive for newcomers
> to the existing established group.

Every person is expressing their own opinion, I don't think there's a
way to change that or have a "representative" of each opinion. In fact,
changing that would be the opposite of inclusive.  

> 
> > Now with all of that out of the way, I want to try to salvage the
> > technical discussion here. Taking several steps back, and
> 
> Thank you for driving the discussion back to the technical side. I
> really appreciate it.
> 
> > oversimplifying a bit: Chris mentioned having a frontend and backend and
> > an optional redirection when a page is moved between swap backends. This
> > is conceptually the same as the virtual swap proposal.
> 
> In my perspective, it is not the same as a virtual swap proposal.
> There is some overlap, they both can do redirection.
> 
> But they originally aim to solve two different problems. One of the
> important goals of the swap table is  to allow continuing mTHP swap
> entry when all the space left is not continues. For the rest of the
> discusion we call it "continuous mTHP allocator". It allocate
> continuous swap entry out of non continues file location.
> 
> Let's say you have a 1G swapfile, all full not available slots.
> 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces
> add up to 16K.
> 2) Now allocate one mTHP order 2, 16K in size.
> Previous allocator can not be satisfied with this requirement. Because
> the 4 empty slots are not contiguous.
> Here the redirection and growth of the front swap entry comes in, it
> is all part of the consideration all alone, not an afterthought.
> This following step will allow allocating 16K continuous swap entries
> out of offset [1,3,5,7]
> 3) We grow the front end part of the swapfile, effectively bump up the
> max size and add a new cluster of order 2, with a swap table.
> That is where the front end of the swap and back end file store comes in.

There's no reason why we cannot do the same with virtual swap, even if
it wasn't the main motivaiton, I don't see why we can't achieve the same
result.

> 
> BTW, Please don't accuse me copy cat the name "virtual swapfile". I
> introduce it here 1/8/2025 before Nhat does:

I don't think anyone cares about the actual names, or accused anyone of
copycatting anything.

> https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
> ==============quote==============
> I think we need to have a separation of the swap cache and the backing
> of IO of the swap file. I call it the "virtual swapfile".
> It is virtual in two aspect:
> 1) There is an up front size at swap on, but no up front allocation of
> the vmalloc array. The array grows as needed.
> 2) There is a virtual to physical swap entry mapping. The cost is 4
> bytes per swap entry. But it will solve a lot of problems all
> together.
> ==============quote ends =========
> Side story:
> I want to pass the "virtual swapfile"  for Kairui to propose as LSF
> topic. Coincidentally  Nhat proposes the virtual swap as a LSF topic
> at 1/16/2025, a few days after I mention "virtual swapfile" in the lsf
> topic related discussion. It is right before Kairui purpose "virtual
> swapfile". Kairui renamed our version as "swap table". That is the
> history behind the name of "swap table".
> https://lore.kernel.org/linux-mm/20250116092254.204549-1-nphamcs@gmail.com/
> 
> I am sure Nhat did not see that email and come up with it
> independently, coincidentally. I just want to establish that I have
> prior art introducing the name "virtual swapfile" before Nhat's LSF
> "virtual swap" topic. After all, it is just a name. I am just as happy
> using "swap table".
> 
> To avoid confuse the reader I will call my version of "virtual swap"
> the "front end".
> 
> The front end owns the cluster and swap table (swap cache). 8 bytes.
> The back end only contain file position pointer. 4 bytes.
> 
> 4) The back end will need different allocate because the allocating
> assumption is different, it does not have alignment requirement. It
> just need to track which block location is available.
> It will need to have a back end specific allocator.  It only manage
> the location of the swapfile cannot allocate from fronted. e.g.
> redirection entry create a hole. or the new cluster added from step 3.
> 
> 5) the backend location pointer is optional of the cluster. For the
> cluster new allocated from step, It must have location pointer,
> because its offset is out of the backing file range.
> That is a 4 byte just like a swap entry.
> This backend location pointer can be used by solution like VS as well.
> That is part of the consideration as well, so not a after thought.
> The allocator mention here is more like a file system design rather
> than pure memory location, because it need to consider block location
> for combining block level IO.
> 
> So the mTHP allocator can do swapfile location redirection. But that
> is a side benefit of a different design goal (mTHP allocation). This
> physical location pointer description match my 2024 LSF pony talk
> slide. I just did not put text in the slide there. So it is not some
> thing after thought, it pre-dates back to 2024 talks.
> 
> > I think the key difference here is:
> > - In Chris's proposal, we start with a swap entry that represents a swap
> >   slot in swapfile A. If we do writeback (or swap tiering), we create
> >   another swap entry in swapfile B, and have the first swap entry point
> 
> Correction. Instead of swapfile B, Backend location in swapfile B. in
> step 5). It only 4 byte. The back end does not have swap cache. The
> swap cache belong to front end A (8 bytes).

Ack.

> 
> >   to it instead of the slot in swapfile A. If we want to reuse the swap
> >   slot in swapfile A, we create a new swap entry that points to it.
> >
> >   So we start with a swap entry that directly maps to a swap slot, and
> 
> Again, in my description swap slot A has a file backend location
> pointer points to swapfile B.
> It is only the bottom half the swap slot B, not the full swap slot. It
> does not have 8 byte swap entry overhead of B.

Ack.

> 
> >   optionally put a redirection there to point to another swap slot for
> >   writeback/tiering.
> 
> Point to another swapfile location backend, not swap entry.(4 bytes)

Ack.

> 
> >   Everything is a swapfile, even zswap will need to be represented by a
> >   separate (ghost) swapfile.
> 
> Allow ghost swapfile. I wouldn't go as far saying ban the current
> zswap writeback, that part is TBD. My description is enable memory
> swap tiers without actual physical file backing. Enable ghost
> swapfile.
> 
> >
> > - In the virtual swap proposal, swap entries are in a completely
> >   different space than swap slots. A swap entry points to an arbitrary
> >   swap slot (or zswap entry) from the beginning, and writeback (or
> >   tiering) does not change that, it only changes what is being pointed
> >   to.
> >
> > Regarding memory overhead (assuming x86_64), Chris's proposal has 8
> > bytes per entry in the swap table that is used to hold both the swap
> > count as well as the swapcache or shadow entry. Nhat's RFC for virtual
> Ack
> 
> > swap had 48 bytes of overhead, but that's a PoC of a specific
> > implementaiton.
> 
> Ack.
> 
> > Disregarding any specific implementation, any space optimizations that
> > can be applied to the swap table (e.g. combining swap count and
> > swapcache in an 8 byte field) can also be applied to virtual swap. The
> > only *real* difference is that with virtual swap we need to store the
> > swap slot (or zswap entry), while for the current swap table proposal it
> > is implied by the index of the entry. That's an additional 8 bytes.
> 
> No, the VS has a smaller design scope. VS does not enable "continous
> mTHP allocation" . At least that is not mention in any previous VS
> material.

Why not? Even if it wasn't specifically called out as part of the
motivation, it still achieves that. What we need for the mTHP swap is to
have a redirection layer. Both virtual swap or the front-end/back-end
design achieve that.

> 
> > So I think a fully optimized implementation of virtual swap could end up
> > with an overhead of 16 bytes per-entry. Everything else (locks,
> > rcu_head, etc) can probably be optimized away by using similar
> > optimizations as the swap table (e.g. do locking and alloc/freeing in
> 
> With the continues mTHP allocator mention above, it already has the
> all things VS needed.
> I am not sure we still need VS if we have "continues mTHP allocator",
> that is TBD.

As I mentioned above, I think the front-end/back-end swap tables and
virtual swap are conceptually very similar. The more we discuss this the
more I am convinced about this tbh. In both cases we provide an
indirection layer such that we can change the backend or backing
swapfile without updating the page tables, and allow thing like mTHP
swap without having contiguous slots in the swapfile.

> 
> Yes, VS can reuse the physical location pointer by "continues mTHP allocator".
> 
> The overhead is for above swap table of redirection is 12 bytes not 16 bytes.

Honeslty if it boils down to 4 bytes per page, I think that's a really
small difference. Especially that it doesn't apply to all cases (e.g.
not the zswap-only case that Google currently uses).

> 
> > batches). In fact, I think we can use the swap table as the allocator in
> > the virtual swap space, reusing all the locking and allocation
> 
> That is my feel all alone. Let swap table manage that.
> 
> > optimizations. The difference would be that the swap table is indexed by
> > the virtual swap ID rather than the swap slot index.
> 
>  In the "continous mTHP allocator" it is just physical location pointer,
> 
> > Another important aspect here, in the simple case the swap table does
> > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
> > the difference isn't large to begin with, I don't think it's always the
> > case. I think this is only true for the simple case of having a swapped
> > out page on a disk swapfile or in a zswap (ghost) swapfile.
> 
> Please redo your evaluation after reading the above "continuous mTHP alloctor".

I did, and if anything I am more convinced that the designs are
conceptually close. The main difference is that the virtual swap
approach is more flexible in my opinion because the backend doesn't have
to be a swapfile, and we don't need "ghost" to use zswap and manage it
like a swapfile.

> 
> > Once a page is written back from zswap to disk swapfile, in the swap
> > table approach we'll have two swap table entries. One in the ghost
> 
> One one entry with back end location pointer. (12 byte)
> 
> > swapfile (with a redirection), and one in the disk swapfile. That's 16
> > bytes, equal to the overhead of virtual swap.
> 
> Again 12 bytes using "continues mTHP allocator" frame work.

Ack.

> 
> > Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with
> > tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with
> > 3 swap table entries for a single swapped out page. That's 24 bytes. So
> > the memory overhead is not really constant, it scales with the number of
> > tiers (as opposed to virtual swap).
> 
> Nope, Only one front swap entry remain the same, every time it write
> to a different tier, it only update the back end physical location
> pointer.
> It always points to the finial physical location. Only 12 bytes total.

Ack.

> 
> You are paying 24 bytes because you don't have the front end vs back end split.
> Your redirection includes the front end 8 byte as well. Because you
> include the front end, now you need to do the relay forward.
> That is the benefit to have front end and back end split of the swap
> file. Make it more like a file system design.
> 
> > Another scenario is where we have SSD and HDD swapfiles with tiering. If
> > a page starts in SSD and goes to HDD, we'll have to swap table entries
> > for it (as above). The SSD entry would be wasted (has a redirection),
> > but Chris mentioned that we can fix this by allocating another frontend
> > cluster that points at the same SSD slot. How does this fit in the
> 
> No a fix. It is in the design consideration all alone. When the
> redirection happen, that underlying physical block location pointer
> will add to the backend allocator. The backend don't overlap with swap
> entry location can be allocated from front end.
> 
> > 8-byte swap table entry tho? The 8-bytes can only hold the swapcache or
> > shadow (and swapcount), but not the swap slot. For the current
> > implementation, the slot is implied by the swap table index, but if we
> > have separate front end swap tables, then we'll also need to store the
> > actual slot.
> 
> Please read the above description regarding the front end and back end
> split then ask your question again. The "continuous mTHP allocator"
> above should answer your question.

Yeah, the 8 bytes front-end and 4-bytes backend answer this.

> 
> > We can workaround this by having different types of clusters and swap
> > tables, where "virtual" clusters have 16 bytes instead of 8 bytes per
> > entry for that, sure.. but at that point we're at significantly more
> > complexity to end up where virtual swap would have put us.
> 
> No, that further complicating things. Please don't go there. The front
> end and back end location split is design to simplify situation like
> this. It is conceptual much cleaner as well.

Yeah that was mostly hypothetical.

> 
> >
> > Chris, Johannes, Nhat -- please correct me if I am wrong here or if I
> > missed something. I think the current swap table work by Kairui is
> 
> Yes, see the above explanation of the "continuous mTHP allocator".
> 
> > great, and we can reuse it for virtual swap (as I mentioned above). But
> > I don't think forcing everything to use a swapfile and extending swap
> > tables to support indirections and frontend/backend split is the way to
> > go (for the reasons described above).
> 
> IMHO, it is the way to go if consider mTHP allocating. You have
> different assumption than mine in my design, I correct your
> description as much as I can above. I am interested in your opinion
> after read the above description of "continuous mTHP allocator", which
> is match the 2024 LSF talk slide regarding swap cache redirecting
> physical locations.

As I mentioned, I am still very much convinced the designs are
conceptually very similar and the main difference is whether the
"backend" is 4 bytes and points at a slot in a swapfile, or a generic
8-byte pointer.

FWIW, we can use 4 bytes in virtual swap as well if we leave the xarray
in zswap. 4 bytes is plenty of space for an index into the zswap xarray
if we no longer use the swap offset. But if we use 8 bytes we can
actually get rid of the zswap xarray, by merging it with the virtual
swap xarray, or even stop using xarrays completely if we adopt the
current swap table allocator for the virtual swap indexes.

As Nhat mentioned earlier, I suspect we'll end up not using any extra
overhead at all for the zswap-only case, or even reducing the current
overhead.
> 
> Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months ago

, t

On Thu, Dec 4, 2025 at 10:16 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> > On one hand I wish there was someone representing  the group as the
> > main speaker, that would make the discussion feel more equal, more
> > inclusive. On the other hand, any perspective is important, it is hard
> > to require the voice to route through the main speaker. It is hard to
> > execute in practice. So I give up suggesting that.  I am open for
> > suggestions on how to make the discussion more inclusive for newcomers
> > to the existing established group.
>
> Every person is expressing their own opinion, I don't think there's a
> way to change that or have a "representative" of each opinion. In fact,
> changing that would be the opposite of inclusive.

Ack, that is why I did not suggest a main speaker token approach. On
the other hand, there are still some considerations that can be taken
care of from the group side that do not overwhelm the single person if
a similar opinion has been expressed and is waiting for response. N vs
1 arguing does put the single person in unfair dis-advantage and
alienates the single person. We should consider the effect of that.
OK. Enough said on this and let's move on.

> > > Now with all of that out of the way, I want to try to salvage the
> > > technical discussion here. Taking several steps back, and
> >
> > Thank you for driving the discussion back to the technical side. I
> > really appreciate it.
> >
> > > oversimplifying a bit: Chris mentioned having a frontend and backend and
> > > an optional redirection when a page is moved between swap backends. This
> > > is conceptually the same as the virtual swap proposal.
> >
> > In my perspective, it is not the same as a virtual swap proposal.
> > There is some overlap, they both can do redirection.
> >
> > But they originally aim to solve two different problems. One of the
> > important goals of the swap table is  to allow continuing mTHP swap
> > entry when all the space left is not continues. For the rest of the
> > discusion we call it "continuous mTHP allocator". It allocate
> > continuous swap entry out of non continues file location.
> >
> > Let's say you have a 1G swapfile, all full not available slots.
> > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces
> > add up to 16K.
> > 2) Now allocate one mTHP order 2, 16K in size.
> > Previous allocator can not be satisfied with this requirement. Because
> > the 4 empty slots are not contiguous.
> > Here the redirection and growth of the front swap entry comes in, it
> > is all part of the consideration all alone, not an afterthought.
> > This following step will allow allocating 16K continuous swap entries
> > out of offset [1,3,5,7]
> > 3) We grow the front end part of the swapfile, effectively bump up the
> > max size and add a new cluster of order 2, with a swap table.
> > That is where the front end of the swap and back end file store comes in.
>
> There's no reason why we cannot do the same with virtual swap, even if
> it wasn't the main motivaiton, I don't see why we can't achieve the same
> result.

Yes, they can. By largely copying the swap table approach to achieve
the same result. Before I point out the importance of the memory
overhead of per swap slot entry, the 48 bytes is not production
quality. VS hasn't really made good progress toward shrinking down the
per slot memory usage at a similar level. Not even close. That is
until you propose using the earlier stage of the swap table to compete
with the later stage of the swap table, by using the exact same
approach of the later stage of the swap table. Please don't use swap
table ideas to do a knockoff clone of swap table and take the final
credit. That is very not decent, I don't think that matches the
upstream spirit either. Please respect the originality of the idea and
give credit where it is due, after all, that is how the academic
system is built on.

> > BTW, Please don't accuse me copy cat the name "virtual swapfile". I
> > introduce it here 1/8/2025 before Nhat does:
>
> I don't think anyone cares about the actual names, or accused anyone of
> copycatting anything.

There are repeat projections cast on me as the "after thought", I want
the people who call me "after thought" acknowledge that I am the
"leading thought", the "original thought". Just joking.

> > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
> > ==============quote==============
> > I think we need to have a separation of the swap cache and the backing
> > of IO of the swap file. I call it the "virtual swapfile".
> > It is virtual in two aspect:
> > 1) There is an up front size at swap on, but no up front allocation of
> > the vmalloc array. The array grows as needed.
> > 2) There is a virtual to physical swap entry mapping. The cost is 4
> > bytes per swap entry. But it will solve a lot of problems all
> > together.
> > ==============quote ends =========

The above prior write up nicely sums up the main idea behind VS, would
you agree?

I want to give Nhat the benefit of the doubt that he did not commit
plagiarism. Since now VS has changed strategy to clone swap tables
against swap tables. I would add the points that, please be decent and
be collaborative. Respect the originality of the ideas. If this is in
the academic context, the email sent to the list considers paper
submission, the VS paper would definitely get ding on not properly
citing priory paper of "virtual swapfile" above.

So far team VS haven't participated much on swap table development.
There are a few ack from Nhat, but there is not really any discussion
showing insight of understanding the swap table. Now VS wants to clone
the swap table against the swap table. Why not just join the team swap
table. Really take part of the review of swap table phase N, not just
rubber stamping. Please be collaborative, be decent, do it the proper
upstream way.

> > Correction. Instead of swapfile B, Backend location in swapfile B. in
> > step 5). It only 4 byte. The back end does not have swap cache. The
> > swap cache belong to front end A (8 bytes).
>
> Ack.

Thanks for the Ack.

> > Again, in my description swap slot A has a file backend location
> > pointer points to swapfile B.
> > It is only the bottom half the swap slot B, not the full swap slot. It
> > does not have 8 byte swap entry overhead of B.
>
> Ack.

Thanks for the Ack.

> > Point to another swapfile location backend, not swap entry.(4 bytes)
>
> Ack.

Thanks for the Ack.

> > > Disregarding any specific implementation, any space optimizations that
> > > can be applied to the swap table (e.g. combining swap count and
> > > swapcache in an 8 byte field) can also be applied to virtual swap. The
> > > only *real* difference is that with virtual swap we need to store the
> > > swap slot (or zswap entry), while for the current swap table proposal it
> > > is implied by the index of the entry. That's an additional 8 bytes.
> >
> > No, the VS has a smaller design scope. VS does not enable "continous
> > mTHP allocation" . At least that is not mention in any previous VS
> > material.
>
> Why not? Even if it wasn't specifically called out as part of the
> motivation, it still achieves that. What we need for the mTHP swap is to
> have a redirection layer. Both virtual swap or the front-end/back-end
> design achieve that.

Using your magic against you, that is what I call an "after thought"
of the century. Just joking.

Yes, you can do that, by cloning swap tables against swpa tables. It
is just not considered decent in my book. Please be collaborative. Now
I have demonstrated the swap table side is the one with most of the
original ideas and advanced technical designs. Please let the team
swap table finish up what they originally planned, not steal the
thunder at the final glory. If team VS wants to help speed up the
process, since priority is one of VS main considerations, now the
design has been converging to swap tables. Please help reviewing the
swap table landing phases submission. Crew, walk, run. Even if you
want to use the swap table against the swap table. Reviewing landing
swap table code is a good way to understand swap tables. Let the team
swap tables to finish up the original goal. Once swpa tables have the
continue mTHP allocator, we can example any other VS feature can be
added on top of that.

> > With the continues mTHP allocator mention above, it already has the
> > all things VS needed.
> > I am not sure we still need VS if we have "continues mTHP allocator",
> > that is TBD.
>
> As I mentioned above, I think the front-end/back-end swap tables and
> virtual swap are conceptually very similar. The more we discuss this the

Of course very similar, for all we know it is possible they come from
the same source.
https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/

> more I am convinced about this tbh. In both cases we provide an
> indirection layer such that we can change the backend or backing
> swapfile without updating the page tables, and allow thing like mTHP
> swap without having contiguous slots in the swapfile.
>
> >
> > Yes, VS can reuse the physical location pointer by "continues mTHP allocator".
> >
> > The overhead is for above swap table of redirection is 12 bytes not 16 bytes.
>
> Honeslty if it boils down to 4 bytes per page, I think that's a really
> small difference.

4 bytes per slot entry difference is leaving free memory on the table.
Why not grab it?
Do you know that all those swap phase II..IV just to save 3 bytes per
slot (and clean up the code in the process)?
4 bytes out of total 8 or 12 bytes that is 33% - 50% difference on the
per solt usage.

> Especially that it doesn't apply to all cases (e.g.
> not the zswap-only case that Google currently uses).

I want to ask a clarifying question here. My understanding is that VS
is always on.
If we are doing zswap-only, does VS still have the 8+4 = 12 bytes overhead?

I want to make sure if we are not using the redirection, in the zswap
only case, we shouldn't pay the price for it.
Again that is another free money on the table.

> > > batches). In fact, I think we can use the swap table as the allocator in
> > > the virtual swap space, reusing all the locking and allocation

Yes, you can. Is there a technical difference to do so? If not, why
steal the thunder at finial glory? Why not let swap tables finish up
its course?

> >  In the "continous mTHP allocator" it is just physical location pointer,
> >
> > > Another important aspect here, in the simple case the swap table does
> > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
> > > the difference isn't large to begin with, I don't think it's always the
> > > case. I think this is only true for the simple case of having a swapped
> > > out page on a disk swapfile or in a zswap (ghost) swapfile.
> >
> > Please redo your evaluation after reading the above "continuous mTHP alloctor".
>
> I did, and if anything I am more convinced that the designs are
> conceptually close. The main difference is that the virtual swap
> approach is more flexible in my opinion because the backend doesn't have
> to be a swapfile, and we don't need "ghost" to use zswap and manage it
> like a swapfile.

It seems the design has converged to the swap table side. Even the
"virtual swapfile" concept could have come from the swap table side.
I'm flattered, copying is the best compliment from the competitor.

Now we settle on the big design, the rest of the design difference is
very small.

Let's discuss the VS virtual swap interface without actual swapfile.

One question:
Does VS virtual swap file expose any swap file interface be referenced
by swap on/off? I assume no, please correct me if you do.

I think it could have downsides.

1) It is not compatible with normal /etc/fstab design. Now you need
seperate init script to enable disable VS.
2) It does not go through swap on/off path. That creates
complications. As we know we have a lot of bugs exposed in the swap
on/off. It is a very tricky business to get it right. I would
recommend staying away from cloning a separate path for the
swapon/off. The VS introduces a new kernel interface that also needs
to be maintained.
3) The customer can't swap files round robin. As we know some
companies are using multiple swap files to reduce the si->lock
contention. If I recall correctly 8 swapfiles. Forcing one virtual
swapfile will force go through the same si->locks has performance
penalties.
4) Having an overall virtual swap file imposes a design challenge in
swap.tiers world. Because it does not have a swapfile, the swapfile
priority does not apply.
5) Keep it simple. Using your magic against you, the ghost swapfile
conceptually can do whatever VS conceptually can do as well. You can
consider the ghost swapfile header is just a config file of the VS to
setup the swapfile. It saves the extra init script posted on users.

BTW, the "ghost swapfile" I will properly rename it back to "virtual
swapfile" in the code, as I earn that term's priority date. And you
don't mind what it is really called.

> > Again 12 bytes using "continues mTHP allocator" frame work.
>
> Ack.

Thanks for the Ack.

>
> >
> > > Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with
> > > tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with
> > > 3 swap table entries for a single swapped out page. That's 24 bytes. So
> > > the memory overhead is not really constant, it scales with the number of
> > > tiers (as opposed to virtual swap).
> >
> > Nope, Only one front swap entry remain the same, every time it write
> > to a different tier, it only update the back end physical location
> > pointer.
> > It always points to the finial physical location. Only 12 bytes total.
>
> Ack.

Thanks for the Ack. That confirms the swap table side is actually
having the more advanced technical design all alone.

> > Please read the above description regarding the front end and back end
> > split then ask your question again. The "continuous mTHP allocator"
> > above should answer your question.
>
> Yeah, the 8 bytes front-end and 4-bytes backend answer this.

Ack

> > > We can workaround this by having different types of clusters and swap
> > > tables, where "virtual" clusters have 16 bytes instead of 8 bytes per
> > > entry for that, sure.. but at that point we're at significantly more
> > > complexity to end up where virtual swap would have put us.
> >
> > No, that further complicating things. Please don't go there. The front
> > end and back end location split is design to simplify situation like
> > this. It is conceptual much cleaner as well.
>
> Yeah that was mostly hypothetical.

Ack.

>
> >
> > >
> > > Chris, Johannes, Nhat -- please correct me if I am wrong here or if I
> > > missed something. I think the current swap table work by Kairui is
> >
> > Yes, see the above explanation of the "continuous mTHP allocator".
> >
> > > great, and we can reuse it for virtual swap (as I mentioned above). But
> > > I don't think forcing everything to use a swapfile and extending swap
> > > tables to support indirections and frontend/backend split is the way to
> > > go (for the reasons described above).
> >
> > IMHO, it is the way to go if consider mTHP allocating. You have
> > different assumption than mine in my design, I correct your
> > description as much as I can above. I am interested in your opinion
> > after read the above description of "continuous mTHP allocator", which
> > is match the 2024 LSF talk slide regarding swap cache redirecting
> > physical locations.
>
> As I mentioned, I am still very much convinced the designs are
> conceptually very similar and the main difference is whether the
> "backend" is 4 bytes and points at a slot in a swapfile, or a generic
> 8-byte pointer.

Thanks, as I said earlier, I am flattered.

It is of course conceptually it is very close after you copy all the
internal design elements of the swap table approach.

> FWIW, we can use 4 bytes in virtual swap as well if we leave the xarray
> in zswap. 4 bytes is plenty of space for an index into the zswap xarray
> if we no longer use the swap offset. But if we use 8 bytes we can
> actually get rid of the zswap xarray, by merging it with the virtual
> swap xarray, or even stop using xarrays completely if we adopt the
> current swap table allocator for the virtual swap indexes.
>
> As Nhat mentioned earlier, I suspect we'll end up not using any extra
> overhead at all for the zswap-only case, or even reducing the current
> overhead.

In my design there is no extra xarray for zswap, you just have to take
my word for it now. That is very late in the game, finish the swap
table glory first.

Yosry, thank you for driving a good technical discussion. I really enjoy it.

I wish the beginning of the discussion went through this path instead.
The multi NACK first ask questions later and the condescending tone at
the beginning of the discussion really upsets me. The me alone facing
four round robin intense arguing doesn't help either. It makes me feel
I am not welcomed. I am short tempered and easily get triggered. I am
sorry for my behavior as well. Just give me a few moments and I will
come to my senses.

The ironic part of the discussion is that the "dead end" is the one
being converging to. The "afterthought" turns out to be "leading
thought". Let that be a lesson for everyone, me included. Be nice to
the people who hold different ideas than yourself.

Looking forward to more discussion like this.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Yosry Ahmed 2 months ago

On Thu, Dec 04, 2025 at 02:11:57PM +0400, Chris Li wrote:
[..]
> > >
> > > > oversimplifying a bit: Chris mentioned having a frontend and backend and
> > > > an optional redirection when a page is moved between swap backends. This
> > > > is conceptually the same as the virtual swap proposal.
> > >
> > > In my perspective, it is not the same as a virtual swap proposal.
> > > There is some overlap, they both can do redirection.
> > >
> > > But they originally aim to solve two different problems. One of the
> > > important goals of the swap table is  to allow continuing mTHP swap
> > > entry when all the space left is not continues. For the rest of the
> > > discusion we call it "continuous mTHP allocator". It allocate
> > > continuous swap entry out of non continues file location.
> > >
> > > Let's say you have a 1G swapfile, all full not available slots.
> > > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces
> > > add up to 16K.
> > > 2) Now allocate one mTHP order 2, 16K in size.
> > > Previous allocator can not be satisfied with this requirement. Because
> > > the 4 empty slots are not contiguous.
> > > Here the redirection and growth of the front swap entry comes in, it
> > > is all part of the consideration all alone, not an afterthought.
> > > This following step will allow allocating 16K continuous swap entries
> > > out of offset [1,3,5,7]
> > > 3) We grow the front end part of the swapfile, effectively bump up the
> > > max size and add a new cluster of order 2, with a swap table.
> > > That is where the front end of the swap and back end file store comes in.
> >
> > There's no reason why we cannot do the same with virtual swap, even if
> > it wasn't the main motivaiton, I don't see why we can't achieve the same
> > result.
> 
> Yes, they can. By largely copying the swap table approach to achieve
> the same result.

What copying? Using virtual swap IDs inherently means that we are not
tied to coniguous swapfile slots to swap out large folio.

> Before I point out the importance of the memory
> overhead of per swap slot entry, the 48 bytes is not production
> quality. VS hasn't really made good progress toward shrinking down the
> per slot memory usage at a similar level. Not even close.

Nhat said repeatedly that what he sent was a PoC and that the overhead
can be optimized. Completely disregarding Nhat's implementation, I
described how conceptually the overhead can be lower, probably down to
16 bytes on x86_64.

> That is until you propose using the earlier stage of the swap table to
> compete with the later stage of the swap table, by using the exact
> same approach of the later stage of the swap table. Please don't use
> swap table ideas to do a knockoff clone of swap table and take the
> final credit. That is very not decent, I don't think that matches the
> upstream spirit either. Please respect the originality of the idea and
> give credit where it is due, after all, that is how the academic
> system is built on.

Ugh..what?

All virtual swap propasal made it clear that they are PoCs and that the
memory overhead can be shrunk. Compacting fields in the swap descriptor
(or whatever it's called) to save memory is not "an original idea". What
I said is that any memory optimizations that you apply to swap table can
equally apply to the virtual swap because they are conceptually storing
the same data (aside from the actual swap slot or zswap entry).

The other part is allocating and freeing in batches instead of
per-entry. This is an implementation detail, and Nhat mentioned early on
that we can do this to save memory (specifically for the locking, but it
applies for other things). This is not a novel approach either.

The comparison to swap table was to clarify things, not "knocking off"
anything.

[..] 
> > > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
> > > ==============quote==============
> > > I think we need to have a separation of the swap cache and the backing
> > > of IO of the swap file. I call it the "virtual swapfile".
> > > It is virtual in two aspect:
> > > 1) There is an up front size at swap on, but no up front allocation of
> > > the vmalloc array. The array grows as needed.
> > > 2) There is a virtual to physical swap entry mapping. The cost is 4
> > > bytes per swap entry. But it will solve a lot of problems all
> > > together.
> > > ==============quote ends =========
> 
> The above prior write up nicely sums up the main idea behind VS, would
> you agree?
> 
> I want to give Nhat the benefit of the doubt that he did not commit
> plagiarism. Since now VS has changed strategy to clone swap tables
> against swap tables. I would add the points that, please be decent and
> be collaborative. Respect the originality of the ideas. If this is in
> the academic context, the email sent to the list considers paper
> submission, the VS paper would definitely get ding on not properly
> citing priory paper of "virtual swapfile" above.

Okay let me make something very clear. This idea to introduce an
redirection layer for swap, call it virtual swap or swap table or mTHP
swap allocator or whatever is NOT new. It's NOT your idea, or my idea,
or Nhat's. I first heard about it from Johannes in 2022, and it was
floated around by Rik in 2011 based on discussions with others:
https://lore.kernel.org/linux-mm/4DA25039.3020700@redhat.com/

So no one here is trying to take credit for the idea, except you. No one
here is plagiarising anything. We are discussing different design and
implementations of the same idea. Sure, people have different ideas
about how to implement it, whether it's using an xarray or a swap table,
or what exactly to point at in the backend.

But these things are usually hashed out during discussions and code
reviews, and the better approach is taken by the community. You are the
one being very defensive about his "ideas", making it about personal
credit, and creating a problem where there was none. No one is trying to
steal any credit. Kairui's patches introducing the swap table are there
under his name. If we extend his work for the redirection layer, no
matter the direction we take it in, it's not taking away from his work,
it's adding to it.

> 
> So far team VS haven't participated much on swap table development.
> There are a few ack from Nhat, but there is not really any discussion
> showing insight of understanding the swap table. Now VS wants to clone
> the swap table against the swap table. Why not just join the team swap
> table. Really take part of the review of swap table phase N, not just
> rubber stamping. Please be collaborative, be decent, do it the proper
> upstream way.

There are no "teams" here, you're the only who's consistently making
this into an argument between companies or teams or whatever. You keep
saying you want to have a technical discussion yet most of your response
is about hypotheticals around teams and stealing credit.

> > > > Disregarding any specific implementation, any space optimizations that
> > > > can be applied to the swap table (e.g. combining swap count and
> > > > swapcache in an 8 byte field) can also be applied to virtual swap. The
> > > > only *real* difference is that with virtual swap we need to store the
> > > > swap slot (or zswap entry), while for the current swap table proposal it
> > > > is implied by the index of the entry. That's an additional 8 bytes.
> > >
> > > No, the VS has a smaller design scope. VS does not enable "continous
> > > mTHP allocation" . At least that is not mention in any previous VS
> > > material.
> >
> > Why not? Even if it wasn't specifically called out as part of the
> > motivation, it still achieves that. What we need for the mTHP swap is to
> > have a redirection layer. Both virtual swap or the front-end/back-end
> > design achieve that.
> 
> Using your magic against you, that is what I call an "after thought"
> of the century. Just joking.
> 
> Yes, you can do that, by cloning swap tables against swpa tables. It
> is just not considered decent in my book. Please be collaborative. Now
> I have demonstrated the swap table side is the one with most of the
> original ideas and advanced technical designs. Please let the team
> swap table finish up what they originally planned, not steal the
> thunder at the final glory. If team VS wants to help speed up the
> process, since priority is one of VS main considerations, now the
> design has been converging to swap tables. Please help reviewing the
> swap table landing phases submission. Crew, walk, run. Even if you
> want to use the swap table against the swap table. Reviewing landing
> swap table code is a good way to understand swap tables. Let the team
> swap tables to finish up the original goal. Once swpa tables have the
> continue mTHP allocator, we can example any other VS feature can be
> added on top of that.

More rants about hypothetical cloning, knocking off, etc.

> 
> > > With the continues mTHP allocator mention above, it already has the
> > > all things VS needed.
> > > I am not sure we still need VS if we have "continues mTHP allocator",
> > > that is TBD.
> >
> > As I mentioned above, I think the front-end/back-end swap tables and
> > virtual swap are conceptually very similar. The more we discuss this the
> 
> Of course very similar, for all we know it is possible they come from
> the same source.
> https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/

Your lack of self-awareness is impressive.

> 
> > more I am convinced about this tbh. In both cases we provide an
> > indirection layer such that we can change the backend or backing
> > swapfile without updating the page tables, and allow thing like mTHP
> > swap without having contiguous slots in the swapfile.
> >
> > >
> > > Yes, VS can reuse the physical location pointer by "continues mTHP allocator".
> > >
> > > The overhead is for above swap table of redirection is 12 bytes not 16 bytes.
> >
> > Honeslty if it boils down to 4 bytes per page, I think that's a really
> > small difference.
> 
> 4 bytes per slot entry difference is leaving free memory on the table.
> Why not grab it?
> Do you know that all those swap phase II..IV just to save 3 bytes per
> slot (and clean up the code in the process)?
> 4 bytes out of total 8 or 12 bytes that is 33% - 50% difference on the
> per solt usage.

Cleaning up the swap code and the performace optimizations in Kairui's
work are a lot more important that saving 3 bytes per slot, especially
if it's only for actively used slots. That's less than 0.1% of the
memory saved by swapping out a page to disk.

> 
> > Especially that it doesn't apply to all cases (e.g.
> > not the zswap-only case that Google currently uses).
> 
> I want to ask a clarifying question here. My understanding is that VS
> is always on.
> If we are doing zswap-only, does VS still have the 8+4 = 12 bytes overhead?
> 
> I want to make sure if we are not using the redirection, in the zswap
> only case, we shouldn't pay the price for it.
> Again that is another free money on the table.

IIUC the extra memory used for the virtual swap can be offset by
reduction in zswap_entry, so for the zswap-only case I don't believe
there will be any additional overhead.

> 
> > > > batches). In fact, I think we can use the swap table as the allocator in
> > > > the virtual swap space, reusing all the locking and allocation
> 
> Yes, you can. Is there a technical difference to do so? If not, why
> steal the thunder at finial glory? Why not let swap tables finish up
> its course?
> 
> > >  In the "continous mTHP allocator" it is just physical location pointer,
> > >
> > > > Another important aspect here, in the simple case the swap table does
> > > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
> > > > the difference isn't large to begin with, I don't think it's always the
> > > > case. I think this is only true for the simple case of having a swapped
> > > > out page on a disk swapfile or in a zswap (ghost) swapfile.
> > >
> > > Please redo your evaluation after reading the above "continuous mTHP alloctor".
> >
> > I did, and if anything I am more convinced that the designs are
> > conceptually close. The main difference is that the virtual swap
> > approach is more flexible in my opinion because the backend doesn't have
> > to be a swapfile, and we don't need "ghost" to use zswap and manage it
> > like a swapfile.
> 
> It seems the design has converged to the swap table side. Even the
> "virtual swapfile" concept could have come from the swap table side.
> I'm flattered, copying is the best compliment from the competitor.
> 
> Now we settle on the big design, the rest of the design difference is
> very small.

No, the design hasn't settled or converged on any "side". I am also not
going to respond to the rest of this email, and potentially other
emails. You keep twisting my words, making delusional claims, and
proving how difficult it is to have a technical conversation with you.

You kept mentioning that you want to keep the conversation on the
technical side, but when I tried to have a technical disucssion you
quickly drove it away from that.  Half of your email is basically
"everyone is trying to steal my cool ideas".

I tried salvaging the discussion but this is hopeless.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Kairui Song 2 months ago

On Fri, Dec 5, 2025 at 5:05 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Thu, Dec 04, 2025 at 02:11:57PM +0400, Chris Li wrote:
> [..]
> > > >
> > > > > oversimplifying a bit: Chris mentioned having a frontend and backend and
> > > > > an optional redirection when a page is moved between swap backends. This
> > > > > is conceptually the same as the virtual swap proposal.
> > > >
> > > > In my perspective, it is not the same as a virtual swap proposal.
> > > > There is some overlap, they both can do redirection.
> > > >
> > > > But they originally aim to solve two different problems. One of the
> > > > important goals of the swap table is  to allow continuing mTHP swap
> > > > entry when all the space left is not continues. For the rest of the
> > > > discusion we call it "continuous mTHP allocator". It allocate
> > > > continuous swap entry out of non continues file location.
> > > >
> > > > Let's say you have a 1G swapfile, all full not available slots.
> > > > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces
> > > > add up to 16K.
> > > > 2) Now allocate one mTHP order 2, 16K in size.
> > > > Previous allocator can not be satisfied with this requirement. Because
> > > > the 4 empty slots are not contiguous.
> > > > Here the redirection and growth of the front swap entry comes in, it
> > > > is all part of the consideration all alone, not an afterthought.
> > > > This following step will allow allocating 16K continuous swap entries
> > > > out of offset [1,3,5,7]
> > > > 3) We grow the front end part of the swapfile, effectively bump up the
> > > > max size and add a new cluster of order 2, with a swap table.
> > > > That is where the front end of the swap and back end file store comes in.
> > >
> > > There's no reason why we cannot do the same with virtual swap, even if
> > > it wasn't the main motivaiton, I don't see why we can't achieve the same
> > > result.
> >
> > Yes, they can. By largely copying the swap table approach to achieve
> > the same result.
>
> What copying? Using virtual swap IDs inherently means that we are not
> tied to coniguous swapfile slots to swap out large folio.
>
> > Before I point out the importance of the memory
> > overhead of per swap slot entry, the 48 bytes is not production
> > quality. VS hasn't really made good progress toward shrinking down the
> > per slot memory usage at a similar level. Not even close.
>
> Nhat said repeatedly that what he sent was a PoC and that the overhead
> can be optimized. Completely disregarding Nhat's implementation, I
> described how conceptually the overhead can be lower, probably down to
> 16 bytes on x86_64.
>
> > That is until you propose using the earlier stage of the swap table to
> > compete with the later stage of the swap table, by using the exact
> > same approach of the later stage of the swap table. Please don't use
> > swap table ideas to do a knockoff clone of swap table and take the
> > final credit. That is very not decent, I don't think that matches the
> > upstream spirit either. Please respect the originality of the idea and
> > give credit where it is due, after all, that is how the academic
> > system is built on.
>
> Ugh..what?
>
> All virtual swap propasal made it clear that they are PoCs and that the
> memory overhead can be shrunk. Compacting fields in the swap descriptor
> (or whatever it's called) to save memory is not "an original idea". What
> I said is that any memory optimizations that you apply to swap table can
> equally apply to the virtual swap because they are conceptually storing
> the same data (aside from the actual swap slot or zswap entry).
>
> The other part is allocating and freeing in batches instead of
> per-entry. This is an implementation detail, and Nhat mentioned early on
> that we can do this to save memory (specifically for the locking, but it
> applies for other things). This is not a novel approach either.
>
> The comparison to swap table was to clarify things, not "knocking off"
> anything.
>
> [..]
> > > > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
> > > > ==============quote==============
> > > > I think we need to have a separation of the swap cache and the backing
> > > > of IO of the swap file. I call it the "virtual swapfile".
> > > > It is virtual in two aspect:
> > > > 1) There is an up front size at swap on, but no up front allocation of
> > > > the vmalloc array. The array grows as needed.
> > > > 2) There is a virtual to physical swap entry mapping. The cost is 4
> > > > bytes per swap entry. But it will solve a lot of problems all
> > > > together.
> > > > ==============quote ends =========
> >
> > The above prior write up nicely sums up the main idea behind VS, would
> > you agree?
> >
> > I want to give Nhat the benefit of the doubt that he did not commit
> > plagiarism. Since now VS has changed strategy to clone swap tables
> > against swap tables. I would add the points that, please be decent and
> > be collaborative. Respect the originality of the ideas. If this is in
> > the academic context, the email sent to the list considers paper
> > submission, the VS paper would definitely get ding on not properly
> > citing priory paper of "virtual swapfile" above.
>
> Okay let me make something very clear. This idea to introduce an
> redirection layer for swap, call it virtual swap or swap table or mTHP
> swap allocator or whatever is NOT new. It's NOT your idea, or my idea,
> or Nhat's. I first heard about it from Johannes in 2022, and it was
> floated around by Rik in 2011 based on discussions with others:
> https://lore.kernel.org/linux-mm/4DA25039.3020700@redhat.com/
>
> So no one here is trying to take credit for the idea, except you. No one
> here is plagiarising anything. We are discussing different design and
> implementations of the same idea. Sure, people have different ideas
> about how to implement it, whether it's using an xarray or a swap table,
> or what exactly to point at in the backend.
>
> But these things are usually hashed out during discussions and code
> reviews, and the better approach is taken by the community. You are the
> one being very defensive about his "ideas", making it about personal
> credit, and creating a problem where there was none. No one is trying to
> steal any credit. Kairui's patches introducing the swap table are there
> under his name. If we extend his work for the redirection layer, no
> matter the direction we take it in, it's not taking away from his work,
> it's adding to it.
>
> >
> > So far team VS haven't participated much on swap table development.
> > There are a few ack from Nhat, but there is not really any discussion
> > showing insight of understanding the swap table. Now VS wants to clone
> > the swap table against the swap table. Why not just join the team swap
> > table. Really take part of the review of swap table phase N, not just
> > rubber stamping. Please be collaborative, be decent, do it the proper
> > upstream way.
>
> There are no "teams" here, you're the only who's consistently making
> this into an argument between companies or teams or whatever. You keep
> saying you want to have a technical discussion yet most of your response
> is about hypotheticals around teams and stealing credit.
>
> > > > > Disregarding any specific implementation, any space optimizations that
> > > > > can be applied to the swap table (e.g. combining swap count and
> > > > > swapcache in an 8 byte field) can also be applied to virtual swap. The
> > > > > only *real* difference is that with virtual swap we need to store the
> > > > > swap slot (or zswap entry), while for the current swap table proposal it
> > > > > is implied by the index of the entry. That's an additional 8 bytes.
> > > >
> > > > No, the VS has a smaller design scope. VS does not enable "continous
> > > > mTHP allocation" . At least that is not mention in any previous VS
> > > > material.
> > >
> > > Why not? Even if it wasn't specifically called out as part of the
> > > motivation, it still achieves that. What we need for the mTHP swap is to
> > > have a redirection layer. Both virtual swap or the front-end/back-end
> > > design achieve that.
> >
> > Using your magic against you, that is what I call an "after thought"
> > of the century. Just joking.
> >
> > Yes, you can do that, by cloning swap tables against swpa tables. It
> > is just not considered decent in my book. Please be collaborative. Now
> > I have demonstrated the swap table side is the one with most of the
> > original ideas and advanced technical designs. Please let the team
> > swap table finish up what they originally planned, not steal the
> > thunder at the final glory. If team VS wants to help speed up the
> > process, since priority is one of VS main considerations, now the
> > design has been converging to swap tables. Please help reviewing the
> > swap table landing phases submission. Crew, walk, run. Even if you
> > want to use the swap table against the swap table. Reviewing landing
> > swap table code is a good way to understand swap tables. Let the team
> > swap tables to finish up the original goal. Once swpa tables have the
> > continue mTHP allocator, we can example any other VS feature can be
> > added on top of that.
>
> More rants about hypothetical cloning, knocking off, etc.
>
> >
> > > > With the continues mTHP allocator mention above, it already has the
> > > > all things VS needed.
> > > > I am not sure we still need VS if we have "continues mTHP allocator",
> > > > that is TBD.
> > >
> > > As I mentioned above, I think the front-end/back-end swap tables and
> > > virtual swap are conceptually very similar. The more we discuss this the
> >
> > Of course very similar, for all we know it is possible they come from
> > the same source.
> > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/
>
> Your lack of self-awareness is impressive.
>
> >
> > > more I am convinced about this tbh. In both cases we provide an
> > > indirection layer such that we can change the backend or backing
> > > swapfile without updating the page tables, and allow thing like mTHP
> > > swap without having contiguous slots in the swapfile.
> > >
> > > >
> > > > Yes, VS can reuse the physical location pointer by "continues mTHP allocator".
> > > >
> > > > The overhead is for above swap table of redirection is 12 bytes not 16 bytes.
> > >
> > > Honeslty if it boils down to 4 bytes per page, I think that's a really
> > > small difference.
> >
> > 4 bytes per slot entry difference is leaving free memory on the table.
> > Why not grab it?
> > Do you know that all those swap phase II..IV just to save 3 bytes per
> > slot (and clean up the code in the process)?
> > 4 bytes out of total 8 or 12 bytes that is 33% - 50% difference on the
> > per solt usage.
>
> Cleaning up the swap code and the performace optimizations in Kairui's
> work are a lot more important that saving 3 bytes per slot, especially
> if it's only for actively used slots. That's less than 0.1% of the
> memory saved by swapping out a page to disk.
>
> >
> > > Especially that it doesn't apply to all cases (e.g.
> > > not the zswap-only case that Google currently uses).
> >
> > I want to ask a clarifying question here. My understanding is that VS
> > is always on.
> > If we are doing zswap-only, does VS still have the 8+4 = 12 bytes overhead?
> >
> > I want to make sure if we are not using the redirection, in the zswap
> > only case, we shouldn't pay the price for it.
> > Again that is another free money on the table.
>
> IIUC the extra memory used for the virtual swap can be offset by
> reduction in zswap_entry, so for the zswap-only case I don't believe
> there will be any additional overhead.
>
> >
> > > > > batches). In fact, I think we can use the swap table as the allocator in
> > > > > the virtual swap space, reusing all the locking and allocation
> >
> > Yes, you can. Is there a technical difference to do so? If not, why
> > steal the thunder at finial glory? Why not let swap tables finish up
> > its course?
> >
> > > >  In the "continous mTHP allocator" it is just physical location pointer,
> > > >
> > > > > Another important aspect here, in the simple case the swap table does
> > > > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although
> > > > > the difference isn't large to begin with, I don't think it's always the
> > > > > case. I think this is only true for the simple case of having a swapped
> > > > > out page on a disk swapfile or in a zswap (ghost) swapfile.
> > > >
> > > > Please redo your evaluation after reading the above "continuous mTHP alloctor".
> > >
> > > I did, and if anything I am more convinced that the designs are
> > > conceptually close. The main difference is that the virtual swap
> > > approach is more flexible in my opinion because the backend doesn't have
> > > to be a swapfile, and we don't need "ghost" to use zswap and manage it
> > > like a swapfile.
> >
> > It seems the design has converged to the swap table side. Even the
> > "virtual swapfile" concept could have come from the swap table side.
> > I'm flattered, copying is the best compliment from the competitor.
> >
> > Now we settle on the big design, the rest of the design difference is
> > very small.
>
> No, the design hasn't settled or converged on any "side". I am also not
> going to respond to the rest of this email, and potentially other
> emails. You keep twisting my words, making delusional claims, and
> proving how difficult it is to have a technical conversation with you.
>
> You kept mentioning that you want to keep the conversation on the
> technical side, but when I tried to have a technical disucssion you
> quickly drove it away from that.  Half of your email is basically
> "everyone is trying to steal my cool ideas".
>
> I tried salvaging the discussion but this is hopeless.
>

Hi, all, I hope people don't mind me adding a few words here.

I think the key thing is Chris wants things to be done in an optimized
way. He welcomes others to collaborate, as long as it's properly
credited.

Upstream development is tiring and there are conflicts in tech detail
and ideas, making it hard to track who is more credited for one
implementation. But he has been super helpful as the behind the scene
hero for swap tables:

Back when I was unfamiliar with swap and sent the long series to
optimized it in a different direction two years ago:
https://lore.kernel.org/linux-mm/20231119194740.94101-1-ryncsn@gmail.com/ [1]
https://lore.kernel.org/linux-mm/20240326185032.72159-1-ryncsn@gmail.com/ [2]

Chris literally reviewed every patch of the first series super
carefully despite me being a beginner. And for the later series, he
pointed out that's not an optimal direction at all, and shared what he
thinks is the right direction to refractor swap systematically with me
off-list. Then we collabed to implement the swap allocator. That's
also the key prerequisite of the swap table.

For the swap table series, I already posted a completed series at May
this year that have implemented things basically covers until phase 3
(almost half year ago):
https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/
[3]. And shared a workish branch that covers until phase 5 seeking for
collaboration later multiple times.

Despite swap table is already performing well and stable, and I was
also providing the info on how we can solve the VS issue (And the
redirection entry layer idea was completely introduced by Chris), the
feedback or review is stuck. And you can see VS is also stuck with
performance issues at that time.

I was in a rush and struggling with managing that long series, getting
it merged or reviewed, to enable next step developments. But lacking
upstream positive feedback or progress is really disencouraging, and I
hesitate to implement the later parts and even thought about giving
up. Again Chris helped to organize and rework a large proportion of
that series, so we are making real progress, and finally got phase I
merged, and phase II ready to be merged.

I thought the best approach is, having a clean basement for everyone
so we can compare the end result, without any history burdens, then we
can discuss further developments. And we are on track of that. And
IIRC, VS was also struggling with things like the direct swapin, slot
cache and other existing workarounds or the fuzzy API of swap, which
are all removed or solved by the swap table series.

We are all busy and may be unaware of others' work or history. (e.g.
Yosry once pointed out I ignored his previous work, and I apologized
for that [4]). It's understandable to me that misunderstandings and
implicit interests exist. And if you look closely at [1] and [2] and a
few other later series around swap cache, it's also getting very close
to the idea of unifying the swap routine to then have a common
metadata despite I having no idea of other's work and it is in a
different direction, [2] has already done removing the direct swapin
and use swap cache as the unified layer, [1] in 2023 is having similar
vibe, you can still find same ideas or even codes in the pending swap
table patch). But without the cluster idea and a prototype patch from
Chris, it would end up catastrophically in upstream. He shared the
idea proactively and helped to make later work possible, and so we
co-authored on many larter patches.

Link: https://lore.kernel.org/all/CAMgjq7DHFYWhm+Z0C5tR2U2a-N_mtmgB4+idD2S+-1438u-wWw@mail.gmail.com/
[4]

What I mean is from what I've seen, Chris has been open and friendly
and I never see him lack the spirit of sharing the ideas or
collaboration on that. As for the current tech issue, we are
definitely on track making major break though, let's just focus on
improving the swap and make progress :)

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Kairui Song 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.

Thanks for sharing this, I've been hearing about the ghost swapfile
design for a long time, glad to see it finally got posted.

>
> Signed-off-by: Chris Li <chrisl@kernel.org>
> ---
>  include/linux/swap.h |  2 ++
>  mm/page_io.c         | 18 +++++++++++++++---
>  mm/swap.h            |  2 +-
>  mm/swap_state.c      |  7 +++++++
>  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
>  mm/zswap.c           | 17 +++++++++++------
>  6 files changed, 73 insertions(+), 15 deletions(-)

In general I think this aligns quite well with what I had in mind and
an idea that was mention during LSFMM this year (the 3rd one in the
"Issues" part, it wasn't clearly described in the cover letter, more
details in the slides):
https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/

The good part is that we will reuse everything we have with the
current swap stack, and stay optional. Everything is a swap device, no
special layers required. All other features will be available in a
cleaner way.

And /etc/fstab just works the same way for the ghost swapfile.

Looking forward to see this RFC get more updates.

>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
>         SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
>         SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
>         SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> +       SWP_GHOST       = (1 << 13),    /* not backed by anything */
>                                         /* add others here before... */
>  };
>
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
>  void free_pages_and_swap_cache(struct encoded_page **, int);
>  /* linux/mm/swapfile.c */
>  extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
>  extern long total_swap_pages;
>  extern atomic_t nr_rotate_swap;
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
>                 return AOP_WRITEPAGE_ACTIVATE;
>         }
>
> -       __swap_writepage(folio, swap_plug);
> -       return 0;
> +       return __swap_writepage(folio, swap_plug);
>  out_unlock:
>         folio_unlock(folio);
>         return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
>         submit_bio(bio);
>  }
>
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  {
>         struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>
>         VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> +       if (sis->flags & SWP_GHOST) {
> +               /* Prevent the page from getting reclaimed. */
> +               folio_set_dirty(folio);
> +               return AOP_WRITEPAGE_ACTIVATE;
> +       }
> +
>         /*
>          * ->flags can be updated non-atomicially (scan_swap_map_slots),
>          * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>                 swap_writepage_bdev_sync(folio, sis);
>         else
>                 swap_writepage_bdev_async(folio, sis);
> +       return 0;
>  }
>
>  void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>         if (zswap_load(folio) != -ENOENT)
>                 goto finish;
>
> +       if (unlikely(sis->flags & SWP_GHOST)) {
> +               folio_unlock(folio);
> +               goto finish;
> +       }
> +
>         /* We have to read from slower devices. Increase zswap protection. */
>         zswap_folio_swapin(folio);
>
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  }
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>
>  /* linux/mm/swap_state.c */
>  extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>         struct swap_iocb *splug = NULL;
>         bool page_allocated;
>
> +       /*
> +        * The entry may have been freed by another task. Avoid swap_info_get()
> +        * which will print error message if the race happens.
> +        */
> +       if (si->flags & SWP_GHOST)
> +               goto skip;
> +
>         mask = swapin_nr_pages(offset) - 1;
>         if (!mask)
>                 goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
>  atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
>  /*
>   * Some modules use swappable objects and may try to swap them out under
>   * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>                         goto skip;
>         }
>
> +       if (!(si->flags & SWP_GHOST))
> +               atomic_sub(1, &nr_real_swapfiles);
>         plist_del(&si->avail_list, &swap_avail_head);
>
>  skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>         }
>
>         plist_add(&si->avail_list, &swap_avail_head);
> +       if (!(si->flags & SWP_GHOST))
> +               atomic_add(1, &nr_real_swapfiles);
>
>  skip:
>         spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
>         struct inode *inode = mapping->host;
>         int ret;
>
> +       if (sis->flags & SWP_GHOST) {
> +               *span = 0;
> +               return 0;
> +       }
> +
>         if (S_ISBLK(inode->i_mode)) {
>                 ret = add_swap_extent(sis, 0, sis->max, 0);
>                 *span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>         if (p->flags & SWP_CONTINUED)
>                 free_swap_count_continuations(p);
>
> -       if (!p->bdev || !bdev_nonrot(p->bdev))
> +       if (!(p->flags & SWP_GHOST) &&
> +           (!p->bdev || !bdev_nonrot(p->bdev)))
>                 atomic_dec(&nr_rotate_swap);
>
>         mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
>         mutex_unlock(&swapon_mutex);
>  }
>
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> +       struct file *file = si->swap_file;
> +
> +       if (si->flags & SWP_GHOST)
> +               return "ghost\t";
> +
> +       if (S_ISBLK(file_inode(file)->i_mode))
> +               return "partition";
> +
> +       return "file\t";
> +}
> +
>  static int swap_show(struct seq_file *swap, void *v)
>  {
>         struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
>         len = seq_file_path(swap, file, " \t\n\\");
>         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
>                         len < 40 ? 40 - len : 1, " ",
> -                       S_ISBLK(file_inode(file)->i_mode) ?
> -                               "partition" : "file\t",
> +                       swap_type_str(si),
>                         bytes, bytes < 10000000 ? "\t" : "",
>                         inuse, inuse < 10000000 ? "\t" : "",
>                         si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
>         return 0;
>  }
>
> -
>  /*
>   * Find out how many pages are allowed for a single swap device. There
>   * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>         unsigned long maxpages;
>         unsigned long swapfilepages;
>         unsigned long last_page;
> +       loff_t size;
>
>         if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
>                 pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
>         if (!maxpages)
>                 return 0;
> -       swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> +       size = i_size_read(inode);
> +       if (size == PAGE_SIZE) {
> +               /* Ghost swapfile */
> +               si->bdev = NULL;
> +               si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> +               return maxpages;
> +       }

Here if we push things further, it might be a good idea to make better
use of the swap file header for detecting this kind of device, and
maybe add support for other info too. The header already has version
info embedded in case it will be extended.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Barry Song 2 months, 1 week ago

On Sat, Nov 22, 2025 at 6:00 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
>
> Thanks for sharing this, I've been hearing about the ghost swapfile
> design for a long time, glad to see it finally got posted.
>
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> >  include/linux/swap.h |  2 ++
> >  mm/page_io.c         | 18 +++++++++++++++---
> >  mm/swap.h            |  2 +-
> >  mm/swap_state.c      |  7 +++++++
> >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> >  mm/zswap.c           | 17 +++++++++++------
> >  6 files changed, 73 insertions(+), 15 deletions(-)
>
> In general I think this aligns quite well with what I had in mind and
> an idea that was mention during LSFMM this year (the 3rd one in the
> "Issues" part, it wasn't clearly described in the cover letter, more
> details in the slides):
> https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
>
> The good part is that we will reuse everything we have with the
> current swap stack, and stay optional. Everything is a swap device, no
> special layers required. All other features will be available in a
> cleaner way.
>
> And /etc/fstab just works the same way for the ghost swapfile.

Apologies — let me raise a question that may be annoying.
I understand that people may already be feeling tense and sensitive.

Despite the benefit of compatibility with /etc/fstab, we still need to provide
a physical file on disk (or elsewhere), even if it contains only a header.
Personally, this feels a bit odd to me. Is it possible to avoid having a
“ghost” swap file altogether and instead implement all "ghost" functionality
entirely within the kernel? Ideally, we wouldn’t need to introduce a new
“ghost” concept to users at all.

In short, we provide the functionality of a ghost swap file without actually
having any file or “ghost” at all.

Thanks
Barry

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Baoquan He 2 months, 1 week ago

On 12/02/25 at 10:56am, Barry Song wrote:
> On Sat, Nov 22, 2025 at 6:00 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> >
> > Thanks for sharing this, I've been hearing about the ghost swapfile
> > design for a long time, glad to see it finally got posted.
> >
> > >
> > > Signed-off-by: Chris Li <chrisl@kernel.org>
> > > ---
> > >  include/linux/swap.h |  2 ++
> > >  mm/page_io.c         | 18 +++++++++++++++---
> > >  mm/swap.h            |  2 +-
> > >  mm/swap_state.c      |  7 +++++++
> > >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> > >  mm/zswap.c           | 17 +++++++++++------
> > >  6 files changed, 73 insertions(+), 15 deletions(-)
> >
> > In general I think this aligns quite well with what I had in mind and
> > an idea that was mention during LSFMM this year (the 3rd one in the
> > "Issues" part, it wasn't clearly described in the cover letter, more
> > details in the slides):
> > https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
> >
> > The good part is that we will reuse everything we have with the
> > current swap stack, and stay optional. Everything is a swap device, no
> > special layers required. All other features will be available in a
> > cleaner way.
> >
> > And /etc/fstab just works the same way for the ghost swapfile.
> 
> Apologies — let me raise a question that may be annoying.
> I understand that people may already be feeling tense and sensitive.
> 
> Despite the benefit of compatibility with /etc/fstab, we still need to provide
> a physical file on disk (or elsewhere), even if it contains only a header.
> Personally, this feels a bit odd to me. Is it possible to avoid having a
> “ghost” swap file altogether and instead implement all "ghost" functionality
> entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> “ghost” concept to users at all.
> 
> In short, we provide the functionality of a ghost swap file without actually
> having any file or “ghost” at all.

That's actually what I would like to see. Just to make that we may need
change syscall swapon, to specify the flag to mark it and initial size.
People may complain about adjustment in syscall swapon.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 1 week ago

On Mon, Dec 1, 2025 at 10:32 PM Baoquan He <bhe@redhat.com> wrote:
>
> On 12/02/25 at 10:56am, Barry Song wrote:
> > On Sat, Nov 22, 2025 at 6:00 PM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > The zswap write back has been disabled if all swapfiles in the system
> > > > are ghost swap files.
> > >
> > > Thanks for sharing this, I've been hearing about the ghost swapfile
> > > design for a long time, glad to see it finally got posted.
> > >
> > > >
> > > > Signed-off-by: Chris Li <chrisl@kernel.org>
> > > > ---
> > > >  include/linux/swap.h |  2 ++
> > > >  mm/page_io.c         | 18 +++++++++++++++---
> > > >  mm/swap.h            |  2 +-
> > > >  mm/swap_state.c      |  7 +++++++
> > > >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> > > >  mm/zswap.c           | 17 +++++++++++------
> > > >  6 files changed, 73 insertions(+), 15 deletions(-)
> > >
> > > In general I think this aligns quite well with what I had in mind and
> > > an idea that was mention during LSFMM this year (the 3rd one in the
> > > "Issues" part, it wasn't clearly described in the cover letter, more
> > > details in the slides):
> > > https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
> > >
> > > The good part is that we will reuse everything we have with the
> > > current swap stack, and stay optional. Everything is a swap device, no
> > > special layers required. All other features will be available in a
> > > cleaner way.
> > >
> > > And /etc/fstab just works the same way for the ghost swapfile.
> >
> > Apologies — let me raise a question that may be annoying.
> > I understand that people may already be feeling tense and sensitive.
> >
> > Despite the benefit of compatibility with /etc/fstab, we still need to provide
> > a physical file on disk (or elsewhere), even if it contains only a header.
> > Personally, this feels a bit odd to me. Is it possible to avoid having a
> > “ghost” swap file altogether and instead implement all "ghost" functionality
> > entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> > “ghost” concept to users at all.
> >
> > In short, we provide the functionality of a ghost swap file without actually
> > having any file or “ghost” at all.
>
> That's actually what I would like to see. Just to make that we may need
> change syscall swapon, to specify the flag to mark it and initial size.
> People may complain about adjustment in syscall swapon.

Yeah that's another design goal with virtual swap - minimizing the
operational overhead.

With my design/RFC, all you need to do is:

1. Enable zswap at the host level (/sys/module/zswap/parameters/enabled).

2. Enable zswap at the cgroup level, through memory.zswap.max (you can
also size per-cgroup zswap limit here, if you so choose).

and it *just works*. Out of the box. No need to create a new swapfile,
/etc/fstab, etc.

If you're unsure about your workload's actual zswap usage, you can
keep it unlimited too - it will just grows and shrinks with memory
usage dynamics.

One design for every host type and workload characteristics
(workingset, memory access patterns, memory compressibility).

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Tue, Dec 2, 2025 at 9:53 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > Apologies — let me raise a question that may be annoying.
> > > I understand that people may already be feeling tense and sensitive.
> > >
> > > Despite the benefit of compatibility with /etc/fstab, we still need to provide
> > > a physical file on disk (or elsewhere), even if it contains only a header.
> > > Personally, this feels a bit odd to me. Is it possible to avoid having a
> > > “ghost” swap file altogether and instead implement all "ghost" functionality
> > > entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> > > “ghost” concept to users at all.
> > >
> > > In short, we provide the functionality of a ghost swap file without actually
> > > having any file or “ghost” at all.
> >
> > That's actually what I would like to see. Just to make that we may need
> > change syscall swapon, to specify the flag to mark it and initial size.
> > People may complain about adjustment in syscall swapon.
>
> Yeah that's another design goal with virtual swap - minimizing the
> operational overhead.
>
> With my design/RFC, all you need to do is:
>
> 1. Enable zswap at the host level (/sys/module/zswap/parameters/enabled).
>
> 2. Enable zswap at the cgroup level, through memory.zswap.max (you can
> also size per-cgroup zswap limit here, if you so choose).

From the kernel point of view, managing swap entry without swapfile
poses some challenges.
1) how does the swap_full() and swap cache reclaim work in your world?
Will you create more holes not filling and fragments?
2) Do you internally have only one si->lock? You will  not able to
take advantage of the swap device round robin behavior.

> and it *just works*. Out of the box. No need to create a new swapfile,
That is a user space thing, existing user space tools.
> /etc/fstab, etc.

Able to continue using /etc/fstab is a good thing. Now you are forcing
distros to insert swap on for zswap which do the above init sequence.
It puts more burden on the distro.
That is not the main reason I did not go this route. Mostly I want the
patch to be simple and easy to review. Keep it simple. I see virtual
devices have drawbacks on si->locks and other user space changes
required.

> If you're unsure about your workload's actual zswap usage, you can
> keep it unlimited too - it will just grows and shrinks with memory
> usage dynamics.

How do you cap your swap cache in that case? I feel a lot of
discussion is very hand waving.
Having a landable patch will get more of my attention.

Chris

>
> One design for every host type and workload characteristics
> (workingset, memory access patterns, memory compressibility).

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Baoquan He 2 months, 2 weeks ago

Add YoungJun to CC.

On 11/22/25 at 05:59pm, Kairui Song wrote:
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> 
> Thanks for sharing this, I've been hearing about the ghost swapfile
> design for a long time, glad to see it finally got posted.
> 
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> >  include/linux/swap.h |  2 ++
> >  mm/page_io.c         | 18 +++++++++++++++---
> >  mm/swap.h            |  2 +-
> >  mm/swap_state.c      |  7 +++++++
> >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> >  mm/zswap.c           | 17 +++++++++++------
> >  6 files changed, 73 insertions(+), 15 deletions(-)
> 
> In general I think this aligns quite well with what I had in mind and
> an idea that was mention during LSFMM this year (the 3rd one in the
> "Issues" part, it wasn't clearly described in the cover letter, more
> details in the slides):
> https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/

Thanks for sharing the background and more information. When I checked
Youngjun's swap.tiers patchset before his RFC, felt it would be more
flexible to add zswap to memcg if zswap size can be decoupled from the
back device. Chris's RFC can satisfy that, but I didn't thought you
guys had planned more, e.g dynamic growth of swap size, and the zswap slot
management being like swap table on swap slot. Looking forward to seeing
the progress and more details.

Thanks
Baoquan

> 
> The good part is that we will reuse everything we have with the
> current swap stack, and stay optional. Everything is a swap device, no
> special layers required. All other features will be available in a
> cleaner way.
> 
> And /etc/fstab just works the same way for the ghost swapfile.
> 
> Looking forward to see this RFC get more updates.
> 
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> >         SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
> >         SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
> >         SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > +       SWP_GHOST       = (1 << 13),    /* not backed by anything */
> >                                         /* add others here before... */
> >  };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> >  void free_pages_and_swap_cache(struct encoded_page **, int);
> >  /* linux/mm/swapfile.c */
> >  extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> >  extern long total_swap_pages;
> >  extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >                 return AOP_WRITEPAGE_ACTIVATE;
> >         }
> >
> > -       __swap_writepage(folio, swap_plug);
> > -       return 0;
> > +       return __swap_writepage(folio, swap_plug);
> >  out_unlock:
> >         folio_unlock(folio);
> >         return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> >         submit_bio(bio);
> >  }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >  {
> >         struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> >         VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > +       if (sis->flags & SWP_GHOST) {
> > +               /* Prevent the page from getting reclaimed. */
> > +               folio_set_dirty(folio);
> > +               return AOP_WRITEPAGE_ACTIVATE;
> > +       }
> > +
> >         /*
> >          * ->flags can be updated non-atomicially (scan_swap_map_slots),
> >          * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >                 swap_writepage_bdev_sync(folio, sis);
> >         else
> >                 swap_writepage_bdev_async(folio, sis);
> > +       return 0;
> >  }
> >
> >  void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >         if (zswap_load(folio) != -ENOENT)
> >                 goto finish;
> >
> > +       if (unlikely(sis->flags & SWP_GHOST)) {
> > +               folio_unlock(folio);
> > +               goto finish;
> > +       }
> > +
> >         /* We have to read from slower devices. Increase zswap protection. */
> >         zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >  }
> >  void swap_write_unplug(struct swap_iocb *sio);
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> >  /* linux/mm/swap_state.c */
> >  extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >         struct swap_iocb *splug = NULL;
> >         bool page_allocated;
> >
> > +       /*
> > +        * The entry may have been freed by another task. Avoid swap_info_get()
> > +        * which will print error message if the race happens.
> > +        */
> > +       if (si->flags & SWP_GHOST)
> > +               goto skip;
> > +
> >         mask = swapin_nr_pages(offset) - 1;
> >         if (!mask)
> >                 goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> >  atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> >  /*
> >   * Some modules use swappable objects and may try to swap them out under
> >   * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >                         goto skip;
> >         }
> >
> > +       if (!(si->flags & SWP_GHOST))
> > +               atomic_sub(1, &nr_real_swapfiles);
> >         plist_del(&si->avail_list, &swap_avail_head);
> >
> >  skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >         }
> >
> >         plist_add(&si->avail_list, &swap_avail_head);
> > +       if (!(si->flags & SWP_GHOST))
> > +               atomic_add(1, &nr_real_swapfiles);
> >
> >  skip:
> >         spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> >         struct inode *inode = mapping->host;
> >         int ret;
> >
> > +       if (sis->flags & SWP_GHOST) {
> > +               *span = 0;
> > +               return 0;
> > +       }
> > +
> >         if (S_ISBLK(inode->i_mode)) {
> >                 ret = add_swap_extent(sis, 0, sis->max, 0);
> >                 *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> >         if (p->flags & SWP_CONTINUED)
> >                 free_swap_count_continuations(p);
> >
> > -       if (!p->bdev || !bdev_nonrot(p->bdev))
> > +       if (!(p->flags & SWP_GHOST) &&
> > +           (!p->bdev || !bdev_nonrot(p->bdev)))
> >                 atomic_dec(&nr_rotate_swap);
> >
> >         mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> >         mutex_unlock(&swapon_mutex);
> >  }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > +       struct file *file = si->swap_file;
> > +
> > +       if (si->flags & SWP_GHOST)
> > +               return "ghost\t";
> > +
> > +       if (S_ISBLK(file_inode(file)->i_mode))
> > +               return "partition";
> > +
> > +       return "file\t";
> > +}
> > +
> >  static int swap_show(struct seq_file *swap, void *v)
> >  {
> >         struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> >         len = seq_file_path(swap, file, " \t\n\\");
> >         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> >                         len < 40 ? 40 - len : 1, " ",
> > -                       S_ISBLK(file_inode(file)->i_mode) ?
> > -                               "partition" : "file\t",
> > +                       swap_type_str(si),
> >                         bytes, bytes < 10000000 ? "\t" : "",
> >                         inuse, inuse < 10000000 ? "\t" : "",
> >                         si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> >         return 0;
> >  }
> >
> > -
> >  /*
> >   * Find out how many pages are allowed for a single swap device. There
> >   * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >         unsigned long maxpages;
> >         unsigned long swapfilepages;
> >         unsigned long last_page;
> > +       loff_t size;
> >
> >         if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> >                 pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> >         if (!maxpages)
> >                 return 0;
> > -       swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > +       size = i_size_read(inode);
> > +       if (size == PAGE_SIZE) {
> > +               /* Ghost swapfile */
> > +               si->bdev = NULL;
> > +               si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > +               return maxpages;
> > +       }
> 
> Here if we push things further, it might be a good idea to make better
> use of the swap file header for detecting this kind of device, and
> maybe add support for other info too. The header already has version
> info embedded in case it will be extended.
>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Yosry Ahmed 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
> 
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
> 
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.
> 
> Signed-off-by: Chris Li <chrisl@kernel.org>

This was brought up before, I think it's not the right way to go
upstream. Even if it's good for the short-term, it's a behavior exposed
to userspace that we'll have to maintain. With the ongoing work to
decouple zswap and swap backends, this will end up being something we
have to workaround indefinitely to keep the same userspace semantics.

> ---
>  include/linux/swap.h |  2 ++
>  mm/page_io.c         | 18 +++++++++++++++---
>  mm/swap.h            |  2 +-
>  mm/swap_state.c      |  7 +++++++
>  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
>  mm/zswap.c           | 17 +++++++++++------
>  6 files changed, 73 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
> +	SWP_GHOST	= (1 << 13),	/* not backed by anything */
>  					/* add others here before... */
>  };
>  
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
>  void free_pages_and_swap_cache(struct encoded_page **, int);
>  /* linux/mm/swapfile.c */
>  extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
>  extern long total_swap_pages;
>  extern atomic_t nr_rotate_swap;
>  
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
>  		return AOP_WRITEPAGE_ACTIVATE;
>  	}
>  
> -	__swap_writepage(folio, swap_plug);
> -	return 0;
> +	return __swap_writepage(folio, swap_plug);
>  out_unlock:
>  	folio_unlock(folio);
>  	return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
>  	submit_bio(bio);
>  }
>  
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  {
>  	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>  
>  	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> +	if (sis->flags & SWP_GHOST) {
> +		/* Prevent the page from getting reclaimed. */
> +		folio_set_dirty(folio);
> +		return AOP_WRITEPAGE_ACTIVATE;
> +	}
> +
>  	/*
>  	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
>  	 * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  		swap_writepage_bdev_sync(folio, sis);
>  	else
>  		swap_writepage_bdev_async(folio, sis);
> +	return 0;
>  }
>  
>  void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>  	if (zswap_load(folio) != -ENOENT)
>  		goto finish;
>  
> +	if (unlikely(sis->flags & SWP_GHOST)) {
> +		folio_unlock(folio);
> +		goto finish;
> +	}
> +
>  	/* We have to read from slower devices. Increase zswap protection. */
>  	zswap_folio_swapin(folio);
>  
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  }
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>  
>  /* linux/mm/swap_state.c */
>  extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  	struct swap_iocb *splug = NULL;
>  	bool page_allocated;
>  
> +	/*
> +	 * The entry may have been freed by another task. Avoid swap_info_get()
> +	 * which will print error message if the race happens.
> +	 */
> +	if (si->flags & SWP_GHOST)
> +		goto skip;
> +
>  	mask = swapin_nr_pages(offset) - 1;
>  	if (!mask)
>  		goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
>  atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
>  /*
>   * Some modules use swappable objects and may try to swap them out under
>   * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>  			goto skip;
>  	}
>  
> +	if (!(si->flags & SWP_GHOST))
> +		atomic_sub(1, &nr_real_swapfiles);
>  	plist_del(&si->avail_list, &swap_avail_head);
>  
>  skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>  	}
>  
>  	plist_add(&si->avail_list, &swap_avail_head);
> +	if (!(si->flags & SWP_GHOST))
> +		atomic_add(1, &nr_real_swapfiles);
>  
>  skip:
>  	spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
>  	struct inode *inode = mapping->host;
>  	int ret;
>  
> +	if (sis->flags & SWP_GHOST) {
> +		*span = 0;
> +		return 0;
> +	}
> +
>  	if (S_ISBLK(inode->i_mode)) {
>  		ret = add_swap_extent(sis, 0, sis->max, 0);
>  		*span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	if (p->flags & SWP_CONTINUED)
>  		free_swap_count_continuations(p);
>  
> -	if (!p->bdev || !bdev_nonrot(p->bdev))
> +	if (!(p->flags & SWP_GHOST) &&
> +	    (!p->bdev || !bdev_nonrot(p->bdev)))
>  		atomic_dec(&nr_rotate_swap);
>  
>  	mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
>  	mutex_unlock(&swapon_mutex);
>  }
>  
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> +	struct file *file = si->swap_file;
> +
> +	if (si->flags & SWP_GHOST)
> +		return "ghost\t";
> +
> +	if (S_ISBLK(file_inode(file)->i_mode))
> +		return "partition";
> +
> +	return "file\t";
> +}
> +
>  static int swap_show(struct seq_file *swap, void *v)
>  {
>  	struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
>  	len = seq_file_path(swap, file, " \t\n\\");
>  	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
>  			len < 40 ? 40 - len : 1, " ",
> -			S_ISBLK(file_inode(file)->i_mode) ?
> -				"partition" : "file\t",
> +			swap_type_str(si),
>  			bytes, bytes < 10000000 ? "\t" : "",
>  			inuse, inuse < 10000000 ? "\t" : "",
>  			si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
>  	return 0;
>  }
>  
> -
>  /*
>   * Find out how many pages are allowed for a single swap device. There
>   * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>  	unsigned long maxpages;
>  	unsigned long swapfilepages;
>  	unsigned long last_page;
> +	loff_t size;
>  
>  	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
>  		pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>  
>  	if (!maxpages)
>  		return 0;
> -	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> +	size = i_size_read(inode);
> +	if (size == PAGE_SIZE) {
> +		/* Ghost swapfile */
> +		si->bdev = NULL;
> +		si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> +		return maxpages;
> +	}
> +
> +	swapfilepages = size >> PAGE_SHIFT;
>  	if (swapfilepages && maxpages > swapfilepages) {
>  		pr_warn("Swap area shorter than signature indicates\n");
>  		return 0;
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	struct folio *folio;
>  	struct mempolicy *mpol;
>  	bool folio_was_allocated;
> -	struct swap_info_struct *si;
> +	struct swap_info_struct *si = get_swap_device(swpentry);
>  	int ret = 0;
>  
> -	/* try to allocate swap cache folio */
> -	si = get_swap_device(swpentry);
>  	if (!si)
> -		return -EEXIST;
> +		return -ENOENT;
> +
> +	if (si->flags & SWP_GHOST) {
> +		put_swap_device(si);
> +		return -EINVAL;
> +	}
>  
> +	/* try to allocate swap cache folio */
>  	mpol = get_task_policy(current);
>  	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
>  			NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	folio_set_reclaim(folio);
>  
>  	/* start writeback */
> -	__swap_writepage(folio, NULL);
> +	ret = __swap_writepage(folio, NULL);
> +	WARN_ON_ONCE(ret);
>  
>  out:
>  	if (ret && ret != -EEXIST) {
> @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
>  	zswap_pool_put(pool);
>  put_objcg:
>  	obj_cgroup_put(objcg);
> -	if (!ret && zswap_pool_reached_full)
> +	if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
>  		queue_work(shrink_wq, &zswap_shrink_work);
>  check_old:
>  	/*
> 
> ---
> base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> change-id: 20251121-ghost-56e3948a7a17
> 
> Best regards,
> -- 
> Chris Li <chrisl@kernel.org>
>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> This was brought up before, I think it's not the right way to go
> upstream. Even if it's good for the short-term, it's a behavior exposed
> to userspace that we'll have to maintain. With the ongoing work to
> decouple zswap and swap backends, this will end up being something we
> have to workaround indefinitely to keep the same userspace semantics.

Actually, this doesn't need to be the short term solution. It can be
long term. I get  it your zswap maintainers do not want to get
involved in the ghost swapfile. I will leave you guys alone. Remember
2023 LPC swap abstraction talk, the community picked my approach to
the VFS swap ops over the swap abstraction which the swap
virtualization is based on. I take some time to come up with the
cluster based swap allocator and swap table to clean up and speed up
the swap stack. Now I am finally able to circle back and fulfill my
promise of the VFS swap ops. Have a little faith I will solve this
swap entry redirection issue nicely for you, better than the swap
virtualization approach can.

Chris

>
> > ---
> >  include/linux/swap.h |  2 ++
> >  mm/page_io.c         | 18 +++++++++++++++---
> >  mm/swap.h            |  2 +-
> >  mm/swap_state.c      |  7 +++++++
> >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> >  mm/zswap.c           | 17 +++++++++++------
> >  6 files changed, 73 insertions(+), 15 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> >       SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
> >       SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
> >       SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > +     SWP_GHOST       = (1 << 13),    /* not backed by anything */
> >                                       /* add others here before... */
> >  };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> >  void free_pages_and_swap_cache(struct encoded_page **, int);
> >  /* linux/mm/swapfile.c */
> >  extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> >  extern long total_swap_pages;
> >  extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >               return AOP_WRITEPAGE_ACTIVATE;
> >       }
> >
> > -     __swap_writepage(folio, swap_plug);
> > -     return 0;
> > +     return __swap_writepage(folio, swap_plug);
> >  out_unlock:
> >       folio_unlock(folio);
> >       return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> >       submit_bio(bio);
> >  }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >  {
> >       struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> >       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > +     if (sis->flags & SWP_GHOST) {
> > +             /* Prevent the page from getting reclaimed. */
> > +             folio_set_dirty(folio);
> > +             return AOP_WRITEPAGE_ACTIVATE;
> > +     }
> > +
> >       /*
> >        * ->flags can be updated non-atomicially (scan_swap_map_slots),
> >        * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >               swap_writepage_bdev_sync(folio, sis);
> >       else
> >               swap_writepage_bdev_async(folio, sis);
> > +     return 0;
> >  }
> >
> >  void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >       if (zswap_load(folio) != -ENOENT)
> >               goto finish;
> >
> > +     if (unlikely(sis->flags & SWP_GHOST)) {
> > +             folio_unlock(folio);
> > +             goto finish;
> > +     }
> > +
> >       /* We have to read from slower devices. Increase zswap protection. */
> >       zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >  }
> >  void swap_write_unplug(struct swap_iocb *sio);
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> >  /* linux/mm/swap_state.c */
> >  extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >       struct swap_iocb *splug = NULL;
> >       bool page_allocated;
> >
> > +     /*
> > +      * The entry may have been freed by another task. Avoid swap_info_get()
> > +      * which will print error message if the race happens.
> > +      */
> > +     if (si->flags & SWP_GHOST)
> > +             goto skip;
> > +
> >       mask = swapin_nr_pages(offset) - 1;
> >       if (!mask)
> >               goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> >  atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> >  /*
> >   * Some modules use swappable objects and may try to swap them out under
> >   * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >                       goto skip;
> >       }
> >
> > +     if (!(si->flags & SWP_GHOST))
> > +             atomic_sub(1, &nr_real_swapfiles);
> >       plist_del(&si->avail_list, &swap_avail_head);
> >
> >  skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >       }
> >
> >       plist_add(&si->avail_list, &swap_avail_head);
> > +     if (!(si->flags & SWP_GHOST))
> > +             atomic_add(1, &nr_real_swapfiles);
> >
> >  skip:
> >       spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> >       struct inode *inode = mapping->host;
> >       int ret;
> >
> > +     if (sis->flags & SWP_GHOST) {
> > +             *span = 0;
> > +             return 0;
> > +     }
> > +
> >       if (S_ISBLK(inode->i_mode)) {
> >               ret = add_swap_extent(sis, 0, sis->max, 0);
> >               *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> >       if (p->flags & SWP_CONTINUED)
> >               free_swap_count_continuations(p);
> >
> > -     if (!p->bdev || !bdev_nonrot(p->bdev))
> > +     if (!(p->flags & SWP_GHOST) &&
> > +         (!p->bdev || !bdev_nonrot(p->bdev)))
> >               atomic_dec(&nr_rotate_swap);
> >
> >       mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> >       mutex_unlock(&swapon_mutex);
> >  }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > +     struct file *file = si->swap_file;
> > +
> > +     if (si->flags & SWP_GHOST)
> > +             return "ghost\t";
> > +
> > +     if (S_ISBLK(file_inode(file)->i_mode))
> > +             return "partition";
> > +
> > +     return "file\t";
> > +}
> > +
> >  static int swap_show(struct seq_file *swap, void *v)
> >  {
> >       struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> >       len = seq_file_path(swap, file, " \t\n\\");
> >       seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> >                       len < 40 ? 40 - len : 1, " ",
> > -                     S_ISBLK(file_inode(file)->i_mode) ?
> > -                             "partition" : "file\t",
> > +                     swap_type_str(si),
> >                       bytes, bytes < 10000000 ? "\t" : "",
> >                       inuse, inuse < 10000000 ? "\t" : "",
> >                       si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> >       return 0;
> >  }
> >
> > -
> >  /*
> >   * Find out how many pages are allowed for a single swap device. There
> >   * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >       unsigned long maxpages;
> >       unsigned long swapfilepages;
> >       unsigned long last_page;
> > +     loff_t size;
> >
> >       if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> >               pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> >       if (!maxpages)
> >               return 0;
> > -     swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > +     size = i_size_read(inode);
> > +     if (size == PAGE_SIZE) {
> > +             /* Ghost swapfile */
> > +             si->bdev = NULL;
> > +             si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > +             return maxpages;
> > +     }
> > +
> > +     swapfilepages = size >> PAGE_SHIFT;
> >       if (swapfilepages && maxpages > swapfilepages) {
> >               pr_warn("Swap area shorter than signature indicates\n");
> >               return 0;
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >       struct folio *folio;
> >       struct mempolicy *mpol;
> >       bool folio_was_allocated;
> > -     struct swap_info_struct *si;
> > +     struct swap_info_struct *si = get_swap_device(swpentry);
> >       int ret = 0;
> >
> > -     /* try to allocate swap cache folio */
> > -     si = get_swap_device(swpentry);
> >       if (!si)
> > -             return -EEXIST;
> > +             return -ENOENT;
> > +
> > +     if (si->flags & SWP_GHOST) {
> > +             put_swap_device(si);
> > +             return -EINVAL;
> > +     }
> >
> > +     /* try to allocate swap cache folio */
> >       mpol = get_task_policy(current);
> >       folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
> >                       NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> > @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >       folio_set_reclaim(folio);
> >
> >       /* start writeback */
> > -     __swap_writepage(folio, NULL);
> > +     ret = __swap_writepage(folio, NULL);
> > +     WARN_ON_ONCE(ret);
> >
> >  out:
> >       if (ret && ret != -EEXIST) {
> > @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
> >       zswap_pool_put(pool);
> >  put_objcg:
> >       obj_cgroup_put(objcg);
> > -     if (!ret && zswap_pool_reached_full)
> > +     if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
> >               queue_work(shrink_wq, &zswap_shrink_work);
> >  check_old:
> >       /*
> >
> > ---
> > base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> > change-id: 20251121-ghost-56e3948a7a17
> >
> > Best regards,
> > --
> > Chris Li <chrisl@kernel.org>
> >
>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> > >
> > > Signed-off-by: Chris Li <chrisl@kernel.org>
> >
> > This was brought up before, I think it's not the right way to go
> > upstream. Even if it's good for the short-term, it's a behavior exposed
> > to userspace that we'll have to maintain. With the ongoing work to
> > decouple zswap and swap backends, this will end up being something we
> > have to workaround indefinitely to keep the same userspace semantics.
>
> Actually, this doesn't need to be the short term solution. It can be
> long term. I get  it your zswap maintainers do not want to get
> involved in the ghost swapfile. I will leave you guys alone. Remember
> 2023 LPC swap abstraction talk, the community picked my approach to
> the VFS swap ops over the swap abstraction which the swap
> virtualization is based on. I take some time to come up with the
> cluster based swap allocator and swap table to clean up and speed up
> the swap stack. Now I am finally able to circle back and fulfill my
> promise of the VFS swap ops. Have a little faith I will solve this
> swap entry redirection issue nicely for you, better than the swap
> virtualization approach can.

Look man, I'm not married to any idea. If your VFS approach solve our
problems, I can move on to other projects :) We have lots of
swap/memory reclaim/MM problems to solve, both internally at Meta and
upstream.

But please explain how your VFS approach solved the 3 requirements I
mentioned in the other email, and more specifically the backend
transfer requirement.

I have explicitly asked about it in your submission for your 2024
LSFMMBPF talk - at that time I have not seriously started the swap
virtualization work, only at the design phase. You just handwaved it
away and never really explained to me how you can achieve backend
transfer with your design:

https://lore.kernel.org/all/CAF8kJuNFtejEtjQHg5UBGduvFNn3AaGn4ffyoOrEnXfHpx6Ubg@mail.gmail.com/

I understand that you had more pressing issues to fix at a time, so I
did not bring it up during the conference. But it's an imperative
requirement for us.

swap.tiers is nice for initial placement and for hierarchy
determination in general, but when the page is already placed on one
tier and needs to be transferred to the tier, how will you move it
from one tier to another?

What zram is doing right now, IIUC, is building the redirection
internally. I would like to try avoiding repeating that for zswap, and
for every other future backends, by pulling it out of backend internal
code and build a dedicated module for it. That is just swap
virtualization.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.

Zswap is primarily a compressed cache for real swap on secondary
storage. It's indeed quite important that entries currently in zswap
don't occupy disk slots; but for a solution to this to be acceptable,
it has to work with the primary usecase and support disk writeback.

This direction is a dead-end. Please take a look at Nhat's swap
virtualization patches. They decouple zswap from disk geometry, while
still supporting writeback to an actual backend file.

Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
>
> Zswap is primarily a compressed cache for real swap on secondary
> storage. It's indeed quite important that entries currently in zswap
> don't occupy disk slots; but for a solution to this to be acceptable,
> it has to work with the primary usecase and support disk writeback.

Well, my plan is to support the writeback via swap.tiers.

> This direction is a dead-end. Please take a look at Nhat's swap
> virtualization patches. They decouple zswap from disk geometry, while
> still supporting writeback to an actual backend file.

Yes, there are many ways to decouple zswap from disk geometry, my swap
table + swap.tiers design can do that as well. I have concerns about
swap virtualization in the aspect of adding another layer of memory
overhead addition per swap entry and CPU overhead of extra xarray
lookup. I believe my approach is technically superior and cleaner.
Both faster and cleaner. Basically swap.tiers + VFS like swap read
write page ops. I will let Nhat clarify the performance and memory
overhead side of the swap virtualization.

I am not against swap entry redirection. Just the swap virtualization
series needs to compare against the alternatives in terms of memory
overhead and throughput.
Solving it from the swap.tiers angle is cleaner.

> Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

I take that the only relevant part is you are zswap maintainer and I
am the swap maintainer. Fine. I got the message. I will leave the
zswap alone. I will find other ways to address the memory base swap
tiers in swap.tiers.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

Hi Johannes,

On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> I take that the only relevant part is you are zswap maintainer and I
> am the swap maintainer. Fine. I got the message. I will leave the
> zswap alone. I will find other ways to address the memory base swap
> tiers in swap.tiers.

I am sorry that I have said that. Let me take back what I said above.
I was upset when I considered you and others blocking the more optimal
solution and in favor of the less optimal solution. That is my short
temper, as usual.

Now I can see that you might not see one as more optimal than the
other as convincing as I do, or I haven't done a good job explaining
it.

Let me offer my sincere apology. I will reply to the technical aspect
of the question in other email.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 2 weeks ago

On Tue, Nov 25, 2025 at 10:14:40PM +0400, Chris Li wrote:
> Hi Johannes,
> 
> On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
> >
> > I take that the only relevant part is you are zswap maintainer and I
> > am the swap maintainer. Fine. I got the message. I will leave the
> > zswap alone. I will find other ways to address the memory base swap
> > tiers in swap.tiers.
> 
> I am sorry that I have said that. Let me take back what I said above.
> I was upset when I considered you and others blocking the more optimal
> solution and in favor of the less optimal solution. That is my short
> temper, as usual.
> 
> Now I can see that you might not see one as more optimal than the
> other as convincing as I do, or I haven't done a good job explaining
> it.
> 
> Let me offer my sincere apology. I will reply to the technical aspect
> of the question in other email.

Thanks Chris. No hard feelings.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
> 
> Well, my plan is to support the writeback via swap.tiers.

Do you have a link to that proposal?

My understanding of swap tiers was about grouping different swapfiles
and assigning them to cgroups. The issue with writeback is relocating
the data that a swp_entry_t page table refers to - without having to
find and update all the possible page tables. I'm not sure how
swap.tiers solve this problem.

> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
> 
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> table + swap.tiers design can do that as well. I have concerns about
> swap virtualization in the aspect of adding another layer of memory
> overhead addition per swap entry and CPU overhead of extra xarray
> lookup. I believe my approach is technically superior and cleaner.
> Both faster and cleaner. Basically swap.tiers + VFS like swap read
> write page ops. I will let Nhat clarify the performance and memory
> overhead side of the swap virtualization.

I'm happy to discuss it.

But keep in mind that the swap virtualization idea is a collaborative
product of quite a few people with an extensive combined upstream
record. Quite a bit of thought has gone into balancing static vs
runtime costs of that proposal. So you'll forgive me if I'm a bit
skeptical of the somewhat grandiose claims of one person that is new
to upstream development.

As to your specific points - we use xarray lookups in the page cache
fast path. It's a bold claim to say this would be too much overhead
during swapins.

Two, it's not clear to me how you want to make writeback efficient
*without* any sort of swap entry redirection. Walking all relevant
page tables is expensive; and you have to be able to find them first.

If you're talking about a redirection array as opposed to a tree -
static sizing of the compressed space is also a no-go. Zswap
utilization varies *widely* between workloads and different workload
combinations. Further, zswap consumes the same fungible resource as
uncompressed memory - there is really no excuse to burden users with
static sizing questions about this pool.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Yosry Ahmed 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote:
> On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in zswap
> > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> > 
> > Well, my plan is to support the writeback via swap.tiers.
> 
> Do you have a link to that proposal?
> 
> My understanding of swap tiers was about grouping different swapfiles
> and assigning them to cgroups. The issue with writeback is relocating
> the data that a swp_entry_t page table refers to - without having to
> find and update all the possible page tables. I'm not sure how
> swap.tiers solve this problem.
> 
> > > This direction is a dead-end. Please take a look at Nhat's swap
> > > virtualization patches. They decouple zswap from disk geometry, while
> > > still supporting writeback to an actual backend file.
> > 
> > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > table + swap.tiers design can do that as well. I have concerns about
> > swap virtualization in the aspect of adding another layer of memory
> > overhead addition per swap entry and CPU overhead of extra xarray
> > lookup. I believe my approach is technically superior and cleaner.
> > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > write page ops. I will let Nhat clarify the performance and memory
> > overhead side of the swap virtualization.
> 
> I'm happy to discuss it.
> 
> But keep in mind that the swap virtualization idea is a collaborative
> product of quite a few people with an extensive combined upstream
> record. Quite a bit of thought has gone into balancing static vs
> runtime costs of that proposal. So you'll forgive me if I'm a bit
> skeptical of the somewhat grandiose claims of one person that is new
> to upstream development.
> 
> As to your specific points - we use xarray lookups in the page cache
> fast path. It's a bold claim to say this would be too much overhead
> during swapins.
> 
> Two, it's not clear to me how you want to make writeback efficient
> *without* any sort of swap entry redirection. Walking all relevant
> page tables is expensive; and you have to be able to find them first.
> 
> If you're talking about a redirection array as opposed to a tree -
> static sizing of the compressed space is also a no-go. Zswap
> utilization varies *widely* between workloads and different workload
> combinations. Further, zswap consumes the same fungible resource as
> uncompressed memory - there is really no excuse to burden users with
> static sizing questions about this pool.

I think what Chris's idea is (and Chris correct me if I am wrong), is
that we use ghost swapfiles (that are not backed by disk space) for
zswap. So zswap has its own swapfiles, separate from disk swapfiles.

memory.tiers establishes the ordering between swapfiles, so you put
"ghost" -> "real" to get today's zswap writeback behavior. When you
writeback, you keep page tables pointing at the swap entry in the ghost
swapfile. What you do is:
- Allocate a new swap entry in the "real" swapfile.
- Update the swap table of the "ghost" swapfile to point at the swap
  entry in the "real" swapfile, reusing the pointer used for the
  swapcache.

Then, on swapin, you read the swap table of the "ghost" swapfile, find
the redirection, and read to the swap table of the "real" swapfile, then
read the page from disk into the swap cache. The redirection in the
"ghost" swapfile will keep existing, wasting that slot, until all
references to it are dropped.

I think this might work for this specific use case, with less overhead
than the xarray. BUT there are a few scenarios that are not covered
AFAICT:

- You still need to statically size the ghost swapfiles and their
  overheads.

- Wasting a slot in the ghost swapfile for the redirection. This
  complicates static provisioning a bit, because you have to account for
  entries that will be in zswap as well as writtenback. Furthermore,
  IIUC swap.tiers is intended to be generic and cover other use cases
  beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
  SSD when we writeback to the HDD is a much bigger problem.

- We still cannot do swapoff efficiently as we need to walk the page
  tables (and some swap tables) to find and swapin all entries in a
  swapfile. Not as important as other things, but worth mentioning.

Chris please let me know if I didn't get this right.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 11:32 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> I think what Chris's idea is (and Chris correct me if I am wrong), is
> that we use ghost swapfiles (that are not backed by disk space) for
> zswap. So zswap has its own swapfiles, separate from disk swapfiles.

Ack.

> memory.tiers establishes the ordering between swapfiles, so you put
> "ghost" -> "real" to get today's zswap writeback behavior. When you
> writeback, you keep page tables pointing at the swap entry in the ghost
> swapfile. What you do is:
> - Allocate a new swap entry in the "real" swapfile.
> - Update the swap table of the "ghost" swapfile to point at the swap
>   entry in the "real" swapfile, reusing the pointer used for the
>   swapcache.

Ack, with minor adjustment in mapping the swap entry to the physical
location. The swap entry has swap cache, the physical location does
not.

> Then, on swapin, you read the swap table of the "ghost" swapfile, find
> the redirection, and read to the swap table of the "real" swapfile, then
> read the page from disk into the swap cache. The redirection in the
> "ghost" swapfile will keep existing, wasting that slot, until all
> references to it are dropped.

Ack. That is assuming we don't have a rmap a like for the swap entry.

> I think this might work for this specific use case, with less overhead
> than the xarray. BUT there are a few scenarios that are not covered
> AFAICT:
>
> - You still need to statically size the ghost swapfiles and their
>   overheads.

No true, both ghost swapfile and physical swapfile can expand
additional clusters beyond the original physical size, for allocating
the continued high order entry or redirection. For a ghost swapfile,
there is no physical layer, only the front end. So the size can grow
dynamically. Just allocate more clusters. The current swapfile header
file size is just an initial size. My current patch does not implement
that. It will need some later swap table phase to make it happen. But
that is not an architecture limit, it has been considered as part of
normal business.

> - Wasting a slot in the ghost swapfile for the redirection. This
>   complicates static provisioning a bit, because you have to account for
>   entries that will be in zswap as well as writtenback. Furthermore,
>   IIUC swap.tiers is intended to be generic and cover other use cases
>   beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
>   SSD when we writeback to the HDD is a much bigger problem.

Yes and No. Yes it only wastes a front end swap entry (with swap
cache). The physical location is  a seperate layer. No, the physical
SSD space is not wasted because you can allocate additional front end
swap entry by growing the swap entry front end. Then have the
additional front end swap entry point to the physical location you
just directed away from. There is a lot more consideration of the
front end vs the physical layer. The physical layer does not care
about location order size 2^N alignment. The physical layer cares a
bit about continuity and  the number of IOV that it needs to issue.
The swap entry front end and the physical layer have slightly
different constraints.

> - We still cannot do swapoff efficiently as we need to walk the page
>   tables (and some swap tables) to find and swapin all entries in a
>   swapfile. Not as important as other things, but worth mentioning.

That need rmap for swap entries. It It is an independent issue.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 1 week ago

On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> 
> > - We still cannot do swapoff efficiently as we need to walk the
> > page
> >   tables (and some swap tables) to find and swapin all entries in a
> >   swapfile. Not as important as other things, but worth mentioning.
> 
> That need rmap for swap entries. It It is an independent issue.
> 

Wouldn't rmap for swap entries be more expensive than
simply always having indirection for swap entries that
are in use?

With indirection, swapoff can just move pages from
the being-swapoffed device into the swap cache, and
if needed the memory can then be moved to another
swap device, without ever needing to find the page
tables.

This sounds like an uncommon scenario, but it is
functionally identical to what is done to pages
during zswap writeback, where the page table entries
stay unchanged, and the swap page is simply moved
to another backend location.

Why implement two things, when we can have one
thing that does both, with no extra complexity
over what zswap writeback needs?

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> wrote:
>
> On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> >
> > > - We still cannot do swapoff efficiently as we need to walk the
> > > page
> > >   tables (and some swap tables) to find and swapin all entries in a
> > >   swapfile. Not as important as other things, but worth mentioning.
> >
> > That need rmap for swap entries. It It is an independent issue.
> >
>
> Wouldn't rmap for swap entries be more expensive than
> simply always having indirection for swap entries that
> are in use?

It might be, to be frank. I consider this pretty far and late in the
stage of the game to evaluate the rmap and its alternatives. Do you
agree?

I might or might not try the rmap for swap entry. Right now I don't
have many data points nor insights.

> With indirection, swapoff can just move pages from
> the being-swapoffed device into the swap cache, and
> if needed the memory can then be moved to another
> swap device, without ever needing to find the page
> tables.

Ack. I don't think we have any disagreement here.

> This sounds like an uncommon scenario, but it is
> functionally identical to what is done to pages
> during zswap writeback, where the page table entries
> stay unchanged, and the swap page is simply moved
> to another backend location.
>
> Why implement two things, when we can have one
> thing that does both, with no extra complexity
> over what zswap writeback needs?

Let me ask you a clarifying question, then.

1) What exactly are you trying to propose here in what project? VS or
swap the pony?
2) What stage of the code change do you have in mind should this
change apply to?

I can't speak for VS,  I am open to embrace what you suggest in order
to swap the pony project, that is after I understand it first.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 1 week ago

On Thu, 2025-11-27 at 06:07 +0400, Chris Li wrote:
> On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> > > 
> > > > - We still cannot do swapoff efficiently as we need to walk the
> > > > page
> > > >   tables (and some swap tables) to find and swapin all entries
> > > > in a
> > > >   swapfile. Not as important as other things, but worth
> > > > mentioning.
> > > 
> > > That need rmap for swap entries. It It is an independent issue.
> > > 
> > 
> > Wouldn't rmap for swap entries be more expensive than
> > simply always having indirection for swap entries that
> > are in use?
> 
> It might be, to be frank. I consider this pretty far and late in the
> stage of the game to evaluate the rmap and its alternatives. Do you
> agree?
> 
> I might or might not try the rmap for swap entry. Right now I don't
> have many data points nor insights.

On the contrary. I think we should at least do some
back of the envelope calculations to estimate the
overhead of the different proposed solutions.

With both Nhat's vswap, and your proposal to always
have swap indirection with a separate front end, and
several back ends, there is no need for swap rmap.

This is a good thing, because a single swap slot
could be referenced by dozens, hundreds, or even
thousands of page table entries, in the case of
forking servers. This creates complexity which is
probably best avoided.

Conceptually, Nhat's vswap, and your idea of having
always-on swap indirection seem to be the same thing.
> 
> > This sounds like an uncommon scenario, but it is
> > functionally identical to what is done to pages
> > during zswap writeback, where the page table entries
> > stay unchanged, and the swap page is simply moved
> > to another backend location.
> > 
> > Why implement two things, when we can have one
> > thing that does both, with no extra complexity
> > over what zswap writeback needs?
> 
> Let me ask you a clarifying question, then.
> 
> 1) What exactly are you trying to propose here in what project? VS or
> swap the pony?

In the past, when faced with competing code bases
like this, one thing that has worked well is for both
developers to send their code to the list, and then
for both developers to send each other suggestions
(or diffs) to improve each other's code.

Vswap and your always-on indirection seem to do
exactly the same thing. This seems like a good
opportunity to work together, and come up with
code that is better than any one person's code.

> 2) What stage of the code change do you have in mind should this
> change apply to?

I think it makes sense to get the hard design
problems resolved before committing to one
particular code design.

Spending months to resolve subtle bugs in a
code base, only to discover later that it does
not do exactly what is needed, is not the
greatest way to make progress.

> 
> I can't speak for VS,  I am open to embrace what you suggest in order
> to swap the pony project, that is after I understand it first.
> 
Once both Nhat and you understand each other's code,
and have suggestions for each other on how to improve
it, we will likely end up with a code base that looks
nicer than either of you would have done by yourselves.

The more perspectives, the better.

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 11:32 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote:
> > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > > The current zswap requires a backing swapfile. The swap slot used
> > > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > > space.
> > > > >
> > > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > Zswap is primarily a compressed cache for real swap on secondary
> > > > storage. It's indeed quite important that entries currently in zswap
> > > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > > it has to work with the primary usecase and support disk writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> >
> > Do you have a link to that proposal?
> >
> > My understanding of swap tiers was about grouping different swapfiles
> > and assigning them to cgroups. The issue with writeback is relocating
> > the data that a swp_entry_t page table refers to - without having to
> > find and update all the possible page tables. I'm not sure how
> > swap.tiers solve this problem.
> >
> > > > This direction is a dead-end. Please take a look at Nhat's swap
> > > > virtualization patches. They decouple zswap from disk geometry, while
> > > > still supporting writeback to an actual backend file.
> > >
> > > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > > table + swap.tiers design can do that as well. I have concerns about
> > > swap virtualization in the aspect of adding another layer of memory
> > > overhead addition per swap entry and CPU overhead of extra xarray
> > > lookup. I believe my approach is technically superior and cleaner.
> > > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > > write page ops. I will let Nhat clarify the performance and memory
> > > overhead side of the swap virtualization.
> >
> > I'm happy to discuss it.
> >
> > But keep in mind that the swap virtualization idea is a collaborative
> > product of quite a few people with an extensive combined upstream
> > record. Quite a bit of thought has gone into balancing static vs
> > runtime costs of that proposal. So you'll forgive me if I'm a bit
> > skeptical of the somewhat grandiose claims of one person that is new
> > to upstream development.
> >
> > As to your specific points - we use xarray lookups in the page cache
> > fast path. It's a bold claim to say this would be too much overhead
> > during swapins.
> >
> > Two, it's not clear to me how you want to make writeback efficient
> > *without* any sort of swap entry redirection. Walking all relevant
> > page tables is expensive; and you have to be able to find them first.
> >
> > If you're talking about a redirection array as opposed to a tree -
> > static sizing of the compressed space is also a no-go. Zswap
> > utilization varies *widely* between workloads and different workload
> > combinations. Further, zswap consumes the same fungible resource as
> > uncompressed memory - there is really no excuse to burden users with
> > static sizing questions about this pool.
>
> I think what Chris's idea is (and Chris correct me if I am wrong), is
> that we use ghost swapfiles (that are not backed by disk space) for
> zswap. So zswap has its own swapfiles, separate from disk swapfiles.
>
> memory.tiers establishes the ordering between swapfiles, so you put
> "ghost" -> "real" to get today's zswap writeback behavior. When you
> writeback, you keep page tables pointing at the swap entry in the ghost
> swapfile. What you do is:
> - Allocate a new swap entry in the "real" swapfile.
> - Update the swap table of the "ghost" swapfile to point at the swap
>   entry in the "real" swapfile, reusing the pointer used for the
>   swapcache.
>
> Then, on swapin, you read the swap table of the "ghost" swapfile, find
> the redirection, and read to the swap table of the "real" swapfile, then
> read the page from disk into the swap cache. The redirection in the
> "ghost" swapfile will keep existing, wasting that slot, until all
> references to it are dropped.
>
> I think this might work for this specific use case, with less overhead
> than the xarray. BUT there are a few scenarios that are not covered
> AFAICT:

Thanks for explaining these issues better than I could :)

>
> - You still need to statically size the ghost swapfiles and their
>   overheads.

Yes.

>
> - Wasting a slot in the ghost swapfile for the redirection. This
>   complicates static provisioning a bit, because you have to account for
>   entries that will be in zswap as well as writtenback. Furthermore,
>   IIUC swap.tiers is intended to be generic and cover other use cases
>   beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
>   SSD when we writeback to the HDD is a much bigger problem.

Yep. We are trying to get away from static provisioning as much as we
can - this design digs us deeper in the hole. Who the hell know what's
the zswap:disk swap split is going to be? It's going to depend on
access patterns and compressibility.

>
> - We still cannot do swapoff efficiently as we need to walk the page
>   tables (and some swap tables) to find and swapin all entries in a
>   swapfile. Not as important as other things, but worth mentioning.

Yeah I left swapoff out of it, because it is just another use case.
But yes we can't do swapoff efficiently easily either.

And in general, it's going to be a very rigid design for more
complicated backend change (pre-fetching from one tier to another, or
compaction).

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in zswap
> > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
>
> Do you have a link to that proposal?

My 2024 LSF swap pony talk already has a mechanism to redirect page
cache swap entries to different physical locations.
That can also work for redirecting swap entries in different swapfiles.

https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

> My understanding of swap tiers was about grouping different swapfiles
> and assigning them to cgroups. The issue with writeback is relocating
> the data that a swp_entry_t page table refers to - without having to
> find and update all the possible page tables. I'm not sure how
> swap.tiers solve this problem.

swap.tiers is part of the picture. You are right the LPC topic mostly
covers the per cgroup portion. The VFS swap ops are my two slides of
the LPC 2023. You read from one swap file and write to another swap
file with a new swap entry allocated.

> > > This direction is a dead-end. Please take a look at Nhat's swap
> > > virtualization patches. They decouple zswap from disk geometry, while
> > > still supporting writeback to an actual backend file.
> >
> > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > table + swap.tiers design can do that as well. I have concerns about
> > swap virtualization in the aspect of adding another layer of memory
> > overhead addition per swap entry and CPU overhead of extra xarray
> > lookup. I believe my approach is technically superior and cleaner.
> > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > write page ops. I will let Nhat clarify the performance and memory
> > overhead side of the swap virtualization.
>
> I'm happy to discuss it.
>
> But keep in mind that the swap virtualization idea is a collaborative
> product of quite a few people with an extensive combined upstream
> record. Quite a bit of thought has gone into balancing static vs
> runtime costs of that proposal. So you'll forgive me if I'm a bit
> skeptical of the somewhat grandiose claims of one person that is new
> to upstream development.

Collaborating with which companies developers? How many VS patches
landed in the kernel? I am also collaborating with different
developers, cluster base swap allocators, swap table phase I. Removing
the NUMA node swap file priority. Those are all suggested by me.

> As to your specific points - we use xarray lookups in the page cache
> fast path. It's a bold claim to say this would be too much overhead
> during swapins.

Yes, we just get rid of xarray in swap cache lookup and get some
performance gain from it.
You are saying one extra xarray is no problem, can your team demo some
performance number of impact of the extra xarray lookup in VS? Just
run some swap benchmarks and share the result.

We can do a test right now, without writing back to another SSD, The
ghosts swapfile compare with VS for zswap only case.

> Two, it's not clear to me how you want to make writeback efficient
> *without* any sort of swap entry redirection. Walking all relevant
> page tables is expensive; and you have to be able to find them first.

Swap cache can have a physical location redirection, see my 2024 LPC
slides. I have considered that way before the VS discussion.
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

> If you're talking about a redirection array as opposed to a tree -
> static sizing of the compressed space is also a no-go. Zswap
> utilization varies *widely* between workloads and different workload
> combinations. Further, zswap consumes the same fungible resource as
> uncompressed memory - there is really no excuse to burden users with
> static sizing questions about this pool.

I do see the swap table + swap.ters + swap ops and do better. We can
test the memory only case right now. To head to head test the VS and
swap.tiers on the writeback case will need to wait a bit. Swap table
is only reviewing phase II.

I mean CPU and per swap entry overhead.

I care less on who's idea it is, I care more about the end result
performance in (memory & CPU). I want the best idea/implementation to
win.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 09:24:18PM +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > > The current zswap requires a backing swapfile. The swap slot used
> > > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > > space.
> > > > >
> > > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > Zswap is primarily a compressed cache for real swap on secondary
> > > > storage. It's indeed quite important that entries currently in zswap
> > > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > > it has to work with the primary usecase and support disk writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> >
> > Do you have a link to that proposal?
> 
> My 2024 LSF swap pony talk already has a mechanism to redirect page
> cache swap entries to different physical locations.
> That can also work for redirecting swap entries in different swapfiles.
> 
> https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

I looked through your slides and the LWN article, but it's very hard
for me to find answers to my questions in there.

In your proposal, let's say you have a swp_entry_t in the page
table. What does it describe, and what are the data structures to get
from this key to user data in the following scenarios:

- Data is in a swapfile
- Data is in zswap
- Data is in being written from zswap to a swapfile
- Data is back in memory due to a fault from another page table

> > My understanding of swap tiers was about grouping different swapfiles
> > and assigning them to cgroups. The issue with writeback is relocating
> > the data that a swp_entry_t page table refers to - without having to
> > find and update all the possible page tables. I'm not sure how
> > swap.tiers solve this problem.
> 
> swap.tiers is part of the picture. You are right the LPC topic mostly
> covers the per cgroup portion. The VFS swap ops are my two slides of
> the LPC 2023. You read from one swap file and write to another swap
> file with a new swap entry allocated.

Ok, and from what you wrote below, presumably at this point you would
put a redirection pointer in the old location to point to the new one.

This way you only have the indirection IF such a relocation actually
happened, correct?

But how do you store new data in the freed up old slot?

> > As to your specific points - we use xarray lookups in the page cache
> > fast path. It's a bold claim to say this would be too much overhead
> > during swapins.
> 
> Yes, we just get rid of xarray in swap cache lookup and get some
> performance gain from it.
> You are saying one extra xarray is no problem, can your team demo some
> performance number of impact of the extra xarray lookup in VS? Just
> run some swap benchmarks and share the result.

Average and worst-case for all common usecases matter. There is no
code on your side for the writeback case. (And it's exceedingly
difficult to even get a mental model of how it would work from your
responses and the slides you have linked).

> > Two, it's not clear to me how you want to make writeback efficient
> > *without* any sort of swap entry redirection. Walking all relevant
> > page tables is expensive; and you have to be able to find them first.
> 
> Swap cache can have a physical location redirection, see my 2024 LPC
> slides. I have considered that way before the VS discussion.
> https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

There are no matches for "redir" in either the email or the slides.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > Do you have a link to that proposal?
> >
> > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > cache swap entries to different physical locations.
> > That can also work for redirecting swap entries in different swapfiles.
> >
> > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
>
> I looked through your slides and the LWN article, but it's very hard
> for me to find answers to my questions in there.

Naturally, the slide is only intended to cover what is in the current
swap table may be phase VII.
But it does have the physical location pointer consideration.

> In your proposal, let's say you have a swp_entry_t in the page
> table. What does it describe, and what are the data structures to get
> from this key to user data in the following scenarios:

Please keep in mind that I don't have every detail design laid out. I
follow the first principles that redirect a swap entry page should
only take an additional 4 byte per swap entry. VS blow up the swap
entry size by something like 24 bytes? I am pretty sure I am wrong
about the exact value. People who are familiar with VS please correct
me. My impression is that it is too far away from the first principle
value, I would not even consider. Exceptions can be made, but not that
far.

I will try my best to answer your question but usually I am more glad
to work with someone who is going to implement it to iron out all the
details. Right now it is a bit too far.

> - Data is in a swapfile
Same as current.

> - Data is in zswap

I have now realized that what I want from the memory swap tier is
actually not the same as today's zswap. I don't want the current
behavior of zswap in the swap.tiers. The zswap seat in front of every
swapfile. The zswap.writeback does not tell which particular swapfile
it wants to write to. It creates problems in the per memcg swap.tier
to include zswap as it is. I don't want the zswap to use another
swapfile swap entry and write through to it.

If data is in the memory tier swapfile, the swap entry looks up to the
actual data without redirection.

> - Data is in being written from zswap to a swapfile
It will look up the swap table and find a physical pointer, which
points to the physical device and office having the data.

> - Data is back in memory due to a fault from another page table
In the swap cache similar to today's swapfile.

> > > My understanding of swap tiers was about grouping different swapfiles
> > > and assigning them to cgroups. The issue with writeback is relocating
> > > the data that a swp_entry_t page table refers to - without having to
> > > find and update all the possible page tables. I'm not sure how
> > > swap.tiers solve this problem.
> >
> > swap.tiers is part of the picture. You are right the LPC topic mostly
> > covers the per cgroup portion. The VFS swap ops are my two slides of
> > the LPC 2023. You read from one swap file and write to another swap
> > file with a new swap entry allocated.
>
> Ok, and from what you wrote below, presumably at this point you would
> put a redirection pointer in the old location to point to the new one.

From the swap entry front end (also owns the swap cache) point to a
physical location.
>
> This way you only have the indirection IF such a relocation actually
> happened, correct?

Right. The more common

> But how do you store new data in the freed up old slot?
That is the front end swap entry and the physical back end split.
The front end swap entry can't be free until all users release the swap count.
The physical back end can be free. The free physical blocks caused by
redirection will likely have a different allocator, not the cluster
based swap allocator. Because those are just pure blocks.

>
> > > As to your specific points - we use xarray lookups in the page cache
> > > fast path. It's a bold claim to say this would be too much overhead
> > > during swapins.
> >
> > Yes, we just get rid of xarray in swap cache lookup and get some
> > performance gain from it.
> > You are saying one extra xarray is no problem, can your team demo some
> > performance number of impact of the extra xarray lookup in VS? Just
> > run some swap benchmarks and share the result.
>
> Average and worst-case for all common usecases matter. There is no
> code on your side for the writeback case. (And it's exceedingly
> difficult to even get a mental model of how it would work from your
> responses and the slides you have linked).

As I said, that slide is only intended to explain swap table phase VII
how physical direction works with swap cache.
The swap.tiers define tiers for swap, obviously how to move data
between the tier is a natural consideration. That I mention in the
2023 talk in two slides.

I don't plan that level of detail that far ahead. I try to follow the
first principle as best as I can. There will be a lot of decisions
made only at the later phases.

> > > Two, it's not clear to me how you want to make writeback efficient
> > > *without* any sort of swap entry redirection. Walking all relevant
> > > page tables is expensive; and you have to be able to find them first.
> >
> > Swap cache can have a physical location redirection, see my 2024 LPC
> > slides. I have considered that way before the VS discussion.
> > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
>
> There are no matches for "redir" in either the email or the slides.

Yes, I use a different term in the slide. The continuous is the source
of the redirection, the non continuous is the destination of the
redirection. But in my mind I am not redirecting swap entries. The
swap entry might have an optional physical location pointer. The swap
entry front end and physical layer split.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 1 week ago

On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote:
> On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > Do you have a link to that proposal?
> > >
> > > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > > cache swap entries to different physical locations.
> > > That can also work for redirecting swap entries in different swapfiles.
> > >
> > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
> >
> > I looked through your slides and the LWN article, but it's very hard
> > for me to find answers to my questions in there.
> 
> Naturally, the slide is only intended to cover what is in the current
> swap table may be phase VII.
> But it does have the physical location pointer consideration.
> 
> > In your proposal, let's say you have a swp_entry_t in the page
> > table. What does it describe, and what are the data structures to get
> > from this key to user data in the following scenarios:
> 
> Please keep in mind that I don't have every detail design laid out. I
> follow the first principles that redirect a swap entry page should
> only take an additional 4 byte per swap entry. VS blow up the swap
> entry size by something like 24 bytes?

Nhat can lay this out in more detail, but there isn't much new stuff
in the virtual swap descriptor. It's mostly just a consolidation of
state we currently track elsewhere - swap count, swapcache pointer,
cgroup ownership etc.

The actual indirection is just a word for the backend type,offset.

That indirection is the tradeoff for swapped pages. In turn you're
getting back all that other stuff for swap slots that *aren't*
currently used. This is a win for the vast majority of users.

Since you mentioned first principles - the dynamically sized swap
space is also much more suitable for compressed pools, which are the
dominant form of swap setups nowadays. Again a win for the majority.

And the worst-case is reasonable. I don't see the giant gulf you seem
to see there. I don't know where it's supposed to be coming from.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 1 week ago

On Tue, Nov 25, 2025 at 1:31 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Do you have a link to that proposal?
> > > >
> > > > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > > > cache swap entries to different physical locations.
> > > > That can also work for redirecting swap entries in different swapfiles.
> > > >
> > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
> > >
> > > I looked through your slides and the LWN article, but it's very hard
> > > for me to find answers to my questions in there.
> >
> > Naturally, the slide is only intended to cover what is in the current
> > swap table may be phase VII.
> > But it does have the physical location pointer consideration.
> >
> > > In your proposal, let's say you have a swp_entry_t in the page
> > > table. What does it describe, and what are the data structures to get
> > > from this key to user data in the following scenarios:
> >
> > Please keep in mind that I don't have every detail design laid out. I
> > follow the first principles that redirect a swap entry page should
> > only take an additional 4 byte per swap entry. VS blow up the swap
> > entry size by something like 24 bytes?
>
> Nhat can lay this out in more detail, but there isn't much new stuff
> in the virtual swap descriptor. It's mostly just a consolidation of
> state we currently track elsewhere - swap count, swapcache pointer,
> cgroup ownership etc.
>
> The actual indirection is just a word for the backend type,offset.
>
> That indirection is the tradeoff for swapped pages. In turn you're
> getting back all that other stuff for swap slots that *aren't*
> currently used. This is a win for the vast majority of users.

I will also note though, that we will merge the zswap tree with the
virtual swap descriptors as well.

So for zswap entries there are actually no extra overhead induced by
the backend indirection pointer :)

IOW, overhead for zswap-only users (such as Google) will be much
smaller than what Johannes is describing here - pretty much
non-existent :) While you will still gain all the other benefits (swap
space dynamicization, operational overhead reduction) of swap
virtualization.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Tue, Dec 2, 2025 at 10:19 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> > That indirection is the tradeoff for swapped pages. In turn you're
> > getting back all that other stuff for swap slots that *aren't*
> > currently used. This is a win for the vast majority of users.
>
> I will also note though, that we will merge the zswap tree with the
> virtual swap descriptors as well.

What is the merged per swap slot entry size? If your descritor is over
48 bytes plus some zswap pool handles and compressed buffer size etc I
am not sure that is an overall win. Provide actual number helps.

> So for zswap entries there are actually no extra overhead induced by
> the backend indirection pointer :)
>
> IOW, overhead for zswap-only users (such as Google) will be much
> smaller than what Johannes is describing here - pretty much
> non-existent :) While you will still gain all the other benefits (swap

The per swap slot memory usage size, zswap+ swap core, is it smaller
than the ghost swap file patch I posted here?
Do you have a number in bytes?

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Wed, Nov 26, 2025 at 1:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Do you have a link to that proposal?
> > > >
> > > > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > > > cache swap entries to different physical locations.
> > > > That can also work for redirecting swap entries in different swapfiles.
> > > >
> > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
> > >
> > > I looked through your slides and the LWN article, but it's very hard
> > > for me to find answers to my questions in there.
> >
> > Naturally, the slide is only intended to cover what is in the current
> > swap table may be phase VII.
> > But it does have the physical location pointer consideration.
> >
> > > In your proposal, let's say you have a swp_entry_t in the page
> > > table. What does it describe, and what are the data structures to get
> > > from this key to user data in the following scenarios:
> >
> > Please keep in mind that I don't have every detail design laid out. I
> > follow the first principles that redirect a swap entry page should
> > only take an additional 4 byte per swap entry. VS blow up the swap
> > entry size by something like 24 bytes?
>
> Nhat can lay this out in more detail, but there isn't much new stuff

Please make sure Nhat do. It shouldn't be complicated question.

> in the virtual swap descriptor. It's mostly just a consolidation of
> state we currently track elsewhere - swap count, swapcache pointer,
> cgroup ownership etc.

All those will fold into swap table values at later phases. So in this
regard, swap table is not satisfying the status quotes, it is more
aggressive in conserving memory. If I recall correctly, VS uses atomic
for the counters? It will blow up the 1 byte counter to 4 bytes.

> The actual indirection is just a word for the backend type,offset.

Sure.

>
> That indirection is the tradeoff for swapped pages. In turn you're
> getting back all that other stuff for swap slots that *aren't*
> currently used. This is a win for the vast majority of users.

Swap table does those as well, in the later phases.

>
> Since you mentioned first principles - the dynamically sized swap
> space is also much more suitable for compressed pools, which are the
> dominant form of swap setups nowadays. Again a win for the majority.

Sure, the swap table does that, especially after the swap cgroup and
swap count fold into the swap table.

> And the worst-case is reasonable. I don't see the giant gulf you seem
> to see there. I don't know where it's supposed to be coming from.

Let Nhat conform the per swap entry overhead and let's compare it with
the swap table fully final form.
Another easy way is just run some benchmark to see how much overhead
the VS introduces.

That being said, I think I have answered enough technical questions of
my approach, to let you re-consider my proposal. You should be able to
realize by now my approach is more optimal compared to VS. Do you
agree or not? We are just arguing how big the gap that is.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 1 week ago

On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> 
> That being said, I think I have answered enough technical questions
> of
> my approach, to let you re-consider my proposal. You should be able
> to
> realize by now my approach is more optimal compared to VS. Do you
> agree or not? We are just arguing how big the gap that is.
> 

We would have much more confidence in your
solution if you had told us exactly how
you were planning to solve things in future
stages of the project.

A "I'll solve it, but I can't tell you how"
is not very confidence inspiring.

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> wrote:
>
> On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> >
> > That being said, I think I have answered enough technical questions
> > of
> > my approach, to let you re-consider my proposal. You should be able
> > to
> > realize by now my approach is more optimal compared to VS. Do you
> > agree or not? We are just arguing how big the gap that is.
> >
>
> We would have much more confidence in your
> solution if you had told us exactly how
> you were planning to solve things in future
> stages of the project.

Can you clarify who is "We", sorry I am not part of your Meta kernel
team circle. II just reply to you and others how to solve the other
things. If you have further questions, please ask a clarifying
question. Until you ask, I don't know which part of the Swap Pony plan
you don't understand needs more clarifications.

> A "I'll solve it, but I can't tell you how"
> is not very confidence inspiring.

Don't need this kind of innuendo and it is not helping.
Please stay on the technical side of discussion and try not to project
personal judgement, thanks.

Please keep in mind that I am just one person love kernel hacking and
want to do the right things. I am doing this at my spare time, it is
not part of my company OKR's to work on upstream swap in the last two
years. I don't get pay to do this. I am replying this email from my
vacation 5am in the morning.

Again, let's stay technical. If you think I am holding any secret (I
am not ), please just ask a clarify question.

Thanks for  your cooperation and sorry that I did have a chance to
explain things better earlier.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 1 week ago

On Thu, 2025-11-27 at 05:52 +0400, Chris Li wrote:
> On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> > > 
> > > That being said, I think I have answered enough technical
> > > questions
> > > of
> > > my approach, to let you re-consider my proposal. You should be
> > > able
> > > to
> > > realize by now my approach is more optimal compared to VS. Do you
> > > agree or not? We are just arguing how big the gap that is.
> > > 
> > 
> > We would have much more confidence in your
> > solution if you had told us exactly how
> > you were planning to solve things in future
> > stages of the project.
> 
> Can you clarify who is "We", 

Sorry, I am talking about upstream.

When one developer has code, and somebody else emails
the equivalent of "trust me, bro", the code is usually
preferred.

> 
> Please keep in mind that I am just one person love kernel hacking and
> want to do the right things. I am doing this at my spare time, it is
> not part of my company OKR's to work on upstream swap in the last two
> years. I don't get pay to do this. I am replying this email from my
> vacation 5am in the morning.
> 
> Again, let's stay technical. If you think I am holding any secret (I
> am not ), please just ask a clarify question.

I really appreciate anybody participating in Linux
kernel development. Linux is good because different
people bring different perspectives to the table.

Some real numbers, even if just back of the envelope 
math to estimate the overhead of various ideas being
proposed, are often a good way to move a discussion 
along in a productive direction.

Let me reply to your other email with some more
technical details.

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
>
> Sorry, I am talking about upstream.

So far I have not had a pleasant upstream experience when submitting
this particular patch to upstream.

> I really appreciate anybody participating in Linux
> kernel development. Linux is good because different
> people bring different perspectives to the table.

Of course everybody is welcome. However, NACK without technical
justification is very bad for upstream development. I can't imagine
what a new hacker would think after going through what I have gone
through for this patch. He/she will likely quit contributing upstream.
This is not the kind of welcome we want.

Nhat needs to be able to technically justify his NACK as a maintainer.
Sorry there is no other way to sugar coat it.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 1 week ago

On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> >
> > Sorry, I am talking about upstream.
>
> So far I have not had a pleasant upstream experience when submitting
> this particular patch to upstream.
>
> > I really appreciate anybody participating in Linux
> > kernel development. Linux is good because different
> > people bring different perspectives to the table.
>
> Of course everybody is welcome. However, NACK without technical
> justification is very bad for upstream development. I can't imagine
> what a new hacker would think after going through what I have gone
> through for this patch. He/she will likely quit contributing upstream.
> This is not the kind of welcome we want.
>
> Nhat needs to be able to technically justify his NACK as a maintainer.
> Sorry there is no other way to sugar coat it.

I am NOT the only zswap maintainer who expresses concerns. Other
people also have their misgivings, so I have let them speak and not
put words in their mouths.

But since you have repeatedly singled me out, I will repeat my concerns here:

1. I don't like the operational overhead (to statically size the zswap
swapfile size for each <host x workload> combination) of static
swapfile. Misspecification of swapfile size can lead to unacceptable
swap metadata overhead on small machines, or underutilization of zswap
on big machines. And it is *impossible* to know how much zswap will be
needed ahead of time, even if we fix host - it depends on workloads
access patterns, memory compressibility, and latency/memory pressure
tolerance.

2. I don't like the maintainer's overhead (to support a special
infrastructure for a very specific use case, i.e no-writeback),
especially since I'm not convinced this can be turned into a general
architecture. See below.

3. I want to move us towards a more dynamic architecture for zswap.
This is a step in the WRONG direction.

4. I don't believe this buys us anything we can't already do with
userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only
swap option here), with writeback disabled, is 2-3 lines of script.

I believe I already justified myself well enough :) It is you who have
not really convinced me that this is, at the very least, a
temporary/first step towards a long-term generalized architecture for
zswap. Every time we pointed out an issue, you seem to justify it with
some more vague ideas that deepen the confusion.

Let's recap the discussion so far:

1. We claimed that this architecture is hard to extend for efficient
zswap writeback, or backend transfer in general, without incurring
page table updates. You claim you plan to implement a redirection
entry to solve this.

2. We then pointed out that inserting redirect entry into the current
physical swap infrastructure will leave holes in the upper swap tier's
address space, which is arguably *worse* than the current status quo
of zswap occupying disk swap space. Again, you pull out some vague
ideas about "frontend" and "backend" swap, which, frankly, is
conceptually very similar to swap virtualization.

3. The dynamicization of swap space is treated with the same rigor
(or, more accurately, lack thereof). Just more handwaving about the
"frontend" vs "backend" (which, again, is very close to swap
virtualization). This requirement is a deal breaker for me - see
requirement 1 above again.

4. We also pointed out your lack of thoughts for swapoff optimization,
which again, seem to be missing in your design. Again, more vagueness
about rmap, which is probably more overhead.

Look man, I'm not being hostile to you. Believe me on this - I respect
your opinion, and I'm working very hard on reducing memory overhead
for virtual swap, to see if I can meet you where you want it to be.
The RFC's original design inefficient memory usage was due to:

a) Readability. Space optimization can make it hard to read code, when
fields are squeezed into the same int/long variable. So I just put one
different field for each piece of metadata information

b) I was playing with synchronization optimization, i.e using atomics
instead of locks, and using per-entry locks. But I can go back to
using per-cluster lock (I haven't implemented cluster allocator at the
time of the RFC, but in my latest version I have done it), which will
further reduce the memory overhead by removing a couple of
fields/packing more fields.

The only non-negotiable per-swap-entry overhead will be a field to
indicate the backend location (physical swap slot, zswap entry, etc.)
+ 2 bits to indicate the swap type. With some field union-ing magic,
or pointer tagging magic, we can perhaps squeeze it even harder.

I'm also working on reducing the CPU overhead - re-partitioning swap
architectures (swap cache, zswap tree), reducing unnecessary xarray
lookups where possible.

We can then benchmark, and attempt to optimize it together as a community.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > >
> > > Sorry, I am talking about upstream.
> >
> > So far I have not had a pleasant upstream experience when submitting
> > this particular patch to upstream.
> >
> > > I really appreciate anybody participating in Linux
> > > kernel development. Linux is good because different
> > > people bring different perspectives to the table.
> >
> > Of course everybody is welcome. However, NACK without technical
> > justification is very bad for upstream development. I can't imagine
> > what a new hacker would think after going through what I have gone
> > through for this patch. He/she will likely quit contributing upstream.
> > This is not the kind of welcome we want.
> >
> > Nhat needs to be able to technically justify his NACK as a maintainer.
> > Sorry there is no other way to sugar coat it.
>
> I am NOT the only zswap maintainer who expresses concerns. Other
> people also have their misgivings, so I have let them speak and not
> put words in their mouths.

You did not mention the fact that both two NACK from zswap maintainers
are from the same company. I assume you have some kind of team sync.
There is a term for that, called "person acting in concert".

What I mean in "technically unjustifiable" is that VS patch series is
a non-starter to merge into mainline.
In this email you suggest the per swap slot memory overhead is 48
bytes previously 64 bytes.

https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/

Do you have newer VS that significantly reduce that? If so, what is
the new number?

The starting point before your VS is 11 bytes (3 bytes static, 8 bytes
dynamic). 48bytes is more than 4x the original size.
This will have a huge impact on the deployment that uses a lot of
swap. The worst part is that once your VS series is in the kernel.
That overhead is always on, it is forcing the overhead even if the
redirection is not used. This will hurt Google's fleet very badly if
deployed. Because of the same jobs, the kernel memory consumption will
jump up and fail jobs. Every body's kernel who use swap will suffer
because it is always on. The alternative, the swap table, uses much
less overhead. So your VS leave money on the table.

So I consider your VS is a non-starter. I repeatedly call you out
because you keep dodging this critical question. Johannes refers to
you for the detail value of the overhead as well.  Dodging critical
questions makes a technical debate very difficult to conduct and drive
to a conflict resolution impossible. BTW, this is my big concern on
the 2023 swap abstraction talk which our VS is based on. The community
feedback at the time strongly favored my solution. I don't understand
why you reboot the community un-favored solution without addressing
those concerns.

The other part of the bad experience is that you NACK first then ask
clarifying questions later. The proper order is the other way around.
You should fully understand the subject BEFORE you NACK on it. NACK is
a very serious business.

I did try my best to answer clarification question from your team. I
appreciate that Johannes and Yosry ask clarification to advance the
discussion. I did not see more question from them I assume they got
what they want to know. If you still feel something is missing out,
you should ask a follow up question for the part in which you need
more clarification. We can repeat until you understand. You keep using
the phrase "hand waving" as if I am faking it. That is FUD.
Communication is a two way street. I can't force you to understand,
asking more questions can help you. This is complex problem. I am
confident I can explain to Kairui and he can understand, because he
has a lot more context, not because I am faking it. Ask nicely so I
can answer nicely. Stay in the technical side of the discussion
please.

So I consider using VS to NACK my patch is technically unjustifiable.
Your current VS with 48 byte overhead is not usable at all as an
standard upstream kernel. Can we agree to that?

As we all know, using less memory to function the same is a lot harder
than using more. If you can dramatically reduce the memory usage, you
likely need to rebuild the whole patch series from scratch. If might
force you to use solution similar to swap table, in that case why not
join team swap table? We can reopen the topic again by then if you
have a newer VS:
1) address the per swap slot memory over head, ideally close to the
first principle value.
2) make the overhead optional, if not using redirection, preferably
not pay the overhead.
3) make your VS patch series incrementally show value, not all or nothing.

Sorry this email is getting very long and I have very limited time.
Let's discuss one topic at a time. I would like to conclude the
current VS is not a viable option as of now. I can reply to other
parts of your email once we get the VS out of the way.

Best Regards,

Chris

>
> 1. I don't like the operational overhead (to statically size the zswap
> swapfile size for each <host x workload> combination) of static
> swapfile. Misspecification of swapfile size can lead to unacceptable
> swap metadata overhead on small machines, or underutilization of zswap
> on big machines. And it is *impossible* to know how much zswap will be
> needed ahead of time, even if we fix host - it depends on workloads
> access patterns, memory compressibility, and latency/memory pressure
> tolerance.
>
> 2. I don't like the maintainer's overhead (to support a special
> infrastructure for a very specific use case, i.e no-writeback),
> especially since I'm not convinced this can be turned into a general
> architecture. See below.
>
> 3. I want to move us towards a more dynamic architecture for zswap.
> This is a step in the WRONG direction.
>
> 4. I don't believe this buys us anything we can't already do with
> userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only
> swap option here), with writeback disabled, is 2-3 lines of script.
>
> I believe I already justified myself well enough :) It is you who have
> not really convinced me that this is, at the very least, a
> temporary/first step towards a long-term generalized architecture for
> zswap. Every time we pointed out an issue, you seem to justify it with
> some more vague ideas that deepen the confusion.
>
> Let's recap the discussion so far:
>
> 1. We claimed that this architecture is hard to extend for efficient
> zswap writeback, or backend transfer in general, without incurring
> page table updates. You claim you plan to implement a redirection
> entry to solve this.
>
> 2. We then pointed out that inserting redirect entry into the current
> physical swap infrastructure will leave holes in the upper swap tier's
> address space, which is arguably *worse* than the current status quo
> of zswap occupying disk swap space. Again, you pull out some vague
> ideas about "frontend" and "backend" swap, which, frankly, is
> conceptually very similar to swap virtualization.
>
> 3. The dynamicization of swap space is treated with the same rigor
> (or, more accurately, lack thereof). Just more handwaving about the
> "frontend" vs "backend" (which, again, is very close to swap
> virtualization). This requirement is a deal breaker for me - see
> requirement 1 above again.
>
> 4. We also pointed out your lack of thoughts for swapoff optimization,
> which again, seem to be missing in your design. Again, more vagueness
> about rmap, which is probably more overhead.
>
> Look man, I'm not being hostile to you. Believe me on this - I respect
> your opinion, and I'm working very hard on reducing memory overhead
> for virtual swap, to see if I can meet you where you want it to be.
> The RFC's original design inefficient memory usage was due to:
>
> a) Readability. Space optimization can make it hard to read code, when
> fields are squeezed into the same int/long variable. So I just put one
> different field for each piece of metadata information
>
> b) I was playing with synchronization optimization, i.e using atomics
> instead of locks, and using per-entry locks. But I can go back to
> using per-cluster lock (I haven't implemented cluster allocator at the
> time of the RFC, but in my latest version I have done it), which will
> further reduce the memory overhead by removing a couple of
> fields/packing more fields.
>
> The only non-negotiable per-swap-entry overhead will be a field to
> indicate the backend location (physical swap slot, zswap entry, etc.)
> + 2 bits to indicate the swap type. With some field union-ing magic,
> or pointer tagging magic, we can perhaps squeeze it even harder.
>
> I'm also working on reducing the CPU overhead - re-partitioning swap
> architectures (swap cache, zswap tree), reducing unnecessary xarray
> lookups where possible.
>
> We can then benchmark, and attempt to optimize it together as a community.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 1 week ago

On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > >
> > > > Sorry, I am talking about upstream.
> > >
> > > So far I have not had a pleasant upstream experience when submitting
> > > this particular patch to upstream.
> > >
> > > > I really appreciate anybody participating in Linux
> > > > kernel development. Linux is good because different
> > > > people bring different perspectives to the table.
> > >
> > > Of course everybody is welcome. However, NACK without technical
> > > justification is very bad for upstream development. I can't imagine
> > > what a new hacker would think after going through what I have gone
> > > through for this patch. He/she will likely quit contributing upstream.
> > > This is not the kind of welcome we want.
> > >
> > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > Sorry there is no other way to sugar coat it.
> >
> > I am NOT the only zswap maintainer who expresses concerns. Other
> > people also have their misgivings, so I have let them speak and not
> > put words in their mouths.
>
> You did not mention the fact that both two NACK from zswap maintainers
> are from the same company. I assume you have some kind of team sync.
> There is a term for that, called "person acting in concert".

I mean, Yosry pointed out issues with your approach too. Yosry is from
your company, no?

The issues I pointed out have all been technical, thus far. I never
even brought up Meta - I'm sure other parties have the same issues.

>
> What I mean in "technically unjustifiable" is that VS patch series is
> a non-starter to merge into mainline.
> In this email you suggest the per swap slot memory overhead is 48
> bytes previously 64 bytes.
>
> https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/
>
> Do you have newer VS that significantly reduce that? If so, what is
> the new number?
>
> The starting point before your VS is 11 bytes (3 bytes static, 8 bytes
> dynamic). 48bytes is more than 4x the original size.
> This will have a huge impact on the deployment that uses a lot of
> swap. The worst part is that once your VS series is in the kernel.
> That overhead is always on, it is forcing the overhead even if the
> redirection is not used. This will hurt Google's fleet very badly if
> deployed. Because of the same jobs, the kernel memory consumption will
> jump up and fail jobs. Every body's kernel who use swap will suffer
> because it is always on. The alternative, the swap table, uses much
> less overhead. So your VS leave money on the table.
>
> So I consider your VS is a non-starter. I repeatedly call you out
> because you keep dodging this critical question. Johannes refers to
> you for the detail value of the overhead as well.  Dodging critical
> questions makes a technical debate very difficult to conduct and drive
> to a conflict resolution impossible. BTW, this is my big concern on
> the 2023 swap abstraction talk which our VS is based on. The community
> feedback at the time strongly favored my solution. I don't understand
> why you reboot the community un-favored solution without addressing
> those concerns.

I reboot the VS work because I have not seen any indications that your
design could solve the problems I believe are principle for any swap
architectures: dynamicization of swap space, efficient backend
transfer, to name 2.

>
> The other part of the bad experience is that you NACK first then ask
> clarifying questions later. The proper order is the other way around.
> You should fully understand the subject BEFORE you NACK on it. NACK is
> a very serious business.
>
> I did try my best to answer clarification question from your team. I
> appreciate that Johannes and Yosry ask clarification to advance the
> discussion. I did not see more question from them I assume they got
> what they want to know. If you still feel something is missing out,
> you should ask a follow up question for the part in which you need
> more clarification. We can repeat until you understand. You keep using
> the phrase "hand waving" as if I am faking it. That is FUD.
> Communication is a two way street. I can't force you to understand,
> asking more questions can help you. This is complex problem. I am
> confident I can explain to Kairui and he can understand, because he
> has a lot more context, not because I am faking it. Ask nicely so I
> can answer nicely. Stay in the technical side of the discussion
> please.
>
> So I consider using VS to NACK my patch is technically unjustifiable.

I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing
swapfile because of the technical requirements I pointed out above.
Virtual swap happens to neatly solve all of them, by design, from
first principle. I never ruled out the possibility of another design
that would satisfy all of them - I just did not see enough from you to
believe otherwise.

I don't believe a static ghosttfile is it. In fact, you CAN
theoretically implement virtual swap with a ghost swapfile as well.
The staticity will just make it operationally untenable. The next step
would be to dynamicize the swap infrastructure, at which point we
revert back to the original VS design.

I see the same thing played out in your response as well, with the
redirection entry, then frontend/backend swap space. It's starting to
eerily resembles virtual swap. Or maybe you can clarify?

> Your current VS with 48 byte overhead is not usable at all as an
> standard upstream kernel. Can we agree to that?

Sure, which is why I sent it as an RFC and not as an actual patch
series pending merging :) Its main purpose was to demonstrate the
workflow of how a feature-complete virtual swap subsystem might
behave, in all of the code paths of the memory subsystem. I can then
optimize the fields piecemeal, while weighing the tradeoff (such as
lock granularity v.s lock fields memory overhead). You and Kairui are
welcome to criticize, comment, and help me optimize it, as did Yosry
and Johannes in the past.

>
> As we all know, using less memory to function the same is a lot harder
> than using more. If you can dramatically reduce the memory usage, you

I don't necessarily disagree.

I would, however, would like to point out that the reverse is true too
- you can't necessarily compare the overhead of two designs, where one
achieve a lot more in terms of features and/or operational goals than
the other.

> likely need to rebuild the whole patch series from scratch. If might
> force you to use solution similar to swap table, in that case why not
> join team swap table?

Because even with the current swap table design, the allocator is
*still* static.

I would LOVE to use the current physical swap allocation
infrastructure. It just doesn't work in its current state.

> We can reopen the topic again by then if you have a newer VS:

Sure.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Tue, Dec 2, 2025 at 3:37 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > > >
> > > > > Sorry, I am talking about upstream.
> > > >
> > > > So far I have not had a pleasant upstream experience when submitting
> > > > this particular patch to upstream.
> > > >
> > > > > I really appreciate anybody participating in Linux
> > > > > kernel development. Linux is good because different
> > > > > people bring different perspectives to the table.
> > > >
> > > > Of course everybody is welcome. However, NACK without technical
> > > > justification is very bad for upstream development. I can't imagine
> > > > what a new hacker would think after going through what I have gone
> > > > through for this patch. He/she will likely quit contributing upstream.
> > > > This is not the kind of welcome we want.
> > > >
> > > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > > Sorry there is no other way to sugar coat it.
> > >
> > > I am NOT the only zswap maintainer who expresses concerns. Other
> > > people also have their misgivings, so I have let them speak and not
> > > put words in their mouths.
> >
> > You did not mention the fact that both two NACK from zswap maintainers
> > are from the same company. I assume you have some kind of team sync.
> > There is a term for that, called "person acting in concert".
>
> I mean, Yosry pointed out issues with your approach too. Yosry is from
> your company, no?

I don't know who's interest Yosry is representing on this issue.

> > What I mean in "technically unjustifiable" is that VS patch series is
> > a non-starter to merge into mainline.
> > In this email you suggest the per swap slot memory overhead is 48
> > bytes previously 64 bytes.
> >
> > https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/
> >
> > Do you have newer VS that significantly reduce that? If so, what is
> > the new number?
> >
> > The starting point before your VS is 11 bytes (3 bytes static, 8 bytes
> > dynamic). 48bytes is more than 4x the original size.
> > This will have a huge impact on the deployment that uses a lot of
> > swap. The worst part is that once your VS series is in the kernel.
> > That overhead is always on, it is forcing the overhead even if the
> > redirection is not used. This will hurt Google's fleet very badly if
> > deployed. Because of the same jobs, the kernel memory consumption will
> > jump up and fail jobs. Every body's kernel who use swap will suffer
> > because it is always on. The alternative, the swap table, uses much
> > less overhead. So your VS leave money on the table.
> >
> > So I consider your VS is a non-starter. I repeatedly call you out
> > because you keep dodging this critical question. Johannes refers to
> > you for the detail value of the overhead as well.  Dodging critical
> > questions makes a technical debate very difficult to conduct and drive
> > to a conflict resolution impossible. BTW, this is my big concern on
> > the 2023 swap abstraction talk which our VS is based on. The community
> > feedback at the time strongly favored my solution. I don't understand
> > why you reboot the community un-favored solution without addressing
> > those concerns.
>
> I reboot the VS work because I have not seen any indications that your
> design could solve the problems I believe are principle for any swap
> architectures: dynamicization of swap space, efficient backend
> transfer, to name 2.

So no new number and no new date yet.

>
> >
> > The other part of the bad experience is that you NACK first then ask
> > clarifying questions later. The proper order is the other way around.
> > You should fully understand the subject BEFORE you NACK on it. NACK is
> > a very serious business.
> >
> > I did try my best to answer clarification question from your team. I
> > appreciate that Johannes and Yosry ask clarification to advance the
> > discussion. I did not see more question from them I assume they got
> > what they want to know. If you still feel something is missing out,
> > you should ask a follow up question for the part in which you need
> > more clarification. We can repeat until you understand. You keep using
> > the phrase "hand waving" as if I am faking it. That is FUD.
> > Communication is a two way street. I can't force you to understand,
> > asking more questions can help you. This is complex problem. I am
> > confident I can explain to Kairui and he can understand, because he
> > has a lot more context, not because I am faking it. Ask nicely so I
> > can answer nicely. Stay in the technical side of the discussion
> > please.
> >
> > So I consider using VS to NACK my patch is technically unjustifiable.
>
> I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing
> swapfile because of the technical requirements I pointed out above.
> Virtual swap happens to neatly solve all of them, by design, from
> first principle. I never ruled out the possibility of another design
> that would satisfy all of them - I just did not see enough from you to
> believe otherwise.

That is FUD. (Doubt).  Do you notice that over half of the core swap
code in the kernel now is my design? Swap allocator, swap table. Soon
there will be more like swap.tiers. Code will speak for itself.

By the way, FUD can be universally applied to anything, so it is a
distraction for technical discussion.

Let's focus the discussion on the technical and stop spreading FUD.

>
> I don't believe a static ghosttfile is it. In fact, you CAN
> theoretically implement virtual swap with a ghost swapfile as well.
> The staticity will just make it operationally untenable. The next step
> would be to dynamicize the swap infrastructure, at which point we
> revert back to the original VS design.

Just a starting point. Can incremental change to dynamic size. I
believe in incremental landing improvement in the kernel.

> I see the same thing played out in your response as well, with the
> redirection entry, then frontend/backend swap space. It's starting to
> eerily resembles virtual swap. Or maybe you can clarify?
>
> > Your current VS with 48 byte overhead is not usable at all as an
> > standard upstream kernel. Can we agree to that?
>
> Sure, which is why I sent it as an RFC and not as an actual patch
> series pending merging :) Its main purpose was to demonstrate the
> workflow of how a feature-complete virtual swap subsystem might
> behave, in all of the code paths of the memory subsystem. I can then
> optimize the fields piecemeal, while weighing the tradeoff (such as
> lock granularity v.s lock fields memory overhead). You and Kairui are
> welcome to criticize, comment, and help me optimize it, as did Yosry
> and Johannes in the past.

We need a new VS number close to first principle value to reboot the
discussion. Because you care about competing with something designed
to run close to the first principle value.
Until then VS is just a prototype, not ready for production quality.

> > As we all know, using less memory to function the same is a lot harder
> > than using more. If you can dramatically reduce the memory usage, you
>
> I don't necessarily disagree.
>
> I would, however, would like to point out that the reverse is true too
> - you can't necessarily compare the overhead of two designs, where one
> achieve a lot more in terms of features and/or operational goals than
> the other.

Then the swap table is landing the kernel. The ghost swap file I
submit is much closer to the production quality. If you merge that
patch now, the kernel can have a swapfile that does not waste disk
space. It also performs well NOW. Google has been using ghost swapfile
in production for over 10 years. So you are right, you can't even
compare the two in the sense that VS is not ready. I want to focus my
time to spend on something that can land the kernel next. Ping me
again when your VS is at that stage.

> > likely need to rebuild the whole patch series from scratch. If might
> > force you to use solution similar to swap table, in that case why not
> > join team swap table?
>
> Because even with the current swap table design, the allocator is
> *still* static.

Can be changed, will be changed, it needs to happen after phase IV of
swap table. As a matter of fact, a lot of swap tables clean up
untangle to make it happen.

> I would LOVE to use the current physical swap allocation
> infrastructure. It just doesn't work in its current state.

Help reviewing the swap table phase II and phase III then. Make it
happen sooner.

> > We can reopen the topic again by then if you have a newer VS:
>
> Sure.

Now I will conclude the VS: It is not production ready and it has no
dates when it will be production ready. It is just red herring.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 1 week ago

On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote:
> On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > >
> > > > Sorry, I am talking about upstream.
> > >
> > > So far I have not had a pleasant upstream experience when submitting
> > > this particular patch to upstream.
> > >
> > > > I really appreciate anybody participating in Linux
> > > > kernel development. Linux is good because different
> > > > people bring different perspectives to the table.
> > >
> > > Of course everybody is welcome. However, NACK without technical
> > > justification is very bad for upstream development. I can't imagine
> > > what a new hacker would think after going through what I have gone
> > > through for this patch. He/she will likely quit contributing upstream.
> > > This is not the kind of welcome we want.
> > >
> > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > Sorry there is no other way to sugar coat it.
> >
> > I am NOT the only zswap maintainer who expresses concerns. Other
> > people also have their misgivings, so I have let them speak and not
> > put words in their mouths.
> 
> You did not mention the fact that both two NACK from zswap maintainers
> are from the same company. I assume you have some kind of team sync.
> There is a term for that, called "person acting in concert".

For the benefit of anybody following this from the sidelines, the
third zswap maintainer also expressed concerns about Chris's proposal
upthread. He works for the same company as Chris.

The reality is that Chris is failing to convince others of his design
direction, and is now obviously resorting to manipulation and hominem
attacks.

During the course of this thread, Chris has asked for "a little faith"
that his idea will work for all stated requirements, without deeming
it necessary to explain how.

When probed on technical details, he stated that he doesn't like to
plan that far ahead, and prefers having somebody else iron out the
implementation details. He also referred to high-level slides from his
LSFMM '24 session - which was received thusly[1]:

  Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".

  Jan Kara said that existing filesystem designs are not suited to this task

  Hildenbrand said that this plan was introducing too much complexity

His first response to criticism was to invoke his <4 week status of
swap maintainer.

Meanwhile, the design direction that Chris is construing as a single
company conspiracy is anything but. The collaborative origins of these
patches are well documented. Chris was CC'd on those RFCs. He notably
did not engage in them. He is now lying about the narrative and
choosing to attack these patches in bad faith and out of context.

This pattern of behavior gives me low confidence that Chris is able to
collaborate and compromise on a design that works for all users.

And while Chris has been quite vocal and opinionated in mailing list
discussions, his actual code contributions to the kernel do not
instill confidence that he can solve this problem by himself, either.

[1] https://lwn.net/Articles/974587/

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Mon, Dec 1, 2025 at 8:43 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> For the benefit of anybody following this from the sidelines, the
> third zswap maintainer also expressed concerns about Chris's proposal
> upthread. He works for the same company as Chris.

Yes, I don't know who's interest Yosry's represent. We have a
disagreement on the swap abstraction 2023 that is why I have an
alternative proposal. The community back then strongly favored my
proposal. I guess Yosry just hasn't graduated from that yet.

>
> The reality is that Chris is failing to convince others of his design
> direction, and is now obviously resorting to manipulation and hominem
> attacks.

Now we can't even talk about technical and  move to personal attacks,
is that all you have left in you?

> During the course of this thread, Chris has asked for "a little faith"
> that his idea will work for all stated requirements, without deeming
> it necessary to explain how.

More FUD please.

> When probed on technical details, he stated that he doesn't like to
> plan that far ahead, and prefers having somebody else iron out the
> implementation details. He also referred to high-level slides from his
> LSFMM '24 session - which was received thusly[1]:
>
>   Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".

Yes, we deal with that pain. Swap table is the outcome so we don't
further impose pain to maintain file cache vs swap cache where a lot
of swap specific optimization will be painful for the file cache side.
As far as I am concerned, the most painful part, swap table as the new
swap cache has already landed. We did not cause Matthew pain in the
process.

>   Jan Kara said that existing filesystem designs are not suited to this task
>
>   Hildenbrand said that this plan was introducing too much complexity
>

More personal attacks please.

> His first response to criticism was to invoke his <4 week status of
> swap maintainer.

I take that back and apologize for what I say and you accept it as "no
hard feelings".
Do you mean you don't mean what you say?

> Meanwhile, the design direction that Chris is construing as a single
> company conspiracy is anything but. The collaborative origins of these
> patches are well documented. Chris was CC'd on those RFCs. He notably

I feel the 48 bytes overhead is a joke, I already provide my feedback
against it in the 2023 LSF swap abstraction. I don't like to keep
beating the dead horse.

> did not engage in them. He is now lying about the narrative and
> choosing to attack these patches in bad faith and out of context.

More FUD and personal attack, is that all you can output now?

>
> This pattern of behavior gives me low confidence that Chris is able to
> collaborate and compromise on a design that works for all users.
>
> And while Chris has been quite vocal and opinionated in mailing list
> discussions, his actual code contributions to the kernel do not
> instill confidence that he can solve this problem by himself, either.
>
> [1] https://lwn.net/Articles/974587/

You obviously haven't graduated from the fact that most of the swap
core is my design now, in the current kernel. There will be more.
More personal attacks please, I am ignoring the attack in the order I
received. It seems that is what is left of you, personal attacks to
dominate a technical discussion when the technical is losing.

Sign, this is an example case study of upstream bullying and that is
why sometimes upstream submission is very unfriendly for the less
established person. I personally know people who were  bullied by you
and give up upstream contributions completely. Go ahead and try to add
me to the list. That will win you more followers. More people will
enjoy working with you.

I agree I am not a native English speaker. I will lose to you in a
bullying shout out flight, you win in.

Let's compete in code and benchmarks and see what happens.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Kairui Song 2 months, 1 week ago

On Tue, Dec 2, 2025 at 12:47 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote:
> > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > > >
> > > > > Sorry, I am talking about upstream.
> > > >
> > > > So far I have not had a pleasant upstream experience when submitting
> > > > this particular patch to upstream.
> > > >
> > > > > I really appreciate anybody participating in Linux
> > > > > kernel development. Linux is good because different
> > > > > people bring different perspectives to the table.
> > > >
> > > > Of course everybody is welcome. However, NACK without technical
> > > > justification is very bad for upstream development. I can't imagine
> > > > what a new hacker would think after going through what I have gone
> > > > through for this patch. He/she will likely quit contributing upstream.
> > > > This is not the kind of welcome we want.
> > > >
> > > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > > Sorry there is no other way to sugar coat it.
> > >
> > > I am NOT the only zswap maintainer who expresses concerns. Other
> > > people also have their misgivings, so I have let them speak and not
> > > put words in their mouths.
> >
> > You did not mention the fact that both two NACK from zswap maintainers
> > are from the same company. I assume you have some kind of team sync.
> > There is a term for that, called "person acting in concert".
>
> For the benefit of anybody following this from the sidelines, the
> third zswap maintainer also expressed concerns about Chris's proposal
> upthread. He works for the same company as Chris.
>
> The reality is that Chris is failing to convince others of his design
> direction, and is now obviously resorting to manipulation and hominem
> attacks.
>
> During the course of this thread, Chris has asked for "a little faith"
> that his idea will work for all stated requirements, without deeming
> it necessary to explain how.
>
> When probed on technical details, he stated that he doesn't like to
> plan that far ahead, and prefers having somebody else iron out the
> implementation details. He also referred to high-level slides from his
> LSFMM '24 session - which was received thusly[1]:
>
>   Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".
>
>   Jan Kara said that existing filesystem designs are not suited to this task
>
>   Hildenbrand said that this plan was introducing too much complexity
>
> His first response to criticism was to invoke his <4 week status of
> swap maintainer.
>
> Meanwhile, the design direction that Chris is construing as a single
> company conspiracy is anything but. The collaborative origins of these
> patches are well documented. Chris was CC'd on those RFCs. He notably
> did not engage in them. He is now lying about the narrative and
> choosing to attack these patches in bad faith and out of context.
>
> This pattern of behavior gives me low confidence that Chris is able to
> collaborate and compromise on a design that works for all users.
>
> And while Chris has been quite vocal and opinionated in mailing list
> discussions, his actual code contributions to the kernel do not
> instill confidence that he can solve this problem by himself, either.

Hi all,

I’d really prefer we all let things cool off a bit before the thread
gets too dramatic. :)

Sorry to see that the discussion went quite off topic, still I believe
this is some kind of misunderstanding on Chris' intention to improve
the kernel in a more generic way.

From my perspective, Chris did co-developed, suggested, reviewed or
authored many of the implementation details around the swap-table
idea, and he implemented the swap cluster allocator in 6.11, which
unlocked a bunch of follow-on optimizations.

I’ve been working on swap for a while as well and have rewritten and
refactored large parts of the swap, swap allocator and swap cache
(mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah,
I’m not a kernel vet with decades of patches yet, but I do think I'm
familiar enough with swap. I think Chris' work, words or code, has
been looking good in the end results.

It's hard to put a penthouse on a sandcastle, and maybe that's the
reason makes it hard to describe or layout the further implementations
of swap.

We all struggled with swap subsystem a lot, the code base served us
well, but it had accumulated a lot of historical complexity and
awkward workarounds overtime (we had so many people in the community
complaining about it for so many years). I think we all agree that
pursuing incremental cleanups and improvement (eg. swap slot cache
cleanup, swap lock cleanup, swap_has_cache cleanup,
direct-swap workarounds removal, etc.) is more suitable upstream.
Chris also help a lot on this (eg. the LPC talk last year) and we
finally got rid of many long time burdens, quite some of these
works are directly enabled by his swap allocator rework first.

And I do have a more completed branch that I posted several times
showing the end results of swap tables have better memory consumption
& performance, and the code is much simpler than what we had in
upstream. It's getting merged step by step, and each step is a gain. I
believe that is the right way to improve things upstream, everyone and
every workload benefits, and progressively. And based on that, we will
be able to implement things much easier.

I believe things will look much better and cleaner as we process (eg.
resizing might be doable for generic swap too), and make it easier for
all of us, and make the swap subsystem better in a collaborative way.

Cheers.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Johannes Weiner 2 months, 1 week ago

On Tue, Dec 02, 2025 at 03:49:22AM +0800, Kairui Song wrote:
> From my perspective, Chris did co-developed, suggested, reviewed or
> authored many of the implementation details around the swap-table
> idea, and he implemented the swap cluster allocator in 6.11, which
> unlocked a bunch of follow-on optimizations.
> 
> I’ve been working on swap for a while as well and have rewritten and
> refactored large parts of the swap, swap allocator and swap cache
> (mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah,
> I’m not a kernel vet with decades of patches yet, but I do think I'm
> familiar enough with swap. I think Chris' work, words or code, has
> been looking good in the end results.

I have absolute respect for your work. And if you say Chris was
instrumental to getting it done, I will take your word for it.

> It's hard to put a penthouse on a sandcastle, and maybe that's the
> reason makes it hard to describe or layout the further implementations
> of swap.

Sure, I can understand that. However, I think the conflict is not
necessarily about implementation strategy, it's about priorities.

We have a usecase. We have a functional implementation that satisfies
this usecase. It was sent as RFCs early on to gain consensus on the
direction and find the best tradeoffs wrt other usecases. These RFC
threads are the place to voice concerns and influence direction.

Both Chris and you have stated that further swap table work *may* also
enable this usecase. However, at this time, I think it's also fair to
say that it's more of an afterthought, and no concrete design or code
for how this would actually look like has been proposed. High-level
ideas have been floated, but as you can see from Nhat, Rik's, Yosry's
and my replies, they don't really meet the necessary requirements.

This is not some principled stance. The virtual swap patches are
difficult to develop, especially given the current rate of change of
the underlying swap codebase. If anybody working on vswap had seen a
plausible way to solve these issues through incremental swap table
improvements they would have jumped on it a long time ago.

It's more about priorities. Combining compression with disk swap is
extremely powerful, because it dramatically reduces the worst aspects
of both: it reduces the memory footprint of compression by shedding
the coldest data to disk; it reduces the IO latencies and flash wear
of disk swap through the writeback cache. In practice, this reduces
*average event rates of the entire reclaim/paging/IO stack*.

These are higher-order overhead savings that are difficult to beat
with first-order descriptor and lookup cost optimizations.

We obviously want to have both, but they are orthogonal goals. You can
see how it doesn't make sense for us to deprioritize the former for
the latter, or why Nhat says it's an apples to oranges comparison.

It also won't work for one party to say "we will fix this, give us
time". Everybody wants to advance the thing they primarily care about
with the least amount of detours. That's where we have to find
compromise. Either let people pursue what's most important to them, or
lay out an encompassing design to build consensus and organize effort.

And yes, let's please stay technical and on-topic in these
discussions. Let's acknowledge we have interests that overlap, and
interests that do not. Then find ways to service everybody's usecases.

Disagreements are part of the game. There is no need to get personal,
pull rank, or make accusations to dismiss somebody else's clearly
stated usecase, perspective, or objections.

The best way to avoid this is to make technical statements, and reply
with technical responses where those statements are made.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 1 week ago

On Tue, Dec 2, 2025 at 9:02 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > It's hard to put a penthouse on a sandcastle, and maybe that's the
> > reason makes it hard to describe or layout the further implementations
> > of swap.
>
> Sure, I can understand that. However, I think the conflict is not
> necessarily about implementation strategy, it's about priorities.

Right, that is why we take the more incremental approach. Cleanup and
simplify the swap code in order to make later stage things happen.

> We have a usecase. We have a functional implementation that satisfies
> this usecase. It was sent as RFCs early on to gain consensus on the
> direction and find the best tradeoffs wrt other usecases. These RFC
> threads are the place to voice concerns and influence direction.

Speak of priority. We have a more priority approach by incrementally
landing code. The priority reflects that we are actually landing code,
improving it towards that glorified goal.

I have already expressed concern and the 2023 LSF swap abstraction
talk the community already picked the winner. Not the one VS is based
on. Rebooting that without addressing the previous concern is a waste
of everybody's time. You basically say the community picks the wrong
one. Let's retry again.

> Both Chris and you have stated that further swap table work *may* also
> enable this usecase. However, at this time, I think it's also fair to
> say that it's more of an afterthought, and no concrete design or code
> for how this would actually look like has been proposed. High-level
> ideas have been floated, but as you can see from Nhat, Rik's, Yosry's
> and my replies, they don't really meet the necessary requirements.

The VS doesn't meet the requirement of upstream and other companies
that do not unnecessarily blow up the kernel memory usage. One fight
at a time, sorry I have to get the VS out of the way before I comment
on other aspects of this patch.

> This is not some principled stance. The virtual swap patches are
> difficult to develop, especially given the current rate of change of
> the underlying swap codebase. If anybody working on vswap had seen a
> plausible way to solve these issues through incremental swap table
> improvements they would have jumped on it a long time ago.
>
> It's more about priorities. Combining compression with disk swap is
> extremely powerful, because it dramatically reduces the worst aspects
> of both: it reduces the memory footprint of compression by shedding
> the coldest data to disk; it reduces the IO latencies and flash wear
> of disk swap through the writeback cache. In practice, this reduces
> *average event rates of the entire reclaim/paging/IO stack*.
>
> These are higher-order overhead savings that are difficult to beat
> with first-order descriptor and lookup cost optimizations.
>
> We obviously want to have both, but they are orthogonal goals. You can
> see how it doesn't make sense for us to deprioritize the former for
> the latter, or why Nhat says it's an apples to oranges comparison.

My advice is that, make it incremental, come up with production
quality solutions.  Adding one layer of XArry to redirect is easy. The
hard part is how to keep the memory usage in check and perform well.
The posted VS will give you false illusion of progress because it
doesn't have a clear way to address the performance and memory usage
problem which can meet the production quality requirement for
upstream. The upstream kernel is not a toy kernel.

> It also won't work for one party to say "we will fix this, give us
> time". Everybody wants to advance the thing they primarily care about

Exactly, the same applies to VS. Even if I can spend the time
convincing you the grant vision and you are buying it. Who guarantees
that vision can be implemented without the surprise black swan
assumption that set us back to the drawing board? So landing the
actual improvement is the king. That is the real progress. If your
team want to get the result sooner, helping swap table landing would
speed up your goal. Once the code base is cleaned up. It is much
easier to move in ANY direction.

> with the least amount of detours. That's where we have to find
> compromise. Either let people pursue what's most important to them, or
> lay out an encompassing design to build consensus and organize effort.

This is just a development methodology, a personal choice. That is why
the swap table is landing as of now.

> And yes, let's please stay technical and on-topic in these
> discussions. Let's acknowledge we have interests that overlap, and
> interests that do not. Then find ways to service everybody's usecases.

Yes, I agree.

> Disagreements are part of the game. There is no need to get personal,
> pull rank, or make accusations to dismiss somebody else's clearly
> stated usecase, perspective, or objections.

Also agree.
>
> The best way to avoid this is to make technical statements, and reply
> with technical responses where those statements are made.

Yes, exactly. That is why I want to get a straight answer about the VS
slot overhead number. Writing a grant design doc is a much bigger
task.  It will put non native English speakers at a disadvantage for
writing the big design docs. A lot of us don't have the luxury of
contributing to the upstream as a day job, me included. We do it in
our spare times because we love it.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 2 weeks ago

On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> wrote:
> > 
> > 
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in
> > zswap
> > don't occupy disk slots; but for a solution to this to be
> > acceptable,
> > it has to work with the primary usecase and support disk writeback.
> 
> Well, my plan is to support the writeback via swap.tiers.
> 
How would you do writeback from a zswap entry in
a ghost swapfile, to a real disk swap backend?

That is the use case people are trying to solve.

How would your architecture address it?

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> > wrote:
> > >
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in
> > > zswap
> > > don't occupy disk slots; but for a solution to this to be
> > > acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
> >
> How would you do writeback from a zswap entry in
> a ghost swapfile, to a real disk swap backend?

Basically, each swap file has its own version swap
ops->{read,write}_folio(). The mem swap tier is similar to the current
zswap but it is memory only, there is no file backing and don't share
swap entries with the real swapfile.

When writing back from one swap entry to another swapfile, for the
simple case of uncompressing the data, data will store to swap cache
and write to another swapfile with allocated another swap entry. The
front end of the swap cache will have the option map the front end
swap entry offset to the back end block locations. At the memory price
of 4 byte per swap entry.
This kind of physical block redirection not only happens in more than
one swapfile, it can happen in the same swapfile, in the situation
that there is available space in lower order swap entries. But can not
allocate the  higher order one because those lower order ones are not
continued. In such a case, the swap file can expand the high order
swap entry beyond the end of the current physical swapfile. Then map
two continues high order swap entry into the low order physical
locations. I have some slides I shared in the 2024 LSF the swap pony
talk with some diagrams for that physical swap location redirection.

> That is the use case people are trying to solve.

Yes, me too.

> How would your architecture address it?

The cluster base swap allocator, swap table as the new swap cache, per
cgroup swap.tiers and the vfs like swap ops all integrally work
together as the grant vision for the new swap system. I might not have
an answer for all the design details right now. I am the type of
person who likes to improvise and adjust the design details when more
detailed design constraints are found. So far I found this design can
work well. Some of the early milestones, swap allocator and swap
tables which already landed in the kernel and show great results.

I consider this is much better than the VS (previous swap astraction).
It does not enforce pain like the VS does. One of the big downsides of
VS is that, once applied to the kernel. Even normal swap does not use
redirection and will pay the price for it as well. The pain is
mandatory. My swap.tiers write back does not have this problem. If no
writeback or not redirection of physical blocks, no additional
overhead pay for memory nor CPU.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Rik van Riel 2 months, 2 weeks ago

On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > <hannes@cmpxchg.org>
> > > wrote:
> > > > 
> > > > 
> > > > Zswap is primarily a compressed cache for real swap on
> > > > secondary
> > > > storage. It's indeed quite important that entries currently in
> > > > zswap
> > > > don't occupy disk slots; but for a solution to this to be
> > > > acceptable,
> > > > it has to work with the primary usecase and support disk
> > > > writeback.
> > > 
> > > Well, my plan is to support the writeback via swap.tiers.
> > > 
> > How would you do writeback from a zswap entry in
> > a ghost swapfile, to a real disk swap backend?
> 
> Basically, each swap file has its own version swap
> ops->{read,write}_folio(). The mem swap tier is similar to the
> current
> zswap but it is memory only, there is no file backing and don't share
> swap entries with the real swapfile.
> 
> When writing back from one swap entry to another swapfile, for the
> simple case of uncompressing the data, data will store to swap cache
> and write to another swapfile with allocated another swap entry. The
> front end of the swap cache will have the option map the front end
> swap entry offset to the back end block locations. At the memory
> price
> of 4 byte per swap entry.

Wait, so you use the swap cache radix tree to
indicate the physical location of data between
multiple swap devices?

Isn't that exactly what the vswap approach
does, too?

How is this different?

-- 
All Rights Reversed.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 8:43 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> > wrote:
> > >
> > > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > > <hannes@cmpxchg.org>
> > > > wrote:
> > > > >
> > > > >
> > > > > Zswap is primarily a compressed cache for real swap on
> > > > > secondary
> > > > > storage. It's indeed quite important that entries currently in
> > > > > zswap
> > > > > don't occupy disk slots; but for a solution to this to be
> > > > > acceptable,
> > > > > it has to work with the primary usecase and support disk
> > > > > writeback.
> > > >
> > > > Well, my plan is to support the writeback via swap.tiers.
> > > >
> > > How would you do writeback from a zswap entry in
> > > a ghost swapfile, to a real disk swap backend?
> >
> > Basically, each swap file has its own version swap
> > ops->{read,write}_folio(). The mem swap tier is similar to the
> > current
> > zswap but it is memory only, there is no file backing and don't share
> > swap entries with the real swapfile.
> >
> > When writing back from one swap entry to another swapfile, for the
> > simple case of uncompressing the data, data will store to swap cache
> > and write to another swapfile with allocated another swap entry. The
> > front end of the swap cache will have the option map the front end
> > swap entry offset to the back end block locations. At the memory
> > price
> > of 4 byte per swap entry.
>
> Wait, so you use the swap cache radix tree to
> indicate the physical location of data between
> multiple swap devices?

Ah, you haven't caught up with the progress that the new swap cache
does not use radix trees any more. It is using swap tables. It is a
512 entry swpa table array lookup, no tree lookup. Much faster with
less locks. The swap table commit shows there are about 20% difference
in throughput in some test benchmark workloads.

> Isn't that exactly what the vswap approach
> does, too?

Except that I purpose it earlier.
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
That swap cache physcial entry redirection is my original idea as far
as I can tell and presented in the conference earlier.

> How is this different?

The main difference will be I just get rid of the xarray in swap cache
lookup. I don't want to re-introduce it again.
Also in my swap.tiers design, the redirection overhead is optional. If
you are not using redirection, in swap.tiers swpa ops you don't pay
for it. Just like the ghost swap file. VS it is not optional, will
enforce the overhead as well. In my design the memory overhead will be
smaller per swap entry because it will be integrated tightly with swap
entry.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.
>
> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
>
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> table + swap.tiers design can do that as well. I have concerns about
> swap virtualization in the aspect of adding another layer of memory
> overhead addition per swap entry and CPU overhead of extra xarray
> lookup. I believe my approach is technically superior and cleaner.

True, but the static nature of the current swapfile infrastructure
also imposes an space overhead and/or operational overhead.

I did play around with a prototype with a ghost swapfile for virtual
swap, but had to stop because of the swapfile overhead for larger
virtual swap space.

> Both faster and cleaner. Basically swap.tiers + VFS like swap read
> write page ops. I will let Nhat clarify the performance and memory

That just solves static placement, no? Backend transfer requires
something extra/orthogonal.

> overhead side of the swap virtualization.
>
> I am not against swap entry redirection. Just the swap virtualization

There will be redirection either way. I don't think it's avoidable.
The only option is whether to shove it into the backend (what zram is
doing), or having a generalized module (swap virtualization).

Or do a page table walk every time you want to do backend transfer
(what swapoff is doing).

> series needs to compare against the alternatives in terms of memory
> overhead and throughput.
> Solving it from the swap.tiers angle is cleaner.
>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> I take that the only relevant part is you are zswap maintainer and I
> am the swap maintainer. Fine. I got the message. I will leave the
> zswap alone. I will find other ways to address the memory base swap
> tiers in swap.tiers.

Please keep this discussion technical and not pull ranks unnecessarily.

>
> Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Kairui Song 2 months, 2 weeks ago

On Sat, Nov 22, 2025 at 10:09 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.

That sounds interesting. Have been watching YoungJun and yours
swap.tiers discussion for a while, looking forward to see how they
play together.

Using tiering to resolve the writeback issue sounds like a nice
solution, we definitely don't want to limit the writeback to
zswap/ram-block only, we will also want things like
block-block writeback.

We (and I have noticed many community users) have setups involving
hybrid tiers. We have a internal module that moves swap entry from SSD
to HDD too. To do it upstreamly we need something like the swap.tiers.

>
> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
>
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> ...
> Solving it from the swap.tiers angle is cleaner.

Agree with the swap.tiers part, that sounds cleaner.

>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

I think that's too early to justify. Let's stay open for ideas.

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
>
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.

Would this also affect the swap slot allocation algorithm?

>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.

I don't like this design:

1. Statically sizing the compression tier will be an operational
nightmare, for users that have to support a variety (and increasingly
bigger sized) types of hosts. It's one of the primary motivations of
the virtual swap line of work. We need to move towards a more dynamic
architecture for zswap, not the other way around, in order to reduce
both (human's) operational overhead, AND actual space overhead (i.e
only allocate (z)swap metadata on-demand).

2. This digs us in the hole of supporting a special infrastructure for
non-writeback cases. Now every future change to zswap's architecture
has to take this into account. It's not easy to turn this design into
something that can support writeback - you're stuck with either having
to do an expensive page table walk to update the PTEs, or shoving the
virtual swap layer inside zswap. Ugly.

3. And what does this even buy us? Just create a fake in-memory-only
swapfile (heck, you can use zram), disable writeback (which you can do
both at a cgroup and host-level), and call it a day.

Nacked-by: Nhat Pham <nphamcs@gmail.com>

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
>
> Would this also affect the swap slot allocation algorithm?
>
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
>
> I don't like this design:
>
> 1. Statically sizing the compression tier will be an operational
> nightmare, for users that have to support a variety (and increasingly
> bigger sized) types of hosts. It's one of the primary motivations of
> the virtual swap line of work. We need to move towards a more dynamic
> architecture for zswap, not the other way around, in order to reduce
> both (human's) operational overhead, AND actual space overhead (i.e
> only allocate (z)swap metadata on-demand).

Let's do it one step at a time.

> 2. This digs us in the hole of supporting a special infrastructure for
> non-writeback cases. Now every future change to zswap's architecture
> has to take this into account. It's not easy to turn this design into
> something that can support writeback - you're stuck with either having
> to do an expensive page table walk to update the PTEs, or shoving the
> virtual swap layer inside zswap. Ugly.

What are you talking about? This patch does not have any page table
work. You are opposing something in your imagination. Please show me
the code in which I do expensive PTE walks.

> 3. And what does this even buy us? Just create a fake in-memory-only
> swapfile (heck, you can use zram), disable writeback (which you can do
> both at a cgroup and host-level), and call it a day.

Well this provides users a choice, if they don't care about write
backs. They can do zswap with ghost swapfile now without actually
wasting disk space.

It also does not stop zswap using write back with normal SSD. If you
want to write back, you can still use a non ghost swapfile as normal.

It is a simple enough patch to provide value right now. It also fits
into the swap.tiers long term roadmap to have a seperate tier for
memory based swapfiles. I believe that is a cleaner picture than the
current zswap as cache but also gets its hands so deep into the swap
stack and slows down other swap tiers.

> Nacked-by: Nhat Pham <nphamcs@gmail.com>

I heard  you, if you don't don't want zswap to have anything to do
with memory based swap tier in the swap.tiers design. I respect your
choice.

Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Nhat Pham 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Would this also affect the swap slot allocation algorithm?
> >
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> >
> > I don't like this design:
> >
> > 1. Statically sizing the compression tier will be an operational
> > nightmare, for users that have to support a variety (and increasingly
> > bigger sized) types of hosts. It's one of the primary motivations of
> > the virtual swap line of work. We need to move towards a more dynamic
> > architecture for zswap, not the other way around, in order to reduce
> > both (human's) operational overhead, AND actual space overhead (i.e
> > only allocate (z)swap metadata on-demand).
>
> Let's do it one step at a time.

I'm happy with landing these patches one step at a time. But from my
POV (and admittedly limited imagination), it's a bit of a deadend.

The only architecture, IMO, that satisfies:

1. Dynamic overhead of (z)swap metadata.

2. Decouple swap backends, i.e no pre-reservation of lower tiers space
(what zswap is doing right now).

3. Backend transfer without page table walks.

is swap virtualization.

If you want to present an alternative vision, you don't have to
implement it right away, but you have to at least explain to me how to
achieve all these 3.

>
> > 2. This digs us in the hole of supporting a special infrastructure for
> > non-writeback cases. Now every future change to zswap's architecture
> > has to take this into account. It's not easy to turn this design into
> > something that can support writeback - you're stuck with either having
> > to do an expensive page table walk to update the PTEs, or shoving the
> > virtual swap layer inside zswap. Ugly.
>
> What are you talking about? This patch does not have any page table
> work. You are opposing something in your imagination. Please show me
> the code in which I do expensive PTE walks.

Please read my response again. I did not say you did any PTE walk in this patch.

What I meant was, if you want to make this the general architecture
for zswap and not some niche infrastructure for specialized use case,
you need to be able to support backend transfer, i.e zswap writeback
(zswap -> disk swap, and perhaps in the future the other direction).
This will be very expensive with this design.

>
> > 3. And what does this even buy us? Just create a fake in-memory-only
> > swapfile (heck, you can use zram), disable writeback (which you can do
> > both at a cgroup and host-level), and call it a day.
>
> Well this provides users a choice, if they don't care about write
> backs. They can do zswap with ghost swapfile now without actually
> wasting disk space.
>
> It also does not stop zswap using write back with normal SSD. If you
> want to write back, you can still use a non ghost swapfile as normal.
>
> It is a simple enough patch to provide value right now. It also fits
> into the swap.tiers long term roadmap to have a seperate tier for
> memory based swapfiles. I believe that is a cleaner picture than the
> current zswap as cache but also gets its hands so deep into the swap
> stack and slows down other swap tiers.
>
> > Nacked-by: Nhat Pham <nphamcs@gmail.com>
>
> I heard  you, if you don't don't want zswap to have anything to do
> with memory based swap tier in the swap.tiers design. I respect your
> choice.

Where does this even come from?

I can't speak for Johannes or Yosry, but personally I'm ambivalent
with respect to swap.tiers. My only objection in the past was there
was not any use case at a time, but there seems to be one now. I won't
stand in the way of swap.tiers landing, or zswap's integration into
it.

From my POV, swap.tiers solve a problem completely orthogonal to what
I'm trying to solve, namely, the three points listed above. It's about
definition of swap hierarchy, either at initial placement time, or
during offloading from one backend to another, where as I'm trying to
figure out the mechanistic side of it (how to transfer a page from one
backend to another without page table walking). These two are
independent, if not synergistic.

>
> Chris

Re: [PATCH RFC] mm: ghost swapfile support for zswap

Posted by Chris Li 2 months, 2 weeks ago

On Mon, Nov 24, 2025 at 5:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Would this also affect the swap slot allocation algorithm?
> > >
> > > >
> > > > The zswap write back has been disabled if all swapfiles in the system
> > > > are ghost swap files.
> > >
> > > I don't like this design:
> > >
> > > 1. Statically sizing the compression tier will be an operational
> > > nightmare, for users that have to support a variety (and increasingly
> > > bigger sized) types of hosts. It's one of the primary motivations of
> > > the virtual swap line of work. We need to move towards a more dynamic
> > > architecture for zswap, not the other way around, in order to reduce
> > > both (human's) operational overhead, AND actual space overhead (i.e
> > > only allocate (z)swap metadata on-demand).
> >
> > Let's do it one step at a time.
>
> I'm happy with landing these patches one step at a time. But from my
> POV (and admittedly limited imagination), it's a bit of a deadend.
>
> The only architecture, IMO, that satisfies:
>
> 1. Dynamic overhead of (z)swap metadata.
>
> 2. Decouple swap backends, i.e no pre-reservation of lower tiers space
> (what zswap is doing right now).
>
> 3. Backend transfer without page table walks.
>
> is swap virtualization.
>
> If you want to present an alternative vision, you don't have to
> implement it right away, but you have to at least explain to me how to
> achieve all these 3.

From 1,2,3 to SV as the only solution is  a big jump. How many
possibilities have you explored to conclude that no other solution can
satisfy your 123?

I just replied to Rik's email about the high level sketch design. My
design should satisfy it and can serve as one counter example of
alternative design.


>
> >
> > > 2. This digs us in the hole of supporting a special infrastructure for
> > > non-writeback cases. Now every future change to zswap's architecture
> > > has to take this into account. It's not easy to turn this design into
> > > something that can support writeback - you're stuck with either having
> > > to do an expensive page table walk to update the PTEs, or shoving the
> > > virtual swap layer inside zswap. Ugly.
> >
> > What are you talking about? This patch does not have any page table
> > work. You are opposing something in your imagination. Please show me
> > the code in which I do expensive PTE walks.
>
> Please read my response again. I did not say you did any PTE walk in this patch.
>
> What I meant was, if you want to make this the general architecture
> for zswap and not some niche infrastructure for specialized use case,
> you need to be able to support backend transfer, i.e zswap writeback
> (zswap -> disk swap, and perhaps in the future the other direction).
> This will be very expensive with this design.

I can't say I agree with you. It seems you have made a lot of
assumptions in your reasoning.

> > > 3. And what does this even buy us? Just create a fake in-memory-only
> > > swapfile (heck, you can use zram), disable writeback (which you can do
> > > both at a cgroup and host-level), and call it a day.
> >
> > Well this provides users a choice, if they don't care about write
> > backs. They can do zswap with ghost swapfile now without actually
> > wasting disk space.
> >
> > It also does not stop zswap using write back with normal SSD. If you
> > want to write back, you can still use a non ghost swapfile as normal.
> >
> > It is a simple enough patch to provide value right now. It also fits
> > into the swap.tiers long term roadmap to have a seperate tier for
> > memory based swapfiles. I believe that is a cleaner picture than the
> > current zswap as cache but also gets its hands so deep into the swap
> > stack and slows down other swap tiers.
> >
> > > Nacked-by: Nhat Pham <nphamcs@gmail.com>
> >
> > I heard  you, if you don't don't want zswap to have anything to do
> > with memory based swap tier in the swap.tiers design. I respect your
> > choice.
>
> Where does this even come from?
>
> I can't speak for Johannes or Yosry, but personally I'm ambivalent
> with respect to swap.tiers. My only objection in the past was there
> was not any use case at a time, but there seems to be one now. I won't
> stand in the way of swap.tiers landing, or zswap's integration into
> it.
>
> From my POV, swap.tiers solve a problem completely orthogonal to what
> I'm trying to solve, namely, the three points listed above. It's about
> definition of swap hierarchy, either at initial placement time, or
> during offloading from one backend to another, where as I'm trying to
> figure out the mechanistic side of it (how to transfer a page from one
> backend to another without page table walking). These two are
> independent, if not synergistic.

I think our goal overlaps, just a different approach with different
performance charistic.
I have asked in this thread a few times, how big is the per swap slot
memory overhead VS introduced?
That is something that I care about a lot.

Chris