[PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week, 3 days ago
Currently zswap requires a backing swapfile. A swap slot used by zswap
cannot also be used by the swapfile, which wastes swapfile space.

A ghost swapfile is a swapfile that contains only the swapfile header,
for use by zswap. The swapfile header indicates the size of the
swapfile. There is no swap data section in a ghost swapfile, so no
swapfile space is wasted. As a consequence, any write to a ghost
swapfile will fail. To prevent accidental reads or writes of a ghost
swapfile, the bdev of swap_info_struct is set to NULL. A ghost swapfile
also sets the SSD flag because there is no rotating disk access when
using zswap.

Zswap writeback is disabled if all swapfiles in the system are ghost
swapfiles.
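
For illustration, a one-page ghost swapfile can in principle be created
from user space by writing a standard SWAPSPACE2 header whose last_page
field advertises the desired amount of swap space. The sketch below is
not part of this patch and is only meant to show the idea: the path and
the advertised size are made up, error handling is omitted, and mkswap
normally takes care of writing this header for real swapfiles.

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/swap.h>
#include <unistd.h>

#define PAGE		4096u		/* assumes 4 KiB pages */
#define NR_PAGES	(1u << 20)	/* advertise ~4 GiB of zswap-only swap */

int main(void)
{
	unsigned char hdr[PAGE] = { 0 };
	uint32_t version = 1, last_page = NR_PAGES - 1;
	int fd = open("/swap/ghost", O_CREAT | O_TRUNC | O_WRONLY, 0600);

	/*
	 * SWAPSPACE2 on-disk layout: info.version at offset 1024,
	 * info.last_page at offset 1028, magic in the last 10 bytes.
	 */
	memcpy(hdr + 1024, &version, sizeof(version));
	memcpy(hdr + 1028, &last_page, sizeof(last_page));
	memcpy(hdr + PAGE - 10, "SWAPSPACE2", 10);
	write(fd, hdr, sizeof(hdr));
	close(fd);

	/*
	 * swapon() then sees i_size == PAGE_SIZE and, with this patch,
	 * marks the swap device SWP_GHOST | SWP_SOLIDSTATE.
	 */
	return swapon("/swap/ghost", 0);
}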

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 include/linux/swap.h |  2 ++
 mm/page_io.c         | 18 +++++++++++++++---
 mm/swap.h            |  2 +-
 mm/swap_state.c      |  7 +++++++
 mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
 mm/zswap.c           | 17 +++++++++++------
 6 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -216,6 +216,7 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
+	SWP_GHOST	= (1 << 13),	/* not backed by anything */
 					/* add others here before... */
 };
 
@@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
 void free_pages_and_swap_cache(struct encoded_page **, int);
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
+extern atomic_t nr_real_swapfiles;
 extern long total_swap_pages;
 extern atomic_t nr_rotate_swap;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
 
-	__swap_writepage(folio, swap_plug);
-	return 0;
+	return __swap_writepage(folio, swap_plug);
 out_unlock:
 	folio_unlock(folio);
 	return ret;
@@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+	if (sis->flags & SWP_GHOST) {
+		/* Prevent the page from getting reclaimed. */
+		folio_set_dirty(folio);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
 	/*
 	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
 	 * but that will never affect SWP_FS_OPS, so the data_race
@@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 		swap_writepage_bdev_sync(folio, sis);
 	else
 		swap_writepage_bdev_async(folio, sis);
+	return 0;
 }
 
 void swap_write_unplug(struct swap_iocb *sio)
@@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	if (zswap_load(folio) != -ENOENT)
 		goto finish;
 
+	if (unlikely(sis->flags & SWP_GHOST)) {
+		folio_unlock(folio);
+		goto finish;
+	}
+
 	/* We have to read from slower devices. Increase zswap protection. */
 	zswap_folio_swapin(folio);
 
diff --git a/mm/swap.h b/mm/swap.h
index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 }
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swap_space __ro_after_init;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
 
+	/*
+	 * The entry may have been freed by another task. Avoid swap_info_get()
+	 * which will print error message if the race happens.
+	 */
+	if (si->flags & SWP_GHOST)
+		goto skip;
+
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 atomic_long_t nr_swap_pages;
+atomic_t nr_real_swapfiles;
 /*
  * Some modules use swappable objects and may try to swap them out under
  * memory pressure (via the shrinker). Before doing so, they may wish to
@@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 			goto skip;
 	}
 
+	if (!(si->flags & SWP_GHOST))
+		atomic_sub(1, &nr_real_swapfiles);
 	plist_del(&si->avail_list, &swap_avail_head);
 
 skip:
@@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 	}
 
 	plist_add(&si->avail_list, &swap_avail_head);
+	if (!(si->flags & SWP_GHOST))
+		atomic_add(1, &nr_real_swapfiles);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	struct inode *inode = mapping->host;
 	int ret;
 
+	if (sis->flags & SWP_GHOST) {
+		*span = 0;
+		return 0;
+	}
+
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
@@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
 
-	if (!p->bdev || !bdev_nonrot(p->bdev))
+	if (!(p->flags & SWP_GHOST) &&
+	    (!p->bdev || !bdev_nonrot(p->bdev)))
 		atomic_dec(&nr_rotate_swap);
 
 	mutex_lock(&swapon_mutex);
@@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
 	mutex_unlock(&swapon_mutex);
 }
 
+static const char *swap_type_str(struct swap_info_struct *si)
+{
+	struct file *file = si->swap_file;
+
+	if (si->flags & SWP_GHOST)
+		return "ghost\t";
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		return "partition";
+
+	return "file\t";
+}
+
 static int swap_show(struct seq_file *swap, void *v)
 {
 	struct swap_info_struct *si = v;
@@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	len = seq_file_path(swap, file, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
 			len < 40 ? 40 - len : 1, " ",
-			S_ISBLK(file_inode(file)->i_mode) ?
-				"partition" : "file\t",
+			swap_type_str(si),
 			bytes, bytes < 10000000 ? "\t" : "",
 			inuse, inuse < 10000000 ? "\t" : "",
 			si->prio);
@@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
 	return 0;
 }
 
-
 /*
  * Find out how many pages are allowed for a single swap device. There
  * are two limiting factors:
@@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 	unsigned long maxpages;
 	unsigned long swapfilepages;
 	unsigned long last_page;
+	loff_t size;
 
 	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
 		pr_err("Unable to find swap-space signature\n");
@@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 
 	if (!maxpages)
 		return 0;
-	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+
+	size = i_size_read(inode);
+	if (size == PAGE_SIZE) {
+		/* Ghost swapfile */
+		si->bdev = NULL;
+		si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
+		return maxpages;
+	}
+
+	swapfilepages = size >> PAGE_SHIFT;
 	if (swapfilepages && maxpages > swapfilepages) {
 		pr_warn("Swap area shorter than signature indicates\n");
 		return 0;
diff --git a/mm/zswap.c b/mm/zswap.c
index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct folio *folio;
 	struct mempolicy *mpol;
 	bool folio_was_allocated;
-	struct swap_info_struct *si;
+	struct swap_info_struct *si = get_swap_device(swpentry);
 	int ret = 0;
 
-	/* try to allocate swap cache folio */
-	si = get_swap_device(swpentry);
 	if (!si)
-		return -EEXIST;
+		return -ENOENT;
+
+	if (si->flags & SWP_GHOST) {
+		put_swap_device(si);
+		return -EINVAL;
+	}
 
+	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
 	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
 			NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
@@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	folio_set_reclaim(folio);
 
 	/* start writeback */
-	__swap_writepage(folio, NULL);
+	ret = __swap_writepage(folio, NULL);
+	WARN_ON_ONCE(ret);
 
 out:
 	if (ret && ret != -EEXIST) {
@@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
 	zswap_pool_put(pool);
 put_objcg:
 	obj_cgroup_put(objcg);
-	if (!ret && zswap_pool_reached_full)
+	if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
 		queue_work(shrink_wq, &zswap_shrink_work);
 check_old:
 	/*

---
base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
change-id: 20251121-ghost-56e3948a7a17

Best regards,
-- 
Chris Li <chrisl@kernel.org>
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Kairui Song 1 week, 2 days ago
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.

Thanks for sharing this. I've been hearing about the ghost swapfile
design for a long time, and I'm glad to see it finally got posted.

>
> Signed-off-by: Chris Li <chrisl@kernel.org>
> ---
>  include/linux/swap.h |  2 ++
>  mm/page_io.c         | 18 +++++++++++++++---
>  mm/swap.h            |  2 +-
>  mm/swap_state.c      |  7 +++++++
>  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
>  mm/zswap.c           | 17 +++++++++++------
>  6 files changed, 73 insertions(+), 15 deletions(-)

In general I think this aligns quite well with what I had in mind and
an idea that was mentioned during LSFMM this year (the 3rd one in the
"Issues" part; it wasn't clearly described in the cover letter, more
details are in the slides):
https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/

The good part is that we reuse everything we have in the current swap
stack, and it stays optional. Everything is a swap device, no special
layers required. All other features will be available in a cleaner
way.

And /etc/fstab just works the same way for the ghost swapfile.

Looking forward to seeing this RFC get more updates.

>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
>         SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
>         SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
>         SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> +       SWP_GHOST       = (1 << 13),    /* not backed by anything */
>                                         /* add others here before... */
>  };
>
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
>  void free_pages_and_swap_cache(struct encoded_page **, int);
>  /* linux/mm/swapfile.c */
>  extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
>  extern long total_swap_pages;
>  extern atomic_t nr_rotate_swap;
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
>                 return AOP_WRITEPAGE_ACTIVATE;
>         }
>
> -       __swap_writepage(folio, swap_plug);
> -       return 0;
> +       return __swap_writepage(folio, swap_plug);
>  out_unlock:
>         folio_unlock(folio);
>         return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
>         submit_bio(bio);
>  }
>
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  {
>         struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>
>         VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> +       if (sis->flags & SWP_GHOST) {
> +               /* Prevent the page from getting reclaimed. */
> +               folio_set_dirty(folio);
> +               return AOP_WRITEPAGE_ACTIVATE;
> +       }
> +
>         /*
>          * ->flags can be updated non-atomicially (scan_swap_map_slots),
>          * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>                 swap_writepage_bdev_sync(folio, sis);
>         else
>                 swap_writepage_bdev_async(folio, sis);
> +       return 0;
>  }
>
>  void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>         if (zswap_load(folio) != -ENOENT)
>                 goto finish;
>
> +       if (unlikely(sis->flags & SWP_GHOST)) {
> +               folio_unlock(folio);
> +               goto finish;
> +       }
> +
>         /* We have to read from slower devices. Increase zswap protection. */
>         zswap_folio_swapin(folio);
>
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  }
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>
>  /* linux/mm/swap_state.c */
>  extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>         struct swap_iocb *splug = NULL;
>         bool page_allocated;
>
> +       /*
> +        * The entry may have been freed by another task. Avoid swap_info_get()
> +        * which will print error message if the race happens.
> +        */
> +       if (si->flags & SWP_GHOST)
> +               goto skip;
> +
>         mask = swapin_nr_pages(offset) - 1;
>         if (!mask)
>                 goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
>  atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
>  /*
>   * Some modules use swappable objects and may try to swap them out under
>   * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>                         goto skip;
>         }
>
> +       if (!(si->flags & SWP_GHOST))
> +               atomic_sub(1, &nr_real_swapfiles);
>         plist_del(&si->avail_list, &swap_avail_head);
>
>  skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>         }
>
>         plist_add(&si->avail_list, &swap_avail_head);
> +       if (!(si->flags & SWP_GHOST))
> +               atomic_add(1, &nr_real_swapfiles);
>
>  skip:
>         spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
>         struct inode *inode = mapping->host;
>         int ret;
>
> +       if (sis->flags & SWP_GHOST) {
> +               *span = 0;
> +               return 0;
> +       }
> +
>         if (S_ISBLK(inode->i_mode)) {
>                 ret = add_swap_extent(sis, 0, sis->max, 0);
>                 *span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>         if (p->flags & SWP_CONTINUED)
>                 free_swap_count_continuations(p);
>
> -       if (!p->bdev || !bdev_nonrot(p->bdev))
> +       if (!(p->flags & SWP_GHOST) &&
> +           (!p->bdev || !bdev_nonrot(p->bdev)))
>                 atomic_dec(&nr_rotate_swap);
>
>         mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
>         mutex_unlock(&swapon_mutex);
>  }
>
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> +       struct file *file = si->swap_file;
> +
> +       if (si->flags & SWP_GHOST)
> +               return "ghost\t";
> +
> +       if (S_ISBLK(file_inode(file)->i_mode))
> +               return "partition";
> +
> +       return "file\t";
> +}
> +
>  static int swap_show(struct seq_file *swap, void *v)
>  {
>         struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
>         len = seq_file_path(swap, file, " \t\n\\");
>         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
>                         len < 40 ? 40 - len : 1, " ",
> -                       S_ISBLK(file_inode(file)->i_mode) ?
> -                               "partition" : "file\t",
> +                       swap_type_str(si),
>                         bytes, bytes < 10000000 ? "\t" : "",
>                         inuse, inuse < 10000000 ? "\t" : "",
>                         si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
>         return 0;
>  }
>
> -
>  /*
>   * Find out how many pages are allowed for a single swap device. There
>   * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>         unsigned long maxpages;
>         unsigned long swapfilepages;
>         unsigned long last_page;
> +       loff_t size;
>
>         if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
>                 pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
>         if (!maxpages)
>                 return 0;
> -       swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> +       size = i_size_read(inode);
> +       if (size == PAGE_SIZE) {
> +               /* Ghost swapfile */
> +               si->bdev = NULL;
> +               si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> +               return maxpages;
> +       }

Here, if we push things further, it might be a good idea to make better
use of the swapfile header for detecting this kind of device, and maybe
add support for other info too. The header already has version info
embedded in case it needs to be extended.
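
As a rough sketch of that direction (purely illustrative: the flag bit,
the version check and the reuse of a padding word are all made up here,
nothing like this exists in the header today), detection could then key
off an explicit header field instead of the file size:

/* Hypothetical: an explicit ghost marker carried in the swap header. */
#define SWAP_HEADER_FLAG_GHOST	0x1

static bool swap_header_is_ghost(const union swap_header *h)
{
	/* info.padding[0] repurposed as a feature-flags word in this sketch */
	return h->info.version >= 2 &&
	       (h->info.padding[0] & SWAP_HEADER_FLAG_GHOST);
}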
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Baoquan He 1 week, 2 days ago
Add YoungJun to CC.

On 11/22/25 at 05:59pm, Kairui Song wrote:
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> 
> Thanks for sharing this, I've been hearing about the ghost swapfile
> design for a long time, glad to see it finally got posted.
> 
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> >  include/linux/swap.h |  2 ++
> >  mm/page_io.c         | 18 +++++++++++++++---
> >  mm/swap.h            |  2 +-
> >  mm/swap_state.c      |  7 +++++++
> >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> >  mm/zswap.c           | 17 +++++++++++------
> >  6 files changed, 73 insertions(+), 15 deletions(-)
> 
> In general I think this aligns quite well with what I had in mind and
> an idea that was mention during LSFMM this year (the 3rd one in the
> "Issues" part, it wasn't clearly described in the cover letter, more
> details in the slides):
> https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/

Thanks for sharing the background and more information. When I checked
Youngjun's swap.tiers patchset before his RFC, I felt it would be more
flexible to add zswap to memcg if the zswap size could be decoupled from
the backing device. Chris's RFC can satisfy that, but I didn't think you
guys had planned more, e.g. dynamic growth of the swap size, and zswap
slot management working like the swap table on swap slots. Looking
forward to seeing the progress and more details.

Thanks
Baoquan

> 
> The good part is that we will reuse everything we have with the
> current swap stack, and stay optional. Everything is a swap device, no
> special layers required. All other features will be available in a
> cleaner way.
> 
> And /etc/fstab just works the same way for the ghost swapfile.
> 
> Looking forward to see this RFC get more updates.
> 
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> >         SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
> >         SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
> >         SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > +       SWP_GHOST       = (1 << 13),    /* not backed by anything */
> >                                         /* add others here before... */
> >  };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> >  void free_pages_and_swap_cache(struct encoded_page **, int);
> >  /* linux/mm/swapfile.c */
> >  extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> >  extern long total_swap_pages;
> >  extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >                 return AOP_WRITEPAGE_ACTIVATE;
> >         }
> >
> > -       __swap_writepage(folio, swap_plug);
> > -       return 0;
> > +       return __swap_writepage(folio, swap_plug);
> >  out_unlock:
> >         folio_unlock(folio);
> >         return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> >         submit_bio(bio);
> >  }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >  {
> >         struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> >         VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > +       if (sis->flags & SWP_GHOST) {
> > +               /* Prevent the page from getting reclaimed. */
> > +               folio_set_dirty(folio);
> > +               return AOP_WRITEPAGE_ACTIVATE;
> > +       }
> > +
> >         /*
> >          * ->flags can be updated non-atomicially (scan_swap_map_slots),
> >          * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >                 swap_writepage_bdev_sync(folio, sis);
> >         else
> >                 swap_writepage_bdev_async(folio, sis);
> > +       return 0;
> >  }
> >
> >  void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >         if (zswap_load(folio) != -ENOENT)
> >                 goto finish;
> >
> > +       if (unlikely(sis->flags & SWP_GHOST)) {
> > +               folio_unlock(folio);
> > +               goto finish;
> > +       }
> > +
> >         /* We have to read from slower devices. Increase zswap protection. */
> >         zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >  }
> >  void swap_write_unplug(struct swap_iocb *sio);
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> >  /* linux/mm/swap_state.c */
> >  extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >         struct swap_iocb *splug = NULL;
> >         bool page_allocated;
> >
> > +       /*
> > +        * The entry may have been freed by another task. Avoid swap_info_get()
> > +        * which will print error message if the race happens.
> > +        */
> > +       if (si->flags & SWP_GHOST)
> > +               goto skip;
> > +
> >         mask = swapin_nr_pages(offset) - 1;
> >         if (!mask)
> >                 goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> >  atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> >  /*
> >   * Some modules use swappable objects and may try to swap them out under
> >   * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >                         goto skip;
> >         }
> >
> > +       if (!(si->flags & SWP_GHOST))
> > +               atomic_sub(1, &nr_real_swapfiles);
> >         plist_del(&si->avail_list, &swap_avail_head);
> >
> >  skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >         }
> >
> >         plist_add(&si->avail_list, &swap_avail_head);
> > +       if (!(si->flags & SWP_GHOST))
> > +               atomic_add(1, &nr_real_swapfiles);
> >
> >  skip:
> >         spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> >         struct inode *inode = mapping->host;
> >         int ret;
> >
> > +       if (sis->flags & SWP_GHOST) {
> > +               *span = 0;
> > +               return 0;
> > +       }
> > +
> >         if (S_ISBLK(inode->i_mode)) {
> >                 ret = add_swap_extent(sis, 0, sis->max, 0);
> >                 *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> >         if (p->flags & SWP_CONTINUED)
> >                 free_swap_count_continuations(p);
> >
> > -       if (!p->bdev || !bdev_nonrot(p->bdev))
> > +       if (!(p->flags & SWP_GHOST) &&
> > +           (!p->bdev || !bdev_nonrot(p->bdev)))
> >                 atomic_dec(&nr_rotate_swap);
> >
> >         mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> >         mutex_unlock(&swapon_mutex);
> >  }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > +       struct file *file = si->swap_file;
> > +
> > +       if (si->flags & SWP_GHOST)
> > +               return "ghost\t";
> > +
> > +       if (S_ISBLK(file_inode(file)->i_mode))
> > +               return "partition";
> > +
> > +       return "file\t";
> > +}
> > +
> >  static int swap_show(struct seq_file *swap, void *v)
> >  {
> >         struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> >         len = seq_file_path(swap, file, " \t\n\\");
> >         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> >                         len < 40 ? 40 - len : 1, " ",
> > -                       S_ISBLK(file_inode(file)->i_mode) ?
> > -                               "partition" : "file\t",
> > +                       swap_type_str(si),
> >                         bytes, bytes < 10000000 ? "\t" : "",
> >                         inuse, inuse < 10000000 ? "\t" : "",
> >                         si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> >         return 0;
> >  }
> >
> > -
> >  /*
> >   * Find out how many pages are allowed for a single swap device. There
> >   * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >         unsigned long maxpages;
> >         unsigned long swapfilepages;
> >         unsigned long last_page;
> > +       loff_t size;
> >
> >         if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> >                 pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> >         if (!maxpages)
> >                 return 0;
> > -       swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > +       size = i_size_read(inode);
> > +       if (size == PAGE_SIZE) {
> > +               /* Ghost swapfile */
> > +               si->bdev = NULL;
> > +               si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > +               return maxpages;
> > +       }
> 
> Here if we push things further, it might be a good idea to make better
> use of the swap file header for detecting this kind of device, and
> maybe add support for other info too. The header already has version
> info embedded in case it will be extended.
> 

Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Yosry Ahmed 1 week, 3 days ago
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
> 
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
> 
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.
> 
> Signed-off-by: Chris Li <chrisl@kernel.org>

This was brought up before; I think it's not the right way to go
upstream. Even if it's good for the short term, it's a behavior exposed
to userspace that we'll have to maintain. With the ongoing work to
decouple zswap and swap backends, this will end up being something we
have to work around indefinitely to keep the same userspace semantics.

> ---
>  include/linux/swap.h |  2 ++
>  mm/page_io.c         | 18 +++++++++++++++---
>  mm/swap.h            |  2 +-
>  mm/swap_state.c      |  7 +++++++
>  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
>  mm/zswap.c           | 17 +++++++++++------
>  6 files changed, 73 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
> +	SWP_GHOST	= (1 << 13),	/* not backed by anything */
>  					/* add others here before... */
>  };
>  
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
>  void free_pages_and_swap_cache(struct encoded_page **, int);
>  /* linux/mm/swapfile.c */
>  extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
>  extern long total_swap_pages;
>  extern atomic_t nr_rotate_swap;
>  
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
>  		return AOP_WRITEPAGE_ACTIVATE;
>  	}
>  
> -	__swap_writepage(folio, swap_plug);
> -	return 0;
> +	return __swap_writepage(folio, swap_plug);
>  out_unlock:
>  	folio_unlock(folio);
>  	return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
>  	submit_bio(bio);
>  }
>  
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  {
>  	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>  
>  	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> +	if (sis->flags & SWP_GHOST) {
> +		/* Prevent the page from getting reclaimed. */
> +		folio_set_dirty(folio);
> +		return AOP_WRITEPAGE_ACTIVATE;
> +	}
> +
>  	/*
>  	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
>  	 * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
>  		swap_writepage_bdev_sync(folio, sis);
>  	else
>  		swap_writepage_bdev_async(folio, sis);
> +	return 0;
>  }
>  
>  void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>  	if (zswap_load(folio) != -ENOENT)
>  		goto finish;
>  
> +	if (unlikely(sis->flags & SWP_GHOST)) {
> +		folio_unlock(folio);
> +		goto finish;
> +	}
> +
>  	/* We have to read from slower devices. Increase zswap protection. */
>  	zswap_folio_swapin(folio);
>  
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  }
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>  
>  /* linux/mm/swap_state.c */
>  extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  	struct swap_iocb *splug = NULL;
>  	bool page_allocated;
>  
> +	/*
> +	 * The entry may have been freed by another task. Avoid swap_info_get()
> +	 * which will print error message if the race happens.
> +	 */
> +	if (si->flags & SWP_GHOST)
> +		goto skip;
> +
>  	mask = swapin_nr_pages(offset) - 1;
>  	if (!mask)
>  		goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
>  atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
>  /*
>   * Some modules use swappable objects and may try to swap them out under
>   * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>  			goto skip;
>  	}
>  
> +	if (!(si->flags & SWP_GHOST))
> +		atomic_sub(1, &nr_real_swapfiles);
>  	plist_del(&si->avail_list, &swap_avail_head);
>  
>  skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>  	}
>  
>  	plist_add(&si->avail_list, &swap_avail_head);
> +	if (!(si->flags & SWP_GHOST))
> +		atomic_add(1, &nr_real_swapfiles);
>  
>  skip:
>  	spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
>  	struct inode *inode = mapping->host;
>  	int ret;
>  
> +	if (sis->flags & SWP_GHOST) {
> +		*span = 0;
> +		return 0;
> +	}
> +
>  	if (S_ISBLK(inode->i_mode)) {
>  		ret = add_swap_extent(sis, 0, sis->max, 0);
>  		*span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	if (p->flags & SWP_CONTINUED)
>  		free_swap_count_continuations(p);
>  
> -	if (!p->bdev || !bdev_nonrot(p->bdev))
> +	if (!(p->flags & SWP_GHOST) &&
> +	    (!p->bdev || !bdev_nonrot(p->bdev)))
>  		atomic_dec(&nr_rotate_swap);
>  
>  	mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
>  	mutex_unlock(&swapon_mutex);
>  }
>  
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> +	struct file *file = si->swap_file;
> +
> +	if (si->flags & SWP_GHOST)
> +		return "ghost\t";
> +
> +	if (S_ISBLK(file_inode(file)->i_mode))
> +		return "partition";
> +
> +	return "file\t";
> +}
> +
>  static int swap_show(struct seq_file *swap, void *v)
>  {
>  	struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
>  	len = seq_file_path(swap, file, " \t\n\\");
>  	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
>  			len < 40 ? 40 - len : 1, " ",
> -			S_ISBLK(file_inode(file)->i_mode) ?
> -				"partition" : "file\t",
> +			swap_type_str(si),
>  			bytes, bytes < 10000000 ? "\t" : "",
>  			inuse, inuse < 10000000 ? "\t" : "",
>  			si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
>  	return 0;
>  }
>  
> -
>  /*
>   * Find out how many pages are allowed for a single swap device. There
>   * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>  	unsigned long maxpages;
>  	unsigned long swapfilepages;
>  	unsigned long last_page;
> +	loff_t size;
>  
>  	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
>  		pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>  
>  	if (!maxpages)
>  		return 0;
> -	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> +	size = i_size_read(inode);
> +	if (size == PAGE_SIZE) {
> +		/* Ghost swapfile */
> +		si->bdev = NULL;
> +		si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> +		return maxpages;
> +	}
> +
> +	swapfilepages = size >> PAGE_SHIFT;
>  	if (swapfilepages && maxpages > swapfilepages) {
>  		pr_warn("Swap area shorter than signature indicates\n");
>  		return 0;
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	struct folio *folio;
>  	struct mempolicy *mpol;
>  	bool folio_was_allocated;
> -	struct swap_info_struct *si;
> +	struct swap_info_struct *si = get_swap_device(swpentry);
>  	int ret = 0;
>  
> -	/* try to allocate swap cache folio */
> -	si = get_swap_device(swpentry);
>  	if (!si)
> -		return -EEXIST;
> +		return -ENOENT;
> +
> +	if (si->flags & SWP_GHOST) {
> +		put_swap_device(si);
> +		return -EINVAL;
> +	}
>  
> +	/* try to allocate swap cache folio */
>  	mpol = get_task_policy(current);
>  	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
>  			NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	folio_set_reclaim(folio);
>  
>  	/* start writeback */
> -	__swap_writepage(folio, NULL);
> +	ret = __swap_writepage(folio, NULL);
> +	WARN_ON_ONCE(ret);
>  
>  out:
>  	if (ret && ret != -EEXIST) {
> @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
>  	zswap_pool_put(pool);
>  put_objcg:
>  	obj_cgroup_put(objcg);
> -	if (!ret && zswap_pool_reached_full)
> +	if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
>  		queue_work(shrink_wq, &zswap_shrink_work);
>  check_old:
>  	/*
> 
> ---
> base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> change-id: 20251121-ghost-56e3948a7a17
> 
> Best regards,
> -- 
> Chris Li <chrisl@kernel.org>
>
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week, 2 days ago
On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> This was brought up before, I think it's not the right way to go
> upstream. Even if it's good for the short-term, it's a behavior exposed
> to userspace that we'll have to maintain. With the ongoing work to
> decouple zswap and swap backends, this will end up being something we
> have to workaround indefinitely to keep the same userspace semantics.

Actually, this doesn't need to be a short-term solution. It can be the
long-term one. I get it that you zswap maintainers do not want to get
involved in the ghost swapfile; I will leave you guys alone. Remember
the 2023 LPC swap abstraction talk: the community picked my approach of
VFS swap ops over the swap abstraction that swap virtualization is
based on. I took some time to come up with the cluster-based swap
allocator and the swap table to clean up and speed up the swap stack.
Now I am finally able to circle back and fulfill my promise of the VFS
swap ops. Have a little faith: I will solve this swap entry redirection
issue nicely for you, better than the swap virtualization approach can.

Chris

>
> > ---
> >  include/linux/swap.h |  2 ++
> >  mm/page_io.c         | 18 +++++++++++++++---
> >  mm/swap.h            |  2 +-
> >  mm/swap_state.c      |  7 +++++++
> >  mm/swapfile.c        | 42 +++++++++++++++++++++++++++++++++++++-----
> >  mm/zswap.c           | 17 +++++++++++------
> >  6 files changed, 73 insertions(+), 15 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> >       SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
> >       SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
> >       SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > +     SWP_GHOST       = (1 << 13),    /* not backed by anything */
> >                                       /* add others here before... */
> >  };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> >  void free_pages_and_swap_cache(struct encoded_page **, int);
> >  /* linux/mm/swapfile.c */
> >  extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> >  extern long total_swap_pages;
> >  extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >               return AOP_WRITEPAGE_ACTIVATE;
> >       }
> >
> > -     __swap_writepage(folio, swap_plug);
> > -     return 0;
> > +     return __swap_writepage(folio, swap_plug);
> >  out_unlock:
> >       folio_unlock(folio);
> >       return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> >       submit_bio(bio);
> >  }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >  {
> >       struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> >       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > +     if (sis->flags & SWP_GHOST) {
> > +             /* Prevent the page from getting reclaimed. */
> > +             folio_set_dirty(folio);
> > +             return AOP_WRITEPAGE_ACTIVATE;
> > +     }
> > +
> >       /*
> >        * ->flags can be updated non-atomicially (scan_swap_map_slots),
> >        * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> >               swap_writepage_bdev_sync(folio, sis);
> >       else
> >               swap_writepage_bdev_async(folio, sis);
> > +     return 0;
> >  }
> >
> >  void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >       if (zswap_load(folio) != -ENOENT)
> >               goto finish;
> >
> > +     if (unlikely(sis->flags & SWP_GHOST)) {
> > +             folio_unlock(folio);
> > +             goto finish;
> > +     }
> > +
> >       /* We have to read from slower devices. Increase zswap protection. */
> >       zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >  }
> >  void swap_write_unplug(struct swap_iocb *sio);
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> >  /* linux/mm/swap_state.c */
> >  extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >       struct swap_iocb *splug = NULL;
> >       bool page_allocated;
> >
> > +     /*
> > +      * The entry may have been freed by another task. Avoid swap_info_get()
> > +      * which will print error message if the race happens.
> > +      */
> > +     if (si->flags & SWP_GHOST)
> > +             goto skip;
> > +
> >       mask = swapin_nr_pages(offset) - 1;
> >       if (!mask)
> >               goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> >  atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> >  /*
> >   * Some modules use swappable objects and may try to swap them out under
> >   * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >                       goto skip;
> >       }
> >
> > +     if (!(si->flags & SWP_GHOST))
> > +             atomic_sub(1, &nr_real_swapfiles);
> >       plist_del(&si->avail_list, &swap_avail_head);
> >
> >  skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >       }
> >
> >       plist_add(&si->avail_list, &swap_avail_head);
> > +     if (!(si->flags & SWP_GHOST))
> > +             atomic_add(1, &nr_real_swapfiles);
> >
> >  skip:
> >       spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> >       struct inode *inode = mapping->host;
> >       int ret;
> >
> > +     if (sis->flags & SWP_GHOST) {
> > +             *span = 0;
> > +             return 0;
> > +     }
> > +
> >       if (S_ISBLK(inode->i_mode)) {
> >               ret = add_swap_extent(sis, 0, sis->max, 0);
> >               *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> >       if (p->flags & SWP_CONTINUED)
> >               free_swap_count_continuations(p);
> >
> > -     if (!p->bdev || !bdev_nonrot(p->bdev))
> > +     if (!(p->flags & SWP_GHOST) &&
> > +         (!p->bdev || !bdev_nonrot(p->bdev)))
> >               atomic_dec(&nr_rotate_swap);
> >
> >       mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> >       mutex_unlock(&swapon_mutex);
> >  }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > +     struct file *file = si->swap_file;
> > +
> > +     if (si->flags & SWP_GHOST)
> > +             return "ghost\t";
> > +
> > +     if (S_ISBLK(file_inode(file)->i_mode))
> > +             return "partition";
> > +
> > +     return "file\t";
> > +}
> > +
> >  static int swap_show(struct seq_file *swap, void *v)
> >  {
> >       struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> >       len = seq_file_path(swap, file, " \t\n\\");
> >       seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> >                       len < 40 ? 40 - len : 1, " ",
> > -                     S_ISBLK(file_inode(file)->i_mode) ?
> > -                             "partition" : "file\t",
> > +                     swap_type_str(si),
> >                       bytes, bytes < 10000000 ? "\t" : "",
> >                       inuse, inuse < 10000000 ? "\t" : "",
> >                       si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> >       return 0;
> >  }
> >
> > -
> >  /*
> >   * Find out how many pages are allowed for a single swap device. There
> >   * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >       unsigned long maxpages;
> >       unsigned long swapfilepages;
> >       unsigned long last_page;
> > +     loff_t size;
> >
> >       if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> >               pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> >       if (!maxpages)
> >               return 0;
> > -     swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > +     size = i_size_read(inode);
> > +     if (size == PAGE_SIZE) {
> > +             /* Ghost swapfile */
> > +             si->bdev = NULL;
> > +             si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > +             return maxpages;
> > +     }
> > +
> > +     swapfilepages = size >> PAGE_SHIFT;
> >       if (swapfilepages && maxpages > swapfilepages) {
> >               pr_warn("Swap area shorter than signature indicates\n");
> >               return 0;
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >       struct folio *folio;
> >       struct mempolicy *mpol;
> >       bool folio_was_allocated;
> > -     struct swap_info_struct *si;
> > +     struct swap_info_struct *si = get_swap_device(swpentry);
> >       int ret = 0;
> >
> > -     /* try to allocate swap cache folio */
> > -     si = get_swap_device(swpentry);
> >       if (!si)
> > -             return -EEXIST;
> > +             return -ENOENT;
> > +
> > +     if (si->flags & SWP_GHOST) {
> > +             put_swap_device(si);
> > +             return -EINVAL;
> > +     }
> >
> > +     /* try to allocate swap cache folio */
> >       mpol = get_task_policy(current);
> >       folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
> >                       NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> > @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >       folio_set_reclaim(folio);
> >
> >       /* start writeback */
> > -     __swap_writepage(folio, NULL);
> > +     ret = __swap_writepage(folio, NULL);
> > +     WARN_ON_ONCE(ret);
> >
> >  out:
> >       if (ret && ret != -EEXIST) {
> > @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
> >       zswap_pool_put(pool);
> >  put_objcg:
> >       obj_cgroup_put(objcg);
> > -     if (!ret && zswap_pool_reached_full)
> > +     if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
> >               queue_work(shrink_wq, &zswap_shrink_work);
> >  check_old:
> >       /*
> >
> > ---
> > base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> > change-id: 20251121-ghost-56e3948a7a17
> >
> > Best regards,
> > --
> > Chris Li <chrisl@kernel.org>
> >
>
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 1 week ago
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> > >
> > > Signed-off-by: Chris Li <chrisl@kernel.org>
> >
> > This was brought up before, I think it's not the right way to go
> > upstream. Even if it's good for the short-term, it's a behavior exposed
> > to userspace that we'll have to maintain. With the ongoing work to
> > decouple zswap and swap backends, this will end up being something we
> > have to workaround indefinitely to keep the same userspace semantics.
>
> Actually, this doesn't need to be the short term solution. It can be
> long term. I get  it your zswap maintainers do not want to get
> involved in the ghost swapfile. I will leave you guys alone. Remember
> 2023 LPC swap abstraction talk, the community picked my approach to
> the VFS swap ops over the swap abstraction which the swap
> virtualization is based on. I take some time to come up with the
> cluster based swap allocator and swap table to clean up and speed up
> the swap stack. Now I am finally able to circle back and fulfill my
> promise of the VFS swap ops. Have a little faith I will solve this
> swap entry redirection issue nicely for you, better than the swap
> virtualization approach can.

Look man, I'm not married to any idea. If your VFS approach solves our
problems, I can move on to other projects :) We have lots of
swap/memory reclaim/MM problems to solve, both internally at Meta and
upstream.

But please explain how your VFS approach solves the 3 requirements I
mentioned in the other email, and more specifically the backend
transfer requirement.

I explicitly asked about it in your submission for your 2024
LSFMMBPF talk - at that time I had not seriously started the swap
virtualization work; it was only at the design phase. You just handwaved
it away and never really explained to me how you could achieve backend
transfer with your design:

https://lore.kernel.org/all/CAF8kJuNFtejEtjQHg5UBGduvFNn3AaGn4ffyoOrEnXfHpx6Ubg@mail.gmail.com/

I understand that you had more pressing issues to fix at the time, so I
did not bring it up during the conference. But it's an imperative
requirement for us.

swap.tiers is nice for initial placement and for hierarchy
determination in general, but when the page is already placed on one
tier and needs to be transferred to another tier, how will you move it
from one tier to another?

What zram is doing right now, IIUC, is building the redirection
internally. I would like to avoid repeating that for zswap, and
for every other future backend, by pulling it out of backend-internal
code and building a dedicated module for it. That is just swap
virtualization.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 1 week, 3 days ago
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.

Zswap is primarily a compressed cache for real swap on secondary
storage. It's indeed quite important that entries currently in zswap
don't occupy disk slots; but for a solution to this to be acceptable,
it has to work with the primary usecase and support disk writeback.

This direction is a dead-end. Please take a look at Nhat's swap
virtualization patches. They decouple zswap from disk geometry, while
still supporting writeback to an actual backend file.

Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week, 2 days ago
On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
>
> Zswap is primarily a compressed cache for real swap on secondary
> storage. It's indeed quite important that entries currently in zswap
> don't occupy disk slots; but for a solution to this to be acceptable,
> it has to work with the primary usecase and support disk writeback.

Well, my plan is to support the writeback via swap.tiers.

> This direction is a dead-end. Please take a look at Nhat's swap
> virtualization patches. They decouple zswap from disk geometry, while
> still supporting writeback to an actual backend file.

Yes, there are many ways to decouple zswap from disk geometry, my swap
table + swap.tiers design can do that as well. I have concerns about
swap virtualization in the aspect of adding another layer of memory
overhead addition per swap entry and CPU overhead of extra xarray
lookup. I believe my approach is technically superior and cleaner.
Both faster and cleaner. Basically swap.tiers + VFS like swap read
write page ops. I will let Nhat clarify the performance and memory
overhead side of the swap virtualization.

I am not against swap entry redirection. Just the swap virtualization
series needs to compare against the alternatives in terms of memory
overhead and throughput.
Solving it from the swap.tiers angle is cleaner.

> Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

I take that the only relevant part is you are zswap maintainer and I
am the swap maintainer. Fine. I got the message. I will leave the
zswap alone. I will find other ways to address the memory base swap
tiers in swap.tiers.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 6 days, 6 hours ago
Hi Johannes,

On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> I take that the only relevant part is you are zswap maintainer and I
> am the swap maintainer. Fine. I got the message. I will leave the
> zswap alone. I will find other ways to address the memory base swap
> tiers in swap.tiers.

I am sorry that I have said that. Let me take back what I said above.
I was upset when I considered you and others blocking the more optimal
solution and in favor of the less optimal solution. That is my short
temper, as usual.

Now I can see that you might not see one as more optimal than the
other as convincing as I do, or I haven't done a good job explaining
it.

Let me offer my sincere apology. I will reply to the technical aspect
of the question in other email.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 6 days, 6 hours ago
On Tue, Nov 25, 2025 at 10:14:40PM +0400, Chris Li wrote:
> Hi Johannes,
> 
> On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
> >
> > I take that the only relevant part is you are zswap maintainer and I
> > am the swap maintainer. Fine. I got the message. I will leave the
> > zswap alone. I will find other ways to address the memory base swap
> > tiers in swap.tiers.
> 
> I am sorry that I have said that. Let me take back what I said above.
> I was upset when I considered you and others blocking the more optimal
> solution and in favor of the less optimal solution. That is my short
> temper, as usual.
> 
> Now I can see that you might not see one as more optimal than the
> other as convincing as I do, or I haven't done a good job explaining
> it.
> 
> Let me offer my sincere apology. I will reply to the technical aspect
> of the question in other email.

Thanks Chris. No hard feelings.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 1 week ago
On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
> 
> Well, my plan is to support the writeback via swap.tiers.

Do you have a link to that proposal?

My understanding of swap tiers was about grouping different swapfiles
and assigning them to cgroups. The issue with writeback is relocating
the data that a swp_entry_t page table refers to - without having to
find and update all the possible page tables. I'm not sure how
swap.tiers solve this problem.

> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
> 
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> table + swap.tiers design can do that as well. I have concerns about
> swap virtualization in the aspect of adding another layer of memory
> overhead addition per swap entry and CPU overhead of extra xarray
> lookup. I believe my approach is technically superior and cleaner.
> Both faster and cleaner. Basically swap.tiers + VFS like swap read
> write page ops. I will let Nhat clarify the performance and memory
> overhead side of the swap virtualization.

I'm happy to discuss it.

But keep in mind that the swap virtualization idea is a collaborative
product of quite a few people with an extensive combined upstream
record. Quite a bit of thought has gone into balancing static vs
runtime costs of that proposal. So you'll forgive me if I'm a bit
skeptical of the somewhat grandiose claims of one person that is new
to upstream development.

As to your specific points - we use xarray lookups in the page cache
fast path. It's a bold claim to say this would be too much overhead
during swapins.

Two, it's not clear to me how you want to make writeback efficient
*without* any sort of swap entry redirection. Walking all relevant
page tables is expensive; and you have to be able to find them first.

If you're talking about a redirection array as opposed to a tree -
static sizing of the compressed space is also a no-go. Zswap
utilization varies *widely* between workloads and different workload
combinations. Further, zswap consumes the same fungible resource as
uncompressed memory - there is really no excuse to burden users with
static sizing questions about this pool.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Yosry Ahmed 1 week ago
On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote:
> On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in zswap
> > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> > 
> > Well, my plan is to support the writeback via swap.tiers.
> 
> Do you have a link to that proposal?
> 
> My understanding of swap tiers was about grouping different swapfiles
> and assigning them to cgroups. The issue with writeback is relocating
> the data that a swp_entry_t page table refers to - without having to
> find and update all the possible page tables. I'm not sure how
> swap.tiers solve this problem.
> 
> > > This direction is a dead-end. Please take a look at Nhat's swap
> > > virtualization patches. They decouple zswap from disk geometry, while
> > > still supporting writeback to an actual backend file.
> > 
> > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > table + swap.tiers design can do that as well. I have concerns about
> > swap virtualization in the aspect of adding another layer of memory
> > overhead addition per swap entry and CPU overhead of extra xarray
> > lookup. I believe my approach is technically superior and cleaner.
> > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > write page ops. I will let Nhat clarify the performance and memory
> > overhead side of the swap virtualization.
> 
> I'm happy to discuss it.
> 
> But keep in mind that the swap virtualization idea is a collaborative
> product of quite a few people with an extensive combined upstream
> record. Quite a bit of thought has gone into balancing static vs
> runtime costs of that proposal. So you'll forgive me if I'm a bit
> skeptical of the somewhat grandiose claims of one person that is new
> to upstream development.
> 
> As to your specific points - we use xarray lookups in the page cache
> fast path. It's a bold claim to say this would be too much overhead
> during swapins.
> 
> Two, it's not clear to me how you want to make writeback efficient
> *without* any sort of swap entry redirection. Walking all relevant
> page tables is expensive; and you have to be able to find them first.
> 
> If you're talking about a redirection array as opposed to a tree -
> static sizing of the compressed space is also a no-go. Zswap
> utilization varies *widely* between workloads and different workload
> combinations. Further, zswap consumes the same fungible resource as
> uncompressed memory - there is really no excuse to burden users with
> static sizing questions about this pool.

I think what Chris's idea is (and Chris correct me if I am wrong), is
that we use ghost swapfiles (that are not backed by disk space) for
zswap. So zswap has its own swapfiles, separate from disk swapfiles.

memory.tiers establishes the ordering between swapfiles, so you put
"ghost" -> "real" to get today's zswap writeback behavior. When you
writeback, you keep page tables pointing at the swap entry in the ghost
swapfile. What you do is:
- Allocate a new swap entry in the "real" swapfile.
- Update the swap table of the "ghost" swapfile to point at the swap
  entry in the "real" swapfile, reusing the pointer used for the
  swapcache.

Then, on swapin, you read the swap table of the "ghost" swapfile, find
the redirection, and read to the swap table of the "real" swapfile, then
read the page from disk into the swap cache. The redirection in the
"ghost" swapfile will keep existing, wasting that slot, until all
references to it are dropped.
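
To make that flow concrete, here is a tiny user-space toy of the lookup;
the names (toy_ghost_slot, toy_writeback, toy_swapin) are invented for
illustration only and are not the actual swap table code:

#include <stdio.h>

#define TOY_NSLOTS 8

/* One slot in the "ghost" swapfile's swap table (toy model only). */
struct toy_ghost_slot {
	int written_back;		/* 0: data still in zswap, 1: redirected */
	unsigned int real_offset;	/* slot in the "real" swapfile if redirected */
};

static struct toy_ghost_slot ghost_table[TOY_NSLOTS];

/* Writeback: page tables keep pointing at the ghost slot; only the table changes. */
static void toy_writeback(unsigned int ghost_off, unsigned int real_off)
{
	ghost_table[ghost_off].written_back = 1;
	ghost_table[ghost_off].real_offset = real_off;
}

/* Swapin: one extra table read, and only for entries that were written back. */
static void toy_swapin(unsigned int ghost_off)
{
	struct toy_ghost_slot *slot = &ghost_table[ghost_off];

	if (!slot->written_back)
		printf("ghost slot %u: decompress from zswap\n", ghost_off);
	else
		printf("ghost slot %u: read slot %u from the real swapfile\n",
		       ghost_off, slot->real_offset);
}

int main(void)
{
	toy_writeback(3, 42);	/* entry 3 written back to disk slot 42 */
	toy_swapin(3);		/* follows the redirection */
	toy_swapin(5);		/* still "in zswap", no indirection */
	return 0;
}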

I think this might work for this specific use case, with less overhead
than the xarray. BUT there are a few scenarios that are not covered
AFAICT:

- You still need to statically size the ghost swapfiles and their
  overheads.

- Wasting a slot in the ghost swapfile for the redirection. This
  complicates static provisioning a bit, because you have to account for
  entries that will be in zswap as well as writtenback. Furthermore,
  IIUC swap.tiers is intended to be generic and cover other use cases
  beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
  SSD when we writeback to the HDD is a much bigger problem.

- We still cannot do swapoff efficiently as we need to walk the page
  tables (and some swap tables) to find and swapin all entries in a
  swapfile. Not as important as other things, but worth mentioning.

Chris please let me know if I didn't get this right.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 6 days, 6 hours ago
On Mon, Nov 24, 2025 at 11:32 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> I think what Chris's idea is (and Chris correct me if I am wrong), is
> that we use ghost swapfiles (that are not backed by disk space) for
> zswap. So zswap has its own swapfiles, separate from disk swapfiles.

Ack.

> memory.tiers establishes the ordering between swapfiles, so you put
> "ghost" -> "real" to get today's zswap writeback behavior. When you
> writeback, you keep page tables pointing at the swap entry in the ghost
> swapfile. What you do is:
> - Allocate a new swap entry in the "real" swapfile.
> - Update the swap table of the "ghost" swapfile to point at the swap
>   entry in the "real" swapfile, reusing the pointer used for the
>   swapcache.

Ack, with a minor adjustment in mapping the swap entry to the physical
location. The swap entry has a swap cache; the physical location does
not.

> Then, on swapin, you read the swap table of the "ghost" swapfile, find
> the redirection, and read to the swap table of the "real" swapfile, then
> read the page from disk into the swap cache. The redirection in the
> "ghost" swapfile will keep existing, wasting that slot, until all
> references to it are dropped.

Ack. That is assuming we don't have an rmap-like structure for swap entries.

> I think this might work for this specific use case, with less overhead
> than the xarray. BUT there are a few scenarios that are not covered
> AFAICT:
>
> - You still need to statically size the ghost swapfiles and their
>   overheads.

Not true: both the ghost swapfile and the physical swapfile can expand by
additional clusters beyond the original physical size, for allocating
contiguous high-order entries or redirections. For a ghost swapfile,
there is no physical layer, only the front end, so the size can grow
dynamically; just allocate more clusters. The file size in the current
swapfile header is just an initial size. My current patch does not
implement that; it will need a later swap table phase to make it happen.
But that is not an architectural limit, it has been considered part of
normal business.

> - Wasting a slot in the ghost swapfile for the redirection. This
>   complicates static provisioning a bit, because you have to account for
>   entries that will be in zswap as well as writtenback. Furthermore,
>   IIUC swap.tiers is intended to be generic and cover other use cases
>   beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
>   SSD when we writeback to the HDD is a much bigger problem.

Yes and no. Yes, it only wastes a front-end swap entry (with swap
cache); the physical location is a separate layer. No, the physical
SSD space is not wasted, because you can allocate an additional front-end
swap entry by growing the swap entry front end, then have that
additional front-end entry point to the physical location you just
redirected away from. There are a lot more considerations for the
front end vs the physical layer. The physical layer does not care
about location order or 2^N size alignment; it cares a bit about
contiguity and the number of IOVs it needs to issue. The swap entry
front end and the physical layer have slightly different constraints.
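
A toy illustration of that front end vs physical split (the names below
are invented purely for illustration and are not actual kernel code):
the front-end entry stays pinned by its swap count, while the physical
slot it was redirected away from can immediately back a newly allocated
front-end entry.

#include <stdio.h>

#define TOY_NFRONT 8	/* front-end swap entries (own swap cache / swap count) */
#define TOY_NPHYS  8	/* physical slots on the backing device */

struct toy_front {
	int in_use;	/* still referenced by page tables / swap count */
	int phys;	/* physical slot backing it, or -1 for none */
};

static struct toy_front front[TOY_NFRONT];
static int phys_free[TOY_NPHYS];

/* Redirect front entry @f away from its physical slot (e.g. SSD -> HDD). */
static int toy_redirect_away(int f)
{
	int old = front[f].phys;

	front[f].phys = -1;		/* new location tracked elsewhere in this toy */
	if (old >= 0)
		phys_free[old] = 1;	/* the old physical slot is not wasted... */
	return old;
}

/* ...because a brand new front-end entry can be bound to the freed slot. */
static int toy_alloc_front_for_phys(int phys)
{
	for (int f = 0; f < TOY_NFRONT; f++) {
		if (!front[f].in_use) {
			front[f].in_use = 1;
			front[f].phys = phys;
			phys_free[phys] = 0;
			return f;
		}
	}
	return -1;
}

int main(void)
{
	for (int i = 0; i < TOY_NPHYS; i++)
		phys_free[i] = 1;

	/* front entry 3 initially backed by physical slot 4 */
	front[3].in_use = 1;
	front[3].phys = 4;
	phys_free[4] = 0;

	int freed = toy_redirect_away(3);
	int f = toy_alloc_front_for_phys(freed);

	printf("front entry 3 redirected away; physical slot %d reused by front entry %d\n",
	       freed, f);
	return 0;
}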

> - We still cannot do swapoff efficiently as we need to walk the page
>   tables (and some swap tables) to find and swapin all entries in a
>   swapfile. Not as important as other things, but worth mentioning.

That needs rmap for swap entries. It is an independent issue.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 5 days, 3 hours ago
On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> 
> > - We still cannot do swapoff efficiently as we need to walk the
> > page
> >   tables (and some swap tables) to find and swapin all entries in a
> >   swapfile. Not as important as other things, but worth mentioning.
> 
> That need rmap for swap entries. It It is an independent issue.
> 

Wouldn't rmap for swap entries be more expensive than
simply always having indirection for swap entries that
are in use?

With indirection, swapoff can just move pages from
the being-swapoffed device into the swap cache, and
if needed the memory can then be moved to another
swap device, without ever needing to find the page
tables.

This sounds like an uncommon scenario, but it is
functionally identical to what is done to pages
during zswap writeback, where the page table entries
stay unchanged, and the swap page is simply moved
to another backend location.

Why implement two things, when we can have one
thing that does both, with no extra complexity
over what zswap writeback needs?

-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 4 days, 23 hours ago
On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> wrote:
>
> On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> >
> > > - We still cannot do swapoff efficiently as we need to walk the
> > > page
> > >   tables (and some swap tables) to find and swapin all entries in a
> > >   swapfile. Not as important as other things, but worth mentioning.
> >
> > That need rmap for swap entries. It It is an independent issue.
> >
>
> Wouldn't rmap for swap entries be more expensive than
> simply always having indirection for swap entries that
> are in use?

It might be, to be frank. I consider this pretty far and late in the
stage of the game to evaluate the rmap and its alternatives. Do you
agree?

I might or might not try the rmap for swap entry. Right now I don't
have many data points nor insights.

> With indirection, swapoff can just move pages from
> the being-swapoffed device into the swap cache, and
> if needed the memory can then be moved to another
> swap device, without ever needing to find the page
> tables.

Ack. I don't think we have any disagreement here.

> This sounds like an uncommon scenario, but it is
> functionally identical to what is done to pages
> during zswap writeback, where the page table entries
> stay unchanged, and the swap page is simply moved
> to another backend location.
>
> Why implement two things, when we can have one
> thing that does both, with no extra complexity
> over what zswap writeback needs?

Let me ask you a clarifying question, then.

1) What exactly are you trying to propose here in what project? VS or
swap the pony?
2) What stage of the code change do you have in mind should this
change apply to?

I can't speak for VS,  I am open to embrace what you suggest in order
to swap the pony project, that is after I understand it first.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 4 days, 22 hours ago
On Thu, 2025-11-27 at 06:07 +0400, Chris Li wrote:
> On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote:
> > > 
> > > > - We still cannot do swapoff efficiently as we need to walk the
> > > > page
> > > >   tables (and some swap tables) to find and swapin all entries
> > > > in a
> > > >   swapfile. Not as important as other things, but worth
> > > > mentioning.
> > > 
> > > That need rmap for swap entries. It It is an independent issue.
> > > 
> > 
> > Wouldn't rmap for swap entries be more expensive than
> > simply always having indirection for swap entries that
> > are in use?
> 
> It might be, to be frank. I consider this pretty far and late in the
> stage of the game to evaluate the rmap and its alternatives. Do you
> agree?
> 
> I might or might not try the rmap for swap entry. Right now I don't
> have many data points nor insights.

On the contrary. I think we should at least do some
back of the envelope calculations to estimate the
overhead of the different proposed solutions.

With both Nhat's vswap, and your proposal to always
have swap indirection with a separate front end, and
several back ends, there is no need for swap rmap.

This is a good thing, because a single swap slot
could be referenced by dozens, hundreds, or even
thousands of page table entries, in the case of
forking servers. This creates complexity which is
probably best avoided.

Conceptually, Nhat's vswap, and your idea of having
always-on swap indirection seem to be the same thing.
> 
> > This sounds like an uncommon scenario, but it is
> > functionally identical to what is done to pages
> > during zswap writeback, where the page table entries
> > stay unchanged, and the swap page is simply moved
> > to another backend location.
> > 
> > Why implement two things, when we can have one
> > thing that does both, with no extra complexity
> > over what zswap writeback needs?
> 
> Let me ask you a clarifying question, then.
> 
> 1) What exactly are you trying to propose here in what project? VS or
> swap the pony?

In the past, when faced with competing code bases
like this, one thing that has worked well is for both
developers to send their code to the list, and then
for both developers to send each other suggestions
(or diffs) to improve each other's code.

Vswap and your always-on indirection seem to do
exactly the same thing. This seems like a good
opportunity to work together, and come up with
code that is better than any one person's code.

> 2) What stage of the code change do you have in mind should this
> change apply to?

I think it makes sense to get the hard design
problems resolved before committing to one
particular code design.

Spending months to resolve subtle bugs in a
code base, only to discover later that it does
not do exactly what is needed, is not the
greatest way to make progress.

> 
> I can't speak for VS,  I am open to embrace what you suggest in order
> to swap the pony project, that is after I understand it first.
> 
Once both Nhat and you understand each other's code,
and have suggestions for each other on how to improve
it, we will likely end up with a code base that looks
nicer than either of you would have done by yourselves.

The more perspectives, the better.


-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 1 week ago
On Mon, Nov 24, 2025 at 11:32 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote:
> > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > > The current zswap requires a backing swapfile. The swap slot used
> > > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > > space.
> > > > >
> > > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > Zswap is primarily a compressed cache for real swap on secondary
> > > > storage. It's indeed quite important that entries currently in zswap
> > > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > > it has to work with the primary usecase and support disk writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> >
> > Do you have a link to that proposal?
> >
> > My understanding of swap tiers was about grouping different swapfiles
> > and assigning them to cgroups. The issue with writeback is relocating
> > the data that a swp_entry_t page table refers to - without having to
> > find and update all the possible page tables. I'm not sure how
> > swap.tiers solve this problem.
> >
> > > > This direction is a dead-end. Please take a look at Nhat's swap
> > > > virtualization patches. They decouple zswap from disk geometry, while
> > > > still supporting writeback to an actual backend file.
> > >
> > > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > > table + swap.tiers design can do that as well. I have concerns about
> > > swap virtualization in the aspect of adding another layer of memory
> > > overhead addition per swap entry and CPU overhead of extra xarray
> > > lookup. I believe my approach is technically superior and cleaner.
> > > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > > write page ops. I will let Nhat clarify the performance and memory
> > > overhead side of the swap virtualization.
> >
> > I'm happy to discuss it.
> >
> > But keep in mind that the swap virtualization idea is a collaborative
> > product of quite a few people with an extensive combined upstream
> > record. Quite a bit of thought has gone into balancing static vs
> > runtime costs of that proposal. So you'll forgive me if I'm a bit
> > skeptical of the somewhat grandiose claims of one person that is new
> > to upstream development.
> >
> > As to your specific points - we use xarray lookups in the page cache
> > fast path. It's a bold claim to say this would be too much overhead
> > during swapins.
> >
> > Two, it's not clear to me how you want to make writeback efficient
> > *without* any sort of swap entry redirection. Walking all relevant
> > page tables is expensive; and you have to be able to find them first.
> >
> > If you're talking about a redirection array as opposed to a tree -
> > static sizing of the compressed space is also a no-go. Zswap
> > utilization varies *widely* between workloads and different workload
> > combinations. Further, zswap consumes the same fungible resource as
> > uncompressed memory - there is really no excuse to burden users with
> > static sizing questions about this pool.
>
> I think what Chris's idea is (and Chris correct me if I am wrong), is
> that we use ghost swapfiles (that are not backed by disk space) for
> zswap. So zswap has its own swapfiles, separate from disk swapfiles.
>
> memory.tiers establishes the ordering between swapfiles, so you put
> "ghost" -> "real" to get today's zswap writeback behavior. When you
> writeback, you keep page tables pointing at the swap entry in the ghost
> swapfile. What you do is:
> - Allocate a new swap entry in the "real" swapfile.
> - Update the swap table of the "ghost" swapfile to point at the swap
>   entry in the "real" swapfile, reusing the pointer used for the
>   swapcache.
>
> Then, on swapin, you read the swap table of the "ghost" swapfile, find
> the redirection, and read to the swap table of the "real" swapfile, then
> read the page from disk into the swap cache. The redirection in the
> "ghost" swapfile will keep existing, wasting that slot, until all
> references to it are dropped.
>
> I think this might work for this specific use case, with less overhead
> than the xarray. BUT there are a few scenarios that are not covered
> AFAICT:

Thanks for explaining these issues better than I could :)

>
> - You still need to statically size the ghost swapfiles and their
>   overheads.

Yes.

>
> - Wasting a slot in the ghost swapfile for the redirection. This
>   complicates static provisioning a bit, because you have to account for
>   entries that will be in zswap as well as writtenback. Furthermore,
>   IIUC swap.tiers is intended to be generic and cover other use cases
>   beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
>   SSD when we writeback to the HDD is a much bigger problem.

Yep. We are trying to get away from static provisioning as much as we
can - this design digs us deeper into the hole. Who the hell knows what
the zswap:disk swap split is going to be? It's going to depend on
access patterns and compressibility.

>
> - We still cannot do swapoff efficiently as we need to walk the page
>   tables (and some swap tables) to find and swapin all entries in a
>   swapfile. Not as important as other things, but worth mentioning.

Yeah, I left swapoff out of it, because it is just another use case.
But yes, we can't do swapoff efficiently either.

And in general, it's going to be a very rigid design for more
complicated backend changes (pre-fetching from one tier to another, or
compaction).
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week ago
On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in zswap
> > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
>
> Do you have a link to that proposal?

My 2024 LSF swap pony talk already has a mechanism to redirect page
cache swap entries to different physical locations.
That can also work for redirecting swap entries in different swapfiles.

https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

> My understanding of swap tiers was about grouping different swapfiles
> and assigning them to cgroups. The issue with writeback is relocating
> the data that a swp_entry_t page table refers to - without having to
> find and update all the possible page tables. I'm not sure how
> swap.tiers solve this problem.

swap.tiers is part of the picture. You are right the LPC topic mostly
covers the per cgroup portion. The VFS swap ops are my two slides of
the LPC 2023. You read from one swap file and write to another swap
file with a new swap entry allocated.

> > > This direction is a dead-end. Please take a look at Nhat's swap
> > > virtualization patches. They decouple zswap from disk geometry, while
> > > still supporting writeback to an actual backend file.
> >
> > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > table + swap.tiers design can do that as well. I have concerns about
> > swap virtualization in the aspect of adding another layer of memory
> > overhead addition per swap entry and CPU overhead of extra xarray
> > lookup. I believe my approach is technically superior and cleaner.
> > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > write page ops. I will let Nhat clarify the performance and memory
> > overhead side of the swap virtualization.
>
> I'm happy to discuss it.
>
> But keep in mind that the swap virtualization idea is a collaborative
> product of quite a few people with an extensive combined upstream
> record. Quite a bit of thought has gone into balancing static vs
> runtime costs of that proposal. So you'll forgive me if I'm a bit
> skeptical of the somewhat grandiose claims of one person that is new
> to upstream development.

Collaborating with which companies' developers? How many VS patches have
landed in the kernel? I am also collaborating with different
developers: the cluster-based swap allocator, swap table phase I,
removing the NUMA node swapfile priority. Those were all suggested by me.

> As to your specific points - we use xarray lookups in the page cache
> fast path. It's a bold claim to say this would be too much overhead
> during swapins.

Yes, we just get rid of xarray in swap cache lookup and get some
performance gain from it.
You are saying one extra xarray is no problem, can your team demo some
performance number of impact of the extra xarray lookup in VS? Just
run some swap benchmarks and share the result.

We can do a test right now, without writing back to another SSD:
compare the ghost swapfile with VS for the zswap-only case.

> Two, it's not clear to me how you want to make writeback efficient
> *without* any sort of swap entry redirection. Walking all relevant
> page tables is expensive; and you have to be able to find them first.

Swap cache can have a physical location redirection, see my 2024 LPC
slides. I have considered that way before the VS discussion.
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

> If you're talking about a redirection array as opposed to a tree -
> static sizing of the compressed space is also a no-go. Zswap
> utilization varies *widely* between workloads and different workload
> combinations. Further, zswap consumes the same fungible resource as
> uncompressed memory - there is really no excuse to burden users with
> static sizing questions about this pool.

I do see the swap table + swap.tiers + swap ops doing better. We can
test the memory-only case right now. A head-to-head test of VS and
swap.tiers on the writeback case will need to wait a bit; the swap table
is only at phase II of review.

I mean CPU and per-swap-entry overhead.

I care less about whose idea it is and more about the end-result
performance (memory & CPU). I want the best idea/implementation to
win.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 1 week ago
On Mon, Nov 24, 2025 at 09:24:18PM +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > > The current zswap requires a backing swapfile. The swap slot used
> > > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > > space.
> > > > >
> > > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > Zswap is primarily a compressed cache for real swap on secondary
> > > > storage. It's indeed quite important that entries currently in zswap
> > > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > > it has to work with the primary usecase and support disk writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> >
> > Do you have a link to that proposal?
> 
> My 2024 LSF swap pony talk already has a mechanism to redirect page
> cache swap entries to different physical locations.
> That can also work for redirecting swap entries in different swapfiles.
> 
> https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

I looked through your slides and the LWN article, but it's very hard
for me to find answers to my questions in there.

In your proposal, let's say you have a swp_entry_t in the page
table. What does it describe, and what are the data structures to get
from this key to user data in the following scenarios:

- Data is in a swapfile
- Data is in zswap
- Data is in being written from zswap to a swapfile
- Data is back in memory due to a fault from another page table

> > My understanding of swap tiers was about grouping different swapfiles
> > and assigning them to cgroups. The issue with writeback is relocating
> > the data that a swp_entry_t page table refers to - without having to
> > find and update all the possible page tables. I'm not sure how
> > swap.tiers solve this problem.
> 
> swap.tiers is part of the picture. You are right the LPC topic mostly
> covers the per cgroup portion. The VFS swap ops are my two slides of
> the LPC 2023. You read from one swap file and write to another swap
> file with a new swap entry allocated.

Ok, and from what you wrote below, presumably at this point you would
put a redirection pointer in the old location to point to the new one.

This way you only have the indirection IF such a relocation actually
happened, correct?

But how do you store new data in the freed up old slot?

> > As to your specific points - we use xarray lookups in the page cache
> > fast path. It's a bold claim to say this would be too much overhead
> > during swapins.
> 
> Yes, we just get rid of xarray in swap cache lookup and get some
> performance gain from it.
> You are saying one extra xarray is no problem, can your team demo some
> performance number of impact of the extra xarray lookup in VS? Just
> run some swap benchmarks and share the result.

Average and worst-case for all common usecases matter. There is no
code on your side for the writeback case. (And it's exceedingly
difficult to even get a mental model of how it would work from your
responses and the slides you have linked).

> > Two, it's not clear to me how you want to make writeback efficient
> > *without* any sort of swap entry redirection. Walking all relevant
> > page tables is expensive; and you have to be able to find them first.
> 
> Swap cache can have a physical location redirection, see my 2024 LPC
> slides. I have considered that way before the VS discussion.
> https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/

There are no matches for "redir" in either the email or the slides.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 6 days, 5 hours ago
On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > Do you have a link to that proposal?
> >
> > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > cache swap entries to different physical locations.
> > That can also work for redirecting swap entries in different swapfiles.
> >
> > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
>
> I looked through your slides and the LWN article, but it's very hard
> for me to find answers to my questions in there.

Naturally, the slide is only intended to cover what is in the current
swap table may be phase VII.
But it does have the physical location pointer consideration.

> In your proposal, let's say you have a swp_entry_t in the page
> table. What does it describe, and what are the data structures to get
> from this key to user data in the following scenarios:

Please keep in mind that I don't have every detail design laid out. I
follow the first principles that redirect a swap entry page should
only take an additional 4 byte per swap entry. VS blow up the swap
entry size by something like 24 bytes? I am pretty sure I am wrong
about the exact value; people who are familiar with VS please correct
me. My impression is that it is too far from the first-principles
value for me to even consider it. Exceptions can be made, but not that
far.

I will try my best to answer your questions, but usually I would rather
work with someone who is going to implement it to iron out all the
details. Right now it is a bit too far off.

> - Data is in a swapfile
Same as current.

> - Data is in zswap

I have now realized that what I want from the memory swap tier is
actually not the same as today's zswap. I don't want the current
zswap behavior in swap.tiers. zswap sits in front of every
swapfile, and zswap.writeback does not say which particular swapfile
it wants to write to. That creates problems for including zswap as it
is in the per-memcg swap.tiers. I don't want zswap to use another
swapfile's swap entry and write through to it.

If the data is in the memory-tier swapfile, the swap entry looks up the
actual data without redirection.

> - Data is in being written from zswap to a swapfile
It will look up the swap table and find a physical pointer, which
points to the physical device and offset holding the data.

> - Data is back in memory due to a fault from another page table
In the swap cache similar to today's swapfile.
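
A toy summary of those four answers as a single lookup (the enum and
strings below are purely illustrative, not a real swap table format):

#include <stdio.h>

/* Where a toy "swap table" value can resolve to in the four scenarios above. */
enum toy_where {
	TOY_ON_SWAPFILE,	/* data is in a swapfile: same as today */
	TOY_IN_MEMORY_TIER,	/* data is in the memory-tier swapfile, no redirection */
	TOY_REDIRECTED,		/* written back: physical device + offset pointer */
	TOY_IN_SWAPCACHE,	/* already faulted back in: folio in the swap cache */
};

static const char *toy_resolve(enum toy_where w)
{
	switch (w) {
	case TOY_ON_SWAPFILE:
		return "read the slot from the swap device, same as today";
	case TOY_IN_MEMORY_TIER:
		return "look up the compressed data directly, no redirection";
	case TOY_REDIRECTED:
		return "follow the physical pointer to device + offset";
	case TOY_IN_SWAPCACHE:
		return "use the folio already in the swap cache";
	}
	return "unknown";
}

int main(void)
{
	for (int w = TOY_ON_SWAPFILE; w <= TOY_IN_SWAPCACHE; w++)
		printf("%d: %s\n", w, toy_resolve((enum toy_where)w));
	return 0;
}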

> > > My understanding of swap tiers was about grouping different swapfiles
> > > and assigning them to cgroups. The issue with writeback is relocating
> > > the data that a swp_entry_t page table refers to - without having to
> > > find and update all the possible page tables. I'm not sure how
> > > swap.tiers solve this problem.
> >
> > swap.tiers is part of the picture. You are right the LPC topic mostly
> > covers the per cgroup portion. The VFS swap ops are my two slides of
> > the LPC 2023. You read from one swap file and write to another swap
> > file with a new swap entry allocated.
>
> Ok, and from what you wrote below, presumably at this point you would
> put a redirection pointer in the old location to point to the new one.

The swap entry front end (which also owns the swap cache) points to a
physical location.
>
> This way you only have the indirection IF such a relocation actually
> happened, correct?

Right. The more common case needs no indirection at all.

> But how do you store new data in the freed up old slot?
That is the front end swap entry vs physical back end split.
The front-end swap entry can't be freed until all users release the swap count.
The physical back end can be freed. The free physical blocks caused by
redirection will likely use a different allocator, not the cluster-based
swap allocator, because those are just pure blocks.

>
> > > As to your specific points - we use xarray lookups in the page cache
> > > fast path. It's a bold claim to say this would be too much overhead
> > > during swapins.
> >
> > Yes, we just get rid of xarray in swap cache lookup and get some
> > performance gain from it.
> > You are saying one extra xarray is no problem, can your team demo some
> > performance number of impact of the extra xarray lookup in VS? Just
> > run some swap benchmarks and share the result.
>
> Average and worst-case for all common usecases matter. There is no
> code on your side for the writeback case. (And it's exceedingly
> difficult to even get a mental model of how it would work from your
> responses and the slides you have linked).

As I said, that slide is only intended to explain, for swap table phase
VII, how physical redirection works with the swap cache.
swap.tiers defines tiers for swap; obviously, how to move data
between the tiers is a natural consideration. I mentioned that in two
slides of the 2023 talk.

I don't plan that level of detail that far ahead. I try to follow
first principles as best I can. There will be a lot of decisions
made only in the later phases.

> > > Two, it's not clear to me how you want to make writeback efficient
> > > *without* any sort of swap entry redirection. Walking all relevant
> > > page tables is expensive; and you have to be able to find them first.
> >
> > Swap cache can have a physical location redirection, see my 2024 LPC
> > slides. I have considered that way before the VS discussion.
> > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
>
> There are no matches for "redir" in either the email or the slides.

Yes, I use a different term in the slides. The continuous entry is the
source of the redirection and the non-continuous one is the destination.
But in my mind I am not redirecting swap entries; the swap entry may
have an optional physical location pointer. That is the swap entry
front end and physical layer split.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 6 days, 3 hours ago
On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote:
> On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > Do you have a link to that proposal?
> > >
> > > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > > cache swap entries to different physical locations.
> > > That can also work for redirecting swap entries in different swapfiles.
> > >
> > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
> >
> > I looked through your slides and the LWN article, but it's very hard
> > for me to find answers to my questions in there.
> 
> Naturally, the slide is only intended to cover what is in the current
> swap table may be phase VII.
> But it does have the physical location pointer consideration.
> 
> > In your proposal, let's say you have a swp_entry_t in the page
> > table. What does it describe, and what are the data structures to get
> > from this key to user data in the following scenarios:
> 
> Please keep in mind that I don't have every detail design laid out. I
> follow the first principles that redirect a swap entry page should
> only take an additional 4 byte per swap entry. VS blow up the swap
> entry size by something like 24 bytes?

Nhat can lay this out in more detail, but there isn't much new stuff
in the virtual swap descriptor. It's mostly just a consolidation of
state we currently track elsewhere - swap count, swapcache pointer,
cgroup ownership etc.

The actual indirection is just a word for the backend type,offset.

That indirection is the tradeoff for swapped pages. In turn you're
getting back all that other stuff for swap slots that *aren't*
currently used. This is a win for the vast majority of users.

Since you mentioned first principles - the dynamically sized swap
space is also much more suitable for compressed pools, which are the
dominant form of swap setups nowadays. Again a win for the majority.

And the worst-case is reasonable. I don't see the giant gulf you seem
to see there. I don't know where it's supposed to be coming from.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 5 days, 5 hours ago
On Wed, Nov 26, 2025 at 1:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Do you have a link to that proposal?
> > > >
> > > > My 2024 LSF swap pony talk already has a mechanism to redirect page
> > > > cache swap entries to different physical locations.
> > > > That can also work for redirecting swap entries in different swapfiles.
> > > >
> > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
> > >
> > > I looked through your slides and the LWN article, but it's very hard
> > > for me to find answers to my questions in there.
> >
> > Naturally, the slide is only intended to cover what is in the current
> > swap table may be phase VII.
> > But it does have the physical location pointer consideration.
> >
> > > In your proposal, let's say you have a swp_entry_t in the page
> > > table. What does it describe, and what are the data structures to get
> > > from this key to user data in the following scenarios:
> >
> > Please keep in mind that I don't have every detail design laid out. I
> > follow the first principles that redirect a swap entry page should
> > only take an additional 4 byte per swap entry. VS blow up the swap
> > entry size by something like 24 bytes?
>
> Nhat can lay this out in more detail, but there isn't much new stuff

Please make sure Nhat does. It shouldn't be a complicated question.

> in the virtual swap descriptor. It's mostly just a consolidation of
> state we currently track elsewhere - swap count, swapcache pointer,
> cgroup ownership etc.

All of those will fold into swap table values at later phases. So in this
regard, the swap table is not just keeping the status quo; it is more
aggressive in conserving memory. If I recall correctly, VS uses atomics
for the counters? That will blow up the 1-byte counter to 4 bytes.

> The actual indirection is just a word for the backend type,offset.

Sure.

>
> That indirection is the tradeoff for swapped pages. In turn you're
> getting back all that other stuff for swap slots that *aren't*
> currently used. This is a win for the vast majority of users.

The swap table does those as well, in the later phases.

>
> Since you mentioned first principles - the dynamically sized swap
> space is also much more suitable for compressed pools, which are the
> dominant form of swap setups nowadays. Again a win for the majority.

Sure, the swap table does that, especially after the swap cgroup and
swap count fold into the swap table.

> And the worst-case is reasonable. I don't see the giant gulf you seem
> to see there. I don't know where it's supposed to be coming from.

Let Nhat confirm the per-swap-entry overhead and let's compare it with
the swap table's fully final form.
Another easy way is to just run some benchmarks to see how much overhead
VS introduces.

That being said, I think I have answered enough technical questions about
my approach for you to reconsider my proposal. You should be able to
see by now that my approach is more optimal than VS. Do you
agree or not? We are just arguing about how big the gap is.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 5 days, 3 hours ago
On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> 
> That being said, I think I have answered enough technical questions
> of
> my approach, to let you re-consider my proposal. You should be able
> to
> realize by now my approach is more optimal compared to VS. Do you
> agree or not? We are just arguing how big the gap that is.
> 

We would have much more confidence in your
solution if you had told us exactly how
you were planning to solve things in future
stages of the project.

A "I'll solve it, but I can't tell you how"
is not very confidence inspiring.

-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 4 days, 23 hours ago
On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> wrote:
>
> On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> >
> > That being said, I think I have answered enough technical questions
> > of
> > my approach, to let you re-consider my proposal. You should be able
> > to
> > realize by now my approach is more optimal compared to VS. Do you
> > agree or not? We are just arguing how big the gap that is.
> >
>
> We would have much more confidence in your
> solution if you had told us exactly how
> you were planning to solve things in future
> stages of the project.

Can you clarify who "we" is? Sorry, I am not part of your Meta kernel
team circle. I just replied to you and others about how to solve the
other things. If you have further questions, please ask a clarifying
question. Until you ask, I don't know which part of the Swap Pony plan
needs more clarification.

> A "I'll solve it, but I can't tell you how"
> is not very confidence inspiring.

There is no need for this kind of innuendo, and it is not helping.
Please stay on the technical side of the discussion and try not to
project personal judgement, thanks.

Please keep in mind that I am just one person who loves kernel hacking
and wants to do the right thing. I am doing this in my spare time;
working on upstream swap has not been part of my company OKRs for the
last two years. I don't get paid to do this. I am replying to this email
from my vacation at 5am in the morning.

Again, let's stay technical. If you think I am holding back any secret (I
am not), please just ask a clarifying question.

Thanks for your cooperation, and sorry that I did not have a chance to
explain things better earlier.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 4 days, 22 hours ago
On Thu, 2025-11-27 at 05:52 +0400, Chris Li wrote:
> On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote:
> > > 
> > > That being said, I think I have answered enough technical
> > > questions
> > > of
> > > my approach, to let you re-consider my proposal. You should be
> > > able
> > > to
> > > realize by now my approach is more optimal compared to VS. Do you
> > > agree or not? We are just arguing how big the gap that is.
> > > 
> > 
> > We would have much more confidence in your
> > solution if you had told us exactly how
> > you were planning to solve things in future
> > stages of the project.
> 
> Can you clarify who is "We", 

Sorry, I am talking about upstream.

When one developer has code, and somebody else emails
the equivalent of "trust me, bro", the code is usually
preferred.

> 
> Please keep in mind that I am just one person love kernel hacking and
> want to do the right things. I am doing this at my spare time, it is
> not part of my company OKR's to work on upstream swap in the last two
> years. I don't get pay to do this. I am replying this email from my
> vacation 5am in the morning.
> 
> Again, let's stay technical. If you think I am holding any secret (I
> am not ), please just ask a clarify question.

I really appreciate anybody participating in Linux
kernel development. Linux is good because different
people bring different perspectives to the table.

Some real numbers, even if just back of the envelope 
math to estimate the overhead of various ideas being
proposed, are often a good way to move a discussion 
along in a productive direction.

Let me reply to your other email with some more
technical details.

-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 4 days, 5 hours ago
On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
>
> Sorry, I am talking about upstream.

So far I have not had a pleasant upstream experience submitting
this particular patch.

> I really appreciate anybody participating in Linux
> kernel development. Linux is good because different
> people bring different perspectives to the table.

Of course everybody is welcome. However, a NACK without technical
justification is very bad for upstream development. I can't imagine
what a new hacker would think after going through what I have gone
through for this patch. He/she would likely quit contributing upstream.
This is not the kind of welcome we want.

Nhat needs to be able to technically justify his NACK as a maintainer.
Sorry, there is no other way to sugarcoat it.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 3 days, 4 hours ago
On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> >
> > Sorry, I am talking about upstream.
>
> So far I have not had a pleasant upstream experience when submitting
> this particular patch to upstream.
>
> > I really appreciate anybody participating in Linux
> > kernel development. Linux is good because different
> > people bring different perspectives to the table.
>
> Of course everybody is welcome. However, NACK without technical
> justification is very bad for upstream development. I can't imagine
> what a new hacker would think after going through what I have gone
> through for this patch. He/she will likely quit contributing upstream.
> This is not the kind of welcome we want.
>
> Nhat needs to be able to technically justify his NACK as a maintainer.
> Sorry there is no other way to sugar coat it.

I am NOT the only zswap maintainer who has expressed concerns. Other
people also have their misgivings, so I have let them speak rather than
put words in their mouths.

But since you have repeatedly singled me out, I will repeat my concerns here:

1. I don't like the operational overhead of a static swapfile (having to
statically size the zswap swapfile for each <host x workload>
combination). Misspecifying the swapfile size can lead to unacceptable
swap metadata overhead on small machines, or underutilization of zswap
on big machines. And it is *impossible* to know how much zswap will be
needed ahead of time, even if we fix the host - it depends on workload
access patterns, memory compressibility, and latency/memory pressure
tolerance.

2. I don't like the maintenance overhead (supporting a special
infrastructure for a very specific use case, i.e. no-writeback),
especially since I'm not convinced this can be turned into a general
architecture. See below.

3. I want to move us towards a more dynamic architecture for zswap.
This is a step in the WRONG direction.

4. I don't believe this buys us anything we can't already do with
userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only
swap option here), with writeback disabled, is 2-3 lines of script.

I believe I have already justified myself well enough :) It is you who
have not really convinced me that this is, at the very least, a
temporary/first step towards a long-term generalized architecture for
zswap. Every time we pointed out an issue, you seemed to justify it with
more vague ideas that deepen the confusion.

Let's recap the discussion so far:

1. We claimed that this architecture is hard to extend for efficient
zswap writeback, or backend transfer in general, without incurring
page table updates. You claim you plan to implement a redirection
entry to solve this.

2. We then pointed out that inserting a redirect entry into the current
physical swap infrastructure will leave holes in the upper swap tier's
address space, which is arguably *worse* than the current status quo
of zswap occupying disk swap space. Again, you pulled out some vague
ideas about "frontend" and "backend" swap, which, frankly, are
conceptually very similar to swap virtualization.

3. The dynamicization of swap space is treated with the same rigor
(or, more accurately, lack thereof). Just more handwaving about the
"frontend" vs "backend" (which, again, is very close to swap
virtualization). This requirement is a deal breaker for me - see
requirement 1 above again.

4. We also pointed out your lack of thought on swapoff optimization,
which, again, seems to be missing from your design. Again, more
vagueness about rmap, which is probably more overhead.

Look man, I'm not being hostile to you. Believe me on this - I respect
your opinion, and I'm working very hard on reducing the memory overhead
of virtual swap, to see if I can meet you where you want it to be.
The inefficient memory usage in the RFC's original design was due to:

a) Readability. Space optimization can make code hard to read, when
fields are squeezed into the same int/long variable. So I just used a
separate field for each piece of metadata.

b) I was playing with synchronization optimizations, i.e. using atomics
instead of locks, and using per-entry locks. But I can go back to
using a per-cluster lock (I hadn't implemented the cluster allocator at
the time of the RFC, but in my latest version I have), which will
further reduce the memory overhead by removing a couple of
fields / packing more fields.

The only non-negotiable per-swap-entry overhead will be a field to
indicate the backend location (physical swap slot, zswap entry, etc.)
+ 2 bits to indicate the swap type. With some field union-ing magic,
or pointer tagging magic, we can perhaps squeeze it even harder.
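
For concreteness, a rough sketch of what that packing could look like
(purely illustrative, not the actual RFC code; all names here are made up):

/* Illustrative only: one word per virtual swap entry, with the backend
 * location in the upper bits and a 2-bit backend type tag in the lowest
 * bits, as described above.
 */
enum vswap_backend {
	VSWAP_DISK,	/* physical swap slot */
	VSWAP_ZSWAP,	/* compressed in-memory entry */
	VSWAP_FOLIO,	/* still in the swap cache */
	VSWAP_NONE,
};

#define VSWAP_TYPE_BITS	2
#define VSWAP_TYPE_MASK	((1UL << VSWAP_TYPE_BITS) - 1)

static inline unsigned long vswap_pack(enum vswap_backend type,
				       unsigned long offset)
{
	return (offset << VSWAP_TYPE_BITS) | type;
}

static inline enum vswap_backend vswap_type(unsigned long desc)
{
	return desc & VSWAP_TYPE_MASK;
}

static inline unsigned long vswap_offset(unsigned long desc)
{
	return desc >> VSWAP_TYPE_BITS;
}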

I'm also working on reducing the CPU overhead - re-partitioning swap
architectures (swap cache, zswap tree), reducing unnecessary xarray
lookups where possible.

We can then benchmark, and attempt to optimize it together as a community.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 2 days, 4 hours ago
On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > >
> > > Sorry, I am talking about upstream.
> >
> > So far I have not had a pleasant upstream experience when submitting
> > this particular patch to upstream.
> >
> > > I really appreciate anybody participating in Linux
> > > kernel development. Linux is good because different
> > > people bring different perspectives to the table.
> >
> > Of course everybody is welcome. However, NACK without technical
> > justification is very bad for upstream development. I can't imagine
> > what a new hacker would think after going through what I have gone
> > through for this patch. He/she will likely quit contributing upstream.
> > This is not the kind of welcome we want.
> >
> > Nhat needs to be able to technically justify his NACK as a maintainer.
> > Sorry there is no other way to sugar coat it.
>
> I am NOT the only zswap maintainer who expresses concerns. Other
> people also have their misgivings, so I have let them speak and not
> put words in their mouths.

You did not mention the fact that both NACKs from zswap maintainers
are from the same company. I assume you have some kind of team sync.
There is a term for that: "persons acting in concert".

What I mean by "technically unjustifiable" is that the VS patch series
is a non-starter for merging into mainline.
In this email you suggest the per-swap-slot memory overhead is 48
bytes, down from the previous 64 bytes:

https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/

Do you have a newer VS that significantly reduces that? If so, what is
the new number?

The starting point before your VS is 11 bytes (3 bytes static, 8 bytes
dynamic). 48 bytes is more than 4x the original size.
This will have a huge impact on deployments that use a lot of swap.
The worst part is that once your VS series is in the kernel, that
overhead is always on; it forces the overhead even if redirection is
not used. This would hurt Google's fleet very badly if deployed: for
the same jobs, the kernel memory consumption would jump up and fail
jobs. Everybody's kernel that uses swap will suffer because it is
always on. The alternative, the swap table, uses much less overhead.
So your VS leaves money on the table.
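
Back of the envelope, assuming 4 KiB pages and the per-slot numbers above:

	1 GiB of swap  = 262144 slots
	48 bytes/slot -> 48 * 262144 = ~12 MiB of metadata per GiB of swap
	11 bytes/slot -> 11 * 262144 = ~2.75 MiB of metadata per GiB of swap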

So I consider your VS a non-starter. I repeatedly call you out
because you keep dodging this critical question; Johannes referred to
you for the exact overhead value as well. Dodging critical questions
makes a technical debate very difficult to conduct and makes driving it
to a conflict resolution impossible. BTW, this is my big concern about
the 2023 swap abstraction talk which your VS is based on. The community
feedback at the time strongly favored my solution. I don't understand
why you rebooted a solution the community did not favor without
addressing those concerns.

The other part of the bad experience is that you NACK first, then ask
clarifying questions later. The proper order is the other way around:
you should fully understand the subject BEFORE you NACK it. A NACK is
very serious business.

I did try my best to answer the clarifying questions from your team. I
appreciate that Johannes and Yosry asked for clarification to advance
the discussion. I did not see more questions from them, so I assume they
got what they wanted to know. If you still feel something is missing,
you should ask a follow-up question about the part where you need
more clarification. We can repeat this until you understand. You keep
using the phrase "hand waving" as if I am faking it. That is FUD.
Communication is a two-way street. I can't force you to understand, but
asking more questions can help you. This is a complex problem. I am
confident I can explain it to Kairui and he will understand, because he
has a lot more context, not because I am faking it. Ask nicely so I
can answer nicely. Please stay on the technical side of the discussion.

So I consider using VS to NACK my patch technically unjustifiable.
Your current VS with its 48-byte overhead is not usable at all in a
standard upstream kernel. Can we agree on that?

As we all know, using less memory for the same functionality is a lot
harder than using more. If you want to dramatically reduce the memory
usage, you will likely need to rebuild the whole patch series from
scratch. It might force you to use a solution similar to the swap table;
in that case, why not join team swap table? We can reopen the topic by
then if you have a newer VS that:
1) addresses the per-swap-slot memory overhead, ideally close to the
first-principles value;
2) makes the overhead optional: if redirection is not used, preferably
the overhead is not paid;
3) shows value incrementally, not all or nothing.

Sorry, this email is getting very long and I have very limited time.
Let's discuss one topic at a time. I would like to conclude that the
current VS is not a viable option as of now. I can reply to the other
parts of your email once we get the VS out of the way.

Best Regards,

Chris




Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham an hour ago
On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > >
> > > > Sorry, I am talking about upstream.
> > >
> > > So far I have not had a pleasant upstream experience when submitting
> > > this particular patch to upstream.
> > >
> > > > I really appreciate anybody participating in Linux
> > > > kernel development. Linux is good because different
> > > > people bring different perspectives to the table.
> > >
> > > Of course everybody is welcome. However, NACK without technical
> > > justification is very bad for upstream development. I can't imagine
> > > what a new hacker would think after going through what I have gone
> > > through for this patch. He/she will likely quit contributing upstream.
> > > This is not the kind of welcome we want.
> > >
> > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > Sorry there is no other way to sugar coat it.
> >
> > I am NOT the only zswap maintainer who expresses concerns. Other
> > people also have their misgivings, so I have let them speak and not
> > put words in their mouths.
>
> You did not mention the fact that both two NACK from zswap maintainers
> are from the same company. I assume you have some kind of team sync.
> There is a term for that, called "person acting in concert".

I mean, Yosry pointed out issues with your approach too. Yosry is from
your company, no?

The issues I pointed out have all been technical, thus far. I never
even brought up Meta - I'm sure other parties have the same issues.

>
> What I mean in "technically unjustifiable" is that VS patch series is
> a non-starter to merge into mainline.
> In this email you suggest the per swap slot memory overhead is 48
> bytes previously 64 bytes.
>
> https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/
>
> Do you have newer VS that significantly reduce that? If so, what is
> the new number?
>
> The starting point before your VS is 11 bytes (3 bytes static, 8 bytes
> dynamic). 48bytes is more than 4x the original size.
> This will have a huge impact on the deployment that uses a lot of
> swap. The worst part is that once your VS series is in the kernel.
> That overhead is always on, it is forcing the overhead even if the
> redirection is not used. This will hurt Google's fleet very badly if
> deployed. Because of the same jobs, the kernel memory consumption will
> jump up and fail jobs. Every body's kernel who use swap will suffer
> because it is always on. The alternative, the swap table, uses much
> less overhead. So your VS leave money on the table.
>
> So I consider your VS is a non-starter. I repeatedly call you out
> because you keep dodging this critical question. Johannes refers to
> you for the detail value of the overhead as well.  Dodging critical
> questions makes a technical debate very difficult to conduct and drive
> to a conflict resolution impossible. BTW, this is my big concern on
> the 2023 swap abstraction talk which our VS is based on. The community
> feedback at the time strongly favored my solution. I don't understand
> why you reboot the community un-favored solution without addressing
> those concerns.

I rebooted the VS work because I have not seen any indication that your
design could solve the problems I believe are principal for any swap
architecture: dynamicization of swap space and efficient backend
transfer, to name two.

>
> The other part of the bad experience is that you NACK first then ask
> clarifying questions later. The proper order is the other way around.
> You should fully understand the subject BEFORE you NACK on it. NACK is
> a very serious business.
>
> I did try my best to answer clarification question from your team. I
> appreciate that Johannes and Yosry ask clarification to advance the
> discussion. I did not see more question from them I assume they got
> what they want to know. If you still feel something is missing out,
> you should ask a follow up question for the part in which you need
> more clarification. We can repeat until you understand. You keep using
> the phrase "hand waving" as if I am faking it. That is FUD.
> Communication is a two way street. I can't force you to understand,
> asking more questions can help you. This is complex problem. I am
> confident I can explain to Kairui and he can understand, because he
> has a lot more context, not because I am faking it. Ask nicely so I
> can answer nicely. Stay in the technical side of the discussion
> please.
>
> So I consider using VS to NACK my patch is technically unjustifiable.

I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing the
ghost swapfile because of the technical requirements I pointed out above.
Virtual swap happens to neatly solve all of them, by design, from
first principles. I never ruled out the possibility of another design
that would satisfy all of them - I just have not seen enough from you to
believe otherwise.

I don't believe a static ghost swapfile is it. In fact, you CAN
theoretically implement virtual swap with a ghost swapfile as well.
The staticity will just make it operationally untenable. The next step
would be to dynamicize the swap infrastructure, at which point we
arrive back at the original VS design.

I see the same thing playing out in your responses as well, with the
redirection entry, then the frontend/backend swap space. It's starting
to eerily resemble virtual swap. Or maybe you can clarify?

> Your current VS with 48 byte overhead is not usable at all as an
> standard upstream kernel. Can we agree to that?

Sure, which is why I sent it as an RFC and not as an actual patch
series pending merging :) Its main purpose was to demonstrate how a
feature-complete virtual swap subsystem might behave in all of the code
paths of the memory subsystem. I can then optimize the fields piecemeal,
while weighing the tradeoffs (such as lock granularity vs. lock-field
memory overhead). You and Kairui are welcome to criticize, comment, and
help me optimize it, as Yosry and Johannes did in the past.

>
> As we all know, using less memory to function the same is a lot harder
> than using more. If you can dramatically reduce the memory usage, you

I don't necessarily disagree.

I would, however, like to point out that the reverse is true too -
you can't necessarily compare the overhead of two designs where one
achieves a lot more in terms of features and/or operational goals than
the other.

> likely need to rebuild the whole patch series from scratch. If might
> force you to use solution similar to swap table, in that case why not
> join team swap table?

Because even with the current swap table design, the allocator is
*still* static.

I would LOVE to use the current physical swap allocation
infrastructure. It just doesn't work in its current state.

> We can reopen the topic again by then if you have a newer VS:

Sure.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Johannes Weiner 8 hours ago
On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote:
> On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > >
> > > > Sorry, I am talking about upstream.
> > >
> > > So far I have not had a pleasant upstream experience when submitting
> > > this particular patch to upstream.
> > >
> > > > I really appreciate anybody participating in Linux
> > > > kernel development. Linux is good because different
> > > > people bring different perspectives to the table.
> > >
> > > Of course everybody is welcome. However, NACK without technical
> > > justification is very bad for upstream development. I can't imagine
> > > what a new hacker would think after going through what I have gone
> > > through for this patch. He/she will likely quit contributing upstream.
> > > This is not the kind of welcome we want.
> > >
> > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > Sorry there is no other way to sugar coat it.
> >
> > I am NOT the only zswap maintainer who expresses concerns. Other
> > people also have their misgivings, so I have let them speak and not
> > put words in their mouths.
> 
> You did not mention the fact that both two NACK from zswap maintainers
> are from the same company. I assume you have some kind of team sync.
> There is a term for that, called "person acting in concert".

For the benefit of anybody following this from the sidelines, the
third zswap maintainer also expressed concerns about Chris's proposal
upthread. He works for the same company as Chris.

The reality is that Chris is failing to convince others of his design
direction, and is now obviously resorting to manipulation and ad hominem
attacks.

During the course of this thread, Chris has asked for "a little faith"
that his idea will work for all stated requirements, without deeming
it necessary to explain how.

When probed on technical details, he stated that he doesn't like to
plan that far ahead, and prefers having somebody else iron out the
implementation details. He also referred to high-level slides from his
LSFMM '24 session - which was received thusly[1]:

  Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".

  Jan Kara said that existing filesystem designs are not suited to this task

  Hildenbrand said that this plan was introducing too much complexity

His first response to criticism was to invoke his <4 week status as
swap maintainer.

Meanwhile, the design direction that Chris is construing as a single
company conspiracy is anything but. The collaborative origins of these
patches are well documented. Chris was CC'd on those RFCs. He notably
did not engage in them. He is now lying about the narrative and
choosing to attack these patches in bad faith and out of context.

This pattern of behavior gives me low confidence that Chris is able to
collaborate and compromise on a design that works for all users.

And while Chris has been quite vocal and opinionated in mailing list
discussions, his actual code contributions to the kernel do not
instill confidence that he can solve this problem by himself, either.

[1] https://lwn.net/Articles/974587/
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Kairui Song 5 hours ago
On Tue, Dec 2, 2025 at 12:47 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote:
> > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > > >
> > > > > Sorry, I am talking about upstream.
> > > >
> > > > So far I have not had a pleasant upstream experience when submitting
> > > > this particular patch to upstream.
> > > >
> > > > > I really appreciate anybody participating in Linux
> > > > > kernel development. Linux is good because different
> > > > > people bring different perspectives to the table.
> > > >
> > > > Of course everybody is welcome. However, NACK without technical
> > > > justification is very bad for upstream development. I can't imagine
> > > > what a new hacker would think after going through what I have gone
> > > > through for this patch. He/she will likely quit contributing upstream.
> > > > This is not the kind of welcome we want.
> > > >
> > > > Nhat needs to be able to technically justify his NACK as a maintainer.
> > > > Sorry there is no other way to sugar coat it.
> > >
> > > I am NOT the only zswap maintainer who expresses concerns. Other
> > > people also have their misgivings, so I have let them speak and not
> > > put words in their mouths.
> >
> > You did not mention the fact that both two NACK from zswap maintainers
> > are from the same company. I assume you have some kind of team sync.
> > There is a term for that, called "person acting in concert".
>
> For the benefit of anybody following this from the sidelines, the
> third zswap maintainer also expressed concerns about Chris's proposal
> upthread. He works for the same company as Chris.
>
> The reality is that Chris is failing to convince others of his design
> direction, and is now obviously resorting to manipulation and hominem
> attacks.
>
> During the course of this thread, Chris has asked for "a little faith"
> that his idea will work for all stated requirements, without deeming
> it necessary to explain how.
>
> When probed on technical details, he stated that he doesn't like to
> plan that far ahead, and prefers having somebody else iron out the
> implementation details. He also referred to high-level slides from his
> LSFMM '24 session - which was received thusly[1]:
>
>   Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".
>
>   Jan Kara said that existing filesystem designs are not suited to this task
>
>   Hildenbrand said that this plan was introducing too much complexity
>
> His first response to criticism was to invoke his <4 week status of
> swap maintainer.
>
> Meanwhile, the design direction that Chris is construing as a single
> company conspiracy is anything but. The collaborative origins of these
> patches are well documented. Chris was CC'd on those RFCs. He notably
> did not engage in them. He is now lying about the narrative and
> choosing to attack these patches in bad faith and out of context.
>
> This pattern of behavior gives me low confidence that Chris is able to
> collaborate and compromise on a design that works for all users.
>
> And while Chris has been quite vocal and opinionated in mailing list
> discussions, his actual code contributions to the kernel do not
> instill confidence that he can solve this problem by himself, either.

Hi all,

I’d really prefer we all let things cool off a bit before the thread
gets too dramatic. :)

Sorry to see that the discussion went quite off topic; still, I believe
there is some kind of misunderstanding about Chris' intention to improve
the kernel in a more generic way.

From my perspective, Chris did co-develop, suggest, review or
author many of the implementation details around the swap-table
idea, and he implemented the swap cluster allocator in 6.11, which
unlocked a bunch of follow-on optimizations.

I’ve been working on swap for a while as well and have rewritten and
refactored large parts of swap, the swap allocator and the swap cache
(mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah,
I’m not a kernel vet with decades of patches yet, but I do think I'm
familiar enough with swap. I think Chris' work, words or code, has
looked good in the end results.

It's hard to put a penthouse on a sandcastle, and maybe that's the
reason it is hard to describe or lay out the further implementation
of swap.

We have all struggled with the swap subsystem a lot; the code base
served us well, but it accumulated a lot of historical complexity and
awkward workarounds over time (we have had so many people in the
community complaining about it for so many years). I think we all agree
that pursuing incremental cleanups and improvements (e.g. the swap slot
cache cleanup, swap lock cleanup, swap_has_cache cleanup,
direct-swap workarounds removal, etc.) is more suitable upstream.
Chris also helped a lot with this (e.g. the LPC talk last year) and we
finally got rid of many long-standing burdens; quite a few of these
works were directly enabled by his swap allocator rework.

And I do have a more complete branch, which I have posted several times,
showing that the end result of swap tables has better memory consumption
& performance, and the code is much simpler than what we have
upstream. It's getting merged step by step, and each step is a gain. I
believe that is the right way to improve things upstream: everyone and
every workload benefits, and progressively. And based on that, we will
be able to implement things much more easily.

I believe things will look much better and cleaner as we progress (e.g.
resizing might be doable for generic swap too), which will make it
easier for all of us and make the swap subsystem better in a
collaborative way.

Cheers.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 1 week ago
On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> wrote:
> > 
> > 
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in
> > zswap
> > don't occupy disk slots; but for a solution to this to be
> > acceptable,
> > it has to work with the primary usecase and support disk writeback.
> 
> Well, my plan is to support the writeback via swap.tiers.
> 
How would you do writeback from a zswap entry in
a ghost swapfile, to a real disk swap backend?

That is the use case people are trying to solve.

How would your architecture address it?

-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week ago
On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> > wrote:
> > >
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in
> > > zswap
> > > don't occupy disk slots; but for a solution to this to be
> > > acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
> >
> How would you do writeback from a zswap entry in
> a ghost swapfile, to a real disk swap backend?

Basically, each swapfile has its own version of the swap
ops->{read,write}_folio(). The mem swap tier is similar to the current
zswap, but it is memory only: there is no file backing and it doesn't
share swap entries with the real swapfile.

When writing back from one swap entry to another swapfile, for the
simple case of decompressing the data, the data will be stored in the
swap cache and written to the other swapfile with a newly allocated swap
entry. The front end of the swap cache will have the option to map the
front-end swap entry offset to the back-end block location, at the
memory price of 4 bytes per swap entry.
This kind of physical block redirection does not only happen across
more than one swapfile; it can happen within the same swapfile, in the
situation where there is available space for lower-order swap entries
but a higher-order one cannot be allocated because those lower-order
slots are not contiguous. In such a case, the swapfile can extend the
high-order swap entry beyond the end of the current physical swapfile,
then map the two contiguous high-order swap entries onto the low-order
physical locations. I have some slides I shared in the 2024 LSF swap
pony talk with some diagrams for that physical swap location redirection.
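
As a rough sketch of the redirection idea (illustrative only, nothing
like this is implemented yet; the names are invented): the front end
would keep an optional array of 32-bit backend locations, allocated only
when a swapfile actually starts redirecting, so non-redirecting
swapfiles pay nothing:

#include <linux/types.h>

/*
 * Illustrative sketch of the optional 4-byte-per-entry redirection
 * table described above; not actual kernel code.
 */
struct swap_redirect {
	u32 *slots;		/* backend location per front-end slot, 0 == not redirected */
	unsigned long nr_slots;	/* only allocated once redirection is used */
};

static u32 swap_backend_location(struct swap_redirect *rd, unsigned long offset)
{
	if (!rd->slots || offset >= rd->nr_slots)
		return 0;	/* no table: the front-end offset is the physical location */
	return rd->slots[offset];
}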

> That is the use case people are trying to solve.

Yes, me too.

> How would your architecture address it?

The cluster-based swap allocator, the swap table as the new swap cache,
per-cgroup swap.tiers and the VFS-like swap ops all work together
integrally as the grand vision for the new swap system. I might not have
an answer for all the design details right now. I am the type of
person who likes to improvise and adjust the design details as more
detailed design constraints are found. So far I have found this design
works well. Some of the early milestones, the swap allocator and the
swap table, have already landed in the kernel and show great results.

I consider this much better than VS (the previous swap abstraction).
It does not enforce pain like VS does. One of the big downsides of
VS is that, once applied to the kernel, even normal swap that does not
use redirection will pay the price for it as well. The pain is
mandatory. My swap.tiers writeback does not have this problem: if there
is no writeback and no redirection of physical blocks, no additional
overhead is paid in either memory or CPU.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Rik van Riel 1 week ago
On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> wrote:
> > 
> > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > <hannes@cmpxchg.org>
> > > wrote:
> > > > 
> > > > 
> > > > Zswap is primarily a compressed cache for real swap on
> > > > secondary
> > > > storage. It's indeed quite important that entries currently in
> > > > zswap
> > > > don't occupy disk slots; but for a solution to this to be
> > > > acceptable,
> > > > it has to work with the primary usecase and support disk
> > > > writeback.
> > > 
> > > Well, my plan is to support the writeback via swap.tiers.
> > > 
> > How would you do writeback from a zswap entry in
> > a ghost swapfile, to a real disk swap backend?
> 
> Basically, each swap file has its own version swap
> ops->{read,write}_folio(). The mem swap tier is similar to the
> current
> zswap but it is memory only, there is no file backing and don't share
> swap entries with the real swapfile.
> 
> When writing back from one swap entry to another swapfile, for the
> simple case of uncompressing the data, data will store to swap cache
> and write to another swapfile with allocated another swap entry. The
> front end of the swap cache will have the option map the front end
> swap entry offset to the back end block locations. At the memory
> price
> of 4 byte per swap entry.

Wait, so you use the swap cache radix tree to
indicate the physical location of data between
multiple swap devices?

Isn't that exactly what the vswap approach
does, too?

How is this different?

-- 
All Rights Reversed.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week ago
On Mon, Nov 24, 2025 at 8:43 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> > wrote:
> > >
> > > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > > <hannes@cmpxchg.org>
> > > > wrote:
> > > > >
> > > > >
> > > > > Zswap is primarily a compressed cache for real swap on
> > > > > secondary
> > > > > storage. It's indeed quite important that entries currently in
> > > > > zswap
> > > > > don't occupy disk slots; but for a solution to this to be
> > > > > acceptable,
> > > > > it has to work with the primary usecase and support disk
> > > > > writeback.
> > > >
> > > > Well, my plan is to support the writeback via swap.tiers.
> > > >
> > > How would you do writeback from a zswap entry in
> > > a ghost swapfile, to a real disk swap backend?
> >
> > Basically, each swap file has its own version swap
> > ops->{read,write}_folio(). The mem swap tier is similar to the
> > current
> > zswap but it is memory only, there is no file backing and don't share
> > swap entries with the real swapfile.
> >
> > When writing back from one swap entry to another swapfile, for the
> > simple case of uncompressing the data, data will store to swap cache
> > and write to another swapfile with allocated another swap entry. The
> > front end of the swap cache will have the option map the front end
> > swap entry offset to the back end block locations. At the memory
> > price
> > of 4 byte per swap entry.
>
> Wait, so you use the swap cache radix tree to
> indicate the physical location of data between
> multiple swap devices?

Ah, you haven't caught up with the progress: the new swap cache
does not use radix trees any more. It uses swap tables. It is a
512-entry swap table array lookup, no tree lookup. Much faster with
fewer locks. The swap table commits show about a 20% difference
in throughput in some benchmark workloads.
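
Roughly speaking (an illustrative sketch, not the exact upstream code),
the lookup is just index arithmetic into a per-cluster table instead of
a tree walk:

/*
 * Illustrative swap-table style lookup, assuming the 512-slot clusters
 * mentioned above; not the exact upstream code.
 */
#define SWAP_TABLE_SLOTS	512	/* slots per cluster */

static void *swap_table_lookup(void ***cluster_tables, unsigned long offset)
{
	unsigned long ci = offset / SWAP_TABLE_SLOTS;	/* which cluster */
	unsigned long si = offset % SWAP_TABLE_SLOTS;	/* slot within that cluster */

	return cluster_tables[ci][si];	/* plain array indexing, no tree, no xarray */
}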

> Isn't that exactly what the vswap approach
> does, too?

Except that I proposed it earlier:
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
That swap cache physical entry redirection is my original idea as far
as I can tell, and I presented it at the conference earlier.

> How is this different?

The main difference is that I just got rid of the xarray in the swap
cache lookup, and I don't want to re-introduce it again.
Also, in my swap.tiers design, the redirection overhead is optional. If
you are not using redirection, with the swap.tiers swap ops you don't
pay for it, just like the ghost swapfile. With VS it is not optional; it
enforces the overhead regardless. In my design the memory overhead will
be smaller per swap entry because it will be integrated tightly with the
swap entry.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 1 week ago
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.
>
> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
>
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> table + swap.tiers design can do that as well. I have concerns about
> swap virtualization in the aspect of adding another layer of memory
> overhead addition per swap entry and CPU overhead of extra xarray
> lookup. I believe my approach is technically superior and cleaner.

True, but the static nature of the current swapfile infrastructure
also imposes a space overhead and/or operational overhead.

I did play around with a ghost swapfile prototype for virtual
swap, but had to stop because of the swapfile overhead for larger
virtual swap spaces.

> Both faster and cleaner. Basically swap.tiers + VFS like swap read
> write page ops. I will let Nhat clarify the performance and memory

That just solves static placement, no? Backend transfer requires
something extra/orthogonal.

> overhead side of the swap virtualization.
>
> I am not against swap entry redirection. Just the swap virtualization

There will be redirection either way. I don't think it's avoidable.
The only choice is whether to shove it into the backend (what zram is
doing) or have a generalized module (swap virtualization).

Or do a page table walk every time you want to do a backend transfer
(what swapoff is doing).

> series needs to compare against the alternatives in terms of memory
> overhead and throughput.
> Solving it from the swap.tiers angle is cleaner.
>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> I take that the only relevant part is you are zswap maintainer and I
> am the swap maintainer. Fine. I got the message. I will leave the
> zswap alone. I will find other ways to address the memory base swap
> tiers in swap.tiers.

Please keep this discussion technical and don't pull rank unnecessarily.

>
> Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Kairui Song 1 week, 2 days ago
On Sat, Nov 22, 2025 at 10:09 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.

That sounds interesting. I have been watching YoungJun's and your
swap.tiers discussion for a while, and I'm looking forward to seeing how
they play together.

Using tiering to resolve the writeback issue sounds like a nice
solution; we definitely don't want to limit writeback to
zswap/RAM-to-block only, we will also want things like
block-to-block writeback.

We (and, I have noticed, many community users) have setups involving
hybrid tiers. We have an internal module that moves swap entries from
SSD to HDD too. To do it upstream we need something like swap.tiers.

>
> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
>
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> ...
> Solving it from the swap.tiers angle is cleaner.

Agree with the swap.tiers part, that sounds cleaner.

>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

I think it's too early for that. Let's stay open to ideas.
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 1 week, 3 days ago
On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
>
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space.  As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.

Would this also affect the swap slot allocation algorithm?

>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.

I don't like this design:

1. Statically sizing the compression tier will be an operational
nightmare for users who have to support a variety of (and increasingly
bigger) host types. It's one of the primary motivations of
the virtual swap line of work. We need to move towards a more dynamic
architecture for zswap, not the other way around, in order to reduce
both (human) operational overhead AND actual space overhead (i.e.
only allocate (z)swap metadata on demand).

2. This digs us into the hole of supporting special infrastructure for
non-writeback cases. Now every future change to zswap's architecture
has to take this into account. It's not easy to turn this design into
something that can support writeback - you're stuck with either having
to do an expensive page table walk to update the PTEs, or shoving the
virtual swap layer inside zswap. Ugly.

3. And what does this even buy us? Just create a fake in-memory-only
swapfile (heck, you can use zram), disable writeback (which you can do
both at a cgroup and host-level), and call it a day.

Nacked-by: Nhat Pham <nphamcs@gmail.com>
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 1 week, 2 days ago
On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
>
> Would this also affect the swap slot allocation algorithm?
>
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
>
> I don't like this design:
>
> 1. Statically sizing the compression tier will be an operational
> nightmare, for users that have to support a variety (and increasingly
> bigger sized) types of hosts. It's one of the primary motivations of
> the virtual swap line of work. We need to move towards a more dynamic
> architecture for zswap, not the other way around, in order to reduce
> both (human's) operational overhead, AND actual space overhead (i.e
> only allocate (z)swap metadata on-demand).

Let's do it one step at a time.

> 2. This digs us in the hole of supporting a special infrastructure for
> non-writeback cases. Now every future change to zswap's architecture
> has to take this into account. It's not easy to turn this design into
> something that can support writeback - you're stuck with either having
> to do an expensive page table walk to update the PTEs, or shoving the
> virtual swap layer inside zswap. Ugly.

What are you talking about? This patch does not have any page table
work. You are opposing something in your imagination. Please show me
the code in which I do expensive PTE walks.

> 3. And what does this even buy us? Just create a fake in-memory-only
> swapfile (heck, you can use zram), disable writeback (which you can do
> both at a cgroup and host-level), and call it a day.

Well, this provides users a choice if they don't care about
writeback. They can use zswap with a ghost swapfile now without actually
wasting disk space.

It also does not stop zswap from using writeback with a normal SSD. If
you want writeback, you can still use a non-ghost swapfile as normal.

It is a simple enough patch to provide value right now. It also fits
into the swap.tiers long-term roadmap of having a separate tier for
memory-based swapfiles. I believe that is a cleaner picture than the
current zswap, which acts as a cache but also gets its hands deep into
the swap stack and slows down other swap tiers.

> Nacked-by: Nhat Pham <nphamcs@gmail.com>

I heard you: if you don't want zswap to have anything to do
with the memory-based swap tier in the swap.tiers design, I respect your
choice.

Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Nhat Pham 1 week ago
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Would this also affect the swap slot allocation algorithm?
> >
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> >
> > I don't like this design:
> >
> > 1. Statically sizing the compression tier will be an operational
> > nightmare, for users that have to support a variety (and increasingly
> > bigger sized) types of hosts. It's one of the primary motivations of
> > the virtual swap line of work. We need to move towards a more dynamic
> > architecture for zswap, not the other way around, in order to reduce
> > both (human's) operational overhead, AND actual space overhead (i.e
> > only allocate (z)swap metadata on-demand).
>
> Let's do it one step at a time.

I'm happy with landing these patches one step at a time. But from my
POV (and admittedly limited imagination), it's a bit of a dead end.

The only architecture, IMO, that satisfies:

1. Dynamic overhead of (z)swap metadata.

2. Decoupled swap backends, i.e. no pre-reservation of lower-tier
space (what zswap is doing right now).

3. Backend transfer without page table walks.

is swap virtualization.
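To make this concrete, here is a rough sketch of what such an
indirection could look like (made-up names, not the actual virtual swap
code): the swap PTE holds only a virtual slot id, and an on-demand
per-slot descriptor records where the data currently lives, so a
backend transfer just updates the descriptor.

/* Sketch only -- hypothetical types, not the real virtual swap series. */
enum vswap_backend {
	VSWAP_ZSWAP,		/* compressed copy kept in memory */
	VSWAP_SWAPFILE,		/* slot on a physical swap device */
};

struct vswap_desc {		/* allocated on demand, one per used slot */
	enum vswap_backend backend;
	union {
		void *zswap_handle;		/* opaque handle into zswap */
		struct {
			int type;		/* which swap device */
			unsigned long offset;	/* slot on that device */
		} phys;
	};
};

/*
 * Writeback then becomes: allocate a physical slot, copy the data, and
 * switch backend from VSWAP_ZSWAP to VSWAP_SWAPFILE.  The swap PTEs,
 * which only store the virtual slot id, never need to change.
 */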

If you want to present an alternative vision, you don't have to
implement it right away, but you have to at least explain to me how to
achieve all three.

>
> > 2. This digs us in the hole of supporting a special infrastructure for
> > non-writeback cases. Now every future change to zswap's architecture
> > has to take this into account. It's not easy to turn this design into
> > something that can support writeback - you're stuck with either having
> > to do an expensive page table walk to update the PTEs, or shoving the
> > virtual swap layer inside zswap. Ugly.
>
> What are you talking about? This patch does not have any page table
> work. You are opposing something in your imagination. Please show me
> the code in which I do expensive PTE walks.

Please read my response again. I did not say you did any PTE walk in this patch.

What I meant was: if you want to make this the general architecture
for zswap, and not some niche infrastructure for a specialized use
case, you need to be able to support backend transfer, i.e. zswap
writeback (zswap -> disk swap, and perhaps in the future the other
direction). This will be very expensive with this design.
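To spell out why (a toy standalone illustration, not the kernel's real
swp_entry layout): the value stored in a swap PTE names the backing
device and the slot on it, so once writeback moves the data to another
device, every PTE still holds the stale value.

#include <stdio.h>
#include <stdint.h>

/* Toy encoding in the spirit of swp_entry(type, offset): device index
 * in the high bits, slot offset in the low bits.  Illustrative only. */
static uint64_t toy_swp_entry(unsigned int type, uint64_t offset)
{
	return ((uint64_t)type << 58) | offset;
}

int main(void)
{
	uint64_t held_by_ptes = toy_swp_entry(0, 42);	/* ghost-device slot */
	uint64_t after_writeback = toy_swp_entry(1, 42);	/* now on disk */

	/*
	 * The PTEs still contain held_by_ptes.  Redirecting faults to
	 * after_writeback means rewriting every such PTE (an rmap /
	 * page table walk), or keeping a ghost-slot -> disk-slot map
	 * inside zswap.
	 */
	printf("in PTEs: %#llx, after writeback: %#llx\n",
	       (unsigned long long)held_by_ptes,
	       (unsigned long long)after_writeback);
	return 0;
}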

>
> > 3. And what does this even buy us? Just create a fake in-memory-only
> > swapfile (heck, you can use zram), disable writeback (which you can do
> > both at a cgroup and host-level), and call it a day.
>
> Well, this gives users a choice if they don't care about writeback:
> they can run zswap with a ghost swapfile without wasting any disk
> space.
>
> It also does not stop zswap from writing back to a normal SSD. If you
> want writeback, you can still use a non-ghost swapfile as before.
>
> It is a simple enough patch to provide value right now. It also fits
> into the swap.tiers long-term roadmap of having a separate tier for
> memory-based swapfiles. I believe that is a cleaner picture than the
> current design, where zswap acts as a cache yet reaches deep into the
> swap stack and slows down the other swap tiers.
>
> > Nacked-by: Nhat Pham <nphamcs@gmail.com>
>
> I hear you: if you don't want zswap to have anything to do with a
> memory-based swap tier in the swap.tiers design, I respect your
> choice.

Where does this even come from?

I can't speak for Johannes or Yosry, but personally I'm ambivalent
about swap.tiers. My only objection in the past was that there was no
use case at the time, but there seems to be one now. I won't stand in
the way of swap.tiers landing, or of zswap's integration into it.

From my POV, swap.tiers solves a problem completely orthogonal to the
one I'm trying to solve, namely the three points listed above. It is
about defining the swap hierarchy, either at initial placement time or
when offloading from one backend to another, whereas I'm trying to
figure out the mechanistic side (how to transfer a page from one
backend to another without page table walking). The two are
independent, if not synergistic.

>
> Chris
Re: [PATCH RFC] mm: ghost swapfile support for zswap
Posted by Chris Li 6 days, 6 hours ago
On Mon, Nov 24, 2025 at 5:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space.  As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Would this also affect the swap slot allocation algorithm?
> > >
> > > >
> > > > The zswap write back has been disabled if all swapfiles in the system
> > > > are ghost swap files.
> > >
> > > I don't like this design:
> > >
> > > 1. Statically sizing the compression tier will be an operational
> > > nightmare, for users that have to support a variety (and increasingly
> > > bigger sized) types of hosts. It's one of the primary motivations of
> > > the virtual swap line of work. We need to move towards a more dynamic
> > > architecture for zswap, not the other way around, in order to reduce
> > > both (human's) operational overhead, AND actual space overhead (i.e
> > > only allocate (z)swap metadata on-demand).
> >
> > Let's do it one step at a time.
>
> I'm happy with landing these patches one step at a time. But from my
> POV (and admittedly limited imagination), it's a bit of a dead end.
>
> The only architecture, IMO, that satisfies:
>
> 1. Dynamic overhead of (z)swap metadata.
>
> 2. Decoupled swap backends, i.e. no pre-reservation of lower-tier
> space (what zswap is doing right now).
>
> 3. Backend transfer without page table walks.
>
> is swap virtualization.
>
> If you want to present an alternative vision, you don't have to
> implement it right away, but you have to at least explain to me how to
> achieve all three.

Jumping from points 1, 2 and 3 to swap virtualization as the only
solution is a big leap. How many possibilities have you explored to
conclude that no other solution can satisfy all three?

I just replied to Rik's email with a high-level sketch of my design.
That design should satisfy all three points and can serve as a
counterexample of an alternative approach.


>
> >
> > > 2. This digs us in the hole of supporting a special infrastructure for
> > > non-writeback cases. Now every future change to zswap's architecture
> > > has to take this into account. It's not easy to turn this design into
> > > something that can support writeback - you're stuck with either having
> > > to do an expensive page table walk to update the PTEs, or shoving the
> > > virtual swap layer inside zswap. Ugly.
> >
> > What are you talking about? This patch does not have any page table
> > work. You are opposing something in your imagination. Please show me
> > the code in which I do expensive PTE walks.
>
> Please read my response again. I did not say you did any PTE walk in this patch.
>
> What I meant was: if you want to make this the general architecture
> for zswap, and not some niche infrastructure for a specialized use
> case, you need to be able to support backend transfer, i.e. zswap
> writeback (zswap -> disk swap, and perhaps in the future the other
> direction). This will be very expensive with this design.

I can't say I agree with you. It seems you have made a lot of
assumptions in your reasoning.

> > > 3. And what does this even buy us? Just create a fake in-memory-only
> > > swapfile (heck, you can use zram), disable writeback (which you can do
> > > both at a cgroup and host-level), and call it a day.
> >
> > Well, this gives users a choice if they don't care about writeback:
> > they can run zswap with a ghost swapfile without wasting any disk
> > space.
> >
> > It also does not stop zswap from writing back to a normal SSD. If you
> > want writeback, you can still use a non-ghost swapfile as before.
> >
> > It is a simple enough patch to provide value right now. It also fits
> > into the swap.tiers long-term roadmap of having a separate tier for
> > memory-based swapfiles. I believe that is a cleaner picture than the
> > current design, where zswap acts as a cache yet reaches deep into the
> > swap stack and slows down the other swap tiers.
> >
> > > Nacked-by: Nhat Pham <nphamcs@gmail.com>
> >
> > I hear you: if you don't want zswap to have anything to do with a
> > memory-based swap tier in the swap.tiers design, I respect your
> > choice.
>
> Where does this even come from?
>
> I can't speak for Johannes or Yosry, but personally I'm ambivalent
> about swap.tiers. My only objection in the past was that there was no
> use case at the time, but there seems to be one now. I won't stand in
> the way of swap.tiers landing, or of zswap's integration into it.
>
> From my POV, swap.tiers solves a problem completely orthogonal to the
> one I'm trying to solve, namely the three points listed above. It is
> about defining the swap hierarchy, either at initial placement time or
> when offloading from one backend to another, whereas I'm trying to
> figure out the mechanistic side (how to transfer a page from one
> backend to another without page table walking). The two are
> independent, if not synergistic.

I think our goals overlap; we just take different approaches with
different performance characteristics.
I have asked a few times in this thread: how big is the per-swap-slot
memory overhead that virtual swap introduces?
That is something I care about a lot.
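For a sense of scale, purely illustrative arithmetic (the 32-byte
per-slot descriptor size is an assumption, not a number taken from the
virtual swap series):

#include <stdio.h>

int main(void)
{
	unsigned long long swap_bytes = 64ULL << 30;	/* 64 GiB of swap space */
	unsigned long long slots = swap_bytes >> 12;	/* 4 KiB pages -> 16M slots */
	unsigned long long per_slot = 32;		/* assumed bytes per slot */

	/* 16M slots * 32 bytes = 512 MiB of per-slot metadata. */
	printf("%llu MiB\n", (slots * per_slot) >> 20);
	return 0;
}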

Chris