The current zswap requires a backing swapfile. A swap slot used by
zswap cannot be used by the swapfile at the same time, which wastes
swapfile space.

A ghost swapfile is a swapfile that contains only the swapfile header,
for use by zswap. The header indicates the size of the swap space.
There is no swap data section in a ghost swapfile, so no swapfile
space is wasted. As a consequence, any write to a ghost swapfile would
fail. To prevent accidental reads or writes of a ghost swapfile, the
bdev of swap_info_struct is set to NULL. A ghost swapfile also sets
the SSD flag because there is no rotating disk access when using
zswap.

Zswap writeback is disabled if all swapfiles in the system are ghost
swapfiles.
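
For illustration only (not part of this patch): a ghost swapfile can
be created from userspace by writing a single page that carries a
normal version-1 swap header whose last_page field advertises the
desired number of slots. The sketch below assumes a 4096-byte
PAGE_SIZE and a native little-endian header layout; swapon(8) on the
resulting one-page file then hits the new size == PAGE_SIZE path in
read_swap_header() and registers it as SWP_GHOST | SWP_SOLIDSTATE.

/* ghost-mkswap.c: illustrative sketch only, not part of this patch. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SIZE 4096
#define NR_PAGES  (1UL << 20)	/* advertise 1M slots (4GB) of zswap-only swap */

int main(int argc, char **argv)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	uint32_t *info = (uint32_t *)(page + 1024);	/* after bootbits[1024] */
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <ghost-swapfile>\n", argv[0]);
		return 1;
	}

	info[0] = 1;			/* info.version */
	info[1] = NR_PAGES - 1;		/* info.last_page */
	info[2] = 0;			/* info.nr_badpages */

	/* "SWAPSPACE2" magic lives in the last 10 bytes of the first page. */
	memcpy(page + PAGE_SIZE - 10, "SWAPSPACE2", 10);

	fd = open(argv[1], O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0 || write(fd, page, sizeof(page)) != sizeof(page)) {
		perror(argv[1]);
		return 1;
	}
	close(fd);
	return 0;
}
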
Signed-off-by: Chris Li <chrisl@kernel.org>
---
include/linux/swap.h | 2 ++
mm/page_io.c | 18 +++++++++++++++---
mm/swap.h | 2 +-
mm/swap_state.c | 7 +++++++
mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
mm/zswap.c | 17 +++++++++++------
6 files changed, 73 insertions(+), 15 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -216,6 +216,7 @@ enum {
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
+ SWP_GHOST = (1 << 13), /* not backed by anything */
/* add others here before... */
};
@@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
+extern atomic_t nr_real_swapfiles;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
diff --git a/mm/page_io.c b/mm/page_io.c
index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
return AOP_WRITEPAGE_ACTIVATE;
}
- __swap_writepage(folio, swap_plug);
- return 0;
+ return __swap_writepage(folio, swap_plug);
out_unlock:
folio_unlock(folio);
return ret;
@@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+ if (sis->flags & SWP_GHOST) {
+ /* Prevent the page from getting reclaimed. */
+ folio_set_dirty(folio);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
+
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
* but that will never affect SWP_FS_OPS, so the data_race
@@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
swap_writepage_bdev_sync(folio, sis);
else
swap_writepage_bdev_async(folio, sis);
+ return 0;
}
void swap_write_unplug(struct swap_iocb *sio)
@@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (zswap_load(folio) != -ENOENT)
goto finish;
+ if (unlikely(sis->flags & SWP_GHOST)) {
+ folio_unlock(folio);
+ goto finish;
+ }
+
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
diff --git a/mm/swap.h b/mm/swap.h
index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
extern struct address_space swap_space __ro_after_init;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
bool page_allocated;
+ /*
+ * The entry may have been freed by another task. Avoid swap_info_get()
+ * which will print error message if the race happens.
+ */
+ if (si->flags & SWP_GHOST)
+ goto skip;
+
mask = swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
+atomic_t nr_real_swapfiles;
/*
* Some modules use swappable objects and may try to swap them out under
* memory pressure (via the shrinker). Before doing so, they may wish to
@@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
goto skip;
}
+ if (!(si->flags & SWP_GHOST))
+ atomic_sub(1, &nr_real_swapfiles);
plist_del(&si->avail_list, &swap_avail_head);
skip:
@@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
}
plist_add(&si->avail_list, &swap_avail_head);
+ if (!(si->flags & SWP_GHOST))
+ atomic_add(1, &nr_real_swapfiles);
skip:
spin_unlock(&swap_avail_lock);
@@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
struct inode *inode = mapping->host;
int ret;
+ if (sis->flags & SWP_GHOST) {
+ *span = 0;
+ return 0;
+ }
+
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
@@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !bdev_nonrot(p->bdev))
+ if (!(p->flags & SWP_GHOST) &&
+ (!p->bdev || !bdev_nonrot(p->bdev)))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
mutex_unlock(&swapon_mutex);
}
+static const char *swap_type_str(struct swap_info_struct *si)
+{
+ struct file *file = si->swap_file;
+
+ if (si->flags & SWP_GHOST)
+ return "ghost\t";
+
+ if (S_ISBLK(file_inode(file)->i_mode))
+ return "partition";
+
+ return "file\t";
+}
+
static int swap_show(struct seq_file *swap, void *v)
{
struct swap_info_struct *si = v;
@@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
len = seq_file_path(swap, file, " \t\n\\");
seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file_inode(file)->i_mode) ?
- "partition" : "file\t",
+ swap_type_str(si),
bytes, bytes < 10000000 ? "\t" : "",
inuse, inuse < 10000000 ? "\t" : "",
si->prio);
@@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
return 0;
}
-
/*
* Find out how many pages are allowed for a single swap device. There
* are two limiting factors:
@@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
unsigned long maxpages;
unsigned long swapfilepages;
unsigned long last_page;
+ loff_t size;
if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
pr_err("Unable to find swap-space signature\n");
@@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
if (!maxpages)
return 0;
- swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+
+ size = i_size_read(inode);
+ if (size == PAGE_SIZE) {
+ /* Ghost swapfile */
+ si->bdev = NULL;
+ si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
+ return maxpages;
+ }
+
+ swapfilepages = size >> PAGE_SHIFT;
if (swapfilepages && maxpages > swapfilepages) {
pr_warn("Swap area shorter than signature indicates\n");
return 0;
diff --git a/mm/zswap.c b/mm/zswap.c
index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
- struct swap_info_struct *si;
+ struct swap_info_struct *si = get_swap_device(swpentry);
int ret = 0;
- /* try to allocate swap cache folio */
- si = get_swap_device(swpentry);
if (!si)
- return -EEXIST;
+ return -ENOENT;
+
+ if (si->flags & SWP_GHOST) {
+ put_swap_device(si);
+ return -EINVAL;
+ }
+ /* try to allocate swap cache folio */
mpol = get_task_policy(current);
folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
@@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(folio, NULL);
+ ret = __swap_writepage(folio, NULL);
+ WARN_ON_ONCE(ret);
out:
if (ret && ret != -EEXIST) {
@@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
zswap_pool_put(pool);
put_objcg:
obj_cgroup_put(objcg);
- if (!ret && zswap_pool_reached_full)
+ if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
---
base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
change-id: 20251121-ghost-56e3948a7a17
Best regards,
--
Chris Li <chrisl@kernel.org>
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
Thanks for sharing this. I've been hearing about the ghost swapfile
design for a long time; glad to see it finally got posted.
In general I think this aligns quite well with what I had in mind and
an idea that was mentioned during LSFMM this year (the 3rd one in the
"Issues" part; it wasn't clearly described in the cover letter, more
details are in the slides):
https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
The good part is that we will reuse everything we have in the current
swap stack, and it stays optional. Everything is a swap device; no
special layers required. All other features will be available in a
cleaner way.
And /etc/fstab just works the same way for the ghost swapfile.
Looking forward to seeing this RFC get more updates.
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
> if (!maxpages)
> return 0;
> - swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> + size = i_size_read(inode);
> + if (size == PAGE_SIZE) {
> + /* Ghost swapfile */
> + si->bdev = NULL;
> + si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> + return maxpages;
> + }
If we push things further here, it might be a good idea to make better
use of the swapfile header for detecting this kind of device, and
maybe add support for carrying other info too. The header already has
version info embedded in case it needs to be extended.
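
Purely as an illustration of what I mean (the flag name and the reuse
of the padding area are made up here; the current on-disk format
defines neither), detection could then key off the header itself
rather than the file size, e.g.:

/*
 * Hypothetical sketch: mark ghost areas in the header instead of
 * inferring them from i_size == PAGE_SIZE. SWAP_HDR_FLAG_GHOST and
 * the use of info.padding[0] as a flags word do not exist today.
 */
#define SWAP_HDR_FLAG_GHOST	(1U << 0)

static bool swap_header_is_ghost(const union swap_header *swap_header)
{
	/* Could also be gated on a bumped info.version. */
	return swap_header->info.padding[0] & SWAP_HDR_FLAG_GHOST;
}
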
Add YoungJun to CC.
On 11/22/25 at 05:59pm, Kairui Song wrote:
> In general I think this aligns quite well with what I had in mind and
> an idea that was mention during LSFMM this year (the 3rd one in the
> "Issues" part, it wasn't clearly described in the cover letter, more
> details in the slides):
> https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
Thanks for sharing the background and more information. When I checked
Youngjun's swap.tiers patchset before his RFC, I felt it would be more
flexible to add zswap to a memcg if the zswap size could be decoupled
from the backing device. Chris's RFC can satisfy that, but I didn't
realize you guys had planned more, e.g. dynamic growth of the swap
size, and zswap slot management working like the swap table does for
swap slots. Looking forward to seeing the progress and more details.
Thanks
Baoquan
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
This was brought up before; I think it's not the right way to go
upstream. Even if it's good for the short term, it's a behavior exposed
to userspace that we'll have to maintain. With the ongoing work to
decouple zswap and swap backends, this will end up being something we
have to work around indefinitely to keep the same userspace semantics.
On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> This was brought up before; I think it's not the right way to go
> upstream. Even if it's good for the short term, it's a behavior exposed
> to userspace that we'll have to maintain. With the ongoing work to
> decouple zswap and swap backends, this will end up being something we
> have to work around indefinitely to keep the same userspace semantics.
Actually, this doesn't need to be the short-term solution. It can be
long term. I get it: you zswap maintainers do not want to get involved
in the ghost swapfile. I will leave you guys alone. Remember the 2023
LPC swap abstraction talk: the community picked my approach of the VFS
swap ops over the swap abstraction that swap virtualization is based
on. I took some time to come up with the cluster-based swap allocator
and the swap table to clean up and speed up the swap stack. Now I am
finally able to circle back and fulfill my promise of the VFS swap
ops. Have a little faith that I will solve this swap entry redirection
issue nicely for you, better than the swap virtualization approach
can.
Chris
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> Have a little faith that I will solve this swap entry redirection
> issue nicely for you, better than the swap virtualization approach
> can.

Look man, I'm not married to any idea. If your VFS approach solves our
problems, I can move on to other projects :) We have lots of
swap/memory reclaim/MM problems to solve, both internally at Meta and
upstream.

But please explain how your VFS approach solves the 3 requirements I
mentioned in the other email, and more specifically the backend
transfer requirement. I explicitly asked about it in your submission
for your 2024 LSFMMBPF talk - at that time I had not seriously started
the swap virtualization work, it was only at the design phase. You
just handwaved it away and never really explained to me how you can
achieve backend transfer with your design:
https://lore.kernel.org/all/CAF8kJuNFtejEtjQHg5UBGduvFNn3AaGn4ffyoOrEnXfHpx6Ubg@mail.gmail.com/

I understand that you had more pressing issues to fix at the time, so
I did not bring it up during the conference. But it's an imperative
requirement for us. swap.tiers is nice for initial placement and for
hierarchy determination in general, but when a page is already placed
on one tier and needs to be transferred, how will you move it from one
tier to another?

What zram is doing right now, IIUC, is building the redirection
internally. I would like to avoid repeating that for zswap, and for
every other future backend, by pulling it out of backend-internal code
and building a dedicated module for it.

That is just swap virtualization.
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:

Zswap is primarily a compressed cache for real swap on secondary
storage. It's indeed quite important that entries currently in zswap
don't occupy disk slots; but for a solution to this to be acceptable,
it has to work with the primary usecase and support disk writeback.

This direction is a dead-end. Please take a look at Nhat's swap
virtualization patches. They decouple zswap from disk geometry, while
still supporting writeback to an actual backend file.

Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > The current zswap requires a backing swapfile. The swap slot used > > by zswap is not able to be used by the swapfile. That waste swapfile > > space. > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > for zswap. The swapfile header indicate the size of the swapfile. There > > is no swap data section in the ghost swapfile, therefore, no waste of > > swapfile space. As such, any write to a ghost swapfile will fail. To > > prevents accidental read or write of ghost swapfile, bdev of > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > flag because there is no rotation disk access when using zswap. > > Zswap is primarily a compressed cache for real swap on secondary > storage. It's indeed quite important that entries currently in zswap > don't occupy disk slots; but for a solution to this to be acceptable, > it has to work with the primary usecase and support disk writeback. Well, my plan is to support the writeback via swap.tiers. > This direction is a dead-end. Please take a look at Nhat's swap > virtualization patches. They decouple zswap from disk geometry, while > still supporting writeback to an actual backend file. Yes, there are many ways to decouple zswap from disk geometry, my swap table + swap.tiers design can do that as well. I have concerns about swap virtualization in the aspect of adding another layer of memory overhead addition per swap entry and CPU overhead of extra xarray lookup. I believe my approach is technically superior and cleaner. Both faster and cleaner. Basically swap.tiers + VFS like swap read write page ops. I will let Nhat clarify the performance and memory overhead side of the swap virtualization. I am not against swap entry redirection. Just the swap virtualization series needs to compare against the alternatives in terms of memory overhead and throughput. Solving it from the swap.tiers angle is cleaner. > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> I take that the only relevant part is you are zswap maintainer and I am the swap maintainer. Fine. I got the message. I will leave the zswap alone. I will find other ways to address the memory base swap tiers in swap.tiers. Chris
Hi Johannes, On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote: > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> > > I take that the only relevant part is you are zswap maintainer and I > am the swap maintainer. Fine. I got the message. I will leave the > zswap alone. I will find other ways to address the memory base swap > tiers in swap.tiers. I am sorry that I have said that. Let me take back what I said above. I was upset when I considered you and others blocking the more optimal solution and in favor of the less optimal solution. That is my short temper, as usual. Now I can see that you might not see one as more optimal than the other as convincing as I do, or I haven't done a good job explaining it. Let me offer my sincere apology. I will reply to the technical aspect of the question in other email. Chris
On Tue, Nov 25, 2025 at 10:14:40PM +0400, Chris Li wrote: > Hi Johannes, > > On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote: > > > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> > > > > I take that the only relevant part is you are zswap maintainer and I > > am the swap maintainer. Fine. I got the message. I will leave the > > zswap alone. I will find other ways to address the memory base swap > > tiers in swap.tiers. > > I am sorry that I have said that. Let me take back what I said above. > I was upset when I considered you and others blocking the more optimal > solution and in favor of the less optimal solution. That is my short > temper, as usual. > > Now I can see that you might not see one as more optimal than the > other as convincing as I do, or I haven't done a good job explaining > it. > > Let me offer my sincere apology. I will reply to the technical aspect > of the question in other email. Thanks Chris. No hard feelings.
On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in zswap > > don't occupy disk slots; but for a solution to this to be acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. Do you have a link to that proposal? My understanding of swap tiers was about grouping different swapfiles and assigning them to cgroups. The issue with writeback is relocating the data that a swp_entry_t page table refers to - without having to find and update all the possible page tables. I'm not sure how swap.tiers solve this problem. > > This direction is a dead-end. Please take a look at Nhat's swap > > virtualization patches. They decouple zswap from disk geometry, while > > still supporting writeback to an actual backend file. > > Yes, there are many ways to decouple zswap from disk geometry, my swap > table + swap.tiers design can do that as well. I have concerns about > swap virtualization in the aspect of adding another layer of memory > overhead addition per swap entry and CPU overhead of extra xarray > lookup. I believe my approach is technically superior and cleaner. > Both faster and cleaner. Basically swap.tiers + VFS like swap read > write page ops. I will let Nhat clarify the performance and memory > overhead side of the swap virtualization. I'm happy to discuss it. But keep in mind that the swap virtualization idea is a collaborative product of quite a few people with an extensive combined upstream record. Quite a bit of thought has gone into balancing static vs runtime costs of that proposal. So you'll forgive me if I'm a bit skeptical of the somewhat grandiose claims of one person that is new to upstream development. As to your specific points - we use xarray lookups in the page cache fast path. It's a bold claim to say this would be too much overhead during swapins. Two, it's not clear to me how you want to make writeback efficient *without* any sort of swap entry redirection. Walking all relevant page tables is expensive; and you have to be able to find them first. If you're talking about a redirection array as opposed to a tree - static sizing of the compressed space is also a no-go. Zswap utilization varies *widely* between workloads and different workload combinations. Further, zswap consumes the same fungible resource as uncompressed memory - there is really no excuse to burden users with static sizing questions about this pool.
On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote: > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > The current zswap requires a backing swapfile. The swap slot used > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > space. > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > flag because there is no rotation disk access when using zswap. > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > storage. It's indeed quite important that entries currently in zswap > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > it has to work with the primary usecase and support disk writeback. > > > > Well, my plan is to support the writeback via swap.tiers. > > Do you have a link to that proposal? > > My understanding of swap tiers was about grouping different swapfiles > and assigning them to cgroups. The issue with writeback is relocating > the data that a swp_entry_t page table refers to - without having to > find and update all the possible page tables. I'm not sure how > swap.tiers solve this problem. > > > > This direction is a dead-end. Please take a look at Nhat's swap > > > virtualization patches. They decouple zswap from disk geometry, while > > > still supporting writeback to an actual backend file. > > > > Yes, there are many ways to decouple zswap from disk geometry, my swap > > table + swap.tiers design can do that as well. I have concerns about > > swap virtualization in the aspect of adding another layer of memory > > overhead addition per swap entry and CPU overhead of extra xarray > > lookup. I believe my approach is technically superior and cleaner. > > Both faster and cleaner. Basically swap.tiers + VFS like swap read > > write page ops. I will let Nhat clarify the performance and memory > > overhead side of the swap virtualization. > > I'm happy to discuss it. > > But keep in mind that the swap virtualization idea is a collaborative > product of quite a few people with an extensive combined upstream > record. Quite a bit of thought has gone into balancing static vs > runtime costs of that proposal. So you'll forgive me if I'm a bit > skeptical of the somewhat grandiose claims of one person that is new > to upstream development. > > As to your specific points - we use xarray lookups in the page cache > fast path. It's a bold claim to say this would be too much overhead > during swapins. > > Two, it's not clear to me how you want to make writeback efficient > *without* any sort of swap entry redirection. Walking all relevant > page tables is expensive; and you have to be able to find them first. > > If you're talking about a redirection array as opposed to a tree - > static sizing of the compressed space is also a no-go. Zswap > utilization varies *widely* between workloads and different workload > combinations. 
Further, zswap consumes the same fungible resource as > uncompressed memory - there is really no excuse to burden users with > static sizing questions about this pool. I think what Chris's idea is (and Chris correct me if I am wrong), is that we use ghost swapfiles (that are not backed by disk space) for zswap. So zswap has its own swapfiles, separate from disk swapfiles. memory.tiers establishes the ordering between swapfiles, so you put "ghost" -> "real" to get today's zswap writeback behavior. When you writeback, you keep page tables pointing at the swap entry in the ghost swapfile. What you do is: - Allocate a new swap entry in the "real" swapfile. - Update the swap table of the "ghost" swapfile to point at the swap entry in the "real" swapfile, reusing the pointer used for the swapcache. Then, on swapin, you read the swap table of the "ghost" swapfile, find the redirection, and read to the swap table of the "real" swapfile, then read the page from disk into the swap cache. The redirection in the "ghost" swapfile will keep existing, wasting that slot, until all references to it are dropped. I think this might work for this specific use case, with less overhead than the xarray. BUT there are a few scenarios that are not covered AFAICT: - You still need to statically size the ghost swapfiles and their overheads. - Wasting a slot in the ghost swapfile for the redirection. This complicates static provisioning a bit, because you have to account for entries that will be in zswap as well as writtenback. Furthermore, IIUC swap.tiers is intended to be generic and cover other use cases beyond zswap like SSD -> HDD. For that, I think wasting a slot in the SSD when we writeback to the HDD is a much bigger problem. - We still cannot do swapoff efficiently as we need to walk the page tables (and some swap tables) to find and swapin all entries in a swapfile. Not as important as other things, but worth mentioning. Chris please let me know if I didn't get this right.
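[Editor's note: the following is a rough, standalone C sketch of the redirection scheme described in the message above, purely for illustration. All type and helper names (ghost_slot, ghost_writeback, ghost_swapin) are hypothetical and are not part of the posted patches; the real swap table stores packed words, not a struct like this.]

#include <stdio.h>

enum slot_kind { SLOT_EMPTY, SLOT_ZSWAP, SLOT_REDIRECT };

struct ghost_slot {
	enum slot_kind kind;
	void *zswap_entry;      /* SLOT_ZSWAP: compressed copy kept in memory */
	unsigned int real_type; /* SLOT_REDIRECT: which "real" swapfile */
	unsigned long real_off; /* SLOT_REDIRECT: slot in the "real" swapfile */
};

/* Writeback: the data moves to disk, the ghost slot becomes a redirect,
 * and the page tables keep pointing at the ghost entry. */
static void ghost_writeback(struct ghost_slot *s, unsigned int type,
			    unsigned long off)
{
	s->kind = SLOT_REDIRECT;
	s->zswap_entry = NULL;
	s->real_type = type;
	s->real_off = off;
}

/* Swapin: follow at most one hop through the ghost swapfile's table. */
static void ghost_swapin(const struct ghost_slot *s)
{
	switch (s->kind) {
	case SLOT_ZSWAP:
		printf("decompress from zswap entry %p\n", s->zswap_entry);
		break;
	case SLOT_REDIRECT:
		printf("read from swapfile %u, slot %lu\n",
		       s->real_type, s->real_off);
		break;
	default:
		printf("empty slot\n");
	}
}

int main(void)
{
	struct ghost_slot s = { .kind = SLOT_ZSWAP, .zswap_entry = (void *)0x1 };

	ghost_swapin(&s);           /* hit in zswap, no redirection */
	ghost_writeback(&s, 1, 42); /* written back to the "real" swapfile */
	ghost_swapin(&s);           /* one extra hop through the ghost slot */
	return 0;
}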
On Mon, Nov 24, 2025 at 11:32 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote: > > I think what Chris's idea is (and Chris correct me if I am wrong), is > that we use ghost swapfiles (that are not backed by disk space) for > zswap. So zswap has its own swapfiles, separate from disk swapfiles. Ack. > memory.tiers establishes the ordering between swapfiles, so you put > "ghost" -> "real" to get today's zswap writeback behavior. When you > writeback, you keep page tables pointing at the swap entry in the ghost > swapfile. What you do is: > - Allocate a new swap entry in the "real" swapfile. > - Update the swap table of the "ghost" swapfile to point at the swap > entry in the "real" swapfile, reusing the pointer used for the > swapcache. Ack, with a minor adjustment in mapping the swap entry to the physical location. The swap entry has a swap cache, the physical location does not. > Then, on swapin, you read the swap table of the "ghost" swapfile, find > the redirection, and read to the swap table of the "real" swapfile, then > read the page from disk into the swap cache. The redirection in the > "ghost" swapfile will keep existing, wasting that slot, until all > references to it are dropped. Ack. That is assuming we don't have an rmap-like structure for swap entries. > I think this might work for this specific use case, with less overhead > than the xarray. BUT there are a few scenarios that are not covered > AFAICT: > > - You still need to statically size the ghost swapfiles and their > overheads. Not true: both the ghost swapfile and the physical swapfile can add clusters beyond the original physical size, for allocating continued high-order entries or redirections. For a ghost swapfile, there is no physical layer, only the front end. So the size can grow dynamically. Just allocate more clusters. The size in the current swapfile header is just an initial size. My current patch does not implement that. It will need some later swap table phase to make it happen. But that is not an architectural limit; it has been considered as part of normal business. > - Wasting a slot in the ghost swapfile for the redirection. This > complicates static provisioning a bit, because you have to account for > entries that will be in zswap as well as writtenback. Furthermore, > IIUC swap.tiers is intended to be generic and cover other use cases > beyond zswap like SSD -> HDD. For that, I think wasting a slot in the > SSD when we writeback to the HDD is a much bigger problem. Yes and no. Yes, it only wastes a front-end swap entry (with swap cache). The physical location is a separate layer. No, the physical SSD space is not wasted, because you can allocate an additional front-end swap entry by growing the swap entry front end, then have the additional front-end swap entry point to the physical location you just redirected away from. There is a lot more consideration of the front end vs the physical layer. The physical layer does not care about location order or 2^N size alignment. The physical layer cares a bit about continuity and the number of IOVs that it needs to issue. The swap entry front end and the physical layer have slightly different constraints. > - We still cannot do swapoff efficiently as we need to walk the page > tables (and some swap tables) to find and swapin all entries in a > swapfile. Not as important as other things, but worth mentioning. That need rmap for swap entries. It It is an independent issue. Chris
On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > - We still cannot do swapoff efficiently as we need to walk the > > page > > tables (and some swap tables) to find and swapin all entries in a > > swapfile. Not as important as other things, but worth mentioning. > > That need rmap for swap entries. It It is an independent issue. > Wouldn't rmap for swap entries be more expensive than simply always having indirection for swap entries that are in use? With indirection, swapoff can just move pages from the being-swapoffed device into the swap cache, and if needed the memory can then be moved to another swap device, without ever needing to find the page tables. This sounds like an uncommon scenario, but it is functionally identical to what is done to pages during zswap writeback, where the page table entries stay unchanged, and the swap page is simply moved to another backend location. Why implement two things, when we can have one thing that does both, with no extra complexity over what zswap writeback needs? -- All Rights Reversed.
On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> wrote: > > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > > > - We still cannot do swapoff efficiently as we need to walk the > > > page > > > tables (and some swap tables) to find and swapin all entries in a > > > swapfile. Not as important as other things, but worth mentioning. > > > > That need rmap for swap entries. It It is an independent issue. > > > > Wouldn't rmap for swap entries be more expensive than > simply always having indirection for swap entries that > are in use? It might be, to be frank. I consider this pretty far and late in the stage of the game to evaluate the rmap and its alternatives. Do you agree? I might or might not try the rmap for swap entry. Right now I don't have many data points nor insights. > With indirection, swapoff can just move pages from > the being-swapoffed device into the swap cache, and > if needed the memory can then be moved to another > swap device, without ever needing to find the page > tables. Ack. I don't think we have any disagreement here. > This sounds like an uncommon scenario, but it is > functionally identical to what is done to pages > during zswap writeback, where the page table entries > stay unchanged, and the swap page is simply moved > to another backend location. > > Why implement two things, when we can have one > thing that does both, with no extra complexity > over what zswap writeback needs? Let me ask you a clarifying question, then. 1) What exactly are you trying to propose here in what project? VS or swap the pony? 2) What stage of the code change do you have in mind should this change apply to? I can't speak for VS, I am open to embrace what you suggest in order to swap the pony project, that is after I understand it first. Chris
On Thu, 2025-11-27 at 06:07 +0400, Chris Li wrote: > On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> > wrote: > > > > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > > > > > - We still cannot do swapoff efficiently as we need to walk the > > > > page > > > > tables (and some swap tables) to find and swapin all entries > > > > in a > > > > swapfile. Not as important as other things, but worth > > > > mentioning. > > > > > > That need rmap for swap entries. It It is an independent issue. > > > > > > > Wouldn't rmap for swap entries be more expensive than > > simply always having indirection for swap entries that > > are in use? > > It might be, to be frank. I consider this pretty far and late in the > stage of the game to evaluate the rmap and its alternatives. Do you > agree? > > I might or might not try the rmap for swap entry. Right now I don't > have many data points nor insights. On the contrary. I think we should at least do some back of the envelope calculations to estimate the overhead of the different proposed solutions. With both Nhat's vswap, and your proposal to always have swap indirection with a separate front end, and several back ends, there is no need for swap rmap. This is a good thing, because a single swap slot could be referenced by dozens, hundreds, or even thousands of page table entries, in the case of forking servers. This creates complexity which is probably best avoided. Conceptually, Nhat's vswap, and your idea of having always-on swap indirection seem to be the same thing. > > > This sounds like an uncommon scenario, but it is > > functionally identical to what is done to pages > > during zswap writeback, where the page table entries > > stay unchanged, and the swap page is simply moved > > to another backend location. > > > > Why implement two things, when we can have one > > thing that does both, with no extra complexity > > over what zswap writeback needs? > > Let me ask you a clarifying question, then. > > 1) What exactly are you trying to propose here in what project? VS or > swap the pony? In the past, when faced with competing code bases like this, one thing that has worked well is for both developers to send their code to the list, and then for both developers to send each other suggestions (or diffs) to improve each other's code. Vswap and your always-on indirection seem to do exactly the same thing. This seems like a good opportunity to work together, and come up with code that is better than any one person's code. > 2) What stage of the code change do you have in mind should this > change apply to? I think it makes sense to get the hard design problems resolved before committing to one particular code design. Spending months to resolve subtle bugs in a code base, only to discover later that it does not do exactly what is needed, is not the greatest way to make progress. > > I can't speak for VS, I am open to embrace what you suggest in order > to swap the pony project, that is after I understand it first. > Once both Nhat and you understand each other's code, and have suggestions for each other on how to improve it, we will likely end up with a code base that looks nicer than either of you would have done by yourselves. The more perspectives, the better. -- All Rights Reversed.
On Mon, Nov 24, 2025 at 11:32 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote: > > On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote: > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > > The current zswap requires a backing swapfile. The swap slot used > > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > > space. > > > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > > flag because there is no rotation disk access when using zswap. > > > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > > storage. It's indeed quite important that entries currently in zswap > > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > > it has to work with the primary usecase and support disk writeback. > > > > > > Well, my plan is to support the writeback via swap.tiers. > > > > Do you have a link to that proposal? > > > > My understanding of swap tiers was about grouping different swapfiles > > and assigning them to cgroups. The issue with writeback is relocating > > the data that a swp_entry_t page table refers to - without having to > > find and update all the possible page tables. I'm not sure how > > swap.tiers solve this problem. > > > > > > This direction is a dead-end. Please take a look at Nhat's swap > > > > virtualization patches. They decouple zswap from disk geometry, while > > > > still supporting writeback to an actual backend file. > > > > > > Yes, there are many ways to decouple zswap from disk geometry, my swap > > > table + swap.tiers design can do that as well. I have concerns about > > > swap virtualization in the aspect of adding another layer of memory > > > overhead addition per swap entry and CPU overhead of extra xarray > > > lookup. I believe my approach is technically superior and cleaner. > > > Both faster and cleaner. Basically swap.tiers + VFS like swap read > > > write page ops. I will let Nhat clarify the performance and memory > > > overhead side of the swap virtualization. > > > > I'm happy to discuss it. > > > > But keep in mind that the swap virtualization idea is a collaborative > > product of quite a few people with an extensive combined upstream > > record. Quite a bit of thought has gone into balancing static vs > > runtime costs of that proposal. So you'll forgive me if I'm a bit > > skeptical of the somewhat grandiose claims of one person that is new > > to upstream development. > > > > As to your specific points - we use xarray lookups in the page cache > > fast path. It's a bold claim to say this would be too much overhead > > during swapins. > > > > Two, it's not clear to me how you want to make writeback efficient > > *without* any sort of swap entry redirection. Walking all relevant > > page tables is expensive; and you have to be able to find them first. 
> > > > If you're talking about a redirection array as opposed to a tree - > > static sizing of the compressed space is also a no-go. Zswap > > utilization varies *widely* between workloads and different workload > > combinations. Further, zswap consumes the same fungible resource as > > uncompressed memory - there is really no excuse to burden users with > > static sizing questions about this pool. > > I think what Chris's idea is (and Chris correct me if I am wrong), is > that we use ghost swapfiles (that are not backed by disk space) for > zswap. So zswap has its own swapfiles, separate from disk swapfiles. > > memory.tiers establishes the ordering between swapfiles, so you put > "ghost" -> "real" to get today's zswap writeback behavior. When you > writeback, you keep page tables pointing at the swap entry in the ghost > swapfile. What you do is: > - Allocate a new swap entry in the "real" swapfile. > - Update the swap table of the "ghost" swapfile to point at the swap > entry in the "real" swapfile, reusing the pointer used for the > swapcache. > > Then, on swapin, you read the swap table of the "ghost" swapfile, find > the redirection, and read to the swap table of the "real" swapfile, then > read the page from disk into the swap cache. The redirection in the > "ghost" swapfile will keep existing, wasting that slot, until all > references to it are dropped. > > I think this might work for this specific use case, with less overhead > than the xarray. BUT there are a few scenarios that are not covered > AFAICT: Thanks for explaining these issues better than I could :) > > - You still need to statically size the ghost swapfiles and their > overheads. Yes. > > - Wasting a slot in the ghost swapfile for the redirection. This > complicates static provisioning a bit, because you have to account for > entries that will be in zswap as well as writtenback. Furthermore, > IIUC swap.tiers is intended to be generic and cover other use cases > beyond zswap like SSD -> HDD. For that, I think wasting a slot in the > SSD when we writeback to the HDD is a much bigger problem. Yep. We are trying to get away from static provisioning as much as we can - this design digs us deeper in the hole. Who the hell know what's the zswap:disk swap split is going to be? It's going to depend on access patterns and compressibility. > > - We still cannot do swapoff efficiently as we need to walk the page > tables (and some swap tables) to find and swapin all entries in a > swapfile. Not as important as other things, but worth mentioning. Yeah I left swapoff out of it, because it is just another use case. But yes we can't do swapoff efficiently easily either. And in general, it's going to be a very rigid design for more complicated backend change (pre-fetching from one tier to another, or compaction).
On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > The current zswap requires a backing swapfile. The swap slot used > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > space. > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > flag because there is no rotation disk access when using zswap. > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > storage. It's indeed quite important that entries currently in zswap > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > it has to work with the primary usecase and support disk writeback. > > > > Well, my plan is to support the writeback via swap.tiers. > > Do you have a link to that proposal? My 2024 LSF swap pony talk already has a mechanism to redirect page cache swap entries to different physical locations. That can also work for redirecting swap entries in different swapfiles. https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > My understanding of swap tiers was about grouping different swapfiles > and assigning them to cgroups. The issue with writeback is relocating > the data that a swp_entry_t page table refers to - without having to > find and update all the possible page tables. I'm not sure how > swap.tiers solve this problem. swap.tiers is part of the picture. You are right the LPC topic mostly covers the per cgroup portion. The VFS swap ops are my two slides of the LPC 2023. You read from one swap file and write to another swap file with a new swap entry allocated. > > > This direction is a dead-end. Please take a look at Nhat's swap > > > virtualization patches. They decouple zswap from disk geometry, while > > > still supporting writeback to an actual backend file. > > > > Yes, there are many ways to decouple zswap from disk geometry, my swap > > table + swap.tiers design can do that as well. I have concerns about > > swap virtualization in the aspect of adding another layer of memory > > overhead addition per swap entry and CPU overhead of extra xarray > > lookup. I believe my approach is technically superior and cleaner. > > Both faster and cleaner. Basically swap.tiers + VFS like swap read > > write page ops. I will let Nhat clarify the performance and memory > > overhead side of the swap virtualization. > > I'm happy to discuss it. > > But keep in mind that the swap virtualization idea is a collaborative > product of quite a few people with an extensive combined upstream > record. Quite a bit of thought has gone into balancing static vs > runtime costs of that proposal. So you'll forgive me if I'm a bit > skeptical of the somewhat grandiose claims of one person that is new > to upstream development. Collaborating with which companies developers? How many VS patches landed in the kernel? 
I am also collaborating with different developers: the cluster-based swap allocator, swap table phase I, removing the NUMA node swapfile priority. Those were all suggested by me. > As to your specific points - we use xarray lookups in the page cache > fast path. It's a bold claim to say this would be too much overhead > during swapins. Yes, we just get rid of xarray in swap cache lookup and get some performance gain from it. You are saying one extra xarray is no problem, can your team demo some performance number of impact of the extra xarray lookup in VS? Just run some swap benchmarks and share the result. We can do a test right now, without writing back to another SSD: the ghost swapfile compared with VS for the zswap-only case. > Two, it's not clear to me how you want to make writeback efficient > *without* any sort of swap entry redirection. Walking all relevant > page tables is expensive; and you have to be able to find them first. Swap cache can have a physical location redirection, see my 2024 LPC slides. I have considered that way before the VS discussion. https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > If you're talking about a redirection array as opposed to a tree - > static sizing of the compressed space is also a no-go. Zswap > utilization varies *widely* between workloads and different workload > combinations. Further, zswap consumes the same fungible resource as > uncompressed memory - there is really no excuse to burden users with > static sizing questions about this pool. I do see the swap table + swap.tiers + swap ops doing better. We can test the memory-only case right now. A head-to-head test of VS and swap.tiers on the writeback case will need to wait a bit; the swap table is only at the review of phase II. I mean CPU and per-swap-entry overhead. I care less about whose idea it is; I care more about the end result performance (memory & CPU). I want the best idea/implementation to win. Chris
On Mon, Nov 24, 2025 at 09:24:18PM +0300, Chris Li wrote: > On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > > The current zswap requires a backing swapfile. The swap slot used > > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > > space. > > > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > > flag because there is no rotation disk access when using zswap. > > > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > > storage. It's indeed quite important that entries currently in zswap > > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > > it has to work with the primary usecase and support disk writeback. > > > > > > Well, my plan is to support the writeback via swap.tiers. > > > > Do you have a link to that proposal? > > My 2024 LSF swap pony talk already has a mechanism to redirect page > cache swap entries to different physical locations. > That can also work for redirecting swap entries in different swapfiles. > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ I looked through your slides and the LWN article, but it's very hard for me to find answers to my questions in there. In your proposal, let's say you have a swp_entry_t in the page table. What does it describe, and what are the data structures to get from this key to user data in the following scenarios: - Data is in a swapfile - Data is in zswap - Data is in being written from zswap to a swapfile - Data is back in memory due to a fault from another page table > > My understanding of swap tiers was about grouping different swapfiles > > and assigning them to cgroups. The issue with writeback is relocating > > the data that a swp_entry_t page table refers to - without having to > > find and update all the possible page tables. I'm not sure how > > swap.tiers solve this problem. > > swap.tiers is part of the picture. You are right the LPC topic mostly > covers the per cgroup portion. The VFS swap ops are my two slides of > the LPC 2023. You read from one swap file and write to another swap > file with a new swap entry allocated. Ok, and from what you wrote below, presumably at this point you would put a redirection pointer in the old location to point to the new one. This way you only have the indirection IF such a relocation actually happened, correct? But how do you store new data in the freed up old slot? > > As to your specific points - we use xarray lookups in the page cache > > fast path. It's a bold claim to say this would be too much overhead > > during swapins. > > Yes, we just get rid of xarray in swap cache lookup and get some > performance gain from it. > You are saying one extra xarray is no problem, can your team demo some > performance number of impact of the extra xarray lookup in VS? 
Just > run some swap benchmarks and share the result. Average and worst-case for all common usecases matter. There is no code on your side for the writeback case. (And it's exceedingly difficult to even get a mental model of how it would work from your responses and the slides you have linked). > > Two, it's not clear to me how you want to make writeback efficient > > *without* any sort of swap entry redirection. Walking all relevant > > page tables is expensive; and you have to be able to find them first. > > Swap cache can have a physical location redirection, see my 2024 LPC > slides. I have considered that way before the VS discussion. > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ There are no matches for "redir" in either the email or the slides.
On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > Do you have a link to that proposal? > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > cache swap entries to different physical locations. > > That can also work for redirecting swap entries in different swapfiles. > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > I looked through your slides and the LWN article, but it's very hard > for me to find answers to my questions in there. Naturally, the slide is only intended to cover what is in the current swap table may be phase VII. But it does have the physical location pointer consideration. > In your proposal, let's say you have a swp_entry_t in the page > table. What does it describe, and what are the data structures to get > from this key to user data in the following scenarios: Please keep in mind that I don't have every detail design laid out. I follow the first principles that redirect a swap entry page should only take an additional 4 byte per swap entry. VS blow up the swap entry size by something like 24 bytes? I am pretty sure I am wrong about the exact value. People who are familiar with VS please correct me. My impression is that it is too far away from the first-principles value, so I would not even consider it. Exceptions can be made, but not that far. I will try my best to answer your questions, but usually I would rather work with someone who is going to implement it to iron out all the details. Right now that is a bit too far off. > - Data is in a swapfile Same as current. > - Data is in zswap I have now realized that what I want from the memory swap tier is actually not the same as today's zswap. I don't want the current behavior of zswap in swap.tiers. zswap sits in front of every swapfile, and zswap.writeback does not say which particular swapfile it wants to write to. That creates problems for including zswap as-is in the per-memcg swap.tiers. I don't want zswap to use another swapfile's swap entry and write through to it. If the data is in the memory tier swapfile, the swap entry looks up the actual data without redirection. > - Data is in being written from zswap to a swapfile It will look up the swap table and find a physical pointer, which points to the physical device and offset holding the data. > - Data is back in memory due to a fault from another page table It is in the swap cache, similar to today's swapfile. > > > My understanding of swap tiers was about grouping different swapfiles > > > and assigning them to cgroups. The issue with writeback is relocating > > > the data that a swp_entry_t page table refers to - without having to > > > find and update all the possible page tables. I'm not sure how > > > swap.tiers solve this problem. > > > > swap.tiers is part of the picture. You are right the LPC topic mostly > > covers the per cgroup portion. The VFS swap ops are my two slides of > > the LPC 2023. You read from one swap file and write to another swap > > file with a new swap entry allocated. > > Ok, and from what you wrote below, presumably at this point you would > put a redirection pointer in the old location to point to the new one. The swap entry front end (which also owns the swap cache) points to a physical location. > > This way you only have the indirection IF such a relocation actually > happened, correct? Right. The more common case has no redirection at all. > But how do you store new data in the freed up old slot?
That is the split between the front-end swap entry and the physical back end. The front-end swap entry can't be freed until all users release the swap count. The physical back end can be freed. The physical blocks freed by redirection will likely have a different allocator, not the cluster-based swap allocator, because those are just pure blocks. > > > > As to your specific points - we use xarray lookups in the page cache > > > fast path. It's a bold claim to say this would be too much overhead > > > during swapins. > > > > Yes, we just get rid of xarray in swap cache lookup and get some > > performance gain from it. > > You are saying one extra xarray is no problem, can your team demo some > > performance number of impact of the extra xarray lookup in VS? Just > > run some swap benchmarks and share the result. > > Average and worst-case for all common usecases matter. There is no > code on your side for the writeback case. (And it's exceedingly > difficult to even get a mental model of how it would work from your > responses and the slides you have linked). As I said, that slide is only intended to explain, for swap table phase VII, how physical redirection works with the swap cache. swap.tiers defines tiers for swap, and how to move data between the tiers is obviously a natural consideration; I mentioned that in two slides of the 2023 talk. I don't plan that level of detail that far ahead. I try to follow the first principles as best as I can. A lot of decisions will only be made at the later phases. > > > Two, it's not clear to me how you want to make writeback efficient > > > *without* any sort of swap entry redirection. Walking all relevant > > > page tables is expensive; and you have to be able to find them first. > > > > Swap cache can have a physical location redirection, see my 2024 LPC > > slides. I have considered that way before the VS discussion. > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > There are no matches for "redir" in either the email or the slides. Yes, I use a different term in the slides. The continuous entry is the source of the redirection; the non-continuous one is the destination of the redirection. But in my mind I am not redirecting swap entries: the swap entry might have an optional physical location pointer. That is the split between the swap entry front end and the physical layer. Chris
On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote: > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > Do you have a link to that proposal? > > > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > > cache swap entries to different physical locations. > > > That can also work for redirecting swap entries in different swapfiles. > > > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > > > I looked through your slides and the LWN article, but it's very hard > > for me to find answers to my questions in there. > > Naturally, the slide is only intended to cover what is in the current > swap table may be phase VII. > But it does have the physical location pointer consideration. > > > In your proposal, let's say you have a swp_entry_t in the page > > table. What does it describe, and what are the data structures to get > > from this key to user data in the following scenarios: > > Please keep in mind that I don't have every detail design laid out. I > follow the first principles that redirect a swap entry page should > only take an additional 4 byte per swap entry. VS blow up the swap > entry size by something like 24 bytes? Nhat can lay this out in more detail, but there isn't much new stuff in the virtual swap descriptor. It's mostly just a consolidation of state we currently track elsewhere - swap count, swapcache pointer, cgroup ownership etc. The actual indirection is just a word for the backend type,offset. That indirection is the tradeoff for swapped pages. In turn you're getting back all that other stuff for swap slots that *aren't* currently used. This is a win for the vast majority of users. Since you mentioned first principles - the dynamically sized swap space is also much more suitable for compressed pools, which are the dominant form of swap setups nowadays. Again a win for the majority. And the worst-case is reasonable. I don't see the giant gulf you seem to see there. I don't know where it's supposed to be coming from.
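[Editor's note: a minimal sketch, as plain C, of what the consolidated descriptor described above might look like, assuming the fields Johannes lists (swap count, swapcache pointer, cgroup ownership) plus one packed word for the backend type and offset. The field names, widths, and layout are assumptions for illustration, not the layout of the actual virtual swap patches.]

#include <stdint.h>
#include <stdio.h>

#define VSWAP_BACKEND_BITS	2	/* e.g. swapfile, zswap, zram, ... */
#define VSWAP_OFFSET_BITS	(64 - VSWAP_BACKEND_BITS)
#define VSWAP_OFFSET_MASK	((1ULL << VSWAP_OFFSET_BITS) - 1)

struct vswap_desc {
	/* Consolidation of state that is tracked elsewhere today. */
	uint8_t  swap_count;	/* swap_map byte */
	uint32_t memcg_id;	/* cgroup ownership (swap_cgroup) */
	void    *swapcache;	/* folio in the swap cache, if any */
	/* The actual indirection: one word of backend type + offset. */
	uint64_t backend;
};

static inline uint64_t vswap_pack(unsigned int type, uint64_t offset)
{
	return ((uint64_t)type << VSWAP_OFFSET_BITS) | (offset & VSWAP_OFFSET_MASK);
}

static inline unsigned int vswap_type(uint64_t backend)
{
	return backend >> VSWAP_OFFSET_BITS;
}

static inline uint64_t vswap_offset(uint64_t backend)
{
	return backend & VSWAP_OFFSET_MASK;
}

int main(void)
{
	struct vswap_desc d = { .swap_count = 1, .backend = vswap_pack(1, 42) };

	/* Relocation (e.g. zswap -> disk) only rewrites this one word. */
	printf("type %u offset %llu\n", vswap_type(d.backend),
	       (unsigned long long)vswap_offset(d.backend));
	return 0;
}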
On Wed, Nov 26, 2025 at 1:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote: > > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > Do you have a link to that proposal? > > > > > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > > > cache swap entries to different physical locations. > > > > That can also work for redirecting swap entries in different swapfiles. > > > > > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > > > > > I looked through your slides and the LWN article, but it's very hard > > > for me to find answers to my questions in there. > > > > Naturally, the slide is only intended to cover what is in the current > > swap table may be phase VII. > > But it does have the physical location pointer consideration. > > > > > In your proposal, let's say you have a swp_entry_t in the page > > > table. What does it describe, and what are the data structures to get > > > from this key to user data in the following scenarios: > > > > Please keep in mind that I don't have every detail design laid out. I > > follow the first principles that redirect a swap entry page should > > only take an additional 4 byte per swap entry. VS blow up the swap > > entry size by something like 24 bytes? > > Nhat can lay this out in more detail, but there isn't much new stuff Please make sure Nhat does. It shouldn't be a complicated question. > in the virtual swap descriptor. It's mostly just a consolidation of > state we currently track elsewhere - swap count, swapcache pointer, > cgroup ownership etc. All of those will fold into swap table values at later phases. So in this regard, the swap table is not just keeping the status quo; it is more aggressive in conserving memory. If I recall correctly, VS uses atomics for the counters? That blows the 1-byte counter up to 4 bytes. > The actual indirection is just a word for the backend type,offset. Sure. > > That indirection is the tradeoff for swapped pages. In turn you're > getting back all that other stuff for swap slots that *aren't* > currently used. This is a win for the vast majority of users. The swap table does those as well, in the later phases. > > Since you mentioned first principles - the dynamically sized swap > space is also much more suitable for compressed pools, which are the > dominant form of swap setups nowadays. Again a win for the majority. Sure, the swap table does that, especially after the swap cgroup and swap count fold into the swap table. > And the worst-case is reasonable. I don't see the giant gulf you seem > to see there. I don't know where it's supposed to be coming from. Let Nhat confirm the per-swap-entry overhead and let's compare it with the swap table's fully final form. Another easy way is to just run some benchmarks to see how much overhead VS introduces. That being said, I think I have answered enough technical questions of my approach, to let you re-consider my proposal. You should be able to realize by now my approach is more optimal compared to VS. Do you agree or not? We are just arguing how big the gap that is. Chris
On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > That being said, I think I have answered enough technical questions > of > my approach, to let you re-consider my proposal. You should be able > to > realize by now my approach is more optimal compared to VS. Do you > agree or not? We are just arguing how big the gap that is. > We would have much more confidence in your solution if you had told us exactly how you were planning to solve things in future stages of the project. A "I'll solve it, but I can't tell you how" is not very confidence inspiring. -- All Rights Reversed.
On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> wrote: > > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > > > That being said, I think I have answered enough technical questions > > of > > my approach, to let you re-consider my proposal. You should be able > > to > > realize by now my approach is more optimal compared to VS. Do you > > agree or not? We are just arguing how big the gap that is. > > > > We would have much more confidence in your > solution if you had told us exactly how > you were planning to solve things in future > stages of the project. Can you clarify who is "We", sorry, I am not part of your Meta kernel team circle. I just replied to you and others about how to solve the other things. If you have further questions, please ask a clarifying question. Until you ask, I don't know which part of the Swap Pony plan is unclear to you and needs more clarification. > A "I'll solve it, but I can't tell you how" > is not very confidence inspiring. There is no need for this kind of innuendo, and it is not helping. Please stay on the technical side of the discussion and try not to project personal judgement, thanks. Please keep in mind that I am just one person love kernel hacking and want to do the right things. I am doing this at my spare time, it is not part of my company OKR's to work on upstream swap in the last two years. I don't get pay to do this. I am replying this email from my vacation 5am in the morning. Again, let's stay technical. If you think I am holding any secret (I am not ), please just ask a clarify question. Thanks for your cooperation, and sorry that I did not have a chance to explain things better earlier. Chris
On Thu, 2025-11-27 at 05:52 +0400, Chris Li wrote: > On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> > wrote: > > > > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > > > > > That being said, I think I have answered enough technical > > > questions > > > of > > > my approach, to let you re-consider my proposal. You should be > > > able > > > to > > > realize by now my approach is more optimal compared to VS. Do you > > > agree or not? We are just arguing how big the gap that is. > > > > > > > We would have much more confidence in your > > solution if you had told us exactly how > > you were planning to solve things in future > > stages of the project. > > Can you clarify who is "We", Sorry, I am talking about upstream. When one developer has code, and somebody else emails the equivalent of "trust me, bro", the code is usually preferred. > > Please keep in mind that I am just one person love kernel hacking and > want to do the right things. I am doing this at my spare time, it is > not part of my company OKR's to work on upstream swap in the last two > years. I don't get pay to do this. I am replying this email from my > vacation 5am in the morning. > > Again, let's stay technical. If you think I am holding any secret (I > am not ), please just ask a clarify question. I really appreciate anybody participating in Linux kernel development. Linux is good because different people bring different perspectives to the table. Some real numbers, even if just back of the envelope math to estimate the overhead of various ideas being proposed, are often a good way to move a discussion along in a productive direction. Let me reply to your other email with some more technical details. -- All Rights Reversed.
On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > Sorry, I am talking about upstream. So far I have not had a pleasant upstream experience when submitting this particular patch to upstream. > I really appreciate anybody participating in Linux > kernel development. Linux is good because different > people bring different perspectives to the table. Of course everybody is welcome. However, NACK without technical justification is very bad for upstream development. I can't imagine what a new hacker would think after going through what I have gone through for this patch. He/she will likely quit contributing upstream. This is not the kind of welcome we want. Nhat needs to be able to technically justify his NACK as a maintainer. Sorry there is no other way to sugar coat it. Chris
On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > Sorry, I am talking about upstream. > > So far I have not had a pleasant upstream experience when submitting > this particular patch to upstream. > > > I really appreciate anybody participating in Linux > > kernel development. Linux is good because different > > people bring different perspectives to the table. > > Of course everybody is welcome. However, NACK without technical > justification is very bad for upstream development. I can't imagine > what a new hacker would think after going through what I have gone > through for this patch. He/she will likely quit contributing upstream. > This is not the kind of welcome we want. > > Nhat needs to be able to technically justify his NACK as a maintainer. > Sorry there is no other way to sugar coat it. I am NOT the only zswap maintainer who expresses concerns. Other people also have their misgivings, so I have let them speak and not put words in their mouths. But since you have repeatedly singled me out, I will repeat my concerns here: 1. I don't like the operational overhead of a static swapfile (having to statically size the zswap swapfile for each <host x workload> combination). Misspecification of the swapfile size can lead to unacceptable swap metadata overhead on small machines, or underutilization of zswap on big machines. And it is *impossible* to know how much zswap will be needed ahead of time, even if we fix the host - it depends on workload access patterns, memory compressibility, and latency/memory pressure tolerance. 2. I don't like the maintenance overhead (supporting special infrastructure for a very specific use case, i.e. no-writeback), especially since I'm not convinced this can be turned into a general architecture. See below. 3. I want to move us towards a more dynamic architecture for zswap. This is a step in the WRONG direction. 4. I don't believe this buys us anything we can't already do with userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only swap option here), with writeback disabled, is 2-3 lines of script. I believe I already justified myself well enough :) It is you who have not really convinced me that this is, at the very least, a temporary/first step towards a long-term generalized architecture for zswap. Every time we pointed out an issue, you seem to justify it with more vague ideas that deepen the confusion. Let's recap the discussion so far: 1. We claimed that this architecture is hard to extend for efficient zswap writeback, or backend transfer in general, without incurring page table updates. You claim you plan to implement a redirection entry to solve this. 2. We then pointed out that inserting a redirect entry into the current physical swap infrastructure will leave holes in the upper swap tier's address space, which is arguably *worse* than the current status quo of zswap occupying disk swap space. Again, you pull out some vague ideas about "frontend" and "backend" swap, which, frankly, is conceptually very similar to swap virtualization. 3. The dynamicization of swap space is treated with the same rigor (or, more accurately, lack thereof). Just more handwaving about the "frontend" vs "backend" (which, again, is very close to swap virtualization). This requirement is a deal breaker for me - see requirement 1 above again. 4. We also pointed out your lack of thought about swapoff optimization, which, again, seems to be missing from your design. 
Again, more vagueness about rmap, which is probably more overhead. Look man, I'm not being hostile to you. Believe me on this - I respect your opinion, and I'm working very hard on reducing memory overhead for virtual swap, to see if I can meet you where you want it to be. The RFC's original design inefficient memory usage was due to: a) Readability. Space optimization can make it hard to read code, when fields are squeezed into the same int/long variable. So I just put one different field for each piece of metadata information b) I was playing with synchronization optimization, i.e using atomics instead of locks, and using per-entry locks. But I can go back to using per-cluster lock (I haven't implemented cluster allocator at the time of the RFC, but in my latest version I have done it), which will further reduce the memory overhead by removing a couple of fields/packing more fields. The only non-negotiable per-swap-entry overhead will be a field to indicate the backend location (physical swap slot, zswap entry, etc.) + 2 bits to indicate the swap type. With some field union-ing magic, or pointer tagging magic, we can perhaps squeeze it even harder. I'm also working on reducing the CPU overhead - re-partitioning swap architectures (swap cache, zswap tree), reducing unnecessary xarray lookups where possible. We can then benchmark, and attempt to optimize it together as a community.
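To make that packing concrete, here is a standalone userspace sketch of the general shape described above: a single 8-byte descriptor carrying a 2-bit backend type plus the backend location. Every name in it is hypothetical, not a proposed kernel interface.

/*
 * Standalone userspace sketch (not kernel code) of the kind of packing
 * described above: one 8-byte word per swap entry, with the low 2 bits
 * used as a backend-type tag and the rest holding the backend location.
 * All names here are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

enum vswap_backend {
	VSWAP_PHYS  = 0,	/* physical swap slot offset */
	VSWAP_ZSWAP = 1,	/* index of a zswap entry */
	VSWAP_FOLIO = 2,	/* page still resident in the swap cache */
};

typedef uint64_t vswap_desc_t;	/* low 2 bits: backend type, rest: location */

#define VSWAP_TYPE_BITS	2
#define VSWAP_TYPE_MASK	((1ULL << VSWAP_TYPE_BITS) - 1)

static inline vswap_desc_t vswap_pack(enum vswap_backend type, uint64_t loc)
{
	return (loc << VSWAP_TYPE_BITS) | type;
}

static inline enum vswap_backend vswap_type(vswap_desc_t d)
{
	return (enum vswap_backend)(d & VSWAP_TYPE_MASK);
}

static inline uint64_t vswap_loc(vswap_desc_t d)
{
	return d >> VSWAP_TYPE_BITS;
}

int main(void)
{
	vswap_desc_t d = vswap_pack(VSWAP_ZSWAP, 12345);

	printf("type=%d loc=%llu desc size=%zu bytes\n",
	       vswap_type(d), (unsigned long long)vswap_loc(d), sizeof(d));
	return 0;
}

Whether the location field holds a slot offset, a table index, or a tagged pointer is exactly the union/tagging trade-off being discussed here.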
On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > Sorry, I am talking about upstream. > > > > So far I have not had a pleasant upstream experience when submitting > > this particular patch to upstream. > > > > > I really appreciate anybody participating in Linux > > > kernel development. Linux is good because different > > > people bring different perspectives to the table. > > > > Of course everybody is welcome. However, NACK without technical > > justification is very bad for upstream development. I can't imagine > > what a new hacker would think after going through what I have gone > > through for this patch. He/she will likely quit contributing upstream. > > This is not the kind of welcome we want. > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > Sorry there is no other way to sugar coat it. > > I am NOT the only zswap maintainer who expresses concerns. Other > people also have their misgivings, so I have let them speak and not > put words in their mouths. You did not mention the fact that both two NACK from zswap maintainers are from the same company. I assume you have some kind of team sync. There is a term for that, called "person acting in concert". What I mean in "technically unjustifiable" is that VS patch series is a non-starter to merge into mainline. In this email you suggest the per swap slot memory overhead is 48 bytes previously 64 bytes. https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/ Do you have newer VS that significantly reduce that? If so, what is the new number? The starting point before your VS is 11 bytes (3 bytes static, 8 bytes dynamic). 48bytes is more than 4x the original size. This will have a huge impact on the deployment that uses a lot of swap. The worst part is that once your VS series is in the kernel. That overhead is always on, it is forcing the overhead even if the redirection is not used. This will hurt Google's fleet very badly if deployed. Because of the same jobs, the kernel memory consumption will jump up and fail jobs. Every body's kernel who use swap will suffer because it is always on. The alternative, the swap table, uses much less overhead. So your VS leave money on the table. So I consider your VS is a non-starter. I repeatedly call you out because you keep dodging this critical question. Johannes refers to you for the detail value of the overhead as well. Dodging critical questions makes a technical debate very difficult to conduct and drive to a conflict resolution impossible. BTW, this is my big concern on the 2023 swap abstraction talk which our VS is based on. The community feedback at the time strongly favored my solution. I don't understand why you reboot the community un-favored solution without addressing those concerns. The other part of the bad experience is that you NACK first then ask clarifying questions later. The proper order is the other way around. You should fully understand the subject BEFORE you NACK on it. NACK is a very serious business. I did try my best to answer clarification question from your team. I appreciate that Johannes and Yosry ask clarification to advance the discussion. I did not see more question from them I assume they got what they want to know. 
But since you have repeatedly singled me out, I will repeat my concerns here:

1. I don't like the operational overhead (to statically size the zswap swapfile size for each <host x workload> combination) of a static swapfile. Misspecification of the swapfile size can lead to unacceptable swap metadata overhead on small machines, or underutilization of zswap on big machines. And it is *impossible* to know how much zswap will be needed ahead of time, even if we fix the host - it depends on workload access patterns, memory compressibility, and latency/memory pressure tolerance.

2. I don't like the maintainer's overhead (to support a special infrastructure for a very specific use case, i.e. no-writeback), especially since I'm not convinced this can be turned into a general architecture. See below.

3. I want to move us towards a more dynamic architecture for zswap. This is a step in the WRONG direction.

4. I don't believe this buys us anything we can't already do with userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only swap option here), with writeback disabled, is 2-3 lines of script.

I believe I already justified myself well enough :) It is you who have not really convinced me that this is, at the very least, a temporary/first step towards a long-term generalized architecture for zswap. Every time we pointed out an issue, you seem to justify it with some more vague ideas that deepen the confusion.

Let's recap the discussion so far:

1. We claimed that this architecture is hard to extend for efficient zswap writeback, or backend transfer in general, without incurring page table updates. You claim you plan to implement a redirection entry to solve this.

2. We then pointed out that inserting a redirect entry into the current physical swap infrastructure will leave holes in the upper swap tier's address space, which is arguably *worse* than the current status quo of zswap occupying disk swap space. Again, you pull out some vague ideas about "frontend" and "backend" swap, which, frankly, is conceptually very similar to swap virtualization.

3. The dynamicization of swap space is treated with the same rigor (or, more accurately, lack thereof). Just more handwaving about the "frontend" vs "backend" (which, again, is very close to swap virtualization). This requirement is a deal breaker for me - see requirement 1 above again.

4. We also pointed out your lack of thoughts on swapoff optimization, which again seem to be missing from your design. Again, more vagueness about rmap, which is probably more overhead.

Look man, I'm not being hostile to you. Believe me on this - I respect your opinion, and I'm working very hard on reducing memory overhead for virtual swap, to see if I can meet you where you want it to be. The RFC's original design's inefficient memory usage was due to:

a) Readability. Space optimization can make code hard to read, when fields are squeezed into the same int/long variable. So I just put one different field for each piece of metadata information.

b) I was playing with synchronization optimization, i.e. using atomics instead of locks, and using per-entry locks. But I can go back to using a per-cluster lock (I hadn't implemented the cluster allocator at the time of the RFC, but in my latest version I have done it), which will further reduce the memory overhead by removing a couple of fields/packing more fields.

The only non-negotiable per-swap-entry overhead will be a field to indicate the backend location (physical swap slot, zswap entry, etc.) + 2 bits to indicate the swap type. With some field union-ing magic, or pointer tagging magic, we can perhaps squeeze it even harder.

I'm also working on reducing the CPU overhead - re-partitioning swap architectures (swap cache, zswap tree), reducing unnecessary xarray lookups where possible.

We can then benchmark, and attempt to optimize it together as a community.
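For scale, a back-of-the-envelope sketch using the per-slot figures quoted in this thread (11 bytes for the current baseline, 48 bytes quoted for the VS RFC); the 256 GiB swapfile size is only an example value, not a measurement.

/*
 * Back-of-the-envelope sketch of per-swap-slot metadata at scale, using
 * the figures quoted in this thread: 11 bytes/slot baseline vs. 48
 * bytes/slot in the VS RFC. The 256 GiB swap size is an arbitrary example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long swap_bytes = 256ULL << 30;	/* 256 GiB of swap */
	const unsigned long long slots = swap_bytes >> 12;	/* 4 KiB per slot */

	printf("slots:           %llu\n", slots);
	printf("baseline, 11 B:  %llu MiB\n", slots * 11 >> 20);
	printf("VS RFC,   48 B:  %llu MiB\n", slots * 48 >> 20);
	return 0;
}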
On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote: > > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > Sorry, I am talking about upstream. > > > > > > So far I have not had a pleasant upstream experience when submitting > > > this particular patch to upstream. > > > > > > > I really appreciate anybody participating in Linux > > > > kernel development. Linux is good because different > > > > people bring different perspectives to the table. > > > > > > Of course everybody is welcome. However, NACK without technical > > > justification is very bad for upstream development. I can't imagine > > > what a new hacker would think after going through what I have gone > > > through for this patch. He/she will likely quit contributing upstream. > > > This is not the kind of welcome we want. > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > Sorry there is no other way to sugar coat it. > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > people also have their misgivings, so I have let them speak and not > > put words in their mouths. > > You did not mention the fact that both two NACK from zswap maintainers > are from the same company. I assume you have some kind of team sync. > There is a term for that, called "person acting in concert". I mean, Yosry pointed out issues with your approach too. Yosry is from your company, no? The issues I pointed out have all been technical, thus far. I never even brought up Meta - I'm sure other parties have the same issues. > > What I mean in "technically unjustifiable" is that VS patch series is > a non-starter to merge into mainline. > In this email you suggest the per swap slot memory overhead is 48 > bytes previously 64 bytes. > > https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/ > > Do you have newer VS that significantly reduce that? If so, what is > the new number? > > The starting point before your VS is 11 bytes (3 bytes static, 8 bytes > dynamic). 48bytes is more than 4x the original size. > This will have a huge impact on the deployment that uses a lot of > swap. The worst part is that once your VS series is in the kernel. > That overhead is always on, it is forcing the overhead even if the > redirection is not used. This will hurt Google's fleet very badly if > deployed. Because of the same jobs, the kernel memory consumption will > jump up and fail jobs. Every body's kernel who use swap will suffer > because it is always on. The alternative, the swap table, uses much > less overhead. So your VS leave money on the table. > > So I consider your VS is a non-starter. I repeatedly call you out > because you keep dodging this critical question. Johannes refers to > you for the detail value of the overhead as well. Dodging critical > questions makes a technical debate very difficult to conduct and drive > to a conflict resolution impossible. BTW, this is my big concern on > the 2023 swap abstraction talk which our VS is based on. The community > feedback at the time strongly favored my solution. I don't understand > why you reboot the community un-favored solution without addressing > those concerns. 
I reboot the VS work because I have not seen any indication that your design could solve the problems I believe are fundamental for any swap architecture: dynamicization of swap space and efficient backend transfer, to name two.

> The other part of the bad experience is that you NACK first then ask clarifying questions later. The proper order is the other way around. You should fully understand the subject BEFORE you NACK on it. NACK is a very serious business.
>
> I did try my best to answer clarification question from your team. I appreciate that Johannes and Yosry ask clarification to advance the discussion. I did not see more question from them I assume they got what they want to know. If you still feel something is missing out, you should ask a follow up question for the part in which you need more clarification. We can repeat until you understand. You keep using the phrase "hand waving" as if I am faking it. That is FUD. Communication is a two way street. I can't force you to understand, asking more questions can help you. This is complex problem. I am confident I can explain to Kairui and he can understand, because he has a lot more context, not because I am faking it. Ask nicely so I can answer nicely. Stay in the technical side of the discussion please.
>
> So I consider using VS to NACK my patch is technically unjustifiable.

I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing it because of the technical requirements I pointed out above. Virtual swap happens to neatly solve all of them, by design, from first principles. I never ruled out the possibility of another design that would satisfy all of them - I just did not see enough from you to believe otherwise. I don't believe a static ghost swapfile is it.

In fact, you CAN theoretically implement virtual swap with a ghost swapfile as well. The staticity will just make it operationally untenable. The next step would be to dynamicize the swap infrastructure, at which point we revert to the original VS design. I see the same thing playing out in your response as well, with the redirection entry, then frontend/backend swap space. It's starting to eerily resemble virtual swap. Or maybe you can clarify?

> Your current VS with 48 byte overhead is not usable at all as an standard upstream kernel. Can we agree to that?

Sure, which is why I sent it as an RFC and not as an actual patch series pending merging :) Its main purpose was to demonstrate the workflow of how a feature-complete virtual swap subsystem might behave, in all of the code paths of the memory subsystem. I can then optimize the fields piecemeal, while weighing the tradeoffs (such as lock granularity vs. lock fields memory overhead). You and Kairui are welcome to criticize, comment, and help me optimize it, as did Yosry and Johannes in the past.

> As we all know, using less memory to function the same is a lot harder than using more. If you can dramatically reduce the memory usage, you

I don't necessarily disagree. I would, however, like to point out that the reverse is true too - you can't necessarily compare the overhead of two designs where one achieves a lot more in terms of features and/or operational goals than the other.

> likely need to rebuild the whole patch series from scratch. If might force you to use solution similar to swap table, in that case why not join team swap table?

Because even with the current swap table design, the allocator is *still* static.
I would LOVE to use the current physical swap allocation infrastructure. It just doesn't work in its current state.

> We can reopen the topic again by then if you have a newer VS:

Sure.
On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote:
> On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote:
> > > >
> > > > Sorry, I am talking about upstream.
> > >
> > > So far I have not had a pleasant upstream experience when submitting this particular patch to upstream.
> > >
> > > > I really appreciate anybody participating in Linux kernel development. Linux is good because different people bring different perspectives to the table.
> > >
> > > Of course everybody is welcome. However, NACK without technical justification is very bad for upstream development. I can't imagine what a new hacker would think after going through what I have gone through for this patch. He/she will likely quit contributing upstream. This is not the kind of welcome we want.
> > >
> > > Nhat needs to be able to technically justify his NACK as a maintainer. Sorry there is no other way to sugar coat it.
> >
> > I am NOT the only zswap maintainer who expresses concerns. Other people also have their misgivings, so I have let them speak and not put words in their mouths.
>
> You did not mention the fact that both two NACK from zswap maintainers are from the same company. I assume you have some kind of team sync. There is a term for that, called "person acting in concert".

For the benefit of anybody following this from the sidelines, the third zswap maintainer also expressed concerns about Chris's proposal upthread. He works for the same company as Chris.

The reality is that Chris is failing to convince others of his design direction, and is now obviously resorting to manipulation and ad hominem attacks.

During the course of this thread, Chris has asked for "a little faith" that his idea will work for all stated requirements, without deeming it necessary to explain how.

When probed on technical details, he stated that he doesn't like to plan that far ahead, and prefers having somebody else iron out the implementation details. He also referred to high-level slides from his LSFMM '24 session - which was received thusly[1]:

    Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain".

    Jan Kara said that existing filesystem designs are not suited to this task.

    Hildenbrand said that this plan was introducing too much complexity.

His first response to criticism was to invoke his <4 week status of swap maintainer.

Meanwhile, the design direction that Chris is construing as a single-company conspiracy is anything but. The collaborative origins of these patches are well documented. Chris was CC'd on those RFCs. He notably did not engage in them. He is now lying about the narrative and choosing to attack these patches in bad faith and out of context.

This pattern of behavior gives me low confidence that Chris is able to collaborate and compromise on a design that works for all users.

And while Chris has been quite vocal and opinionated in mailing list discussions, his actual code contributions to the kernel do not instill confidence that he can solve this problem by himself, either.

[1] https://lwn.net/Articles/974587/
On Tue, Dec 2, 2025 at 12:47 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote: > > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > > > Sorry, I am talking about upstream. > > > > > > > > So far I have not had a pleasant upstream experience when submitting > > > > this particular patch to upstream. > > > > > > > > > I really appreciate anybody participating in Linux > > > > > kernel development. Linux is good because different > > > > > people bring different perspectives to the table. > > > > > > > > Of course everybody is welcome. However, NACK without technical > > > > justification is very bad for upstream development. I can't imagine > > > > what a new hacker would think after going through what I have gone > > > > through for this patch. He/she will likely quit contributing upstream. > > > > This is not the kind of welcome we want. > > > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > > Sorry there is no other way to sugar coat it. > > > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > > people also have their misgivings, so I have let them speak and not > > > put words in their mouths. > > > > You did not mention the fact that both two NACK from zswap maintainers > > are from the same company. I assume you have some kind of team sync. > > There is a term for that, called "person acting in concert". > > For the benefit of anybody following this from the sidelines, the > third zswap maintainer also expressed concerns about Chris's proposal > upthread. He works for the same company as Chris. > > The reality is that Chris is failing to convince others of his design > direction, and is now obviously resorting to manipulation and hominem > attacks. > > During the course of this thread, Chris has asked for "a little faith" > that his idea will work for all stated requirements, without deeming > it necessary to explain how. > > When probed on technical details, he stated that he doesn't like to > plan that far ahead, and prefers having somebody else iron out the > implementation details. He also referred to high-level slides from his > LSFMM '24 session - which was received thusly[1]: > > Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain". > > Jan Kara said that existing filesystem designs are not suited to this task > > Hildenbrand said that this plan was introducing too much complexity > > His first response to criticism was to invoke his <4 week status of > swap maintainer. > > Meanwhile, the design direction that Chris is construing as a single > company conspiracy is anything but. The collaborative origins of these > patches are well documented. Chris was CC'd on those RFCs. He notably > did not engage in them. He is now lying about the narrative and > choosing to attack these patches in bad faith and out of context. > > This pattern of behavior gives me low confidence that Chris is able to > collaborate and compromise on a design that works for all users. > > And while Chris has been quite vocal and opinionated in mailing list > discussions, his actual code contributions to the kernel do not > instill confidence that he can solve this problem by himself, either. 
Hi all,

I'd really prefer we all let things cool off a bit before the thread gets too dramatic. :)

Sorry to see that the discussion went quite off topic; still, I believe this is some kind of misunderstanding of Chris' intention to improve the kernel in a more generic way.

From my perspective, Chris did co-develop, suggest, review or author many of the implementation details around the swap-table idea, and he implemented the swap cluster allocator in 6.11, which unlocked a bunch of follow-on optimizations.

I've been working on swap for a while as well and have rewritten and refactored large parts of the swap code, swap allocator and swap cache (mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah, I'm not a kernel vet with decades of patches yet, but I do think I'm familiar enough with swap. I think Chris' work, words or code, has been looking good in the end results.

It's hard to put a penthouse on a sandcastle, and maybe that's what makes it hard to describe or lay out the further implementations of swap. We all struggled with the swap subsystem a lot; the code base served us well, but it had accumulated a lot of historical complexity and awkward workarounds over time (we had so many people in the community complaining about it for so many years). I think we all agree that pursuing incremental cleanups and improvements (e.g. swap slot cache cleanup, swap lock cleanup, swap_has_cache cleanup, direct-swap workarounds removal, etc.) is more suitable upstream. Chris also helped a lot with this (e.g. the LPC talk last year) and we finally got rid of many long-time burdens; quite a few of these works were directly enabled by his swap allocator rework first.

And I do have a more complete branch that I posted several times showing that the end result of swap tables is better memory consumption & performance, and the code is much simpler than what we have in upstream. It's getting merged step by step, and each step is a gain. I believe that is the right way to improve things upstream: everyone and every workload benefits, and progressively. And based on that, we will be able to implement things much more easily.

I believe things will look much better and cleaner as we progress (e.g. resizing might be doable for generic swap too), making it easier for all of us and making the swap subsystem better in a collaborative way.

Cheers.
On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > Zswap is primarily a compressed cache for real swap on secondary storage. It's indeed quite important that entries currently in zswap don't occupy disk slots; but for a solution to this to be acceptable, it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.
>
How would you do writeback from a zswap entry in a ghost swapfile, to a real disk swap backend?

That is the use case people are trying to solve.

How would your architecture address it?

--
All Rights Reversed.
On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> > wrote:
> > >
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in
> > > zswap
> > > don't occupy disk slots; but for a solution to this to be
> > > acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
> >
> How would you do writeback from a zswap entry in
> a ghost swapfile, to a real disk swap backend?
Basically, each swapfile has its own version of the swap
ops->{read,write}_folio(). The mem swap tier is similar to the current
zswap, but it is memory only: there is no file backing and it doesn't
share swap entries with the real swapfile.
When writing back from one swap entry to another swapfile, for the
simple case, the data is uncompressed, stored into the swap cache, and
written to the other swapfile with a newly allocated swap entry. The
front end of the swap cache will have the option to map the front-end
swap entry offset to the back-end block location, at the memory price
of 4 bytes per swap entry.
This kind of physical block redirection does not only happen across
more than one swapfile; it can also happen within the same swapfile,
in the situation where there is available space in lower-order swap
entries but a higher-order entry cannot be allocated because those
lower-order slots are not contiguous. In such a case, the swapfile can
extend the high-order swap entry beyond the end of the current
physical swapfile, then map the contiguous high-order swap entries
onto the low-order physical locations. I have some slides I shared in
the 2024 LSF swap pony talk with diagrams of that physical swap
location redirection.
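A rough standalone sketch of that shape, with hypothetical names rather than proposed kernel interfaces: a per-tier ops table plus an optional 4-bytes-per-slot redirect array that is only allocated when redirection is actually used.

/*
 * Rough userspace sketch of the idea above: each swap tier supplies its
 * own read/write ops, and a tier that redirects front-end offsets to
 * back-end block locations pays 4 bytes per slot for a redirect array.
 * Tiers that never redirect leave it NULL and pay nothing. All names
 * here are hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

struct sketch_folio;			/* stand-in for struct folio */

struct swap_tier_ops {
	int (*read_folio)(struct sketch_folio *folio, uint64_t block);
	int (*write_folio)(struct sketch_folio *folio, uint64_t block);
};

struct swap_tier {
	const struct swap_tier_ops *ops;
	uint64_t nr_slots;
	uint32_t *redirect;		/* optional: front-end slot -> back-end block */
};

int tier_read(struct swap_tier *t, struct sketch_folio *folio, uint64_t offset)
{
	uint64_t block = offset;

	if (t->redirect)		/* only redirecting tiers take this hit */
		block = t->redirect[offset];
	return t->ops->read_folio(folio, block);
}

int tier_enable_redirect(struct swap_tier *t)
{
	/* 4 bytes per swap slot, allocated only when redirection is used */
	t->redirect = calloc(t->nr_slots, sizeof(*t->redirect));
	return t->redirect ? 0 : -1;
}

A tier that never redirects keeps the array NULL, which is where the "no redirection, no overhead" property comes from.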
> That is the use case people are trying to solve.
Yes, me too.
> How would your architecture address it?
The cluster-based swap allocator, the swap table as the new swap
cache, per-cgroup swap.tiers and the VFS-like swap ops all work
together as the grand vision for the new swap system. I might not have
an answer for all the design details right now. I am the type of
person who likes to improvise and adjust the design details when more
detailed design constraints are found. So far I have found this design
works well. Some of the early milestones, the swap allocator and swap
tables, have already landed in the kernel and show great results.
I consider this much better than VS (the previous swap abstraction).
It does not enforce pain the way VS does. One of the big downsides of
VS is that, once it is applied to the kernel, even normal swap that
does not use redirection will pay the price for it as well. The pain
is mandatory. My swap.tiers writeback does not have this problem: with
no writeback and no redirection of physical blocks, there is no
additional memory or CPU overhead to pay.
Chris
On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> wrote:
> >
> > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > <hannes@cmpxchg.org>
> > > wrote:
> > > >
> > > >
> > > > Zswap is primarily a compressed cache for real swap on
> > > > secondary
> > > > storage. It's indeed quite important that entries currently in
> > > > zswap
> > > > don't occupy disk slots; but for a solution to this to be
> > > > acceptable,
> > > > it has to work with the primary usecase and support disk
> > > > writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> > >
> > How would you do writeback from a zswap entry in
> > a ghost swapfile, to a real disk swap backend?
>
> Basically, each swap file has its own version swap
> ops->{read,write}_folio(). The mem swap tier is similar to the
> current
> zswap but it is memory only, there is no file backing and don't share
> swap entries with the real swapfile.
>
> When writing back from one swap entry to another swapfile, for the
> simple case of uncompressing the data, data will store to swap cache
> and write to another swapfile with allocated another swap entry. The
> front end of the swap cache will have the option map the front end
> swap entry offset to the back end block locations. At the memory
> price
> of 4 byte per swap entry.
Wait, so you use the swap cache radix tree to
indicate the physical location of data between
multiple swap devices?
Isn't that exactly what the vswap approach
does, too?
How is this different?
--
All Rights Reversed.
On Mon, Nov 24, 2025 at 8:43 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> > wrote:
> > >
> > > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > > <hannes@cmpxchg.org>
> > > > wrote:
> > > > >
> > > > >
> > > > > Zswap is primarily a compressed cache for real swap on
> > > > > secondary
> > > > > storage. It's indeed quite important that entries currently in
> > > > > zswap
> > > > > don't occupy disk slots; but for a solution to this to be
> > > > > acceptable,
> > > > > it has to work with the primary usecase and support disk
> > > > > writeback.
> > > >
> > > > Well, my plan is to support the writeback via swap.tiers.
> > > >
> > > How would you do writeback from a zswap entry in
> > > a ghost swapfile, to a real disk swap backend?
> >
> > Basically, each swap file has its own version swap
> > ops->{read,write}_folio(). The mem swap tier is similar to the
> > current
> > zswap but it is memory only, there is no file backing and don't share
> > swap entries with the real swapfile.
> >
> > When writing back from one swap entry to another swapfile, for the
> > simple case of uncompressing the data, data will store to swap cache
> > and write to another swapfile with allocated another swap entry. The
> > front end of the swap cache will have the option map the front end
> > swap entry offset to the back end block locations. At the memory
> > price
> > of 4 byte per swap entry.
>
> Wait, so you use the swap cache radix tree to
> indicate the physical location of data between
> multiple swap devices?
Ah, you haven't caught up with the progress: the new swap cache does
not use radix trees any more. It uses swap tables. A lookup is a
512-entry swap table array index, with no tree lookup. Much faster,
with fewer locks. The swap table commits show about a 20% difference
in throughput in some benchmark workloads.
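A simplified userspace sketch of that lookup shape; the names and the entry encoding are illustrative only, not the kernel's.

/*
 * Simplified userspace sketch of the lookup shape described above: each
 * swap cluster carries a flat 512-entry table, so a swap cache lookup is
 * plain array indexing rather than a radix tree / xarray walk. Names and
 * the entry encoding are illustrative only, not the kernel's.
 */
#include <stdint.h>

#define SWAPFILE_CLUSTER 512			/* slots per cluster */

struct swap_table {
	uintptr_t entries[SWAPFILE_CLUSTER];	/* folio pointer, shadow, or 0 */
};

struct swap_cluster {
	struct swap_table *table;
};

struct sketch_swap_info {
	struct swap_cluster *clusters;		/* one per 512-slot cluster */
};

uintptr_t swap_cache_lookup(struct sketch_swap_info *si, uint64_t offset)
{
	struct swap_cluster *ci = &si->clusters[offset / SWAPFILE_CLUSTER];

	/* two array indexes, no tree traversal, no per-lookup node walk */
	return ci->table->entries[offset % SWAPFILE_CLUSTER];
}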
> Isn't that exactly what the vswap approach
> does, too?
Except that I proposed it earlier:
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
That swap cache physical entry redirection is my original idea, as far
as I can tell, and was presented at the conference earlier.
> How is this different?
The main difference is that I just got rid of the xarray in the swap
cache lookup. I don't want to re-introduce it.
Also, in my swap.tiers design the redirection overhead is optional: if
you are not using redirection in the swap.tiers swap ops, you don't
pay for it, just like the ghost swapfile. With VS it is not optional;
the overhead is enforced regardless. In my design the per-swap-entry
memory overhead will also be smaller because it will be integrated
tightly with the swap entry.
Chris
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in zswap > > don't occupy disk slots; but for a solution to this to be acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. > > > This direction is a dead-end. Please take a look at Nhat's swap > > virtualization patches. They decouple zswap from disk geometry, while > > still supporting writeback to an actual backend file. > > Yes, there are many ways to decouple zswap from disk geometry, my swap > table + swap.tiers design can do that as well. I have concerns about > swap virtualization in the aspect of adding another layer of memory > overhead addition per swap entry and CPU overhead of extra xarray > lookup. I believe my approach is technically superior and cleaner. True, but the static nature of the current swapfile infrastructure also imposes an space overhead and/or operational overhead. I did play around with a prototype with a ghost swapfile for virtual swap, but had to stop because of the swapfile overhead for larger virtual swap space. > Both faster and cleaner. Basically swap.tiers + VFS like swap read > write page ops. I will let Nhat clarify the performance and memory That just solves static placement, no? Backend transfer requires something extra/orthogonal. > overhead side of the swap virtualization. > > I am not against swap entry redirection. Just the swap virtualization There will be redirection either way. I don't think it's avoidable. The only option is whether to shove it into the backend (what zram is doing), or having a generalized module (swap virtualization). Or do a page table walk every time you want to do backend transfer (what swapoff is doing). > series needs to compare against the alternatives in terms of memory > overhead and throughput. > Solving it from the swap.tiers angle is cleaner. > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> > > I take that the only relevant part is you are zswap maintainer and I > am the swap maintainer. Fine. I got the message. I will leave the > zswap alone. I will find other ways to address the memory base swap > tiers in swap.tiers. Please keep this discussion technical and not pull ranks unnecessarily. > > Chris
On Sat, Nov 22, 2025 at 10:09 AM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in zswap > > don't occupy disk slots; but for a solution to this to be acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. That sounds interesting. Have been watching YoungJun and yours swap.tiers discussion for a while, looking forward to see how they play together. Using tiering to resolve the writeback issue sounds like a nice solution, we definitely don't want to limit the writeback to zswap/ram-block only, we will also want things like block-block writeback. We (and I have noticed many community users) have setups involving hybrid tiers. We have a internal module that moves swap entry from SSD to HDD too. To do it upstreamly we need something like the swap.tiers. > > > This direction is a dead-end. Please take a look at Nhat's swap > > virtualization patches. They decouple zswap from disk geometry, while > > still supporting writeback to an actual backend file. > > Yes, there are many ways to decouple zswap from disk geometry, my swap > ... > Solving it from the swap.tiers angle is cleaner. Agree with the swap.tiers part, that sounds cleaner. > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> I think that's too early to justify. Let's stay open for ideas.
On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > The current zswap requires a backing swapfile. The swap slot used > by zswap is not able to be used by the swapfile. That waste swapfile > space. > > The ghost swapfile is a swapfile that only contains the swapfile header > for zswap. The swapfile header indicate the size of the swapfile. There > is no swap data section in the ghost swapfile, therefore, no waste of > swapfile space. As such, any write to a ghost swapfile will fail. To > prevents accidental read or write of ghost swapfile, bdev of > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > flag because there is no rotation disk access when using zswap. Would this also affect the swap slot allocation algorithm? > > The zswap write back has been disabled if all swapfiles in the system > are ghost swap files. I don't like this design: 1. Statically sizing the compression tier will be an operational nightmare, for users that have to support a variety (and increasingly bigger sized) types of hosts. It's one of the primary motivations of the virtual swap line of work. We need to move towards a more dynamic architecture for zswap, not the other way around, in order to reduce both (human's) operational overhead, AND actual space overhead (i.e only allocate (z)swap metadata on-demand). 2. This digs us in the hole of supporting a special infrastructure for non-writeback cases. Now every future change to zswap's architecture has to take this into account. It's not easy to turn this design into something that can support writeback - you're stuck with either having to do an expensive page table walk to update the PTEs, or shoving the virtual swap layer inside zswap. Ugly. 3. And what does this even buy us? Just create a fake in-memory-only swapfile (heck, you can use zram), disable writeback (which you can do both at a cgroup and host-level), and call it a day. Nacked-by: Nhat Pham <nphamcs@gmail.com>
On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote: > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > > > The current zswap requires a backing swapfile. The swap slot used > > by zswap is not able to be used by the swapfile. That waste swapfile > > space. > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > for zswap. The swapfile header indicate the size of the swapfile. There > > is no swap data section in the ghost swapfile, therefore, no waste of > > swapfile space. As such, any write to a ghost swapfile will fail. To > > prevents accidental read or write of ghost swapfile, bdev of > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > flag because there is no rotation disk access when using zswap. > > Would this also affect the swap slot allocation algorithm? > > > > > The zswap write back has been disabled if all swapfiles in the system > > are ghost swap files. > > I don't like this design: > > 1. Statically sizing the compression tier will be an operational > nightmare, for users that have to support a variety (and increasingly > bigger sized) types of hosts. It's one of the primary motivations of > the virtual swap line of work. We need to move towards a more dynamic > architecture for zswap, not the other way around, in order to reduce > both (human's) operational overhead, AND actual space overhead (i.e > only allocate (z)swap metadata on-demand). Let's do it one step at a time. > 2. This digs us in the hole of supporting a special infrastructure for > non-writeback cases. Now every future change to zswap's architecture > has to take this into account. It's not easy to turn this design into > something that can support writeback - you're stuck with either having > to do an expensive page table walk to update the PTEs, or shoving the > virtual swap layer inside zswap. Ugly. What are you talking about? This patch does not have any page table work. You are opposing something in your imagination. Please show me the code in which I do expensive PTE walks. > 3. And what does this even buy us? Just create a fake in-memory-only > swapfile (heck, you can use zram), disable writeback (which you can do > both at a cgroup and host-level), and call it a day. Well this provides users a choice, if they don't care about write backs. They can do zswap with ghost swapfile now without actually wasting disk space. It also does not stop zswap using write back with normal SSD. If you want to write back, you can still use a non ghost swapfile as normal. It is a simple enough patch to provide value right now. It also fits into the swap.tiers long term roadmap to have a seperate tier for memory based swapfiles. I believe that is a cleaner picture than the current zswap as cache but also gets its hands so deep into the swap stack and slows down other swap tiers. > Nacked-by: Nhat Pham <nphamcs@gmail.com> I heard you, if you don't don't want zswap to have anything to do with memory based swap tier in the swap.tiers design. I respect your choice. Chris
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Would this also affect the swap slot allocation algorithm? > > > > > > > > The zswap write back has been disabled if all swapfiles in the system > > > are ghost swap files. > > > > I don't like this design: > > > > 1. Statically sizing the compression tier will be an operational > > nightmare, for users that have to support a variety (and increasingly > > bigger sized) types of hosts. It's one of the primary motivations of > > the virtual swap line of work. We need to move towards a more dynamic > > architecture for zswap, not the other way around, in order to reduce > > both (human's) operational overhead, AND actual space overhead (i.e > > only allocate (z)swap metadata on-demand). > > Let's do it one step at a time. I'm happy with landing these patches one step at a time. But from my POV (and admittedly limited imagination), it's a bit of a deadend. The only architecture, IMO, that satisfies: 1. Dynamic overhead of (z)swap metadata. 2. Decouple swap backends, i.e no pre-reservation of lower tiers space (what zswap is doing right now). 3. Backend transfer without page table walks. is swap virtualization. If you want to present an alternative vision, you don't have to implement it right away, but you have to at least explain to me how to achieve all these 3. > > > 2. This digs us in the hole of supporting a special infrastructure for > > non-writeback cases. Now every future change to zswap's architecture > > has to take this into account. It's not easy to turn this design into > > something that can support writeback - you're stuck with either having > > to do an expensive page table walk to update the PTEs, or shoving the > > virtual swap layer inside zswap. Ugly. > > What are you talking about? This patch does not have any page table > work. You are opposing something in your imagination. Please show me > the code in which I do expensive PTE walks. Please read my response again. I did not say you did any PTE walk in this patch. What I meant was, if you want to make this the general architecture for zswap and not some niche infrastructure for specialized use case, you need to be able to support backend transfer, i.e zswap writeback (zswap -> disk swap, and perhaps in the future the other direction). This will be very expensive with this design. > > > 3. And what does this even buy us? Just create a fake in-memory-only > > swapfile (heck, you can use zram), disable writeback (which you can do > > both at a cgroup and host-level), and call it a day. > > Well this provides users a choice, if they don't care about write > backs. 
They can do zswap with ghost swapfile now without actually > wasting disk space. > > It also does not stop zswap using write back with normal SSD. If you > want to write back, you can still use a non ghost swapfile as normal. > > It is a simple enough patch to provide value right now. It also fits > into the swap.tiers long term roadmap to have a seperate tier for > memory based swapfiles. I believe that is a cleaner picture than the > current zswap as cache but also gets its hands so deep into the swap > stack and slows down other swap tiers. > > > Nacked-by: Nhat Pham <nphamcs@gmail.com> > > I heard you, if you don't don't want zswap to have anything to do > with memory based swap tier in the swap.tiers design. I respect your > choice. Where does this even come from? I can't speak for Johannes or Yosry, but personally I'm ambivalent with respect to swap.tiers. My only objection in the past was there was not any use case at a time, but there seems to be one now. I won't stand in the way of swap.tiers landing, or zswap's integration into it. From my POV, swap.tiers solve a problem completely orthogonal to what I'm trying to solve, namely, the three points listed above. It's about definition of swap hierarchy, either at initial placement time, or during offloading from one backend to another, where as I'm trying to figure out the mechanistic side of it (how to transfer a page from one backend to another without page table walking). These two are independent, if not synergistic. > > Chris
On Mon, Nov 24, 2025 at 5:47 PM Nhat Pham <nphamcs@gmail.com> wrote: > > On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote: > > > > On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > > > The current zswap requires a backing swapfile. The swap slot used > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > space. > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > flag because there is no rotation disk access when using zswap. > > > > > > Would this also affect the swap slot allocation algorithm? > > > > > > > > > > > The zswap write back has been disabled if all swapfiles in the system > > > > are ghost swap files. > > > > > > I don't like this design: > > > > > > 1. Statically sizing the compression tier will be an operational > > > nightmare, for users that have to support a variety (and increasingly > > > bigger sized) types of hosts. It's one of the primary motivations of > > > the virtual swap line of work. We need to move towards a more dynamic > > > architecture for zswap, not the other way around, in order to reduce > > > both (human's) operational overhead, AND actual space overhead (i.e > > > only allocate (z)swap metadata on-demand). > > > > Let's do it one step at a time. > > I'm happy with landing these patches one step at a time. But from my > POV (and admittedly limited imagination), it's a bit of a deadend. > > The only architecture, IMO, that satisfies: > > 1. Dynamic overhead of (z)swap metadata. > > 2. Decouple swap backends, i.e no pre-reservation of lower tiers space > (what zswap is doing right now). > > 3. Backend transfer without page table walks. > > is swap virtualization. > > If you want to present an alternative vision, you don't have to > implement it right away, but you have to at least explain to me how to > achieve all these 3. From 1,2,3 to SV as the only solution is a big jump. How many possibilities have you explored to conclude that no other solution can satisfy your 123? I just replied to Rik's email about the high level sketch design. My design should satisfy it and can serve as one counter example of alternative design. > > > > > > 2. This digs us in the hole of supporting a special infrastructure for > > > non-writeback cases. Now every future change to zswap's architecture > > > has to take this into account. It's not easy to turn this design into > > > something that can support writeback - you're stuck with either having > > > to do an expensive page table walk to update the PTEs, or shoving the > > > virtual swap layer inside zswap. Ugly. > > > > What are you talking about? This patch does not have any page table > > work. You are opposing something in your imagination. Please show me > > the code in which I do expensive PTE walks. > > Please read my response again. I did not say you did any PTE walk in this patch. 
>
> What I meant was, if you want to make this the general architecture for zswap and not some niche infrastructure for specialized use case, you need to be able to support backend transfer, i.e zswap writeback (zswap -> disk swap, and perhaps in the future the other direction). This will be very expensive with this design.

I can't say I agree with you. It seems you have made a lot of assumptions in your reasoning.

> > > 3. And what does this even buy us? Just create a fake in-memory-only swapfile (heck, you can use zram), disable writeback (which you can do both at a cgroup and host-level), and call it a day.
> >
> > Well this provides users a choice, if they don't care about write backs. They can do zswap with ghost swapfile now without actually wasting disk space.
> >
> > It also does not stop zswap using write back with normal SSD. If you want to write back, you can still use a non ghost swapfile as normal.
> >
> > It is a simple enough patch to provide value right now. It also fits into the swap.tiers long term roadmap to have a seperate tier for memory based swapfiles. I believe that is a cleaner picture than the current zswap as cache but also gets its hands so deep into the swap stack and slows down other swap tiers.
> >
> > > Nacked-by: Nhat Pham <nphamcs@gmail.com>
> >
> > I heard you, if you don't don't want zswap to have anything to do with memory based swap tier in the swap.tiers design. I respect your choice.
>
> Where does this even come from?
>
> I can't speak for Johannes or Yosry, but personally I'm ambivalent with respect to swap.tiers. My only objection in the past was there was not any use case at a time, but there seems to be one now. I won't stand in the way of swap.tiers landing, or zswap's integration into it.
>
> From my POV, swap.tiers solve a problem completely orthogonal to what I'm trying to solve, namely, the three points listed above. It's about definition of swap hierarchy, either at initial placement time, or during offloading from one backend to another, where as I'm trying to figure out the mechanistic side of it (how to transfer a page from one backend to another without page table walking). These two are independent, if not synergistic.

I think our goals overlap; it is just a different approach with different performance characteristics. I have asked in this thread a few times: how big is the per-swap-slot memory overhead VS introduces? That is something I care about a lot.

Chris