The current zswap requires a backing swapfile. The swap slot used
by zswap cannot be used by the swapfile, which wastes swapfile
space.

A ghost swapfile is a swapfile that contains only the swapfile header,
for use by zswap. The swapfile header indicates the size of the
swapfile. There is no swap data section in a ghost swapfile, so no
swapfile space is wasted. As a consequence, any write to a ghost
swapfile will fail. To prevent accidental reads or writes of a ghost
swapfile, the bdev of its swap_info_struct is set to NULL. A ghost
swapfile also sets the SSD flag because there is no rotating-disk
access when using zswap.

Zswap writeback is disabled if all swapfiles in the system are ghost
swapfiles.
Signed-off-by: Chris Li <chrisl@kernel.org>
---
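Notes (not part of the commit message): a ghost swapfile is just a
one-page file holding a standard swap header whose last_page field
advertises the desired number of slots. mkswap(8) normally sizes the
header from the file itself, so for testing something like the sketch
below could be used to write the header by hand. This is purely
illustrative and not part of this series; the tool name is made up,
4K pages are assumed, and the offsets follow the union swap_header
layout in include/linux/swap.h (version at byte 1024, last_page at
1028, "SWAPSPACE2" in the last 10 bytes of the page).

/* mkghostswap.c - illustrative sketch only, not part of this series */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SIZE 4096	/* assumes 4K pages */

int main(int argc, char **argv)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	uint32_t version = 1;
	uint32_t last_page;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file> <npages>\n", argv[0]);
		return 1;
	}
	last_page = strtoul(argv[2], NULL, 0) - 1;

	/* union swap_header: bootbits[1024], then version, then last_page */
	memcpy(page + 1024, &version, sizeof(version));
	memcpy(page + 1028, &last_page, sizeof(last_page));
	/* magic lives in the last 10 bytes of the page */
	memcpy(page + PAGE_SIZE - 10, "SWAPSPACE2", 10);

	fd = open(argv[1], O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0 || write(fd, page, PAGE_SIZE) != PAGE_SIZE) {
		perror(argv[1]);
		return 1;
	}
	return close(fd);
}

Something like "mkghostswap ghost.img 262144 && swapon ghost.img" would
then advertise roughly 1G of zswap-only swap space, and /proc/swaps
reports the type as "ghost" via the swap_type_str() change below.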
include/linux/swap.h | 2 ++
mm/page_io.c | 18 +++++++++++++++---
mm/swap.h | 2 +-
mm/swap_state.c | 7 +++++++
mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
mm/zswap.c | 17 +++++++++++------
6 files changed, 73 insertions(+), 15 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -216,6 +216,7 @@ enum {
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
+ SWP_GHOST = (1 << 13), /* not backed by anything */
/* add others here before... */
};
@@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
+extern atomic_t nr_real_swapfiles;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
diff --git a/mm/page_io.c b/mm/page_io.c
index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
return AOP_WRITEPAGE_ACTIVATE;
}
- __swap_writepage(folio, swap_plug);
- return 0;
+ return __swap_writepage(folio, swap_plug);
out_unlock:
folio_unlock(folio);
return ret;
@@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+ if (sis->flags & SWP_GHOST) {
+ /* Prevent the page from getting reclaimed. */
+ folio_set_dirty(folio);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
+
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
* but that will never affect SWP_FS_OPS, so the data_race
@@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
swap_writepage_bdev_sync(folio, sis);
else
swap_writepage_bdev_async(folio, sis);
+ return 0;
}
void swap_write_unplug(struct swap_iocb *sio)
@@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (zswap_load(folio) != -ENOENT)
goto finish;
+ if (unlikely(sis->flags & SWP_GHOST)) {
+ folio_unlock(folio);
+ goto finish;
+ }
+
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
diff --git a/mm/swap.h b/mm/swap.h
index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
extern struct address_space swap_space __ro_after_init;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
bool page_allocated;
+ /*
+ * The entry may have been freed by another task. Avoid swap_info_get()
+ * which will print error message if the race happens.
+ */
+ if (si->flags & SWP_GHOST)
+ goto skip;
+
mask = swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
+atomic_t nr_real_swapfiles;
/*
* Some modules use swappable objects and may try to swap them out under
* memory pressure (via the shrinker). Before doing so, they may wish to
@@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
goto skip;
}
+ if (!(si->flags & SWP_GHOST))
+ atomic_sub(1, &nr_real_swapfiles);
plist_del(&si->avail_list, &swap_avail_head);
skip:
@@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
}
plist_add(&si->avail_list, &swap_avail_head);
+ if (!(si->flags & SWP_GHOST))
+ atomic_add(1, &nr_real_swapfiles);
skip:
spin_unlock(&swap_avail_lock);
@@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
struct inode *inode = mapping->host;
int ret;
+ if (sis->flags & SWP_GHOST) {
+ *span = 0;
+ return 0;
+ }
+
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
@@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !bdev_nonrot(p->bdev))
+ if (!(p->flags & SWP_GHOST) &&
+ (!p->bdev || !bdev_nonrot(p->bdev)))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
mutex_unlock(&swapon_mutex);
}
+static const char *swap_type_str(struct swap_info_struct *si)
+{
+ struct file *file = si->swap_file;
+
+ if (si->flags & SWP_GHOST)
+ return "ghost\t";
+
+ if (S_ISBLK(file_inode(file)->i_mode))
+ return "partition";
+
+ return "file\t";
+}
+
static int swap_show(struct seq_file *swap, void *v)
{
struct swap_info_struct *si = v;
@@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
len = seq_file_path(swap, file, " \t\n\\");
seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file_inode(file)->i_mode) ?
- "partition" : "file\t",
+ swap_type_str(si),
bytes, bytes < 10000000 ? "\t" : "",
inuse, inuse < 10000000 ? "\t" : "",
si->prio);
@@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
return 0;
}
-
/*
* Find out how many pages are allowed for a single swap device. There
* are two limiting factors:
@@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
unsigned long maxpages;
unsigned long swapfilepages;
unsigned long last_page;
+ loff_t size;
if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
pr_err("Unable to find swap-space signature\n");
@@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
if (!maxpages)
return 0;
- swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+
+ size = i_size_read(inode);
+ if (size == PAGE_SIZE) {
+ /* Ghost swapfile */
+ si->bdev = NULL;
+ si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
+ return maxpages;
+ }
+
+ swapfilepages = size >> PAGE_SHIFT;
if (swapfilepages && maxpages > swapfilepages) {
pr_warn("Swap area shorter than signature indicates\n");
return 0;
diff --git a/mm/zswap.c b/mm/zswap.c
index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
- struct swap_info_struct *si;
+ struct swap_info_struct *si = get_swap_device(swpentry);
int ret = 0;
- /* try to allocate swap cache folio */
- si = get_swap_device(swpentry);
if (!si)
- return -EEXIST;
+ return -ENOENT;
+
+ if (si->flags & SWP_GHOST) {
+ put_swap_device(si);
+ return -EINVAL;
+ }
+ /* try to allocate swap cache folio */
mpol = get_task_policy(current);
folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
@@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(folio, NULL);
+ ret = __swap_writepage(folio, NULL);
+ WARN_ON_ONCE(ret);
out:
if (ret && ret != -EEXIST) {
@@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
zswap_pool_put(pool);
put_objcg:
obj_cgroup_put(objcg);
- if (!ret && zswap_pool_reached_full)
+ if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
---
base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
change-id: 20251121-ghost-56e3948a7a17
Best regards,
--
Chris Li <chrisl@kernel.org>
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used by zswap cannot be used by the swapfile, which wastes swapfile space.
>
> A ghost swapfile is a swapfile that contains only the swapfile header, for use by zswap. The swapfile header indicates the size of the swapfile. There is no swap data section in a ghost swapfile, so no swapfile space is wasted. As a consequence, any write to a ghost swapfile will fail. To prevent accidental reads or writes of a ghost swapfile, the bdev of its swap_info_struct is set to NULL. A ghost swapfile also sets the SSD flag because there is no rotating-disk access when using zswap.
>
> Zswap writeback is disabled if all swapfiles in the system are ghost swapfiles.
>
> Signed-off-by: Chris Li <chrisl@kernel.org>

I did not know which subthread to reply to at this point, so I am just replying to the main thread. I have been trying to stay out of this for various reasons, but I was mentioned a few times and I also think this is getting out of hand tbh.

First of all, I want to clarify that I am not "representing" any entity here, I am speaking as an upstream zswap maintainer. Obviously I have Google's interests in mind, but I am not representing Google here.

Second, Chris keeps bringing up that the community picked and/or strongly favored the swap table approach over virtual swap back in 2023. I just want to make it absolutely clear that this was NOT my read of the room, and I do not think that the community really made a decision or favored any approach back then.

Third, Chris, please stop trying to force this into a company vs company situation. You keep mentioning personal attacks, but you are making this personal more than anyone in this thread by taking this approach.

Now with all of that out of the way, I want to try to salvage the technical discussion here. Taking several steps back, and oversimplifying a bit: Chris mentioned having a frontend and backend and an optional redirection when a page is moved between swap backends. This is conceptually the same as the virtual swap proposal.

I think the key difference here is:

- In Chris's proposal, we start with a swap entry that represents a swap slot in swapfile A. If we do writeback (or swap tiering), we create another swap entry in swapfile B, and have the first swap entry point to it instead of the slot in swapfile A. If we want to reuse the swap slot in swapfile A, we create a new swap entry that points to it.

  So we start with a swap entry that directly maps to a swap slot, and optionally put a redirection there to point to another swap slot for writeback/tiering. Everything is a swapfile, even zswap will need to be represented by a separate (ghost) swapfile.

- In the virtual swap proposal, swap entries are in a completely different space than swap slots. A swap entry points to an arbitrary swap slot (or zswap entry) from the beginning, and writeback (or tiering) does not change that, it only changes what is being pointed to.

Regarding memory overhead (assuming x86_64), Chris's proposal has 8 bytes per entry in the swap table that is used to hold both the swap count as well as the swapcache or shadow entry. Nhat's RFC for virtual swap had 48 bytes of overhead, but that's a PoC of a specific implementation.

Disregarding any specific implementation, any space optimizations that can be applied to the swap table (e.g. combining swap count and swapcache in an 8-byte field) can also be applied to virtual swap. The only *real* difference is that with virtual swap we need to store the swap slot (or zswap entry), while for the current swap table proposal it is implied by the index of the entry. That's an additional 8 bytes. So I think a fully optimized implementation of virtual swap could end up with an overhead of 16 bytes per entry. Everything else (locks, rcu_head, etc) can probably be optimized away by using similar optimizations as the swap table (e.g. do locking and alloc/freeing in batches). In fact, I think we can use the swap table as the allocator in the virtual swap space, reusing all the locking and allocation optimizations. The difference would be that the swap table is indexed by the virtual swap ID rather than the swap slot index.

Another important aspect here: in the simple case the swap table does have lower overhead than virtual swap (8 bytes vs 16 bytes). Although the difference isn't large to begin with, I don't think it's always the case. I think this is only true for the simple case of having a swapped out page on a disk swapfile or in a zswap (ghost) swapfile. Once a page is written back from zswap to a disk swapfile, in the swap table approach we'll have two swap table entries: one in the ghost swapfile (with a redirection), and one in the disk swapfile. That's 16 bytes, equal to the overhead of virtual swap.

Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with 3 swap table entries for a single swapped out page. That's 24 bytes. So the memory overhead is not really constant, it scales with the number of tiers (as opposed to virtual swap).

Another scenario is where we have SSD and HDD swapfiles with tiering. If a page starts on SSD and goes to HDD, we'll have two swap table entries for it (as above). The SSD entry would be wasted (has a redirection), but Chris mentioned that we can fix this by allocating another frontend cluster that points at the same SSD slot. How does this fit in the 8-byte swap table entry though? The 8 bytes can only hold the swapcache or shadow (and swapcount), but not the swap slot. For the current implementation, the slot is implied by the swap table index, but if we have separate front-end swap tables, then we'll also need to store the actual slot.

We can work around this by having different types of clusters and swap tables, where "virtual" clusters have 16 bytes instead of 8 bytes per entry for that, sure.. but at that point we're at significantly more complexity to end up where virtual swap would have put us.

Chris, Johannes, Nhat -- please correct me if I am wrong here or if I missed something. I think the current swap table work by Kairui is great, and we can reuse it for virtual swap (as I mentioned above). But I don't think forcing everything to use a swapfile and extending swap tables to support indirections and a frontend/backend split is the way to go (for the reasons described above).
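To make the 16-byte estimate above concrete, here is a rough sketch of
the fully optimized virtual-swap descriptor being described (purely
illustrative; the struct and field names are invented here and are not
from any posted series):

#include <stdint.h>

/*
 * One descriptor per virtual swap entry, indexed by the virtual swap
 * ID stored in the PTE. The first word plays the same role as today's
 * 8-byte swap table entry; the second word records where the data
 * currently lives, and is the only thing writeback/tiering updates.
 */
struct vswap_desc {
	uint64_t table_word;	/* swap cache folio, shadow, or swap count */
	uint64_t backing;	/* swap slot in some swapfile, or zswap entry */
};
/* sizeof(struct vswap_desc) == 16, matching the estimate above. */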
On Wed, Dec 3, 2025 at 12:37 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used by zswap cannot be used by the swapfile, which wastes swapfile space.
> >
> > A ghost swapfile is a swapfile that contains only the swapfile header, for use by zswap. The swapfile header indicates the size of the swapfile. There is no swap data section in a ghost swapfile, so no swapfile space is wasted. As a consequence, any write to a ghost swapfile will fail. To prevent accidental reads or writes of a ghost swapfile, the bdev of its swap_info_struct is set to NULL. A ghost swapfile also sets the SSD flag because there is no rotating-disk access when using zswap.
> >
> > Zswap writeback is disabled if all swapfiles in the system are ghost swapfiles.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> I did not know which subthread to reply to at this point, so I am just replying to the main thread. I have been trying to stay out of this for various reasons, but I was mentioned a few times and I also think this is getting out of hand tbh.

Thanks for saving the discussion.

> First of all, I want to clarify that I am not "representing" any entity here, I am speaking as an upstream zswap maintainer. Obviously I have Google's interests in mind, but I am not representing Google here.

Ack, same here.

> Second, Chris keeps bringing up that the community picked and/or strongly favored the swap table approach over virtual swap back in 2023. I just want to make it absolutely clear that this was NOT my read of the room, and I do not think that the community really made a decision or favored any approach back then.

OK. Let's move on from that to our current discussion.

> Third, Chris, please stop trying to force this into a company vs company situation. You keep mentioning personal attacks, but you are making this personal more than anyone in this thread by taking this approach.

Let me clarify: it is absolutely not my intention to make it company vs company, that does not fit the description either. Please accept my apology for that. My original intention is that it is a group of people sharing the same idea. More like I am against a whole group (team VS). It is not about which company at all. Round-robin N -> 1 intense arguing put me in an uncomfortable situation, feeling excluded.

On one hand I wish there was someone representing the group as the main speaker, that would make the discussion feel more equal, more inclusive. On the other hand, every perspective is important, and it is hard to require each voice to route through a main speaker. It is hard to execute in practice, so I give up suggesting that. I am open to suggestions on how to make the discussion more inclusive for newcomers to the existing established group.

> Now with all of that out of the way, I want to try to salvage the technical discussion here. Taking several steps back, and

Thank you for driving the discussion back to the technical side. I really appreciate it.

> oversimplifying a bit: Chris mentioned having a frontend and backend and an optional redirection when a page is moved between swap backends. This is conceptually the same as the virtual swap proposal.

In my perspective, it is not the same as the virtual swap proposal. There is some overlap, they both can do redirection. But they originally aim to solve two different problems.

One of the important goals of the swap table is to allow allocating a contiguous mTHP swap entry when the remaining free space is not contiguous. For the rest of the discussion let's call it the "continuous mTHP allocator". It allocates contiguous swap entries out of non-contiguous file locations.

Let's say you have a 1G swapfile, all full, no available slots.

1) Free 4 pages at swap offsets 1, 3, 5, 7. The discontiguous free space adds up to 16K.

2) Now allocate one mTHP of order 2, 16K in size. The previous allocator cannot satisfy this request, because the 4 empty slots are not contiguous. Here is where the redirection and the growth of the front-end swap entries come in; it is all part of the consideration all along, not an afterthought. The following step allows allocating 16K of contiguous swap entries out of offsets [1, 3, 5, 7].

3) We grow the front-end part of the swapfile, effectively bumping up the max size, and add a new cluster of order 2 with a swap table. That is where the split between the front end of the swap and the back-end file store comes in.

BTW, please don't accuse me of copycatting the name "virtual swapfile". I introduced it here on 1/8/2025, before Nhat did:
https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/

==============quote==============
I think we need to have a separation of the swap cache and the backing of IO of the swap file. I call it the "virtual swapfile".
It is virtual in two aspects:
1) There is an up front size at swap on, but no up front allocation of the vmalloc array. The array grows as needed.
2) There is a virtual to physical swap entry mapping. The cost is 4 bytes per swap entry. But it will solve a lot of problems all together.
==============quote ends =========

Side story: I wanted to pass the "virtual swapfile" to Kairui to propose as an LSF topic. Coincidentally, Nhat proposed virtual swap as an LSF topic on 1/16/2025, a few days after I mentioned "virtual swapfile" in the LSF-topic-related discussion, right before Kairui proposed "virtual swapfile". Kairui renamed our version to "swap table". That is the history behind the name "swap table".
https://lore.kernel.org/linux-mm/20250116092254.204549-1-nphamcs@gmail.com/

I am sure Nhat did not see that email and came up with it independently, coincidentally. I just want to establish that I have prior art introducing the name "virtual swapfile" before Nhat's LSF "virtual swap" topic. After all, it is just a name. I am just as happy using "swap table".

To avoid confusing the reader I will call my version of "virtual swap" the "front end".

The front end owns the cluster and swap table (swap cache): 8 bytes. The back end only contains a file position pointer: 4 bytes.

4) The back end will need a different allocator because the allocation assumptions are different; it has no alignment requirement. It just needs to track which block locations are available. It will need a back-end-specific allocator. It only manages the locations of the swapfile that cannot be allocated from the front end, e.g. a redirection entry creates a hole, or the new cluster added in step 3.

5) The backend location pointer is optional per cluster. The cluster newly allocated in step 3 must have a location pointer, because its offset is outside the backing file range. That is 4 bytes, just like a swap entry. This backend location pointer can be used by a solution like VS as well. That is part of the consideration too, so not an afterthought.

The allocator mentioned here is more like a file system design than pure memory allocation, because it needs to consider block locations for combining block-level IO. So the mTHP allocator can do swapfile location redirection, but that is a side benefit of a different design goal (mTHP allocation). This physical location pointer description matches my 2024 LSF pony talk slide; I just did not put text on the slide. So it is not an afterthought, it dates back to the 2024 talks.

> I think the key difference here is:
> - In Chris's proposal, we start with a swap entry that represents a swap slot in swapfile A. If we do writeback (or swap tiering), we create another swap entry in swapfile B, and have the first swap entry point

Correction: instead of a swap entry in swapfile B, a backend location in swapfile B, as in step 5). It is only 4 bytes. The back end does not have a swap cache. The swap cache belongs to front end A (8 bytes).

> to it instead of the slot in swapfile A. If we want to reuse the swap slot in swapfile A, we create a new swap entry that points to it.
>
> So we start with a swap entry that directly maps to a swap slot, and

Again, in my description swap slot A has a file backend location pointer pointing into swapfile B. It is only the bottom half of swap slot B, not the full swap slot. It does not have the 8-byte swap entry overhead of B.

> optionally put a redirection there to point to another swap slot for writeback/tiering.

It points to another swapfile backend location, not a swap entry (4 bytes).

> Everything is a swapfile, even zswap will need to be represented by a separate (ghost) swapfile.

Allow a ghost swapfile. I wouldn't go as far as saying ban the current zswap writeback, that part is TBD. My description enables memory swap tiers without actual physical file backing. Enable the ghost swapfile.

> - In the virtual swap proposal, swap entries are in a completely different space than swap slots. A swap entry points to an arbitrary swap slot (or zswap entry) from the beginning, and writeback (or tiering) does not change that, it only changes what is being pointed to.
>
> Regarding memory overhead (assuming x86_64), Chris's proposal has 8 bytes per entry in the swap table that is used to hold both the swap count as well as the swapcache or shadow entry. Nhat's RFC for virtual

Ack.

> swap had 48 bytes of overhead, but that's a PoC of a specific implementation.

Ack.

> Disregarding any specific implementation, any space optimizations that can be applied to the swap table (e.g. combining swap count and swapcache in an 8-byte field) can also be applied to virtual swap. The only *real* difference is that with virtual swap we need to store the swap slot (or zswap entry), while for the current swap table proposal it is implied by the index of the entry. That's an additional 8 bytes.

No, VS has a smaller design scope. VS does not enable "continuous mTHP allocation". At least that is not mentioned in any previous VS material.

> So I think a fully optimized implementation of virtual swap could end up with an overhead of 16 bytes per entry. Everything else (locks, rcu_head, etc) can probably be optimized away by using similar optimizations as the swap table (e.g. do locking and alloc/freeing in

With the continuous mTHP allocator mentioned above, it already has all the things VS needs. I am not sure we still need VS if we have the "continuous mTHP allocator", that is TBD.

Yes, VS can reuse the physical location pointer from the "continuous mTHP allocator".

The overhead for the above swap table with redirection is 12 bytes, not 16 bytes.

> batches). In fact, I think we can use the swap table as the allocator in the virtual swap space, reusing all the locking and allocation

That has been my feeling all along. Let the swap table manage that.

> optimizations. The difference would be that the swap table is indexed by the virtual swap ID rather than the swap slot index.

In the "continuous mTHP allocator" it is just a physical location pointer.

> Another important aspect here, in the simple case the swap table does have lower overhead than virtual swap (8 bytes vs 16 bytes). Although the difference isn't large to begin with, I don't think it's always the case. I think this is only true for the simple case of having a swapped out page on a disk swapfile or in a zswap (ghost) swapfile.

Please redo your evaluation after reading the above "continuous mTHP allocator".

> Once a page is written back from zswap to a disk swapfile, in the swap table approach we'll have two swap table entries. One in the ghost

Only one entry, with a backend location pointer (12 bytes).

> swapfile (with a redirection), and one in the disk swapfile. That's 16 bytes, equal to the overhead of virtual swap.

Again, 12 bytes using the "continuous mTHP allocator" framework.

> Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with 3 swap table entries for a single swapped out page. That's 24 bytes. So the memory overhead is not really constant, it scales with the number of tiers (as opposed to virtual swap).

Nope. There is only one front-end swap entry, which stays the same; every time the page is written to a different tier, only the back-end physical location pointer is updated. It always points to the final physical location. Only 12 bytes total.

You are paying 24 bytes because you don't have the front end vs back end split. Your redirection includes the front-end 8 bytes as well. Because you include the front end, you now need to do the relay forwarding. That is the benefit of having the front end and back end split of the swapfile: it makes it more like a file system design.

> Another scenario is where we have SSD and HDD swapfiles with tiering. If a page starts on SSD and goes to HDD, we'll have two swap table entries for it (as above). The SSD entry would be wasted (has a redirection), but Chris mentioned that we can fix this by allocating another frontend cluster that points at the same SSD slot. How does this fit in the

Not a fix. It has been in the design consideration all along. When the redirection happens, that underlying physical block location pointer is added to the backend allocator. The backend does not overlap with swap entry locations that can be allocated from the front end.

> 8-byte swap table entry though? The 8 bytes can only hold the swapcache or shadow (and swapcount), but not the swap slot. For the current implementation, the slot is implied by the swap table index, but if we have separate front-end swap tables, then we'll also need to store the actual slot.

Please read the above description regarding the front end and back end split, then ask your question again. The "continuous mTHP allocator" above should answer your question.

> We can work around this by having different types of clusters and swap tables, where "virtual" clusters have 16 bytes instead of 8 bytes per entry for that, sure.. but at that point we're at significantly more complexity to end up where virtual swap would have put us.

No, that further complicates things. Please don't go there. The front end and back end location split is designed to simplify situations like this. It is conceptually much cleaner as well.

> Chris, Johannes, Nhat -- please correct me if I am wrong here or if I missed something. I think the current swap table work by Kairui is

Yes, see the above explanation of the "continuous mTHP allocator".

> great, and we can reuse it for virtual swap (as I mentioned above). But I don't think forcing everything to use a swapfile and extending swap tables to support indirections and a frontend/backend split is the way to go (for the reasons described above).

IMHO, it is the way to go if you consider mTHP allocation. You made different assumptions than mine about my design; I corrected your description as much as I could above. I am interested in your opinion after reading the above description of the "continuous mTHP allocator", which matches the 2024 LSF talk slide regarding the swap cache redirecting physical locations.

Chris
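For comparison with the virtual-swap sketch earlier in the thread, here
is a rough sketch of the front-end/back-end split described above
(again purely illustrative; the names, field widths, and cluster size
are assumptions, not code from any posted series):

#include <stdint.h>

#define CLUSTER_NR_SLOTS 512	/* assumed cluster size, for illustration */

/*
 * Front end: one 8-byte swap table entry per slot, holding exactly what
 * the existing swap table stores (swap cache folio, shadow, or count).
 * The PTE keeps pointing at this entry for the lifetime of the swapped
 * out page.
 */
typedef uint64_t swp_te_t;

/*
 * Back end: an optional per-cluster array of 4-byte file locations.
 * Slot i of the cluster resolves to block loc[i] of the backing
 * swapfile; writeback/tiering only rewrites loc[i], so a page moving
 * zswap -> SSD -> HDD still costs 8 + 4 = 12 bytes, the figure quoted
 * above.
 */
struct cluster_backend {
	uint32_t loc[CLUSTER_NR_SLOTS];
};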
[..] > > Third, Chris, please stop trying to force this into a company vs company > > situation. You keep mentioning personal attacks, but you are making this > > personal more than anyone in this thread by taking this approach. > > Let me clarify, it is absolutely not my intention to make it company > vs company, that does not fit the description either. Please accept my > apology for that. My original intention is that it is a group of > people sharing the same idea. More like I am against a whole group > (team VS). It is not about which company at all. Round robin N -> 1 > intense arguing put me in an uncomfortable situation, feeling > excluded. > > On one hand I wish there was someone representing the group as the > main speaker, that would make the discussion feel more equal, more > inclusive. On the other hand, any perspective is important, it is hard > to require the voice to route through the main speaker. It is hard to > execute in practice. So I give up suggesting that. I am open for > suggestions on how to make the discussion more inclusive for newcomers > to the existing established group. Every person is expressing their own opinion, I don't think there's a way to change that or have a "representative" of each opinion. In fact, changing that would be the opposite of inclusive. > > > Now with all of that out of the way, I want to try to salvage the > > technical discussion here. Taking several steps back, and > > Thank you for driving the discussion back to the technical side. I > really appreciate it. > > > oversimplifying a bit: Chris mentioned having a frontend and backend and > > an optional redirection when a page is moved between swap backends. This > > is conceptually the same as the virtual swap proposal. > > In my perspective, it is not the same as a virtual swap proposal. > There is some overlap, they both can do redirection. > > But they originally aim to solve two different problems. One of the > important goals of the swap table is to allow continuing mTHP swap > entry when all the space left is not continues. For the rest of the > discusion we call it "continuous mTHP allocator". It allocate > continuous swap entry out of non continues file location. > > Let's say you have a 1G swapfile, all full not available slots. > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces > add up to 16K. > 2) Now allocate one mTHP order 2, 16K in size. > Previous allocator can not be satisfied with this requirement. Because > the 4 empty slots are not contiguous. > Here the redirection and growth of the front swap entry comes in, it > is all part of the consideration all alone, not an afterthought. > This following step will allow allocating 16K continuous swap entries > out of offset [1,3,5,7] > 3) We grow the front end part of the swapfile, effectively bump up the > max size and add a new cluster of order 2, with a swap table. > That is where the front end of the swap and back end file store comes in. There's no reason why we cannot do the same with virtual swap, even if it wasn't the main motivaiton, I don't see why we can't achieve the same result. > > BTW, Please don't accuse me copy cat the name "virtual swapfile". I > introduce it here 1/8/2025 before Nhat does: I don't think anyone cares about the actual names, or accused anyone of copycatting anything. 
> https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/ > ==============quote============== > I think we need to have a separation of the swap cache and the backing > of IO of the swap file. I call it the "virtual swapfile". > It is virtual in two aspect: > 1) There is an up front size at swap on, but no up front allocation of > the vmalloc array. The array grows as needed. > 2) There is a virtual to physical swap entry mapping. The cost is 4 > bytes per swap entry. But it will solve a lot of problems all > together. > ==============quote ends ========= > Side story: > I want to pass the "virtual swapfile" for Kairui to propose as LSF > topic. Coincidentally Nhat proposes the virtual swap as a LSF topic > at 1/16/2025, a few days after I mention "virtual swapfile" in the lsf > topic related discussion. It is right before Kairui purpose "virtual > swapfile". Kairui renamed our version as "swap table". That is the > history behind the name of "swap table". > https://lore.kernel.org/linux-mm/20250116092254.204549-1-nphamcs@gmail.com/ > > I am sure Nhat did not see that email and come up with it > independently, coincidentally. I just want to establish that I have > prior art introducing the name "virtual swapfile" before Nhat's LSF > "virtual swap" topic. After all, it is just a name. I am just as happy > using "swap table". > > To avoid confuse the reader I will call my version of "virtual swap" > the "front end". > > The front end owns the cluster and swap table (swap cache). 8 bytes. > The back end only contain file position pointer. 4 bytes. > > 4) The back end will need different allocate because the allocating > assumption is different, it does not have alignment requirement. It > just need to track which block location is available. > It will need to have a back end specific allocator. It only manage > the location of the swapfile cannot allocate from fronted. e.g. > redirection entry create a hole. or the new cluster added from step 3. > > 5) the backend location pointer is optional of the cluster. For the > cluster new allocated from step, It must have location pointer, > because its offset is out of the backing file range. > That is a 4 byte just like a swap entry. > This backend location pointer can be used by solution like VS as well. > That is part of the consideration as well, so not a after thought. > The allocator mention here is more like a file system design rather > than pure memory location, because it need to consider block location > for combining block level IO. > > So the mTHP allocator can do swapfile location redirection. But that > is a side benefit of a different design goal (mTHP allocation). This > physical location pointer description match my 2024 LSF pony talk > slide. I just did not put text in the slide there. So it is not some > thing after thought, it pre-dates back to 2024 talks. > > > I think the key difference here is: > > - In Chris's proposal, we start with a swap entry that represents a swap > > slot in swapfile A. If we do writeback (or swap tiering), we create > > another swap entry in swapfile B, and have the first swap entry point > > Correction. Instead of swapfile B, Backend location in swapfile B. in > step 5). It only 4 byte. The back end does not have swap cache. The > swap cache belong to front end A (8 bytes). Ack. > > > to it instead of the slot in swapfile A. If we want to reuse the swap > > slot in swapfile A, we create a new swap entry that points to it. 
> > > > So we start with a swap entry that directly maps to a swap slot, and > > Again, in my description swap slot A has a file backend location > pointer points to swapfile B. > It is only the bottom half the swap slot B, not the full swap slot. It > does not have 8 byte swap entry overhead of B. Ack. > > > optionally put a redirection there to point to another swap slot for > > writeback/tiering. > > Point to another swapfile location backend, not swap entry.(4 bytes) Ack. > > > Everything is a swapfile, even zswap will need to be represented by a > > separate (ghost) swapfile. > > Allow ghost swapfile. I wouldn't go as far saying ban the current > zswap writeback, that part is TBD. My description is enable memory > swap tiers without actual physical file backing. Enable ghost > swapfile. > > > > > - In the virtual swap proposal, swap entries are in a completely > > different space than swap slots. A swap entry points to an arbitrary > > swap slot (or zswap entry) from the beginning, and writeback (or > > tiering) does not change that, it only changes what is being pointed > > to. > > > > Regarding memory overhead (assuming x86_64), Chris's proposal has 8 > > bytes per entry in the swap table that is used to hold both the swap > > count as well as the swapcache or shadow entry. Nhat's RFC for virtual > Ack > > > swap had 48 bytes of overhead, but that's a PoC of a specific > > implementaiton. > > Ack. > > > Disregarding any specific implementation, any space optimizations that > > can be applied to the swap table (e.g. combining swap count and > > swapcache in an 8 byte field) can also be applied to virtual swap. The > > only *real* difference is that with virtual swap we need to store the > > swap slot (or zswap entry), while for the current swap table proposal it > > is implied by the index of the entry. That's an additional 8 bytes. > > No, the VS has a smaller design scope. VS does not enable "continous > mTHP allocation" . At least that is not mention in any previous VS > material. Why not? Even if it wasn't specifically called out as part of the motivation, it still achieves that. What we need for the mTHP swap is to have a redirection layer. Both virtual swap or the front-end/back-end design achieve that. > > > So I think a fully optimized implementation of virtual swap could end up > > with an overhead of 16 bytes per-entry. Everything else (locks, > > rcu_head, etc) can probably be optimized away by using similar > > optimizations as the swap table (e.g. do locking and alloc/freeing in > > With the continues mTHP allocator mention above, it already has the > all things VS needed. > I am not sure we still need VS if we have "continues mTHP allocator", > that is TBD. As I mentioned above, I think the front-end/back-end swap tables and virtual swap are conceptually very similar. The more we discuss this the more I am convinced about this tbh. In both cases we provide an indirection layer such that we can change the backend or backing swapfile without updating the page tables, and allow thing like mTHP swap without having contiguous slots in the swapfile. > > Yes, VS can reuse the physical location pointer by "continues mTHP allocator". > > The overhead is for above swap table of redirection is 12 bytes not 16 bytes. Honeslty if it boils down to 4 bytes per page, I think that's a really small difference. Especially that it doesn't apply to all cases (e.g. not the zswap-only case that Google currently uses). > > > batches). 
In fact, I think we can use the swap table as the allocator in > > the virtual swap space, reusing all the locking and allocation > > That is my feel all alone. Let swap table manage that. > > > optimizations. The difference would be that the swap table is indexed by > > the virtual swap ID rather than the swap slot index. > > In the "continous mTHP allocator" it is just physical location pointer, > > > Another important aspect here, in the simple case the swap table does > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although > > the difference isn't large to begin with, I don't think it's always the > > case. I think this is only true for the simple case of having a swapped > > out page on a disk swapfile or in a zswap (ghost) swapfile. > > Please redo your evaluation after reading the above "continuous mTHP alloctor". I did, and if anything I am more convinced that the designs are conceptually close. The main difference is that the virtual swap approach is more flexible in my opinion because the backend doesn't have to be a swapfile, and we don't need "ghost" to use zswap and manage it like a swapfile. > > > Once a page is written back from zswap to disk swapfile, in the swap > > table approach we'll have two swap table entries. One in the ghost > > One one entry with back end location pointer. (12 byte) > > > swapfile (with a redirection), and one in the disk swapfile. That's 16 > > bytes, equal to the overhead of virtual swap. > > Again 12 bytes using "continues mTHP allocator" frame work. Ack. > > > Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with > > tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with > > 3 swap table entries for a single swapped out page. That's 24 bytes. So > > the memory overhead is not really constant, it scales with the number of > > tiers (as opposed to virtual swap). > > Nope, Only one front swap entry remain the same, every time it write > to a different tier, it only update the back end physical location > pointer. > It always points to the finial physical location. Only 12 bytes total. Ack. > > You are paying 24 bytes because you don't have the front end vs back end split. > Your redirection includes the front end 8 byte as well. Because you > include the front end, now you need to do the relay forward. > That is the benefit to have front end and back end split of the swap > file. Make it more like a file system design. > > > Another scenario is where we have SSD and HDD swapfiles with tiering. If > > a page starts in SSD and goes to HDD, we'll have to swap table entries > > for it (as above). The SSD entry would be wasted (has a redirection), > > but Chris mentioned that we can fix this by allocating another frontend > > cluster that points at the same SSD slot. How does this fit in the > > No a fix. It is in the design consideration all alone. When the > redirection happen, that underlying physical block location pointer > will add to the backend allocator. The backend don't overlap with swap > entry location can be allocated from front end. > > > 8-byte swap table entry tho? The 8-bytes can only hold the swapcache or > > shadow (and swapcount), but not the swap slot. For the current > > implementation, the slot is implied by the swap table index, but if we > > have separate front end swap tables, then we'll also need to store the > > actual slot. > > Please read the above description regarding the front end and back end > split then ask your question again. 
The "continuous mTHP allocator" > above should answer your question. Yeah, the 8 bytes front-end and 4-bytes backend answer this. > > > We can workaround this by having different types of clusters and swap > > tables, where "virtual" clusters have 16 bytes instead of 8 bytes per > > entry for that, sure.. but at that point we're at significantly more > > complexity to end up where virtual swap would have put us. > > No, that further complicating things. Please don't go there. The front > end and back end location split is design to simplify situation like > this. It is conceptual much cleaner as well. Yeah that was mostly hypothetical. > > > > > Chris, Johannes, Nhat -- please correct me if I am wrong here or if I > > missed something. I think the current swap table work by Kairui is > > Yes, see the above explanation of the "continuous mTHP allocator". > > > great, and we can reuse it for virtual swap (as I mentioned above). But > > I don't think forcing everything to use a swapfile and extending swap > > tables to support indirections and frontend/backend split is the way to > > go (for the reasons described above). > > IMHO, it is the way to go if consider mTHP allocating. You have > different assumption than mine in my design, I correct your > description as much as I can above. I am interested in your opinion > after read the above description of "continuous mTHP allocator", which > is match the 2024 LSF talk slide regarding swap cache redirecting > physical locations. As I mentioned, I am still very much convinced the designs are conceptually very similar and the main difference is whether the "backend" is 4 bytes and points at a slot in a swapfile, or a generic 8-byte pointer. FWIW, we can use 4 bytes in virtual swap as well if we leave the xarray in zswap. 4 bytes is plenty of space for an index into the zswap xarray if we no longer use the swap offset. But if we use 8 bytes we can actually get rid of the zswap xarray, by merging it with the virtual swap xarray, or even stop using xarrays completely if we adopt the current swap table allocator for the virtual swap indexes. As Nhat mentioned earlier, I suspect we'll end up not using any extra overhead at all for the zswap-only case, or even reducing the current overhead. > > Chris
, t On Thu, Dec 4, 2025 at 10:16 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote: > > On one hand I wish there was someone representing the group as the > > main speaker, that would make the discussion feel more equal, more > > inclusive. On the other hand, any perspective is important, it is hard > > to require the voice to route through the main speaker. It is hard to > > execute in practice. So I give up suggesting that. I am open for > > suggestions on how to make the discussion more inclusive for newcomers > > to the existing established group. > > Every person is expressing their own opinion, I don't think there's a > way to change that or have a "representative" of each opinion. In fact, > changing that would be the opposite of inclusive. Ack, that is why I did not suggest a main speaker token approach. On the other hand, there are still some considerations that can be taken care of from the group side that do not overwhelm the single person if a similar opinion has been expressed and is waiting for response. N vs 1 arguing does put the single person in unfair dis-advantage and alienates the single person. We should consider the effect of that. OK. Enough said on this and let's move on. > > > Now with all of that out of the way, I want to try to salvage the > > > technical discussion here. Taking several steps back, and > > > > Thank you for driving the discussion back to the technical side. I > > really appreciate it. > > > > > oversimplifying a bit: Chris mentioned having a frontend and backend and > > > an optional redirection when a page is moved between swap backends. This > > > is conceptually the same as the virtual swap proposal. > > > > In my perspective, it is not the same as a virtual swap proposal. > > There is some overlap, they both can do redirection. > > > > But they originally aim to solve two different problems. One of the > > important goals of the swap table is to allow continuing mTHP swap > > entry when all the space left is not continues. For the rest of the > > discusion we call it "continuous mTHP allocator". It allocate > > continuous swap entry out of non continues file location. > > > > Let's say you have a 1G swapfile, all full not available slots. > > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces > > add up to 16K. > > 2) Now allocate one mTHP order 2, 16K in size. > > Previous allocator can not be satisfied with this requirement. Because > > the 4 empty slots are not contiguous. > > Here the redirection and growth of the front swap entry comes in, it > > is all part of the consideration all alone, not an afterthought. > > This following step will allow allocating 16K continuous swap entries > > out of offset [1,3,5,7] > > 3) We grow the front end part of the swapfile, effectively bump up the > > max size and add a new cluster of order 2, with a swap table. > > That is where the front end of the swap and back end file store comes in. > > There's no reason why we cannot do the same with virtual swap, even if > it wasn't the main motivaiton, I don't see why we can't achieve the same > result. Yes, they can. By largely copying the swap table approach to achieve the same result. Before I point out the importance of the memory overhead of per swap slot entry, the 48 bytes is not production quality. VS hasn't really made good progress toward shrinking down the per slot memory usage at a similar level. Not even close. 
That is until you propose using the earlier stage of the swap table to compete with the later stage of the swap table, by using the exact same approach of the later stage of the swap table. Please don't use swap table ideas to do a knockoff clone of swap table and take the final credit. That is very not decent, I don't think that matches the upstream spirit either. Please respect the originality of the idea and give credit where it is due, after all, that is how the academic system is built on. > > BTW, Please don't accuse me copy cat the name "virtual swapfile". I > > introduce it here 1/8/2025 before Nhat does: > > I don't think anyone cares about the actual names, or accused anyone of > copycatting anything. There are repeat projections cast on me as the "after thought", I want the people who call me "after thought" acknowledge that I am the "leading thought", the "original thought". Just joking. > > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/ > > ==============quote============== > > I think we need to have a separation of the swap cache and the backing > > of IO of the swap file. I call it the "virtual swapfile". > > It is virtual in two aspect: > > 1) There is an up front size at swap on, but no up front allocation of > > the vmalloc array. The array grows as needed. > > 2) There is a virtual to physical swap entry mapping. The cost is 4 > > bytes per swap entry. But it will solve a lot of problems all > > together. > > ==============quote ends ========= The above prior write up nicely sums up the main idea behind VS, would you agree? I want to give Nhat the benefit of the doubt that he did not commit plagiarism. Since now VS has changed strategy to clone swap tables against swap tables. I would add the points that, please be decent and be collaborative. Respect the originality of the ideas. If this is in the academic context, the email sent to the list considers paper submission, the VS paper would definitely get ding on not properly citing priory paper of "virtual swapfile" above. So far team VS haven't participated much on swap table development. There are a few ack from Nhat, but there is not really any discussion showing insight of understanding the swap table. Now VS wants to clone the swap table against the swap table. Why not just join the team swap table. Really take part of the review of swap table phase N, not just rubber stamping. Please be collaborative, be decent, do it the proper upstream way. > > Correction. Instead of swapfile B, Backend location in swapfile B. in > > step 5). It only 4 byte. The back end does not have swap cache. The > > swap cache belong to front end A (8 bytes). > > Ack. Thanks for the Ack. > > Again, in my description swap slot A has a file backend location > > pointer points to swapfile B. > > It is only the bottom half the swap slot B, not the full swap slot. It > > does not have 8 byte swap entry overhead of B. > > Ack. Thanks for the Ack. > > Point to another swapfile location backend, not swap entry.(4 bytes) > > Ack. Thanks for the Ack. > > > Disregarding any specific implementation, any space optimizations that > > > can be applied to the swap table (e.g. combining swap count and > > > swapcache in an 8 byte field) can also be applied to virtual swap. The > > > only *real* difference is that with virtual swap we need to store the > > > swap slot (or zswap entry), while for the current swap table proposal it > > > is implied by the index of the entry. That's an additional 8 bytes. 
> > > > No, the VS has a smaller design scope. VS does not enable "continous > > mTHP allocation" . At least that is not mention in any previous VS > > material. > > Why not? Even if it wasn't specifically called out as part of the > motivation, it still achieves that. What we need for the mTHP swap is to > have a redirection layer. Both virtual swap or the front-end/back-end > design achieve that. Using your magic against you, that is what I call an "after thought" of the century. Just joking. Yes, you can do that, by cloning swap tables against swpa tables. It is just not considered decent in my book. Please be collaborative. Now I have demonstrated the swap table side is the one with most of the original ideas and advanced technical designs. Please let the team swap table finish up what they originally planned, not steal the thunder at the final glory. If team VS wants to help speed up the process, since priority is one of VS main considerations, now the design has been converging to swap tables. Please help reviewing the swap table landing phases submission. Crew, walk, run. Even if you want to use the swap table against the swap table. Reviewing landing swap table code is a good way to understand swap tables. Let the team swap tables to finish up the original goal. Once swpa tables have the continue mTHP allocator, we can example any other VS feature can be added on top of that. > > With the continues mTHP allocator mention above, it already has the > > all things VS needed. > > I am not sure we still need VS if we have "continues mTHP allocator", > > that is TBD. > > As I mentioned above, I think the front-end/back-end swap tables and > virtual swap are conceptually very similar. The more we discuss this the Of course very similar, for all we know it is possible they come from the same source. https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/ > more I am convinced about this tbh. In both cases we provide an > indirection layer such that we can change the backend or backing > swapfile without updating the page tables, and allow thing like mTHP > swap without having contiguous slots in the swapfile. > > > > > Yes, VS can reuse the physical location pointer by "continues mTHP allocator". > > > > The overhead is for above swap table of redirection is 12 bytes not 16 bytes. > > Honeslty if it boils down to 4 bytes per page, I think that's a really > small difference. 4 bytes per slot entry difference is leaving free memory on the table. Why not grab it? Do you know that all those swap phase II..IV just to save 3 bytes per slot (and clean up the code in the process)? 4 bytes out of total 8 or 12 bytes that is 33% - 50% difference on the per solt usage. > Especially that it doesn't apply to all cases (e.g. > not the zswap-only case that Google currently uses). I want to ask a clarifying question here. My understanding is that VS is always on. If we are doing zswap-only, does VS still have the 8+4 = 12 bytes overhead? I want to make sure if we are not using the redirection, in the zswap only case, we shouldn't pay the price for it. Again that is another free money on the table. > > > batches). In fact, I think we can use the swap table as the allocator in > > > the virtual swap space, reusing all the locking and allocation Yes, you can. Is there a technical difference to do so? If not, why steal the thunder at finial glory? Why not let swap tables finish up its course? 
> > In the "continous mTHP allocator" it is just physical location pointer, > > > > > Another important aspect here, in the simple case the swap table does > > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although > > > the difference isn't large to begin with, I don't think it's always the > > > case. I think this is only true for the simple case of having a swapped > > > out page on a disk swapfile or in a zswap (ghost) swapfile. > > > > Please redo your evaluation after reading the above "continuous mTHP alloctor". > > I did, and if anything I am more convinced that the designs are > conceptually close. The main difference is that the virtual swap > approach is more flexible in my opinion because the backend doesn't have > to be a swapfile, and we don't need "ghost" to use zswap and manage it > like a swapfile. It seems the design has converged to the swap table side. Even the "virtual swapfile" concept could have come from the swap table side. I'm flattered, copying is the best compliment from the competitor. Now we settle on the big design, the rest of the design difference is very small. Let's discuss the VS virtual swap interface without actual swapfile. One question: Does VS virtual swap file expose any swap file interface be referenced by swap on/off? I assume no, please correct me if you do. I think it could have downsides. 1) It is not compatible with normal /etc/fstab design. Now you need seperate init script to enable disable VS. 2) It does not go through swap on/off path. That creates complications. As we know we have a lot of bugs exposed in the swap on/off. It is a very tricky business to get it right. I would recommend staying away from cloning a separate path for the swapon/off. The VS introduces a new kernel interface that also needs to be maintained. 3) The customer can't swap files round robin. As we know some companies are using multiple swap files to reduce the si->lock contention. If I recall correctly 8 swapfiles. Forcing one virtual swapfile will force go through the same si->locks has performance penalties. 4) Having an overall virtual swap file imposes a design challenge in swap.tiers world. Because it does not have a swapfile, the swapfile priority does not apply. 5) Keep it simple. Using your magic against you, the ghost swapfile conceptually can do whatever VS conceptually can do as well. You can consider the ghost swapfile header is just a config file of the VS to setup the swapfile. It saves the extra init script posted on users. BTW, the "ghost swapfile" I will properly rename it back to "virtual swapfile" in the code, as I earn that term's priority date. And you don't mind what it is really called. > > Again 12 bytes using "continues mTHP allocator" frame work. > > Ack. Thanks for the Ack. > > > > > > Now imagine a scenario where we have zswap, SSD, and HDD swapfiles with > > > tiering. If a page goes to zswap, then SSD, then HDD, we'll end up with > > > 3 swap table entries for a single swapped out page. That's 24 bytes. So > > > the memory overhead is not really constant, it scales with the number of > > > tiers (as opposed to virtual swap). > > > > Nope, Only one front swap entry remain the same, every time it write > > to a different tier, it only update the back end physical location > > pointer. > > It always points to the finial physical location. Only 12 bytes total. > > Ack. Thanks for the Ack. That confirms the swap table side is actually having the more advanced technical design all alone. 
> > Please read the above description regarding the front end and back end
> > split then ask your question again. The "continuous mTHP allocator"
> > above should answer your question.
>
> Yeah, the 8 bytes front-end and 4-bytes backend answer this.

Ack.

> > > We can workaround this by having different types of clusters and swap
> > > tables, where "virtual" clusters have 16 bytes instead of 8 bytes per
> > > entry for that, sure.. but at that point we're at significantly more
> > > complexity to end up where virtual swap would have put us.
> >
> > No, that further complicating things. Please don't go there. The front
> > end and back end location split is design to simplify situation like
> > this. It is conceptual much cleaner as well.
>
> Yeah that was mostly hypothetical.

Ack.

> > > Chris, Johannes, Nhat -- please correct me if I am wrong here or if I
> > > missed something. I think the current swap table work by Kairui is
> >
> > Yes, see the above explanation of the "continuous mTHP allocator".
>
> > > great, and we can reuse it for virtual swap (as I mentioned above). But
> > > I don't think forcing everything to use a swapfile and extending swap
> > > tables to support indirections and frontend/backend split is the way to
> > > go (for the reasons described above).
> >
> > IMHO, it is the way to go if consider mTHP allocating. You have
> > different assumption than mine in my design, I correct your
> > description as much as I can above. I am interested in your opinion
> > after read the above description of "continuous mTHP allocator", which
> > is match the 2024 LSF talk slide regarding swap cache redirecting
> > physical locations.
>
> As I mentioned, I am still very much convinced the designs are
> conceptually very similar and the main difference is whether the
> "backend" is 4 bytes and points at a slot in a swapfile, or a generic
> 8-byte pointer.

Thanks. As I said earlier, I am flattered. Of course it is conceptually very close after you copy all the internal design elements of the swap table approach.

> FWIW, we can use 4 bytes in virtual swap as well if we leave the xarray
> in zswap. 4 bytes is plenty of space for an index into the zswap xarray
> if we no longer use the swap offset. But if we use 8 bytes we can
> actually get rid of the zswap xarray, by merging it with the virtual
> swap xarray, or even stop using xarrays completely if we adopt the
> current swap table allocator for the virtual swap indexes.
>
> As Nhat mentioned earlier, I suspect we'll end up not using any extra
> overhead at all for the zswap-only case, or even reducing the current
> overhead.

In my design there is no extra xarray for zswap; you will just have to take my word for it for now. That comes very late in the game, let's finish the swap table glory first.

Yosry, thank you for driving a good technical discussion. I really enjoyed it. I wish the beginning of the discussion had gone down this path instead. The multiple NACK-first-ask-questions-later responses and the condescending tone at the beginning of the discussion really upset me. Me alone facing four people arguing intensely in round robin didn't help either. It made me feel I was not welcome. I am short-tempered and easily triggered. I am sorry for my behavior as well. Just give me a few moments and I will come to my senses.

The ironic part of the discussion is that the "dead end" is the one being converged to. The "afterthought" turns out to be the "leading thought". Let that be a lesson for everyone, me included.
Be nice to the people who hold different ideas than your own.

Looking forward to more discussion like this.

Chris
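To put the 8-vs-12-vs-16-bytes-per-slot argument above in concrete terms, here is a small, self-contained C sketch of the layout as it is described in this thread: an always-present 8-byte front-end entry per slot, plus an optional 4-byte back-end location that only exists for redirected clusters. All names in the sketch are hypothetical illustrations, not identifiers from the swap table series or the VS patches, and the worked numbers assume 4 KiB pages with 64 GiB of swapped-out data.

/*
 * Illustrative sketch only: made-up names, not code from any posted
 * series. It models the per-slot overhead discussed in this thread.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t frontend_entry_t;  /* 8 bytes: count/flags or swap cache state */
typedef uint32_t backend_loc_t;     /* 4 bytes: physical slot in a backing file */

/* Hypothetical per-cluster bookkeeping for the front-end/back-end split. */
struct cluster_table {
	frontend_entry_t *entries;  /* always allocated: 8 bytes per slot */
	backend_loc_t *backend;     /* NULL unless redirected: +4 bytes per slot */
};

int main(void)
{
	/* 64 GiB of swapped-out 4 KiB pages = 16M slots */
	unsigned long slots = (64UL << 30) / 4096;

	printf("8 B/slot  (no redirection):      %lu MiB\n", slots * 8 >> 20);
	printf("12 B/slot (8 B front + 4 B loc): %lu MiB\n", slots * 12 >> 20);
	printf("16 B/slot (8 B front + 8 B ptr): %lu MiB\n", slots * 16 >> 20);
	return 0;
}

For 16M slots this prints 128, 192 and 256 MiB respectively, which is where the "why leave 4 bytes per slot on the table" argument comes from.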
On Thu, Dec 04, 2025 at 02:11:57PM +0400, Chris Li wrote: [..] > > > > > > > oversimplifying a bit: Chris mentioned having a frontend and backend and > > > > an optional redirection when a page is moved between swap backends. This > > > > is conceptually the same as the virtual swap proposal. > > > > > > In my perspective, it is not the same as a virtual swap proposal. > > > There is some overlap, they both can do redirection. > > > > > > But they originally aim to solve two different problems. One of the > > > important goals of the swap table is to allow continuing mTHP swap > > > entry when all the space left is not continues. For the rest of the > > > discusion we call it "continuous mTHP allocator". It allocate > > > continuous swap entry out of non continues file location. > > > > > > Let's say you have a 1G swapfile, all full not available slots. > > > 1) free 4 pages at swap offset 1, 3, 5, 7. All discontiguous spaces > > > add up to 16K. > > > 2) Now allocate one mTHP order 2, 16K in size. > > > Previous allocator can not be satisfied with this requirement. Because > > > the 4 empty slots are not contiguous. > > > Here the redirection and growth of the front swap entry comes in, it > > > is all part of the consideration all alone, not an afterthought. > > > This following step will allow allocating 16K continuous swap entries > > > out of offset [1,3,5,7] > > > 3) We grow the front end part of the swapfile, effectively bump up the > > > max size and add a new cluster of order 2, with a swap table. > > > That is where the front end of the swap and back end file store comes in. > > > > There's no reason why we cannot do the same with virtual swap, even if > > it wasn't the main motivaiton, I don't see why we can't achieve the same > > result. > > Yes, they can. By largely copying the swap table approach to achieve > the same result. What copying? Using virtual swap IDs inherently means that we are not tied to coniguous swapfile slots to swap out large folio. > Before I point out the importance of the memory > overhead of per swap slot entry, the 48 bytes is not production > quality. VS hasn't really made good progress toward shrinking down the > per slot memory usage at a similar level. Not even close. Nhat said repeatedly that what he sent was a PoC and that the overhead can be optimized. Completely disregarding Nhat's implementation, I described how conceptually the overhead can be lower, probably down to 16 bytes on x86_64. > That is until you propose using the earlier stage of the swap table to > compete with the later stage of the swap table, by using the exact > same approach of the later stage of the swap table. Please don't use > swap table ideas to do a knockoff clone of swap table and take the > final credit. That is very not decent, I don't think that matches the > upstream spirit either. Please respect the originality of the idea and > give credit where it is due, after all, that is how the academic > system is built on. Ugh..what? All virtual swap propasal made it clear that they are PoCs and that the memory overhead can be shrunk. Compacting fields in the swap descriptor (or whatever it's called) to save memory is not "an original idea". What I said is that any memory optimizations that you apply to swap table can equally apply to the virtual swap because they are conceptually storing the same data (aside from the actual swap slot or zswap entry). The other part is allocating and freeing in batches instead of per-entry. 
This is an implementation detail, and Nhat mentioned early on that we can do this to save memory (specifically for the locking, but it applies for other things). This is not a novel approach either. The comparison to swap table was to clarify things, not "knocking off" anything. [..] > > > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/ > > > ==============quote============== > > > I think we need to have a separation of the swap cache and the backing > > > of IO of the swap file. I call it the "virtual swapfile". > > > It is virtual in two aspect: > > > 1) There is an up front size at swap on, but no up front allocation of > > > the vmalloc array. The array grows as needed. > > > 2) There is a virtual to physical swap entry mapping. The cost is 4 > > > bytes per swap entry. But it will solve a lot of problems all > > > together. > > > ==============quote ends ========= > > The above prior write up nicely sums up the main idea behind VS, would > you agree? > > I want to give Nhat the benefit of the doubt that he did not commit > plagiarism. Since now VS has changed strategy to clone swap tables > against swap tables. I would add the points that, please be decent and > be collaborative. Respect the originality of the ideas. If this is in > the academic context, the email sent to the list considers paper > submission, the VS paper would definitely get ding on not properly > citing priory paper of "virtual swapfile" above. Okay let me make something very clear. This idea to introduce an redirection layer for swap, call it virtual swap or swap table or mTHP swap allocator or whatever is NOT new. It's NOT your idea, or my idea, or Nhat's. I first heard about it from Johannes in 2022, and it was floated around by Rik in 2011 based on discussions with others: https://lore.kernel.org/linux-mm/4DA25039.3020700@redhat.com/ So no one here is trying to take credit for the idea, except you. No one here is plagiarising anything. We are discussing different design and implementations of the same idea. Sure, people have different ideas about how to implement it, whether it's using an xarray or a swap table, or what exactly to point at in the backend. But these things are usually hashed out during discussions and code reviews, and the better approach is taken by the community. You are the one being very defensive about his "ideas", making it about personal credit, and creating a problem where there was none. No one is trying to steal any credit. Kairui's patches introducing the swap table are there under his name. If we extend his work for the redirection layer, no matter the direction we take it in, it's not taking away from his work, it's adding to it. > > So far team VS haven't participated much on swap table development. > There are a few ack from Nhat, but there is not really any discussion > showing insight of understanding the swap table. Now VS wants to clone > the swap table against the swap table. Why not just join the team swap > table. Really take part of the review of swap table phase N, not just > rubber stamping. Please be collaborative, be decent, do it the proper > upstream way. There are no "teams" here, you're the only who's consistently making this into an argument between companies or teams or whatever. You keep saying you want to have a technical discussion yet most of your response is about hypotheticals around teams and stealing credit. 
> > > > Disregarding any specific implementation, any space optimizations that > > > > can be applied to the swap table (e.g. combining swap count and > > > > swapcache in an 8 byte field) can also be applied to virtual swap. The > > > > only *real* difference is that with virtual swap we need to store the > > > > swap slot (or zswap entry), while for the current swap table proposal it > > > > is implied by the index of the entry. That's an additional 8 bytes. > > > > > > No, the VS has a smaller design scope. VS does not enable "continous > > > mTHP allocation" . At least that is not mention in any previous VS > > > material. > > > > Why not? Even if it wasn't specifically called out as part of the > > motivation, it still achieves that. What we need for the mTHP swap is to > > have a redirection layer. Both virtual swap or the front-end/back-end > > design achieve that. > > Using your magic against you, that is what I call an "after thought" > of the century. Just joking. > > Yes, you can do that, by cloning swap tables against swpa tables. It > is just not considered decent in my book. Please be collaborative. Now > I have demonstrated the swap table side is the one with most of the > original ideas and advanced technical designs. Please let the team > swap table finish up what they originally planned, not steal the > thunder at the final glory. If team VS wants to help speed up the > process, since priority is one of VS main considerations, now the > design has been converging to swap tables. Please help reviewing the > swap table landing phases submission. Crew, walk, run. Even if you > want to use the swap table against the swap table. Reviewing landing > swap table code is a good way to understand swap tables. Let the team > swap tables to finish up the original goal. Once swpa tables have the > continue mTHP allocator, we can example any other VS feature can be > added on top of that. More rants about hypothetical cloning, knocking off, etc. > > > > With the continues mTHP allocator mention above, it already has the > > > all things VS needed. > > > I am not sure we still need VS if we have "continues mTHP allocator", > > > that is TBD. > > > > As I mentioned above, I think the front-end/back-end swap tables and > > virtual swap are conceptually very similar. The more we discuss this the > > Of course very similar, for all we know it is possible they come from > the same source. > https://lore.kernel.org/linux-mm/CACePvbX76veOLK82X-_dhOAa52n0OXA1GsFf3uv9asuArpoYLw@mail.gmail.com/ Your lack of self-awareness is impressive. > > > more I am convinced about this tbh. In both cases we provide an > > indirection layer such that we can change the backend or backing > > swapfile without updating the page tables, and allow thing like mTHP > > swap without having contiguous slots in the swapfile. > > > > > > > > Yes, VS can reuse the physical location pointer by "continues mTHP allocator". > > > > > > The overhead is for above swap table of redirection is 12 bytes not 16 bytes. > > > > Honeslty if it boils down to 4 bytes per page, I think that's a really > > small difference. > > 4 bytes per slot entry difference is leaving free memory on the table. > Why not grab it? > Do you know that all those swap phase II..IV just to save 3 bytes per > slot (and clean up the code in the process)? > 4 bytes out of total 8 or 12 bytes that is 33% - 50% difference on the > per solt usage. 
Cleaning up the swap code and the performace optimizations in Kairui's work are a lot more important that saving 3 bytes per slot, especially if it's only for actively used slots. That's less than 0.1% of the memory saved by swapping out a page to disk. > > > Especially that it doesn't apply to all cases (e.g. > > not the zswap-only case that Google currently uses). > > I want to ask a clarifying question here. My understanding is that VS > is always on. > If we are doing zswap-only, does VS still have the 8+4 = 12 bytes overhead? > > I want to make sure if we are not using the redirection, in the zswap > only case, we shouldn't pay the price for it. > Again that is another free money on the table. IIUC the extra memory used for the virtual swap can be offset by reduction in zswap_entry, so for the zswap-only case I don't believe there will be any additional overhead. > > > > > batches). In fact, I think we can use the swap table as the allocator in > > > > the virtual swap space, reusing all the locking and allocation > > Yes, you can. Is there a technical difference to do so? If not, why > steal the thunder at finial glory? Why not let swap tables finish up > its course? > > > > In the "continous mTHP allocator" it is just physical location pointer, > > > > > > > Another important aspect here, in the simple case the swap table does > > > > have lower overhead than virtual swap (8 bytes vs 16 bytes). Although > > > > the difference isn't large to begin with, I don't think it's always the > > > > case. I think this is only true for the simple case of having a swapped > > > > out page on a disk swapfile or in a zswap (ghost) swapfile. > > > > > > Please redo your evaluation after reading the above "continuous mTHP alloctor". > > > > I did, and if anything I am more convinced that the designs are > > conceptually close. The main difference is that the virtual swap > > approach is more flexible in my opinion because the backend doesn't have > > to be a swapfile, and we don't need "ghost" to use zswap and manage it > > like a swapfile. > > It seems the design has converged to the swap table side. Even the > "virtual swapfile" concept could have come from the swap table side. > I'm flattered, copying is the best compliment from the competitor. > > Now we settle on the big design, the rest of the design difference is > very small. No, the design hasn't settled or converged on any "side". I am also not going to respond to the rest of this email, and potentially other emails. You keep twisting my words, making delusional claims, and proving how difficult it is to have a technical conversation with you. You kept mentioning that you want to keep the conversation on the technical side, but when I tried to have a technical disucssion you quickly drove it away from that. Half of your email is basically "everyone is trying to steal my cool ideas". I tried salvaging the discussion but this is hopeless.
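The "1 GiB swapfile with free slots at offsets 1, 3, 5, 7" example quoted earlier in this thread is the core of the continuous mTHP allocator argument. The tiny stand-alone C toy below walks through that exact case under stated assumptions: the front end grows by one order-2 cluster past the old maximum, the new virtual entries are contiguous, and each stores a 4-byte back-end location pointing at a scattered physical slot. The value 262144 (1 GiB / 4 KiB) and all variable names are my own illustration, not code from either proposal.

#include <stdio.h>
#include <stdint.h>

#define ORDER2_PAGES 4

int main(void)
{
	/* Discontiguous free physical slots left in the 1 GiB backing file. */
	uint32_t free_phys[ORDER2_PAGES] = { 1, 3, 5, 7 };

	/*
	 * Grow the front end by one order-2 cluster past the file's old
	 * maximum (1 GiB / 4 KiB = 262144 slots), so the new virtual
	 * entries are contiguous by construction.
	 */
	uint64_t virt_base = 262144;
	uint32_t backend_loc[ORDER2_PAGES];

	for (int i = 0; i < ORDER2_PAGES; i++) {
		backend_loc[i] = free_phys[i];
		printf("virtual entry %llu -> physical slot %u\n",
		       (unsigned long long)(virt_base + i), backend_loc[i]);
	}

	/*
	 * The mTHP folio is keyed by the contiguous virtual range
	 * [262144, 262147]; writeback follows backend_loc[] to reach the
	 * discontiguous physical slots 1, 3, 5 and 7.
	 */
	return 0;
}

The only point of the toy is that contiguity is needed in the virtual index space, not in the backing file; both the front-end/back-end split and virtual swap provide that indirection.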
On Fri, Dec 5, 2025 at 5:05 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
[..]
> I tried salvaging the discussion but this is hopeless.
Hi, all, I hope people don't mind me adding a few words here.

I think the key thing is Chris wants things to be done in an optimized way. He welcomes others to collaborate, as long as it's properly credited. Upstream development is tiring and there are conflicts in technical details and ideas, making it hard to track who deserves more credit for one implementation. But he has been super helpful as the behind-the-scenes hero for swap tables.

Back when I was unfamiliar with swap and sent long series to optimize it in a different direction two years ago:
https://lore.kernel.org/linux-mm/20231119194740.94101-1-ryncsn@gmail.com/ [1]
https://lore.kernel.org/linux-mm/20240326185032.72159-1-ryncsn@gmail.com/ [2]

Chris literally reviewed every patch of the first series super carefully, despite me being a beginner. And for the later series, he pointed out that it was not an optimal direction at all, and shared off-list what he thought was the right direction to refactor swap systematically. Then we collaborated to implement the swap allocator. That's also the key prerequisite of the swap table.

For the swap table series, I already posted a complete series in May this year (almost half a year ago) that implemented basically everything up to phase 3:
https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/ [3]
And I later shared, multiple times, a work-in-progress branch covering up to phase 5, seeking collaboration. Despite the swap table already performing well and being stable, and despite me also providing info on how we could solve the VS issue (and the redirection entry layer idea was completely introduced by Chris), the feedback and review were stuck. And you can see VS was also stuck with performance issues at that time.

I was in a rush and struggling with managing that long series, getting it merged or reviewed, to enable the next developments. But lacking positive upstream feedback or progress is really discouraging, and I hesitated to implement the later parts and even thought about giving up. Again, Chris helped to organize and rework a large proportion of that series, so we are making real progress, and we finally got phase I merged, with phase II ready to be merged. I thought the best approach was to have a clean base for everyone so we can compare the end results, without any historical burden, and then discuss further developments. And we are on track for that. And IIRC, VS was also struggling with things like direct swapin, the slot cache and other existing workarounds, and the fuzzy API of swap, which are all removed or solved by the swap table series.

We are all busy and may be unaware of others' work or history (e.g. Yosry once pointed out I ignored his previous work, and I apologized for that [4]). It's understandable to me that misunderstandings and implicit interests exist. And if you look closely at [1] and [2] and a few other later series around the swap cache, they were also getting very close to the idea of unifying the swap routines so there is common metadata, despite me having no idea of others' work and going in a different direction. [2] already removed the direct swapin and used the swap cache as the unified layer, and [1] in 2023 has a similar vibe; you can still find the same ideas or even code in the pending swap table patches. But without the cluster idea and a prototype patch from Chris, it would have ended catastrophically upstream. He shared the idea proactively and helped make the later work possible, and so we co-authored many later patches.

Link: https://lore.kernel.org/all/CAMgjq7DHFYWhm+Z0C5tR2U2a-N_mtmgB4+idD2S+-1438u-wWw@mail.gmail.com/ [4]

What I mean is, from what I've seen, Chris has been open and friendly, and I have never seen him lack the spirit of sharing ideas or collaborating on them.

As for the current technical issue, we are definitely on track to make a major breakthrough; let's just focus on improving swap and making progress :)
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space. As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.
Thanks for sharing this, I've been hearing about the ghost swapfile
design for a long time, glad to see it finally got posted.
>
> Signed-off-by: Chris Li <chrisl@kernel.org>
> ---
> include/linux/swap.h | 2 ++
> mm/page_io.c | 18 +++++++++++++++---
> mm/swap.h | 2 +-
> mm/swap_state.c | 7 +++++++
> mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
> mm/zswap.c | 17 +++++++++++------
> 6 files changed, 73 insertions(+), 15 deletions(-)
In general I think this aligns quite well with what I had in mind and
an idea that was mentioned during LSFMM this year (the 3rd one in the
"Issues" part; it wasn't clearly described in the cover letter, more
details are in the slides):
https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
The good part is that we will reuse everything we have with the
current swap stack, and stay optional. Everything is a swap device, no
special layers required. All other features will be available in a
cleaner way.
And /etc/fstab just works the same way for the ghost swapfile.
Looking forward to seeing this RFC get more updates.
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
> SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
> SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
> SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> + SWP_GHOST = (1 << 13), /* not backed by anything */
> /* add others here before... */
> };
>
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> void free_pages_and_swap_cache(struct encoded_page **, int);
> /* linux/mm/swapfile.c */
> extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
> extern long total_swap_pages;
> extern atomic_t nr_rotate_swap;
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> return AOP_WRITEPAGE_ACTIVATE;
> }
>
> - __swap_writepage(folio, swap_plug);
> - return 0;
> + return __swap_writepage(folio, swap_plug);
> out_unlock:
> folio_unlock(folio);
> return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> submit_bio(bio);
> }
>
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> {
> struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>
> VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> + if (sis->flags & SWP_GHOST) {
> + /* Prevent the page from getting reclaimed. */
> + folio_set_dirty(folio);
> + return AOP_WRITEPAGE_ACTIVATE;
> + }
> +
> /*
> * ->flags can be updated non-atomicially (scan_swap_map_slots),
> * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> swap_writepage_bdev_sync(folio, sis);
> else
> swap_writepage_bdev_async(folio, sis);
> + return 0;
> }
>
> void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> if (zswap_load(folio) != -ENOENT)
> goto finish;
>
> + if (unlikely(sis->flags & SWP_GHOST)) {
> + folio_unlock(folio);
> + goto finish;
> + }
> +
> /* We have to read from slower devices. Increase zswap protection. */
> zswap_folio_swapin(folio);
>
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> }
> void swap_write_unplug(struct swap_iocb *sio);
> int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>
> /* linux/mm/swap_state.c */
> extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> struct swap_iocb *splug = NULL;
> bool page_allocated;
>
> + /*
> + * The entry may have been freed by another task. Avoid swap_info_get()
> + * which will print error message if the race happens.
> + */
> + if (si->flags & SWP_GHOST)
> + goto skip;
> +
> mask = swapin_nr_pages(offset) - 1;
> if (!mask)
> goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> static DEFINE_SPINLOCK(swap_lock);
> static unsigned int nr_swapfiles;
> atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
> /*
> * Some modules use swappable objects and may try to swap them out under
> * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> goto skip;
> }
>
> + if (!(si->flags & SWP_GHOST))
> + atomic_sub(1, &nr_real_swapfiles);
> plist_del(&si->avail_list, &swap_avail_head);
>
> skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> }
>
> plist_add(&si->avail_list, &swap_avail_head);
> + if (!(si->flags & SWP_GHOST))
> + atomic_add(1, &nr_real_swapfiles);
>
> skip:
> spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> struct inode *inode = mapping->host;
> int ret;
>
> + if (sis->flags & SWP_GHOST) {
> + *span = 0;
> + return 0;
> + }
> +
> if (S_ISBLK(inode->i_mode)) {
> ret = add_swap_extent(sis, 0, sis->max, 0);
> *span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> if (p->flags & SWP_CONTINUED)
> free_swap_count_continuations(p);
>
> - if (!p->bdev || !bdev_nonrot(p->bdev))
> + if (!(p->flags & SWP_GHOST) &&
> + (!p->bdev || !bdev_nonrot(p->bdev)))
> atomic_dec(&nr_rotate_swap);
>
> mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> mutex_unlock(&swapon_mutex);
> }
>
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> + struct file *file = si->swap_file;
> +
> + if (si->flags & SWP_GHOST)
> + return "ghost\t";
> +
> + if (S_ISBLK(file_inode(file)->i_mode))
> + return "partition";
> +
> + return "file\t";
> +}
> +
> static int swap_show(struct seq_file *swap, void *v)
> {
> struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> len = seq_file_path(swap, file, " \t\n\\");
> seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> len < 40 ? 40 - len : 1, " ",
> - S_ISBLK(file_inode(file)->i_mode) ?
> - "partition" : "file\t",
> + swap_type_str(si),
> bytes, bytes < 10000000 ? "\t" : "",
> inuse, inuse < 10000000 ? "\t" : "",
> si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> return 0;
> }
>
> -
> /*
> * Find out how many pages are allowed for a single swap device. There
> * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> unsigned long maxpages;
> unsigned long swapfilepages;
> unsigned long last_page;
> + loff_t size;
>
> if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
> if (!maxpages)
> return 0;
> - swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> + size = i_size_read(inode);
> + if (size == PAGE_SIZE) {
> + /* Ghost swapfile */
> + si->bdev = NULL;
> + si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> + return maxpages;
> + }
Here, if we push things further, it might be a good idea to make better
use of the swapfile header for detecting this kind of device, and
maybe add support for other info too. The header already has version
info embedded in case it needs to be extended.
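For readers wondering what a ghost swapfile physically looks like under this RFC, here is a rough userspace sketch that writes a header-only swapfile: a single page carrying the standard swap header (version, last_page, and the SWAPSPACE2 magic at the end of the page), with last_page declaring the virtual size. It assumes 4 KiB pages and the RFC's detection rule (i_size == PAGE_SIZE); in practice mkswap plus truncation, or the extended header suggested above, would be the proper tool, so treat this only as an illustration.

/*
 * Rough sketch: create a one-page "ghost" swapfile whose header
 * declares 4 GiB worth of 4 KiB slots. Offsets follow union
 * swap_header (info at byte 1024, magic in the last 10 bytes).
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SIZE	4096
#define GHOST_PAGES	(1UL << 20)	/* 4 GiB / 4 KiB */

int main(int argc, char **argv)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	uint32_t version = 1;
	uint32_t last_page = GHOST_PAGES - 1;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <ghost-swapfile>\n", argv[0]);
		return 1;
	}

	memcpy(page + 1024, &version, sizeof(version));
	memcpy(page + 1024 + 4, &last_page, sizeof(last_page));
	memcpy(page + PAGE_SIZE - 10, "SWAPSPACE2", 10);

	fd = open(argv[1], O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0 || write(fd, page, PAGE_SIZE) != PAGE_SIZE) {
		perror("ghost swapfile");
		return 1;
	}
	close(fd);
	return 0;
}

With the RFC applied, swapon on such a file would take the SWP_GHOST path, since i_size is exactly one page and maxpages comes from last_page in the header.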
On Sat, Nov 22, 2025 at 6:00 PM Kairui Song <ryncsn@gmail.com> wrote:
[..]
> The good part is that we will reuse everything we have with the
> current swap stack, and stay optional. Everything is a swap device, no
> special layers required. All other features will be available in a
> cleaner way.
>
> And /etc/fstab just works the same way for the ghost swapfile.

Apologies — let me raise a question that may be annoying.
I understand that people may already be feeling tense and sensitive.

Despite the benefit of compatibility with /etc/fstab, we still need to provide
a physical file on disk (or elsewhere), even if it contains only a header.
Personally, this feels a bit odd to me. Is it possible to avoid having a
“ghost” swap file altogether and instead implement all "ghost" functionality
entirely within the kernel? Ideally, we wouldn’t need to introduce a new
“ghost” concept to users at all.

In short, we provide the functionality of a ghost swap file without actually
having any file or “ghost” at all.

Thanks
Barry
On 12/02/25 at 10:56am, Barry Song wrote:
[..]
> Despite the benefit of compatibility with /etc/fstab, we still need to provide
> a physical file on disk (or elsewhere), even if it contains only a header.
> Personally, this feels a bit odd to me. Is it possible to avoid having a
> “ghost” swap file altogether and instead implement all "ghost" functionality
> entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> “ghost” concept to users at all.
>
> In short, we provide the functionality of a ghost swap file without actually
> having any file or “ghost” at all.

That's actually what I would like to see. Just to note that we may need to
change the swapon syscall to specify a flag marking it and an initial size.
People may complain about adjusting the swapon syscall.
On Mon, Dec 1, 2025 at 10:32 PM Baoquan He <bhe@redhat.com> wrote:
>
> On 12/02/25 at 10:56am, Barry Song wrote:
> > On Sat, Nov 22, 2025 at 6:00 PM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > > >
> > > > The zswap write back has been disabled if all swapfiles in the system
> > > > are ghost swap files.
> > >
> > > Thanks for sharing this, I've been hearing about the ghost swapfile
> > > design for a long time, glad to see it finally got posted.
> > >
> > > >
> > > > Signed-off-by: Chris Li <chrisl@kernel.org>
> > > > ---
> > > > include/linux/swap.h | 2 ++
> > > > mm/page_io.c | 18 +++++++++++++++---
> > > > mm/swap.h | 2 +-
> > > > mm/swap_state.c | 7 +++++++
> > > > mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
> > > > mm/zswap.c | 17 +++++++++++------
> > > > 6 files changed, 73 insertions(+), 15 deletions(-)
> > >
> > > In general I think this aligns quite well with what I had in mind and
> > > an idea that was mention during LSFMM this year (the 3rd one in the
> > > "Issues" part, it wasn't clearly described in the cover letter, more
> > > details in the slides):
> > > https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
> > >
> > > The good part is that we will reuse everything we have with the
> > > current swap stack, and stay optional. Everything is a swap device, no
> > > special layers required. All other features will be available in a
> > > cleaner way.
> > >
> > > And /etc/fstab just works the same way for the ghost swapfile.
> >
> > Apologies — let me raise a question that may be annoying.
> > I understand that people may already be feeling tense and sensitive.
> >
> > Despite the benefit of compatibility with /etc/fstab, we still need to provide
> > a physical file on disk (or elsewhere), even if it contains only a header.
> > Personally, this feels a bit odd to me. Is it possible to avoid having a
> > “ghost” swap file altogether and instead implement all "ghost" functionality
> > entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> > “ghost” concept to users at all.
> >
> > In short, we provide the functionality of a ghost swap file without actually
> > having any file or “ghost” at all.
>
> That's actually what I would like to see. Just to make that we may need
> change syscall swapon, to specify the flag to mark it and initial size.
> People may complain about adjustment in syscall swapon.

Yeah that's another design goal with virtual swap - minimizing the
operational overhead.

With my design/RFC, all you need to do is:

1. Enable zswap at the host level (/sys/module/zswap/parameters/enabled).

2. Enable zswap at the cgroup level, through memory.zswap.max (you can
also size per-cgroup zswap limit here, if you so choose).

and it *just works*. Out of the box. No need to create a new swapfile,
/etc/fstab, etc.

If you're unsure about your workload's actual zswap usage, you can
keep it unlimited too - it will just grows and shrinks with memory
usage dynamics.

One design for every host type and workload characteristics
(workingset, memory access patterns, memory compressibility).
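For anyone who wants to try that flow, here is a minimal userspace sketch of
the two steps above. The cgroup path "/sys/fs/cgroup/workload" is only an
example and error handling is kept trivial; both knobs shown are the existing
zswap module parameter and the cgroup v2 memory.zswap.max file.

/* build: cc -o zswap_setup zswap_setup.c */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* 1. Enable zswap at the host level. */
	write_str("/sys/module/zswap/parameters/enabled", "Y");

	/* 2. Cap (or leave at "max") zswap usage for one example cgroup. */
	write_str("/sys/fs/cgroup/workload/memory.zswap.max", "max");

	return 0;
}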
On Tue, Dec 2, 2025 at 9:53 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > Apologies — let me raise a question that may be annoying.
> > > I understand that people may already be feeling tense and sensitive.
> > >
> > > Despite the benefit of compatibility with /etc/fstab, we still need to provide
> > > a physical file on disk (or elsewhere), even if it contains only a header.
> > > Personally, this feels a bit odd to me. Is it possible to avoid having a
> > > “ghost” swap file altogether and instead implement all "ghost" functionality
> > > entirely within the kernel? Ideally, we wouldn’t need to introduce a new
> > > “ghost” concept to users at all.
> > >
> > > In short, we provide the functionality of a ghost swap file without actually
> > > having any file or “ghost” at all.
> >
> > That's actually what I would like to see. Just to make that we may need
> > change syscall swapon, to specify the flag to mark it and initial size.
> > People may complain about adjustment in syscall swapon.
>
> Yeah that's another design goal with virtual swap - minimizing the
> operational overhead.
>
> With my design/RFC, all you need to do is:
>
> 1. Enable zswap at the host level (/sys/module/zswap/parameters/enabled).
>
> 2. Enable zswap at the cgroup level, through memory.zswap.max (you can
> also size per-cgroup zswap limit here, if you so choose).

From the kernel point of view, managing swap entries without a swapfile
poses some challenges.

1) How do swap_full() and swap cache reclaim work in your world? Will
you create more holes that never get refilled, and more fragmentation?

2) Do you internally have only one si->lock? You will not be able to
take advantage of the swap device round robin behavior.

> and it *just works*. Out of the box. No need to create a new swapfile,

That is a user space thing, existing user space tools.

> /etc/fstab, etc.

Being able to continue using /etc/fstab is a good thing. Now you are
forcing distros that want zswap to wire up the above init sequence
instead of a swapon. It puts more burden on the distro.

That is not the main reason I did not go this route. Mostly I want the
patch to be simple and easy to review. Keep it simple. I see virtual
devices have drawbacks on si->locks and other user space changes
required.

> If you're unsure about your workload's actual zswap usage, you can
> keep it unlimited too - it will just grows and shrinks with memory
> usage dynamics.

How do you cap your swap cache in that case?

I feel a lot of the discussion is very hand-wavy. Having a landable
patch will get more of my attention.

Chris

>
> One design for every host type and workload characteristics
> (workingset, memory access patterns, memory compressibility).
Add YoungJun to CC.
On 11/22/25 at 05:59pm, Kairui Song wrote:
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space. As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
>
> Thanks for sharing this, I've been hearing about the ghost swapfile
> design for a long time, glad to see it finally got posted.
>
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> > include/linux/swap.h | 2 ++
> > mm/page_io.c | 18 +++++++++++++++---
> > mm/swap.h | 2 +-
> > mm/swap_state.c | 7 +++++++
> > mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
> > mm/zswap.c | 17 +++++++++++------
> > 6 files changed, 73 insertions(+), 15 deletions(-)
>
> In general I think this aligns quite well with what I had in mind and
> an idea that was mention during LSFMM this year (the 3rd one in the
> "Issues" part, it wasn't clearly described in the cover letter, more
> details in the slides):
> https://lore.kernel.org/all/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com/
Thanks for sharing the background and more information. When I checked
Youngjun's swap.tiers patchset before his RFC, I felt it would be more
flexible to add zswap to a memcg if the zswap size could be decoupled
from the backing device. Chris's RFC can satisfy that, but I didn't
know you guys had planned more, e.g. dynamic growth of the swap size,
and the zswap slot management being like the swap table on swap slots.
Looking forward to seeing the progress and more details.
Thanks
Baoquan
>
> The good part is that we will reuse everything we have with the
> current swap stack, and stay optional. Everything is a swap device, no
> special layers required. All other features will be available in a
> cleaner way.
>
> And /etc/fstab just works the same way for the ghost swapfile.
>
> Looking forward to see this RFC get more updates.
>
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> > SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
> > SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
> > SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > + SWP_GHOST = (1 << 13), /* not backed by anything */
> > /* add others here before... */
> > };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> > void free_pages_and_swap_cache(struct encoded_page **, int);
> > /* linux/mm/swapfile.c */
> > extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> > extern long total_swap_pages;
> > extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> > return AOP_WRITEPAGE_ACTIVATE;
> > }
> >
> > - __swap_writepage(folio, swap_plug);
> > - return 0;
> > + return __swap_writepage(folio, swap_plug);
> > out_unlock:
> > folio_unlock(folio);
> > return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> > submit_bio(bio);
> > }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > {
> > struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> > VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > + if (sis->flags & SWP_GHOST) {
> > + /* Prevent the page from getting reclaimed. */
> > + folio_set_dirty(folio);
> > + return AOP_WRITEPAGE_ACTIVATE;
> > + }
> > +
> > /*
> > * ->flags can be updated non-atomicially (scan_swap_map_slots),
> > * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > swap_writepage_bdev_sync(folio, sis);
> > else
> > swap_writepage_bdev_async(folio, sis);
> > + return 0;
> > }
> >
> > void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> > if (zswap_load(folio) != -ENOENT)
> > goto finish;
> >
> > + if (unlikely(sis->flags & SWP_GHOST)) {
> > + folio_unlock(folio);
> > + goto finish;
> > + }
> > +
> > /* We have to read from slower devices. Increase zswap protection. */
> > zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> > }
> > void swap_write_unplug(struct swap_iocb *sio);
> > int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> > /* linux/mm/swap_state.c */
> > extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > struct swap_iocb *splug = NULL;
> > bool page_allocated;
> >
> > + /*
> > + * The entry may have been freed by another task. Avoid swap_info_get()
> > + * which will print error message if the race happens.
> > + */
> > + if (si->flags & SWP_GHOST)
> > + goto skip;
> > +
> > mask = swapin_nr_pages(offset) - 1;
> > if (!mask)
> > goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> > static DEFINE_SPINLOCK(swap_lock);
> > static unsigned int nr_swapfiles;
> > atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> > /*
> > * Some modules use swappable objects and may try to swap them out under
> > * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> > goto skip;
> > }
> >
> > + if (!(si->flags & SWP_GHOST))
> > + atomic_sub(1, &nr_real_swapfiles);
> > plist_del(&si->avail_list, &swap_avail_head);
> >
> > skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> > }
> >
> > plist_add(&si->avail_list, &swap_avail_head);
> > + if (!(si->flags & SWP_GHOST))
> > + atomic_add(1, &nr_real_swapfiles);
> >
> > skip:
> > spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> > struct inode *inode = mapping->host;
> > int ret;
> >
> > + if (sis->flags & SWP_GHOST) {
> > + *span = 0;
> > + return 0;
> > + }
> > +
> > if (S_ISBLK(inode->i_mode)) {
> > ret = add_swap_extent(sis, 0, sis->max, 0);
> > *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> > if (p->flags & SWP_CONTINUED)
> > free_swap_count_continuations(p);
> >
> > - if (!p->bdev || !bdev_nonrot(p->bdev))
> > + if (!(p->flags & SWP_GHOST) &&
> > + (!p->bdev || !bdev_nonrot(p->bdev)))
> > atomic_dec(&nr_rotate_swap);
> >
> > mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> > mutex_unlock(&swapon_mutex);
> > }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > + struct file *file = si->swap_file;
> > +
> > + if (si->flags & SWP_GHOST)
> > + return "ghost\t";
> > +
> > + if (S_ISBLK(file_inode(file)->i_mode))
> > + return "partition";
> > +
> > + return "file\t";
> > +}
> > +
> > static int swap_show(struct seq_file *swap, void *v)
> > {
> > struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> > len = seq_file_path(swap, file, " \t\n\\");
> > seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> > len < 40 ? 40 - len : 1, " ",
> > - S_ISBLK(file_inode(file)->i_mode) ?
> > - "partition" : "file\t",
> > + swap_type_str(si),
> > bytes, bytes < 10000000 ? "\t" : "",
> > inuse, inuse < 10000000 ? "\t" : "",
> > si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> > return 0;
> > }
> >
> > -
> > /*
> > * Find out how many pages are allowed for a single swap device. There
> > * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> > unsigned long maxpages;
> > unsigned long swapfilepages;
> > unsigned long last_page;
> > + loff_t size;
> >
> > if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> > pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> > if (!maxpages)
> > return 0;
> > - swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > + size = i_size_read(inode);
> > + if (size == PAGE_SIZE) {
> > + /* Ghost swapfile */
> > + si->bdev = NULL;
> > + si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > + return maxpages;
> > + }
>
> Here if we push things further, it might be a good idea to make better
> use of the swap file header for detecting this kind of device, and
> maybe add support for other info too. The header already has version
> info embedded in case it will be extended.
>
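To make that concrete, here is a rough userspace sketch of producing the
one-page header this series detects. The layout mirrors the kernel's
union swap_header (1024 bytes of bootbits, then version, last_page and
nr_badpages, with the SWAPSPACE2 magic in the last 10 bytes of the page).
PAGE_SIZE, the 8 GiB virtual size and the output name are assumptions for
illustration only; a real tool would presumably extend mkswap and, as
suggested above, could bump the header version or set a dedicated flag
rather than relying on the file size alone.

/* build: cc -o mkghostswap mkghostswap.c */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SIZE	4096UL
#define GHOST_PAGES	(2UL * 1024 * 1024)	/* 8 GiB of virtual swap */

int main(void)
{
	uint32_t page[PAGE_SIZE / sizeof(uint32_t)] = { 0 };
	int fd;

	/* The info fields start right after the 1024 bytes of bootbits. */
	page[1024 / sizeof(uint32_t) + 0] = 1;			/* version */
	page[1024 / sizeof(uint32_t) + 1] = GHOST_PAGES - 1;	/* last_page */
	page[1024 / sizeof(uint32_t) + 2] = 0;			/* nr_badpages */
	memcpy((char *)page + PAGE_SIZE - 10, "SWAPSPACE2", 10);

	/* A one-page file: this is what the patch treats as SWP_GHOST. */
	fd = open("ghost.swap", O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0 || write(fd, page, PAGE_SIZE) != (ssize_t)PAGE_SIZE) {
		perror("ghost.swap");
		return 1;
	}
	close(fd);
	return 0;
}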
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space. As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.
>
> The zswap write back has been disabled if all swapfiles in the system
> are ghost swap files.
>
> Signed-off-by: Chris Li <chrisl@kernel.org>
This was brought up before, I think it's not the right way to go
upstream. Even if it's good for the short-term, it's a behavior exposed
to userspace that we'll have to maintain. With the ongoing work to
decouple zswap and swap backends, this will end up being something we
have to workaround indefinitely to keep the same userspace semantics.
> ---
> include/linux/swap.h | 2 ++
> mm/page_io.c | 18 +++++++++++++++---
> mm/swap.h | 2 +-
> mm/swap_state.c | 7 +++++++
> mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
> mm/zswap.c | 17 +++++++++++------
> 6 files changed, 73 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -216,6 +216,7 @@ enum {
> SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
> SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
> SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> + SWP_GHOST = (1 << 13), /* not backed by anything */
> /* add others here before... */
> };
>
> @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> void free_pages_and_swap_cache(struct encoded_page **, int);
> /* linux/mm/swapfile.c */
> extern atomic_long_t nr_swap_pages;
> +extern atomic_t nr_real_swapfiles;
> extern long total_swap_pages;
> extern atomic_t nr_rotate_swap;
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> return AOP_WRITEPAGE_ACTIVATE;
> }
>
> - __swap_writepage(folio, swap_plug);
> - return 0;
> + return __swap_writepage(folio, swap_plug);
> out_unlock:
> folio_unlock(folio);
> return ret;
> @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> submit_bio(bio);
> }
>
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> {
> struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
>
> VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> +
> + if (sis->flags & SWP_GHOST) {
> + /* Prevent the page from getting reclaimed. */
> + folio_set_dirty(folio);
> + return AOP_WRITEPAGE_ACTIVATE;
> + }
> +
> /*
> * ->flags can be updated non-atomicially (scan_swap_map_slots),
> * but that will never affect SWP_FS_OPS, so the data_race
> @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> swap_writepage_bdev_sync(folio, sis);
> else
> swap_writepage_bdev_async(folio, sis);
> + return 0;
> }
>
> void swap_write_unplug(struct swap_iocb *sio)
> @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> if (zswap_load(folio) != -ENOENT)
> goto finish;
>
> + if (unlikely(sis->flags & SWP_GHOST)) {
> + folio_unlock(folio);
> + goto finish;
> + }
> +
> /* We have to read from slower devices. Increase zswap protection. */
> zswap_folio_swapin(folio);
>
> diff --git a/mm/swap.h b/mm/swap.h
> index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> }
> void swap_write_unplug(struct swap_iocb *sio);
> int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
>
> /* linux/mm/swap_state.c */
> extern struct address_space swap_space __ro_after_init;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> struct swap_iocb *splug = NULL;
> bool page_allocated;
>
> + /*
> + * The entry may have been freed by another task. Avoid swap_info_get()
> + * which will print error message if the race happens.
> + */
> + if (si->flags & SWP_GHOST)
> + goto skip;
> +
> mask = swapin_nr_pages(offset) - 1;
> if (!mask)
> goto skip;
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> static DEFINE_SPINLOCK(swap_lock);
> static unsigned int nr_swapfiles;
> atomic_long_t nr_swap_pages;
> +atomic_t nr_real_swapfiles;
> /*
> * Some modules use swappable objects and may try to swap them out under
> * memory pressure (via the shrinker). Before doing so, they may wish to
> @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> goto skip;
> }
>
> + if (!(si->flags & SWP_GHOST))
> + atomic_sub(1, &nr_real_swapfiles);
> plist_del(&si->avail_list, &swap_avail_head);
>
> skip:
> @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> }
>
> plist_add(&si->avail_list, &swap_avail_head);
> + if (!(si->flags & SWP_GHOST))
> + atomic_add(1, &nr_real_swapfiles);
>
> skip:
> spin_unlock(&swap_avail_lock);
> @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> struct inode *inode = mapping->host;
> int ret;
>
> + if (sis->flags & SWP_GHOST) {
> + *span = 0;
> + return 0;
> + }
> +
> if (S_ISBLK(inode->i_mode)) {
> ret = add_swap_extent(sis, 0, sis->max, 0);
> *span = sis->pages;
> @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> if (p->flags & SWP_CONTINUED)
> free_swap_count_continuations(p);
>
> - if (!p->bdev || !bdev_nonrot(p->bdev))
> + if (!(p->flags & SWP_GHOST) &&
> + (!p->bdev || !bdev_nonrot(p->bdev)))
> atomic_dec(&nr_rotate_swap);
>
> mutex_lock(&swapon_mutex);
> @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> mutex_unlock(&swapon_mutex);
> }
>
> +static const char *swap_type_str(struct swap_info_struct *si)
> +{
> + struct file *file = si->swap_file;
> +
> + if (si->flags & SWP_GHOST)
> + return "ghost\t";
> +
> + if (S_ISBLK(file_inode(file)->i_mode))
> + return "partition";
> +
> + return "file\t";
> +}
> +
> static int swap_show(struct seq_file *swap, void *v)
> {
> struct swap_info_struct *si = v;
> @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> len = seq_file_path(swap, file, " \t\n\\");
> seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> len < 40 ? 40 - len : 1, " ",
> - S_ISBLK(file_inode(file)->i_mode) ?
> - "partition" : "file\t",
> + swap_type_str(si),
> bytes, bytes < 10000000 ? "\t" : "",
> inuse, inuse < 10000000 ? "\t" : "",
> si->prio);
> @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> return 0;
> }
>
> -
> /*
> * Find out how many pages are allowed for a single swap device. There
> * are two limiting factors:
> @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> unsigned long maxpages;
> unsigned long swapfilepages;
> unsigned long last_page;
> + loff_t size;
>
> if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> pr_err("Unable to find swap-space signature\n");
> @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
> if (!maxpages)
> return 0;
> - swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> +
> + size = i_size_read(inode);
> + if (size == PAGE_SIZE) {
> + /* Ghost swapfile */
> + si->bdev = NULL;
> + si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> + return maxpages;
> + }
> +
> + swapfilepages = size >> PAGE_SHIFT;
> if (swapfilepages && maxpages > swapfilepages) {
> pr_warn("Swap area shorter than signature indicates\n");
> return 0;
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> struct folio *folio;
> struct mempolicy *mpol;
> bool folio_was_allocated;
> - struct swap_info_struct *si;
> + struct swap_info_struct *si = get_swap_device(swpentry);
> int ret = 0;
>
> - /* try to allocate swap cache folio */
> - si = get_swap_device(swpentry);
> if (!si)
> - return -EEXIST;
> + return -ENOENT;
> +
> + if (si->flags & SWP_GHOST) {
> + put_swap_device(si);
> + return -EINVAL;
> + }
>
> + /* try to allocate swap cache folio */
> mpol = get_task_policy(current);
> folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
> NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> folio_set_reclaim(folio);
>
> /* start writeback */
> - __swap_writepage(folio, NULL);
> + ret = __swap_writepage(folio, NULL);
> + WARN_ON_ONCE(ret);
>
> out:
> if (ret && ret != -EEXIST) {
> @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
> zswap_pool_put(pool);
> put_objcg:
> obj_cgroup_put(objcg);
> - if (!ret && zswap_pool_reached_full)
> + if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
> queue_work(shrink_wq, &zswap_shrink_work);
> check_old:
> /*
>
> ---
> base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> change-id: 20251121-ghost-56e3948a7a17
>
> Best regards,
> --
> Chris Li <chrisl@kernel.org>
>
On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space. As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
> >
> > The zswap write back has been disabled if all swapfiles in the system
> > are ghost swap files.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> This was brought up before, I think it's not the right way to go
> upstream. Even if it's good for the short-term, it's a behavior exposed
> to userspace that we'll have to maintain. With the ongoing work to
> decouple zswap and swap backends, this will end up being something we
> have to workaround indefinitely to keep the same userspace semantics.
Actually, this doesn't need to be the short term solution. It can be
long term. I get it your zswap maintainers do not want to get
involved in the ghost swapfile. I will leave you guys alone. Remember
2023 LPC swap abstraction talk, the community picked my approach to
the VFS swap ops over the swap abstraction which the swap
virtualization is based on. I take some time to come up with the
cluster based swap allocator and swap table to clean up and speed up
the swap stack. Now I am finally able to circle back and fulfill my
promise of the VFS swap ops. Have a little faith I will solve this
swap entry redirection issue nicely for you, better than the swap
virtualization approach can.
Chris
>
> > ---
> > include/linux/swap.h | 2 ++
> > mm/page_io.c | 18 +++++++++++++++---
> > mm/swap.h | 2 +-
> > mm/swap_state.c | 7 +++++++
> > mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++-----
> > mm/zswap.c | 17 +++++++++++------
> > 6 files changed, 73 insertions(+), 15 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 38ca3df68716042946274c18a3a6695dda3b7b65..af9b789c9ef9c0e5cf98887ab2bccd469c833c6b 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -216,6 +216,7 @@ enum {
> > SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
> > SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
> > SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
> > + SWP_GHOST = (1 << 13), /* not backed by anything */
> > /* add others here before... */
> > };
> >
> > @@ -438,6 +439,7 @@ void free_folio_and_swap_cache(struct folio *folio);
> > void free_pages_and_swap_cache(struct encoded_page **, int);
> > /* linux/mm/swapfile.c */
> > extern atomic_long_t nr_swap_pages;
> > +extern atomic_t nr_real_swapfiles;
> > extern long total_swap_pages;
> > extern atomic_t nr_rotate_swap;
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 3c342db77ce38ed26bc7aec68651270bbe0e2564..cc1eb4a068c10840bae0288e8005665c342fdc53 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -281,8 +281,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> > return AOP_WRITEPAGE_ACTIVATE;
> > }
> >
> > - __swap_writepage(folio, swap_plug);
> > - return 0;
> > + return __swap_writepage(folio, swap_plug);
> > out_unlock:
> > folio_unlock(folio);
> > return ret;
> > @@ -444,11 +443,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
> > submit_bio(bio);
> > }
> >
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > {
> > struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >
> > VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > +
> > + if (sis->flags & SWP_GHOST) {
> > + /* Prevent the page from getting reclaimed. */
> > + folio_set_dirty(folio);
> > + return AOP_WRITEPAGE_ACTIVATE;
> > + }
> > +
> > /*
> > * ->flags can be updated non-atomicially (scan_swap_map_slots),
> > * but that will never affect SWP_FS_OPS, so the data_race
> > @@ -465,6 +471,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > swap_writepage_bdev_sync(folio, sis);
> > else
> > swap_writepage_bdev_async(folio, sis);
> > + return 0;
> > }
> >
> > void swap_write_unplug(struct swap_iocb *sio)
> > @@ -637,6 +644,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> > if (zswap_load(folio) != -ENOENT)
> > goto finish;
> >
> > + if (unlikely(sis->flags & SWP_GHOST)) {
> > + folio_unlock(folio);
> > + goto finish;
> > + }
> > +
> > /* We have to read from slower devices. Increase zswap protection. */
> > zswap_folio_swapin(folio);
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index d034c13d8dd260cea2a1e95010a9df1e3011bfe4..bd60bf2c5dc9218069be0ada5d2d843399894439 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -195,7 +195,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> > }
> > void swap_write_unplug(struct swap_iocb *sio);
> > int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> > +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >
> > /* linux/mm/swap_state.c */
> > extern struct address_space swap_space __ro_after_init;
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b2230f8a48fc2c97d61d4bfb2c25e9d1e2508805..f01a8d8f32deb956e25c3c24897b0e3f6c5a735c 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -632,6 +632,13 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > struct swap_iocb *splug = NULL;
> > bool page_allocated;
> >
> > + /*
> > + * The entry may have been freed by another task. Avoid swap_info_get()
> > + * which will print error message if the race happens.
> > + */
> > + if (si->flags & SWP_GHOST)
> > + goto skip;
> > +
> > mask = swapin_nr_pages(offset) - 1;
> > if (!mask)
> > goto skip;
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 94e0f0c54168759d75bc2756e7c09f35413e6c78..a34d1eb6908ea144fd8fab1224f1520054a94992 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -66,6 +66,7 @@ static void move_cluster(struct swap_info_struct *si,
> > static DEFINE_SPINLOCK(swap_lock);
> > static unsigned int nr_swapfiles;
> > atomic_long_t nr_swap_pages;
> > +atomic_t nr_real_swapfiles;
> > /*
> > * Some modules use swappable objects and may try to swap them out under
> > * memory pressure (via the shrinker). Before doing so, they may wish to
> > @@ -1158,6 +1159,8 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> > goto skip;
> > }
> >
> > + if (!(si->flags & SWP_GHOST))
> > + atomic_sub(1, &nr_real_swapfiles);
> > plist_del(&si->avail_list, &swap_avail_head);
> >
> > skip:
> > @@ -1200,6 +1203,8 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> > }
> >
> > plist_add(&si->avail_list, &swap_avail_head);
> > + if (!(si->flags & SWP_GHOST))
> > + atomic_add(1, &nr_real_swapfiles);
> >
> > skip:
> > spin_unlock(&swap_avail_lock);
> > @@ -2677,6 +2682,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> > struct inode *inode = mapping->host;
> > int ret;
> >
> > + if (sis->flags & SWP_GHOST) {
> > + *span = 0;
> > + return 0;
> > + }
> > +
> > if (S_ISBLK(inode->i_mode)) {
> > ret = add_swap_extent(sis, 0, sis->max, 0);
> > *span = sis->pages;
> > @@ -2910,7 +2920,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> > if (p->flags & SWP_CONTINUED)
> > free_swap_count_continuations(p);
> >
> > - if (!p->bdev || !bdev_nonrot(p->bdev))
> > + if (!(p->flags & SWP_GHOST) &&
> > + (!p->bdev || !bdev_nonrot(p->bdev)))
> > atomic_dec(&nr_rotate_swap);
> >
> > mutex_lock(&swapon_mutex);
> > @@ -3030,6 +3041,19 @@ static void swap_stop(struct seq_file *swap, void *v)
> > mutex_unlock(&swapon_mutex);
> > }
> >
> > +static const char *swap_type_str(struct swap_info_struct *si)
> > +{
> > + struct file *file = si->swap_file;
> > +
> > + if (si->flags & SWP_GHOST)
> > + return "ghost\t";
> > +
> > + if (S_ISBLK(file_inode(file)->i_mode))
> > + return "partition";
> > +
> > + return "file\t";
> > +}
> > +
> > static int swap_show(struct seq_file *swap, void *v)
> > {
> > struct swap_info_struct *si = v;
> > @@ -3049,8 +3073,7 @@ static int swap_show(struct seq_file *swap, void *v)
> > len = seq_file_path(swap, file, " \t\n\\");
> > seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
> > len < 40 ? 40 - len : 1, " ",
> > - S_ISBLK(file_inode(file)->i_mode) ?
> > - "partition" : "file\t",
> > + swap_type_str(si),
> > bytes, bytes < 10000000 ? "\t" : "",
> > inuse, inuse < 10000000 ? "\t" : "",
> > si->prio);
> > @@ -3183,7 +3206,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
> > return 0;
> > }
> >
> > -
> > /*
> > * Find out how many pages are allowed for a single swap device. There
> > * are two limiting factors:
> > @@ -3229,6 +3251,7 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> > unsigned long maxpages;
> > unsigned long swapfilepages;
> > unsigned long last_page;
> > + loff_t size;
> >
> > if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
> > pr_err("Unable to find swap-space signature\n");
> > @@ -3271,7 +3294,16 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
> >
> > if (!maxpages)
> > return 0;
> > - swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
> > +
> > + size = i_size_read(inode);
> > + if (size == PAGE_SIZE) {
> > + /* Ghost swapfile */
> > + si->bdev = NULL;
> > + si->flags |= SWP_GHOST | SWP_SOLIDSTATE;
> > + return maxpages;
> > + }
> > +
> > + swapfilepages = size >> PAGE_SHIFT;
> > if (swapfilepages && maxpages > swapfilepages) {
> > pr_warn("Swap area shorter than signature indicates\n");
> > return 0;
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 5d0f8b13a958da3b5e74b63217b06e58ba2d3c26..29dfcc94b13eb72b1dbd100ded6e50620299e6e1 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1005,14 +1005,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > struct folio *folio;
> > struct mempolicy *mpol;
> > bool folio_was_allocated;
> > - struct swap_info_struct *si;
> > + struct swap_info_struct *si = get_swap_device(swpentry);
> > int ret = 0;
> >
> > - /* try to allocate swap cache folio */
> > - si = get_swap_device(swpentry);
> > if (!si)
> > - return -EEXIST;
> > + return -ENOENT;
> > +
> > + if (si->flags & SWP_GHOST) {
> > + put_swap_device(si);
> > + return -EINVAL;
> > + }
> >
> > + /* try to allocate swap cache folio */
> > mpol = get_task_policy(current);
> > folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
> > NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
> > @@ -1067,7 +1071,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > folio_set_reclaim(folio);
> >
> > /* start writeback */
> > - __swap_writepage(folio, NULL);
> > + ret = __swap_writepage(folio, NULL);
> > + WARN_ON_ONCE(ret);
> >
> > out:
> > if (ret && ret != -EEXIST) {
> > @@ -1551,7 +1556,7 @@ bool zswap_store(struct folio *folio)
> > zswap_pool_put(pool);
> > put_objcg:
> > obj_cgroup_put(objcg);
> > - if (!ret && zswap_pool_reached_full)
> > + if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles))
> > queue_work(shrink_wq, &zswap_shrink_work);
> > check_old:
> > /*
> >
> > ---
> > base-commit: 9835506e139732fa1b55aea3ed4e3ec3dd499f30
> > change-id: 20251121-ghost-56e3948a7a17
> >
> > Best regards,
> > --
> > Chris Li <chrisl@kernel.org>
> >
>
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 7:14 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> > >
> > > Signed-off-by: Chris Li <chrisl@kernel.org>
> >
> > This was brought up before, I think it's not the right way to go
> > upstream. Even if it's good for the short-term, it's a behavior exposed
> > to userspace that we'll have to maintain. With the ongoing work to
> > decouple zswap and swap backends, this will end up being something we
> > have to workaround indefinitely to keep the same userspace semantics.
>
> Actually, this doesn't need to be the short term solution. It can be
> long term. I get it your zswap maintainers do not want to get
> involved in the ghost swapfile. I will leave you guys alone. Remember
> 2023 LPC swap abstraction talk, the community picked my approach to
> the VFS swap ops over the swap abstraction which the swap
> virtualization is based on. I take some time to come up with the
> cluster based swap allocator and swap table to clean up and speed up
> the swap stack. Now I am finally able to circle back and fulfill my
> promise of the VFS swap ops. Have a little faith I will solve this
> swap entry redirection issue nicely for you, better than the swap
> virtualization approach can.

Look man, I'm not married to any idea. If your VFS approach solves our
problems, I can move on to other projects :) We have lots of
swap/memory reclaim/MM problems to solve, both internally at Meta and
upstream.

But please explain how your VFS approach solves the 3 requirements I
mentioned in the other email, and more specifically the backend
transfer requirement. I explicitly asked about it in your submission
for your 2024 LSFMMBPF talk - at that time I had not seriously started
the swap virtualization work, only the design phase. You just handwaved
it away and never really explained to me how you can achieve backend
transfer with your design:

https://lore.kernel.org/all/CAF8kJuNFtejEtjQHg5UBGduvFNn3AaGn4ffyoOrEnXfHpx6Ubg@mail.gmail.com/

I understand that you had more pressing issues to fix at the time, so I
did not bring it up during the conference. But it's an imperative
requirement for us.

swap.tiers is nice for initial placement and for hierarchy
determination in general, but when the page is already placed on one
tier and needs to be transferred to another tier, how will you move it
from one tier to another?

What zram is doing right now, IIUC, is building the redirection
internally. I would like to try avoiding repeating that for zswap, and
for every other future backend, by pulling it out of backend internal
code and building a dedicated module for it.
That is just swap virtualization.
On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> The current zswap requires a backing swapfile. The swap slot used
> by zswap is not able to be used by the swapfile. That waste swapfile
> space.
>
> The ghost swapfile is a swapfile that only contains the swapfile header
> for zswap. The swapfile header indicate the size of the swapfile. There
> is no swap data section in the ghost swapfile, therefore, no waste of
> swapfile space. As such, any write to a ghost swapfile will fail. To
> prevents accidental read or write of ghost swapfile, bdev of
> swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> flag because there is no rotation disk access when using zswap.

Zswap is primarily a compressed cache for real swap on secondary
storage. It's indeed quite important that entries currently in zswap
don't occupy disk slots; but for a solution to this to be acceptable,
it has to work with the primary usecase and support disk writeback.

This direction is a dead-end. Please take a look at Nhat's swap
virtualization patches. They decouple zswap from disk geometry, while
still supporting writeback to an actual backend file.

Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > The current zswap requires a backing swapfile. The swap slot used
> > by zswap is not able to be used by the swapfile. That waste swapfile
> > space.
> >
> > The ghost swapfile is a swapfile that only contains the swapfile header
> > for zswap. The swapfile header indicate the size of the swapfile. There
> > is no swap data section in the ghost swapfile, therefore, no waste of
> > swapfile space. As such, any write to a ghost swapfile will fail. To
> > prevents accidental read or write of ghost swapfile, bdev of
> > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > flag because there is no rotation disk access when using zswap.
>
> Zswap is primarily a compressed cache for real swap on secondary
> storage. It's indeed quite important that entries currently in zswap
> don't occupy disk slots; but for a solution to this to be acceptable,
> it has to work with the primary usecase and support disk writeback.

Well, my plan is to support the writeback via swap.tiers.

> This direction is a dead-end. Please take a look at Nhat's swap
> virtualization patches. They decouple zswap from disk geometry, while
> still supporting writeback to an actual backend file.

Yes, there are many ways to decouple zswap from disk geometry, my swap
table + swap.tiers design can do that as well. I have concerns about
swap virtualization in the aspect of adding another layer of memory
overhead addition per swap entry and CPU overhead of extra xarray
lookup. I believe my approach is technically superior and cleaner.
Both faster and cleaner. Basically swap.tiers + VFS like swap read
write page ops. I will let Nhat clarify the performance and memory
overhead side of the swap virtualization.

I am not against swap entry redirection. Just the swap virtualization
series needs to compare against the alternatives in terms of memory
overhead and throughput. Solving it from the swap.tiers angle is
cleaner.

> Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

I take that the only relevant part is you are zswap maintainer and I
am the swap maintainer. Fine. I got the message. I will leave the
zswap alone. I will find other ways to address the memory base swap
tiers in swap.tiers.

Chris
Hi Johannes,

On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
>
> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> I take that the only relevant part is you are zswap maintainer and I
> am the swap maintainer. Fine. I got the message. I will leave the
> zswap alone. I will find other ways to address the memory base swap
> tiers in swap.tiers.

I am sorry that I have said that. Let me take back what I said above.
I was upset when I considered you and others blocking the more optimal
solution and in favor of the less optimal solution. That is my short
temper, as usual.

Now I can see that you might not see one as more optimal than the
other as convincing as I do, or I haven't done a good job explaining
it.

Let me offer my sincere apology. I will reply to the technical aspect
of the question in other email.

Chris
On Tue, Nov 25, 2025 at 10:14:40PM +0400, Chris Li wrote:
> Hi Johannes,
>
> On Sat, Nov 22, 2025 at 5:52 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>
> >
> > I take that the only relevant part is you are zswap maintainer and I
> > am the swap maintainer. Fine. I got the message. I will leave the
> > zswap alone. I will find other ways to address the memory base swap
> > tiers in swap.tiers.
>
> I am sorry that I have said that. Let me take back what I said above.
> I was upset when I considered you and others blocking the more optimal
> solution and in favor of the less optimal solution. That is my short
> temper, as usual.
>
> Now I can see that you might not see one as more optimal than the
> other as convincing as I do, or I haven't done a good job explaining
> it.
>
> Let me offer my sincere apology. I will reply to the technical aspect
> of the question in other email.

Thanks Chris. No hard feelings.
On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Zswap is primarily a compressed cache for real swap on secondary
> > storage. It's indeed quite important that entries currently in zswap
> > don't occupy disk slots; but for a solution to this to be acceptable,
> > it has to work with the primary usecase and support disk writeback.
>
> Well, my plan is to support the writeback via swap.tiers.

Do you have a link to that proposal?

My understanding of swap tiers was about grouping different swapfiles
and assigning them to cgroups. The issue with writeback is relocating
the data that a swp_entry_t page table refers to - without having to
find and update all the possible page tables. I'm not sure how
swap.tiers solve this problem.

> > This direction is a dead-end. Please take a look at Nhat's swap
> > virtualization patches. They decouple zswap from disk geometry, while
> > still supporting writeback to an actual backend file.
>
> Yes, there are many ways to decouple zswap from disk geometry, my swap
> table + swap.tiers design can do that as well. I have concerns about
> swap virtualization in the aspect of adding another layer of memory
> overhead addition per swap entry and CPU overhead of extra xarray
> lookup. I believe my approach is technically superior and cleaner.
> Both faster and cleaner. Basically swap.tiers + VFS like swap read
> write page ops. I will let Nhat clarify the performance and memory
> overhead side of the swap virtualization.

I'm happy to discuss it.

But keep in mind that the swap virtualization idea is a collaborative
product of quite a few people with an extensive combined upstream
record. Quite a bit of thought has gone into balancing static vs
runtime costs of that proposal. So you'll forgive me if I'm a bit
skeptical of the somewhat grandiose claims of one person that is new
to upstream development.

As to your specific points - we use xarray lookups in the page cache
fast path. It's a bold claim to say this would be too much overhead
during swapins.

Two, it's not clear to me how you want to make writeback efficient
*without* any sort of swap entry redirection. Walking all relevant
page tables is expensive; and you have to be able to find them first.

If you're talking about a redirection array as opposed to a tree -
static sizing of the compressed space is also a no-go. Zswap
utilization varies *widely* between workloads and different workload
combinations. Further, zswap consumes the same fungible resource as
uncompressed memory - there is really no excuse to burden users with
static sizing questions about this pool.
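For readers following along, a highly simplified sketch of the kind of
xarray-backed redirection being debated here: a per-virtual-slot descriptor
that records whether the data sits in zswap or has been written back to a
physical slot. The names (vswap_desc, vswap_map and the helpers) are invented
for illustration and are not taken from the actual swap virtualization series;
only the xarray calls themselves are the real kernel API.

#include <linux/xarray.h>
#include <linux/slab.h>
#include <linux/swap.h>

/* Where the data for one virtual swap slot currently lives. */
struct vswap_desc {
	bool in_zswap;		/* compressed copy lives in zswap */
	swp_entry_t backing;	/* physical slot after writeback */
};

static DEFINE_XARRAY_ALLOC(vswap_map);

/* Allocate a virtual slot; the page table would encode 'id'. */
static int vswap_alloc(u32 *id)
{
	struct vswap_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
	int ret;

	if (!desc)
		return -ENOMEM;
	desc->in_zswap = true;
	ret = xa_alloc(&vswap_map, id, desc, xa_limit_32b, GFP_KERNEL);
	if (ret)
		kfree(desc);
	return ret;
}

/* Writeback: repoint the virtual slot at a real backing slot. */
static void vswap_redirect(u32 id, swp_entry_t backing)
{
	struct vswap_desc *desc = xa_load(&vswap_map, id);

	desc->backing = backing;
	desc->in_zswap = false;
}

/* Swapin: one extra lookup tells us where the data lives today. */
static struct vswap_desc *vswap_lookup(u32 id)
{
	return xa_load(&vswap_map, id);
}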
On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote:
> On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote:
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in zswap
> > > don't occupy disk slots; but for a solution to this to be acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
>
> Do you have a link to that proposal?
>
> My understanding of swap tiers was about grouping different swapfiles
> and assigning them to cgroups. The issue with writeback is relocating
> the data that a swp_entry_t page table refers to - without having to
> find and update all the possible page tables. I'm not sure how
> swap.tiers solve this problem.
>
> > > This direction is a dead-end. Please take a look at Nhat's swap
> > > virtualization patches. They decouple zswap from disk geometry, while
> > > still supporting writeback to an actual backend file.
> >
> > Yes, there are many ways to decouple zswap from disk geometry, my swap
> > table + swap.tiers design can do that as well. I have concerns about
> > swap virtualization in the aspect of adding another layer of memory
> > overhead addition per swap entry and CPU overhead of extra xarray
> > lookup. I believe my approach is technically superior and cleaner.
> > Both faster and cleaner. Basically swap.tiers + VFS like swap read
> > write page ops. I will let Nhat clarify the performance and memory
> > overhead side of the swap virtualization.
>
> I'm happy to discuss it.
>
> But keep in mind that the swap virtualization idea is a collaborative
> product of quite a few people with an extensive combined upstream
> record. Quite a bit of thought has gone into balancing static vs
> runtime costs of that proposal. So you'll forgive me if I'm a bit
> skeptical of the somewhat grandiose claims of one person that is new
> to upstream development.
>
> As to your specific points - we use xarray lookups in the page cache
> fast path. It's a bold claim to say this would be too much overhead
> during swapins.
>
> Two, it's not clear to me how you want to make writeback efficient
> *without* any sort of swap entry redirection. Walking all relevant
> page tables is expensive; and you have to be able to find them first.
>
> If you're talking about a redirection array as opposed to a tree -
> static sizing of the compressed space is also a no-go. Zswap
> utilization varies *widely* between workloads and different workload
> combinations. Further, zswap consumes the same fungible resource as
> uncompressed memory - there is really no excuse to burden users with
> static sizing questions about this pool.

I think what Chris's idea is (and Chris correct me if I am wrong), is
that we use ghost swapfiles (that are not backed by disk space) for
zswap. So zswap has its own swapfiles, separate from disk swapfiles.

memory.tiers establishes the ordering between swapfiles, so you put
"ghost" -> "real" to get today's zswap writeback behavior. When you
writeback, you keep page tables pointing at the swap entry in the ghost
swapfile. What you do is:
- Allocate a new swap entry in the "real" swapfile.
- Update the swap table of the "ghost" swapfile to point at the swap
entry in the "real" swapfile, reusing the pointer used for the
swapcache.

Then, on swapin, you read the swap table of the "ghost" swapfile, find
the redirection, and read to the swap table of the "real" swapfile, then
read the page from disk into the swap cache. The redirection in the
"ghost" swapfile will keep existing, wasting that slot, until all
references to it are dropped.

I think this might work for this specific use case, with less overhead
than the xarray. BUT there are a few scenarios that are not covered
AFAICT:

- You still need to statically size the ghost swapfiles and their
overheads.
- Wasting a slot in the ghost swapfile for the redirection. This
complicates static provisioning a bit, because you have to account for
entries that will be in zswap as well as writtenback. Furthermore,
IIUC swap.tiers is intended to be generic and cover other use cases
beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
SSD when we writeback to the HDD is a much bigger problem.
- We still cannot do swapoff efficiently as we need to walk the page
tables (and some swap tables) to find and swapin all entries in a
swapfile. Not as important as other things, but worth mentioning.

Chris please let me know if I didn't get this right.
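A self-contained toy model of the redirection flow described above, in plain
C rather than kernel code: one word per ghost slot either marks the data as
living in zswap or names the real-device slot it was written back to, while
the page tables keep pointing at the ghost slot throughout. The encoding and
sizes are purely illustrative.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS		8
#define IN_ZSWAP	UINT32_MAX	/* data still lives in zswap */

/* One word per ghost slot: IN_ZSWAP, or the real slot it moved to. */
static uint32_t ghost_table[NSLOTS];

static void ghost_store(uint32_t ghost_slot)
{
	ghost_table[ghost_slot] = IN_ZSWAP;
}

/* Writeback: allocate a real slot and leave a redirection behind. */
static void ghost_writeback(uint32_t ghost_slot, uint32_t real_slot)
{
	assert(ghost_table[ghost_slot] == IN_ZSWAP);
	ghost_table[ghost_slot] = real_slot;
}

/* Swapin: the ghost slot tells us where the data lives today. */
static void ghost_swapin(uint32_t ghost_slot)
{
	uint32_t loc = ghost_table[ghost_slot];

	if (loc == IN_ZSWAP)
		printf("slot %u: decompress from zswap\n", ghost_slot);
	else
		printf("slot %u: read real-device slot %u\n", ghost_slot, loc);
}

int main(void)
{
	ghost_store(3);
	ghost_swapin(3);	/* still in zswap */
	ghost_writeback(3, 42);	/* moved to the real swapfile */
	ghost_swapin(3);	/* follows the redirection */
	return 0;
}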
On Mon, Nov 24, 2025 at 11:32 PM Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> I think what Chris's idea is (and Chris correct me if I am wrong), is
> that we use ghost swapfiles (that are not backed by disk space) for
> zswap. So zswap has its own swapfiles, separate from disk swapfiles.

Ack.

> memory.tiers establishes the ordering between swapfiles, so you put
> "ghost" -> "real" to get today's zswap writeback behavior. When you
> writeback, you keep page tables pointing at the swap entry in the ghost
> swapfile. What you do is:
> - Allocate a new swap entry in the "real" swapfile.
> - Update the swap table of the "ghost" swapfile to point at the swap
> entry in the "real" swapfile, reusing the pointer used for the
> swapcache.

Ack, with a minor adjustment in mapping the swap entry to the physical
location. The swap entry has swap cache, the physical location does
not.

> Then, on swapin, you read the swap table of the "ghost" swapfile, find
> the redirection, and read to the swap table of the "real" swapfile, then
> read the page from disk into the swap cache. The redirection in the
> "ghost" swapfile will keep existing, wasting that slot, until all
> references to it are dropped.

Ack. That is assuming we don't have an rmap-alike for the swap entry.

> I think this might work for this specific use case, with less overhead
> than the xarray. BUT there are a few scenarios that are not covered
> AFAICT:
>
> - You still need to statically size the ghost swapfiles and their
> overheads.

Not true, both the ghost swapfile and the physical swapfile can expand
by additional clusters beyond the original physical size, for
allocating continued high order entries or redirections. For a ghost
swapfile, there is no physical layer, only the front end. So the size
can grow dynamically. Just allocate more clusters. The current swapfile
header file size is just an initial size.

My current patch does not implement that. It will need some later swap
table phase to make it happen. But that is not an architecture limit,
it has been considered as part of normal business.

> - Wasting a slot in the ghost swapfile for the redirection. This
> complicates static provisioning a bit, because you have to account for
> entries that will be in zswap as well as writtenback. Furthermore,
> IIUC swap.tiers is intended to be generic and cover other use cases
> beyond zswap like SSD -> HDD. For that, I think wasting a slot in the
> SSD when we writeback to the HDD is a much bigger problem.

Yes and no. Yes, it only wastes a front end swap entry (with swap
cache). The physical location is a separate layer. No, the physical SSD
space is not wasted, because you can allocate an additional front end
swap entry by growing the swap entry front end, then have the
additional front end swap entry point to the physical location you just
redirected away from.

There is a lot more consideration of the front end vs the physical
layer. The physical layer does not care about location order or 2^N
size alignment. The physical layer cares a bit about continuity and the
number of IOVs that it needs to issue. The swap entry front end and the
physical layer have slightly different constraints.

> - We still cannot do swapoff efficiently as we need to walk the page
> tables (and some swap tables) to find and swapin all entries in a
> swapfile. Not as important as other things, but worth mentioning.

That needs rmap for swap entries. It is an independent issue.

Chris
On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > - We still cannot do swapoff efficiently as we need to walk the > > page > > tables (and some swap tables) to find and swapin all entries in a > > swapfile. Not as important as other things, but worth mentioning. > > That need rmap for swap entries. It It is an independent issue. > Wouldn't rmap for swap entries be more expensive than simply always having indirection for swap entries that are in use? With indirection, swapoff can just move pages from the being-swapoffed device into the swap cache, and if needed the memory can then be moved to another swap device, without ever needing to find the page tables. This sounds like an uncommon scenario, but it is functionally identical to what is done to pages during zswap writeback, where the page table entries stay unchanged, and the swap page is simply moved to another backend location. Why implement two things, when we can have one thing that does both, with no extra complexity over what zswap writeback needs? -- All Rights Reversed.
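A sketch of the swapoff flow Rik outlines: with an always-present indirection layer, swapoff rewrites only the indirection entries and never has to locate page tables. Every function below is hypothetical and refcounting is simplified.

static int swapoff_relocate_slot(struct swap_info_struct *old_si, pgoff_t offset)
{
	swp_entry_t front = front_entry_of(old_si, offset);	/* what the PTEs hold */
	struct folio *folio;

	folio = swap_cache_read(front);		/* bring the data into the swap cache */
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * Re-point the indirection at the swap cache (or at a slot on another
	 * device).  The swp_entry_t values in page tables still name "front",
	 * so they stay valid without being touched.
	 */
	front_entry_set_backing(front, folio);
	backend_slot_free(old_si, offset);
	folio_put(folio);
	return 0;
}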
On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> wrote: > > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > > > - We still cannot do swapoff efficiently as we need to walk the > > > page > > > tables (and some swap tables) to find and swapin all entries in a > > > swapfile. Not as important as other things, but worth mentioning. > > > > That need rmap for swap entries. It It is an independent issue. > > > > Wouldn't rmap for swap entries be more expensive than > simply always having indirection for swap entries that > are in use? It might be, to be frank. But I consider it pretty late in the game to be evaluating rmap and its alternatives at this stage. Do you agree? I might or might not try rmap for swap entries; right now I don't have many data points or insights. > With indirection, swapoff can just move pages from > the being-swapoffed device into the swap cache, and > if needed the memory can then be moved to another > swap device, without ever needing to find the page > tables. Ack. I don't think we have any disagreement here. > This sounds like an uncommon scenario, but it is > functionally identical to what is done to pages > during zswap writeback, where the page table entries > stay unchanged, and the swap page is simply moved > to another backend location. > > Why implement two things, when we can have one > thing that does both, with no extra complexity > over what zswap writeback needs? Let me ask you a clarifying question, then. 1) What exactly are you proposing here, and for which project: VS or the Swap Pony? 2) At what stage of the code changes do you have in mind this should apply? I can't speak for VS, but I am open to embracing what you suggest for the Swap Pony project, once I understand it first. Chris
On Thu, 2025-11-27 at 06:07 +0400, Chris Li wrote: > On Thu, Nov 27, 2025 at 1:59 AM Rik van Riel <riel@surriel.com> > wrote: > > > > On Tue, 2025-11-25 at 22:50 +0400, Chris Li wrote: > > > > > > > - We still cannot do swapoff efficiently as we need to walk the > > > > page > > > > tables (and some swap tables) to find and swapin all entries > > > > in a > > > > swapfile. Not as important as other things, but worth > > > > mentioning. > > > > > > That need rmap for swap entries. It It is an independent issue. > > > > > > > Wouldn't rmap for swap entries be more expensive than > > simply always having indirection for swap entries that > > are in use? > > It might be, to be frank. I consider this pretty far and late in the > stage of the game to evaluate the rmap and its alternatives. Do you > agree? > > I might or might not try the rmap for swap entry. Right now I don't > have many data points nor insights. On the contrary. I think we should at least do some back of the envelope calculations to estimate the overhead of the different proposed solutions. With both Nhat's vswap, and your proposal to always have swap indirection with a separate front end, and several back ends, there is no need for swap rmap. This is a good thing, because a single swap slot could be referenced by dozens, hundreds, or even thousands of page table entries, in the case of forking servers. This creates complexity which is probably best avoided. Conceptually, Nhat's vswap, and your idea of having always-on swap indirection seem to be the same thing. > > > This sounds like an uncommon scenario, but it is > > functionally identical to what is done to pages > > during zswap writeback, where the page table entries > > stay unchanged, and the swap page is simply moved > > to another backend location. > > > > Why implement two things, when we can have one > > thing that does both, with no extra complexity > > over what zswap writeback needs? > > Let me ask you a clarifying question, then. > > 1) What exactly are you trying to propose here in what project? VS or > swap the pony? In the past, when faced with competing code bases like this, one thing that has worked well is for both developers to send their code to the list, and then for both developers to send each other suggestions (or diffs) to improve each other's code. Vswap and your always-on indirection seem to do exactly the same thing. This seems like a good opportunity to work together, and come up with code that is better than any one person's code. > 2) What stage of the code change do you have in mind should this > change apply to? I think it makes sense to get the hard design problems resolved before committing to one particular code design. Spending months to resolve subtle bugs in a code base, only to discover later that it does not do exactly what is needed, is not the greatest way to make progress. > > I can't speak for VS, I am open to embrace what you suggest in order > to swap the pony project, that is after I understand it first. > Once both Nhat and you understand each other's code, and have suggestions for each other on how to improve it, we will likely end up with a code base that looks nicer than either of you would have done by yourselves. The more perspectives, the better. -- All Rights Reversed.
On Mon, Nov 24, 2025 at 11:32 AM Yosry Ahmed <yosry.ahmed@linux.dev> wrote: > > On Mon, Nov 24, 2025 at 12:27:17PM -0500, Johannes Weiner wrote: > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > > The current zswap requires a backing swapfile. The swap slot used > > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > > space. > > > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > > flag because there is no rotation disk access when using zswap. > > > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > > storage. It's indeed quite important that entries currently in zswap > > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > > it has to work with the primary usecase and support disk writeback. > > > > > > Well, my plan is to support the writeback via swap.tiers. > > > > Do you have a link to that proposal? > > > > My understanding of swap tiers was about grouping different swapfiles > > and assigning them to cgroups. The issue with writeback is relocating > > the data that a swp_entry_t page table refers to - without having to > > find and update all the possible page tables. I'm not sure how > > swap.tiers solve this problem. > > > > > > This direction is a dead-end. Please take a look at Nhat's swap > > > > virtualization patches. They decouple zswap from disk geometry, while > > > > still supporting writeback to an actual backend file. > > > > > > Yes, there are many ways to decouple zswap from disk geometry, my swap > > > table + swap.tiers design can do that as well. I have concerns about > > > swap virtualization in the aspect of adding another layer of memory > > > overhead addition per swap entry and CPU overhead of extra xarray > > > lookup. I believe my approach is technically superior and cleaner. > > > Both faster and cleaner. Basically swap.tiers + VFS like swap read > > > write page ops. I will let Nhat clarify the performance and memory > > > overhead side of the swap virtualization. > > > > I'm happy to discuss it. > > > > But keep in mind that the swap virtualization idea is a collaborative > > product of quite a few people with an extensive combined upstream > > record. Quite a bit of thought has gone into balancing static vs > > runtime costs of that proposal. So you'll forgive me if I'm a bit > > skeptical of the somewhat grandiose claims of one person that is new > > to upstream development. > > > > As to your specific points - we use xarray lookups in the page cache > > fast path. It's a bold claim to say this would be too much overhead > > during swapins. > > > > Two, it's not clear to me how you want to make writeback efficient > > *without* any sort of swap entry redirection. Walking all relevant > > page tables is expensive; and you have to be able to find them first. 
> > > > If you're talking about a redirection array as opposed to a tree - > > static sizing of the compressed space is also a no-go. Zswap > > utilization varies *widely* between workloads and different workload > > combinations. Further, zswap consumes the same fungible resource as > > uncompressed memory - there is really no excuse to burden users with > > static sizing questions about this pool. > > I think what Chris's idea is (and Chris correct me if I am wrong), is > that we use ghost swapfiles (that are not backed by disk space) for > zswap. So zswap has its own swapfiles, separate from disk swapfiles. > > memory.tiers establishes the ordering between swapfiles, so you put > "ghost" -> "real" to get today's zswap writeback behavior. When you > writeback, you keep page tables pointing at the swap entry in the ghost > swapfile. What you do is: > - Allocate a new swap entry in the "real" swapfile. > - Update the swap table of the "ghost" swapfile to point at the swap > entry in the "real" swapfile, reusing the pointer used for the > swapcache. > > Then, on swapin, you read the swap table of the "ghost" swapfile, find > the redirection, and read to the swap table of the "real" swapfile, then > read the page from disk into the swap cache. The redirection in the > "ghost" swapfile will keep existing, wasting that slot, until all > references to it are dropped. > > I think this might work for this specific use case, with less overhead > than the xarray. BUT there are a few scenarios that are not covered > AFAICT: Thanks for explaining these issues better than I could :) > > - You still need to statically size the ghost swapfiles and their > overheads. Yes. > > - Wasting a slot in the ghost swapfile for the redirection. This > complicates static provisioning a bit, because you have to account for > entries that will be in zswap as well as writtenback. Furthermore, > IIUC swap.tiers is intended to be generic and cover other use cases > beyond zswap like SSD -> HDD. For that, I think wasting a slot in the > SSD when we writeback to the HDD is a much bigger problem. Yep. We are trying to get away from static provisioning as much as we can - this design digs us deeper in the hole. Who the hell know what's the zswap:disk swap split is going to be? It's going to depend on access patterns and compressibility. > > - We still cannot do swapoff efficiently as we need to walk the page > tables (and some swap tables) to find and swapin all entries in a > swapfile. Not as important as other things, but worth mentioning. Yeah I left swapoff out of it, because it is just another use case. But yes we can't do swapoff efficiently easily either. And in general, it's going to be a very rigid design for more complicated backend change (pre-fetching from one tier to another, or compaction).
On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > The current zswap requires a backing swapfile. The swap slot used > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > space. > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > flag because there is no rotation disk access when using zswap. > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > storage. It's indeed quite important that entries currently in zswap > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > it has to work with the primary usecase and support disk writeback. > > > > Well, my plan is to support the writeback via swap.tiers. > > Do you have a link to that proposal? My 2024 LSF swap pony talk already has a mechanism to redirect page cache swap entries to different physical locations. That can also work for redirecting swap entries in different swapfiles. https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > My understanding of swap tiers was about grouping different swapfiles > and assigning them to cgroups. The issue with writeback is relocating > the data that a swp_entry_t page table refers to - without having to > find and update all the possible page tables. I'm not sure how > swap.tiers solve this problem. swap.tiers is part of the picture. You are right the LPC topic mostly covers the per cgroup portion. The VFS swap ops are my two slides of the LPC 2023. You read from one swap file and write to another swap file with a new swap entry allocated. > > > This direction is a dead-end. Please take a look at Nhat's swap > > > virtualization patches. They decouple zswap from disk geometry, while > > > still supporting writeback to an actual backend file. > > > > Yes, there are many ways to decouple zswap from disk geometry, my swap > > table + swap.tiers design can do that as well. I have concerns about > > swap virtualization in the aspect of adding another layer of memory > > overhead addition per swap entry and CPU overhead of extra xarray > > lookup. I believe my approach is technically superior and cleaner. > > Both faster and cleaner. Basically swap.tiers + VFS like swap read > > write page ops. I will let Nhat clarify the performance and memory > > overhead side of the swap virtualization. > > I'm happy to discuss it. > > But keep in mind that the swap virtualization idea is a collaborative > product of quite a few people with an extensive combined upstream > record. Quite a bit of thought has gone into balancing static vs > runtime costs of that proposal. So you'll forgive me if I'm a bit > skeptical of the somewhat grandiose claims of one person that is new > to upstream development. Collaborating with which companies developers? How many VS patches landed in the kernel? 
I am also collaborating with different developers: the cluster-based swap allocator, swap table phase I, removing the NUMA node swapfile priority. Those were all suggested by me. > As to your specific points - we use xarray lookups in the page cache > fast path. It's a bold claim to say this would be too much overhead > during swapins. Yes, we just got rid of the xarray in the swap cache lookup and got some performance gain from it. You are saying one extra xarray is no problem; can your team demo some performance numbers for the impact of the extra xarray lookup in VS? Just run some swap benchmarks and share the result. We can do a test right now, without writing back to another SSD: the ghost swapfile compared with VS for the zswap-only case. > Two, it's not clear to me how you want to make writeback efficient > *without* any sort of swap entry redirection. Walking all relevant > page tables is expensive; and you have to be able to find them first. The swap cache can have a physical location redirection, see my 2024 LPC slides. I had considered that well before the VS discussion. https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > If you're talking about a redirection array as opposed to a tree - > static sizing of the compressed space is also a no-go. Zswap > utilization varies *widely* between workloads and different workload > combinations. Further, zswap consumes the same fungible resource as > uncompressed memory - there is really no excuse to burden users with > static sizing questions about this pool. I do believe the swap table + swap.tiers + swap ops can do better. We can test the memory-only case right now. A head-to-head test of VS and swap.tiers on the writeback case will need to wait a bit; the swap table is only at phase II of review. I mean CPU and per-swap-entry overhead. I care less about whose idea it is, and more about the end result performance (memory & CPU). I want the best idea/implementation to win. Chris
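A sketch of the "read from one swapfile, write to another with a new swap entry allocated" step referred to above. swap_writeout() is the real helper from mm/page_io.c; everything else (tier selection, entry allocation, the redirection update) is hypothetical, and swap cache, locking and reference handling are grossly simplified.

static int writeback_to_lower_tier(swp_entry_t ghost, struct swap_info_struct *lower)
{
	struct folio *folio;
	swp_entry_t real;

	folio = read_from_upper_tier(ghost);	/* e.g. decompress from zswap; hypothetical */
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	real = alloc_swap_slot_on(lower, folio_order(folio));	/* hypothetical allocator call */
	if (!real.val) {
		folio_put(folio);
		return -ENOMEM;
	}

	folio->swap = real;			/* re-key the folio to the new slot */
	swap_writeout(folio, NULL);		/* submit the write to the lower tier */

	/* Leave a redirection so PTEs that still hold "ghost" keep working. */
	swap_table_set_redirect(ghost, real);	/* hypothetical */
	folio_put(folio);
	return 0;
}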
On Mon, Nov 24, 2025 at 09:24:18PM +0300, Chris Li wrote: > On Mon, Nov 24, 2025 at 8:27 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 05:52:09PM -0800, Chris Li wrote: > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > > > The current zswap requires a backing swapfile. The swap slot used > > > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > > > space. > > > > > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > > > prevents accidental read or write of ghost swapfile, bdev of > > > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > > > flag because there is no rotation disk access when using zswap. > > > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > > > storage. It's indeed quite important that entries currently in zswap > > > > don't occupy disk slots; but for a solution to this to be acceptable, > > > > it has to work with the primary usecase and support disk writeback. > > > > > > Well, my plan is to support the writeback via swap.tiers. > > > > Do you have a link to that proposal? > > My 2024 LSF swap pony talk already has a mechanism to redirect page > cache swap entries to different physical locations. > That can also work for redirecting swap entries in different swapfiles. > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ I looked through your slides and the LWN article, but it's very hard for me to find answers to my questions in there. In your proposal, let's say you have a swp_entry_t in the page table. What does it describe, and what are the data structures to get from this key to user data in the following scenarios: - Data is in a swapfile - Data is in zswap - Data is in being written from zswap to a swapfile - Data is back in memory due to a fault from another page table > > My understanding of swap tiers was about grouping different swapfiles > > and assigning them to cgroups. The issue with writeback is relocating > > the data that a swp_entry_t page table refers to - without having to > > find and update all the possible page tables. I'm not sure how > > swap.tiers solve this problem. > > swap.tiers is part of the picture. You are right the LPC topic mostly > covers the per cgroup portion. The VFS swap ops are my two slides of > the LPC 2023. You read from one swap file and write to another swap > file with a new swap entry allocated. Ok, and from what you wrote below, presumably at this point you would put a redirection pointer in the old location to point to the new one. This way you only have the indirection IF such a relocation actually happened, correct? But how do you store new data in the freed up old slot? > > As to your specific points - we use xarray lookups in the page cache > > fast path. It's a bold claim to say this would be too much overhead > > during swapins. > > Yes, we just get rid of xarray in swap cache lookup and get some > performance gain from it. > You are saying one extra xarray is no problem, can your team demo some > performance number of impact of the extra xarray lookup in VS? 
Just > run some swap benchmarks and share the result. Average and worst-case for all common usecases matter. There is no code on your side for the writeback case. (And it's exceedingly difficult to even get a mental model of how it would work from your responses and the slides you have linked). > > Two, it's not clear to me how you want to make writeback efficient > > *without* any sort of swap entry redirection. Walking all relevant > > page tables is expensive; and you have to be able to find them first. > > Swap cache can have a physical location redirection, see my 2024 LPC > slides. I have considered that way before the VS discussion. > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ There are no matches for "redir" in either the email or the slides.
On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > Do you have a link to that proposal? > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > cache swap entries to different physical locations. > > That can also work for redirecting swap entries in different swapfiles. > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > I looked through your slides and the LWN article, but it's very hard > for me to find answers to my questions in there. Naturally, the slides are only intended to cover what may be phase VII of the current swap table plan. But they do have the physical location pointer consideration. > In your proposal, let's say you have a swp_entry_t in the page > table. What does it describe, and what are the data structures to get > from this key to user data in the following scenarios: Please keep in mind that I don't have every design detail laid out. I follow the first principle that redirecting a swap entry should only take an additional 4 bytes per swap entry. VS blows up the swap entry size by something like 24 bytes? I am pretty sure I am wrong about the exact value; people who are familiar with VS, please correct me. My impression is that it is too far away from the first-principle value for me to even consider. Exceptions can be made, but not that far. I will try my best to answer your questions, but usually I am more glad to work with someone who is going to implement it to iron out all the details. Right now that is a bit too far off. > - Data is in a swapfile Same as current. > - Data is in zswap I have now realized that what I want from the memory swap tier is actually not the same as today's zswap. I don't want the current behavior of zswap in swap.tiers. Today zswap sits in front of every swapfile, and zswap.writeback does not say which particular swapfile it wants to write to. That creates problems for including zswap as it is in the per-memcg swap.tiers. I don't want zswap to take another swapfile's swap entry and write through to it. If data is in the memory tier swapfile, the swap entry looks up the actual data without redirection. > - Data is in being written from zswap to a swapfile It will look up the swap table and find a physical pointer, which points to the physical device and offset holding the data. > - Data is back in memory due to a fault from another page table In the swap cache, similar to today's swapfile. > > > My understanding of swap tiers was about grouping different swapfiles > > > and assigning them to cgroups. The issue with writeback is relocating > > > the data that a swp_entry_t page table refers to - without having to > > > find and update all the possible page tables. I'm not sure how > > > swap.tiers solve this problem. > > > > swap.tiers is part of the picture. You are right the LPC topic mostly > > covers the per cgroup portion. The VFS swap ops are my two slides of > > the LPC 2023. You read from one swap file and write to another swap > > file with a new swap entry allocated. > > Ok, and from what you wrote below, presumably at this point you would > put a redirection pointer in the old location to point to the new one. The swap entry front end (which also owns the swap cache) points to a physical location. > > This way you only have the indirection IF such a relocation actually > happened, correct? Right. The more common case does not have the redirection. > But how do you store new data in the freed up old slot?
That is the split between the front end swap entry and the physical back end. The front end swap entry can't be freed until all users release the swap count; the physical back end can be freed. The freed physical blocks caused by redirection will likely have a different allocator, not the cluster-based swap allocator, because those are just pure blocks. > > > > As to your specific points - we use xarray lookups in the page cache > > > fast path. It's a bold claim to say this would be too much overhead > > > during swapins. > > > > Yes, we just get rid of xarray in swap cache lookup and get some > > performance gain from it. > > You are saying one extra xarray is no problem, can your team demo some > > performance number of impact of the extra xarray lookup in VS? Just > > run some swap benchmarks and share the result. > > Average and worst-case for all common usecases matter. There is no > code on your side for the writeback case. (And it's exceedingly > difficult to even get a mental model of how it would work from your > responses and the slides you have linked). As I said, that slide is only intended to explain how the physical redirection works with the swap cache in swap table phase VII. swap.tiers defines tiers for swap, and obviously how to move data between the tiers is a natural consideration; I mention that in two slides of the 2023 talk. I don't plan that level of detail that far ahead. I try to follow the first principle as best as I can. A lot of decisions will only be made in the later phases. > > > Two, it's not clear to me how you want to make writeback efficient > > > *without* any sort of swap entry redirection. Walking all relevant > > > page tables is expensive; and you have to be able to find them first. > > > > Swap cache can have a physical location redirection, see my 2024 LPC > > slides. I have considered that way before the VS discussion. > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > There are no matches for "redir" in either the email or the slides. Yes, I use a different term in the slides. The continuous entry is the source of the redirection; the non-continuous one is the destination. But in my mind I am not redirecting swap entries: the swap entry might have an optional physical location pointer. That is the front end swap entry and physical layer split. Chris
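One way to picture the "optional physical location pointer" described above: one machine word per front-end entry, with a couple of tag bits saying what the word currently holds. The encoding below is invented for this sketch (64-bit assumed) and does not match the actual swap table series.

#define FRONT_TAG_FOLIO		0UL	/* folio pointer: data is in the swap cache */
#define FRONT_TAG_SHADOW	1UL	/* workingset shadow after reclaim */
#define FRONT_TAG_PHYS		2UL	/* redirection to a physical backend slot */
#define FRONT_TAG_MASK		3UL

static inline unsigned long front_make_phys(unsigned int backend_id, unsigned long slot)
{
	/* backend id in a few high bits, slot offset in the middle, tag in the low bits */
	return ((unsigned long)backend_id << 56) | (slot << 2) | FRONT_TAG_PHYS;
}

static inline bool front_is_phys(unsigned long val)
{
	return (val & FRONT_TAG_MASK) == FRONT_TAG_PHYS;
}

Under this kind of scheme the common (non-written-back) case pays nothing extra; only redirected entries carry the physical pointer.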
On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote: > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > Do you have a link to that proposal? > > > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > > cache swap entries to different physical locations. > > > That can also work for redirecting swap entries in different swapfiles. > > > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > > > I looked through your slides and the LWN article, but it's very hard > > for me to find answers to my questions in there. > > Naturally, the slide is only intended to cover what is in the current > swap table may be phase VII. > But it does have the physical location pointer consideration. > > > In your proposal, let's say you have a swp_entry_t in the page > > table. What does it describe, and what are the data structures to get > > from this key to user data in the following scenarios: > > Please keep in mind that I don't have every detail design laid out. I > follow the first principles that redirect a swap entry page should > only take an additional 4 byte per swap entry. VS blow up the swap > entry size by something like 24 bytes? Nhat can lay this out in more detail, but there isn't much new stuff in the virtual swap descriptor. It's mostly just a consolidation of state we currently track elsewhere - swap count, swapcache pointer, cgroup ownership etc. The actual indirection is just a word for the backend type,offset. That indirection is the tradeoff for swapped pages. In turn you're getting back all that other stuff for swap slots that *aren't* currently used. This is a win for the vast majority of users. Since you mentioned first principles - the dynamically sized swap space is also much more suitable for compressed pools, which are the dominant form of swap setups nowadays. Again a win for the majority. And the worst-case is reasonable. I don't see the giant gulf you seem to see there. I don't know where it's supposed to be coming from.
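A sketch of the descriptor layout Johannes describes: mostly a consolidation of state the kernel already keeps per swap slot, plus one word of backend indirection. The field names and sizes are illustrative, not taken from Nhat's RFC; on 64-bit this packs to roughly 24 bytes.

struct vswap_desc {
	unsigned long	backend;	/* backend type in low bits, offset above them */
	struct folio	*swapcache;	/* folio if cached, or a shadow entry */
	unsigned short	memcg_id;	/* cgroup ownership (today: the swap_cgroup map) */
	unsigned char	swap_count;	/* today: one byte in swap_map */
	unsigned char	flags;		/* e.g. a SWAP_HAS_CACHE-style bit */
};

The debate in this thread is essentially whether paying this per allocated slot up front is cheaper overall than the swap table's smaller per-slot word plus metadata allocated on demand.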
On Tue, Nov 25, 2025 at 1:31 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote: > > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > Do you have a link to that proposal? > > > > > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > > > cache swap entries to different physical locations. > > > > That can also work for redirecting swap entries in different swapfiles. > > > > > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > > > > > I looked through your slides and the LWN article, but it's very hard > > > for me to find answers to my questions in there. > > > > Naturally, the slide is only intended to cover what is in the current > > swap table may be phase VII. > > But it does have the physical location pointer consideration. > > > > > In your proposal, let's say you have a swp_entry_t in the page > > > table. What does it describe, and what are the data structures to get > > > from this key to user data in the following scenarios: > > > > Please keep in mind that I don't have every detail design laid out. I > > follow the first principles that redirect a swap entry page should > > only take an additional 4 byte per swap entry. VS blow up the swap > > entry size by something like 24 bytes? > > Nhat can lay this out in more detail, but there isn't much new stuff > in the virtual swap descriptor. It's mostly just a consolidation of > state we currently track elsewhere - swap count, swapcache pointer, > cgroup ownership etc. > > The actual indirection is just a word for the backend type,offset. > > That indirection is the tradeoff for swapped pages. In turn you're > getting back all that other stuff for swap slots that *aren't* > currently used. This is a win for the vast majority of users. I will also note though, that we will merge the zswap tree with the virtual swap descriptors as well. So for zswap entries there are actually no extra overhead induced by the backend indirection pointer :) IOW, overhead for zswap-only users (such as Google) will be much smaller than what Johannes is describing here - pretty much non-existent :) While you will still gain all the other benefits (swap space dynamicization, operational overhead reduction) of swap virtualization.
On Tue, Dec 2, 2025 at 10:19 PM Nhat Pham <nphamcs@gmail.com> wrote: > > > That indirection is the tradeoff for swapped pages. In turn you're > > getting back all that other stuff for swap slots that *aren't* > > currently used. This is a win for the vast majority of users. > > I will also note though, that we will merge the zswap tree with the > virtual swap descriptors as well. What is the merged per-swap-slot entry size? If your descriptor is over 48 bytes, plus zswap pool handles, compressed buffer size, etc., I am not sure that is an overall win. Providing actual numbers would help. > So for zswap entries there are actually no extra overhead induced by > the backend indirection pointer :) > > IOW, overhead for zswap-only users (such as Google) will be much > smaller than what Johannes is describing here - pretty much > non-existent :) While you will still gain all the other benefits (swap Is the per-swap-slot memory usage, zswap + swap core, smaller than that of the ghost swapfile patch I posted here? Do you have a number in bytes? Chris
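As a purely illustrative back-of-the-envelope comparison, not a measured number from either series: with 4 KiB pages, 64 GiB of provisioned swap space is about 16.8 million slots. At roughly 11 bytes per slot (the 3-byte static plus 8-byte dynamic figure cited elsewhere in this thread for the current code) that is on the order of 180 MB of metadata; at 24 bytes per slot, roughly 400 MB; at 48 bytes per slot, roughly 800 MB. The totals scale linearly with provisioned swap size, which is why both the per-slot constant and whether it is paid up front or on demand matter.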
On Wed, Nov 26, 2025 at 1:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Tue, Nov 25, 2025 at 11:27:04PM +0400, Chris Li wrote: > > On Mon, Nov 24, 2025 at 11:33 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > Do you have a link to that proposal? > > > > > > > > My 2024 LSF swap pony talk already has a mechanism to redirect page > > > > cache swap entries to different physical locations. > > > > That can also work for redirecting swap entries in different swapfiles. > > > > > > > > https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/ > > > > > > I looked through your slides and the LWN article, but it's very hard > > > for me to find answers to my questions in there. > > > > Naturally, the slide is only intended to cover what is in the current > > swap table may be phase VII. > > But it does have the physical location pointer consideration. > > > > > In your proposal, let's say you have a swp_entry_t in the page > > > table. What does it describe, and what are the data structures to get > > > from this key to user data in the following scenarios: > > > > Please keep in mind that I don't have every detail design laid out. I > > follow the first principles that redirect a swap entry page should > > only take an additional 4 byte per swap entry. VS blow up the swap > > entry size by something like 24 bytes? > > Nhat can lay this out in more detail, but there isn't much new stuff Please make sure Nhat does. It shouldn't be a complicated question. > in the virtual swap descriptor. It's mostly just a consolidation of > state we currently track elsewhere - swap count, swapcache pointer, > cgroup ownership etc. All of those will fold into swap table values in later phases. So in this regard, the swap table is not just keeping the status quo; it is more aggressive in conserving memory. If I recall correctly, VS uses atomics for the counters? That blows up the 1-byte counter to 4 bytes. > The actual indirection is just a word for the backend type,offset. Sure. > > That indirection is the tradeoff for swapped pages. In turn you're > getting back all that other stuff for swap slots that *aren't* > currently used. This is a win for the vast majority of users. The swap table does all of that as well, in the later phases. > > Since you mentioned first principles - the dynamically sized swap > space is also much more suitable for compressed pools, which are the > dominant form of swap setups nowadays. Again a win for the majority. Sure, the swap table does that too, especially after the swap cgroup and swap count fold into the swap table. > And the worst-case is reasonable. I don't see the giant gulf you seem > to see there. I don't know where it's supposed to be coming from. Let Nhat confirm the per-swap-entry overhead and let's compare it with the swap table's final form. Another easy way is to just run some benchmarks to see how much overhead VS introduces. That being said, I think I have answered enough technical questions about my approach to let you reconsider my proposal. You should be able to realize by now that my approach is more optimal compared to VS. Do you agree or not? We are just arguing about how big the gap is. Chris
On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > That being said, I think I have answered enough technical questions > of > my approach, to let you re-consider my proposal. You should be able > to > realize by now my approach is more optimal compared to VS. Do you > agree or not? We are just arguing how big the gap that is. > We would have much more confidence in your solution if you had told us exactly how you were planning to solve things in future stages of the project. A "I'll solve it, but I can't tell you how" is not very confidence inspiring. -- All Rights Reversed.
On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> wrote: > > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > > > That being said, I think I have answered enough technical questions > > of > > my approach, to let you re-consider my proposal. You should be able > > to > > realize by now my approach is more optimal compared to VS. Do you > > agree or not? We are just arguing how big the gap that is. > > > > We would have much more confidence in your > solution if you had told us exactly how > you were planning to solve things in future > stages of the project. Can you clarify who "we" is? Sorry, I am not part of your Meta kernel team circle. I have just been replying to you and others on how to solve the other things. If you have further questions, please ask a clarifying question. Until you ask, I don't know which part of the Swap Pony plan needs more clarification. > A "I'll solve it, but I can't tell you how" > is not very confidence inspiring. There is no need for this kind of innuendo; it is not helping. Please stay on the technical side of the discussion and try not to project personal judgement, thanks. Please keep in mind that I am just one person who loves kernel hacking and wants to do the right things. I am doing this in my spare time; it has not been part of my company OKRs to work on upstream swap in the last two years. I don't get paid to do this. I am replying to this email on vacation at 5am in the morning. Again, let's stay technical. If you think I am holding any secret (I am not), please just ask a clarifying question. Thanks for your cooperation, and sorry that I did not have a chance to explain things better earlier. Chris
On Thu, 2025-11-27 at 05:52 +0400, Chris Li wrote: > On Thu, Nov 27, 2025 at 1:53 AM Rik van Riel <riel@surriel.com> > wrote: > > > > On Wed, 2025-11-26 at 23:22 +0400, Chris Li wrote: > > > > > > That being said, I think I have answered enough technical > > > questions > > > of > > > my approach, to let you re-consider my proposal. You should be > > > able > > > to > > > realize by now my approach is more optimal compared to VS. Do you > > > agree or not? We are just arguing how big the gap that is. > > > > > > > We would have much more confidence in your > > solution if you had told us exactly how > > you were planning to solve things in future > > stages of the project. > > Can you clarify who is "We", Sorry, I am talking about upstream. When one developer has code, and somebody else emails the equivalent of "trust me, bro", the code is usually preferred. > > Please keep in mind that I am just one person love kernel hacking and > want to do the right things. I am doing this at my spare time, it is > not part of my company OKR's to work on upstream swap in the last two > years. I don't get pay to do this. I am replying this email from my > vacation 5am in the morning. > > Again, let's stay technical. If you think I am holding any secret (I > am not ), please just ask a clarify question. I really appreciate anybody participating in Linux kernel development. Linux is good because different people bring different perspectives to the table. Some real numbers, even if just back of the envelope math to estimate the overhead of various ideas being proposed, are often a good way to move a discussion along in a productive direction. Let me reply to your other email with some more technical details. -- All Rights Reversed.
On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > Sorry, I am talking about upstream. So far I have not had a pleasant upstream experience when submitting this particular patch to upstream. > I really appreciate anybody participating in Linux > kernel development. Linux is good because different > people bring different perspectives to the table. Of course everybody is welcome. However, NACK without technical justification is very bad for upstream development. I can't imagine what a new hacker would think after going through what I have gone through for this patch. He/she will likely quit contributing upstream. This is not the kind of welcome we want. Nhat needs to be able to technically justify his NACK as a maintainer. Sorry there is no other way to sugar coat it. Chris
On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > Sorry, I am talking about upstream. > > So far I have not had a pleasant upstream experience when submitting > this particular patch to upstream. > > > I really appreciate anybody participating in Linux > > kernel development. Linux is good because different > > people bring different perspectives to the table. > > Of course everybody is welcome. However, NACK without technical > justification is very bad for upstream development. I can't imagine > what a new hacker would think after going through what I have gone > through for this patch. He/she will likely quit contributing upstream. > This is not the kind of welcome we want. > > Nhat needs to be able to technically justify his NACK as a maintainer. > Sorry there is no other way to sugar coat it. I am NOT the only zswap maintainer who expresses concerns. Other people also have their misgivings, so I have let them speak and not put words in their mouths. But since you have repeatedly singled me out, I will repeat my concerns here: 1. I don't like the operational overhead (to statically size the zswap swapfile size for each <host x workload> combination) of static swapfile. Misspecification of swapfile size can lead to unacceptable swap metadata overhead on small machines, or underutilization of zswap on big machines. And it is *impossible* to know how much zswap will be needed ahead of time, even if we fix host - it depends on workloads access patterns, memory compressibility, and latency/memory pressure tolerance. 2. I don't like the maintainer's overhead (to support a special infrastructure for a very specific use case, i.e no-writeback), especially since I'm not convinced this can be turned into a general architecture. See below. 3. I want to move us towards a more dynamic architecture for zswap. This is a step in the WRONG direction. 4. I don't believe this buys us anything we can't already do with userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only swap option here), with writeback disabled, is 2-3 lines of script. I believe I already justified myself well enough :) It is you who have not really convinced me that this is, at the very least, a temporary/first step towards a long-term generalized architecture for zswap. Every time we pointed out an issue, you seem to justify it with some more vague ideas that deepen the confusion. Let's recap the discussion so far: 1. We claimed that this architecture is hard to extend for efficient zswap writeback, or backend transfer in general, without incurring page table updates. You claim you plan to implement a redirection entry to solve this. 2. We then pointed out that inserting redirect entry into the current physical swap infrastructure will leave holes in the upper swap tier's address space, which is arguably *worse* than the current status quo of zswap occupying disk swap space. Again, you pull out some vague ideas about "frontend" and "backend" swap, which, frankly, is conceptually very similar to swap virtualization. 3. The dynamicization of swap space is treated with the same rigor (or, more accurately, lack thereof). Just more handwaving about the "frontend" vs "backend" (which, again, is very close to swap virtualization). This requirement is a deal breaker for me - see requirement 1 above again. 4. We also pointed out your lack of thoughts for swapoff optimization, which again, seem to be missing in your design. 
Again, more vagueness about rmap, which is probably more overhead. Look man, I'm not being hostile to you. Believe me on this - I respect your opinion, and I'm working very hard on reducing memory overhead for virtual swap, to see if I can meet you where you want it to be. The RFC's original design inefficient memory usage was due to: a) Readability. Space optimization can make it hard to read code, when fields are squeezed into the same int/long variable. So I just put one different field for each piece of metadata information b) I was playing with synchronization optimization, i.e using atomics instead of locks, and using per-entry locks. But I can go back to using per-cluster lock (I haven't implemented cluster allocator at the time of the RFC, but in my latest version I have done it), which will further reduce the memory overhead by removing a couple of fields/packing more fields. The only non-negotiable per-swap-entry overhead will be a field to indicate the backend location (physical swap slot, zswap entry, etc.) + 2 bits to indicate the swap type. With some field union-ing magic, or pointer tagging magic, we can perhaps squeeze it even harder. I'm also working on reducing the CPU overhead - re-partitioning swap architectures (swap cache, zswap tree), reducing unnecessary xarray lookups where possible. We can then benchmark, and attempt to optimize it together as a community.
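For illustration, one way the "2 bits of swap type plus pointer tagging" packing mentioned above could look. The enum values and helpers are invented here, not taken from the virtual swap RFC; it relies on the tagged pointers being at least 4-byte aligned.

enum vswap_backend {
	VSWAP_BACKEND_NONE,	/* unallocated or in transition */
	VSWAP_BACKEND_ZSWAP,	/* value is a zswap entry pointer */
	VSWAP_BACKEND_DISK,	/* value is a physical swap slot */
	VSWAP_BACKEND_FOLIO,	/* value is a folio in the swap cache */
};

#define VSWAP_TYPE_MASK	3UL

static inline unsigned long vswap_pack_ptr(void *ptr, enum vswap_backend type)
{
	/* stash the backend type in the pointer's alignment bits */
	return (unsigned long)ptr | type;
}

static inline enum vswap_backend vswap_type(unsigned long packed)
{
	return packed & VSWAP_TYPE_MASK;
}

static inline void *vswap_ptr(unsigned long packed)
{
	return (void *)(packed & ~VSWAP_TYPE_MASK);
}

A non-pointer backend such as a disk slot could instead store its offset shifted left by two bits, so the whole indirection still fits in a single word per entry.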
On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > Sorry, I am talking about upstream. > > > > So far I have not had a pleasant upstream experience when submitting > > this particular patch to upstream. > > > > > I really appreciate anybody participating in Linux > > > kernel development. Linux is good because different > > > people bring different perspectives to the table. > > > > Of course everybody is welcome. However, NACK without technical > > justification is very bad for upstream development. I can't imagine > > what a new hacker would think after going through what I have gone > > through for this patch. He/she will likely quit contributing upstream. > > This is not the kind of welcome we want. > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > Sorry there is no other way to sugar coat it. > > I am NOT the only zswap maintainer who expresses concerns. Other > people also have their misgivings, so I have let them speak and not > put words in their mouths. You did not mention the fact that both two NACK from zswap maintainers are from the same company. I assume you have some kind of team sync. There is a term for that, called "person acting in concert". What I mean in "technically unjustifiable" is that VS patch series is a non-starter to merge into mainline. In this email you suggest the per swap slot memory overhead is 48 bytes previously 64 bytes. https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/ Do you have newer VS that significantly reduce that? If so, what is the new number? The starting point before your VS is 11 bytes (3 bytes static, 8 bytes dynamic). 48bytes is more than 4x the original size. This will have a huge impact on the deployment that uses a lot of swap. The worst part is that once your VS series is in the kernel. That overhead is always on, it is forcing the overhead even if the redirection is not used. This will hurt Google's fleet very badly if deployed. Because of the same jobs, the kernel memory consumption will jump up and fail jobs. Every body's kernel who use swap will suffer because it is always on. The alternative, the swap table, uses much less overhead. So your VS leave money on the table. So I consider your VS is a non-starter. I repeatedly call you out because you keep dodging this critical question. Johannes refers to you for the detail value of the overhead as well. Dodging critical questions makes a technical debate very difficult to conduct and drive to a conflict resolution impossible. BTW, this is my big concern on the 2023 swap abstraction talk which our VS is based on. The community feedback at the time strongly favored my solution. I don't understand why you reboot the community un-favored solution without addressing those concerns. The other part of the bad experience is that you NACK first then ask clarifying questions later. The proper order is the other way around. You should fully understand the subject BEFORE you NACK on it. NACK is a very serious business. I did try my best to answer clarification question from your team. I appreciate that Johannes and Yosry ask clarification to advance the discussion. I did not see more question from them I assume they got what they want to know. 
If you still feel something is missing out, you should ask a follow up question for the part in which you need more clarification. We can repeat until you understand. You keep using the phrase "hand waving" as if I am faking it. That is FUD. Communication is a two way street. I can't force you to understand, asking more questions can help you. This is complex problem. I am confident I can explain to Kairui and he can understand, because he has a lot more context, not because I am faking it. Ask nicely so I can answer nicely. Stay in the technical side of the discussion please. So I consider using VS to NACK my patch is technically unjustifiable. Your current VS with 48 byte overhead is not usable at all as an standard upstream kernel. Can we agree to that? As we all know, using less memory to function the same is a lot harder than using more. If you can dramatically reduce the memory usage, you likely need to rebuild the whole patch series from scratch. If might force you to use solution similar to swap table, in that case why not join team swap table? We can reopen the topic again by then if you have a newer VS: 1) address the per swap slot memory over head, ideally close to the first principle value. 2) make the overhead optional, if not using redirection, preferably not pay the overhead. 3) make your VS patch series incrementally show value, not all or nothing. Sorry this email is getting very long and I have very limited time. Let's discuss one topic at a time. I would like to conclude the current VS is not a viable option as of now. I can reply to other parts of your email once we get the VS out of the way. Best Regards, Chris > > 1. I don't like the operational overhead (to statically size the zswap > swapfile size for each <host x workload> combination) of static > swapfile. Misspecification of swapfile size can lead to unacceptable > swap metadata overhead on small machines, or underutilization of zswap > on big machines. And it is *impossible* to know how much zswap will be > needed ahead of time, even if we fix host - it depends on workloads > access patterns, memory compressibility, and latency/memory pressure > tolerance. > > 2. I don't like the maintainer's overhead (to support a special > infrastructure for a very specific use case, i.e no-writeback), > especially since I'm not convinced this can be turned into a general > architecture. See below. > > 3. I want to move us towards a more dynamic architecture for zswap. > This is a step in the WRONG direction. > > 4. I don't believe this buys us anything we can't already do with > userspace hacking. Again, zswap-over-zram (or insert whatever RAM-only > swap option here), with writeback disabled, is 2-3 lines of script. > > I believe I already justified myself well enough :) It is you who have > not really convinced me that this is, at the very least, a > temporary/first step towards a long-term generalized architecture for > zswap. Every time we pointed out an issue, you seem to justify it with > some more vague ideas that deepen the confusion. > > Let's recap the discussion so far: > > 1. We claimed that this architecture is hard to extend for efficient > zswap writeback, or backend transfer in general, without incurring > page table updates. You claim you plan to implement a redirection > entry to solve this. > > 2. 
We then pointed out that inserting redirect entry into the current > physical swap infrastructure will leave holes in the upper swap tier's > address space, which is arguably *worse* than the current status quo > of zswap occupying disk swap space. Again, you pull out some vague > ideas about "frontend" and "backend" swap, which, frankly, is > conceptually very similar to swap virtualization. > > 3. The dynamicization of swap space is treated with the same rigor > (or, more accurately, lack thereof). Just more handwaving about the > "frontend" vs "backend" (which, again, is very close to swap > virtualization). This requirement is a deal breaker for me - see > requirement 1 above again. > > 4. We also pointed out your lack of thoughts for swapoff optimization, > which again, seem to be missing in your design. Again, more vagueness > about rmap, which is probably more overhead. > > Look man, I'm not being hostile to you. Believe me on this - I respect > your opinion, and I'm working very hard on reducing memory overhead > for virtual swap, to see if I can meet you where you want it to be. > The RFC's original design inefficient memory usage was due to: > > a) Readability. Space optimization can make it hard to read code, when > fields are squeezed into the same int/long variable. So I just put one > different field for each piece of metadata information > > b) I was playing with synchronization optimization, i.e using atomics > instead of locks, and using per-entry locks. But I can go back to > using per-cluster lock (I haven't implemented cluster allocator at the > time of the RFC, but in my latest version I have done it), which will > further reduce the memory overhead by removing a couple of > fields/packing more fields. > > The only non-negotiable per-swap-entry overhead will be a field to > indicate the backend location (physical swap slot, zswap entry, etc.) > + 2 bits to indicate the swap type. With some field union-ing magic, > or pointer tagging magic, we can perhaps squeeze it even harder. > > I'm also working on reducing the CPU overhead - re-partitioning swap > architectures (swap cache, zswap tree), reducing unnecessary xarray > lookups where possible. > > We can then benchmark, and attempt to optimize it together as a community.
On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote: > > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > Sorry, I am talking about upstream. > > > > > > So far I have not had a pleasant upstream experience when submitting > > > this particular patch to upstream. > > > > > > > I really appreciate anybody participating in Linux > > > > kernel development. Linux is good because different > > > > people bring different perspectives to the table. > > > > > > Of course everybody is welcome. However, NACK without technical > > > justification is very bad for upstream development. I can't imagine > > > what a new hacker would think after going through what I have gone > > > through for this patch. He/she will likely quit contributing upstream. > > > This is not the kind of welcome we want. > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > Sorry there is no other way to sugar coat it. > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > people also have their misgivings, so I have let them speak and not > > put words in their mouths. > > You did not mention the fact that both two NACK from zswap maintainers > are from the same company. I assume you have some kind of team sync. > There is a term for that, called "person acting in concert". I mean, Yosry pointed out issues with your approach too. Yosry is from your company, no? The issues I pointed out have all been technical, thus far. I never even brought up Meta - I'm sure other parties have the same issues. > > What I mean in "technically unjustifiable" is that VS patch series is > a non-starter to merge into mainline. > In this email you suggest the per swap slot memory overhead is 48 > bytes previously 64 bytes. > > https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/ > > Do you have newer VS that significantly reduce that? If so, what is > the new number? > > The starting point before your VS is 11 bytes (3 bytes static, 8 bytes > dynamic). 48bytes is more than 4x the original size. > This will have a huge impact on the deployment that uses a lot of > swap. The worst part is that once your VS series is in the kernel. > That overhead is always on, it is forcing the overhead even if the > redirection is not used. This will hurt Google's fleet very badly if > deployed. Because of the same jobs, the kernel memory consumption will > jump up and fail jobs. Every body's kernel who use swap will suffer > because it is always on. The alternative, the swap table, uses much > less overhead. So your VS leave money on the table. > > So I consider your VS is a non-starter. I repeatedly call you out > because you keep dodging this critical question. Johannes refers to > you for the detail value of the overhead as well. Dodging critical > questions makes a technical debate very difficult to conduct and drive > to a conflict resolution impossible. BTW, this is my big concern on > the 2023 swap abstraction talk which our VS is based on. The community > feedback at the time strongly favored my solution. I don't understand > why you reboot the community un-favored solution without addressing > those concerns. 
I reboot the VS work because I have not seen any indications that your design could solve the problems I believe are principle for any swap architectures: dynamicization of swap space, efficient backend transfer, to name 2. > > The other part of the bad experience is that you NACK first then ask > clarifying questions later. The proper order is the other way around. > You should fully understand the subject BEFORE you NACK on it. NACK is > a very serious business. > > I did try my best to answer clarification question from your team. I > appreciate that Johannes and Yosry ask clarification to advance the > discussion. I did not see more question from them I assume they got > what they want to know. If you still feel something is missing out, > you should ask a follow up question for the part in which you need > more clarification. We can repeat until you understand. You keep using > the phrase "hand waving" as if I am faking it. That is FUD. > Communication is a two way street. I can't force you to understand, > asking more questions can help you. This is complex problem. I am > confident I can explain to Kairui and he can understand, because he > has a lot more context, not because I am faking it. Ask nicely so I > can answer nicely. Stay in the technical side of the discussion > please. > > So I consider using VS to NACK my patch is technically unjustifiable. I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing swapfile because of the technical requirements I pointed out above. Virtual swap happens to neatly solve all of them, by design, from first principle. I never ruled out the possibility of another design that would satisfy all of them - I just did not see enough from you to believe otherwise. I don't believe a static ghosttfile is it. In fact, you CAN theoretically implement virtual swap with a ghost swapfile as well. The staticity will just make it operationally untenable. The next step would be to dynamicize the swap infrastructure, at which point we revert back to the original VS design. I see the same thing played out in your response as well, with the redirection entry, then frontend/backend swap space. It's starting to eerily resembles virtual swap. Or maybe you can clarify? > Your current VS with 48 byte overhead is not usable at all as an > standard upstream kernel. Can we agree to that? Sure, which is why I sent it as an RFC and not as an actual patch series pending merging :) Its main purpose was to demonstrate the workflow of how a feature-complete virtual swap subsystem might behave, in all of the code paths of the memory subsystem. I can then optimize the fields piecemeal, while weighing the tradeoff (such as lock granularity v.s lock fields memory overhead). You and Kairui are welcome to criticize, comment, and help me optimize it, as did Yosry and Johannes in the past. > > As we all know, using less memory to function the same is a lot harder > than using more. If you can dramatically reduce the memory usage, you I don't necessarily disagree. I would, however, would like to point out that the reverse is true too - you can't necessarily compare the overhead of two designs, where one achieve a lot more in terms of features and/or operational goals than the other. > likely need to rebuild the whole patch series from scratch. If might > force you to use solution similar to swap table, in that case why not > join team swap table? Because even with the current swap table design, the allocator is *still* static. 
I would LOVE to use the current physical swap allocation infrastructure. It just doesn't work in its current state. > We can reopen the topic again by then if you have a newer VS: Sure.
On Tue, Dec 2, 2025 at 3:37 AM Nhat Pham <nphamcs@gmail.com> wrote: > > On Sat, Nov 29, 2025 at 12:38 PM Chris Li <chrisl@kernel.org> wrote: > > > > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > > > Sorry, I am talking about upstream. > > > > > > > > So far I have not had a pleasant upstream experience when submitting > > > > this particular patch to upstream. > > > > > > > > > I really appreciate anybody participating in Linux > > > > > kernel development. Linux is good because different > > > > > people bring different perspectives to the table. > > > > > > > > Of course everybody is welcome. However, NACK without technical > > > > justification is very bad for upstream development. I can't imagine > > > > what a new hacker would think after going through what I have gone > > > > through for this patch. He/she will likely quit contributing upstream. > > > > This is not the kind of welcome we want. > > > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > > Sorry there is no other way to sugar coat it. > > > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > > people also have their misgivings, so I have let them speak and not > > > put words in their mouths. > > > > You did not mention the fact that both two NACK from zswap maintainers > > are from the same company. I assume you have some kind of team sync. > > There is a term for that, called "person acting in concert". > > I mean, Yosry pointed out issues with your approach too. Yosry is from > your company, no? I don't know who's interest Yosry is representing on this issue. > > What I mean in "technically unjustifiable" is that VS patch series is > > a non-starter to merge into mainline. > > In this email you suggest the per swap slot memory overhead is 48 > > bytes previously 64 bytes. > > > > https://lore.kernel.org/linux-mm/CAKEwX=Mea5V6CKcGuQrYfCQAKErgbje1s0fThjkgCwZXgF-d2A@mail.gmail.com/ > > > > Do you have newer VS that significantly reduce that? If so, what is > > the new number? > > > > The starting point before your VS is 11 bytes (3 bytes static, 8 bytes > > dynamic). 48bytes is more than 4x the original size. > > This will have a huge impact on the deployment that uses a lot of > > swap. The worst part is that once your VS series is in the kernel. > > That overhead is always on, it is forcing the overhead even if the > > redirection is not used. This will hurt Google's fleet very badly if > > deployed. Because of the same jobs, the kernel memory consumption will > > jump up and fail jobs. Every body's kernel who use swap will suffer > > because it is always on. The alternative, the swap table, uses much > > less overhead. So your VS leave money on the table. > > > > So I consider your VS is a non-starter. I repeatedly call you out > > because you keep dodging this critical question. Johannes refers to > > you for the detail value of the overhead as well. Dodging critical > > questions makes a technical debate very difficult to conduct and drive > > to a conflict resolution impossible. BTW, this is my big concern on > > the 2023 swap abstraction talk which our VS is based on. The community > > feedback at the time strongly favored my solution. I don't understand > > why you reboot the community un-favored solution without addressing > > those concerns. 
> > I reboot the VS work because I have not seen any indications that your > design could solve the problems I believe are principle for any swap > architectures: dynamicization of swap space, efficient backend > transfer, to name 2. So no new number and no new date yet. > > > > > The other part of the bad experience is that you NACK first then ask > > clarifying questions later. The proper order is the other way around. > > You should fully understand the subject BEFORE you NACK on it. NACK is > > a very serious business. > > > > I did try my best to answer clarification question from your team. I > > appreciate that Johannes and Yosry ask clarification to advance the > > discussion. I did not see more question from them I assume they got > > what they want to know. If you still feel something is missing out, > > you should ask a follow up question for the part in which you need > > more clarification. We can repeat until you understand. You keep using > > the phrase "hand waving" as if I am faking it. That is FUD. > > Communication is a two way street. I can't force you to understand, > > asking more questions can help you. This is complex problem. I am > > confident I can explain to Kairui and he can understand, because he > > has a lot more context, not because I am faking it. Ask nicely so I > > can answer nicely. Stay in the technical side of the discussion > > please. > > > > So I consider using VS to NACK my patch is technically unjustifiable. > > I'm not NACK-ing the ghost swapfile because of VS. I'm NACK-ing > swapfile because of the technical requirements I pointed out above. > Virtual swap happens to neatly solve all of them, by design, from > first principle. I never ruled out the possibility of another design > that would satisfy all of them - I just did not see enough from you to > believe otherwise. That is FUD. (Doubt). Do you notice that over half of the core swap code in the kernel now is my design? Swap allocator, swap table. Soon there will be more like swap.tiers. Code will speak for itself. By the way, FUD can be universally applied to anything, so it is a distraction for technical discussion. Let's focus the discussion on the technical and stop spreading FUD. > > I don't believe a static ghosttfile is it. In fact, you CAN > theoretically implement virtual swap with a ghost swapfile as well. > The staticity will just make it operationally untenable. The next step > would be to dynamicize the swap infrastructure, at which point we > revert back to the original VS design. Just a starting point. Can incremental change to dynamic size. I believe in incremental landing improvement in the kernel. > I see the same thing played out in your response as well, with the > redirection entry, then frontend/backend swap space. It's starting to > eerily resembles virtual swap. Or maybe you can clarify? > > > Your current VS with 48 byte overhead is not usable at all as an > > standard upstream kernel. Can we agree to that? > > Sure, which is why I sent it as an RFC and not as an actual patch > series pending merging :) Its main purpose was to demonstrate the > workflow of how a feature-complete virtual swap subsystem might > behave, in all of the code paths of the memory subsystem. I can then > optimize the fields piecemeal, while weighing the tradeoff (such as > lock granularity v.s lock fields memory overhead). You and Kairui are > welcome to criticize, comment, and help me optimize it, as did Yosry > and Johannes in the past. 
We need a new VS overhead number close to the first-principles value to reboot the discussion, because you are competing with something designed to run close to that first-principles value. Until then, VS is just a prototype, not production quality. > > As we all know, using less memory to function the same is a lot harder > > than using more. If you can dramatically reduce the memory usage, you > > I don't necessarily disagree. > > I would, however, would like to point out that the reverse is true too > - you can't necessarily compare the overhead of two designs, where one > achieve a lot more in terms of features and/or operational goals than > the other. Meanwhile, the swap table is landing in the kernel. The ghost swapfile I submitted is much closer to production quality. If you merge that patch now, the kernel can have a swapfile that does not waste disk space. It also performs well NOW. Google has been using ghost swapfiles in production for over 10 years. So you are right: you can't even compare the two, in the sense that VS is not ready. I want to focus my time on something that can land in the kernel next. Ping me again when your VS is at that stage. > > likely need to rebuild the whole patch series from scratch. If might > > force you to use solution similar to swap table, in that case why not > > join team swap table? > > Because even with the current swap table design, the allocator is > *still* static. It can be changed and will be changed; it needs to happen after phase IV of the swap table work. As a matter of fact, a lot of the swap table cleanups untangle the code to make that happen. > I would LOVE to use the current physical swap allocation > infrastructure. It just doesn't work in its current state. Help review swap table phase II and phase III then. Make it happen sooner. > > We can reopen the topic again by then if you have a newer VS: > > Sure. Now I will conclude on VS: it is not production ready and there is no date for when it will be. It is just a red herring. Chris
On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote: > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > Sorry, I am talking about upstream. > > > > > > So far I have not had a pleasant upstream experience when submitting > > > this particular patch to upstream. > > > > > > > I really appreciate anybody participating in Linux > > > > kernel development. Linux is good because different > > > > people bring different perspectives to the table. > > > > > > Of course everybody is welcome. However, NACK without technical > > > justification is very bad for upstream development. I can't imagine > > > what a new hacker would think after going through what I have gone > > > through for this patch. He/she will likely quit contributing upstream. > > > This is not the kind of welcome we want. > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > Sorry there is no other way to sugar coat it. > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > people also have their misgivings, so I have let them speak and not > > put words in their mouths. > > You did not mention the fact that both two NACK from zswap maintainers > are from the same company. I assume you have some kind of team sync. > There is a term for that, called "person acting in concert". For the benefit of anybody following this from the sidelines, the third zswap maintainer also expressed concerns about Chris's proposal upthread. He works for the same company as Chris. The reality is that Chris is failing to convince others of his design direction, and is now obviously resorting to manipulation and hominem attacks. During the course of this thread, Chris has asked for "a little faith" that his idea will work for all stated requirements, without deeming it necessary to explain how. When probed on technical details, he stated that he doesn't like to plan that far ahead, and prefers having somebody else iron out the implementation details. He also referred to high-level slides from his LSFMM '24 session - which was received thusly[1]: Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain". Jan Kara said that existing filesystem designs are not suited to this task Hildenbrand said that this plan was introducing too much complexity His first response to criticism was to invoke his <4 week status of swap maintainer. Meanwhile, the design direction that Chris is construing as a single company conspiracy is anything but. The collaborative origins of these patches are well documented. Chris was CC'd on those RFCs. He notably did not engage in them. He is now lying about the narrative and choosing to attack these patches in bad faith and out of context. This pattern of behavior gives me low confidence that Chris is able to collaborate and compromise on a design that works for all users. And while Chris has been quite vocal and opinionated in mailing list discussions, his actual code contributions to the kernel do not instill confidence that he can solve this problem by himself, either. [1] https://lwn.net/Articles/974587/
On Mon, Dec 1, 2025 at 8:43 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > For the benefit of anybody following this from the sidelines, the > third zswap maintainer also expressed concerns about Chris's proposal > upthread. He works for the same company as Chris. Yes, I don't know who's interest Yosry's represent. We have a disagreement on the swap abstraction 2023 that is why I have an alternative proposal. The community back then strongly favored my proposal. I guess Yosry just hasn't graduated from that yet. > > The reality is that Chris is failing to convince others of his design > direction, and is now obviously resorting to manipulation and hominem > attacks. Now we can't even talk about technical and move to personal attacks, is that all you have left in you? > During the course of this thread, Chris has asked for "a little faith" > that his idea will work for all stated requirements, without deeming > it necessary to explain how. More FUD please. > When probed on technical details, he stated that he doesn't like to > plan that far ahead, and prefers having somebody else iron out the > implementation details. He also referred to high-level slides from his > LSFMM '24 session - which was received thusly[1]: > > Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain". Yes, we deal with that pain. Swap table is the outcome so we don't further impose pain to maintain file cache vs swap cache where a lot of swap specific optimization will be painful for the file cache side. As far as I am concerned, the most painful part, swap table as the new swap cache has already landed. We did not cause Matthew pain in the process. > Jan Kara said that existing filesystem designs are not suited to this task > > Hildenbrand said that this plan was introducing too much complexity > More personal attacks please. > His first response to criticism was to invoke his <4 week status of > swap maintainer. I take that back and apologize for what I say and you accept it as "no hard feelings". Do you mean you don't mean what you say? > Meanwhile, the design direction that Chris is construing as a single > company conspiracy is anything but. The collaborative origins of these > patches are well documented. Chris was CC'd on those RFCs. He notably I feel the 48 bytes overhead is a joke, I already provide my feedback against it in the 2023 LSF swap abstraction. I don't like to keep beating the dead horse. > did not engage in them. He is now lying about the narrative and > choosing to attack these patches in bad faith and out of context. More FUD and personal attack, is that all you can output now? > > This pattern of behavior gives me low confidence that Chris is able to > collaborate and compromise on a design that works for all users. > > And while Chris has been quite vocal and opinionated in mailing list > discussions, his actual code contributions to the kernel do not > instill confidence that he can solve this problem by himself, either. > > [1] https://lwn.net/Articles/974587/ You obviously haven't graduated from the fact that most of the swap core is my design now, in the current kernel. There will be more. More personal attacks please, I am ignoring the attack in the order I received. It seems that is what is left of you, personal attacks to dominate a technical discussion when the technical is losing. Sign, this is an example case study of upstream bullying and that is why sometimes upstream submission is very unfriendly for the less established person. 
I personally know people who were bullied by you and gave up upstream contributions completely. Go ahead and try to add me to the list. That will win you more followers. More people will enjoy working with you. I agree I am not a native English speaker; I will lose to you in a bullying shouting fight, you win. Let's compete in code and benchmarks and see what happens. Chris
On Tue, Dec 2, 2025 at 12:47 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Sun, Nov 30, 2025 at 12:38:38AM +0400, Chris Li wrote: > > On Sat, Nov 29, 2025 at 12:46 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > On Thu, Nov 27, 2025 at 11:10 AM Chris Li <chrisl@kernel.org> wrote: > > > > > > > > On Thu, Nov 27, 2025 at 6:28 AM Rik van Riel <riel@surriel.com> wrote: > > > > > > > > > > Sorry, I am talking about upstream. > > > > > > > > So far I have not had a pleasant upstream experience when submitting > > > > this particular patch to upstream. > > > > > > > > > I really appreciate anybody participating in Linux > > > > > kernel development. Linux is good because different > > > > > people bring different perspectives to the table. > > > > > > > > Of course everybody is welcome. However, NACK without technical > > > > justification is very bad for upstream development. I can't imagine > > > > what a new hacker would think after going through what I have gone > > > > through for this patch. He/she will likely quit contributing upstream. > > > > This is not the kind of welcome we want. > > > > > > > > Nhat needs to be able to technically justify his NACK as a maintainer. > > > > Sorry there is no other way to sugar coat it. > > > > > > I am NOT the only zswap maintainer who expresses concerns. Other > > > people also have their misgivings, so I have let them speak and not > > > put words in their mouths. > > > > You did not mention the fact that both two NACK from zswap maintainers > > are from the same company. I assume you have some kind of team sync. > > There is a term for that, called "person acting in concert". > > For the benefit of anybody following this from the sidelines, the > third zswap maintainer also expressed concerns about Chris's proposal > upthread. He works for the same company as Chris. > > The reality is that Chris is failing to convince others of his design > direction, and is now obviously resorting to manipulation and hominem > attacks. > > During the course of this thread, Chris has asked for "a little faith" > that his idea will work for all stated requirements, without deeming > it necessary to explain how. > > When probed on technical details, he stated that he doesn't like to > plan that far ahead, and prefers having somebody else iron out the > implementation details. He also referred to high-level slides from his > LSFMM '24 session - which was received thusly[1]: > > Matthew Wilcox agreed, warning Li that he was setting himself up for "a world of pain". > > Jan Kara said that existing filesystem designs are not suited to this task > > Hildenbrand said that this plan was introducing too much complexity > > His first response to criticism was to invoke his <4 week status of > swap maintainer. > > Meanwhile, the design direction that Chris is construing as a single > company conspiracy is anything but. The collaborative origins of these > patches are well documented. Chris was CC'd on those RFCs. He notably > did not engage in them. He is now lying about the narrative and > choosing to attack these patches in bad faith and out of context. > > This pattern of behavior gives me low confidence that Chris is able to > collaborate and compromise on a design that works for all users. > > And while Chris has been quite vocal and opinionated in mailing list > discussions, his actual code contributions to the kernel do not > instill confidence that he can solve this problem by himself, either. 
Hi all, I’d really prefer we all let things cool off a bit before the thread gets too dramatic. :) Sorry to see that the discussion went quite off topic; still, I believe this is some kind of misunderstanding of Chris' intention to improve the kernel in a more generic way. From my perspective, Chris did co-develop, suggest, review or author many of the implementation details around the swap-table idea, and he implemented the swap cluster allocator in 6.11, which unlocked a bunch of follow-on optimizations. I’ve been working on swap for a while as well and have rewritten and refactored large parts of the swap code, swap allocator and swap cache (mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah, I’m not a kernel vet with decades of patches yet, but I do think I'm familiar enough with swap. I think Chris' work, words or code, has been looking good in the end results. It's hard to put a penthouse on a sandcastle, and maybe that's what makes it hard to describe or lay out the further implementations of swap. We all struggled with the swap subsystem a lot; the code base served us well, but it had accumulated a lot of historical complexity and awkward workarounds over time (we had so many people in the community complaining about it for so many years). I think we all agree that pursuing incremental cleanups and improvements (e.g. the swap slot cache cleanup, swap lock cleanup, swap_has_cache cleanup, direct-swap workarounds removal, etc.) is more suitable upstream. Chris also helped a lot with this (e.g. the LPC talk last year) and we finally got rid of many long-time burdens; quite a few of these works were directly enabled by his swap allocator rework. And I do have a more complete branch that I posted several times showing the end result: swap tables have better memory consumption & performance, and the code is much simpler than what we had upstream. It's getting merged step by step, and each step is a gain. I believe that is the right way to improve things upstream: everyone and every workload benefits, and progressively. And based on that, we will be able to implement things much more easily. I believe things will look much better and cleaner as we progress (e.g. resizing might be doable for generic swap too), and make it easier for all of us, and make the swap subsystem better in a collaborative way. Cheers.
On Tue, Dec 02, 2025 at 03:49:22AM +0800, Kairui Song wrote: > From my perspective, Chris did co-developed, suggested, reviewed or > authored many of the implementation details around the swap-table > idea, and he implemented the swap cluster allocator in 6.11, which > unlocked a bunch of follow-on optimizations. > > I’ve been working on swap for a while as well and have rewritten and > refactored large parts of the swap, swap allocator and swap cache > (mm/swapfile.c, mm/swap_state.c, swap.h, swap_table.h). Maybe, yeah, > I’m not a kernel vet with decades of patches yet, but I do think I'm > familiar enough with swap. I think Chris' work, words or code, has > been looking good in the end results. I have absolute respect for your work. And if you say Chris was instrumental to getting it done, I will take your word for it. > It's hard to put a penthouse on a sandcastle, and maybe that's the > reason makes it hard to describe or layout the further implementations > of swap. Sure, I can understand that. However, I think the conflict is not necessarily about implementation strategy, it's about priorities. We have a usecase. We have a functional implementation that satisfies this usecase. It was sent as RFCs early on to gain consensus on the direction and find the best tradeoffs wrt other usecases. These RFC threads are the place to voice concerns and influence direction. Both Chris and you have stated that further swap table work *may* also enable this usecase. However, at this time, I think it's also fair to say that it's more of an afterthought, and no concrete design or code for how this would actually look like has been proposed. High-level ideas have been floated, but as you can see from Nhat, Rik's, Yosry's and my replies, they don't really meet the necessary requirements. This is not some principled stance. The virtual swap patches are difficult to develop, especially given the current rate of change of the underlying swap codebase. If anybody working on vswap had seen a plausible way to solve these issues through incremental swap table improvements they would have jumped on it a long time ago. It's more about priorities. Combining compression with disk swap is extremely powerful, because it dramatically reduces the worst aspects of both: it reduces the memory footprint of compression by shedding the coldest data to disk; it reduces the IO latencies and flash wear of disk swap through the writeback cache. In practice, this reduces *average event rates of the entire reclaim/paging/IO stack*. These are higher-order overhead savings that are difficult to beat with first-order descriptor and lookup cost optimizations. We obviously want to have both, but they are orthogonal goals. You can see how it doesn't make sense for us to deprioritize the former for the latter, or why Nhat says it's an apples to oranges comparison. It also won't work for one party to say "we will fix this, give us time". Everybody wants to advance the thing they primarily care about with the least amount of detours. That's where we have to find compromise. Either let people pursue what's most important to them, or lay out an encompassing design to build consensus and organize effort. And yes, let's please stay technical and on-topic in these discussions. Let's acknowledge we have interests that overlap, and interests that do not. Then find ways to service everybody's usecases. Disagreements are part of the game. 
There is no need to get personal, pull rank, or make accusations to dismiss somebody else's clearly stated usecase, perspective, or objections. The best way to avoid this is to make technical statements, and reply with technical responses where those statements are made.
On Tue, Dec 2, 2025 at 9:02 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > It's hard to put a penthouse on a sandcastle, and maybe that's the > > reason makes it hard to describe or layout the further implementations > > of swap. > > Sure, I can understand that. However, I think the conflict is not > necessarily about implementation strategy, it's about priorities. Right, that is why we take the more incremental approach. Cleanup and simplify the swap code in order to make later stage things happen. > We have a usecase. We have a functional implementation that satisfies > this usecase. It was sent as RFCs early on to gain consensus on the > direction and find the best tradeoffs wrt other usecases. These RFC > threads are the place to voice concerns and influence direction. Speak of priority. We have a more priority approach by incrementally landing code. The priority reflects that we are actually landing code, improving it towards that glorified goal. I have already expressed concern and the 2023 LSF swap abstraction talk the community already picked the winner. Not the one VS is based on. Rebooting that without addressing the previous concern is a waste of everybody's time. You basically say the community picks the wrong one. Let's retry again. > Both Chris and you have stated that further swap table work *may* also > enable this usecase. However, at this time, I think it's also fair to > say that it's more of an afterthought, and no concrete design or code > for how this would actually look like has been proposed. High-level > ideas have been floated, but as you can see from Nhat, Rik's, Yosry's > and my replies, they don't really meet the necessary requirements. The VS doesn't meet the requirement of upstream and other companies that do not unnecessarily blow up the kernel memory usage. One fight at a time, sorry I have to get the VS out of the way before I comment on other aspects of this patch. > This is not some principled stance. The virtual swap patches are > difficult to develop, especially given the current rate of change of > the underlying swap codebase. If anybody working on vswap had seen a > plausible way to solve these issues through incremental swap table > improvements they would have jumped on it a long time ago. > > It's more about priorities. Combining compression with disk swap is > extremely powerful, because it dramatically reduces the worst aspects > of both: it reduces the memory footprint of compression by shedding > the coldest data to disk; it reduces the IO latencies and flash wear > of disk swap through the writeback cache. In practice, this reduces > *average event rates of the entire reclaim/paging/IO stack*. > > These are higher-order overhead savings that are difficult to beat > with first-order descriptor and lookup cost optimizations. > > We obviously want to have both, but they are orthogonal goals. You can > see how it doesn't make sense for us to deprioritize the former for > the latter, or why Nhat says it's an apples to oranges comparison. My advice is that, make it incremental, come up with production quality solutions. Adding one layer of XArry to redirect is easy. The hard part is how to keep the memory usage in check and perform well. The posted VS will give you false illusion of progress because it doesn't have a clear way to address the performance and memory usage problem which can meet the production quality requirement for upstream. The upstream kernel is not a toy kernel. 
> It also won't work for one party to say "we will fix this, give us > time". Everybody wants to advance the thing they primarily care about Exactly, and the same applies to VS. Even if I spend the time convincing you of the grand vision and you buy it, who guarantees that vision can be implemented without a surprise black-swan assumption that sets us back to the drawing board? So landing actual improvements is king. That is the real progress. If your team wants to get the result sooner, helping land the swap table would speed up your goal. Once the code base is cleaned up, it is much easier to move in ANY direction. > with the least amount of detours. That's where we have to find > compromise. Either let people pursue what's most important to them, or > lay out an encompassing design to build consensus and organize effort. This is just development methodology, a personal choice. That is why the swap table is landing right now. > And yes, let's please stay technical and on-topic in these > discussions. Let's acknowledge we have interests that overlap, and > interests that do not. Then find ways to service everybody's usecases. Yes, I agree. > Disagreements are part of the game. There is no need to get personal, > pull rank, or make accusations to dismiss somebody else's clearly > stated usecase, perspective, or objections. Also agree. > > The best way to avoid this is to make technical statements, and reply > with technical responses where those statements are made. Yes, exactly. That is why I want to get a straight answer about the VS slot overhead number. Writing a grand design doc is a much bigger task. It puts non-native English speakers at a disadvantage when writing big design docs. A lot of us don't have the luxury of contributing upstream as a day job, me included. We do it in our spare time because we love it. Chris
On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote: > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> > wrote: > > > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in > > zswap > > don't occupy disk slots; but for a solution to this to be > > acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. > How would you do writeback from a zswap entry in a ghost swapfile, to a real disk swap backend? That is the use case people are trying to solve. How would your architecture address it? -- All Rights Reversed.
On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org>
> > wrote:
> > >
> > >
> > > Zswap is primarily a compressed cache for real swap on secondary
> > > storage. It's indeed quite important that entries currently in
> > > zswap
> > > don't occupy disk slots; but for a solution to this to be
> > > acceptable,
> > > it has to work with the primary usecase and support disk writeback.
> >
> > Well, my plan is to support the writeback via swap.tiers.
> >
> How would you do writeback from a zswap entry in
> a ghost swapfile, to a real disk swap backend?
Basically, each swapfile has its own version of the swap
ops->{read,write}_folio(). The mem swap tier is similar to the current
zswap, but it is memory only: there is no file backing and it doesn't
share swap entries with the real swapfile.
When writing back from one swap entry to another swapfile, for the
simple case of uncompressing the data, the data is stored in the swap
cache and written to the other swapfile with a newly allocated swap
entry. The front end of the swap cache will have the option to map the
front-end swap entry offset to the back-end block locations, at a
memory price of 4 bytes per swap entry.
This kind of physical block redirection not only happens across more
than one swapfile, it can also happen within the same swapfile, in the
situation where there is space available in lower-order swap entries
but a higher-order entry cannot be allocated because those lower-order
slots are not contiguous. In such a case, the swapfile can extend the
high-order swap entry beyond the end of the current physical swapfile,
then map the contiguous high-order swap entries onto the low-order
physical locations. I shared some slides in the 2024 LSF swap pony
talk with diagrams of that physical swap location redirection.
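A minimal sketch of what such per-tier ops plus an optional redirect
map could look like is below. Every name in it is made up for
illustration; this is not code from this patch series or from the
current kernel.

#include <stdint.h>
#include <stddef.h>

struct folio;				/* kept opaque in this sketch */

/* Per-tier read/write hooks, as in the ops->{read,write}_folio() idea. */
struct swap_tier_ops {
	int (*read_folio)(struct folio *folio, uint32_t offset);
	int (*write_folio)(struct folio *folio, uint32_t offset);
};

struct swap_tier {
	const struct swap_tier_ops *ops;
	uint32_t nr_slots;
	/*
	 * Optional: one 4-byte back-end block location per front-end slot.
	 * Left NULL when the tier does no writeback/redirection, so
	 * non-redirecting setups pay no extra memory or lookup cost.
	 */
	uint32_t *redirect;
};

/*
 * Resolve where a front-end swap offset actually lives
 * (0 is treated as "not redirected" for simplicity here).
 */
static uint32_t swap_backend_location(const struct swap_tier *tier,
				      uint32_t offset)
{
	if (tier->redirect && tier->redirect[offset])
		return tier->redirect[offset];
	return offset;		/* no redirection: identity mapping */
}

The optional table is what would keep the 4-bytes-per-entry cost
confined to tiers that actually write back.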
> That is the use case people are trying to solve.
Yes, me too.
> How would your architecture address it?
The cluster-based swap allocator, the swap table as the new swap
cache, per-cgroup swap.tiers and the VFS-like swap ops all work
together integrally as the grand vision for the new swap system. I
might not have an answer for all the design details right now; I am
the type of person who likes to improvise and adjust the design
details as more detailed design constraints are found. So far I have
found this design works well. Some of the early milestones, the swap
allocator and swap tables, have already landed in the kernel and show
great results.
I consider this much better than VS (the previous swap abstraction).
It does not impose pain the way VS does. One of the big downsides of
VS is that, once it is applied to the kernel, even normal swap that
does not use redirection will pay the price for it as well. The pain
is mandatory. My swap.tiers writeback does not have this problem: if
there is no writeback and no redirection of physical blocks, there is
no additional memory or CPU overhead to pay.
Chris
On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> wrote:
> >
> > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > <hannes@cmpxchg.org>
> > > wrote:
> > > >
> > > >
> > > > Zswap is primarily a compressed cache for real swap on
> > > > secondary
> > > > storage. It's indeed quite important that entries currently in
> > > > zswap
> > > > don't occupy disk slots; but for a solution to this to be
> > > > acceptable,
> > > > it has to work with the primary usecase and support disk
> > > > writeback.
> > >
> > > Well, my plan is to support the writeback via swap.tiers.
> > >
> > How would you do writeback from a zswap entry in
> > a ghost swapfile, to a real disk swap backend?
>
> Basically, each swap file has its own version swap
> ops->{read,write}_folio(). The mem swap tier is similar to the
> current
> zswap but it is memory only, there is no file backing and don't share
> swap entries with the real swapfile.
>
> When writing back from one swap entry to another swapfile, for the
> simple case of uncompressing the data, data will store to swap cache
> and write to another swapfile with allocated another swap entry. The
> front end of the swap cache will have the option map the front end
> swap entry offset to the back end block locations. At the memory
> price
> of 4 byte per swap entry.
Wait, so you use the swap cache radix tree to
indicate the physical location of data between
multiple swap devices?
Isn't that exactly what the vswap approach
does, too?
How is this different?
--
All Rights Reversed.
On Mon, Nov 24, 2025 at 8:43 PM Rik van Riel <riel@surriel.com> wrote:
>
> On Mon, 2025-11-24 at 20:26 +0300, Chris Li wrote:
> > On Mon, Nov 24, 2025 at 7:15 PM Rik van Riel <riel@surriel.com>
> > wrote:
> > >
> > > On Fri, 2025-11-21 at 17:52 -0800, Chris Li wrote:
> > > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner
> > > > <hannes@cmpxchg.org>
> > > > wrote:
> > > > >
> > > > >
> > > > > Zswap is primarily a compressed cache for real swap on
> > > > > secondary
> > > > > storage. It's indeed quite important that entries currently in
> > > > > zswap
> > > > > don't occupy disk slots; but for a solution to this to be
> > > > > acceptable,
> > > > > it has to work with the primary usecase and support disk
> > > > > writeback.
> > > >
> > > > Well, my plan is to support the writeback via swap.tiers.
> > > >
> > > How would you do writeback from a zswap entry in
> > > a ghost swapfile, to a real disk swap backend?
> >
> > Basically, each swap file has its own version swap
> > ops->{read,write}_folio(). The mem swap tier is similar to the
> > current
> > zswap but it is memory only, there is no file backing and don't share
> > swap entries with the real swapfile.
> >
> > When writing back from one swap entry to another swapfile, for the
> > simple case of uncompressing the data, data will store to swap cache
> > and write to another swapfile with allocated another swap entry. The
> > front end of the swap cache will have the option map the front end
> > swap entry offset to the back end block locations. At the memory
> > price
> > of 4 byte per swap entry.
>
> Wait, so you use the swap cache radix tree to
> indicate the physical location of data between
> multiple swap devices?
Ah, you haven't caught up with the progress: the new swap cache does
not use radix trees any more. It is using swap tables. It is a
512-entry swap table array lookup, with no tree lookup. Much faster,
with fewer locks. The swap table commit shows about a 20% difference
in throughput in some benchmark workloads.
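For reference, a rough sketch of such a fixed 512-entry per-cluster
array lookup is below. The names and layout are made up for
illustration and are not the kernel's actual swap table code.

#include <stdint.h>

#define SWAP_CLUSTER_SLOTS 512		/* assumed cluster size, for illustration */

struct swap_cluster {
	unsigned long table[SWAP_CLUSTER_SLOTS];	/* one word per swap slot */
};

struct swap_device {
	struct swap_cluster *clusters;	/* nr_slots / 512 clusters */
	uint32_t nr_slots;
};

/*
 * Plain array indexing: two loads, no tree descent and no node
 * allocation, which is where the lookup savings over an xarray-based
 * swap cache would come from.
 */
static unsigned long swap_table_get(const struct swap_device *dev,
				    uint32_t offset)
{
	const struct swap_cluster *ci =
		&dev->clusters[offset / SWAP_CLUSTER_SLOTS];

	return ci->table[offset % SWAP_CLUSTER_SLOTS];
}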
> Isn't that exactly what the vswap approach
> does, too?
Except that I proposed it earlier.
https://lore.kernel.org/linux-mm/CANeU7QnPsTouKxdK2QO8Opho6dh1qMGTox2e5kFOV8jKoEJwig@mail.gmail.com/
That swap cache physical entry redirection is my original idea as far
as I can tell, and it was presented at the conference earlier.
> How is this different?
The main difference is that I just got rid of the xarray in the swap
cache lookup. I don't want to re-introduce it.
Also, in my swap.tiers design, the redirection overhead is optional.
If you are not using redirection, you don't pay for it in the
swap.tiers swap ops, just like with the ghost swapfile. With VS it is
not optional; the overhead is enforced regardless. In my design the
per-swap-entry memory overhead will be smaller because it will be
integrated tightly with the swap entry.
Chris
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in zswap > > don't occupy disk slots; but for a solution to this to be acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. > > > This direction is a dead-end. Please take a look at Nhat's swap > > virtualization patches. They decouple zswap from disk geometry, while > > still supporting writeback to an actual backend file. > > Yes, there are many ways to decouple zswap from disk geometry, my swap > table + swap.tiers design can do that as well. I have concerns about > swap virtualization in the aspect of adding another layer of memory > overhead addition per swap entry and CPU overhead of extra xarray > lookup. I believe my approach is technically superior and cleaner. True, but the static nature of the current swapfile infrastructure also imposes an space overhead and/or operational overhead. I did play around with a prototype with a ghost swapfile for virtual swap, but had to stop because of the swapfile overhead for larger virtual swap space. > Both faster and cleaner. Basically swap.tiers + VFS like swap read > write page ops. I will let Nhat clarify the performance and memory That just solves static placement, no? Backend transfer requires something extra/orthogonal. > overhead side of the swap virtualization. > > I am not against swap entry redirection. Just the swap virtualization There will be redirection either way. I don't think it's avoidable. The only option is whether to shove it into the backend (what zram is doing), or having a generalized module (swap virtualization). Or do a page table walk every time you want to do backend transfer (what swapoff is doing). > series needs to compare against the alternatives in terms of memory > overhead and throughput. > Solving it from the swap.tiers angle is cleaner. > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> > > I take that the only relevant part is you are zswap maintainer and I > am the swap maintainer. Fine. I got the message. I will leave the > zswap alone. I will find other ways to address the memory base swap > tiers in swap.tiers. Please keep this discussion technical and not pull ranks unnecessarily. > > Chris
On Sat, Nov 22, 2025 at 10:09 AM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Nov 21, 2025 at 3:40 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Fri, Nov 21, 2025 at 01:31:43AM -0800, Chris Li wrote: > > > The current zswap requires a backing swapfile. The swap slot used > > > by zswap is not able to be used by the swapfile. That waste swapfile > > > space. > > > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > > for zswap. The swapfile header indicate the size of the swapfile. There > > > is no swap data section in the ghost swapfile, therefore, no waste of > > > swapfile space. As such, any write to a ghost swapfile will fail. To > > > prevents accidental read or write of ghost swapfile, bdev of > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > > flag because there is no rotation disk access when using zswap. > > > > Zswap is primarily a compressed cache for real swap on secondary > > storage. It's indeed quite important that entries currently in zswap > > don't occupy disk slots; but for a solution to this to be acceptable, > > it has to work with the primary usecase and support disk writeback. > > Well, my plan is to support the writeback via swap.tiers. That sounds interesting. Have been watching YoungJun and yours swap.tiers discussion for a while, looking forward to see how they play together. Using tiering to resolve the writeback issue sounds like a nice solution, we definitely don't want to limit the writeback to zswap/ram-block only, we will also want things like block-block writeback. We (and I have noticed many community users) have setups involving hybrid tiers. We have a internal module that moves swap entry from SSD to HDD too. To do it upstreamly we need something like the swap.tiers. > > > This direction is a dead-end. Please take a look at Nhat's swap > > virtualization patches. They decouple zswap from disk geometry, while > > still supporting writeback to an actual backend file. > > Yes, there are many ways to decouple zswap from disk geometry, my swap > ... > Solving it from the swap.tiers angle is cleaner. Agree with the swap.tiers part, that sounds cleaner. > > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org> I think that's too early to justify. Let's stay open for ideas.
On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > The current zswap requires a backing swapfile. The swap slot used > by zswap is not able to be used by the swapfile. That waste swapfile > space. > > The ghost swapfile is a swapfile that only contains the swapfile header > for zswap. The swapfile header indicate the size of the swapfile. There > is no swap data section in the ghost swapfile, therefore, no waste of > swapfile space. As such, any write to a ghost swapfile will fail. To > prevents accidental read or write of ghost swapfile, bdev of > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > flag because there is no rotation disk access when using zswap. Would this also affect the swap slot allocation algorithm? > > The zswap write back has been disabled if all swapfiles in the system > are ghost swap files. I don't like this design: 1. Statically sizing the compression tier will be an operational nightmare, for users that have to support a variety (and increasingly bigger sized) types of hosts. It's one of the primary motivations of the virtual swap line of work. We need to move towards a more dynamic architecture for zswap, not the other way around, in order to reduce both (human's) operational overhead, AND actual space overhead (i.e only allocate (z)swap metadata on-demand). 2. This digs us in the hole of supporting a special infrastructure for non-writeback cases. Now every future change to zswap's architecture has to take this into account. It's not easy to turn this design into something that can support writeback - you're stuck with either having to do an expensive page table walk to update the PTEs, or shoving the virtual swap layer inside zswap. Ugly. 3. And what does this even buy us? Just create a fake in-memory-only swapfile (heck, you can use zram), disable writeback (which you can do both at a cgroup and host-level), and call it a day. Nacked-by: Nhat Pham <nphamcs@gmail.com>
On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote: > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote: > > > > The current zswap requires a backing swapfile. The swap slot used > > by zswap is not able to be used by the swapfile. That waste swapfile > > space. > > > > The ghost swapfile is a swapfile that only contains the swapfile header > > for zswap. The swapfile header indicate the size of the swapfile. There > > is no swap data section in the ghost swapfile, therefore, no waste of > > swapfile space. As such, any write to a ghost swapfile will fail. To > > prevents accidental read or write of ghost swapfile, bdev of > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD > > flag because there is no rotation disk access when using zswap. > > Would this also affect the swap slot allocation algorithm? > > > > > The zswap write back has been disabled if all swapfiles in the system > > are ghost swap files. > > I don't like this design: > > 1. Statically sizing the compression tier will be an operational > nightmare, for users that have to support a variety (and increasingly > bigger sized) types of hosts. It's one of the primary motivations of > the virtual swap line of work. We need to move towards a more dynamic > architecture for zswap, not the other way around, in order to reduce > both (human's) operational overhead, AND actual space overhead (i.e > only allocate (z)swap metadata on-demand). Let's do it one step at a time. > 2. This digs us in the hole of supporting a special infrastructure for > non-writeback cases. Now every future change to zswap's architecture > has to take this into account. It's not easy to turn this design into > something that can support writeback - you're stuck with either having > to do an expensive page table walk to update the PTEs, or shoving the > virtual swap layer inside zswap. Ugly. What are you talking about? This patch does not have any page table work. You are opposing something in your imagination. Please show me the code in which I do expensive PTE walks. > 3. And what does this even buy us? Just create a fake in-memory-only > swapfile (heck, you can use zram), disable writeback (which you can do > both at a cgroup and host-level), and call it a day. Well this provides users a choice, if they don't care about write backs. They can do zswap with ghost swapfile now without actually wasting disk space. It also does not stop zswap using write back with normal SSD. If you want to write back, you can still use a non ghost swapfile as normal. It is a simple enough patch to provide value right now. It also fits into the swap.tiers long term roadmap to have a seperate tier for memory based swapfiles. I believe that is a cleaner picture than the current zswap as cache but also gets its hands so deep into the swap stack and slows down other swap tiers. > Nacked-by: Nhat Pham <nphamcs@gmail.com> I heard you, if you don't don't want zswap to have anything to do with memory based swap tier in the swap.tiers design. I respect your choice. Chris
On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > >
> > > The current zswap requires a backing swapfile. The swap slot used
> > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > space.
> > >
> > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > prevents accidental read or write of ghost swapfile, bdev of
> > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > flag because there is no rotation disk access when using zswap.
> >
> > Would this also affect the swap slot allocation algorithm?
> > >
> > > The zswap write back has been disabled if all swapfiles in the system
> > > are ghost swap files.
> >
> > I don't like this design:
> >
> > 1. Statically sizing the compression tier will be an operational
> > nightmare, for users that have to support a variety (and increasingly
> > bigger sized) types of hosts. It's one of the primary motivations of
> > the virtual swap line of work. We need to move towards a more dynamic
> > architecture for zswap, not the other way around, in order to reduce
> > both (human's) operational overhead, AND actual space overhead (i.e
> > only allocate (z)swap metadata on-demand).
>
> Let's do it one step at a time.

I'm happy with landing these patches one step at a time. But from my
POV (and admittedly limited imagination), it's a bit of a dead end.

The only architecture, IMO, that satisfies:

1. Dynamic overhead of (z)swap metadata.

2. Decoupled swap backends, i.e. no pre-reservation of lower tier space
(what zswap is doing right now).

3. Backend transfer without page table walks.

is swap virtualization.

If you want to present an alternative vision, you don't have to
implement it right away, but you have to at least explain to me how to
achieve all three.

>
> > 2. This digs us in the hole of supporting a special infrastructure for
> > non-writeback cases. Now every future change to zswap's architecture
> > has to take this into account. It's not easy to turn this design into
> > something that can support writeback - you're stuck with either having
> > to do an expensive page table walk to update the PTEs, or shoving the
> > virtual swap layer inside zswap. Ugly.
>
> What are you talking about? This patch does not have any page table
> work. You are opposing something in your imagination. Please show me
> the code in which I do expensive PTE walks.

Please read my response again. I did not say you did any PTE walk in this patch.

What I meant was, if you want to make this the general architecture
for zswap and not some niche infrastructure for a specialized use
case, you need to be able to support backend transfer, i.e. zswap
writeback (zswap -> disk swap, and perhaps in the future the other
direction). This will be very expensive with this design.

>
> > 3. And what does this even buy us? Just create a fake in-memory-only
> > swapfile (heck, you can use zram), disable writeback (which you can do
> > both at a cgroup and host-level), and call it a day.
>
> Well this provides users a choice, if they don't care about write
> backs. They can do zswap with ghost swapfile now without actually
> wasting disk space.
>
> It also does not stop zswap using write back with normal SSD. If you
> want to write back, you can still use a non ghost swapfile as normal.
>
> It is a simple enough patch to provide value right now. It also fits
> into the swap.tiers long term roadmap to have a seperate tier for
> memory based swapfiles. I believe that is a cleaner picture than the
> current zswap as cache but also gets its hands so deep into the swap
> stack and slows down other swap tiers.
>
> > Nacked-by: Nhat Pham <nphamcs@gmail.com>
>
> I heard you, if you don't don't want zswap to have anything to do
> with memory based swap tier in the swap.tiers design. I respect your
> choice.

Where does this even come from?

I can't speak for Johannes or Yosry, but personally I'm ambivalent
with respect to swap.tiers. My only objection in the past was there
was not any use case at the time, but there seems to be one now. I
won't stand in the way of swap.tiers landing, or zswap's integration
into it.

From my POV, swap.tiers solves a problem completely orthogonal to
what I'm trying to solve, namely, the three points listed above. It's
about the definition of the swap hierarchy, either at initial
placement time, or during offloading from one backend to another,
whereas I'm trying to figure out the mechanistic side of it (how to
transfer a page from one backend to another without page table
walking). These two are independent, if not synergistic.

> Chris
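As a concrete reference for the "disable writeback at the cgroup level"
alternative mentioned in point 3 of the quoted review, the control that
exists today is the cgroup v2 memory.zswap.writeback file. The sketch below
simply flips it off; the cgroup path /sys/fs/cgroup/workload is an assumed
example, not something defined by this patch:

/*
 * Sketch of the existing per-cgroup control referenced above:
 * writing "0" to memory.zswap.writeback (cgroup v2) keeps that
 * cgroup's compressed pages in zswap instead of letting them be
 * written back to the backing swap device. The cgroup path is an
 * assumed example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *knob = "/sys/fs/cgroup/workload/memory.zswap.writeback";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}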
On Mon, Nov 24, 2025 at 5:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Nov 21, 2025 at 5:52 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Fri, Nov 21, 2025 at 2:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Fri, Nov 21, 2025 at 9:32 AM Chris Li <chrisl@kernel.org> wrote:
> > > >
> > > > The current zswap requires a backing swapfile. The swap slot used
> > > > by zswap is not able to be used by the swapfile. That waste swapfile
> > > > space.
> > > >
> > > > The ghost swapfile is a swapfile that only contains the swapfile header
> > > > for zswap. The swapfile header indicate the size of the swapfile. There
> > > > is no swap data section in the ghost swapfile, therefore, no waste of
> > > > swapfile space. As such, any write to a ghost swapfile will fail. To
> > > > prevents accidental read or write of ghost swapfile, bdev of
> > > > swap_info_struct is set to NULL. Ghost swapfile will also set the SSD
> > > > flag because there is no rotation disk access when using zswap.
> > >
> > > Would this also affect the swap slot allocation algorithm?
> > > >
> > > > The zswap write back has been disabled if all swapfiles in the system
> > > > are ghost swap files.
> > >
> > > I don't like this design:
> > >
> > > 1. Statically sizing the compression tier will be an operational
> > > nightmare, for users that have to support a variety (and increasingly
> > > bigger sized) types of hosts. It's one of the primary motivations of
> > > the virtual swap line of work. We need to move towards a more dynamic
> > > architecture for zswap, not the other way around, in order to reduce
> > > both (human's) operational overhead, AND actual space overhead (i.e
> > > only allocate (z)swap metadata on-demand).
> >
> > Let's do it one step at a time.
>
> I'm happy with landing these patches one step at a time. But from my
> POV (and admittedly limited imagination), it's a bit of a deadend.
>
> The only architecture, IMO, that satisfies:
>
> 1. Dynamic overhead of (z)swap metadata.
>
> 2. Decouple swap backends, i.e no pre-reservation of lower tiers space
> (what zswap is doing right now).
>
> 3. Backend transfer without page table walks.
>
> is swap virtualization.
>
> If you want to present an alternative vision, you don't have to
> implement it right away, but you have to at least explain to me how to
> achieve all these 3.

Going from 1, 2 and 3 to SV being the only solution is a big jump. How
many possibilities have you explored to conclude that no other
solution can satisfy your 1, 2 and 3? I just replied to Rik's email
about the high level sketch design. My design should satisfy them and
can serve as one counterexample of an alternative design.

> >
> > > 2. This digs us in the hole of supporting a special infrastructure for
> > > non-writeback cases. Now every future change to zswap's architecture
> > > has to take this into account. It's not easy to turn this design into
> > > something that can support writeback - you're stuck with either having
> > > to do an expensive page table walk to update the PTEs, or shoving the
> > > virtual swap layer inside zswap. Ugly.
> >
> > What are you talking about? This patch does not have any page table
> > work. You are opposing something in your imagination. Please show me
> > the code in which I do expensive PTE walks.
>
> Please read my response again. I did not say you did any PTE walk in this patch.
>
> What I meant was, if you want to make this the general architecture
> for zswap and not some niche infrastructure for specialized use case,
> you need to be able to support backend transfer, i.e zswap writeback
> (zswap -> disk swap, and perhaps in the future the other direction).
> This will be very expensive with this design.

I can't say I agree with you. It seems you have made a lot of
assumptions in your reasoning.

> > > 3. And what does this even buy us? Just create a fake in-memory-only
> > > swapfile (heck, you can use zram), disable writeback (which you can do
> > > both at a cgroup and host-level), and call it a day.
> >
> > Well this provides users a choice, if they don't care about write
> > backs. They can do zswap with ghost swapfile now without actually
> > wasting disk space.
> >
> > It also does not stop zswap using write back with normal SSD. If you
> > want to write back, you can still use a non ghost swapfile as normal.
> >
> > It is a simple enough patch to provide value right now. It also fits
> > into the swap.tiers long term roadmap to have a seperate tier for
> > memory based swapfiles. I believe that is a cleaner picture than the
> > current zswap as cache but also gets its hands so deep into the swap
> > stack and slows down other swap tiers.
> >
> > > Nacked-by: Nhat Pham <nphamcs@gmail.com>
> >
> > I heard you, if you don't don't want zswap to have anything to do
> > with memory based swap tier in the swap.tiers design. I respect your
> > choice.
>
> Where does this even come from?
>
> I can't speak for Johannes or Yosry, but personally I'm ambivalent
> with respect to swap.tiers. My only objection in the past was there
> was not any use case at a time, but there seems to be one now. I won't
> stand in the way of swap.tiers landing, or zswap's integration into
> it.
>
> From my POV, swap.tiers solve a problem completely orthogonal to what
> I'm trying to solve, namely, the three points listed above. It's about
> definition of swap hierarchy, either at initial placement time, or
> during offloading from one backend to another, where as I'm trying to
> figure out the mechanistic side of it (how to transfer a page from one
> backend to another without page table walking). These two are
> independent, if not synergistic.

I think our goals overlap, just with a different approach and
different performance characteristics. I have asked in this thread a
few times: how big is the per-swap-slot memory overhead that VS
introduces? That is something that I care about a lot.

Chris