[PATCH] btrfs: index buffer_tree using node size

Daniel Vacek posted 1 patch 9 months ago
There is a newer version of this series
[PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 9 months ago
So far we have been deriving the buffer tree index using the sector size. But
each extent buffer covers multiple sectors, which makes the buffer tree rather
sparse.

For example, the typical and quite common configuration uses a sector size of
4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
of its slots. In other words, at least 75% of the tree slots are wasted as
they can never be used.

We can achieve significant memory savings on the required tree nodes by
indexing the tree using the node size instead. As a result far fewer slots
are wasted and the tree can now use up to 100% of its slots.
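
For illustration, a minimal userspace sketch of the index math under that
4KiB/16KiB configuration (not part of the patch; the constants are made up
for this example):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const int sectorsize_bits = 12;  /* 4KiB sectors */
            const int node_bits = 14;        /* 16KiB nodes */

            /* Three consecutive tree blocks, one nodesize apart. */
            for (int i = 0; i < 3; i++) {
                    uint64_t start = (uint64_t)i << node_bits;

                    /* Sector indexing yields 0, 4, 8 (3 of every 4 slots
                     * can never be used); node indexing yields 0, 1, 2. */
                    printf("eb %#jx: sector index %ju, node index %ju\n",
                           (uintmax_t)start,
                           (uintmax_t)(start >> sectorsize_bits),
                           (uintmax_t)(start >> node_bits));
            }
            return 0;
    }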

Signed-off-by: Daniel Vacek <neelx@suse.com>
---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
 fs/btrfs/fs.h        |  3 ++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5bcf11246ba66..dcea5b0a2db50 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3395,6 +3395,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	fs_info->nodesize = nodesize;
+	fs_info->node_bits = ilog2(nodesize);
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4d3584790cf7f..80a8563a25add 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
 	 */
 	spin_lock(&eb->refs_lock);
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
 		unsigned long flags;
 
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
 	rcu_read_lock();
 	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
 		if (!eb_batch_add(batch, eb)) {
-			*start = (eb->start + eb->len) >> fs_info->sectorsize_bits;
+			*start = (eb->start + eb->len) >> fs_info->node_bits;
 			goto out;
 		}
 	}
@@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
 		struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct extent_buffer *eb;
-	unsigned long index = start >> fs_info->sectorsize_bits;
+	unsigned long index = start >> fs_info->node_bits;
 
 	rcu_read_lock();
 	eb = xa_load(&fs_info->buffer_tree, index);
@@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
 				      u64 end)
 {
 	struct eb_batch batch;
-	unsigned long start_index = start >> fs_info->sectorsize_bits;
-	unsigned long end_index = end >> fs_info->sectorsize_bits;
+	unsigned long start_index = start >> fs_info->node_bits;
+	unsigned long end_index = end >> fs_info->node_bits;
 
 	eb_batch_init(&batch);
 	while (start_index <= end_index) {
@@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 
 	eb_batch_init(&batch);
 	if (wbc->range_cyclic) {
-		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits;
+		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->node_bits;
 		end = -1;
 
 		/*
@@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
 		 */
 		scanned = (index == 0);
 	} else {
-		index = wbc->range_start >> fs_info->sectorsize_bits;
-		end = wbc->range_end >> fs_info->sectorsize_bits;
+		index = wbc->range_start >> fs_info->node_bits;
+		end = wbc->range_end >> fs_info->node_bits;
 
 		scanned = 1;
 	}
@@ -3037,7 +3037,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 	eb->fs_info = fs_info;
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
-	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->node_bits,
 			      NULL, eb, GFP_NOFS);
 	if (xa_is_err(exists)) {
 		ret = xa_err(exists);
@@ -3353,7 +3353,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
 	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
-				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   start >> fs_info->node_bits, NULL, eb,
 				   GFP_NOFS);
 	if (xa_is_err(existing_eb)) {
 		ret = xa_err(existing_eb);
@@ -3456,7 +3456,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 		 * in this case.
 		 */
 		xa_cmpxchg_irq(&fs_info->buffer_tree,
-			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+			       eb->start >> fs_info->node_bits, eb, NULL,
 			       GFP_ATOMIC);
 
 		btrfs_leak_debug_del_eb(eb);
@@ -4294,9 +4294,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 	struct extent_buffer *eb;
-	unsigned long start = folio_pos(folio) >> fs_info->sectorsize_bits;
+	unsigned long start = folio_pos(folio) >> fs_info->node_bits;
 	unsigned long index = start;
-	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
+	unsigned long end = index + (PAGE_SIZE >> fs_info->node_bits) - 1;
 	int ret;
 
 	xa_lock_irq(&fs_info->buffer_tree);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index cf805b4032af3..8c9113304fabe 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -778,8 +778,9 @@ struct btrfs_fs_info {
 
 	struct btrfs_delayed_root *delayed_root;
 
-	/* Entries are eb->start / sectorsize */
+	/* Entries are eb->start >> node_bits */
 	struct xarray buffer_tree;
+	int node_bits;
 
 	/* Next backup root to be overwritten */
 	int backup_root_index;
-- 
2.47.2
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by David Sterba 8 months ago
On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> So far we have been deriving the buffer tree index using the sector size. But
> each extent buffer covers multiple sectors, which makes the buffer tree rather
> sparse.
> 
> For example, the typical and quite common configuration uses a sector size of
> 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> of its slots. In other words, at least 75% of the tree slots are wasted as
> they can never be used.
> 
> We can achieve significant memory savings on the required tree nodes by
> indexing the tree using the node size instead. As a result far fewer slots
> are wasted and the tree can now use up to 100% of its slots.
> 
> Signed-off-by: Daniel Vacek <neelx@suse.com>

This is a useful improvement, so we should continue and merge it. Some
performance measurements should be done so we get an idea of the runtime
and slab savings.

One coding comment: please rename node_bits to nodesize_bits so it's
consistent with sectorsize and sectorsize_bits.

>  fs/btrfs/disk-io.c   |  1 +
>  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
>  fs/btrfs/fs.h        |  3 ++-
>  3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4294,9 +4294,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
>  {
>  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>  	struct extent_buffer *eb;
> -	unsigned long start = folio_pos(folio) >> fs_info->sectorsize_bits;
> +	unsigned long start = folio_pos(folio) >> fs_info->node_bits;
>  	unsigned long index = start;
> -	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> +	unsigned long end = index + (PAGE_SIZE >> fs_info->node_bits) - 1;

This looks a bit suspicious: page size is 4k while node size can be
4k .. 64k. It's in subpage code so sector < page, meaning the old
shift result was always > 0. The node can be larger than the page,
so the new shift result would be 0, as a result of 4k shifted by "16k".

>  	int ret;
>  
>  	xa_lock_irq(&fs_info->buffer_tree);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index cf805b4032af3..8c9113304fabe 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -778,8 +778,9 @@ struct btrfs_fs_info {
>  
>  	struct btrfs_delayed_root *delayed_root;
>  
> -	/* Entries are eb->start / sectorsize */
> +	/* Entries are eb->start >> node_bits */
>  	struct xarray buffer_tree;
> +	int node_bits;

u32, and please move it next to nodesize.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 8 months ago
On Fri, 6 Jun 2025 at 19:28, David Sterba <dsterba@suse.cz> wrote:
>
> On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> > So far we have been deriving the buffer tree index using the sector size. But
> > each extent buffer covers multiple sectors, which makes the buffer tree rather
> > sparse.
> >
> > For example, the typical and quite common configuration uses a sector size of
> > 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> > of its slots. In other words, at least 75% of the tree slots are wasted as
> > they can never be used.
> >
> > We can achieve significant memory savings on the required tree nodes by
> > indexing the tree using the node size instead. As a result far fewer slots
> > are wasted and the tree can now use up to 100% of its slots.
> >
> > Signed-off-by: Daniel Vacek <neelx@suse.com>
>
> This is a useful improvement, so we should continue and merge it. The
> performance improvements should be done so we get some idea. Runtime and
> slab savings.

Performance-wise this fix is not that significant. In my testing I did
not notice any change in runtime performance.
Slab usage is _relatively_ significant though - about 2/3 of the btrfs
radix_tree_node objects are saved. Though in _absolute_ numbers
system-wide it's still within the level of noise. Without this fix
btrfs uses a bit over 1% of the radix_tree_node slab cache, while with
the fix it falls below half a percent, from what I was able to pull
from memory.

> One coding comment: please rename node_bits to nodesize_bits so it's
> consistent with sectorsize and sectorsize_bits.

Right.

> >  fs/btrfs/disk-io.c   |  1 +
> >  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
> >  fs/btrfs/fs.h        |  3 ++-
> >  3 files changed, 18 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -4294,9 +4294,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
> >  {
> >       struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
> >       struct extent_buffer *eb;
> > -     unsigned long start = folio_pos(folio) >> fs_info->sectorsize_bits;
> > +     unsigned long start = folio_pos(folio) >> fs_info->node_bits;
> >       unsigned long index = start;
> > -     unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> > +     unsigned long end = index + (PAGE_SIZE >> fs_info->node_bits) - 1;
>
> This looks a bit suspicious: page size is 4k while node size can be
> 4k .. 64k. It's in subpage code so sector < page, meaning the old
> shift result was always > 0. The node can be larger than the page,
> so the new shift result would be 0, as a result of 4k shifted by "16k".

This comes from commit 19d7f65f032f ("btrfs: convert the buffer_radix
to an xarray").
You can still have a 64 KiB page with a 16 KiB node size. So this still
looks sound to me.
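
Just to illustrate the math with those numbers, a userspace sketch of the
index range computed in try_release_subpage_extent_buffer(), assuming a
64 KiB page and 16 KiB nodes (toy values):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const int node_bits = 14;               /* 16 KiB nodes */
            const uint64_t page_size = 64 * 1024;   /* 64 KiB pages */
            const uint64_t folio_pos = 128 * 1024;  /* an example folio position */

            uint64_t start = folio_pos >> node_bits;
            uint64_t end = start + (page_size >> node_bits) - 1;

            /* One folio covers 4 node-granular slots here: [8, 11]. */
            printf("index range [%ju, %ju]\n", (uintmax_t)start, (uintmax_t)end);
            return 0;
    }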

> >       int ret;
> >
> >       xa_lock_irq(&fs_info->buffer_tree);
> > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> > index cf805b4032af3..8c9113304fabe 100644
> > --- a/fs/btrfs/fs.h
> > +++ b/fs/btrfs/fs.h
> > @@ -778,8 +778,9 @@ struct btrfs_fs_info {
> >
> >       struct btrfs_delayed_root *delayed_root;
> >
> > -     /* Entries are eb->start / sectorsize */
> > +     /* Entries are eb->start >> node_bits */
> >       struct xarray buffer_tree;
> > +     int node_bits;
>
> > u32, and please move it next to nodesize.

Sure.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Filipe Manana 8 months, 4 weeks ago
On Mon, May 12, 2025 at 6:24 PM Daniel Vacek <neelx@suse.com> wrote:
>
> So far we have been deriving the buffer tree index using the sector size. But
> each extent buffer covers multiple sectors, which makes the buffer tree rather
> sparse.
>
> For example, the typical and quite common configuration uses a sector size of
> 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> of its slots. In other words, at least 75% of the tree slots are wasted as
> they can never be used.
>
> We can achieve significant memory savings on the required tree nodes by
> indexing the tree using the node size instead. As a result far fewer slots
> are wasted and the tree can now use up to 100% of its slots.

Did you do any benchmarks? What results did you get?

If we could get such huge space gains in practice and waste so much
less memory, you should observe faster operations on the buffer tree
(xarray) as well, as we would get a smaller data structure with fewer
nodes.

However, while this is a logically sound thing to do, in practice we
will always get many unused slots per xarray node, because:

1) We don't keep all extent buffers in memory all the time - some are
less frequently used and get evicted from memory after a while or
rarely get loaded in the first place; not all parts of a btree are
equally "hot";

2) It's a COW filesystem and metadata is always COWed, so you get
extent buffers allocated all over the place with big gaps between
them, in different block groups, etc.

And what about memory usage, did you see any significant reduction for
some workload? What was the reduction, what was the workload?

Xarray uses a kmem_cache to allocate nodes, so if we get such huge
gains as the change log claims, we should see a reduction by
monitoring files inside /sys/kernel/slab/radix_tree_node, like the
"objects" and "total_objects" files, which tell us the total number
of allocated xarray nodes and how many are in use. This is for all
xarrays, so many will not belong to the buffer tree or even btrfs, but
it should still be a very noticeable reduction on a workload that is
heavy on metadata, like fs_mark with empty files creating a large fs
tree (a few gigabytes at least).
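
For example, a trivial sampler for those counters (a rough sketch) could
be run before and after the workload and the numbers diffed:

    #include <stdio.h>

    static long read_slab_stat(const char *name)
    {
            char path[256];
            long val = -1;
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/kernel/slab/radix_tree_node/%s", name);
            f = fopen(path, "r");
            if (f) {
                    if (fscanf(f, "%ld", &val) != 1)
                            val = -1;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            /* "objects" = nodes in use, "total_objects" = nodes allocated. */
            printf("radix_tree_node: %ld in use / %ld allocated\n",
                   read_slab_stat("objects"), read_slab_stat("total_objects"));
            return 0;
    }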

Thanks.


Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 8 months, 4 weeks ago
On Tue, 13 May 2025 at 12:12, Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Mon, May 12, 2025 at 6:24 PM Daniel Vacek <neelx@suse.com> wrote:
> >
> > So far we have been deriving the buffer tree index using the sector size. But
> > each extent buffer covers multiple sectors, which makes the buffer tree rather
> > sparse.
> >
> > For example, the typical and quite common configuration uses a sector size of
> > 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> > of its slots. In other words, at least 75% of the tree slots are wasted as
> > they can never be used.
> >
> > We can achieve significant memory savings on the required tree nodes by
> > indexing the tree using the node size instead. As a result far fewer slots
> > are wasted and the tree can now use up to 100% of its slots.
>
> Did you do any benchmarks? What results did you get?

Not really benchmarks. I just ran fstests to make sure nothing fails.

> If we could get such huge space gains in practice and waste so much
> less memory, you should observe faster operations on the buffer tree
> (xarray) as well, as we would get a smaller data structure with fewer
> nodes.

That's precisely the point, to save memory with fewer nodes. I don't
expect to see any faster operations, really. I believe the radix tree
is quite efficient in the kernel. And tree operations are in general
O(log(n)), so a bigger tree does not have that much higher overhead.
But this reduces the tree height by one level at most. More likely not
even that. I still believe the tree operations overhead is well below
the threshold of noise for fs operations. You would really have to
microbenchmark just the tree operations themselves to see the
difference.

But I can still be pleasantly surprised.

> However, while this is a logically sound thing to do, in practice we
> will always get many unused slots per xarray node, because:
>
> 1) We don't keep all extent buffers in memory all the time - some are
> less frequently used and get evicted from memory after a while or
> rarely get loaded in the first place; not all parts of a btree are
> equally "hot";

No questions about that. This part does not change.

> 2) It's a COW filesystem and metadata is always COWed, so you get
> extent buffers allocated all over the place with big gaps between
> them, in different block groups, etc.

Yes, and sparse leaf nodes are actually expected and perfectly fine.
Still, there is no need to be more sparse than necessary. Wasted slots
are wasted slots. Some leaf nodes are already full with 16 ebs where
64 could have fit.
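
To spell out where the 16 vs 64 comes from, a worked sketch (assuming the
usual 64-slot xarray nodes, i.e. XA_CHUNK_SHIFT of 6, and the 4K/16K config):

    #include <stdio.h>

    int main(void)
    {
            const int slots_per_leaf = 64;          /* 1 << XA_CHUNK_SHIFT */
            const int sectors_per_node = 16384 / 4096;

            /* Sector-granular indexing leaves only every 4th slot usable. */
            printf("max ebs per leaf, sector-indexed: %d\n",
                   slots_per_leaf / sectors_per_node);  /* 16 */
            printf("max ebs per leaf, node-indexed:   %d\n",
                   slots_per_leaf);                     /* 64 */
            return 0;
    }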

> And what about memory usage, did you see any significant reduction for
> some workload? What was the reduction, what was the workload?

I did not really check, to be honest. But since you're asking...

On my laptop with an uptime of two months and a handful of sleep
cycles, with a workload of about what you'd run on your workstation (a
couple of VMs, many FF tabs, many terminals and a live crash session),
mem stats from `top` below, 1T NVMe, there are 20610 ebs and 6455 tree
leaves as I type at the moment (but I bet it was similar yesterday or
a week ago). The highest tree index is 4341c9c, which means 5 levels
(with the patch it would still be 5 levels, just the width would be
1/4 as the index would become 10d0727). That FS (/home) has 13k of the
ebs and 4150 of the leaf nodes (/ is 7600 ebs / 2300 leaves and /boot
has 10/5 allocated in memory), so you can see the tree is still fairly
sparse as there are far fewer ebs than available slots. Not even 3.2
ebs in one leaf on average.
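
The level counts can be double-checked like this (a toy sketch, assuming
the usual 64-way xarray fanout, XA_CHUNK_SHIFT of 6):

    #include <stdio.h>

    static int xa_levels(unsigned long max_index)
    {
            const int shift = 6;    /* XA_CHUNK_SHIFT: 64 slots per node */
            int levels = 1;

            while (max_index >> (levels * shift))
                    levels++;
            return levels;
    }

    int main(void)
    {
            printf("0x4341c9c -> %d levels\n", xa_levels(0x4341c9c));  /* 5 */
            printf("0x10d0727 -> %d levels\n", xa_levels(0x10d0727));  /* still 5 */
            return 0;
    }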

MiB Mem :  58470,3 total,   6035,4 free,  34408,7 used,  19525,8 buff/cache
MiB Swap:  32768,0 total,  15929,4 free,  16838,6 used.  24061,6 avail Mem

But I believe there are also huge parts of the tree that are not even
allocated. For example, no tree nodes are needed between indices 41ccc
and 3cc1d88 as there are no ebs allocated in that range. I did not
check, but I guess the hole actually begins with the end of one bg and
ends with the start of another bg??

With the patch I guess you could expect something like 4k leaf nodes
instead of the 6.5k I see now. I estimate something like 30-40% savings
instead of the theoretical 75% if the ebs were fully present. Well,
that's still not bad.

On the other hand, if you check the absolute numbers, that's roughly
1.5 megs saved on a 64 gig machine. And that's provided the slabs
were released, but see below...

Now even though it looks like a huge win if you focus only on btrfs,
compared to the rest of the system it's almost insignificant:

> crash> kmem -s radix_tree_node
> CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
> ffff9eecc0045b00      576     464807    542899  19393    16k  radix_tree_node

I guess most of the tree nodes are used in memory mappings, vmas and
stuff, and inode mappings to the page cache.

If btrfs is using 6.5k tree nodes (just for the leaves, to be fair;
there will be some more for inner nodes) out of the total of 0.5M nodes
in flight system-wide, that's something over 1%, maybe 2%. This patch
can reduce it by something like 0.5%, being generous. Well, I can take it.

So yeah, we're wasting some memory and that can be fixed. But we're
far from being a significant user in the kernel. Just to put it in
perspective.

Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Qu Wenruo 9 months ago

On 2025/5/13 02:53, Daniel Vacek wrote:
> So far we have been deriving the buffer tree index using the sector size. But
> each extent buffer covers multiple sectors, which makes the buffer tree rather
> sparse.
> 
> For example, the typical and quite common configuration uses a sector size of
> 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> of its slots. In other words, at least 75% of the tree slots are wasted as
> they can never be used.
> 
> We can achieve significant memory savings on the required tree nodes by
> indexing the tree using the node size instead. As a result far fewer slots
> are wasted and the tree can now use up to 100% of its slots.
> 
> Signed-off-by: Daniel Vacek <neelx@suse.com>

I'm fine with this change, but I still believe we should address
unaligned tree blocks before doing this.

As this requires all tree blocks to be nodesize aligned.

If we have some metadata chunks which start at a bytenr that is not
nodesize aligned, all tree blocks inside those chunks will not be
nodesize aligned, causing problems.

Thanks,
Qu

Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 9 months ago
On Tue, 13 May 2025 at 00:44, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> On 2025/5/13 02:53, Daniel Vacek wrote:
> > So far we have been deriving the buffer tree index using the sector size. But
> > each extent buffer covers multiple sectors, which makes the buffer tree rather
> > sparse.
> >
> > For example, the typical and quite common configuration uses a sector size of
> > 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
> > of its slots. In other words, at least 75% of the tree slots are wasted as
> > they can never be used.
> >
> > We can achieve significant memory savings on the required tree nodes by
> > indexing the tree using the node size instead. As a result far fewer slots
> > are wasted and the tree can now use up to 100% of its slots.
> >
> > Signed-off-by: Daniel Vacek <neelx@suse.com>
>
> I'm fine with this change, but I still believe we should address
> unaligned tree blocks before doing this.

Yeah, we discussed it on chat. But giving it another thought I
realized that the ebs don't overlap each other. Well, I think they do
not, right?

That means they are always at least nodesize apart. So no matter the
alignment/offset, each one always falls into a dedicated tree slot. It
can never happen that two neighboring ebs would fall into the same
slot. Hence I think we're safe in this regard.
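
A quick sketch of that argument with deliberately unaligned starts (toy
values, 16 KiB nodes):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const int node_bits = 14;   /* 16 KiB nodes */
            const uint64_t off = 4096;  /* deliberately unaligned base */

            /* Neighboring ebs are a full nodesize apart, so their
             * start >> node_bits values always differ -- this prints
             * indices 0, 1, 2, 3 despite the unaligned starts. */
            for (int i = 0; i < 4; i++) {
                    uint64_t start = off + ((uint64_t)i << node_bits);

                    printf("eb at %#jx -> index %ju\n",
                           (uintmax_t)start, (uintmax_t)(start >> node_bits));
            }
            return 0;
    }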

> As this requires all tree blocks to be nodesize aligned.

Does it? I don't think that's correct. Am I missing something?

> If we have some metadata chunks which start at a bytenr that is not
> nodesize aligned, all tree blocks inside those chunks will not be
> nodesize aligned, causing problems.

As explained above, I don't really see any problems. But then again, I
may be missing something. Let me know, please.

Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Qu Wenruo 9 months ago

On 2025/5/13 17:35, Daniel Vacek wrote:
> On Tue, 13 May 2025 at 00:44, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>
>>
>>
>> On 2025/5/13 02:53, Daniel Vacek wrote:
>>> So far we have been deriving the buffer tree index using the sector size. But
>>> each extent buffer covers multiple sectors, which makes the buffer tree rather
>>> sparse.
>>>
>>> For example, the typical and quite common configuration uses a sector size of
>>> 4KiB and a node size of 16KiB. In this case the buffer tree uses at most 25%
>>> of its slots. In other words, at least 75% of the tree slots are wasted as
>>> they can never be used.
>>>
>>> We can achieve significant memory savings on the required tree nodes by
>>> indexing the tree using the node size instead. As a result far fewer slots
>>> are wasted and the tree can now use up to 100% of its slots.
>>>
>>> Signed-off-by: Daniel Vacek <neelx@suse.com>
>>
>> I'm fine with this change, but I still believe we should address
>> unaligned tree blocks before doing this.
> 
> Yeah, we discussed it on chat. But giving it another thought I
> realized that the ebs don't overlap each other. Well, I think they do
> not, right?
> 
> That means they are always at least nodesize apart. So no matter the
> alignment/offset, each one always falls into a dedicated tree slot. It
> can never happen that two neighboring ebs would fall into the same
> slot. Hence I think we're safe in this regard.

Oh, this is indeed a brilliant trick/hack.

Yes, even with unaligned tree blocks, we can still get a unique index
by doing eb->start >> nodesize_shift.

Mind adding this part to the commit message?

In that case, it looks good to me.

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

>>>        } else {
>>> -             index = wbc->range_start >> fs_info->sectorsize_bits;
>>> -             end = wbc->range_end >> fs_info->sectorsize_bits;
>>> +             index = wbc->range_start >> fs_info->node_bits;
>>> +             end = wbc->range_end >> fs_info->node_bits;
>>>
>>>                scanned = 1;
>>>        }
>>> @@ -3037,7 +3037,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>>>        eb->fs_info = fs_info;
>>>    again:
>>>        xa_lock_irq(&fs_info->buffer_tree);
>>> -     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
>>> +     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->node_bits,
>>>                              NULL, eb, GFP_NOFS);
>>>        if (xa_is_err(exists)) {
>>>                ret = xa_err(exists);
>>> @@ -3353,7 +3353,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>>    again:
>>>        xa_lock_irq(&fs_info->buffer_tree);
>>>        existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
>>> -                                start >> fs_info->sectorsize_bits, NULL, eb,
>>> +                                start >> fs_info->node_bits, NULL, eb,
>>>                                   GFP_NOFS);
>>>        if (xa_is_err(existing_eb)) {
>>>                ret = xa_err(existing_eb);
>>> @@ -3456,7 +3456,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
>>>                 * in this case.
>>>                 */
>>>                xa_cmpxchg_irq(&fs_info->buffer_tree,
>>> -                            eb->start >> fs_info->sectorsize_bits, eb, NULL,
>>> +                            eb->start >> fs_info->node_bits, eb, NULL,
>>>                               GFP_ATOMIC);
>>>
>>>                btrfs_leak_debug_del_eb(eb);
>>> @@ -4294,9 +4294,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
>>>    {
>>>        struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>>>        struct extent_buffer *eb;
>>> -     unsigned long start = folio_pos(folio) >> fs_info->sectorsize_bits;
>>> +     unsigned long start = folio_pos(folio) >> fs_info->node_bits;
>>>        unsigned long index = start;
>>> -     unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
>>> +     unsigned long end = index + (PAGE_SIZE >> fs_info->node_bits) - 1;
>>>        int ret;
>>>
>>>        xa_lock_irq(&fs_info->buffer_tree);
>>> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
>>> index cf805b4032af3..8c9113304fabe 100644
>>> --- a/fs/btrfs/fs.h
>>> +++ b/fs/btrfs/fs.h
>>> @@ -778,8 +778,9 @@ struct btrfs_fs_info {
>>>
>>>        struct btrfs_delayed_root *delayed_root;
>>>
>>> -     /* Entries are eb->start / sectorsize */
>>> +     /* Entries are eb->start >> node_bits */
>>>        struct xarray buffer_tree;
>>> +     int node_bits;
>>>
>>>        /* Next backup root to be overwritten */
>>>        int backup_root_index;
>>
> 
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by David Sterba 9 months ago
On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> So far we are deriving the buffer tree index using the sector size. But each
> extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> 
> For example the typical and quite common configuration uses sector size of 4KiB
> and node size of 16KiB. In this case it means the buffer tree is using up to
> the maximum of 25% of its slots. Or in other words at least 75% of the tree
> slots are wasted as never used.
> 
> We can score significant memory savings on the required tree nodes by indexing
> the tree using the node size instead. As a result far less slots are wasted
> and the tree can now use up to all 100% of its slots this way.

This looks interesting. Is there a way to get xarray stats? I don't see
anything in the public API, e.g. depth, fanout, slack per level. For
debugging purposes we could put it in sysfs or a syslog message, or
eventually as non-debugging output in commit_stats.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 9 months ago
On Mon, 12 May 2025 at 22:20, David Sterba <dsterba@suse.cz> wrote:
>
> On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> > So far we are deriving the buffer tree index using the sector size. But each
> > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> >
> > For example the typical and quite common configuration uses sector size of 4KiB
> > and node size of 16KiB. In this case it means the buffer tree is using up to
> > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > slots are wasted as never used.
> >
> > We can score significant memory savings on the required tree nodes by indexing
> > the tree using the node size instead. As a result far less slots are wasted
> > and the tree can now use up to all 100% of its slots this way.
>
> This looks interesting. Is there a way to get xarray stats? I don't see
> anything in the public API, e.g. depth, fanout, slack per level. For
> debugging purposes we can put it to sysfs or as syslog message,
> eventually as non-debugging output to commit_stats.

I'm using a python script in crash (even live on my laptop). I believe
you could do the same in dragon. Though that's not the runtime stats
you described. And I don't really think it's worth it.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by David Sterba 8 months, 4 weeks ago
On Tue, May 13, 2025 at 09:56:19AM +0200, Daniel Vacek wrote:
> On Mon, 12 May 2025 at 22:20, David Sterba <dsterba@suse.cz> wrote:
> >
> > On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> > > So far we are deriving the buffer tree index using the sector size. But each
> > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > >
> > > For example the typical and quite common configuration uses sector size of 4KiB
> > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > > slots are wasted as never used.
> > >
> > > We can score significant memory savings on the required tree nodes by indexing
> > > the tree using the node size instead. As a result far less slots are wasted
> > > and the tree can now use up to all 100% of its slots this way.
> >
> > This looks interesting. Is there a way to get xarray stats? I don't see
> > anything in the public API, e.g. depth, fanout, slack per level. For
> > debugging purposes we can put it to sysfs or as syslog message,
> > eventually as non-debugging output to commit_stats.
> 
> I'm using a python script in crash (even live on my laptop). I believe
> you could do the same in dragon. Though that's not the runtime stats
> you described. And I don't really think it's worth it.

How come you don't think it's worth it? You claim some numbers and we
don't have a way to verify them or to gather them on various setups or
workloads. I'd also be interested in the numbers to better understand
how xarray performs with the extent buffers, but I don't know how to
write the analysis scripts in any of the tools, nor have time for that.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 8 months, 4 weeks ago
On Tue, 13 May 2025 at 19:10, David Sterba <dsterba@suse.cz> wrote:
>
> On Tue, May 13, 2025 at 09:56:19AM +0200, Daniel Vacek wrote:
> > On Mon, 12 May 2025 at 22:20, David Sterba <dsterba@suse.cz> wrote:
> > >
> > > On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> > > > So far we are deriving the buffer tree index using the sector size. But each
> > > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > > >
> > > > For example the typical and quite common configuration uses sector size of 4KiB
> > > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > > > slots are wasted as never used.
> > > >
> > > > We can score significant memory savings on the required tree nodes by indexing
> > > > the tree using the node size instead. As a result far less slots are wasted
> > > > and the tree can now use up to all 100% of its slots this way.
> > >
> > > This looks interesting. Is there a way to get xarray stats? I don't see
> > > anything in the public API, e.g. depth, fanout, slack per level. For
> > > debugging purposes we can put it to sysfs or as syslog message,
> > > eventually as non-debugging output to commit_stats.
> >
> > I'm using a python script in crash (even live on my laptop). I believe
> > you could do the same in dragon. Though that's not the runtime stats
> > you described. And I don't really think it's worth it.
>
> How come you don't think it's worth it? You claim some numbers and we
> don't have a way to verify that or gather on various setups or
> workloads. I'd be interested in the numbers also to better understand
> how xarray performs with the extent buffers but I don't know how to write
> the analysis scripts in any of the tools, nor have time for that.

Well, xarray behaves as a generic radix tree. That's basic theory. It
is just a tool, and a tool you know: you know how it works, what it is
good for and what to expect from it. That's why you decided to use it,
right?

I only claim that we're currently limited to using 16 slots per node
(mistakenly skipping 3 out of every 4 slots, since the last 2 bits of
the index are never used) and that this can be fixed quite simply by
indexing correctly. Basically we're not using the tool correctly at
the moment and so we're using it very inefficiently. I believe you
can see that. So far so good, right?
No stats would explain it better than this. This is already the best
information you can get to describe and understand the issue, IMO.

In my experience, more stats can sometimes be misleading or
misinterpreted, even by kernel engineers. Sometimes the stats can be
confusing or simply not relevant. So it's better to focus on the
right facts for the given issue.

And I also believe there is a reason xarray does not provide or even
keep any stats. What for? Would users need them? Would they benefit?
Do you care about any tree stats on your cell phone? I rather doubt
it. The kernel would be doing additional work just to keep stats that
nobody uses. That would be inefficient, useless overhead.
Perhaps it could be an optional feature for debug builds only, maybe
useful during development? But even then, you already know you need
to use this tool. And if you don't implement it from scratch but
rather use the one the kernel provides, you also know it already
works the best it can. In the end it's just a plain radix tree.

Looking at it the other way around, the fact that xarray does not
provide any stats perhaps means no one has really needed them so far.
It wouldn't be that hard to implement if someone really needed it.
So why is it not there already? Maybe because it would just be
additional overhead which has not seemed justified so far, considering
the feature is still not implemented.
And IMO, the very fact that it's still not implemented is a good hint
that it does not make much sense to check.
Either it's the right tool for you no matter how the stats would end
up, or you need a different tool.

Now if you'd like some numbers to refresh how radix trees behave in
general, I can show the numbers I pulled yesterday for Filipe, as he
was also curious. I'm going to be rather brief below; for further
details check my other email.

So for a radix tree, the height as well as the width of the tree are
fully determined by the maximal index. The one implemented in the
kernel grows and shrinks accordingly. All the leaves are on the
same (last) level. In a sense the leaves are kinda 'flat'.
Checking my /home fs on the laptop I'm now writing from, the max
buffer_radix index at the moment was 0x4341c9c, which means 5 levels.
Xarray uses radix 64, which means each 6 bits of index need a new tree
level. Now, since we're filling the last 2 bits of the index with
zeroes (well, depending on the eb->start offset/alignment maybe some
other constant, but still a constant) we're effectively using only 4
bits for the last (leaf) level. That's why we fill the leaves only up
to 25% tops (provided the ebs densely follow each other). Of course we
don't always allocate all ebs, so the tree can be sparse by design.
But that's still OK, that's what we want. But...
We also make the tree needlessly wider than what would be optimal.
Correctly (what this patch does) we should get rid of the 2
constant/redundant index bits: 0x4341c9c >> 2 = 0x10d0727. See? Now
the width of the tree would be 0x10d0727 instead of 0x4341c9c. It's
still the same 5 levels of height, but it's now much more
narrow/dense. Depending on how sparse the tree is, some nodes can be
merged into one this way. In theory up to 4, but again, that depends
on how dense or sparse the tree is to begin with.
The difference really is that the tree will not use more nodes than
needed, which unfortunately can happen without the fix as the tree is
additionally filled with thin air.
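
The height computation itself, as a small userspace sketch (each
xarray level covers 6 bits of index, i.e. 64-way fanout):

#include <stdio.h>

static unsigned int height(unsigned long max_index)
{
	unsigned int bits = 0;

	while (max_index) {
		bits++;
		max_index >>= 1;
	}
	return (bits + 5) / 6;	/* 6 index bits per level, rounded up */
}

int main(void)
{
	printf("height(0x4341c9c)      = %u\n", height(0x4341c9c));	/* 5 */
	printf("height(0x4341c9c >> 2) = %u\n", height(0x4341c9c >> 2));	/* 5 */
	return 0;
}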

Out of curiosity I also checked how many ebs were actually present and
how many leaves were used in the tree. For this FS it was 13k ebs
fitted into 4150 leaf nodes. So on average a bit over 3 ebs per leaf.
I'd estimate that with the fix we can get into the ballpark of 7-9 ebs
per leaf. Maybe something like 1600 leaves instead of 4150 for the
same 13k ebs. IMO, that's a very nice win indeed.
The worst case would be no change. But the tree is not that sparse really.
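
Recomputing those per-leaf numbers (userspace sketch; the 1600-leaf
figure is only my estimate, not a measurement):

#include <stdio.h>

int main(void)
{
	const double ebs = 13000;	/* ebs measured on this fs */
	const double leaves_now = 4150;	/* leaf nodes measured */
	const double leaves_est = 1600;	/* estimated with the fix */

	printf("ebs per leaf now:       %.1f\n", ebs / leaves_now);	/* ~3.1 */
	printf("ebs per leaf estimated: %.1f\n", ebs / leaves_est);	/* ~8.1 */
	return 0;
}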

Now imagine if you wanted to use an rb-tree here: you'd need 13k tree
nodes, though they'd be embedded in the ebs themselves. But the tree
height would be something like 14-28 depending on balancing.
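
For reference, that 14-28 range is just the standard red-black tree
height bound for ~13k nodes (userspace sketch, link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double n = 13000;	/* one rb node embedded per eb */
	double lg = log2(n + 1);

	/* A red-black tree guarantees height <= 2 * log2(n + 1). */
	printf("rb-tree height: %.0f to %.0f\n", ceil(lg), ceil(2 * lg));
	return 0;
}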

In terms of used memory, for this example the rb-tree would use about
300KB while the xarray can be estimated at about 900KB with this
patch (versus 2.4MB now). That's if I estimate 1.5MB saved by the
fix. That's the difference between using the tree correctly and
incorrectly.
And even if the estimate is not exact, we'll always get at least some savings.
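
Where those figures come from (userspace sketch; assuming
sizeof(struct xa_node) is 576 bytes and sizeof(struct rb_node) is 24
bytes on a 64-bit kernel, and counting only the leaf nodes, which is
where the bulk of the memory goes):

#include <stdio.h>

int main(void)
{
	const unsigned long xa_node = 576;	/* sizeof(struct xa_node) */
	const unsigned long rb_node = 24;	/* sizeof(struct rb_node) */

	printf("xarray now:       ~%lu KiB\n", 4150 * xa_node / 1024);	/* ~2334 */
	printf("xarray with fix:  ~%lu KiB\n", 1600 * xa_node / 1024);	/* ~900 */
	printf("rb-tree, 13k ebs: ~%lu KiB\n", 13000 * rb_node / 1024);	/* ~304 */
	return 0;
}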

Hopefully this gives you a better picture to satisfy your curiosity.
Re: [PATCH] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 9 months ago
On Tue, 13 May 2025 at 09:56, Daniel Vacek <neelx@suse.com> wrote:
>
> On Mon, 12 May 2025 at 22:20, David Sterba <dsterba@suse.cz> wrote:
> >
> > On Mon, May 12, 2025 at 07:23:20PM +0200, Daniel Vacek wrote:
> > > So far we are deriving the buffer tree index using the sector size. But each
> > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > >
> > > For example the typical and quite common configuration uses sector size of 4KiB
> > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > > slots are wasted as never used.
> > >
> > > We can score significant memory savings on the required tree nodes by indexing
> > > the tree using the node size instead. As a result far less slots are wasted
> > > and the tree can now use up to all 100% of its slots this way.
> >
> > This looks interesting. Is there a way to get xarray stats? I don't see
> > anything in the public API, e.g. depth, fanout, slack per level. For
> > debugging purposes we can put it to sysfs or as syslog message,
> > eventually as non-debugging output to commit_stats.
>
> I'm using a python script in crash (even live on my laptop). I believe
> you could do the same in dragon. Though that's not the runtime stats
> you described. And I don't really think it's worth it.

Alternatively, you could use a systemtap or bpftrace script, though
both have their pros and cons.
[PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 8 months ago
So far we are deriving the buffer tree index using the sector size. But each
extent buffer covers multiple sectors. This makes the buffer tree rather sparse.

For example the typical and quite common configuration uses sector size of 4KiB
and node size of 16KiB. In this case it means the buffer tree is using up to
the maximum of 25% of its slots. Or in other words at least 75% of the tree
slots are wasted as never used.

We can score significant memory savings on the required tree nodes by indexing
the tree using the node size instead. As a result far less slots are wasted
and the tree can now use up to all 100% of its slots this way.

Note: This works even with unaligned tree blocks as we can still get a unique
      index by doing eb->start >> nodesize_shift.

Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
v2 changes:
 * Note that this is still correct even with unaligned tree blocks.
 * Rename node_bits to nodesize_bits to stay consistent.
 * Move the nodesize_bits member next to nodesize and make it u32.

---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
 fs/btrfs/fs.h        |  3 ++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d6ad7512f217..3d465258f15b7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	fs_info->nodesize = nodesize;
+	fs_info->nodesize_bits = ilog2(nodesize);
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e9ba80a56172d..a55c7c7eb8990 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
 	 */
 	spin_lock(&eb->refs_lock);
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 		unsigned long flags;
 
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
 	rcu_read_lock();
 	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
 		if (!eb_batch_add(batch, eb)) {
-			*start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
+			*start = (eb->start + eb->len) >> fs_info->nodesize_bits;
 			goto out;
 		}
 	}
@@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
 		struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct extent_buffer *eb;
-	unsigned long index = (start >> fs_info->sectorsize_bits);
+	unsigned long index = start >> fs_info->nodesize_bits;
 
 	rcu_read_lock();
 	eb = xa_load(&fs_info->buffer_tree, index);
@@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
 				      u64 end)
 {
 	struct eb_batch batch;
-	unsigned long start_index = (start >> fs_info->sectorsize_bits);
-	unsigned long end_index = (end >> fs_info->sectorsize_bits);
+	unsigned long start_index = start >> fs_info->nodesize_bits;
+	unsigned long end_index = end >> fs_info->nodesize_bits;
 
 	eb_batch_init(&batch);
 	while (start_index <= end_index) {
@@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 
 	eb_batch_init(&batch);
 	if (wbc->range_cyclic) {
-		index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
+		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
 		end = -1;
 
 		/*
@@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
 		 */
 		scanned = (index == 0);
 	} else {
-		index = (wbc->range_start >> fs_info->sectorsize_bits);
-		end = (wbc->range_end >> fs_info->sectorsize_bits);
+		index = wbc->range_start >> fs_info->nodesize_bits;
+		end = wbc->range_end >> fs_info->nodesize_bits;
 
 		scanned = 1;
 	}
@@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 	eb->fs_info = fs_info;
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
-	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
 			      NULL, eb, GFP_NOFS);
 	if (xa_is_err(exists)) {
 		ret = xa_err(exists);
@@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
 	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
-				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   start >> fs_info->nodesize_bits, NULL, eb,
 				   GFP_NOFS);
 	if (xa_is_err(existing_eb)) {
 		ret = xa_err(existing_eb);
@@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 		 * in this case.
 		 */
 		xa_cmpxchg_irq(&fs_info->buffer_tree,
-			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+			       eb->start >> fs_info->nodesize_bits, eb, NULL,
 			       GFP_ATOMIC);
 
 		btrfs_leak_debug_del_eb(eb);
@@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 	struct extent_buffer *eb;
-	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+	unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
 	unsigned long index = start;
-	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
+	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
 	int ret;
 
 	xa_lock_irq(&fs_info->buffer_tree);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index b239e4b8421cf..fd7cbbe3515d6 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -781,7 +781,7 @@ struct btrfs_fs_info {
 
 	struct btrfs_delayed_root *delayed_root;
 
-	/* Entries are eb->start / sectorsize */
+	/* Entries are eb->start >> nodesize_bits */
 	struct xarray buffer_tree;
 
 	/* Next backup root to be overwritten */
@@ -813,6 +813,7 @@ struct btrfs_fs_info {
 
 	/* Cached block sizes */
 	u32 nodesize;
+	u32 nodesize_bits;
 	u32 sectorsize;
 	/* ilog2 of sectorsize, use to avoid 64bit division */
 	u32 sectorsize_bits;
-- 
2.47.2
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by David Sterba 7 months, 3 weeks ago
On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> So far we are deriving the buffer tree index using the sector size. But each
> extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> 
> For example the typical and quite common configuration uses sector size of 4KiB
> and node size of 16KiB. In this case it means the buffer tree is using up to
> the maximum of 25% of its slots. Or in other words at least 75% of the tree
> slots are wasted as never used.
> 
> We can score significant memory savings on the required tree nodes by indexing
> the tree using the node size instead. As a result far less slots are wasted
> and the tree can now use up to all 100% of its slots this way.
> 
> Note: This works even with unaligned tree blocks as we can still get a unique
>       index by doing eb->start >> nodesize_shift.

Can we have at least some numbers? When we talked about it you showed
me the number of radix nodes and other internal xarray structures
before/after.

> Signed-off-by: Daniel Vacek <neelx@suse.com>
> Reviewed-by: Qu Wenruo <wqu@suse.com>
> ---
> v2 changes:
>  * Note that this is still correct even with unaligned tree blocks.
>  * Rename node_bits to nodesize_bits to stay consistent.
>  * Move the nodesize_bits member next to nodesize and make it u32.
> 
> ---
>  fs/btrfs/disk-io.c   |  1 +
>  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
>  fs/btrfs/fs.h        |  3 ++-
>  3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 0d6ad7512f217..3d465258f15b7 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
>  	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
>  
>  	fs_info->nodesize = nodesize;
> +	fs_info->nodesize_bits = ilog2(nodesize);
>  	fs_info->sectorsize = sectorsize;
>  	fs_info->sectorsize_bits = ilog2(sectorsize);
>  	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index e9ba80a56172d..a55c7c7eb8990 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
>  	 */
>  	spin_lock(&eb->refs_lock);
>  	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> -		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  		unsigned long flags;
>  
>  		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
>  static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
>  	rcu_read_lock();
>  	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
>  		if (!eb_batch_add(batch, eb)) {
> -			*start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
> +			*start = (eb->start + eb->len) >> fs_info->nodesize_bits;

In other places you drop the outer ( ) from the shifts; please keep
them if they're there (or add them if missing).

>  			goto out;
>  		}
>  	}
> @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
>  		struct btrfs_fs_info *fs_info, u64 start)
>  {
>  	struct extent_buffer *eb;
> -	unsigned long index = (start >> fs_info->sectorsize_bits);
> +	unsigned long index = start >> fs_info->nodesize_bits;
>  
>  	rcu_read_lock();
>  	eb = xa_load(&fs_info->buffer_tree, index);
> @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
>  				      u64 end)
>  {
>  	struct eb_batch batch;
> -	unsigned long start_index = (start >> fs_info->sectorsize_bits);
> -	unsigned long end_index = (end >> fs_info->sectorsize_bits);
> +	unsigned long start_index = start >> fs_info->nodesize_bits;
> +	unsigned long end_index = end >> fs_info->nodesize_bits;
>  
>  	eb_batch_init(&batch);
>  	while (start_index <= end_index) {
> @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
>  
>  	eb_batch_init(&batch);
>  	if (wbc->range_cyclic) {
> -		index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
> +		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
>  		end = -1;
>  
>  		/*
> @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
>  		 */
>  		scanned = (index == 0);
>  	} else {
> -		index = (wbc->range_start >> fs_info->sectorsize_bits);
> -		end = (wbc->range_end >> fs_info->sectorsize_bits);
> +		index = wbc->range_start >> fs_info->nodesize_bits;
> +		end = wbc->range_end >> fs_info->nodesize_bits;
>  
>  		scanned = 1;
>  	}
> @@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>  	eb->fs_info = fs_info;
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
> -	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> +	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
>  			      NULL, eb, GFP_NOFS);
>  	if (xa_is_err(exists)) {
>  		ret = xa_err(exists);
> @@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
>  	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> -				   start >> fs_info->sectorsize_bits, NULL, eb,
> +				   start >> fs_info->nodesize_bits, NULL, eb,
>  				   GFP_NOFS);
>  	if (xa_is_err(existing_eb)) {
>  		ret = xa_err(existing_eb);
> @@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  		 * in this case.
>  		 */
>  		xa_cmpxchg_irq(&fs_info->buffer_tree,
> -			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
> +			       eb->start >> fs_info->nodesize_bits, eb, NULL,
>  			       GFP_ATOMIC);
>  
>  		btrfs_leak_debug_del_eb(eb);
> @@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
>  {
>  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>  	struct extent_buffer *eb;
> -	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
> +	unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
>  	unsigned long index = start;
> -	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> +	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
>  	int ret;
>  
>  	xa_lock_irq(&fs_info->buffer_tree);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index b239e4b8421cf..fd7cbbe3515d6 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -781,7 +781,7 @@ struct btrfs_fs_info {
>  
>  	struct btrfs_delayed_root *delayed_root;
>  
> -	/* Entries are eb->start / sectorsize */
> +	/* Entries are eb->start >> nodesize_bits */
>  	struct xarray buffer_tree;
>  
>  	/* Next backup root to be overwritten */
> @@ -813,6 +813,7 @@ struct btrfs_fs_info {
>  
>  	/* Cached block sizes */
>  	u32 nodesize;
> +	u32 nodesize_bits;
>  	u32 sectorsize;
>  	/* ilog2 of sectorsize, use to avoid 64bit division */
>  	u32 sectorsize_bits;
> -- 
> 2.47.2
>
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 7 months, 2 weeks ago
On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
>
> On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > So far we are deriving the buffer tree index using the sector size. But each
> > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> >
> > For example the typical and quite common configuration uses sector size of 4KiB
> > and node size of 16KiB. In this case it means the buffer tree is using up to
> > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > slots are wasted as never used.
> >
> > We can score significant memory savings on the required tree nodes by indexing
> > the tree using the node size instead. As a result far less slots are wasted
> > and the tree can now use up to all 100% of its slots this way.
> >
> > Note: This works even with unaligned tree blocks as we can still get a unique
> >       index by doing eb->start >> nodesize_shift.
>
> Can we have at least some numbers? As we've talked about it and you
> showed me the number of radix nodes or other internal xarray structures
> before/after.

The numbers are in this email thread. Do you mean to put them directly
into the commit message?

> > Signed-off-by: Daniel Vacek <neelx@suse.com>
> > Reviewed-by: Qu Wenruo <wqu@suse.com>
> > ---
> > v2 changes:
> >  * Note that this is still correct even with unaligned tree blocks.
> >  * Rename node_bits to nodesize_bits to stay consistent.
> >  * Move the nodesize_bits member next to nodesize and make it u32.
> >
> > ---
> >  fs/btrfs/disk-io.c   |  1 +
> >  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
> >  fs/btrfs/fs.h        |  3 ++-
> >  3 files changed, 18 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 0d6ad7512f217..3d465258f15b7 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> >       fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
> >
> >       fs_info->nodesize = nodesize;
> > +     fs_info->nodesize_bits = ilog2(nodesize);
> >       fs_info->sectorsize = sectorsize;
> >       fs_info->sectorsize_bits = ilog2(sectorsize);
> >       fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > index e9ba80a56172d..a55c7c7eb8990 100644
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
> >        */
> >       spin_lock(&eb->refs_lock);
> >       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> > -             XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +             XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >               unsigned long flags;
> >
> >               set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> > @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
> >  static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  {
> >       struct btrfs_fs_info *fs_info = eb->fs_info;
> > -     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >       unsigned long flags;
> >
> >       xas_lock_irqsave(&xas, flags);
> > @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  {
> >       struct btrfs_fs_info *fs_info = eb->fs_info;
> > -     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >       unsigned long flags;
> >
> >       xas_lock_irqsave(&xas, flags);
> > @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
> >       rcu_read_lock();
> >       while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
> >               if (!eb_batch_add(batch, eb)) {
> > -                     *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
> > +                     *start = (eb->start + eb->len) >> fs_info->nodesize_bits;
>
> In other places you drop the outer ( ) from the shifts; please keep
> them if they're there (or add them if missing).

Right. This happened during the rebase, as they were not there before.
So is this the preferred code style from now on?

> >                       goto out;
> >               }
> >       }
> > @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
> >               struct btrfs_fs_info *fs_info, u64 start)
> >  {
> >       struct extent_buffer *eb;
> > -     unsigned long index = (start >> fs_info->sectorsize_bits);
> > +     unsigned long index = start >> fs_info->nodesize_bits;
> >
> >       rcu_read_lock();
> >       eb = xa_load(&fs_info->buffer_tree, index);
> > @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
> >                                     u64 end)
> >  {
> >       struct eb_batch batch;
> > -     unsigned long start_index = (start >> fs_info->sectorsize_bits);
> > -     unsigned long end_index = (end >> fs_info->sectorsize_bits);
> > +     unsigned long start_index = start >> fs_info->nodesize_bits;
> > +     unsigned long end_index = end >> fs_info->nodesize_bits;
> >
> >       eb_batch_init(&batch);
> >       while (start_index <= end_index) {
> > @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
> >
> >       eb_batch_init(&batch);
> >       if (wbc->range_cyclic) {
> > -             index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
> > +             index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
> >               end = -1;
> >
> >               /*
> > @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
> >                */
> >               scanned = (index == 0);
> >       } else {
> > -             index = (wbc->range_start >> fs_info->sectorsize_bits);
> > -             end = (wbc->range_end >> fs_info->sectorsize_bits);
> > +             index = wbc->range_start >> fs_info->nodesize_bits;
> > +             end = wbc->range_end >> fs_info->nodesize_bits;
> >
> >               scanned = 1;
> >       }
> > @@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> >       eb->fs_info = fs_info;
> >  again:
> >       xa_lock_irq(&fs_info->buffer_tree);
> > -     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> > +     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
> >                             NULL, eb, GFP_NOFS);
> >       if (xa_is_err(exists)) {
> >               ret = xa_err(exists);
> > @@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> >  again:
> >       xa_lock_irq(&fs_info->buffer_tree);
> >       existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> > -                                start >> fs_info->sectorsize_bits, NULL, eb,
> > +                                start >> fs_info->nodesize_bits, NULL, eb,
> >                                  GFP_NOFS);
> >       if (xa_is_err(existing_eb)) {
> >               ret = xa_err(existing_eb);
> > @@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
> >                * in this case.
> >                */
> >               xa_cmpxchg_irq(&fs_info->buffer_tree,
> > -                            eb->start >> fs_info->sectorsize_bits, eb, NULL,
> > +                            eb->start >> fs_info->nodesize_bits, eb, NULL,
> >                              GFP_ATOMIC);
> >
> >               btrfs_leak_debug_del_eb(eb);
> > @@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
> >  {
> >       struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
> >       struct extent_buffer *eb;
> > -     unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
> > +     unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
> >       unsigned long index = start;
> > -     unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> > +     unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
> >       int ret;
> >
> >       xa_lock_irq(&fs_info->buffer_tree);
> > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> > index b239e4b8421cf..fd7cbbe3515d6 100644
> > --- a/fs/btrfs/fs.h
> > +++ b/fs/btrfs/fs.h
> > @@ -781,7 +781,7 @@ struct btrfs_fs_info {
> >
> >       struct btrfs_delayed_root *delayed_root;
> >
> > -     /* Entries are eb->start / sectorsize */
> > +     /* Entries are eb->start >> nodesize_bits */
> >       struct xarray buffer_tree;
> >
> >       /* Next backup root to be overwritten */
> > @@ -813,6 +813,7 @@ struct btrfs_fs_info {
> >
> >       /* Cached block sizes */
> >       u32 nodesize;
> > +     u32 nodesize_bits;
> >       u32 sectorsize;
> >       /* ilog2 of sectorsize, use to avoid 64bit division */
> >       u32 sectorsize_bits;
> > --
> > 2.47.2
> >
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by David Sterba 7 months, 2 weeks ago
On Mon, Jun 23, 2025 at 04:04:39PM +0200, Daniel Vacek wrote:
> On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
> >
> > On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > > So far we are deriving the buffer tree index using the sector size. But each
> > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > >
> > > For example the typical and quite common configuration uses sector size of 4KiB
> > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > > slots are wasted as never used.
> > >
> > > We can score significant memory savings on the required tree nodes by indexing
> > > the tree using the node size instead. As a result far less slots are wasted
> > > and the tree can now use up to all 100% of its slots this way.
> > >
> > > Note: This works even with unaligned tree blocks as we can still get a unique
> > >       index by doing eb->start >> nodesize_shift.
> >
> > Can we have at least some numbers? As we've talked about it and you
> > showed me the number of radix nodes or other internal xarray structures
> > before/after.
> 
> The numbers are in this email thread. Do you mean to put them directly
> into the commit message?

Yes, it's relevant information and part of why the patch is done. Mails
are for discussion; for permanent storage we want it in the changelog in
a presentable form.
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 7 months, 1 week ago
On Mon, 23 Jun 2025 at 16:25, David Sterba <dsterba@suse.cz> wrote:
>
> On Mon, Jun 23, 2025 at 04:04:39PM +0200, Daniel Vacek wrote:
> > On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
> > >
> > > On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > > > So far we are deriving the buffer tree index using the sector size. But each
> > > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > > >
> > > > For example the typical and quite common configuration uses sector size of 4KiB
> > > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > > the maximum of 25% of its slots. Or in other words at least 75% of the tree
> > > > slots are wasted as never used.
> > > >
> > > > We can score significant memory savings on the required tree nodes by indexing
> > > > the tree using the node size instead. As a result far less slots are wasted
> > > > and the tree can now use up to all 100% of its slots this way.
> > > >
> > > > Note: This works even with unaligned tree blocks as we can still get a unique
> > > >       index by doing eb->start >> nodesize_shift.
> > >
> > > Can we have at least some numbers? As we've talked about it and you
> > > showed me the number of radix nodes or other internal xarray structures
> > > before/after.
> >
> > The numbers are in this email thread. Do you mean to put them directly
> > into the commit message?
>
> Yes, it's relevant information and part of why the patch is done. Mails
> are for discussion; for permanent storage we want it in the changelog in a
> presentable form.

I've posted the v3 just now.

https://lore.kernel.org/linux-btrfs/20250704160704.3353446-1-neelx@suse.com/