[PATCH v2] btrfs: index buffer_tree using node size

Daniel Vacek posted 1 patch 4 months ago
There is a newer version of this series
fs/btrfs/disk-io.c   |  1 +
fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
fs/btrfs/fs.h        |  3 ++-
3 files changed, 18 insertions(+), 16 deletions(-)
[PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 4 months ago
So far we are deriving the buffer tree index using the sector size. But each
extent buffer covers multiple sectors. This makes the buffer tree rather sparse.

For example the typical and quite common configuration uses sector size of 4KiB
and node size of 16KiB. In this case it means the buffer tree is using up to
the maximum of 25% of its slots. Or in other words at least 75% of the tree
slots are wasted as never used.

We can score significant memory savings on the required tree nodes by indexing
the tree using the node size instead. As a result far less slots are wasted
and the tree can now use up to all 100% of its slots this way.

Note: This works even with unaligned tree blocks as we can still get unique
      index by doing eb->start >> nodesize_bits.

Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
v2 changes:
 * Note that this is still correct even with unaligned tree blocks.
 * Rename node_bits to nodesize_bits to stay consistent.
 * Move the nodesize_bits member next to nodesize and make it u32.

---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
 fs/btrfs/fs.h        |  3 ++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d6ad7512f217..3d465258f15b7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	fs_info->nodesize = nodesize;
+	fs_info->nodesize_bits = ilog2(nodesize);
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e9ba80a56172d..a55c7c7eb8990 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
 	 */
 	spin_lock(&eb->refs_lock);
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 		unsigned long flags;
 
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
 	unsigned long flags;
 
 	xas_lock_irqsave(&xas, flags);
@@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
 	rcu_read_lock();
 	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
 		if (!eb_batch_add(batch, eb)) {
-			*start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
+			*start = (eb->start + eb->len) >> fs_info->nodesize_bits;
 			goto out;
 		}
 	}
@@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
 		struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct extent_buffer *eb;
-	unsigned long index = (start >> fs_info->sectorsize_bits);
+	unsigned long index = start >> fs_info->nodesize_bits;
 
 	rcu_read_lock();
 	eb = xa_load(&fs_info->buffer_tree, index);
@@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
 				      u64 end)
 {
 	struct eb_batch batch;
-	unsigned long start_index = (start >> fs_info->sectorsize_bits);
-	unsigned long end_index = (end >> fs_info->sectorsize_bits);
+	unsigned long start_index = start >> fs_info->nodesize_bits;
+	unsigned long end_index = end >> fs_info->nodesize_bits;
 
 	eb_batch_init(&batch);
 	while (start_index <= end_index) {
@@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 
 	eb_batch_init(&batch);
 	if (wbc->range_cyclic) {
-		index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
+		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
 		end = -1;
 
 		/*
@@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
 		 */
 		scanned = (index == 0);
 	} else {
-		index = (wbc->range_start >> fs_info->sectorsize_bits);
-		end = (wbc->range_end >> fs_info->sectorsize_bits);
+		index = wbc->range_start >> fs_info->nodesize_bits;
+		end = wbc->range_end >> fs_info->nodesize_bits;
 
 		scanned = 1;
 	}
@@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 	eb->fs_info = fs_info;
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
-	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
 			      NULL, eb, GFP_NOFS);
 	if (xa_is_err(exists)) {
 		ret = xa_err(exists);
@@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
 	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
-				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   start >> fs_info->nodesize_bits, NULL, eb,
 				   GFP_NOFS);
 	if (xa_is_err(existing_eb)) {
 		ret = xa_err(existing_eb);
@@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 		 * in this case.
 		 */
 		xa_cmpxchg_irq(&fs_info->buffer_tree,
-			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+			       eb->start >> fs_info->nodesize_bits, eb, NULL,
 			       GFP_ATOMIC);
 
 		btrfs_leak_debug_del_eb(eb);
@@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 	struct extent_buffer *eb;
-	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+	unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
 	unsigned long index = start;
-	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
+	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
 	int ret;
 
 	xa_lock_irq(&fs_info->buffer_tree);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index b239e4b8421cf..fd7cbbe3515d6 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -781,7 +781,7 @@ struct btrfs_fs_info {
 
 	struct btrfs_delayed_root *delayed_root;
 
-	/* Entries are eb->start / sectorsize */
+	/* Entries are eb->start >> nodesize_bits */
 	struct xarray buffer_tree;
 
 	/* Next backup root to be overwritten */
@@ -813,6 +813,7 @@ struct btrfs_fs_info {
 
 	/* Cached block sizes */
 	u32 nodesize;
+	u32 nodesize_bits;
 	u32 sectorsize;
 	/* ilog2 of sectorsize, use to avoid 64bit division */
 	u32 sectorsize_bits;
-- 
2.47.2
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by David Sterba 3 months, 3 weeks ago
On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> So far we are deriving the buffer tree index using the sector size. But each
> extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> 
> For example the typical and quite common configuration uses sector size of 4KiB
> and node size of 16KiB. In this case it means the buffer tree is using up to
> the maximum of 25% of it's slots. Or in other words at least 75% of the tree
> slots are wasted as never used.
> 
> We can score significant memory savings on the required tree nodes by indexing
> the tree using the node size instead. As a result far less slots are wasted
> and the tree can now use up to all 100% of it's slots this way.
> 
> Note: This works even with unaligned tree blocks as we can still get unique
>       index by doing eb->start >> nodesize_shift.

Can we have at least some numbers? As we've talked about it and you
showed me the number of radix nodes or other internal xarray structures
before/after.

> Signed-off-by: Daniel Vacek <neelx@suse.com>
> Reviewed-by: Qu Wenruo <wqu@suse.com>
> ---
> v2 changes:
>  * Note that this is still correct even with unaligned tree blocks.
>  * Rename node_bits to nodesize_bits to stay consistent.
>  * Move the nodesize_bits member next to nodesize and make it u32.
> 
> ---
>  fs/btrfs/disk-io.c   |  1 +
>  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
>  fs/btrfs/fs.h        |  3 ++-
>  3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 0d6ad7512f217..3d465258f15b7 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
>  	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
>  
>  	fs_info->nodesize = nodesize;
> +	fs_info->nodesize_bits = ilog2(nodesize);
>  	fs_info->sectorsize = sectorsize;
>  	fs_info->sectorsize_bits = ilog2(sectorsize);
>  	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index e9ba80a56172d..a55c7c7eb8990 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
>  	 */
>  	spin_lock(&eb->refs_lock);
>  	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> -		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  		unsigned long flags;
>  
>  		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
>  static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
>  	rcu_read_lock();
>  	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
>  		if (!eb_batch_add(batch, eb)) {
> -			*start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
> +			*start = (eb->start + eb->len) >> fs_info->nodesize_bits;

In other places you drop the outer ( ) from the shifts, please keep it
if it's there (or add if it's missing).

>  			goto out;
>  		}
>  	}
> @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
>  		struct btrfs_fs_info *fs_info, u64 start)
>  {
>  	struct extent_buffer *eb;
> -	unsigned long index = (start >> fs_info->sectorsize_bits);
> +	unsigned long index = start >> fs_info->nodesize_bits;
>  
>  	rcu_read_lock();
>  	eb = xa_load(&fs_info->buffer_tree, index);
> @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
>  				      u64 end)
>  {
>  	struct eb_batch batch;
> -	unsigned long start_index = (start >> fs_info->sectorsize_bits);
> -	unsigned long end_index = (end >> fs_info->sectorsize_bits);
> +	unsigned long start_index = start >> fs_info->nodesize_bits;
> +	unsigned long end_index = end >> fs_info->nodesize_bits;
>  
>  	eb_batch_init(&batch);
>  	while (start_index <= end_index) {
> @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
>  
>  	eb_batch_init(&batch);
>  	if (wbc->range_cyclic) {
> -		index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
> +		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
>  		end = -1;
>  
>  		/*
> @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
>  		 */
>  		scanned = (index == 0);
>  	} else {
> -		index = (wbc->range_start >> fs_info->sectorsize_bits);
> -		end = (wbc->range_end >> fs_info->sectorsize_bits);
> +		index = wbc->range_start >> fs_info->nodesize_bits;
> +		end = wbc->range_end >> fs_info->nodesize_bits;
>  
>  		scanned = 1;
>  	}
> @@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>  	eb->fs_info = fs_info;
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
> -	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> +	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
>  			      NULL, eb, GFP_NOFS);
>  	if (xa_is_err(exists)) {
>  		ret = xa_err(exists);
> @@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
>  	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> -				   start >> fs_info->sectorsize_bits, NULL, eb,
> +				   start >> fs_info->nodesize_bits, NULL, eb,
>  				   GFP_NOFS);
>  	if (xa_is_err(existing_eb)) {
>  		ret = xa_err(existing_eb);
> @@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  		 * in this case.
>  		 */
>  		xa_cmpxchg_irq(&fs_info->buffer_tree,
> -			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
> +			       eb->start >> fs_info->nodesize_bits, eb, NULL,
>  			       GFP_ATOMIC);
>  
>  		btrfs_leak_debug_del_eb(eb);
> @@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
>  {
>  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>  	struct extent_buffer *eb;
> -	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
> +	unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
>  	unsigned long index = start;
> -	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> +	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
>  	int ret;
>  
>  	xa_lock_irq(&fs_info->buffer_tree);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index b239e4b8421cf..fd7cbbe3515d6 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -781,7 +781,7 @@ struct btrfs_fs_info {
>  
>  	struct btrfs_delayed_root *delayed_root;
>  
> -	/* Entries are eb->start / sectorsize */
> +	/* Entries are eb->start >> nodesize_bits */
>  	struct xarray buffer_tree;
>  
>  	/* Next backup root to be overwritten */
> @@ -813,6 +813,7 @@ struct btrfs_fs_info {
>  
>  	/* Cached block sizes */
>  	u32 nodesize;
> +	u32 nodesize_bits;
>  	u32 sectorsize;
>  	/* ilog2 of sectorsize, use to avoid 64bit division */
>  	u32 sectorsize_bits;
> -- 
> 2.47.2
>
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 3 months, 2 weeks ago
On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
>
> On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > So far we are deriving the buffer tree index using the sector size. But each
> > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> >
> > For example the typical and quite common configuration uses sector size of 4KiB
> > and node size of 16KiB. In this case it means the buffer tree is using up to
> > the maximum of 25% of it's slots. Or in other words at least 75% of the tree
> > slots are wasted as never used.
> >
> > We can score significant memory savings on the required tree nodes by indexing
> > the tree using the node size instead. As a result far less slots are wasted
> > and the tree can now use up to all 100% of it's slots this way.
> >
> > Note: This works even with unaligned tree blocks as we can still get unique
> >       index by doing eb->start >> nodesize_shift.
>
> Can we have at least some numbers? As we've talked about it and you
> showed me the number of radix nodes or other internal xarray structures
> before/after.

The numbers are in this email thread. Do you mean to put them directly
into the commit message?

> > Signed-off-by: Daniel Vacek <neelx@suse.com>
> > Reviewed-by: Qu Wenruo <wqu@suse.com>
> > ---
> > v2 changes:
> >  * Note that this is still correct even with unaligned tree blocks.
> >  * Rename node_bits to nodesize_bits to stay consistent.
> >  * Move the nodesize_bits member next to nodesize and make it u32.
> >
> > ---
> >  fs/btrfs/disk-io.c   |  1 +
> >  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
> >  fs/btrfs/fs.h        |  3 ++-
> >  3 files changed, 18 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 0d6ad7512f217..3d465258f15b7 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> >       fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
> >
> >       fs_info->nodesize = nodesize;
> > +     fs_info->nodesize_bits = ilog2(nodesize);
> >       fs_info->sectorsize = sectorsize;
> >       fs_info->sectorsize_bits = ilog2(sectorsize);
> >       fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > index e9ba80a56172d..a55c7c7eb8990 100644
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
> >        */
> >       spin_lock(&eb->refs_lock);
> >       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> > -             XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +             XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >               unsigned long flags;
> >
> >               set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> > @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
> >  static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  {
> >       struct btrfs_fs_info *fs_info = eb->fs_info;
> > -     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >       unsigned long flags;
> >
> >       xas_lock_irqsave(&xas, flags);
> > @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
> >  {
> >       struct btrfs_fs_info *fs_info = eb->fs_info;
> > -     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> > +     XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
> >       unsigned long flags;
> >
> >       xas_lock_irqsave(&xas, flags);
> > @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
> >       rcu_read_lock();
> >       while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
> >               if (!eb_batch_add(batch, eb)) {
> > -                     *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
> > +                     *start = (eb->start + eb->len) >> fs_info->nodesize_bits;
>
> In other places you drop the outer ( ) from the shifts, please keep it
> if it's there (or add if it's missing).

Right. This happened by the rebase as they were not there before. So
is this the preferred code style from now on?

> >                       goto out;
> >               }
> >       }
> > @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
> >               struct btrfs_fs_info *fs_info, u64 start)
> >  {
> >       struct extent_buffer *eb;
> > -     unsigned long index = (start >> fs_info->sectorsize_bits);
> > +     unsigned long index = start >> fs_info->nodesize_bits;
> >
> >       rcu_read_lock();
> >       eb = xa_load(&fs_info->buffer_tree, index);
> > @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
> >                                     u64 end)
> >  {
> >       struct eb_batch batch;
> > -     unsigned long start_index = (start >> fs_info->sectorsize_bits);
> > -     unsigned long end_index = (end >> fs_info->sectorsize_bits);
> > +     unsigned long start_index = start >> fs_info->nodesize_bits;
> > +     unsigned long end_index = end >> fs_info->nodesize_bits;
> >
> >       eb_batch_init(&batch);
> >       while (start_index <= end_index) {
> > @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
> >
> >       eb_batch_init(&batch);
> >       if (wbc->range_cyclic) {
> > -             index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
> > +             index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
> >               end = -1;
> >
> >               /*
> > @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
> >                */
> >               scanned = (index == 0);
> >       } else {
> > -             index = (wbc->range_start >> fs_info->sectorsize_bits);
> > -             end = (wbc->range_end >> fs_info->sectorsize_bits);
> > +             index = wbc->range_start >> fs_info->nodesize_bits;
> > +             end = wbc->range_end >> fs_info->nodesize_bits;
> >
> >               scanned = 1;
> >       }
> > @@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> >       eb->fs_info = fs_info;
> >  again:
> >       xa_lock_irq(&fs_info->buffer_tree);
> > -     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> > +     exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
> >                             NULL, eb, GFP_NOFS);
> >       if (xa_is_err(exists)) {
> >               ret = xa_err(exists);
> > @@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> >  again:
> >       xa_lock_irq(&fs_info->buffer_tree);
> >       existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> > -                                start >> fs_info->sectorsize_bits, NULL, eb,
> > +                                start >> fs_info->nodesize_bits, NULL, eb,
> >                                  GFP_NOFS);
> >       if (xa_is_err(existing_eb)) {
> >               ret = xa_err(existing_eb);
> > @@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
> >                * in this case.
> >                */
> >               xa_cmpxchg_irq(&fs_info->buffer_tree,
> > -                            eb->start >> fs_info->sectorsize_bits, eb, NULL,
> > +                            eb->start >> fs_info->nodesize_bits, eb, NULL,
> >                              GFP_ATOMIC);
> >
> >               btrfs_leak_debug_del_eb(eb);
> > @@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
> >  {
> >       struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
> >       struct extent_buffer *eb;
> > -     unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
> > +     unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
> >       unsigned long index = start;
> > -     unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> > +     unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
> >       int ret;
> >
> >       xa_lock_irq(&fs_info->buffer_tree);
> > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> > index b239e4b8421cf..fd7cbbe3515d6 100644
> > --- a/fs/btrfs/fs.h
> > +++ b/fs/btrfs/fs.h
> > @@ -781,7 +781,7 @@ struct btrfs_fs_info {
> >
> >       struct btrfs_delayed_root *delayed_root;
> >
> > -     /* Entries are eb->start / sectorsize */
> > +     /* Entries are eb->start >> nodesize_bits */
> >       struct xarray buffer_tree;
> >
> >       /* Next backup root to be overwritten */
> > @@ -813,6 +813,7 @@ struct btrfs_fs_info {
> >
> >       /* Cached block sizes */
> >       u32 nodesize;
> > +     u32 nodesize_bits;
> >       u32 sectorsize;
> >       /* ilog2 of sectorsize, use to avoid 64bit division */
> >       u32 sectorsize_bits;
> > --
> > 2.47.2
> >
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by David Sterba 3 months, 2 weeks ago
On Mon, Jun 23, 2025 at 04:04:39PM +0200, Daniel Vacek wrote:
> On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
> >
> > On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > > So far we are deriving the buffer tree index using the sector size. But each
> > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > >
> > > For example the typical and quite common configuration uses sector size of 4KiB
> > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > the maximum of 25% of it's slots. Or in other words at least 75% of the tree
> > > slots are wasted as never used.
> > >
> > > We can score significant memory savings on the required tree nodes by indexing
> > > the tree using the node size instead. As a result far less slots are wasted
> > > and the tree can now use up to all 100% of it's slots this way.
> > >
> > > Note: This works even with unaligned tree blocks as we can still get unique
> > >       index by doing eb->start >> nodesize_shift.
> >
> > Can we have at least some numbers? As we've talked about it and you
> > showed me the number of radix nodes or other internal xarray structures
> > before/after.
> 
> The numbers are in this email thread. Do you mean to put them directly
> into the commit message?

Yes, it's relevant information and part of why the patch is done. Mails
are for discussion, for permanent storage we want it in changelog in a
presentable form.
Re: [PATCH v2] btrfs: index buffer_tree using node size
Posted by Daniel Vacek 3 months, 1 week ago
On Mon, 23 Jun 2025 at 16:25, David Sterba <dsterba@suse.cz> wrote:
>
> On Mon, Jun 23, 2025 at 04:04:39PM +0200, Daniel Vacek wrote:
> > On Fri, 20 Jun 2025 at 14:57, David Sterba <dsterba@suse.cz> wrote:
> > >
> > > On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> > > > So far we are deriving the buffer tree index using the sector size. But each
> > > > extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> > > >
> > > > For example the typical and quite common configuration uses sector size of 4KiB
> > > > and node size of 16KiB. In this case it means the buffer tree is using up to
> > > > the maximum of 25% of it's slots. Or in other words at least 75% of the tree
> > > > slots are wasted as never used.
> > > >
> > > > We can score significant memory savings on the required tree nodes by indexing
> > > > the tree using the node size instead. As a result far less slots are wasted
> > > > and the tree can now use up to all 100% of it's slots this way.
> > > >
> > > > Note: This works even with unaligned tree blocks as we can still get unique
> > > >       index by doing eb->start >> nodesize_shift.
> > >
> > > Can we have at least some numbers? As we've talked about it and you
> > > showed me the number of radix nodes or other internal xarray structures
> > > before/after.
> >
> > The numbers are in this email thread. Do you mean to put them directly
> > into the commit message?
>
> Yes, it's relevant information and part of why the patch is done. Mails
> are for discussion, for permanent storage we want it in changelog in a
> presentable form.

I've posted the v3 just now.

https://lore.kernel.org/linux-btrfs/20250704160704.3353446-1-neelx@suse.com/