Teach the btree write buffer how to accumulate accounting keys - instead
of having the newer key overwrite the older key as we do with other
updates, we need to add them together.
Also, add a flag so that write buffer flush knows when journal replay is
finished flushing accounting, and teach it to hold accounting keys until
that flag is set.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/bcachefs.h | 1 +
fs/bcachefs/btree_write_buffer.c | 66 +++++++++++++++++++++++++++-----
fs/bcachefs/recovery.c | 3 ++
3 files changed, 61 insertions(+), 9 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 62812fc1cad0..9a24989c9a6a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -616,6 +616,7 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(started) \
+ x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
x(was_rw) \
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index b77e7b382b66..002a0762fc85 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -5,6 +5,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "disk_accounting.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
@@ -123,7 +124,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
- bool *write_locked, size_t *fast)
+ bool *write_locked,
+ bool *accounting_accumulated,
+ size_t *fast)
{
struct btree_path *path;
int ret;
@@ -136,6 +139,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
if (ret)
return ret;
+ if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+ if (k.k->type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+ bkey_s_c_to_accounting(k));
+ }
+ *accounting_accumulated = true;
+
/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
@@ -248,8 +261,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_iter iter = { NULL };
- size_t skipped = 0, fast = 0, slowpath = 0;
+ size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
+ bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;
bch2_trans_unlock(trans);
@@ -284,17 +298,29 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+ bool accounting_accumulated = false;
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
BUG_ON(!k->journal_seq);
+ if (!accounting_replay_done &&
+ k->k.k.type == KEY_TYPE_accounting) {
+ slowpath++;
+ continue;
+ }
+
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- skipped++;
+ if (k->k.k.type == KEY_TYPE_accounting &&
+ n->k.k.type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+ bkey_i_to_s_c_accounting(&k->k));
+
+ overwritten++;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
@@ -325,7 +351,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
break;
}
- ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+ ret = wb_flush_one(trans, &iter, k, &write_locked,
+ &accounting_accumulated, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -361,8 +388,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
if (!i->journal_seq)
continue;
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
+ if (!accounting_replay_done &&
+ i->k.k.type == KEY_TYPE_accounting) {
+ could_not_insert++;
+ continue;
+ }
+
+ if (!could_not_insert)
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
@@ -375,13 +409,27 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
+
+ i->journal_seq = 0;
+ }
+
+ if (could_not_insert) {
+ struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+ darray_for_each(wb->flushing.keys, i)
+ if (i->journal_seq)
+ *dst++ = *i;
+ wb->flushing.keys.nr = dst - wb->flushing.keys.data;
}
}
err:
+ if (ret || !could_not_insert) {
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
+ }
+
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
return ret;
}
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 6829d80bd181..b8289af66c8e 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -228,6 +228,8 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err;
}
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
@@ -1204,6 +1206,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal, 1);
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
ret = bch2_fs_read_write_early(c);
--
2.43.0
On Sat, Feb 24, 2024 at 09:38:05PM -0500, Kent Overstreet wrote:
> Teach the btree write buffer how to accumulate accounting keys - instead
> of having the newer key overwrite the older key as we do with other
> updates, we need to add them together.
>
> Also, add a flag so that write buffer flush knows when journal replay is
> finished flushing accounting, and teach it to hold accounting keys until
> that flag is set.
>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
> fs/bcachefs/bcachefs.h | 1 +
> fs/bcachefs/btree_write_buffer.c | 66 +++++++++++++++++++++++++++-----
> fs/bcachefs/recovery.c | 3 ++
> 3 files changed, 61 insertions(+), 9 deletions(-)
>
...
> diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
> index b77e7b382b66..002a0762fc85 100644
> --- a/fs/bcachefs/btree_write_buffer.c
> +++ b/fs/bcachefs/btree_write_buffer.c
> @@ -5,6 +5,7 @@
> #include "btree_update.h"
> #include "btree_update_interior.h"
> #include "btree_write_buffer.h"
> +#include "disk_accounting.h"
> #include "error.h"
> #include "journal.h"
> #include "journal_io.h"
> @@ -123,7 +124,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
>
> static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
> struct btree_write_buffered_key *wb,
> - bool *write_locked, size_t *fast)
> + bool *write_locked,
> + bool *accounting_accumulated,
> + size_t *fast)
> {
> struct btree_path *path;
> int ret;
> @@ -136,6 +139,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
> if (ret)
> return ret;
>
> + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
> + struct bkey u;
> + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
> +
> + if (k.k->type == KEY_TYPE_accounting)
> + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
> + bkey_s_c_to_accounting(k));
So it looks like we're accumulating from the btree key into the write
buffer key. Is this so the following code will basically insert a new
btree key based on the value of the write buffer key?
> + }
> + *accounting_accumulated = true;
> +
> /*
> * We can't clone a path that has write locks: unshare it now, before
> * set_pos and traverse():
> @@ -248,8 +261,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> struct journal *j = &c->journal;
> struct btree_write_buffer *wb = &c->btree_write_buffer;
> struct btree_iter iter = { NULL };
> - size_t skipped = 0, fast = 0, slowpath = 0;
> + size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
> bool write_locked = false;
> + bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
> int ret = 0;
>
> bch2_trans_unlock(trans);
> @@ -284,17 +298,29 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>
> darray_for_each(wb->sorted, i) {
> struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
> + bool accounting_accumulated = false;
Should this live within the interior flush loop?
>
> for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
> prefetch(&wb->flushing.keys.data[n->idx]);
>
> BUG_ON(!k->journal_seq);
>
> + if (!accounting_replay_done &&
> + k->k.k.type == KEY_TYPE_accounting) {
> + slowpath++;
> + continue;
> + }
> +
> if (i + 1 < &darray_top(wb->sorted) &&
> wb_key_eq(i, i + 1)) {
> struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
>
> - skipped++;
> + if (k->k.k.type == KEY_TYPE_accounting &&
> + n->k.k.type == KEY_TYPE_accounting)
> + bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
> + bkey_i_to_s_c_accounting(&k->k));
> +
> + overwritten++;
> n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
> k->journal_seq = 0;
> continue;
> @@ -325,7 +351,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> break;
> }
>
> - ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
> + ret = wb_flush_one(trans, &iter, k, &write_locked,
> + &accounting_accumulated, &fast);
> if (!write_locked)
> bch2_trans_begin(trans);
> } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
> @@ -361,8 +388,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> if (!i->journal_seq)
> continue;
>
> - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> - bch2_btree_write_buffer_journal_flush);
> + if (!accounting_replay_done &&
> + i->k.k.type == KEY_TYPE_accounting) {
> + could_not_insert++;
> + continue;
> + }
> +
> + if (!could_not_insert)
> + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> + bch2_btree_write_buffer_journal_flush);
Hmm.. so this is sane because the slowpath runs in journal sorted order,
right?
>
> bch2_trans_begin(trans);
>
> @@ -375,13 +409,27 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> btree_write_buffered_insert(trans, i));
> if (ret)
> goto err;
> +
> + i->journal_seq = 0;
> + }
> +
/*
* Condense the remaining keys <reasons reasons>...??
*/
> + if (could_not_insert) {
> + struct btree_write_buffered_key *dst = wb->flushing.keys.data;
> +
> + darray_for_each(wb->flushing.keys, i)
> + if (i->journal_seq)
> + *dst++ = *i;
> + wb->flushing.keys.nr = dst - wb->flushing.keys.data;
> }
> }
> err:
> + if (ret || !could_not_insert) {
> + bch2_journal_pin_drop(j, &wb->flushing.pin);
> + wb->flushing.keys.nr = 0;
> + }
> +
> bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
> - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
> - bch2_journal_pin_drop(j, &wb->flushing.pin);
> - wb->flushing.keys.nr = 0;
> + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
I feel like the last time I looked at the write buffer stuff the flush
wasn't reentrant in this way. I.e., the flush switched out the active
buffer and so had to process all entries in the current buffer (or
something like that). Has something changed or do I misunderstand?
> return ret;
> }
>
> diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> index 6829d80bd181..b8289af66c8e 100644
> --- a/fs/bcachefs/recovery.c
> +++ b/fs/bcachefs/recovery.c
> @@ -228,6 +228,8 @@ static int bch2_journal_replay(struct bch_fs *c)
> goto err;
> }
>
> + set_bit(BCH_FS_accounting_replay_done, &c->flags);
> +
I assume this ties into the question on the previous patch..
Related question.. if the write buffer can't flush during journal
replay, is there concern/risk of overflowing it?
Brian
> /*
> * First, attempt to replay keys in sorted order. This is more
> * efficient - better locality of btree access - but some might fail if
> @@ -1204,6 +1206,7 @@ int bch2_fs_initialize(struct bch_fs *c)
> * set up the journal.pin FIFO and journal.cur pointer:
> */
> bch2_fs_journal_start(&c->journal, 1);
> + set_bit(BCH_FS_accounting_replay_done, &c->flags);
> bch2_journal_set_replay_done(&c->journal);
>
> ret = bch2_fs_read_write_early(c);
> --
> 2.43.0
>
On Tue, Feb 27, 2024 at 10:50:23AM -0500, Brian Foster wrote:
> On Sat, Feb 24, 2024 at 09:38:05PM -0500, Kent Overstreet wrote:
> > + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
> > + struct bkey u;
> > + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
> > +
> > + if (k.k->type == KEY_TYPE_accounting)
> > + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
> > + bkey_s_c_to_accounting(k));
>
> So it looks like we're accumulating from the btree key into the write
> buffer key. Is this so the following code will basically insert a new
> btree key based on the value of the write buffer key?
Correct, this is where we go from "accounting key is a delta" to
"accounting key is the new version of the key".
> > darray_for_each(wb->sorted, i) {
> > struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
> > + bool accounting_accumulated = false;
>
> Should this live within the interior flush loop?
We can't define it within the loop because then we'd be setting it to
false on every loop iteration... but it does belong _with_ the loop, so
I'll move it to right before.
> > - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> > - bch2_btree_write_buffer_journal_flush);
> > + if (!accounting_replay_done &&
> > + i->k.k.type == KEY_TYPE_accounting) {
> > + could_not_insert++;
> > + continue;
> > + }
> > +
> > + if (!could_not_insert)
> > + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> > + bch2_btree_write_buffer_journal_flush);
>
> Hmm.. so this is sane because the slowpath runs in journal sorted order,
> right?
yup, which means as soon as we hit a key we can't insert we can't
release any more journal pins
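
(To restate the ordering argument as code - a toy standalone helper with
made-up names (first_retained_seq, insertable), not bcachefs code: since the
flush loop walks keys in journal-seq order, the pin can only be advanced
while everything seen so far went in, and the first skipped key freezes it:)

/* Toy restatement: walk entries in journal order; once one can't be
 * inserted, stop - its journal entry and everything after it must be
 * retained. Returns the first seq that must be kept pinned. */
static unsigned long long first_retained_seq(const unsigned long long *journal_seq,
					     const int *insertable, unsigned nr)
{
	for (unsigned i = 0; i < nr; i++)
		if (!insertable[i])
			return journal_seq[i];
	return (unsigned long long) -1;	/* nothing retained: pin can be dropped */
}
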
>
> >
> > bch2_trans_begin(trans);
> >
> > @@ -375,13 +409,27 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> > btree_write_buffered_insert(trans, i));
> > if (ret)
> > goto err;
> > +
> > + i->journal_seq = 0;
> > + }
> > +
>
> /*
> * Condense the remaining keys <reasons reasons>...??
> */
yup, that's a good comment
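
Something along these lines, maybe - same code as the hunk below, the comment
wording is just my guess at it:

	/*
	 * Keys we couldn't insert yet (accounting updates seen before journal
	 * replay has finished flushing accounting) are kept in the flushing
	 * buffer: compact them to the front so they're retried on the next
	 * flush.
	 */
	if (could_not_insert) {
		struct btree_write_buffered_key *dst = wb->flushing.keys.data;

		darray_for_each(wb->flushing.keys, i)
			if (i->journal_seq)
				*dst++ = *i;
		wb->flushing.keys.nr = dst - wb->flushing.keys.data;
	}
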
> > + if (could_not_insert) {
> > + struct btree_write_buffered_key *dst = wb->flushing.keys.data;
> > +
> > + darray_for_each(wb->flushing.keys, i)
> > + if (i->journal_seq)
> > + *dst++ = *i;
> > + wb->flushing.keys.nr = dst - wb->flushing.keys.data;
> > }
> > }
> > err:
> > + if (ret || !could_not_insert) {
> > + bch2_journal_pin_drop(j, &wb->flushing.pin);
> > + wb->flushing.keys.nr = 0;
> > + }
> > +
> > bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
> > - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
> > - bch2_journal_pin_drop(j, &wb->flushing.pin);
> > - wb->flushing.keys.nr = 0;
> > + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
>
> I feel like the last time I looked at the write buffer stuff the flush
> wasn't reentrant in this way. I.e., the flush switched out the active
> buffer and so had to process all entries in the current buffer (or
> something like that). Has something changed or do I misunderstand?
Yeah, originally we were adding keys to the write buffer directly from
the transaction commit path, so that necessitated the super fast
lockless stuff where we'd toggle between buffers so one was always
available.
Now keys are pulled from the journal, so we can use (somewhat) simpler
locking and buffering; now the complication is that we can't predict in
advance how many keys are going to come out of the journal for the write
buffer.
>
> > return ret;
> > }
> >
> > diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> > index 6829d80bd181..b8289af66c8e 100644
> > --- a/fs/bcachefs/recovery.c
> > +++ b/fs/bcachefs/recovery.c
> > @@ -228,6 +228,8 @@ static int bch2_journal_replay(struct bch_fs *c)
> > goto err;
> > }
> >
> > + set_bit(BCH_FS_accounting_replay_done, &c->flags);
> > +
>
> I assume this ties into the question on the previous patch..
>
> Related question.. if the write buffer can't flush during journal
> replay, is there concern/risk of overflowing it?
Shouldn't be any actual risk. It's just new accounting updates that the
write buffer can't flush, and those are only going to be generated by
interior btree node updates as journal replay has to split/rewrite nodes
to make room for its updates.
And for those new accounting updates, updates to the same counters get
accumulated as they're flushed from the journal to the write buffer -
see the patch for eytzinger tree accumulation. So we could only overflow
if the number of distinct counters touched somehow was very large.
And the number of distinct counters will be growing significantly, but
the new counters will all be for user data, not metadata.
(Except: that reminds me, we do want to add per-btree counters, so users
can see "I have x amount of extents, x amount of dirents, etc.").
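
(Setting the eytzinger layout aside, the accumulation on that path is just
"if an update for this counter is already pending, add to it instead of
appending another key" - a toy standalone version, with made-up names
(wb_accum_update, pend); only distinct counters consume space:)

#include <stdio.h>

struct pending {
	unsigned	id;	/* which counter */
	long long	delta;	/* accumulated delta */
};

static struct pending pend[128];
static unsigned nr_pend;

/* accumulate instead of append: an existing pending entry for this counter
 * absorbs the update; otherwise a new entry is appended */
static void wb_accum_update(unsigned id, long long delta)
{
	for (unsigned i = 0; i < nr_pend; i++)
		if (pend[i].id == id) {
			pend[i].delta += delta;
			return;
		}
	pend[nr_pend++] = (struct pending) { .id = id, .delta = delta };
}

int main(void)
{
	wb_accum_update(3, 10);
	wb_accum_update(3, -2);	/* same counter: folded in, not appended */
	wb_accum_update(7, 5);

	for (unsigned i = 0; i < nr_pend; i++)
		printf("counter %u: %lld\n", pend[i].id, pend[i].delta);
	return 0;
}
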
On Wed, Feb 28, 2024 at 05:42:39PM -0500, Kent Overstreet wrote:
> On Tue, Feb 27, 2024 at 10:50:23AM -0500, Brian Foster wrote:
> > On Sat, Feb 24, 2024 at 09:38:05PM -0500, Kent Overstreet wrote:
> > > + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
> > > + struct bkey u;
> > > + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
> > > +
> > > + if (k.k->type == KEY_TYPE_accounting)
> > > + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
> > > + bkey_s_c_to_accounting(k));
> >
> > So it looks like we're accumulating from the btree key into the write
> > buffer key. Is this so the following code will basically insert a new
> > btree key based on the value of the write buffer key?
>
> Correct, this is where we go from "accounting key is a delta" to
> "accounting key is the new version of the key".
>
> > > darray_for_each(wb->sorted, i) {
> > > struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
> > > + bool accounting_accumulated = false;
> >
> > Should this live within the interior flush loop?
>
> We can't define it within the loop because then we'd be setting it to
> false on every loop iteration... but it does belong _with_ the loop, so
> I'll move it to right before.
>
Ah, right.
> > > - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> > > - bch2_btree_write_buffer_journal_flush);
> > > + if (!accounting_replay_done &&
> > > + i->k.k.type == KEY_TYPE_accounting) {
> > > + could_not_insert++;
> > > + continue;
> > > + }
> > > +
> > > + if (!could_not_insert)
> > > + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> > > + bch2_btree_write_buffer_journal_flush);
> >
> > Hmm.. so this is sane because the slowpath runs in journal sorted order,
> > right?
>
> yup, which means as soon as we hit a key we can't insert we can't
> release any more journal pins
>
> >
> > >
> > > bch2_trans_begin(trans);
> > >
> > > @@ -375,13 +409,27 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
> > > btree_write_buffered_insert(trans, i));
> > > if (ret)
> > > goto err;
> > > +
> > > + i->journal_seq = 0;
> > > + }
> > > +
> >
> > /*
> > * Condense the remaining keys <reasons reasons>...??
> > */
>
> yup, that's a good comment
>
> > > + if (could_not_insert) {
> > > + struct btree_write_buffered_key *dst = wb->flushing.keys.data;
> > > +
> > > + darray_for_each(wb->flushing.keys, i)
> > > + if (i->journal_seq)
> > > + *dst++ = *i;
> > > + wb->flushing.keys.nr = dst - wb->flushing.keys.data;
> > > }
> > > }
> > > err:
> > > + if (ret || !could_not_insert) {
> > > + bch2_journal_pin_drop(j, &wb->flushing.pin);
> > > + wb->flushing.keys.nr = 0;
> > > + }
> > > +
> > > bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
> > > - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
> > > - bch2_journal_pin_drop(j, &wb->flushing.pin);
> > > - wb->flushing.keys.nr = 0;
> > > + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
> >
> > I feel like the last time I looked at the write buffer stuff the flush
> > wasn't reentrant in this way. I.e., the flush switched out the active
> > buffer and so had to process all entries in the current buffer (or
> > something like that). Has something changed or do I misunderstand?
>
> Yeah, originally we were adding keys to the write buffer directly from
> the transaction commit path, so that necessitated the super fast
> lockless stuff where we'd toggle between buffers so one was always
> available.
>
> Now keys are pulled from the journal, so we can use (somewhat) simpler
> locking and buffering; now the complication is that we can't predict in
> advance how many keys are going to come out of the journal for the write
> buffer.
>
Ok.
> >
> > > return ret;
> > > }
> > >
> > > diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> > > index 6829d80bd181..b8289af66c8e 100644
> > > --- a/fs/bcachefs/recovery.c
> > > +++ b/fs/bcachefs/recovery.c
> > > @@ -228,6 +228,8 @@ static int bch2_journal_replay(struct bch_fs *c)
> > > goto err;
> > > }
> > >
> > > + set_bit(BCH_FS_accounting_replay_done, &c->flags);
> > > +
> >
> > I assume this ties into the question on the previous patch..
> >
> > Related question.. if the write buffer can't flush during journal
> > replay, is there concern/risk of overflowing it?
>
> Shouldn't be any actual risk. It's just new accounting updates that the
> write buffer can't flush, and those are only going to be generated by
> interior btree node updates as journal replay has to split/rewrite nodes
> to make room for its updates.
>
> > And for those new accounting updates, updates to the same counters get
> > accumulated as they're flushed from the journal to the write buffer -
> > see the patch for eytzinger tree accumulation. So we could only overflow
> if the number of distinct counters touched somehow was very large.
>
> And the number of distinct counters will be growing significantly, but
> the new counters will all be for user data, not metadata.
>
> > (Except: that reminds me, we do want to add per-btree counters, so users
> > can see "I have x amount of extents, x amount of dirents, etc.").
>
Heh, Ok. This all does sound a little open ended to me. Maybe the better
question is: suppose this hypothetically does happen after adding a
bunch of new counters, what would the expected side effect be in the
recovery scenario where the write buffer can't be flushed?
If write buffer updates now basically just journal a special entry,
would that basically mean we'd deadlock during recovery due to no longer
being able to insert journal entries due to a pinned write buffer? If
so, that actually seems reasonable to me in the sense that in theory it
at least doesn't break the filesystem on-disk, but obviously it would
require some kind of enhancement in order to complete the recovery (even
if what that is is currently unknown). Hm?
Brian
On Thu, Feb 29, 2024 at 01:44:07PM -0500, Brian Foster wrote:
> On Wed, Feb 28, 2024 at 05:42:39PM -0500, Kent Overstreet wrote:
> > Shouldn't be any actual risk. It's just new accounting updates that the
> > write buffer can't flush, and those are only going to be generated by
> > interior btree node updates as journal replay has to split/rewrite nodes
> > to make room for its updates.
> >
> > And for those new accounting updates, updates to the same counters get
> > accumulated as they're flushed from the journal to the write buffer -
> > see the patch for eytzinger tree accumulation. So we could only overflow
> > if the number of distinct counters touched somehow was very large.
> >
> > And the number of distinct counters will be growing significantly, but
> > the new counters will all be for user data, not metadata.
> >
> > (Except: that reminds me, we do want to add per-btree counters, so users
> > can see "I have x amount of extents, x amount of dirents, etc.").
> >
> Heh, Ok. This all does sound a little open ended to me. Maybe the better
> question is: suppose this hypothetically does happen after adding a
> bunch of new counters, what would the expected side effect be in the
> recovery scenario where the write buffer can't be flushed?

The btree write buffer buf is allowed to grow - we try to keep it bounded
in normal operation, but that's one of the ways we deal with the
unpredictability of the amount of write buffer keys in the journal.

So it'll grow until that kvrealloc fails. It won't show up as a deadlock,
it'll show up as an allocation failure; and for that to happen, that would
mean the number of accounting keys being updated - not the number of
accounting updates, just the number of distinct keys being updated - is no
longer fitting in the write buffer.
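
(For reference, the growth behaviour described above is the usual
grow-on-demand pattern - a simplified userspace sketch, with realloc standing
in for kvrealloc and made-up names (wb_keys, wb_keys_ensure_space):)

#include <errno.h>
#include <stdlib.h>

struct wb_keys {
	void	*data;
	size_t	nr;	/* entries in use */
	size_t	size;	/* entries allocated */
};

/* Grow on demand: double the allocation when full; if the (re)allocation
 * fails, return -ENOMEM instead of blocking, so the caller sees an
 * allocation failure rather than a deadlock. */
static int wb_keys_ensure_space(struct wb_keys *wb, size_t entry_size)
{
	if (wb->nr < wb->size)
		return 0;

	size_t new_size = wb->size ? wb->size * 2 : 64;
	void *n = realloc(wb->data, new_size * entry_size);
	if (!n)
		return -ENOMEM;

	wb->data = n;
	wb->size = new_size;
	return 0;
}
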