[PATCH] bcache: add "clock" cache replacement policy

Robert Pang posted 1 patch 2 months, 1 week ago
There is a newer version of this series
drivers/md/bcache/alloc.c         | 34 +++++++++++++++++++++++++++----
drivers/md/bcache/bcache_ondisk.h |  1 +
drivers/md/bcache/sysfs.c         |  1 +
3 files changed, 32 insertions(+), 4 deletions(-)
[PATCH] bcache: add "clock" cache replacement policy
Posted by Robert Pang 2 months, 1 week ago
This new policy extends the FIFO policy to approximate the classic clock policy
(O(n) time complexity) by considering bucket priority, similar to the LRU
policy.

This policy addresses the high IO latency (1-2 seconds) experienced on
multi-terabyte cache devices when the free list is empty. The default LRU
policy's O(n log n) complexity for sorting priorities for the entire bucket
list causes this delay.

Signed-off-by: Robert Pang <robertpang@google.com>
---
 drivers/md/bcache/alloc.c         | 34 +++++++++++++++++++++++++++----
 drivers/md/bcache/bcache_ondisk.h |  1 +
 drivers/md/bcache/sysfs.c         |  1 +
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 48ce750bf70a..c65c48eab169 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -69,7 +69,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
-#define MAX_OPEN_BUCKETS 128
+#define MAX_OPEN_BUCKETS	128
+#define CHECK_PRIO_SLICES	16
 
 /* Bucket heap / gen */
 
@@ -211,19 +212,41 @@ static void invalidate_buckets_lru(struct cache *ca)
 	}
 }
 
-static void invalidate_buckets_fifo(struct cache *ca)
+/*
+ * When check_prio is true, this FIFO policy examines the priority of the
+ * buckets and invalidates only the ones below a threshold in the priority
+ * ladder. As it goes, the threshold will be raised if not enough buckets are
+ * invalidated. Empty buckets are also invalidated. This evaulation resembles
+ * the LRU policy, and is used to approximate the classic clock-sweep cache
+ * replacement algorithm.
+ */
+static void invalidate_buckets_fifo(struct cache *ca, bool check_prio)
 {
 	struct bucket *b;
 	size_t checked = 0;
+	size_t check_quota = 0;
+	uint16_t prio_threshold = ca->set->min_prio;
 
 	while (!fifo_full(&ca->free_inc)) {
 		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
 		    ca->fifo_last_bucket >= ca->sb.nbuckets)
 			ca->fifo_last_bucket = ca->sb.first_bucket;
 
+		if (check_prio && checked >= check_quota) {
+			BUG_ON(ca->set->min_prio > INITIAL_PRIO);
+			prio_threshold +=
+				DIV_ROUND_UP(INITIAL_PRIO - ca->set->min_prio,
+					     CHECK_PRIO_SLICES);
+			check_quota += DIV_ROUND_UP(ca->sb.nbuckets,
+						    CHECK_PRIO_SLICES);
+		}
+
 		b = ca->buckets + ca->fifo_last_bucket++;
 
-		if (bch_can_invalidate_bucket(ca, b))
+		if (bch_can_invalidate_bucket(ca, b) &&
+		    (!check_prio ||
+		     b->prio <= prio_threshold ||
+		     !GC_SECTORS_USED(b)))
 			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets) {
@@ -269,11 +292,14 @@ static void invalidate_buckets(struct cache *ca)
 		invalidate_buckets_lru(ca);
 		break;
 	case CACHE_REPLACEMENT_FIFO:
-		invalidate_buckets_fifo(ca);
+		invalidate_buckets_fifo(ca, false);
 		break;
 	case CACHE_REPLACEMENT_RANDOM:
 		invalidate_buckets_random(ca);
 		break;
+	case CACHE_REPLACEMENT_CLOCK:
+		invalidate_buckets_fifo(ca, true);
+		break;
 	}
 }
 
diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
index 6620a7f8fffc..d45794e01fe1 100644
--- a/drivers/md/bcache/bcache_ondisk.h
+++ b/drivers/md/bcache/bcache_ondisk.h
@@ -288,6 +288,7 @@ BITMASK(CACHE_REPLACEMENT,		struct cache_sb, flags, 2, 3);
 #define CACHE_REPLACEMENT_LRU		0U
 #define CACHE_REPLACEMENT_FIFO		1U
 #define CACHE_REPLACEMENT_RANDOM	2U
+#define CACHE_REPLACEMENT_CLOCK		3U
 
 BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
 #define CACHE_MODE_WRITETHROUGH		0U
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 826b14cae4e5..c8617bad0648 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -45,6 +45,7 @@ static const char * const cache_replacement_policies[] = {
 	"lru",
 	"fifo",
 	"random",
+	"clock",
 	NULL
 };
 
-- 
2.51.0.710.ga91ca5db03-goog
Re: [PATCH] bcache: add "clock" cache replacement policy
Posted by Coly Li 2 months, 1 week ago
On Mon, Oct 06, 2025 at 04:18:46PM +0800, Robert Pang wrote:
> This new policy extends the FIFO policy to approximate the classic clock policy
> (O(n) time complexity) by considering bucket priority, similar to the LRU
> policy.
> 

Current bcache GC is single thread, clock is good here. BTW, could you please
also add the clock entry into bcache kernel document,
- Documentation/admin-guide/bcache.rst
- Documentation/ABI/testing/sysfs-block-bcache

> This policy addresses the high IO latency (1-2 seconds) experienced on
  ^^^^^^^^^^^-> I assume this policy means LRU, am I correct?

> multi-terabyte cache devices when the free list is empty. The default LRU
> policy's O(n log n) complexity for sorting priorities for the entire bucket
> list causes this delay.
> 

Can you provide performance numbers about lock replacement algorithm and add
them into the commit log?

Yes, for performance optimization, we always need to see the difference made
by this improvement.

Thanks.

Coly Li



> Signed-off-by: Robert Pang <robertpang@google.com>
> ---
>  drivers/md/bcache/alloc.c         | 34 +++++++++++++++++++++++++++----
>  drivers/md/bcache/bcache_ondisk.h |  1 +
>  drivers/md/bcache/sysfs.c         |  1 +
>  3 files changed, 32 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
> index 48ce750bf70a..c65c48eab169 100644
> --- a/drivers/md/bcache/alloc.c
> +++ b/drivers/md/bcache/alloc.c
> @@ -69,7 +69,8 @@
>  #include <linux/random.h>
>  #include <trace/events/bcache.h>
>  
> -#define MAX_OPEN_BUCKETS 128
> +#define MAX_OPEN_BUCKETS	128
> +#define CHECK_PRIO_SLICES	16
>  
>  /* Bucket heap / gen */
>  
> @@ -211,19 +212,41 @@ static void invalidate_buckets_lru(struct cache *ca)
>  	}
>  }
>  
> -static void invalidate_buckets_fifo(struct cache *ca)
> +/*
> + * When check_prio is true, this FIFO policy examines the priority of the
> + * buckets and invalidates only the ones below a threshold in the priority
> + * ladder. As it goes, the threshold will be raised if not enough buckets are
> + * invalidated. Empty buckets are also invalidated. This evaulation resembles
> + * the LRU policy, and is used to approximate the classic clock-sweep cache
> + * replacement algorithm.
> + */
> +static void invalidate_buckets_fifo(struct cache *ca, bool check_prio)
>  {
>  	struct bucket *b;
>  	size_t checked = 0;
> +	size_t check_quota = 0;
> +	uint16_t prio_threshold = ca->set->min_prio;
>  
>  	while (!fifo_full(&ca->free_inc)) {
>  		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
>  		    ca->fifo_last_bucket >= ca->sb.nbuckets)
>  			ca->fifo_last_bucket = ca->sb.first_bucket;
>  
> +		if (check_prio && checked >= check_quota) {
> +			BUG_ON(ca->set->min_prio > INITIAL_PRIO);
> +			prio_threshold +=
> +				DIV_ROUND_UP(INITIAL_PRIO - ca->set->min_prio,
> +					     CHECK_PRIO_SLICES);
> +			check_quota += DIV_ROUND_UP(ca->sb.nbuckets,
> +						    CHECK_PRIO_SLICES);
> +		}
> +
>  		b = ca->buckets + ca->fifo_last_bucket++;
>  
> -		if (bch_can_invalidate_bucket(ca, b))
> +		if (bch_can_invalidate_bucket(ca, b) &&
> +		    (!check_prio ||
> +		     b->prio <= prio_threshold ||
> +		     !GC_SECTORS_USED(b)))
>  			bch_invalidate_one_bucket(ca, b);
>  
>  		if (++checked >= ca->sb.nbuckets) {
> @@ -269,11 +292,14 @@ static void invalidate_buckets(struct cache *ca)
>  		invalidate_buckets_lru(ca);
>  		break;
>  	case CACHE_REPLACEMENT_FIFO:
> -		invalidate_buckets_fifo(ca);
> +		invalidate_buckets_fifo(ca, false);
>  		break;
>  	case CACHE_REPLACEMENT_RANDOM:
>  		invalidate_buckets_random(ca);
>  		break;
> +	case CACHE_REPLACEMENT_CLOCK:
> +		invalidate_buckets_fifo(ca, true);
> +		break;
>  	}
>  }
>  
> diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
> index 6620a7f8fffc..d45794e01fe1 100644
> --- a/drivers/md/bcache/bcache_ondisk.h
> +++ b/drivers/md/bcache/bcache_ondisk.h
> @@ -288,6 +288,7 @@ BITMASK(CACHE_REPLACEMENT,		struct cache_sb, flags, 2, 3);
>  #define CACHE_REPLACEMENT_LRU		0U
>  #define CACHE_REPLACEMENT_FIFO		1U
>  #define CACHE_REPLACEMENT_RANDOM	2U
> +#define CACHE_REPLACEMENT_CLOCK		3U
>  
>  BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
>  #define CACHE_MODE_WRITETHROUGH		0U
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index 826b14cae4e5..c8617bad0648 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -45,6 +45,7 @@ static const char * const cache_replacement_policies[] = {
>  	"lru",
>  	"fifo",
>  	"random",
> +	"clock",
>  	NULL
>  };
>  
> -- 
> 2.51.0.710.ga91ca5db03-goog
Re: [PATCH] bcache: add "clock" cache replacement policy
Posted by Robert Pang 2 months, 1 week ago
Hi Coly

Thank you for your quick look at this patch.

On Mon, Oct 6, 2025 at 10:20 PM Coly Li <colyli@fnnas.com> wrote:
>
> On Mon, Oct 06, 2025 at 04:18:46PM +0800, Robert Pang wrote:
> > This new policy extends the FIFO policy to approximate the classic clock policy
> > (O(n) time complexity) by considering bucket priority, similar to the LRU
> > policy.
> >
>
> Current bcache GC is single thread, clock is good here. BTW, could you please
> also add the clock entry into bcache kernel document,
> - Documentation/admin-guide/bcache.rst
> - Documentation/ABI/testing/sysfs-block-bcache

There is no reference to cache_replacement_policy in
sysfs-block-bcache currently. Will add the clock entry in bcache.rst.

> > This policy addresses the high IO latency (1-2 seconds) experienced on
>   ^^^^^^^^^^^-> I assume this policy means LRU, am I correct?

That's correct.

> > multi-terabyte cache devices when the free list is empty. The default LRU
> > policy's O(n log n) complexity for sorting priorities for the entire bucket
> > list causes this delay.
> >
>
> Can you provide performance numbers about lock replacement algorithm and add
> them into the commit log?
>
> Yes, for performance optimization, we always need to see the difference made
> by this improvement.

Will do it quickly.

Best regards
Robert Pang

> Thanks.
>
> Coly Li
>
>
>
> > Signed-off-by: Robert Pang <robertpang@google.com>
> > ---
> >  drivers/md/bcache/alloc.c         | 34 +++++++++++++++++++++++++++----
> >  drivers/md/bcache/bcache_ondisk.h |  1 +
> >  drivers/md/bcache/sysfs.c         |  1 +
> >  3 files changed, 32 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
> > index 48ce750bf70a..c65c48eab169 100644
> > --- a/drivers/md/bcache/alloc.c
> > +++ b/drivers/md/bcache/alloc.c
> > @@ -69,7 +69,8 @@
> >  #include <linux/random.h>
> >  #include <trace/events/bcache.h>
> >
> > -#define MAX_OPEN_BUCKETS 128
> > +#define MAX_OPEN_BUCKETS     128
> > +#define CHECK_PRIO_SLICES    16
> >
> >  /* Bucket heap / gen */
> >
> > @@ -211,19 +212,41 @@ static void invalidate_buckets_lru(struct cache *ca)
> >       }
> >  }
> >
> > -static void invalidate_buckets_fifo(struct cache *ca)
> > +/*
> > + * When check_prio is true, this FIFO policy examines the priority of the
> > + * buckets and invalidates only the ones below a threshold in the priority
> > + * ladder. As it goes, the threshold will be raised if not enough buckets are
> > + * invalidated. Empty buckets are also invalidated. This evaulation resembles
> > + * the LRU policy, and is used to approximate the classic clock-sweep cache
> > + * replacement algorithm.
> > + */
> > +static void invalidate_buckets_fifo(struct cache *ca, bool check_prio)
> >  {
> >       struct bucket *b;
> >       size_t checked = 0;
> > +     size_t check_quota = 0;
> > +     uint16_t prio_threshold = ca->set->min_prio;
> >
> >       while (!fifo_full(&ca->free_inc)) {
> >               if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
> >                   ca->fifo_last_bucket >= ca->sb.nbuckets)
> >                       ca->fifo_last_bucket = ca->sb.first_bucket;
> >
> > +             if (check_prio && checked >= check_quota) {
> > +                     BUG_ON(ca->set->min_prio > INITIAL_PRIO);
> > +                     prio_threshold +=
> > +                             DIV_ROUND_UP(INITIAL_PRIO - ca->set->min_prio,
> > +                                          CHECK_PRIO_SLICES);
> > +                     check_quota += DIV_ROUND_UP(ca->sb.nbuckets,
> > +                                                 CHECK_PRIO_SLICES);
> > +             }
> > +
> >               b = ca->buckets + ca->fifo_last_bucket++;
> >
> > -             if (bch_can_invalidate_bucket(ca, b))
> > +             if (bch_can_invalidate_bucket(ca, b) &&
> > +                 (!check_prio ||
> > +                  b->prio <= prio_threshold ||
> > +                  !GC_SECTORS_USED(b)))
> >                       bch_invalidate_one_bucket(ca, b);
> >
> >               if (++checked >= ca->sb.nbuckets) {
> > @@ -269,11 +292,14 @@ static void invalidate_buckets(struct cache *ca)
> >               invalidate_buckets_lru(ca);
> >               break;
> >       case CACHE_REPLACEMENT_FIFO:
> > -             invalidate_buckets_fifo(ca);
> > +             invalidate_buckets_fifo(ca, false);
> >               break;
> >       case CACHE_REPLACEMENT_RANDOM:
> >               invalidate_buckets_random(ca);
> >               break;
> > +     case CACHE_REPLACEMENT_CLOCK:
> > +             invalidate_buckets_fifo(ca, true);
> > +             break;
> >       }
> >  }
> >
> > diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
> > index 6620a7f8fffc..d45794e01fe1 100644
> > --- a/drivers/md/bcache/bcache_ondisk.h
> > +++ b/drivers/md/bcache/bcache_ondisk.h
> > @@ -288,6 +288,7 @@ BITMASK(CACHE_REPLACEMENT,                struct cache_sb, flags, 2, 3);
> >  #define CACHE_REPLACEMENT_LRU                0U
> >  #define CACHE_REPLACEMENT_FIFO               1U
> >  #define CACHE_REPLACEMENT_RANDOM     2U
> > +#define CACHE_REPLACEMENT_CLOCK              3U
> >
> >  BITMASK(BDEV_CACHE_MODE,             struct cache_sb, flags, 0, 4);
> >  #define CACHE_MODE_WRITETHROUGH              0U
> > diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> > index 826b14cae4e5..c8617bad0648 100644
> > --- a/drivers/md/bcache/sysfs.c
> > +++ b/drivers/md/bcache/sysfs.c
> > @@ -45,6 +45,7 @@ static const char * const cache_replacement_policies[] = {
> >       "lru",
> >       "fifo",
> >       "random",
> > +     "clock",
> >       NULL
> >  };
> >
> > --
> > 2.51.0.710.ga91ca5db03-goog