[v4] fs/buffer.c: update per-CPU bh_lru cache via RCU

[PATCH v4] fs/buffer.c: update per-CPU bh_lru cache via RCU

Posted by Marcelo Tosatti 3 years ago


For certain types of applications (for example PLC software or
RAN processing), upon occurrence of an event, it is necessary to
complete a certain task in a maximum amount of time (deadline).

One way to express this requirement is with a pair of numbers, 
deadline time and execution time, where:

	* deadline time: length of time between event and deadline.
	* execution time: length of time it takes for processing of event
			  to occur on a particular hardware platform
			  (uninterrupted).

The particular values depend on use-case. For the case
where the realtime application executes in a virtualized
guest, an IPI which must be serviced in the host will cause 
the following sequence of events:

	1) VM-exit
	2) execution of IPI (and function call)
	3) VM-entry

Which causes an excess of 50us latency as observed by cyclictest
(this violates the latency requirement of vRAN application with 1ms TTI,
for example).

invalidate_bh_lrus() sends an IPI to each CPU that has a non-empty
per-CPU cache:

	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);

To avoid the IPI, free the per-CPU caches remotely via RCU.
Two struct bh_lru instances are allocated for each CPU: one is in
use (the per-CPU bh_lrup pointer points to it), and the other is
either being freed or idle.

An alternative solution would be to protect the fast path
(__find_get_block) with a per-CPU spinlock, and then take that
lock from invalidate_bh_lrus when evaluating whether a given
CPU's buffer_head cache should be invalidated.
This solution would slow down the fast path.

Numbers (16 vCPU guest) for the following test:

for i in `seq 0 50`; do
	mount -o loop alpine-standard-3.17.1-x86_64.iso /mnt/loop
	umount /mnt/loop
done

The time being measured is the interval between entry to and return
from the invalidate_bh_lrus() function call.

Unpatched: average is 2us
	     ┌                                        ┐
[ 0.0,  2.0) ┤████████████████████████▊ 53
[ 2.0,  4.0) ┤████████████████████████████████████  77
[ 4.0,  6.0) ┤████████▍ 18
[ 6.0,  8.0) ┤▌ 1
[ 8.0, 10.0) ┤  0
[10.0, 12.0) ┤  0
[12.0, 14.0) ┤▌ 1
[14.0, 16.0) ┤  0
[16.0, 18.0) ┤▌ 1
	     └                                        ┘
			   Frequency

Patched: average is 16us

	     ┌                                        ┐
[ 0.0, 10.0) ┤██████████████████▍ 35
[10.0, 20.0) ┤████████████████████████████████████  69
[20.0, 30.0) ┤██████████████████▍ 35
[30.0, 40.0) ┤████▎ 8
[40.0, 50.0) ┤█▌ 3
[50.0, 60.0) ┤█▏ 2
	     └                                        ┘
			   Frequency

The fact that invalidate_bh_lrus() is now serialized should not be
an issue, since invalidate_bdev does:

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();    /* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}

Where lru_add_drain_all() is serialized by a single mutex lock
(and there have been no reported use cases where this
serialization is an issue).

Regarding scalability: the results above show that it takes 16us to
execute invalidate_bh_lrus on 16 CPUs, of which 8us are taken by
synchronize_rcu_expedited. The remaining 8us spread over 16 CPUs
gives roughly 500ns per CPU. For a system with 1024 CPUs, we can
therefore infer 8us + 1024*500ns ~= 520us, i.e. on the order of
500us (which seems acceptable).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---

v4: improved changelog, no code change	(Dave Chinner)
v3: fix CPU hotplug
v2: fix sparse warnings (kernel test robot)

diff --git a/fs/buffer.c b/fs/buffer.c
index 9e1e2add541e..e9b4d579eff0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1246,7 +1246,21 @@ struct bh_lru {
 	struct buffer_head *bhs[BH_LRU_SIZE];
 };
 
-static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
+
+/*
+ * Allocate two bh_lrus structures for each CPU. bh_lrup points to the
+ * one that is currently in use, and the update path does the following
+ * (assuming cpu->bh_lrup == bh_lrus[0] on entry):
+ *
+ * cpu->bh_lrup = bh_lrus[1]
+ * synchronize_rcu()
+ * free bh's in bh_lrus[0]
+ */
+static unsigned int bh_lru_idx;
+static DEFINE_PER_CPU(struct bh_lru, bh_lrus[2]) = {{{ NULL }}, {{NULL}}};
+static DEFINE_PER_CPU(struct bh_lru __rcu *, bh_lrup);
+
+static DEFINE_MUTEX(bh_lru_invalidate_mutex);
 
 #ifdef CONFIG_SMP
 #define bh_lru_lock()	local_irq_disable()
@@ -1288,16 +1302,19 @@ static void bh_lru_install(struct buffer_head *bh)
 		return;
 	}
 
-	b = this_cpu_ptr(&bh_lrus);
+	rcu_read_lock();
+	b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
 	for (i = 0; i < BH_LRU_SIZE; i++) {
 		swap(evictee, b->bhs[i]);
 		if (evictee == bh) {
+			rcu_read_unlock();
 			bh_lru_unlock();
 			return;
 		}
 	}
 
 	get_bh(bh);
+	rcu_read_unlock();
 	bh_lru_unlock();
 	brelse(evictee);
 }
@@ -1309,28 +1326,32 @@ static struct buffer_head *
 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 {
 	struct buffer_head *ret = NULL;
+	struct bh_lru *lru;
 	unsigned int i;
 
 	check_irqs_on();
 	bh_lru_lock();
+	rcu_read_lock();
+
+	lru = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
 	for (i = 0; i < BH_LRU_SIZE; i++) {
-		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
+		struct buffer_head *bh = lru->bhs[i];
 
 		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
 		    bh->b_size == size) {
 			if (i) {
 				while (i) {
-					__this_cpu_write(bh_lrus.bhs[i],
-						__this_cpu_read(bh_lrus.bhs[i - 1]));
+					lru->bhs[i] = lru->bhs[i - 1];
 					i--;
 				}
-				__this_cpu_write(bh_lrus.bhs[0], bh);
+				lru->bhs[0] = bh;
 			}
 			get_bh(bh);
 			ret = bh;
 			break;
 		}
 	}
+	rcu_read_unlock();
 	bh_lru_unlock();
 	return ret;
 }
@@ -1424,35 +1445,54 @@ static void __invalidate_bh_lrus(struct bh_lru *b)
 		b->bhs[i] = NULL;
 	}
 }
-/*
- * invalidate_bh_lrus() is called rarely - but not only at unmount.
- * This doesn't race because it runs in each cpu either in irq
- * or with preempt disabled.
- */
-static void invalidate_bh_lru(void *arg)
-{
-	struct bh_lru *b = &get_cpu_var(bh_lrus);
-
-	__invalidate_bh_lrus(b);
-	put_cpu_var(bh_lrus);
-}
 
 bool has_bh_in_lru(int cpu, void *dummy)
 {
-	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+	struct bh_lru *b;
 	int i;
-	
+
+	rcu_read_lock();
+	b = rcu_dereference(per_cpu(bh_lrup, cpu));
 	for (i = 0; i < BH_LRU_SIZE; i++) {
-		if (b->bhs[i])
+		if (b->bhs[i]) {
+			rcu_read_unlock();
 			return true;
+		}
 	}
 
+	rcu_read_unlock();
 	return false;
 }
 
+/*
+ * invalidate_bh_lrus() is called rarely - but not only at unmount.
+ */
 void invalidate_bh_lrus(void)
 {
-	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
+	int cpu, oidx;
+
+	mutex_lock(&bh_lru_invalidate_mutex);
+	cpus_read_lock();
+	oidx = bh_lru_idx;
+	bh_lru_idx++;
+	if (bh_lru_idx >= 2)
+		bh_lru_idx = 0;
+
+	/* Assign the per-CPU bh_lru pointer */
+	for_each_online_cpu(cpu)
+		rcu_assign_pointer(per_cpu(bh_lrup, cpu),
+				   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
+	synchronize_rcu_expedited();
+
+	for_each_online_cpu(cpu) {
+		struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
+
+		bh_lru_lock();
+		__invalidate_bh_lrus(b);
+		bh_lru_unlock();
+	}
+	cpus_read_unlock();
+	mutex_unlock(&bh_lru_invalidate_mutex);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
@@ -1465,8 +1505,10 @@ void invalidate_bh_lrus_cpu(void)
 	struct bh_lru *b;
 
 	bh_lru_lock();
-	b = this_cpu_ptr(&bh_lrus);
+	rcu_read_lock();
+	b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
 	__invalidate_bh_lrus(b);
+	rcu_read_unlock();
 	bh_lru_unlock();
 }
 
@@ -2968,15 +3010,25 @@ void free_buffer_head(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(free_buffer_head);
 
+static int buffer_cpu_online(unsigned int cpu)
+{
+	rcu_assign_pointer(per_cpu(bh_lrup, cpu),
+			   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
+	return 0;
+}
+
 static int buffer_exit_cpu_dead(unsigned int cpu)
 {
 	int i;
-	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+	struct bh_lru *b;
 
+	rcu_read_lock();
+	b = rcu_dereference(per_cpu(bh_lrup, cpu));
 	for (i = 0; i < BH_LRU_SIZE; i++) {
 		brelse(b->bhs[i]);
 		b->bhs[i] = NULL;
 	}
+	rcu_read_unlock();
 	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
 	per_cpu(bh_accounting, cpu).nr = 0;
 	return 0;
@@ -3069,7 +3121,7 @@ EXPORT_SYMBOL(__bh_read_batch);
 void __init buffer_init(void)
 {
 	unsigned long nrpages;
-	int ret;
+	int ret, cpu;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 			sizeof(struct buffer_head), 0,
@@ -3077,6 +3129,11 @@ void __init buffer_init(void)
 				SLAB_MEM_SPREAD),
 				NULL);
 
+	cpus_read_lock();
+	for_each_online_cpu(cpu)
+		rcu_assign_pointer(per_cpu(bh_lrup, cpu), per_cpu_ptr(&bh_lrus[0], cpu));
+	cpus_read_unlock();
+
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
 	 */
@@ -3085,4 +3142,7 @@ void __init buffer_init(void)
 	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
 					NULL, buffer_exit_cpu_dead);
 	WARN_ON(ret < 0);
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/buffer:online",
+					NULL, buffer_cpu_online);
+	WARN_ON(ret < 0);
 }

Re: [PATCH v4] fs/buffer.c: update per-CPU bh_lru cache via RCU

Posted by Valentin Schneider 2 years, 10 months ago

On 30/03/23 16:27, Marcelo Tosatti wrote:
> +/*
> + * invalidate_bh_lrus() is called rarely - but not only at unmount.
> + */
>  void invalidate_bh_lrus(void)
>  {
> -	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
> +	int cpu, oidx;
> +
> +	mutex_lock(&bh_lru_invalidate_mutex);
> +	cpus_read_lock();
> +	oidx = bh_lru_idx;

> +	bh_lru_idx++;
> +	if (bh_lru_idx >= 2)
> +		bh_lru_idx = 0;
> +

You could make this a bool and flip it:
  bh_lru_idx = !bh_lru_idx

> +	/* Assign the per-CPU bh_lru pointer */
> +	for_each_online_cpu(cpu)
> +		rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> +				   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> +	synchronize_rcu_expedited();
> +
> +	for_each_online_cpu(cpu) {
> +		struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
> +
> +		bh_lru_lock();
> +		__invalidate_bh_lrus(b);
> +		bh_lru_unlock();

Given the bh_lrup has been updated and we're past the synchronize_rcu(),
what is bh_lru_lock() used for here?

> +	}
> +	cpus_read_unlock();
> +	mutex_unlock(&bh_lru_invalidate_mutex);

Re scalability, this is shifting a set of per-CPU-IPI callbacks to a single
CPU, which isn't great. Can we consider doing something like [1], i.e. in
the general case send an IPI whose handler does:

  rcu_assign_pointer() + call_rcu(/* invalidation callback */)

and in the case we're NOHZ_FULL and the target CPU is not executing in the
kernel, we do that remotely to reduce interference. We might want to batch
the synchronize_rcu() for the remote invalidates, maybe some abuse of the
API like so?

  /*
   * Condition callback passed to on_each_cpu_cond() by the
   * invalidate_bh_lrus() sketch that follows.
   *
   * Returns true when cpu_in_kernel(cpu) holds, and in that case also
   * clears @cpu from @mask. CPUs for which it returns false stay set in
   * @mask; the caller later walks the remaining bits of that mask.
   *
   * NOTE(review): the second parameter is typed struct cpumask * here,
   * while has_bh_in_lru() - the condition callback used before this
   * patch - takes a void *. Presumably this is the "abuse of the API"
   * mentioned above; confirm how the mask would really be passed.
   */
  bool do_local_invalidate(int cpu, struct cpumask *mask)
  {
          if (cpu_in_kernel(cpu)) {
              /* This CPU is handled by the local callback: drop it from the set. */
              __cpumask_clear_cpu(cpu, mask);
              return true;
          }

          return false;
  }

  /*
   * Sketch of a hybrid invalidate_bh_lrus():
   *
   *  1. Start from a copy of cpu_online_mask.
   *  2. on_each_cpu_cond() runs invalidate_bh_lru on every CPU for which
   *     do_local_invalidate() returns true; that callback also clears
   *     those CPUs from the mask.
   *  3. For the CPUs still set in the mask, switch the per-CPU bh_lrup
   *     pointer, wait with a single synchronize_rcu_expedited(), and
   *     then invalidate remotely (left as a placeholder).
   *
   * NOTE(review): as written the sketch does not declare `cpu`, never
   * calls cpus_read_unlock() to pair with cpus_read_lock(), and uses
   * bh_lru_idx without flipping it the way the patch does - presumably
   * omitted for brevity; confirm before turning this into real code.
   */
  void invalidate_bh_lrus(void)
  {
          cpumask_var_t cpumask;

          cpus_read_lock();
          cpumask_copy(&cpumask, cpu_online_mask);
          /* Local invalidation; the condition callback prunes the mask. */
          on_each_cpu_cond(do_local_invalidate, invalidate_bh_lru, &cpumask, 1);

          /* Only CPUs left in the mask get their bh_lrup pointer switched. */
          for_each_cpu(cpu, &cpumask)
                  rcu_assign_pointer(per_cpu(bh_lrup, cpu),
                                             per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));

          /* One wait covers every pointer switched above. */
          synchronize_rcu_expedited();

          for_each_cpu(cpu, &cpumask) {
                  // Do remote invalidate here
          }
  }

[1]: https://lore.kernel.org/lkml/20230404134224.137038-4-ypodemsk@redhat.com/

>  }
>  EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
>
> @@ -1465,8 +1505,10 @@ void invalidate_bh_lrus_cpu(void)
>       struct bh_lru *b;
>
>       bh_lru_lock();
> -	b = this_cpu_ptr(&bh_lrus);
> +	rcu_read_lock();
> +	b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
>       __invalidate_bh_lrus(b);
> +	rcu_read_unlock();
>       bh_lru_unlock();
>  }
>
> @@ -2968,15 +3010,25 @@ void free_buffer_head(struct buffer_head *bh)
>  }
>  EXPORT_SYMBOL(free_buffer_head);
>
> +static int buffer_cpu_online(unsigned int cpu)
> +{
> +	rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> +			   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> +	return 0;
> +}

What serializes this against invalidate_bh_lrus()? Are you relying on this
running under cpus_write_lock()?

Re: [PATCH v4] fs/buffer.c: update per-CPU bh_lru cache via RCU

Posted by Marcelo Tosatti 2 years, 11 months ago

Friendly ping ?

On Thu, Mar 30, 2023 at 04:27:32PM -0300, Marcelo Tosatti wrote:
> 
> For certain types of applications (for example PLC software or
> RAN processing), upon occurrence of an event, it is necessary to
> complete a certain task in a maximum amount of time (deadline).
> 
> One way to express this requirement is with a pair of numbers, 
> deadline time and execution time, where:
> 
> 	* deadline time: length of time between event and deadline.
> 	* execution time: length of time it takes for processing of event
> 			  to occur on a particular hardware platform
> 			  (uninterrupted).
> 
> The particular values depend on use-case. For the case
> where the realtime application executes in a virtualized
> guest, an IPI which must be serviced in the host will cause 
> the following sequence of events:
> 
> 	1) VM-exit
> 	2) execution of IPI (and function call)
> 	3) VM-entry
> 
> Which causes an excess of 50us latency as observed by cyclictest
> (this violates the latency requirement of vRAN application with 1ms TTI,
> for example).
> 
> invalidate_bh_lrus calls an IPI on each CPU that has non empty
> per-CPU cache:
> 
> 	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
> 
> To avoid the IPI, free the per-CPU caches remotely via RCU.
> Two bh_lrus structures for each CPU are allocated: one is being
> used (assigned to per-CPU bh_lru pointer), and the other is
> being freed (or idle).
> 
> An alternative solution would be to protect the fast path 
> (__find_get_block) with a per-CPU spinlock. Then grab the 
> lock from invalidate_bh_lru, when evaluating whether a given
> CPUs buffer_head cache should be invalidated.
> This solution would slow down the fast path.
> 
> Numbers (16 vCPU guest) for the following test:
> 
> for i in `seq 0 50`;
> 	mount -o loop alpine-standard-3.17.1-x86_64.iso /mnt/loop
> 	umount /mnt/loop
> done
> 
> Where the time being measured is time between invalidate_bh_lrus 
> function call start and return.
> 
> Unpatched: average is 2us
> 	     ┌                                        ┐
> [ 0.0,  2.0) ┤████████████████████████▊ 53
> [ 2.0,  4.0) ┤████████████████████████████████████  77
> [ 4.0,  6.0) ┤████████▍ 18
> [ 6.0,  8.0) ┤▌ 1
> [ 8.0, 10.0) ┤  0
> [10.0, 12.0) ┤  0
> [12.0, 14.0) ┤▌ 1
> [14.0, 16.0) ┤  0
> [16.0, 18.0) ┤▌ 1
> 	     └                                        ┘
> 			   Frequency
> 
> Patched: average is 16us
> 
> 	     ┌                                        ┐
> [ 0.0, 10.0) ┤██████████████████▍ 35
> [10.0, 20.0) ┤████████████████████████████████████  69
> [20.0, 30.0) ┤██████████████████▍ 35
> [30.0, 40.0) ┤████▎ 8
> [40.0, 50.0) ┤█▌ 3
> [50.0, 60.0) ┤█▏ 2
> 	     └                                        ┘
> 			   Frequency
> 
> The fact that invalidate_bh_lru() is now serialized should not be 
> an issue, since invalidate_bdev does:
> 
> /* Invalidate clean unused buffers and pagecache. */
> void invalidate_bdev(struct block_device *bdev)
> {
> 	struct address_space *mapping = bdev->bd_inode->i_mapping;
> 
> 	if (mapping->nrpages) {
> 		invalidate_bh_lrus();
> 		lru_add_drain_all();    /* make sure all lru add caches are flushed */
> 		invalidate_mapping_pages(mapping, 0, -1);
> 	}
> }
> 
> Where lru_add_drain_all() is serialized by a single mutex lock
> (and there have been no reported use cases where this
> serialization is an issue).
> 
> Regarding scalability, considering the results above where 
> it takes 16us to execute invalidate_bh_lrus on 16 CPUs
> (where 8us are taken by synchronize_rcu_expedited),
> we can assume 500ns per CPU. For a system with 
> 1024 CPUs, we can infer 8us + 1024*500ns ~= 500us
> (which seems acceptable).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
> 
> v4: improved changelog, no code change	(Dave Chinner)
> v3: fix CPU hotplug
> v2: fix sparse warnings (kernel test robot)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 9e1e2add541e..e9b4d579eff0 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1246,7 +1246,21 @@ struct bh_lru {
>  	struct buffer_head *bhs[BH_LRU_SIZE];
>  };
>  
> -static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
> +
> +/*
> + * Allocate two bh_lrus structures for each CPU. bh_lru points to the
> + * one that is currently in use, and the update path does
> + * (consider cpu->bh_lru = bh_lrus[0]).
> + *
> + * cpu->bh_lrup = bh_lrus[1]
> + * synchronize_rcu()
> + * free bh's in bh_lrus[0]
> + */
> +static unsigned int bh_lru_idx;
> +static DEFINE_PER_CPU(struct bh_lru, bh_lrus[2]) = {{{ NULL }}, {{NULL}}};
> +static DEFINE_PER_CPU(struct bh_lru __rcu *, bh_lrup);
> +
> +static DEFINE_MUTEX(bh_lru_invalidate_mutex);
>  
>  #ifdef CONFIG_SMP
>  #define bh_lru_lock()	local_irq_disable()
> @@ -1288,16 +1302,19 @@ static void bh_lru_install(struct buffer_head *bh)
>  		return;
>  	}
>  
> -	b = this_cpu_ptr(&bh_lrus);
> +	rcu_read_lock();
> +	b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
>  	for (i = 0; i < BH_LRU_SIZE; i++) {
>  		swap(evictee, b->bhs[i]);
>  		if (evictee == bh) {
> +			rcu_read_unlock();
>  			bh_lru_unlock();
>  			return;
>  		}
>  	}
>  
>  	get_bh(bh);
> +	rcu_read_unlock();
>  	bh_lru_unlock();
>  	brelse(evictee);
>  }
> @@ -1309,28 +1326,32 @@ static struct buffer_head *
>  lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
>  {
>  	struct buffer_head *ret = NULL;
> +	struct bh_lru *lru;
>  	unsigned int i;
>  
>  	check_irqs_on();
>  	bh_lru_lock();
> +	rcu_read_lock();
> +
> +	lru = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
>  	for (i = 0; i < BH_LRU_SIZE; i++) {
> -		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
> +		struct buffer_head *bh = lru->bhs[i];
>  
>  		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
>  		    bh->b_size == size) {
>  			if (i) {
>  				while (i) {
> -					__this_cpu_write(bh_lrus.bhs[i],
> -						__this_cpu_read(bh_lrus.bhs[i - 1]));
> +					lru->bhs[i] = lru->bhs[i - 1];
>  					i--;
>  				}
> -				__this_cpu_write(bh_lrus.bhs[0], bh);
> +				lru->bhs[0] = bh;
>  			}
>  			get_bh(bh);
>  			ret = bh;
>  			break;
>  		}
>  	}
> +	rcu_read_unlock();
>  	bh_lru_unlock();
>  	return ret;
>  }
> @@ -1424,35 +1445,54 @@ static void __invalidate_bh_lrus(struct bh_lru *b)
>  		b->bhs[i] = NULL;
>  	}
>  }
> -/*
> - * invalidate_bh_lrus() is called rarely - but not only at unmount.
> - * This doesn't race because it runs in each cpu either in irq
> - * or with preempt disabled.
> - */
> -static void invalidate_bh_lru(void *arg)
> -{
> -	struct bh_lru *b = &get_cpu_var(bh_lrus);
> -
> -	__invalidate_bh_lrus(b);
> -	put_cpu_var(bh_lrus);
> -}
>  
>  bool has_bh_in_lru(int cpu, void *dummy)
>  {
> -	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
> +	struct bh_lru *b;
>  	int i;
> -	
> +
> +	rcu_read_lock();
> +	b = rcu_dereference(per_cpu(bh_lrup, cpu));
>  	for (i = 0; i < BH_LRU_SIZE; i++) {
> -		if (b->bhs[i])
> +		if (b->bhs[i]) {
> +			rcu_read_unlock();
>  			return true;
> +		}
>  	}
>  
> +	rcu_read_unlock();
>  	return false;
>  }
>  
> +/*
> + * invalidate_bh_lrus() is called rarely - but not only at unmount.
> + */
>  void invalidate_bh_lrus(void)
>  {
> -	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
> +	int cpu, oidx;
> +
> +	mutex_lock(&bh_lru_invalidate_mutex);
> +	cpus_read_lock();
> +	oidx = bh_lru_idx;
> +	bh_lru_idx++;
> +	if (bh_lru_idx >= 2)
> +		bh_lru_idx = 0;
> +
> +	/* Assign the per-CPU bh_lru pointer */
> +	for_each_online_cpu(cpu)
> +		rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> +				   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> +	synchronize_rcu_expedited();
> +
> +	for_each_online_cpu(cpu) {
> +		struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
> +
> +		bh_lru_lock();
> +		__invalidate_bh_lrus(b);
> +		bh_lru_unlock();
> +	}
> +	cpus_read_unlock();
> +	mutex_unlock(&bh_lru_invalidate_mutex);
>  }
>  EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
>  
> @@ -1465,8 +1505,10 @@ void invalidate_bh_lrus_cpu(void)
>  	struct bh_lru *b;
>  
>  	bh_lru_lock();
> -	b = this_cpu_ptr(&bh_lrus);
> +	rcu_read_lock();
> +	b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
>  	__invalidate_bh_lrus(b);
> +	rcu_read_unlock();
>  	bh_lru_unlock();
>  }
>  
> @@ -2968,15 +3010,25 @@ void free_buffer_head(struct buffer_head *bh)
>  }
>  EXPORT_SYMBOL(free_buffer_head);
>  
> +static int buffer_cpu_online(unsigned int cpu)
> +{
> +	rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> +			   per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> +	return 0;
> +}
> +
>  static int buffer_exit_cpu_dead(unsigned int cpu)
>  {
>  	int i;
> -	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
> +	struct bh_lru *b;
>  
> +	rcu_read_lock();
> +	b = rcu_dereference(per_cpu(bh_lrup, cpu));
>  	for (i = 0; i < BH_LRU_SIZE; i++) {
>  		brelse(b->bhs[i]);
>  		b->bhs[i] = NULL;
>  	}
> +	rcu_read_unlock();
>  	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
>  	per_cpu(bh_accounting, cpu).nr = 0;
>  	return 0;
> @@ -3069,7 +3121,7 @@ EXPORT_SYMBOL(__bh_read_batch);
>  void __init buffer_init(void)
>  {
>  	unsigned long nrpages;
> -	int ret;
> +	int ret, cpu;
>  
>  	bh_cachep = kmem_cache_create("buffer_head",
>  			sizeof(struct buffer_head), 0,
> @@ -3077,6 +3129,11 @@ void __init buffer_init(void)
>  				SLAB_MEM_SPREAD),
>  				NULL);
>  
> +	cpus_read_lock();
> +	for_each_online_cpu(cpu)
> +		rcu_assign_pointer(per_cpu(bh_lrup, cpu), per_cpu_ptr(&bh_lrus[0], cpu));
> +	cpus_read_unlock();
> +
>  	/*
>  	 * Limit the bh occupancy to 10% of ZONE_NORMAL
>  	 */
> @@ -3085,4 +3142,7 @@ void __init buffer_init(void)
>  	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
>  					NULL, buffer_exit_cpu_dead);
>  	WARN_ON(ret < 0);
> +	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/buffer:online",
> +					NULL, buffer_cpu_online);
> +	WARN_ON(ret < 0);
>  }
> 
>