[PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

Daniel Wagner posted 10 patches 5 months, 2 weeks ago
There is a newer version of this series
[PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Daniel Wagner 5 months, 2 weeks ago
Extend the capabilities of the generic CPU to hardware queue (hctx)
mapping code, so it maps housekeeping CPUs and isolated CPUs to the
hardware queues evenly.

A hctx is only operational when there is at least one online
housekeeping CPU assigned (aka active_hctx). Thus, check the final
mapping to ensure there is no hctx which has only offline housekeeping
CPUs and online isolated CPUs.

Example mapping result:

  16 online CPUs

  isolcpus=io_queue,2-3,6-7,12-13

Queue mapping:
        hctx0: default 0 2
        hctx1: default 1 3
        hctx2: default 4 6
        hctx3: default 5 7
        hctx4: default 8 12
        hctx5: default 9 13
        hctx6: default 10
        hctx7: default 11
        hctx8: default 14
        hctx9: default 15

IRQ mapping:
        irq 42 affinity 0 effective 0  nvme0q0
        irq 43 affinity 0 effective 0  nvme0q1
        irq 44 affinity 1 effective 1  nvme0q2
        irq 45 affinity 4 effective 4  nvme0q3
        irq 46 affinity 5 effective 5  nvme0q4
        irq 47 affinity 8 effective 8  nvme0q5
        irq 48 affinity 9 effective 9  nvme0q6
        irq 49 affinity 10 effective 10  nvme0q7
        irq 50 affinity 11 effective 11  nvme0q8
        irq 51 affinity 14 effective 14  nvme0q9
        irq 52 affinity 15 effective 15  nvme0q10

A corner case is when the number of online CPUs and present CPUs
differ and the driver asks for fewer queues than online CPUs, e.g.

  8 online CPUs, 16 possible CPUs

  isolcpus=io_queue,2-3,6-7,12-13
  virtio_blk.num_request_queues=2

Queue mapping:
        hctx0: default 0 1 2 3 4 5 6 7 8 12 13
        hctx1: default 9 10 11 14 15

IRQ mapping
        irq 27 affinity 0 effective 0 virtio0-config
        irq 28 affinity 0-1,4-5,8 effective 5 virtio0-req.0
        irq 29 affinity 9-11,14-15 effective 0 virtio0-req.1

Signed-off-by: Daniel Wagner <wagi@kernel.org>
---
 block/blk-mq-cpumap.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 8244ecf878358c0b8de84458dcd5100c2f360213..4cb2724a78e13216e50f0e6b1a18f19ea41a54f8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -17,12 +17,25 @@
 #include "blk.h"
 #include "blk-mq.h"
 
+static struct cpumask blk_hk_online_mask;
+
 static unsigned int blk_mq_num_queues(const struct cpumask *mask,
 				      unsigned int max_queues)
 {
 	unsigned int num;
 
-	num = cpumask_weight(mask);
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
+		const struct cpumask *hk_mask;
+		struct cpumask avail_mask;
+
+		hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
+		cpumask_and(&avail_mask, mask, hk_mask);
+
+		num = cpumask_weight(&avail_mask);
+	} else {
+		num = cpumask_weight(mask);
+	}
+
 	return min_not_zero(num, max_queues);
 }
 
@@ -31,9 +44,13 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask,
  *
  * Returns an affinity mask that represents the queue-to-CPU mapping
  * requested by the block layer based on possible CPUs.
+ * This helper takes isolcpus settings into account.
  */
 const struct cpumask *blk_mq_possible_queue_affinity(void)
 {
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
+		return housekeeping_cpumask(HK_TYPE_IO_QUEUE);
+
 	return cpu_possible_mask;
 }
 EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
@@ -46,6 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
  */
 const struct cpumask *blk_mq_online_queue_affinity(void)
 {
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
+		cpumask_and(&blk_hk_online_mask, cpu_online_mask,
+			    housekeeping_cpumask(HK_TYPE_IO_QUEUE));
+		return &blk_hk_online_mask;
+	}
+
 	return cpu_online_mask;
 }
 EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
@@ -57,7 +80,8 @@ EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
  *		ignored.
  *
  * Calculates the number of queues to be used for a multiqueue
- * device based on the number of possible CPUs.
+ * device based on the number of possible CPUs. This helper
+ * takes isolcpus settings into account.
  */
 unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
 {
@@ -72,7 +96,8 @@ EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
  *		ignored.
  *
  * Calculates the number of queues to be used for a multiqueue
- * device based on the number of online CPUs.
+ * device based on the number of online CPUs. This helper
+ * takes isolcpus settings into account.
  */
 unsigned int blk_mq_num_online_queues(unsigned int max_queues)
 {
@@ -80,11 +105,169 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
 
+static bool blk_mq_hk_validate(struct blk_mq_queue_map *qmap,
+			       const struct cpumask *active_hctx)
+{
+	/*
+	 * Verify if the mapping is usable.
+	 *
+	 * First, mark all hctx which have at least one online housekeeping
+	 * CPU assigned.
+	 */
+	for (int queue = 0; queue < qmap->nr_queues; queue++) {
+		int cpu;
+
+		if (cpumask_test_cpu(queue, active_hctx)) {
+			/*
+			 * This hctx has at least one online housekeeping
+			 * CPU thus it is able to serve any assigned
+			 * isolated CPU.
+			 */
+			continue;
+		}
+
+		/*
+	 * There is no online housekeeping CPU for this hctx, all
+		 * good as long as all isolated CPUs are also offline.
+		 */
+		for_each_online_cpu(cpu) {
+			if (qmap->mq_map[cpu] != queue)
+				continue;
+
+			pr_warn("Unable to create a usable CPU-to-queue mapping with the given constraints\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * blk_mq_map_hk_queues - Create housekeeping CPU to
+ *                        hardware queue mapping
+ * @qmap:	CPU to hardware queue map
+ *
+ * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
+ * contains a valid configuration honoring the isolcpus configuration.
+ */
+static void blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
+{
+	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
+	struct cpumask *hk_masks __free(kfree) = NULL;
+	const struct cpumask *mask;
+	unsigned int queue, cpu, nr_masks;
+
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
+		mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
+	else
+		goto fallback;
+
+	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
+		goto fallback;
+
+	/* Map housekeeping CPUs to a hctx */
+	hk_masks = group_mask_cpus_evenly(qmap->nr_queues, mask, &nr_masks);
+	if (!hk_masks)
+		goto fallback;
+
+	for (queue = 0; queue < qmap->nr_queues; queue++) {
+		unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
+
+		for_each_cpu(cpu, &hk_masks[idx]) {
+			qmap->mq_map[cpu] = idx;
+
+			if (cpu_online(cpu))
+				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);
+		}
+	}
+
+	/* Map isolcpus to hardware context */
+	queue = cpumask_first(active_hctx);
+	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
+		qmap->mq_map[cpu] = (qmap->queue_offset + queue) % nr_masks;
+		queue = cpumask_next_wrap(queue, active_hctx);
+	}
+
+	if (!blk_mq_hk_validate(qmap, active_hctx))
+		goto fallback;
+
+	return;
+
+fallback:
+	/*
+	 * Map all CPUs to the first hctx to ensure at least one online
+	 * housekeeping CPU is serving it.
+	 */
+	for_each_possible_cpu(cpu)
+		qmap->mq_map[cpu] = 0;
+}
+
+/*
+ * blk_mq_map_hk_irq_queues - Create housekeeping CPU to
+ *                            hardware queue mapping
+ * @dev:	The device to map queues
+ * @qmap:	CPU to hardware queue map
+ * @offset:	Queue offset to use for the device
+ *
+ * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
+ * contains a valid configuration honoring the isolcpus configuration.
+ */
+static void blk_mq_map_hk_irq_queues(struct device *dev,
+				     struct blk_mq_queue_map *qmap,
+				     int offset)
+{
+	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
+	cpumask_var_t mask __free(free_cpumask_var) = NULL;
+	unsigned int queue, cpu;
+
+	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
+		goto fallback;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		goto fallback;
+
+	/* Map housekeeping CPUs to a hctx */
+	for (queue = 0; queue < qmap->nr_queues; queue++) {
+		for_each_cpu(cpu, dev->bus->irq_get_affinity(dev, offset + queue)) {
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
+
+			cpumask_set_cpu(cpu, mask);
+			if (cpu_online(cpu))
+				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);
+		}
+	}
+
+	/* Map isolcpus to hardware context */
+	queue = cpumask_first(active_hctx);
+	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
+		qmap->mq_map[cpu] = qmap->queue_offset + queue;
+		queue = cpumask_next_wrap(queue, active_hctx);
+	}
+
+	if (!blk_mq_hk_validate(qmap, active_hctx))
+		goto fallback;
+
+	return;
+
+fallback:
+	/*
+	 * Map all CPUs to the first hctx to ensure at least one online
+	 * housekeeping CPU is serving it.
+	 */
+	for_each_possible_cpu(cpu)
+		qmap->mq_map[cpu] = 0;
+}
+
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
 	const struct cpumask *masks;
 	unsigned int queue, cpu, nr_masks;
 
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
+		blk_mq_map_hk_queues(qmap);
+		return;
+	}
+
 	masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
 	if (!masks) {
 		for_each_possible_cpu(cpu)
@@ -139,6 +322,11 @@ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
 	if (!dev->bus->irq_get_affinity)
 		goto fallback;
 
+	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
+		blk_mq_map_hk_irq_queues(dev, qmap, offset);
+		return;
+	}
+
 	for (queue = 0; queue < qmap->nr_queues; queue++) {
 		mask = dev->bus->irq_get_affinity(dev, queue + offset);
 		if (!mask)

-- 
2.50.0
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by kernel test robot 5 months, 2 weeks ago
Hi Daniel,

kernel test robot noticed the following build errors:

[auto build test ERROR on 32f85e8468ce081d8e73ca3f0d588f1004013037]

url:    https://github.com/intel-lab-lkp/linux/commits/Daniel-Wagner/lib-group_cpus-Add-group_masks_cpus_evenly/20250703-003811
base:   32f85e8468ce081d8e73ca3f0d588f1004013037
patch link:    https://lore.kernel.org/r/20250702-isolcpus-io-queues-v7-8-557aa7eacce4%40kernel.org
patch subject: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
config: arm-allnoconfig (https://download.01.org/0day-ci/archive/20250703/202507032238.AoTmQnGP-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project f1a4bb62452d88a0edd9340b3ca7c9b11ad9193f)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250703/202507032238.AoTmQnGP-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507032238.AoTmQnGP-lkp@intel.com/

All errors (new ones prefixed by >>):

>> block/blk-mq-cpumap.c:155:16: error: array initializer must be an initializer list
     155 |         cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
         |                       ^
   block/blk-mq-cpumap.c:219:16: error: array initializer must be an initializer list
     219 |         cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
         |                       ^
   block/blk-mq-cpumap.c:220:16: error: array initializer must be an initializer list
     220 |         cpumask_var_t mask __free(free_cpumask_var) = NULL;
         |                       ^
   3 errors generated.


vim +155 block/blk-mq-cpumap.c

   144	
   145	/*
   146	 * blk_mq_map_hk_queues - Create housekeeping CPU to
   147	 *                        hardware queue mapping
   148	 * @qmap:	CPU to hardware queue map
   149	 *
   150	 * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
   151	 * contains a valid configuration honoring the isolcpus configuration.
   152	 */
   153	static void blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
   154	{
 > 155		cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
   156		struct cpumask *hk_masks __free(kfree) = NULL;
   157		const struct cpumask *mask;
   158		unsigned int queue, cpu, nr_masks;
   159	
   160		if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
   161			mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
   162		else
   163			goto fallback;
   164	
   165		if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
   166			goto fallback;
   167	
   168		/* Map housekeeping CPUs to a hctx */
   169		hk_masks = group_mask_cpus_evenly(qmap->nr_queues, mask, &nr_masks);
   170		if (!hk_masks)
   171			goto fallback;
   172	
   173		for (queue = 0; queue < qmap->nr_queues; queue++) {
   174			unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
   175	
   176			for_each_cpu(cpu, &hk_masks[idx]) {
   177				qmap->mq_map[cpu] = idx;
   178	
   179				if (cpu_online(cpu))
   180					cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);
   181			}
   182		}
   183	
   184		/* Map isolcpus to hardware context */
   185		queue = cpumask_first(active_hctx);
   186		for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
   187			qmap->mq_map[cpu] = (qmap->queue_offset + queue) % nr_masks;
   188			queue = cpumask_next_wrap(queue, active_hctx);
   189		}
   190	
   191		if (!blk_mq_hk_validate(qmap, active_hctx))
   192			goto fallback;
   193	
   194		return;
   195	
   196	fallback:
   197		/*
   198		 * Map all CPUs to the first hctx to ensure at least one online
   199		 * housekeeping CPU is serving it.
   200		 */
   201		for_each_possible_cpu(cpu)
   202			qmap->mq_map[cpu] = 0;
   203	}
   204	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Christoph Hellwig 5 months, 2 weeks ago
On Wed, Jul 02, 2025 at 06:33:58PM +0200, Daniel Wagner wrote:
>  const struct cpumask *blk_mq_possible_queue_affinity(void)
>  {
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +		return housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +
>  	return cpu_possible_mask;
>  }

I'm no expert on the housekeeping stuff, but why isn't the
housekeeping_enabled check done in housekeeping_cpumask directly so
that the drivers could use housekeeping_cpumask without a blk-mq
wrapper?
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Daniel Wagner 5 months, 2 weeks ago
On Thu, Jul 03, 2025 at 11:01:58AM +0200, Christoph Hellwig wrote:
> On Wed, Jul 02, 2025 at 06:33:58PM +0200, Daniel Wagner wrote:
> >  const struct cpumask *blk_mq_possible_queue_affinity(void)
> >  {
> > +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> > +		return housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> > +
> >  	return cpu_possible_mask;
> >  }
> 
> I'm no expert on the housekeeping stuff, but why isn't the
> housekeeping_enabled check done in housekeeping_cpumask directly so
> that the drivers could use housekeeping_cpumask without a blk-mq
> wrapper?

Yes, housekeeping_cpumask will return cpu_possible_mask when housekeeping
is disabled. Though some drivers want cpu_online_mask instead. If all
drivers would agree on one version of the mask, it should be possible to
drop these helpers (maybe the housekeeping API would need to be extended
then, though).

This is also what Hannes brought up. If the number of supported hardware
queues for a device is less than cpu_possible_mask, it really makes
sense to distribute the hardware queues only between the online cpus. I
think the only two drivers which are interested in the cpu_possible_mask
are nvme-pci and virtio.
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Christoph Hellwig 5 months, 2 weeks ago
On Fri, Jul 04, 2025 at 11:00:56AM +0200, Daniel Wagner wrote:
> > I'm no expert on the housekeeping stuff, but why isn't the
> > housekeeping_enabled check done in housekeeping_cpumask directly so
> > that the drivers could use housekeeping_cpumask without a blk-mq
> > wrapper?
> 
> Yes, housekeeping_cpumask will return cpu_possible_mask when housekeping
> is disabled. Though some drivers want cpu_online_mask instead. If all
> drivers would agree on one version of the mask it should allow to drop
> to these helpers (maybe we the houskeeping API needs to be extended then
> though)

Drivers don't get cpu hotplug notifications, so cpu_possible_mask is
the only valid answer right now.  That could change if we ever implement
notifications to the drivers.

> This is also what Hannes brought up. If the number of supported hardware
> queues for a device is less than cpu_possible_mask, it really makes
> sense to distribute the hardware queues only between the online cpus. I
> think the only two drivers which are interested in the cpu_possible_mask
> are nvme-pci and virtio.

That's the only two drivers that get it right :(
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Hannes Reinecke 5 months, 2 weeks ago
On 7/2/25 18:33, Daniel Wagner wrote:
> Extend the capabilities of the generic CPU to hardware queue (hctx)
> mapping code, so it maps houskeeping CPUs and isolated CPUs to the
> hardware queues evenly.
> 
> A hctx is only operational when there is at least one online
> housekeeping CPU assigned (aka active_hctx). Thus, check the final
> mapping that there is no hctx which has only offline housekeeing CPU and
> online isolated CPUs.
> 
> Example mapping result:
> 
>    16 online CPUs
> 
>    isolcpus=io_queue,2-3,6-7,12-13
> 
> Queue mapping:
>          hctx0: default 0 2
>          hctx1: default 1 3
>          hctx2: default 4 6
>          hctx3: default 5 7
>          hctx4: default 8 12
>          hctx5: default 9 13
>          hctx6: default 10
>          hctx7: default 11
>          hctx8: default 14
>          hctx9: default 15
> 
> IRQ mapping:
>          irq 42 affinity 0 effective 0  nvme0q0
>          irq 43 affinity 0 effective 0  nvme0q1
>          irq 44 affinity 1 effective 1  nvme0q2
>          irq 45 affinity 4 effective 4  nvme0q3
>          irq 46 affinity 5 effective 5  nvme0q4
>          irq 47 affinity 8 effective 8  nvme0q5
>          irq 48 affinity 9 effective 9  nvme0q6
>          irq 49 affinity 10 effective 10  nvme0q7
>          irq 50 affinity 11 effective 11  nvme0q8
>          irq 51 affinity 14 effective 14  nvme0q9
>          irq 52 affinity 15 effective 15  nvme0q10
> 
> A corner case is when the number of online CPUs and present CPUs
> differ and the driver asks for less queues than online CPUs, e.g.
> 
>    8 online CPUs, 16 possible CPUs
> 
>    isolcpus=io_queue,2-3,6-7,12-13
>    virtio_blk.num_request_queues=2
> 
> Queue mapping:
>          hctx0: default 0 1 2 3 4 5 6 7 8 12 13
>          hctx1: default 9 10 11 14 15
> 
> IRQ mapping
>          irq 27 affinity 0 effective 0 virtio0-config
>          irq 28 affinity 0-1,4-5,8 effective 5 virtio0-req.0
>          irq 29 affinity 9-11,14-15 effective 0 virtio0-req.1
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>   block/blk-mq-cpumap.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 191 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 8244ecf878358c0b8de84458dcd5100c2f360213..4cb2724a78e13216e50f0e6b1a18f19ea41a54f8 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -17,12 +17,25 @@
>   #include "blk.h"
>   #include "blk-mq.h"
>   
> +static struct cpumask blk_hk_online_mask;
> +
>   static unsigned int blk_mq_num_queues(const struct cpumask *mask,
>   				      unsigned int max_queues)
>   {
>   	unsigned int num;
>   
> -	num = cpumask_weight(mask);
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		const struct cpumask *hk_mask;
> +		struct cpumask avail_mask;
> +
> +		hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +		cpumask_and(&avail_mask, mask, hk_mask);
> +
> +		num = cpumask_weight(&avail_mask);
> +	} else {
> +		num = cpumask_weight(mask);
> +	}
> +
>   	return min_not_zero(num, max_queues);
>   }
>   
> @@ -31,9 +44,13 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask,
>    *
>    * Returns an affinity mask that represents the queue-to-CPU mapping
>    * requested by the block layer based on possible CPUs.
> + * This helper takes isolcpus settings into account.
>    */
>   const struct cpumask *blk_mq_possible_queue_affinity(void)
>   {
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +		return housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +
>   	return cpu_possible_mask;
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
> @@ -46,6 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
>    */
>   const struct cpumask *blk_mq_online_queue_affinity(void)
>   {
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		cpumask_and(&blk_hk_online_mask, cpu_online_mask,
> +			    housekeeping_cpumask(HK_TYPE_IO_QUEUE));
> +		return &blk_hk_online_mask;
> +	}
> +
>   	return cpu_online_mask;
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
> @@ -57,7 +80,8 @@ EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
>    *		ignored.
>    *
>    * Calculates the number of queues to be used for a multiqueue
> - * device based on the number of possible CPUs.
> + * device based on the number of possible CPUs. This helper
> + * takes isolcpus settings into account.
>    */
>   unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
>   {
> @@ -72,7 +96,8 @@ EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
>    *		ignored.
>    *
>    * Calculates the number of queues to be used for a multiqueue
> - * device based on the number of online CPUs.
> + * device based on the number of online CPUs. This helper
> + * takes isolcpus settings into account.
>    */
>   unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>   {
> @@ -80,11 +105,169 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
>   
> +static bool blk_mq_hk_validate(struct blk_mq_queue_map *qmap,
> +			       const struct cpumask *active_hctx)
> +{
> +	/*
> +	 * Verify if the mapping is usable.
> +	 *
> +	 * First, mark all hctx which have at least online houskeeping
> +	 * CPU assigned.
> +	 */
> +	for (int queue = 0; queue < qmap->nr_queues; queue++) {
> +		int cpu;
> +
> +		if (cpumask_test_cpu(queue, active_hctx)) {
> +			/*
> +			 * This htcx has at least one online houskeeping
> +			 * CPU thus it is able to serve any assigned
> +			 * isolated CPU.
> +			 */
> +			continue;
> +		}
> +
> +		/*
> +		 * There is no online houskeeping CPU for this hctx, all
> +		 * good as long as all isolated CPUs are also offline.
> +		 */
> +		for_each_online_cpu(cpu) {
> +			if (qmap->mq_map[cpu] != queue)
> +				continue;
> +
> +			pr_warn("Unable to create a usable CPU-to-queue mapping with the given constraints\n");
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +/*
> + * blk_mq_map_hk_queues - Create housekeeping CPU to
> + *                        hardware queue mapping
> + * @qmap:	CPU to hardware queue map
> + *
> + * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
> + * contains a valid configuration honoring the isolcpus configuration.
> + */
> +static void blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
> +{
> +	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
> +	struct cpumask *hk_masks __free(kfree) = NULL;
> +	const struct cpumask *mask;
> +	unsigned int queue, cpu, nr_masks;
> +
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +		mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +	else
> +		goto fallback;
> +
> +	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
> +		goto fallback;
> +
> +	/* Map housekeeping CPUs to a hctx */
> +	hk_masks = group_mask_cpus_evenly(qmap->nr_queues, mask, &nr_masks);
> +	if (!hk_masks)
> +		goto fallback;
> +
> +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> +		unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
> +
> +		for_each_cpu(cpu, &hk_masks[idx]) {
> +			qmap->mq_map[cpu] = idx;
> +
> +			if (cpu_online(cpu))
> +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);

Why cpu_online? Up until this point it really didn't matter if the 
affinity mask was set to 'online' or 'possible' cpus, but here you
require CPUs to be online...

> +		}
> +	}
> +
> +	/* Map isolcpus to hardware context */
> +	queue = cpumask_first(active_hctx);
> +	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
> +		qmap->mq_map[cpu] = (qmap->queue_offset + queue) % nr_masks;
> +		queue = cpumask_next_wrap(queue, active_hctx);
> +	}

Really? Doesn't this map _all_ cpus, and not just the isolcpus?

> +
> +	if (!blk_mq_hk_validate(qmap, active_hctx))
> +		goto fallback;
> +
> +	return;
> +
> +fallback:
> +	/*
> +	 * Map all CPUs to the first hctx to ensure at least one online
> +	 * housekeeping CPU is serving it.
> +	 */
> +	for_each_possible_cpu(cpu)
> +		qmap->mq_map[cpu] = 0;

I think you need to map all hctx, no?

> +}
> +
> +/*
> + * blk_mq_map_hk_irq_queues - Create housekeeping CPU to
> + *                            hardware queue mapping
> + * @dev:	The device to map queues
> + * @qmap:	CPU to hardware queue map
> + * @offset:	Queue offset to use for the device
> + *
> + * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
> + * contains a valid configuration honoring the isolcpus configuration.
> + */
> +static void blk_mq_map_hk_irq_queues(struct device *dev,
> +				     struct blk_mq_queue_map *qmap,
> +				     int offset)
> +{
> +	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
> +	cpumask_var_t mask __free(free_cpumask_var) = NULL;
> +	unsigned int queue, cpu;
> +
> +	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
> +		goto fallback;
> +
> +	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto fallback;
> +
> +	/* Map housekeeping CPUs to a hctx */
> +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> +		for_each_cpu(cpu, dev->bus->irq_get_affinity(dev, offset + queue)) {
> +			qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +
> +			cpumask_set_cpu(cpu, mask);
> +			if (cpu_online(cpu))
> +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);

Now that is really curious. You pick up the interrupt affinity from the
'bus', which, I assume, is the PCI bus. And this would imply that the
bus can (or already is) programmed for this interrupt affinity.
Which would imply that this is a usable interrupt affinity from the
hardware perspective, irrespective on whether the cpu is online or not.
So why the check to cpu_online()? Can't we simply take the existing 
affinity and rely on the hardware to do the right thing?

> +		}
> +	}
> +
> +	/* Map isolcpus to hardware context */
> +	queue = cpumask_first(active_hctx);
> +	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
> +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +		queue = cpumask_next_wrap(queue, active_hctx);
> +	}
> +
> +	if (!blk_mq_hk_validate(qmap, active_hctx))
> +		goto fallback;
> +
> +	return;
> +
> +fallback:
> +	/*
> +	 * Map all CPUs to the first hctx to ensure at least one online
> +	 * housekeeping CPU is serving it.
> +	 */
> +	for_each_possible_cpu(cpu)
> +		qmap->mq_map[cpu] = 0;

Same comment as previously; don't we need to map all hctx?

> +}
> +
>   void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>   {
>   	const struct cpumask *masks;
>   	unsigned int queue, cpu, nr_masks;
>   
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		blk_mq_map_hk_queues(qmap);
> +		return;
> +	}
> +
>   	masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
>   	if (!masks) {
>   		for_each_possible_cpu(cpu)
> @@ -139,6 +322,11 @@ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
>   	if (!dev->bus->irq_get_affinity)
>   		goto fallback;
>   
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		blk_mq_map_hk_irq_queues(dev, qmap, offset);
> +		return;
> +	}
> +
>   	for (queue = 0; queue < qmap->nr_queues; queue++) {
>   		mask = dev->bus->irq_get_affinity(dev, queue + offset);
>   		if (!mask)
> 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue is enabled
Posted by Daniel Wagner 5 months, 2 weeks ago
On Thu, Jul 03, 2025 at 08:58:02AM +0200, Hannes Reinecke wrote:
> > +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> > +		unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
> > +
> > +		for_each_cpu(cpu, &hk_masks[idx]) {
> > +			qmap->mq_map[cpu] = idx;
> > +
> > +			if (cpu_online(cpu))
> > +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);
> 
> Why cpu_online? Up until this point it really didn't matter if the affinity
> mask was set to 'online' or 'possible' cpus, but here you
> require CPUs to be online...

This part here tracks if a hardware context has at least one
housekeeping CPU online. It is possible to provide a configuration where
we end up with hardware contexts which have offline housekeeping CPUs
and online isolcpus. active_hctx tracks which of the hardware contexts
are usable, which is used in the next step...

> > +		}
> > +	}
> > +
> > +	/* Map isolcpus to hardware context */
> > +	queue = cpumask_first(active_hctx);
> > +	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
> > +		qmap->mq_map[cpu] = (qmap->queue_offset + queue) % nr_masks;
> > +		queue = cpumask_next_wrap(queue, active_hctx);
> > +	}
> 
> Really? Doesn't this map _all_ cpus, and not just the isolcpus?

for_each_cpu iterates over all CPUs which are not housekeeping CPUs
(mask is the housekeeping mask), thus these are all isolated CPUs. Note
the 'andnot' part.

The cpumask_first/cpumask_next_wrap calls return only hardware contexts
which have at least one online housekeeping CPU. Yes, it is possible to
make this a bit smarter, so that we keep the grouping of the offline
CPUs intact, though I am not sure if it is worth adding complexity for a
corner case, at least not yet.

> > +fallback:
> > +	/*
> > +	 * Map all CPUs to the first hctx to ensure at least one online
> > +	 * housekeeping CPU is serving it.
> > +	 */
> > +	for_each_possible_cpu(cpu)
> > +		qmap->mq_map[cpu] = 0;
> 
> I think you need to map all hctx, no?

The block layer is filtering out hctxs which have no CPU assigned to
them when selecting a queue. This is really a failsafe mode; it just
makes sure the system boots.

> > +	/* Map housekeeping CPUs to a hctx */
> > +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> > +		for_each_cpu(cpu, dev->bus->irq_get_affinity(dev, offset + queue)) {
> > +			qmap->mq_map[cpu] = qmap->queue_offset + queue;
> > +
> > +			cpumask_set_cpu(cpu, mask);
> > +			if (cpu_online(cpu))
> > +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);
> 
> Now that is really curious. You pick up the interrupt affinity from the
> 'bus', which, I assume, is the PCI bus. And this would imply that the
> bus can (or already is) programmed for this interrupt affinity.

Yes, this is the case: irq_create_affinity_masks uses
group_cpus_evenly/group_mask_cpus_evenly for the number of requested
IRQs. The number of IRQs can be higher than the number of requested
queues here. It's necessary to use the affinity masks created by
irq_create_affinity_masks as input.

> Which would imply that this is a usable interrupt affinity from the
> hardware perspective, irrespective on whether the cpu is online or
> not. So why the check to cpu_online()? Can't we simply take the existing affinity
> and rely on the hardware to do the right thing?

Again, this is tracking whether a hctx has an online housekeeping CPU.