blk-mq: add CPU latency limit control

[PATCHv2 2/2] blk-mq: add support for CPU latency limits

Posted by Tero Kristo 1 year, 3 months ago

Add support for setting CPU latency limits when a request is dispatched
to driver layer, and removing it once the device is idle. The latency
limits use the dev PM QoS framework for setting per-cpu limits for
active CPUs. The feature is user configurable via sysfs knobs under the
block device.

Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
---
 block/blk-mq.c         | 54 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h | 12 ++++++++++
 2 files changed, 66 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4b2c8e940f59..f8906e2aff6d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
 #include <linux/blk-crypto.h>
 #include <linux/part_stat.h>
 #include <linux/sched/isolation.h>
+#include <linux/pm_qos.h>
 
 #include <trace/events/block.h>
 
@@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
 static void __blk_mq_flush_plug_list(struct request_queue *q,
 				     struct blk_plug *plug)
 {
+	struct request *req, *next;
+	struct blk_mq_hw_ctx *hctx;
+	int cpu;
+
 	if (blk_queue_quiesced(q))
 		return;
+
+	rq_list_for_each_safe(&plug->mq_list, req, next) {
+		hctx = req->mq_hctx;
+
+		if (next && next->mq_hctx == hctx)
+			continue;
+
+		if (q->disk->cpu_lat_limit < 0)
+			continue;
+
+		hctx->last_active = jiffies + msecs_to_jiffies(q->disk->cpu_lat_timeout);
+
+		if (!hctx->cpu_lat_limit_active) {
+			hctx->cpu_lat_limit_active = true;
+			for_each_cpu(cpu, hctx->cpumask) {
+				struct dev_pm_qos_request *qos;
+
+				qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
+				dev_pm_qos_add_request(get_cpu_device(cpu), qos,
+						       DEV_PM_QOS_RESUME_LATENCY,
+						       q->disk->cpu_lat_limit);
+			}
+			schedule_delayed_work(&hctx->cpu_latency_work,
+					      msecs_to_jiffies(q->disk->cpu_lat_timeout));
+		}
+	}
+
 	q->mq_ops->queue_rqs(&plug->mq_list);
 }
 
+static void blk_mq_cpu_latency_work(struct work_struct *work)
+{
+	struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx,
+						  cpu_latency_work.work);
+	int cpu;
+
+	if (time_after(jiffies, hctx->last_active)) {
+		for_each_cpu(cpu, hctx->cpumask) {
+			struct dev_pm_qos_request *qos;
+
+			qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
+			dev_pm_qos_remove_request(qos);
+		}
+		hctx->cpu_lat_limit_active = false;
+	} else {
+		schedule_delayed_work(&hctx->cpu_latency_work,
+				      msecs_to_jiffies(hctx->queue->disk->cpu_lat_timeout));
+	}
+}
+
 static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
 {
 	struct blk_mq_hw_ctx *this_hctx = NULL;
@@ -3729,6 +3778,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
 		goto exit_flush_rq;
 
+	hctx->cpu_lat_qos = alloc_percpu(struct dev_pm_qos_request);
+	if (!hctx->cpu_lat_qos)
+		goto exit_flush_rq;
+	INIT_DELAYED_WORK(&hctx->cpu_latency_work, blk_mq_cpu_latency_work);
+
 	return 0;
 
  exit_flush_rq:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b751cc92209b..2b61942490d6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -435,6 +435,18 @@ struct blk_mq_hw_ctx {
 	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;
 
+	/** @cpu_latency_work: Work to handle CPU latency PM limits. */
+	struct delayed_work	cpu_latency_work;
+
+	/** @cpu_lat_limit_active: If CPU latency limits are active or not. */
+	bool			cpu_lat_limit_active;
+
+	/** @last_active: Jiffies value when the queue was last active. */
+	unsigned long		last_active;
+
+	/** @cpu_lat_qos: PM QoS latency limits for individual CPUs. */
+	struct dev_pm_qos_request __percpu *cpu_lat_qos;
+
 #ifdef CONFIG_BLK_DEBUG_FS
 	/**
 	 * @debugfs_dir: debugfs directory for this hardware queue. Named
-- 
2.43.1

Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits

Posted by Jens Axboe 1 year, 3 months ago

On 10/18/24 1:30 AM, Tero Kristo wrote:
> @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
>  static void __blk_mq_flush_plug_list(struct request_queue *q,
>  				     struct blk_plug *plug)
>  {
> +	struct request *req, *next;
> +	struct blk_mq_hw_ctx *hctx;
> +	int cpu;
> +
>  	if (blk_queue_quiesced(q))
>  		return;
> +
> +	rq_list_for_each_safe(&plug->mq_list, req, next) {
> +		hctx = req->mq_hctx;
> +
> +		if (next && next->mq_hctx == hctx)
> +			continue;
> +
> +		if (q->disk->cpu_lat_limit < 0)
> +			continue;
> +
> +		hctx->last_active = jiffies + msecs_to_jiffies(q->disk->cpu_lat_timeout);
> +
> +		if (!hctx->cpu_lat_limit_active) {
> +			hctx->cpu_lat_limit_active = true;
> +			for_each_cpu(cpu, hctx->cpumask) {
> +				struct dev_pm_qos_request *qos;
> +
> +				qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
> +				dev_pm_qos_add_request(get_cpu_device(cpu), qos,
> +						       DEV_PM_QOS_RESUME_LATENCY,
> +						       q->disk->cpu_lat_limit);
> +			}
> +			schedule_delayed_work(&hctx->cpu_latency_work,
> +					      msecs_to_jiffies(q->disk->cpu_lat_timeout));
> +		}
> +	}
> +

This is, quite literally, and insane amount of cycles to add to the hot
issue path. You're iterating each request in the list, and then each CPU
in the mask of the hardware context for each request.

This just won't fly, not at all. Like the previous feedback, please
figure out a way to make this cheaper. This means don't iterate a bunch
of stuff.

Outside of that, lots of styling issues here too, but none of that
really matters until the base mechanism is at least half way sane.

-- 
Jens Axboe

Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits

Posted by Tero Kristo 1 year, 3 months ago

On Fri, 2024-10-18 at 08:21 -0600, Jens Axboe wrote:
> On 10/18/24 1:30 AM, Tero Kristo wrote:
> > @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct
> > blk_plug *plug)
> >  static void __blk_mq_flush_plug_list(struct request_queue *q,
> >  				     struct blk_plug *plug)
> >  {
> > +	struct request *req, *next;
> > +	struct blk_mq_hw_ctx *hctx;
> > +	int cpu;
> > +
> >  	if (blk_queue_quiesced(q))
> >  		return;
> > +
> > +	rq_list_for_each_safe(&plug->mq_list, req, next) {
> > +		hctx = req->mq_hctx;
> > +
> > +		if (next && next->mq_hctx == hctx)
> > +			continue;
> > +
> > +		if (q->disk->cpu_lat_limit < 0)
> > +			continue;
> > +
> > +		hctx->last_active = jiffies + msecs_to_jiffies(q-
> > >disk->cpu_lat_timeout);
> > +
> > +		if (!hctx->cpu_lat_limit_active) {
> > +			hctx->cpu_lat_limit_active = true;
> > +			for_each_cpu(cpu, hctx->cpumask) {
> > +				struct dev_pm_qos_request *qos;
> > +
> > +				qos = per_cpu_ptr(hctx-
> > >cpu_lat_qos, cpu);
> > +				dev_pm_qos_add_request(get_cpu_dev
> > ice(cpu), qos,
> > +						      
> > DEV_PM_QOS_RESUME_LATENCY,
> > +						       q->disk-
> > >cpu_lat_limit);
> > +			}
> > +			schedule_delayed_work(&hctx-
> > >cpu_latency_work,
> > +					      msecs_to_jiffies(q-
> > >disk->cpu_lat_timeout));
> > +		}
> > +	}
> > +
> 
> This is, quite literally, and insane amount of cycles to add to the
> hot
> issue path. You're iterating each request in the list, and then each
> CPU
> in the mask of the hardware context for each request.

Ok, I made some optimizations to the code, sending v3 shortly. In this,
all the PM QoS handling and iteration of lists is moved to the
workqueue, and happens in the background. The initial block requests
(until the workqueue fires) may run with higher latency, but that is
most likely an okay compromise.

PS: Please bear with me, my knowledge of the block layer and/or NVMe is
pretty limited. I am sorry if these patches make you frustrated, that
is not my intention.

-Tero

> 
> This just won't fly, not at all. Like the previous feedback, please
> figure out a way to make this cheaper. This means don't iterate a
> bunch
> of stuff.
> 
> Outside of that, lots of styling issues here too, but none of that
> really matters until the base mechanism is at least half way sane.
>

Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits

Posted by Jens Axboe 1 year, 3 months ago

On 10/23/24 7:26 AM, Tero Kristo wrote:
> On Fri, 2024-10-18 at 08:21 -0600, Jens Axboe wrote:
>> On 10/18/24 1:30 AM, Tero Kristo wrote:
>>> @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct
>>> blk_plug *plug)
>>>  static void __blk_mq_flush_plug_list(struct request_queue *q,
>>>  				     struct blk_plug *plug)
>>>  {
>>> +	struct request *req, *next;
>>> +	struct blk_mq_hw_ctx *hctx;
>>> +	int cpu;
>>> +
>>>  	if (blk_queue_quiesced(q))
>>>  		return;
>>> +
>>> +	rq_list_for_each_safe(&plug->mq_list, req, next) {
>>> +		hctx = req->mq_hctx;
>>> +
>>> +		if (next && next->mq_hctx == hctx)
>>> +			continue;
>>> +
>>> +		if (q->disk->cpu_lat_limit < 0)
>>> +			continue;
>>> +
>>> +		hctx->last_active = jiffies + msecs_to_jiffies(q-
>>>> disk->cpu_lat_timeout);
>>> +
>>> +		if (!hctx->cpu_lat_limit_active) {
>>> +			hctx->cpu_lat_limit_active = true;
>>> +			for_each_cpu(cpu, hctx->cpumask) {
>>> +				struct dev_pm_qos_request *qos;
>>> +
>>> +				qos = per_cpu_ptr(hctx-
>>>> cpu_lat_qos, cpu);
>>> +				dev_pm_qos_add_request(get_cpu_dev
>>> ice(cpu), qos,
>>> +						      
>>> DEV_PM_QOS_RESUME_LATENCY,
>>> +						       q->disk-
>>>> cpu_lat_limit);
>>> +			}
>>> +			schedule_delayed_work(&hctx-
>>>> cpu_latency_work,
>>> +					      msecs_to_jiffies(q-
>>>> disk->cpu_lat_timeout));
>>> +		}
>>> +	}
>>> +
>>
>> This is, quite literally, and insane amount of cycles to add to the
>> hot
>> issue path. You're iterating each request in the list, and then each
>> CPU
>> in the mask of the hardware context for each request.
> 
> Ok, I made some optimizations to the code, sending v3 shortly. In this,
> all the PM QoS handling and iteration of lists is moved to the
> workqueue, and happens in the background. The initial block requests
> (until the workqueue fires) may run with higher latency, but that is
> most likely an okay compromise.
> 
> PS: Please bear with me, my knowledge of the block layer and/or NVMe is
> pretty limited. I am sorry if these patches make you frustrated, that
> is not my intention.

That's fine, but I'd much rather you ask for clarification if there's
something that you don't understand, rather than keep adding really
expensive code to the issue path. Pushing the iteration to the workqueue
indeed sounds like the much better approach.

-- 
Jens Axboe