The default worker affinity policy uses all online CPUs, i.e. CPU 0 through
N-1. However, some CPUs may be busy with other jobs, in which case nvme-tcp
ends up with poor performance.
This patch adds a module parameter to set the cpu affinity for the nvme-tcp
socket worker threads. The parameter is a comma separated list of CPU
numbers. The list is parsed and the resulting cpumask is used to set the
affinity of the socket worker threads. If the list is empty or the
parsing fails, the default affinity is used.
Signed-off-by: Li Feng <fengli@smartx.com>
---
V2 - Fix missing static reported by lkp
drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49c9e7bc9116..47748de5159b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -31,6 +31,18 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+/* Support for specifying the CPU affinity for the nvme-tcp socket worker
+ * threads. This is a comma separated list of CPU numbers. The list is
+ * parsed and the resulting cpumask is used to set the affinity of the
+ * socket worker threads. If the list is empty or the parsing fails, the
+ * default affinity is used.
+ */
+static char *cpu_affinity_list;
+module_param(cpu_affinity_list, charp, 0644);
+MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
+
+static struct cpumask cpu_affinity_mask;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}
+static ssize_t update_cpu_affinity(const char *buf)
+{
+ cpumask_var_t new_value;
+ cpumask_var_t dst_value;
+ int err = 0;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
+ if (err)
+ goto free_new_cpumask;
+
+ if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_new_cpumask;
+ }
+
+ /*
+ * If the new_value does not have any intersection with the cpu_online_mask,
+ * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
+ */
+ if (cpumask_and(dst_value, new_value, cpu_online_mask))
+ cpu_affinity_mask = *dst_value;
+
+ free_cpumask_var(dst_value);
+
+free_new_cpumask:
+ free_cpumask_var(new_value);
+ if (err)
+ pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
+ buf, err);
+ return err;
+}
+
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
@@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+ if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
+ // Set the default cpu_affinity_mask to cpu_online_mask
+ cpu_affinity_mask = *cpu_online_mask;
+ }
+ queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
}
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
--
2.40.0
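For context on what the final hunk above changes: nvme_tcp_set_queue_io_cpu() picks each queue's io_cpu by walking a cpumask with cpumask_next_wrap(n - 1, mask, -1, false); the patch only swaps which mask is walked. The small userspace sketch below emulates that walk to show how a restricted affinity mask would shift the queue-to-CPU assignment. It is an illustration only: next_wrap() and the example masks are made up for this sketch and are not part of the patch.

#include <stdio.h>

/*
 * Rough userspace emulation of cpumask_next_wrap(n - 1, mask, -1, false):
 * return the lowest set bit >= n, wrapping around to the lowest set bit
 * overall if nothing is set at or above n. Illustration only.
 */
static int next_wrap(unsigned long mask, int nbits, int n)
{
	for (int i = n; i < nbits; i++)
		if (mask & (1UL << i))
			return i;
	for (int i = 0; i < n && i < nbits; i++)
		if (mask & (1UL << i))
			return i;
	return -1;
}

int main(void)
{
	unsigned long online = 0xffUL;      /* CPUs 0-7 online */
	unsigned long restricted = 0x0fUL;  /* hypothetical cpu_affinity_list=0-3 */

	for (int q = 0; q < 8; q++)
		printf("io queue %d -> io_cpu %d (default) / %d (restricted)\n",
		       q, next_wrap(online, 8, q), next_wrap(restricted, 8, q));
	return 0;
}

With the default all-online (contiguous) mask this reduces to assigning queue n to CPU n, wrapping past the last CPU.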
On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.

Can you explain in detail how nvme-tcp performs worse in this situation?

If some CPUs are known to be busy, you can submit the nvme-tcp io jobs on
other, non-busy CPUs via taskset, or the scheduler is supposed to choose
proper CPUs for you. And usually an nvme-tcp device should be saturated
with a limited io depth or number of jobs/cpus.

Thanks,
Ming
> On Apr 17, 2023, at 15:37, Ming Lei <ming.lei@redhat.com> wrote:
>
> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>> The default worker affinity policy is using all online cpus, e.g. from 0
>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>> have a bad performance.
>
> Can you explain in detail how nvme-tcp performs worse in this situation?
>
> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> on other non-busy CPUs via taskset, or scheduler is supposed to choose
> proper CPUs for you. And usually nvme-tcp device should be saturated
> with limited io depth or jobs/cpus.
>
> Thanks,
> Ming

Taskset can't work on nvme-tcp io-queues, because the worker cpu is decided at the
nvme-tcp 'connect' stage, not at the io submission stage. Assume there is only one
io-queue and its binding cpu is CPU0: the worker stays on CPU0 no matter which cpus
the io jobs run on.
On Mon, Apr 17, 2023 at 03:50:46PM +0800, Li Feng wrote:
> > On Apr 17, 2023, at 15:37, Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> >> The default worker affinity policy is using all online cpus, e.g. from 0
> >> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >> have a bad performance.
> >
> > Can you explain in detail how nvme-tcp performs worse in this situation?
> >
> > If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> > on other non-busy CPUs via taskset, or scheduler is supposed to choose
> > proper CPUs for you. And usually nvme-tcp device should be saturated
> > with limited io depth or jobs/cpus.
> >
> > Thanks,
> > Ming
>
> Taskset can't work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp 'connect' stage,
> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
> run other cpus.

OK, so it looks like the problem is queue->io_cpu, see nvme_tcp_queue_request().

But I am wondering why nvme-tcp doesn't queue the io work on the current cpu,
and why queue->io_cpu was introduced in the first place. Given that blk-mq
defines cpu affinities for each hw queue, the driver is supposed to submit IO
requests to hardware on the local CPU.

Sagi and guys, any ideas about introducing queue->io_cpu?

Thanks,
Ming
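To make the nvme_tcp_queue_request() reference above concrete, here is a simplified, paraphrased sketch of the submission-path logic around queue->io_cpu. It is not the verbatim driver code: it assumes the driver's internal types and simplifies the deferral condition.

/*
 * Paraphrased sketch of nvme_tcp_queue_request(): an I/O may be sent
 * inline only when the submitting CPU is queue->io_cpu; otherwise the
 * per-queue io_work is kicked, and that work always runs on the fixed
 * queue->io_cpu chosen at connect time.
 */
static void nvme_tcp_queue_request_sketch(struct nvme_tcp_request *req,
					  bool sync, bool last)
{
	struct nvme_tcp_queue *queue = req->queue;
	bool empty;

	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	if (queue->io_cpu == raw_smp_processor_id() && sync && empty &&
	    mutex_trylock(&queue->send_mutex)) {
		/* Fast path: send directly, only from the queue's own CPU. */
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	} else if (last) {
		/* Deferred path: io_work is pinned to queue->io_cpu. */
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}
}

This is also why taskset on the submitting process does not move the socket work: whichever CPU issues the I/O, the deferred work for that queue still runs on the io_cpu picked in nvme_tcp_set_queue_io_cpu() at connect time.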
>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>
>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>
>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>> with limited io depth or jobs/cpus.
>>>
>>> Thanks,
>>> Ming
>>
>> Taskset can't work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp 'connect' stage,
>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>> run other cpus.
>
> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>
> But I am wondering why nvme-tcp doesn't queue the io work on the current
> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
> affinities for each hw queue, driver is supposed to submit IO request
> to hardware on the local CPU.
>
> Sagi and Guys, any ideas about introducing queue->io_cpu?

Hey Ming,

I have some vague memories wrt to this, but from what I recall:

- The number of queues is dependent on both the controller and
the user (Not a reason/motivation on its own, just clarifying).

- It simply matches what pci does (to some extent, outside of rx side
entropy that may exist), it just happens to take more cpu cycles due to
the network stack overhead.

- I didn't want io threads to change CPUs because of RFS/aRFS
optimizations that people use, which allows the NIC to steer interrupts
(and napi context) to where the io thread is running, and thus minimize
latency due to improved locality. that on its own was shown to be worth
over 30% reduction.

- At some point nvme-tcp rx context used to run in softirq, and having
to synchronize different cores (on different numa nodes maybe, depends
on what RSS decided) when processing the socket resulted in high
latency as well. This is not the case today (due to some nics back then
that surfaced various issues with this) but it may come back in
the future again (if shown to provide value).

- Also today when there is a sync network send from .queue_rq path,
it is only executed when the running cpu == queue->io_cpu, to avoid high
contention. My concern is that if the io context is not bound to
a specific cpu, it may create heavier contention on queue serialization.
Today there are at most 2 contexts that compete, io context (triggered from
network rx or scheduled in the submission path) and .queue_rq sync
network send path. I'd prefer to not have to introduce more contention with
increasing number of threads accessing an nvme controller.

Having said that, I don't think there is a fundamental issue with
using queue_work, or queue_work_node(cur_cpu) or
queue_work_node(netdev_home_cpu), if that does not introduce
additional latency in the common cases. Although having io threads
bounce around is going to regress users that use RFS/aRFS...
Hi Sagi,

On Mon, Apr 17, 2023 at 04:33:40PM +0300, Sagi Grimberg wrote:
> > > > > On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> > > > > > The default worker affinity policy is using all online cpus, e.g. from 0
> > > > > > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > > > > > have a bad performance.
> > > > >
> > > > > Can you explain in detail how nvme-tcp performs worse in this situation?
> > > > >
> > > > > If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> > > > > on other non-busy CPUs via taskset, or scheduler is supposed to choose
> > > > > proper CPUs for you. And usually nvme-tcp device should be saturated
> > > > > with limited io depth or jobs/cpus.
> > > > >
> > > > > Thanks,
> > > > > Ming
> > > >
> > > > Taskset can't work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp 'connect' stage,
> > > > not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
> > > > run other cpus.
> > >
> > > OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
> > >
> > > But I am wondering why nvme-tcp doesn't queue the io work on the current
> > > cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
> > > affinities for each hw queue, driver is supposed to submit IO request
> > > to hardware on the local CPU.
> > >
> > > Sagi and Guys, any ideas about introducing queue->io_cpu?
> >
> > Hey Ming,
> >
> > I have some vague memories wrt to this, but from what I recall:
> >
> > - The number of queues is dependent on both the controller and
> > the user (Not a reason/motivation on its own, just clarifying).
> >
> > - It simply matches what pci does (to some extent, outside of rx side
> > entropy that may exist), it just happens to take more cpu cycles due to
> > the network stack overhead.
> >
> > - I didn't want io threads to change CPUs because of RFS/aRFS
> > optimizations that people use, which allows the NIC to steer interrupts
> > (and napi context) to where the io thread is running, and thus minimize
> > latency due to improved locality. that on its own was shown to be worth
> > over 30% reduction.

OK, sounds like one per-queue kthread model may work for this case, and
the kthread cpu affinity can be wired with the hw queue's cpu affinity, and
the scheduler may figure out the perfect time to migrate the kthread to
proper CPUs, and task migration is supposed to happen much less frequently.

> > - At some point nvme-tcp rx context used to run in softirq, and having
> > to synchronize different cores (on different numa nodes maybe, depends
> > on what RSS decided) when processing the socket resulted in high
> > latency as well. This is not the case today (due to some nics back then
> > that surfaced various issues with this) but it may be come back in
> > the future again (if shown to provide value).
> >
> > - Also today when there is a sync network send from .queue_rq path,
> > it is only executed when the running cpu == queue->io_cpu, to avoid high
> > contention. My concern is that if the io context is not bound to

This one looks one optimization for send?

But this way actually causes contention with nvme_tcp_io_work().

> > a specific cpu, it may create heavier contention on queue serialization.
> > Today there are at most 2 contexts that compete, io context (triggered from
> > network rx or scheduled in the submission path) and .queue_rq sync
> > network send path. I'd prefer to not have to introduce more contention with
> > increasing number of threads accessing an nvme controller.

I understand contention doesn't have to require one fixed cpu bound with wq or
kthread. Using a single per-queue work or kthread can avoid contention completely.

Here one tcp queue needs to handle both send & recv, and I guess all tcp sends
need to be serialized, same with tcp recvs. Maybe two wq/kthreads, one
is for send, the other is for recv? Or a single wq/kthread is fine too if
the two can be done in async style.

Then the send_mutex can be saved, maybe nvme/tcp blocking can be removed.

> > Having said that, I don't think there is a fundamental issue with
> > using queue_work, or queue_work_node(cur_cpu) or
> > queue_work_node(netdev_home_cpu), if that does not introduce
> > additional latency in the common cases. Although having io threads
> > bounce around is going to regress users that use RFS/aRFS...

IMO, here the fundamental issue is that one fixed cpu (queue->io_cpu) is selected
for handling IO submission aimed at the same queue, which can't scale,
because we can't expect userspace or the scheduler to reserve this fixed cpu for
the nvme_tcp queue.

Thanks,
Ming
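As an aside, a minimal sketch of the per-queue worker idea Ming floats above - wiring the worker's allowed CPUs to the hw queue's blk-mq cpumask instead of pinning one CPU - might look roughly like this. nvme_tcp_io_thread() and queue->io_task are hypothetical names invented for the sketch and do not exist in the driver.

/*
 * Hypothetical sketch only: allow (rather than pin) a per-queue worker
 * thread on the CPUs that blk-mq mapped to this hw queue, and leave the
 * actual placement to the scheduler.
 */
static int nvme_tcp_start_io_thread(struct nvme_tcp_queue *queue,
				    struct blk_mq_hw_ctx *hctx, int qid)
{
	struct task_struct *t;

	t = kthread_create(nvme_tcp_io_thread, queue, "nvme_tcp_io/%d", qid);
	if (IS_ERR(t))
		return PTR_ERR(t);

	/* Allow, but do not pin: the scheduler may migrate within the mask. */
	set_cpus_allowed_ptr(t, hctx->cpumask);
	queue->io_task = t;		/* hypothetical field */
	wake_up_process(t);
	return 0;
}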
> Hi Sagi,
>
> On Mon, Apr 17, 2023 at 04:33:40PM +0300, Sagi Grimberg wrote:
>>
>>>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>>> have a bad performance.
>>>>>
>>>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>>>
>>>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>>>> with limited io depth or jobs/cpus.
>>>>>
>>>>> Thanks,
>>>>> Ming
>>>>
>>>> Taskset can't work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp 'connect' stage,
>>>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>>>> run other cpus.
>>>
>>> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>>>
>>> But I am wondering why nvme-tcp doesn't queue the io work on the current
>>> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
>>> affinities for each hw queue, driver is supposed to submit IO request
>>> to hardware on the local CPU.
>>>
>>> Sagi and Guys, any ideas about introducing queue->io_cpu?
>>
>> Hey Ming,
>>
>> I have some vague memories wrt to this, but from what I recall:
>>
>> - The number of queues is dependent on both the controller and
>> the user (Not a reason/motivation on its own, just clarifying).
>>
>> - It simply matches what pci does (to some extent, outside of rx side
>> entropy that may exist), it just happens to take more cpu cycles due to
>> the network stack overhead.
>>
>> - I didn't want io threads to change CPUs because of RFS/aRFS
>> optimizations that people use, which allows the NIC to steer interrupts
>> (and napi context) to where the io thread is running, and thus minimize
>> latency due to improved locality. that on its own was shown to be worth
>> over 30% reduction.
>
> OK, sounds like one per-queue kthread model may work for this case, and
> the kthread cpu affinity can be wired with hw queue's cpu affinity, and
> scheduler may figure out perfect time to migrate kthread to proper CPUs,
> and task migration is supposed to happen much less frequently.

That is not my experience with the task scheduler.

>> - At some point nvme-tcp rx context used to run in softirq, and having
>> to synchronize different cores (on different numa nodes maybe, depends
>> on what RSS decided) when processing the socket resulted in high
>> latency as well. This is not the case today (due to some nics back then
>> that surfaced various issues with this) but it may be come back in
>> the future again (if shown to provide value).
>>
>> - Also today when there is a sync network send from .queue_rq path,
>> it is only executed when the running cpu == queue->io_cpu, to avoid high
>> contention. My concern is that if the io context is not bound to
>
> This one looks one optimization for send?

Yes.

> But this way actually causes contention with nvme_tcp_io_work().

Correct, but it is only taken if the current cpu matches the queue
io_cpu, which is either contended locally, or not with voluntary
preemption.

>> a specific cpu, it may create heavier contention on queue serialization.
>> Today there are at most 2 contexts that compete, io context (triggered from
>> network rx or scheduled in the submission path) and .queue_rq sync
>> network send path. I'd prefer to not have to introduce more contention with
>> increasing number of threads accessing an nvme controller.
>
> I understand contention doesn't have to require one fixed cpu bound with wq or
> kthread. Using single per-queue work or kthread can avoid contention completely.

I'd like to keep the optimization for the send path, always context-switching
to a kthread for I/O is expensive. So some contention will happen due to
serialization. I opted to only take the send_mutex if the io context is
scheduled on the same CPU in order to not unconditionally take a mutex.
If we don't control where the queue io thread is scheduled, we will always
attempt to take send_mutex, which will increase the contention on it.

> Here one tcp queue needs to handle both send & recv, and I guess all tcp sends
> need to be serialized, same with tcp recvs. Maybe two wq/kthreads, one
> is for send, the other is for recv? Or single wq/kthread is fine too if
> the two can be done in async style.

That is what is done today.

> Then the send_mutex can be saved, maybe nvme/tcp blocking can be removed.
>
>> Having said that, I don't think there is a fundamental issue with
>> using queue_work, or queue_work_node(cur_cpu) or
>> queue_work_node(netdev_home_cpu), if that does not introduce
>> additional latency in the common cases. Although having io threads
>> bounce around is going to regress users that use RFS/aRFS...
>
> IMO, here the fundamental issue is that one fixed cpu(queue->io_cpu) is selected
> for handling IO submission aiming at same queue, which can't scale,

But a single queue is not designed to scale, we scale with multiple queues.

> because we can't expect userspace or scheduler to reserve this fixed cpu for
> nvme_tcp queue.

Of course not, the queue io context is just another thread, which shares the
cpu with all other system threads. The design in nvme-tcp is that we keep the
io context local to the cpu where the user thread is running. Naturally there
are cases where one may desire to place it on some reserved cpu, or on a
sibling, or based on other factors (like the nic).

So on the one hand, I'd like to preserve the queue <-> cpu locality, and
minimize io threads from bouncing around, and on the other hand, I don't
want to expose all this to users to mangle with low-level stuff. And, I'm
not particularly keen on starting to troubleshoot users' performance issues
that lead to tweaking some finicky scheduler knobs that are system-wide.

But I'm not opposed to changing this if it proves to improve the driver.
> On Apr 17, 2023, at 21:33, Sagi Grimberg <sagi@grimberg.me> wrote:
>
>>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>
>>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>>
>>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>>> with limited io depth or jobs/cpus.
>>>>
>>>> Thanks,
>>>> Ming
>>>
>>> Taskset can't work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp 'connect' stage,
>>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>>> run other cpus.
>>
>> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>> But I am wondering why nvme-tcp doesn't queue the io work on the current
>> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
>> affinities for each hw queue, driver is supposed to submit IO request
>> to hardware on the local CPU.
>> Sagi and Guys, any ideas about introducing queue->io_cpu?
>
> Hey Ming,
>
> I have some vague memories wrt to this, but from what I recall:
>
> - The number of queues is dependent on both the controller and
> the user (Not a reason/motivation on its own, just clarifying).
>
> - It simply matches what pci does (to some extent, outside of rx side
> entropy that may exist), it just happens to take more cpu cycles due to
> the network stack overhead.
>
> - I didn't want io threads to change CPUs because of RFS/aRFS
> optimizations that people use, which allows the NIC to steer interrupts
> (and napi context) to where the io thread is running, and thus minimize
> latency due to improved locality. that on its own was shown to be worth
> over 30% reduction.

RFS does not work well here. On my aarch64 server, the NIC irq is handled on a
NUMA node 2 CPU, while the nvme-tcp io-queue is busy on CPU0.

> - At some point nvme-tcp rx context used to run in softirq, and having
> to synchronize different cores (on different numa nodes maybe, depends
> on what RSS decided) when processing the socket resulted in high
> latency as well. This is not the case today (due to some nics back then
> that surfaced various issues with this) but it may be come back in
> the future again (if shown to provide value).
>
> - Also today when there is a sync network send from .queue_rq path,
> it is only executed when the running cpu == queue->io_cpu, to avoid high
> contention. My concern is that if the io context is not bound to
> a specific cpu, it may create heavier contention on queue serialization.
> Today there are at most 2 contexts that compete, io context (triggered from
> network rx or scheduled in the submission path) and .queue_rq sync
> network send path. I'd prefer to not have to introduce more contention with
> increasing number of threads accessing an nvme controller.
>
> Having said that, I don't think there is a fundamental issue with
> using queue_work, or queue_work_node(cur_cpu) or
> queue_work_node(netdev_home_cpu), if that does not introduce
> additional latency in the common cases. Although having io threads
> bounce around is going to regress users that use RFS/aRFS...
On 4/13/23 15:29, Li Feng wrote:
> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.
>
> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> socket worker threads. The parameter is a comma separated list of CPU
> numbers. The list is parsed and the resulting cpumask is used to set the
> affinity of the socket worker threads. If the list is empty or the
> parsing fails, the default affinity is used.
>
> Signed-off-by: Li Feng <fengli@smartx.com>
> ---
>
> V2 - Fix missing static reported by lkp
>
> drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 53 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 49c9e7bc9116..47748de5159b 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -31,6 +31,18 @@ static int so_priority;
> module_param(so_priority, int, 0644);
> MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
>
> +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
> + * threads. This is a comma separated list of CPU numbers. The list is
> + * parsed and the resulting cpumask is used to set the affinity of the
> + * socket worker threads. If the list is empty or the parsing fails, the
> + * default affinity is used.
> + */
> +static char *cpu_affinity_list;
> +module_param(cpu_affinity_list, charp, 0644);
> +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
> +
> +static struct cpumask cpu_affinity_mask;
> +
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> /* lockdep can detect a circular dependency of the form
> * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
> @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> ctrl->io_queues[HCTX_TYPE_POLL];
> }
>
> +static ssize_t update_cpu_affinity(const char *buf)
> +{
> + cpumask_var_t new_value;
> + cpumask_var_t dst_value;
> + int err = 0;
> +
> + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> + return -ENOMEM;
> +
> + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
> + if (err)
> + goto free_new_cpumask;
> +
> + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
> + err = -ENOMEM;
> + goto free_new_cpumask;
> + }
> +
> + /*
> + * If the new_value does not have any intersection with the cpu_online_mask,
> + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
> + */
> + if (cpumask_and(dst_value, new_value, cpu_online_mask))
> + cpu_affinity_mask = *dst_value;
> +
> + free_cpumask_var(dst_value);
> +
> +free_new_cpumask:
> + free_cpumask_var(new_value);
> + if (err)
> + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
> + buf, err);
> + return err;
> +}
> +
> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> else if (nvme_tcp_poll_queue(queue))
> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> ctrl->io_queues[HCTX_TYPE_READ] - 1;
> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> +
> + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
> + // Set the default cpu_affinity_mask to cpu_online_mask
> + cpu_affinity_mask = *cpu_online_mask;
> + }
> + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
> }
>
> static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
I am not in favour of this.
NVMe-over-Fabrics has _virtual_ queues, which really have no
relationship to the underlying hardware.
So trying to be clever here by tacking queues to CPUs sort of works if
you have one subsystem to talk to, but if you have several where each
exposes a _different_ number of queues you end up with a quite
suboptimal setting (ie you rely on the resulting cpu sets to overlap,
but there is no guarantee that they do).
Rather leave it to the hardware to sort things out, and rely on the
blk-mq CPU mapping to get I/O aligned to CPUs.
Cheers,
Hannes
On Fri, Apr 14, 2023 at 4:36 PM Hannes Reinecke <hare@suse.de> wrote:
>
> On 4/13/23 15:29, Li Feng wrote:
> > The default worker affinity policy is using all online cpus, e.g. from 0
> > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > have a bad performance.
> >
> > This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> > socket worker threads. The parameter is a comma separated list of CPU
> > numbers. The list is parsed and the resulting cpumask is used to set the
> > affinity of the socket worker threads. If the list is empty or the
> > parsing fails, the default affinity is used.
> >
> > Signed-off-by: Li Feng <fengli@smartx.com>
> > ---
> >
> > V2 - Fix missing static reported by lkp
> >
> > drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 53 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 49c9e7bc9116..47748de5159b 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -31,6 +31,18 @@ static int so_priority;
> > module_param(so_priority, int, 0644);
> > MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
> >
> > +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
> > + * threads. This is a comma separated list of CPU numbers. The list is
> > + * parsed and the resulting cpumask is used to set the affinity of the
> > + * socket worker threads. If the list is empty or the parsing fails, the
> > + * default affinity is used.
> > + */
> > +static char *cpu_affinity_list;
> > +module_param(cpu_affinity_list, charp, 0644);
> > +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
> > +
> > +static struct cpumask cpu_affinity_mask;
> > +
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > /* lockdep can detect a circular dependency of the form
> > * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
> > @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> > ctrl->io_queues[HCTX_TYPE_POLL];
> > }
> >
> > +static ssize_t update_cpu_affinity(const char *buf)
> > +{
> > + cpumask_var_t new_value;
> > + cpumask_var_t dst_value;
> > + int err = 0;
> > +
> > + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> > + return -ENOMEM;
> > +
> > + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
> > + if (err)
> > + goto free_new_cpumask;
> > +
> > + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
> > + err = -ENOMEM;
> > + goto free_new_cpumask;
> > + }
> > +
> > + /*
> > + * If the new_value does not have any intersection with the cpu_online_mask,
> > + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
> > + */
> > + if (cpumask_and(dst_value, new_value, cpu_online_mask))
> > + cpu_affinity_mask = *dst_value;
> > +
> > + free_cpumask_var(dst_value);
> > +
> > +free_new_cpumask:
> > + free_cpumask_var(new_value);
> > + if (err)
> > + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
> > + buf, err);
> > + return err;
> > +}
> > +
> > static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> > {
> > struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> > @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> > else if (nvme_tcp_poll_queue(queue))
> > n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> > ctrl->io_queues[HCTX_TYPE_READ] - 1;
> > - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> > +
> > + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
> > + // Set the default cpu_affinity_mask to cpu_online_mask
> > + cpu_affinity_mask = *cpu_online_mask;
> > + }
> > + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
> > }
> >
> > static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
>
> I am not in favour of this.
> NVMe-over-Fabrics has _virtual_ queues, which really have no
> relationship to the underlying hardware.
> So trying to be clever here by tacking queues to CPUs sort of works if
> you have one subsystem to talk to, but if you have several where each
> exposes a _different_ number of queues you end up with a quite
> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> but there is no guarantee that they do).
Thanks for your comment.
The current io-queues/cpu map method is not optimal.
It is stupid, and just starts from 0 to the last CPU, which is not configurable.
> Rather leave it to the hardware to sort things out, and rely on the
> blk-mq CPU mapping to get I/O aligned to CPUs.
>
> Cheers,
>
> Hannes
>
nvme-tcp currently doesn't support a *clever* method of binding an io-queue to
a cpu: the binding is decided at queue allocation time and there is no way to
change it afterwards.
E.g. one subsystem's first io queue binds to CPU0, the next io queue binds to
CPU1, and so on. If the NIC is located on another NUMA node, say node 2
(CPU24 - CPU36), and fio is bound to NUMA node 2, nvme-tcp will still have
poor performance - so how should I tune it?
I would have to change the NIC irq affinity, but that hurts the performance of
other applications on NUMA node 2.
We should maximize the performance of a single subsystem first, and then
maximize the performance of multiple subsystems.
This patch gives a chance to adapt to the NIC and CPU load.
Before and after the patch, on my aarch64 server with 4 NUMA nodes, the 256k
read throughput goes up from 1 GB/s to 1.4 GB/s.
Thanks.
From: Li Feng
> Sent: 14 April 2023 10:35
> >
> > On 4/13/23 15:29, Li Feng wrote:
> > > The default worker affinity policy is using all online cpus, e.g. from 0
> > > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > > have a bad performance.
> > >
> > > This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> > > socket worker threads. The parameter is a comma separated list of CPU
> > > numbers. The list is parsed and the resulting cpumask is used to set the
> > > affinity of the socket worker threads. If the list is empty or the
> > > parsing fails, the default affinity is used.
> > >
...
> > I am not in favour of this.
> > NVMe-over-Fabrics has _virtual_ queues, which really have no
> > relationship to the underlying hardware.
> > So trying to be clever here by tacking queues to CPUs sort of works if
> > you have one subsystem to talk to, but if you have several where each
> > exposes a _different_ number of queues you end up with a quite
> > suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> > but there is no guarantee that they do).
>
> Thanks for your comment.
> The current io-queues/cpu map method is not optimal.
> It is stupid, and just starts from 0 to the last CPU, which is not configurable.

Module parameters suck, and passing the buck to the user
when you can't decide how to do something isn't a good idea either.

If the system is busy, pinning threads to cpus is very hard to
get right.

It can be better to set the threads to run at the lowest RT
priority - so they have priority over all 'normal' threads
and also have a very sticky (but not fixed) cpu affinity so
that all such threads tend to get spread out by the scheduler.
This all works best if the number of RT threads isn't greater
than the number of physical cpus.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
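For reference, the lowest-RT-priority approach David describes maps onto the kernel's sched_set_fifo_low() helper; a hedged sketch follows, where the per-queue worker task passed in is a hypothetical placeholder rather than an existing driver object.

#include <linux/sched.h>

/*
 * Sketch: run an io worker at the lowest SCHED_FIFO priority so it
 * preempts normal tasks yet remains free to migrate between CPUs.
 */
static void nvme_tcp_make_worker_rt(struct task_struct *task)
{
	sched_set_fifo_low(task);	/* SCHED_FIFO, priority 1 */
}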
On 4/15/23 23:06, David Laight wrote:
> From: Li Feng
>> Sent: 14 April 2023 10:35
>>>
>>> On 4/13/23 15:29, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>>
>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>> affinity of the socket worker threads. If the list is empty or the
>>>> parsing fails, the default affinity is used.
>>>>
> ...
>>> I am not in favour of this.
>>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>>> relationship to the underlying hardware.
>>> So trying to be clever here by tacking queues to CPUs sort of works if
>>> you have one subsystem to talk to, but if you have several where each
>>> exposes a _different_ number of queues you end up with a quite
>>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>>> but there is no guarantee that they do).
>>
>> Thanks for your comment.
>> The current io-queues/cpu map method is not optimal.
>> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
> Module parameters suck, and passing the buck to the user
> when you can't decide how to do something isn't a good idea either.
>
> If the system is busy pinning threads to cpus is very hard to
> get right.
>
> It can be better to set the threads to run at the lowest RT
> priority - so they have priority over all 'normal' threads
> and also have a very sticky (but not fixed) cpu affinity so
> that all such threads tends to get spread out by the scheduler.
> This all works best if the number of RT threads isn't greater
> than the number of physical cpu.
>
And the problem is that you cannot give an 'optimal' performance metric
here. With NVMe-over-Fabrics the number of queues is negotiated during
the initial 'connect' call, and the resulting number of queues strongly
depends on target preferences (eg a NetApp array will expose only 4
queues, while with Dell/EMC you end up with up to 128 queues).
And these queues need to be mapped on the underlying hardware, which
has its own issues wrt NUMA affinity.

To give you an example:
Given a setup with a 4 node NUMA machine, one NIC connected to one NUMA
node, each socket having 24 threads, the NIC exposing up to 32
interrupts, and connections to a NetApp _and_ an EMC, how exactly should
the 'best' layout look? And, what _is_ the 'best' layout?
You cannot satisfy the queue requirements from NetApp _and_ EMC, as you
only have one NIC, and you cannot change the interrupt affinity for
each I/O.

Cheers,

Hannes
--
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
On Mon, Apr 17, 2023 at 2:27 PM Hannes Reinecke <hare@suse.de> wrote:
>
> On 4/15/23 23:06, David Laight wrote:
> > From: Li Feng
> >> Sent: 14 April 2023 10:35
> >>>
> >>> On 4/13/23 15:29, Li Feng wrote:
> >>>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>>> have a bad performance.
> >>>>
> >>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>>> socket worker threads. The parameter is a comma separated list of CPU
> >>>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>>> affinity of the socket worker threads. If the list is empty or the
> >>>> parsing fails, the default affinity is used.
> >>>>
> > ...
> >>> I am not in favour of this.
> >>> NVMe-over-Fabrics has _virtual_ queues, which really have no
> >>> relationship to the underlying hardware.
> >>> So trying to be clever here by tacking queues to CPUs sort of works if
> >>> you have one subsystem to talk to, but if you have several where each
> >>> exposes a _different_ number of queues you end up with a quite
> >>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> >>> but there is no guarantee that they do).
> >>
> >> Thanks for your comment.
> >> The current io-queues/cpu map method is not optimal.
> >> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
> >
> > Module parameters suck, and passing the buck to the user
> > when you can't decide how to do something isn't a good idea either.
> >
> > If the system is busy pinning threads to cpus is very hard to
> > get right.
> >
> > It can be better to set the threads to run at the lowest RT
> > priority - so they have priority over all 'normal' threads
> > and also have a very sticky (but not fixed) cpu affinity so
> > that all such threads tends to get spread out by the scheduler.
> > This all works best if the number of RT threads isn't greater
> > than the number of physical cpu.
> >
> And the problem is that you cannot give an 'optimal' performance metric
> here. With NVMe-over-Fabrics the number of queues is negotiated during
> the initial 'connect' call, and the resulting number of queues strongly
> depends on target preferences (eg a NetApp array will expose only 4
> queues, with Dell/EMC you end up with up max 128 queues).
> And these queues need to be mapped on the underlying hardware, which has
> its own issues wrt to NUMA affinity.
>
> To give you an example:
> Given a setup with a 4 node NUMA machine, one NIC connected to
> one NUMA core, each socket having 24 threads, the NIC exposing up to 32
> interrupts, and connections to a NetApp _and_ a EMC, how exactly should
> the 'best' layout look like?
> And, what _is_ the 'best' layout?
> You cannot satisfy the queue requirements from NetApp _and_ EMC, as you
> only have one NIC, and you cannot change the interrupt affinity for each
> I/O.
>
Not all users have so many NICs that they can have one NIC per NUMA node.
The scenario where a machine has only one NIC is quite common.
There is no 'best' layout for all cases, so this parameter is added to let
users select what they want.

> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                Kernel Storage Architect
> hare@suse.de                              +49 911 74053 688
> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
> Myers, Andrew McDonald, Martje Boudien Moerman
>
> 2023年4月16日 上午5:06,David Laight <David.Laight@ACULAB.COM> 写道:
>
> From: Li Feng
>> Sent: 14 April 2023 10:35
>>>
>>> On 4/13/23 15:29, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>>
>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>> affinity of the socket worker threads. If the list is empty or the
>>>> parsing fails, the default affinity is used.
>>>>
> ...
>>> I am not in favour of this.
>>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>>> relationship to the underlying hardware.
>>> So trying to be clever here by tacking queues to CPUs sort of works if
>>> you have one subsystem to talk to, but if you have several where each
>>> exposes a _different_ number of queues you end up with a quite
>>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>>> but there is no guarantee that they do).
>>
>> Thanks for your comment.
>> The current io-queues/cpu map method is not optimal.
>> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
> Module parameters suck, and passing the buck to the user
> when you can't decide how to do something isn't a good idea either.
>
> If the system is busy pinning threads to cpus is very hard to
> get right.
>
> It can be better to set the threads to run at the lowest RT
> priority - so they have priority over all 'normal' threads
> and also have a very sticky (but not fixed) cpu affinity so
> that all such threads tends to get spread out by the scheduler.
> This all works best if the number of RT threads isn't greater
> than the number of physical cpu.
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
Hi David,
RT priority can't solve the cross-NUMA access issue.
If the user doesn't know how to configure this affinity, they can just keep the default.
Cross-NUMA access is not an obvious issue on x86_64, but it is a significant issue
on aarch64 with multiple NUMA nodes.
Thanks.
On 4/14/23 02:35, Li Feng wrote:
> On Fri, Apr 14, 2023 at 4:36 PM Hannes Reinecke <hare@suse.de> wrote:
>> On 4/13/23 15:29, Li Feng wrote:
>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>> have a bad performance.
>>>
>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>> socket worker threads. The parameter is a comma separated list of CPU
>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>> affinity of the socket worker threads. If the list is empty or the
>>> parsing fails, the default affinity is used.
>>>
>>> Signed-off-by: Li Feng <fengli@smartx.com>
>>> ---
>>>
>>> V2 - Fix missing static reported by lkp
>>>
>>> drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 53 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>>> index 49c9e7bc9116..47748de5159b 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -31,6 +31,18 @@ static int so_priority;
>>> module_param(so_priority, int, 0644);
>>> MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
>>>
>>> +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
>>> + * threads. This is a comma separated list of CPU numbers. The list is
>>> + * parsed and the resulting cpumask is used to set the affinity of the
>>> + * socket worker threads. If the list is empty or the parsing fails, the
>>> + * default affinity is used.
>>> + */
>>> +static char *cpu_affinity_list;
>>> +module_param(cpu_affinity_list, charp, 0644);
>>> +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
>>> +
>>> +static struct cpumask cpu_affinity_mask;
>>> +
>>> #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>> /* lockdep can detect a circular dependency of the form
>>> * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
>>> @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
>>> ctrl->io_queues[HCTX_TYPE_POLL];
>>> }
>>>
>>> +static ssize_t update_cpu_affinity(const char *buf)
>>> +{
>>> + cpumask_var_t new_value;
>>> + cpumask_var_t dst_value;
>>> + int err = 0;
>>> +
>>> + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
>>> + return -ENOMEM;
>>> +
>>> + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
>>> + if (err)
>>> + goto free_new_cpumask;
>>> +
>>> + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
>>> + err = -ENOMEM;
>>> + goto free_new_cpumask;
>>> + }
>>> +
>>> + /*
>>> + * If the new_value does not have any intersection with the cpu_online_mask,
>>> + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
>>> + */
>>> + if (cpumask_and(dst_value, new_value, cpu_online_mask))
>>> + cpu_affinity_mask = *dst_value;
>>> +
>>> + free_cpumask_var(dst_value);
>>> +
>>> +free_new_cpumask:
>>> + free_cpumask_var(new_value);
>>> + if (err)
>>> + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
>>> + buf, err);
>>> + return err;
>>> +}
>>> +
>>> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>>> {
>>> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
>>> @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>>> else if (nvme_tcp_poll_queue(queue))
>>> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
>>> ctrl->io_queues[HCTX_TYPE_READ] - 1;
>>> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
>>> +
>>> + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
>>> + // Set the default cpu_affinity_mask to cpu_online_mask
>>> + cpu_affinity_mask = *cpu_online_mask;
>>> + }
>>> + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
>>> }
>>>
>>> static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
>> I am not in favour of this.
>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>> relationship to the underlying hardware.
>> So trying to be clever here by tacking queues to CPUs sort of works if
>> you have one subsystem to talk to, but if you have several where each
>> exposes a _different_ number of queues you end up with a quite
>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>> but there is no guarantee that they do).
> Thanks for your comment.
> The current io-queues/cpu map method is not optimal.
> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
>> Rather leave it to the hardware to sort things out, and rely on the
>> blk-mq CPU mapping to get I/O aligned to CPUs.
>>
>> Cheers,
>>
>> Hannes
>>
> The nvme-tcp currently doesn't support a *clever* method to bind the
> io-queue and cpu,
> it's decided at the allocation and doesn't have a method to change it.
> E.g. One subsystem's first io queue binds to CPU0, the next io queue
> binds to CPU1, and
> if the NIC is located on the other NUMA node 2(CPU24 - CPU36), and
> binds the fio to NUMA node 2,
> the nvme-tcp will still have poor performance,so how should I tune the
> performance?
> I have to change the nic irq affinity, but it will hurt other numa
> node2 application's performance.
> We should maximize one subsystem performance, then maximize multiple
> subsystems performance.
>
> This patch gives a chance to adapt the nic and cpu load.
> Before and after patch, on my aarch64 server with 4 numa nodes, the
> 256k read throughput ups
> from 1GB/s to 1.4GB/s.
>
> Thanks.
>
A patch like this needs to be backed by a detailed performance analysis to
start with, both with and without the patch, to prove its usefulness with
quantitative data; that will avoid further questions and allow reviewers to
come to a conclusion faster ...
Also, please make sure to cover all the possible general use cases, to avoid
further questions such as single-subsystem performance vs. multiple-subsystem
performance ..
-ck