The default worker affinity policy is to use all online cpus, i.e. from 0
to N-1. However, when some of those cpus are busy with other jobs, nvme-tcp
performance suffers.
This patch adds a module parameter to set the cpu affinity for the nvme-tcp
socket worker threads. The parameter is a comma-separated list of CPU
numbers. The list is parsed and the resulting cpumask is used to set the
affinity of the socket worker threads. If the list is empty or the
parsing fails, the default affinity is used.
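For example (the CPU list here is illustrative, taken from the test
setup later in this thread), the parameter can be set at module load time:

  modprobe nvme_tcp cpu_affinity_list=66-68

or changed at runtime through sysfs, since it is declared with 0644
permissions:

  echo 66-68 > /sys/module/nvme_tcp/parameters/cpu_affinity_list

Because the mask is consulted when a queue's io_cpu is chosen, a runtime
change should only take effect for queues created afterwards.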
Signed-off-by: Li Feng <fengli@smartx.com>
---
drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49c9e7bc9116..a82c50adb12b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -31,6 +31,18 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+/* Support for specifying the CPU affinity for the nvme-tcp socket worker
+ * threads. This is a comma separated list of CPU numbers. The list is
+ * parsed and the resulting cpumask is used to set the affinity of the
+ * socket worker threads. If the list is empty or the parsing fails, the
+ * default affinity is used.
+ */
+static char *cpu_affinity_list;
+module_param(cpu_affinity_list, charp, 0644);
+MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
+
+struct cpumask cpu_affinity_mask;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}
+static ssize_t update_cpu_affinity(const char *buf)
+{
+ cpumask_var_t new_value;
+ cpumask_var_t dst_value;
+ int err = 0;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
+ if (err)
+ goto free_new_cpumask;
+
+ if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_new_cpumask;
+ }
+
+ /*
+ * If the new_value does not have any intersection with the cpu_online_mask,
+ * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
+ */
+ if (cpumask_and(dst_value, new_value, cpu_online_mask))
+ cpu_affinity_mask = *dst_value;
+
+ free_cpumask_var(dst_value);
+
+free_new_cpumask:
+ free_cpumask_var(new_value);
+ if (err)
+ pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
+ buf, err);
+ return err;
+}
+
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
@@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+ if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
+ // Set the default cpu_affinity_mask to cpu_online_mask
+ cpu_affinity_mask = *cpu_online_mask;
+ }
+ queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
}
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
--
2.40.0
Hey Li,

> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.
>
> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> socket worker threads. The parameter is a comma separated list of CPU
> numbers. The list is parsed and the resulting cpumask is used to set the
> affinity of the socket worker threads. If the list is empty or the
> parsing fails, the default affinity is used.

I can see how this may benefit a specific set of workloads, but I have a
few issues with this.

- This is exposing a user interface for something that is really
  internal to the driver.

- This is something that can be misleading and could be tricky to get
  right, my concern is that this would only benefit a very niche case.

- If the setting should exist, it should not be global.

- I prefer not to introduce new modparams.

- I'd prefer to find a way to support your use-case without introducing
  a config knob for it.

- It is not backed by performance improvements, but more importantly
  does not cover any potential regressions in key metrics (bw/iops/lat)
  or lack there of.
> - It is not backed by performance improvements, but more importantly
>   does not cover any potential regressions in key metrics (bw/iops/lat)
>   or lack there of.
>

I've already asked for this, without seeing performance numbers
and any regression for general usecase it is hard to justify this.

-ck
> On Apr 18, 2023, at 11:58 AM, Chaitanya Kulkarni <chaitanyak@nvidia.com> wrote:
>
>
>> - It is not backed by performance improvements, but more importantly
>> does not cover any potential regressions in key metrics (bw/iops/lat)
>> or lack there of.
>>
>
> I've already asked for this, without seeing performance numbers
> and any regression for general usecase it is hard to justify this.
>
> -ck
>
>
>
Hi ck & sagi,
There are more io pattern results.
# ENV
[root@Node81 vhost]# uname -a
Linux Node81 5.10.0-136.28.0.104.oe2203sp1.aarch64 #1 SMP Thu Apr 13 10:50:10 CST 2023 aarch64 aarch64 aarch64 GNU/Linux
[root@Node81 vhost]# lscpu
Architecture: aarch64
CPU op-mode(s): 64-bit
Byte Order: Little Endian
CPU(s): 96
On-line CPU(s) list: 0-95
Vendor ID: HiSilicon
BIOS Vendor ID: HiSilicon
Model name: Kunpeng-920
BIOS Model name: HUAWEI Kunpeng 920 5251K
Model: 0
Thread(s) per core: 1
Core(s) per socket: 48
Socket(s): 2
Stepping: 0x1
Frequency boost: disabled
CPU max MHz: 2600.0000
CPU min MHz: 200.0000
BogoMIPS: 200.00
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma dcpop asimddp asimdfhm
Caches (sum of all):
L1d: 6 MiB (96 instances)
L1i: 6 MiB (96 instances)
L2: 48 MiB (96 instances)
L3: 96 MiB (4 instances)
NUMA:
NUMA node(s): 4
NUMA node0 CPU(s): 0-23
NUMA node1 CPU(s): 24-47
NUMA node2 CPU(s): 48-71
NUMA node3 CPU(s): 72-95
Vulnerabilities:
Itlb multihit: Not affected
L1tf: Not affected
Mds: Not affected
Meltdown: Not affected
Mmio stale data: Not affected
Retbleed: Not affected
Spec store bypass: Not affected
Spectre v1: Mitigation; __user pointer sanitization
Spectre v2: Not affected
Srbds: Not affected
Tsx async abort: Not affected
[root@Node81 host3]# numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
node 1 size: 31619 MB
node 1 free: 30598 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 2 size: 0 MB
node 2 free: 0 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 3 size: 31219 MB
node 3 free: 29854 MB
node distances:
node 0 1 2 3
0: 10 12 20 22
1: 12 10 22 24
2: 20 22 10 12
3: 22 24 12 10
[root@Node81 vhost]# lshw -short -c network
H/W path Device Class Description
===========================================================
/0/106/0 enp129s0f0np0 network MT27800 Family [ConnectX-5]
/0/106/0.1 enp129s0f1np1 network MT27800 Family [ConnectX-5]
[root@Node81 vhost]# lspci | grep MT
81:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]
81:00.1 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]
[root@Node81 vhost]# ethtool -i enp129s0f0np0
driver: mlx5_core
version: 5.0-0
firmware-version: 16.35.2000 (MT_0000000425)
expansion-rom-version:
bus-info: 0000:81:00.0
supports-statistics: yes
supports-test: yes
supports-eeprom-access: no
supports-register-dump: no
supports-priv-flags: yes
[root@Node81 vhost]# lspci -s 81:00.0 -vv
81:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]
Subsystem: Mellanox Technologies Device 0121
Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
Latency: 0, Cache Line Size: 32 bytes
Interrupt: pin A routed to IRQ 27
NUMA node: 2
Region 0: Memory at 280002000000 (64-bit, prefetchable) [size=32M]
Expansion ROM at b0200000 [disabled] [size=1M]
Capabilities: [60] Express (v2) Endpoint, MSI 00
DevCap: MaxPayload 512 bytes, PhantFunc 0, Latency L0s unlimited, L1 unlimited
ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 25W
DevCtl: CorrErr+ NonFatalErr+ FatalErr+ UnsupReq-
RlxdOrd+ ExtTag+ PhantFunc- AuxPwr- NoSnoop+ FLReset-
MaxPayload 256 bytes, MaxReadReq 512 bytes
DevSta: CorrErr+ NonFatalErr- FatalErr- UnsupReq+ AuxPwr- TransPend-
LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM not supported
ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp+
LnkCtl: ASPM Disabled; RCB 128 bytes, Disabled- CommClk-
ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
LnkSta: Speed 8GT/s, Width x8
TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
DevCap2: Completion Timeout: Range ABC, TimeoutDis+ NROPrPrP- LTR-
10BitTagComp- 10BitTagReq- OBFF Not Supported, ExtFmt- EETLPPrefix-
EmergencyPowerReduction Not Supported, EmergencyPowerReductionInit-
FRS- TPHComp- ExtTPHComp-
AtomicOpsCap: 32bit- 64bit- 128bitCAS-
DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis- LTR- 10BitTagReq- OBFF Disabled,
AtomicOpsCtl: ReqEn+
LnkCap2: Supported Link Speeds: 2.5-8GT/s, Crosslink- Retimer- 2Retimers- DRS-
LnkCtl2: Target Link Speed: 8GT/s, EnterCompliance- SpeedDis-
Transmit Margin: Normal Operating Range, EnterModifiedCompliance- ComplianceSOS-
Compliance Preset/De-emphasis: -6dB de-emphasis, 0dB preshoot
LnkSta2: Current De-emphasis Level: -6dB, EqualizationComplete+ EqualizationPhase1+
EqualizationPhase2+ EqualizationPhase3+ LinkEqualizationRequest-
Retimer- 2Retimers- CrosslinkRes: unsupported
Capabilities: [48] Vital Product Data
Product Name: CX512A - ConnectX-5 SFP28
Read-only fields:
[PN] Part number: MCX512A-ACUT
[EC] Engineering changes: B7
[V2] Vendor specific: MCX512A-ACUT
[SN] Serial number: MT2211K02268
[V3] Vendor specific: fe87176f019fec1180001070fd62e0e0
[VA] Vendor specific: MLX:MODL=CX512A:MN=MLNX:CSKU=V2:UUID=V3:PCI=V0
[V0] Vendor specific: PCIeGen3 x8
[RV] Reserved: checksum good, 0 byte(s) reserved
End
Capabilities: [9c] MSI-X: Enable+ Count=64 Masked-
Vector table: BAR=0 offset=00002000
PBA: BAR=0 offset=00003000
Capabilities: [c0] Vendor Specific Information: Len=18 <?>
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0-,D1-,D2-,D3hot-,D3cold+)
Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
Capabilities: [100 v1] Advanced Error Reporting
UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq+ ACSViol-
UESvrt: DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
CESta: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr-
CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
AERCap: First Error Pointer: 08, ECRCGenCap+ ECRCGenEn- ECRCChkCap+ ECRCChkEn-
MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
HeaderLog: 00000000 00000000 00000000 00000000
Capabilities: [150 v1] Alternative Routing-ID Interpretation (ARI)
ARICap: MFVC- ACS-, Next Function: 1
ARICtl: MFVC- ACS-, Function Group: 0
Capabilities: [180 v1] Single Root I/O Virtualization (SR-IOV)
IOVCap: Migration- 10BitTagReq- Interrupt Message Number: 000
IOVCtl: Enable- Migration- Interrupt- MSE- ARIHierarchy+ 10BitTagReq-
IOVSta: Migration-
Initial VFs: 8, Total VFs: 8, Number of VFs: 0, Function Dependency Link: 00
VF offset: 2, stride: 1, Device ID: 1018
Supported Page Size: 000007ff, System Page Size: 00000001
Region 0: Memory at 0000280004800000 (64-bit, prefetchable)
VF Migration: offset: 00000000, BIR: 0
Capabilities: [1c0 v1] Secondary PCI Express
LnkCtl3: LnkEquIntrruptEn- PerformEqu-
LaneErrStat: 0
Capabilities: [230 v1] Access Control Services
ACSCap: SrcValid- TransBlk- ReqRedir- CmpltRedir- UpstreamFwd- EgressCtrl- DirectTrans-
ACSCtl: SrcValid- TransBlk- ReqRedir- CmpltRedir- UpstreamFwd- EgressCtrl- DirectTrans-
Kernel driver in use: mlx5_core
Kernel modules: mlx5_core
# Before patch:
2293 root 0 -20 0 0 0 R 68.6 0.0 0:23.85 kworker/0:1H+nvme_tcp_wq
12504 root 20 0 476960 26280 20140 S 41.3 0.0 0:03.38 fio
pattern | bandwidth(MiB/s) | iops | latency(us)
:- | -: | -: | -:
4k-randread-q128-j1 | 308 | 79086 | 1617.33
256k-randread-q128-j1 | 1072 | 4288 | 29822.7
4k-randwrite-q128-j1 | 327 | 83782 | 1526.77
256k-randwrite-q128-j1 | 1807 | 7228 | 17700.8
# After patch:
[root@Node81 host3]# cat /sys/module/nvme_tcp/parameters/cpu_affinity_list
66-68
1862 root 0 -20 0 0 0 R 56.6 0.0 0:59.37 kworker/66:1H+nvme_tcp_wq
12348 root 20 0 476960 25892 19804 S 45.4 0.0 0:02.00 fio
pattern | bandwidth(MiB/s) | iops | latency(us)
:- | -: | -: | -:
4k-randread-q128-j1 | 451 | 115530 | 1107.1
256k-randread-q128-j1 | 1410 | 5641 | 22671.7
4k-randwrite-q128-j1 | 432 | 110738 | 1155.12
256k-randwrite-q128-j1 | 1818 | 7274 | 17591.4
4k-randread-q128-j1 means 4k randread, iodepth = 128, jobs = 1.
The nvme-tcp controller has a single io queue.
Fio is bound to NUMA node 2, CPUs 69-70, like this:
taskset -c 69-70 fio --ioengine=libaio --numjobs=1 …...
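For reference, a full command line matching the pattern names above would
look roughly like the following. This is a hypothetical reconstruction of
the elided options; only taskset, ioengine and numjobs come from the command
shown above, while the job name, device path, runtime and remaining flags
are assumptions:

  # hypothetical fio invocation for the 4k-randread-q128-j1 pattern
  taskset -c 69-70 fio --name=4k-randread-q128-j1 --filename=/dev/nvme0n1 \
      --ioengine=libaio --direct=1 --rw=randread --bs=4k \
      --iodepth=128 --numjobs=1 --runtime=60 --time_based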
> On Apr 18, 2023, at 11:58 AM, Chaitanya Kulkarni <chaitanyak@nvidia.com> wrote:
>
>
>> - It is not backed by performance improvements, but more importantly
>> does not cover any potential regressions in key metrics (bw/iops/lat)
>> or lack there of.
>>
>
> I've already asked for this, without seeing performance numbers
> and any regression for general usecase it is hard to justify this.
>
> -ck
>

Hi ck,

Thanks for your comment.

In the previous mail I only pasted the 256k read result (1GB/s to 1.4GB/s).
I will add more io pattern results asap when the machine environment is
available.
Hi Sagi,

> On Apr 17, 2023, at 9:45 PM, Sagi Grimberg <sagi@grimberg.me> wrote:
>
> Hey Li,
>
>> The default worker affinity policy is using all online cpus, e.g. from 0
>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>> have a bad performance.
>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>> socket worker threads. The parameter is a comma separated list of CPU
>> numbers. The list is parsed and the resulting cpumask is used to set the
>> affinity of the socket worker threads. If the list is empty or the
>> parsing fails, the default affinity is used.
>
> I can see how this may benefit a specific set of workloads, but I have a
> few issues with this.
>
> - This is exposing a user interface for something that is really
>   internal to the driver.
>
> - This is something that can be misleading and could be tricky to get
>   right, my concern is that this would only benefit a very niche case.

Our storage products needs this feature~
If the user doesn’t know what this is, they can keep it default, so I thinks this is
not unacceptable.

> - If the setting should exist, it should not be global.

V2 has fixed it.

> - I prefer not to introduce new modparams.
>
> - I'd prefer to find a way to support your use-case without introducing
>   a config knob for it.
>

I’m looking forward to it.

> - It is not backed by performance improvements, but more importantly
>   does not cover any potential regressions in key metrics (bw/iops/lat)
>   or lack there of.

I can do more tests if needed.

Thanks,
Feng Li
>> Hey Li,
>>
>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>> have a bad performance.
>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>> socket worker threads. The parameter is a comma separated list of CPU
>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>> affinity of the socket worker threads. If the list is empty or the
>>> parsing fails, the default affinity is used.
>>
>> I can see how this may benefit a specific set of workloads, but I have a
>> few issues with this.
>>
>> - This is exposing a user interface for something that is really
>> internal to the driver.
>>
>> - This is something that can be misleading and could be tricky to get
>> right, my concern is that this would only benefit a very niche case.
> Our storage products needs this feature~
> If the user doesn’t know what this is, they can keep it default, so I thinks this is
> not unacceptable.

It doesn't work like that. A user interface is not something exposed to
a specific consumer.

>> - If the setting should exist, it should not be global.
> V2 has fixed it.
>>
>> - I prefer not to introduce new modparams.
>>
>> - I'd prefer to find a way to support your use-case without introducing
>> a config knob for it.
>>
> I’m looking forward to it.

If you change queue_work_on to queue_work, ignoring the io_cpu, does it
address your problem?

Not saying that this should be a solution though.

How many queues does your controller support that you happen to use
queue 0 ?

Also, what happens if you don't pin your process to a specific cpu, does
that change anything?
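For clarity, the experiment suggested above amounts to a change along
these lines. This is only an illustrative hunk, not a posted patch; it
assumes the driver kicks io_work with queue_work_on() on the pre-computed
queue->io_cpu, as current nvme-tcp does:

--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ (illustrative, one of the places that queues io_work)
-	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	queue_work(nvme_tcp_wq, &queue->io_work);

With queue_work() the workqueue core picks the CPU (normally the CPU the
caller runs on, for a per-cpu workqueue) instead of always bouncing the
work to queue->io_cpu.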
Hi Sagi,
On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
> >> Hey Li,
> >>
> >>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>> have a bad performance.
> >>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>> socket worker threads. The parameter is a comma separated list of CPU
> >>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>> affinity of the socket worker threads. If the list is empty or the
> >>> parsing fails, the default affinity is used.
> >>
> >> I can see how this may benefit a specific set of workloads, but I have a
> >> few issues with this.
> >>
> >> - This is exposing a user interface for something that is really
> >> internal to the driver.
> >>
> >> - This is something that can be misleading and could be tricky to get
> >> right, my concern is that this would only benefit a very niche case.
> > Our storage products needs this feature~
> > If the user doesn’t know what this is, they can keep it default, so I thinks this is
> > not unacceptable.
>
> It doesn't work like that. A user interface is not something exposed to
> a specific consumer.
>
> >> - If the setting should exist, it should not be global.
> > V2 has fixed it.
> >>
> >> - I prefer not to introduce new modparams.
> >>
> >> - I'd prefer to find a way to support your use-case without introducing
> >> a config knob for it.
> >>
> > I’m looking forward to it.
>
> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
> address your problem?
Sorry for the late response, I just got my machine back.
Replace the queue_work_on to queue_work, looks like it has a little
good performance.
The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
90('cpus_allowed=90'),
I don't know why the worker 56 is selected.
The performance of 256k read up from 1.15GB/s to 1.35GB/s.
>
> Not saying that this should be a solution though.
>
> How many queues does your controller support that you happen to use
> queue 0 ?
Our controller only support one io queue currently.
>
> Also, what happens if you don't pin your process to a specific cpu, does
> that change anything?
If I don't pin the cpu, the performance has no effect.
Thanks,
Li
> Hi Sagi,
>
> On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>>
>>
>>>> Hey Li,
>>>>
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>>> affinity of the socket worker threads. If the list is empty or the
>>>>> parsing fails, the default affinity is used.
>>>>
>>>> I can see how this may benefit a specific set of workloads, but I have a
>>>> few issues with this.
>>>>
>>>> - This is exposing a user interface for something that is really
>>>> internal to the driver.
>>>>
>>>> - This is something that can be misleading and could be tricky to get
>>>> right, my concern is that this would only benefit a very niche case.
>>> Our storage products needs this feature~
>>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
>>> not unacceptable.
>>
>> It doesn't work like that. A user interface is not something exposed to
>> a specific consumer.
>>
>>>> - If the setting should exist, it should not be global.
>>> V2 has fixed it.
>>>>
>>>> - I prefer not to introduce new modparams.
>>>>
>>>> - I'd prefer to find a way to support your use-case without introducing
>>>> a config knob for it.
>>>>
>>> I’m looking forward to it.
>>
>> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
>> address your problem?
> Sorry for the late response, I just got my machine back.
> Replace the queue_work_on to queue_work, looks like it has a little
> good performance.
> The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> 90('cpus_allowed=90'),
> I don't know why the worker 56 is selected.
> The performance of 256k read up from 1.15GB/s to 1.35GB/s.
The question becomes what would be the impact for multi-threaded
workloads and different NIC/CPU/App placements... This is the
tricky part of touching this stuff.
>> Not saying that this should be a solution though.
>>
>> How many queues does your controller support that you happen to use
>> queue 0 ?
> Our controller only support one io queue currently.
I don't think I ever heard of a fabrics controller that supports
a single io queue.
>>
>> Also, what happens if you don't pin your process to a specific cpu, does
>> that change anything?
> If I don't pin the cpu, the performance has no effect.
Which again, makes this optimization point a niche.
On 4/25/23 10:32, Li Feng wrote:
> Hi Sagi,
>
> On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>>
>>
>>>> Hey Li,
>>>>
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>>> affinity of the socket worker threads. If the list is empty or the
>>>>> parsing fails, the default affinity is used.
>>>>
>>>> I can see how this may benefit a specific set of workloads, but I have a
>>>> few issues with this.
>>>>
>>>> - This is exposing a user interface for something that is really
>>>> internal to the driver.
>>>>
>>>> - This is something that can be misleading and could be tricky to get
>>>> right, my concern is that this would only benefit a very niche case.
>>> Our storage products needs this feature~
>>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
>>> not unacceptable.
>>
>> It doesn't work like that. A user interface is not something exposed to
>> a specific consumer.
>>
>>>> - If the setting should exist, it should not be global.
>>> V2 has fixed it.
>>>>
>>>> - I prefer not to introduce new modparams.
>>>>
>>>> - I'd prefer to find a way to support your use-case without introducing
>>>> a config knob for it.
>>>>
>>> I’m looking forward to it.
>>
>> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
>> address your problem?
> Sorry for the late response, I just got my machine back.
> Replace the queue_work_on to queue_work, looks like it has a little
> good performance.
> The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> 90('cpus_allowed=90'),
> I don't know why the worker 56 is selected.
> The performance of 256k read up from 1.15GB/s to 1.35GB/s.
>
>>
>> Not saying that this should be a solution though.
>>
>> How many queues does your controller support that you happen to use
>> queue 0 ?
> Our controller only support one io queue currently.
Ouch.
Remember, NVMe gets most of the performance improvements by using
several queues, and be able to bind the queues to cpu sets.
Exposing just one queue will be invalidating any assumptions we do,
and trying to improve interrupt steering won't work anyway.
I sincerely doubt we should try to 'optimize' for this rather peculiar
setup.
Cheers,
Hannes
On Wed, Apr 26, 2023 at 7:34 PM Hannes Reinecke <hare@suse.de> wrote:
>
> On 4/25/23 10:32, Li Feng wrote:
> > Hi Sagi,
> >
> > On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <sagi@grimberg.me> wrote:
> >>
> >>
> >>>> Hey Li,
> >>>>
> >>>>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>>>> have a bad performance.
> >>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>>>> socket worker threads. The parameter is a comma separated list of CPU
> >>>>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>>>> affinity of the socket worker threads. If the list is empty or the
> >>>>> parsing fails, the default affinity is used.
> >>>>
> >>>> I can see how this may benefit a specific set of workloads, but I have a
> >>>> few issues with this.
> >>>>
> >>>> - This is exposing a user interface for something that is really
> >>>> internal to the driver.
> >>>>
> >>>> - This is something that can be misleading and could be tricky to get
> >>>> right, my concern is that this would only benefit a very niche case.
> >>> Our storage products needs this feature~
> >>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
> >>> not unacceptable.
> >>
> >> It doesn't work like that. A user interface is not something exposed to
> >> a specific consumer.
> >>
> >>>> - If the setting should exist, it should not be global.
> >>> V2 has fixed it.
> >>>>
> >>>> - I prefer not to introduce new modparams.
> >>>>
> >>>> - I'd prefer to find a way to support your use-case without introducing
> >>>> a config knob for it.
> >>>>
> >>> I’m looking forward to it.
> >>
> >> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
> >> address your problem?
> > Sorry for the late response, I just got my machine back.
> > Replace the queue_work_on to queue_work, looks like it has a little
> > good performance.
> > The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> > 90('cpus_allowed=90'),
> > I don't know why the worker 56 is selected.
> > The performance of 256k read up from 1.15GB/s to 1.35GB/s.
> >
> >>
> >> Not saying that this should be a solution though.
> >>
> >> How many queues does your controller support that you happen to use
> >> queue 0 ?
> > Our controller only support one io queue currently.
>
> Ouch.
> Remember, NVMe gets most of the performance improvements by using
> several queues, and be able to bind the queues to cpu sets.
> Exposing just one queue will be invalidating any assumptions we do,
> and trying to improve interrupt steering won't work anyway.
>
> I sincerely doubt we should try to 'optimize' for this rather peculiar
> setup.
Queues aren't free: they consume both host and device resources,
especially since blk-mq uses a static mapping.
Also, it may depend on how the application uses nvme/tcp; for example,
io_uring may easily saturate one device with a very limited number of
tasks/queues (one or two, or a little more, depending on the device and
driver implementation).
Thanks,
Ming
>>> Not saying that this should be a solution though.
>>>
>>> How many queues does your controller support that you happen to use
>>> queue 0 ?
>> Our controller only support one io queue currently.
>
> Ouch.
> Remember, NVMe gets most of the performance improvements by using
> several queues, and be able to bind the queues to cpu sets.
> Exposing just one queue will be invalidating any assumptions we do,
> and trying to improve interrupt steering won't work anyway.
>
> I sincerely doubt we should try to 'optimize' for this rather peculiar
> setup.

I tend to agree.

This is not a common setup that I'm particularly interested in exporting
something dedicated in the driver for fiddling with it...
Hi Li,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.3-rc6 next-20230412]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Li-Feng/nvme-tcp-Add-support-to-set-the-tcp-worker-cpu-affinity/20230413-143611
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20230413063317.2455680-1-fengli%40smartx.com
patch subject: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity
config: x86_64-randconfig-s021 (https://download.01.org/0day-ci/archive/20230413/202304132042.jQcDkblz-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.4-39-gce1a6720-dirty
# https://github.com/intel-lab-lkp/linux/commit/e5a036c113d5ce43375a6aafedcf705ef8c3acb1
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Li-Feng/nvme-tcp-Add-support-to-set-the-tcp-worker-cpu-affinity/20230413-143611
git checkout e5a036c113d5ce43375a6aafedcf705ef8c3acb1
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 olddefconfig
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/nvme/host/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202304132042.jQcDkblz-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
>> drivers/nvme/host/tcp.c:44:16: sparse: sparse: symbol 'cpu_affinity_mask' was not declared. Should it be static?
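A minimal follow-up fix for this sparse warning would presumably be to make
the variable static, on top of the patch, e.g.:

--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ (illustrative, against the patched tree)
-struct cpumask cpu_affinity_mask;
+static struct cpumask cpu_affinity_mask;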
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests