The dynamic numa-aware osq_lock supports NUMA architectures and is based
on the kernel's in-box osq_lock. After it is enabled with
echo 1 > /proc/zx_numa_lock/dynamic_enable, the patch keeps checking how
many processes are waiting for an osq lock. If the number exceeds a
threshold, the patch stops these processes, switches to the numa-aware
osq lock, and then restarts them. Through a cleanup workqueue, the
numa-aware osq lock turns back into an osq_lock once all nodes are
unlocked, and all the numa-aware lock memory is returned to the
pre-allocated Linux kernel memory cache.

The struct optimistic_spin_queue of the dynamic numa-aware lock is still
4 bytes, the same as the in-box osq lock. When dynamic switching is
enabled, it is accessed as three members through a union: tail16
(2 bytes, supporting up to 65534 CPU cores) and two 1-byte members for
the lock switch state and the numa memory index.

The serial and numa members are added to struct optimistic_spin_node to
track how many processes are waiting for an osq lock. Each time a
process queues for an osq lock, the serial increases by 1 (a short
sketch of the resulting depth check follows the test notes below). The
numa member is one of the conditions for switching from osq_lock to the
numa-aware lock.

We have evaluated the performance of the dynamic numa-aware osq lock
with perf, locktorture, unixbench, fxmark, etc.

fxmark: Filesystem Multicore Scalability Benchmark
https://github.com/sslab-gatech/fxmark

The following results were measured on a Zhaoxin KH40000 32-core
processor, a Zhaoxin KH40000 32+32-core two-socket system, and a
two-socket AMD EPYC 7551 32-core system. Since I am not very familiar
with AMD CPUs, the code supporting AMD CPUs is only a sample. The number
under Average is the average of five test runs; CV is the Coefficient of
Variation. The kernel source is 6.6.28 stable, compiled with the default
configuration; 6.6.28-osq-dynamic is kernel 6.6.28 with the patch
applied and dynamic switching enabled. The OS is Ubuntu 22.04.02 LTS,
gcc version 9.5.0.
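Before the numbers, here is a minimal sketch of how the waiting depth
mentioned above is derived from the per-node serial numbers. It is not
part of the patch; the struct and helper names are invented here, only
the serial field mirrors the one added by this series:

/*
 * Minimal sketch, not from the patch: each CPU that queues sets
 * node->serial = prev->serial + 1, so the number of waiters between
 * the releasing node and the current queue tail is just the
 * difference of the two serial numbers (see osq_unlock() in the series).
 */
struct osq_node_sketch {
	unsigned int serial;	/* queue position, prev->serial + 1 */
};

static inline unsigned int osq_queue_depth(const struct osq_node_sketch *me,
					   const struct osq_node_sketch *tail)
{
	return tail->serial - me->serial;
}

When this depth stays above osq_lock_depth for more than osq_keep_times
consecutive unlocks, the lock is switched to the numa-aware path.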
perf bench      Zhaoxin KH40000 32 cores
kernel          6.6.28                6.6.28-osq-dynamic
epoll           Average     CV        Average     CV        Improve
ADD             25620       0.78%     64609       2.55%     152.18%
WAIT            7134        1.77%     11098       0.52%     55.56%

locktorture     Zhaoxin KH40000 32 cores
kernel              6.6.28                6.6.28-osq-dynamic
lock torture        Average     CV        Average     CV        Improve
mutex_lock Writes   7433503     1.59%     17979058    1.90%     141.87%

unixbench       Zhaoxin KH40000 32+32 cores, run on ssd
64 copies                   6.6.28                6.6.28-osq-dynamic
System Benchmarks Partial   Average     CV        Average     CV        Improve
Execl Throughput            1460.18     1.18%     1865.22     0.25%     27.74%
File Copy 1024 bufsize 200  549.94      0.62%     1221.32     6.71%     122.08%
File Copy 256 bufsize 500   339.62      2.20%     896.58      6.57%     164.00%
File Copy 4096 bufsize 800  1173.68     1.88%     2089.7      5.20%     78.05%
Pipe Throughput             52122.26    0.18%     53842.72    0.15%     3.30%
Pipe-based Context Switchi  18340.38    0.92%     19874.66    0.80%     8.37%
Process Creation            2325.12     0.18%     2178.16     0.21%     -6.32%
Shell Scripts (1 concurren  7414.32     0.29%     8458.5      0.10%     14.08%
Shell Scripts (16 concurre
Shell Scripts (8 concurren  7156.48     0.10%     8132.42     0.14%     13.64%
System Call Overhead        1476.9      0.14%     1574.32     0.09%     6.60%
System Benchmarks Index Sc  2982.64     0.33%     4008.66     0.94%     34.40%

fxmark          Zhaoxin KH40000 32 cores, run on ssd (ssd, ext4)
parallel cores           32                                   24
      6.6.28 vs 6.6.28-osq-dynamic                6.6.28 vs 6.6.28-osq-dynamic
item  Improve  (Average,  CV : Average,  CV)      Improve  (CV : CV)
DWAL  -0.17% ( 455895, 0.14%: 455115, 0.37%)      -0.07% ( 0.42%: 0.44%)
DWOL   1.10% (32166648, 2.64%:32521877, 2.06%)    -0.68% ( 2.54%: 3.04%)
DWOM  51.63% ( 496955, 4.34%: 753509, 8.32%)      45.93% ( 3.14%: 2.57%)
DWSL   1.67% ( 20229, 2.34%: 20566, 3.18%)        -1.74% ( 1.96%: 2.66%)
MWRL  71.00% ( 348097, 0.92%: 595241, 1.26%)      65.95% ( 0.65%: 2.27%)
MWRM  63.06% ( 6750, 3.33%: 11007, 4.31%)         60.18% ( 5.67%: 4.81%)
MWCL  16.99% ( 149628, 1.66%: 175054, 0.82%)      16.96% ( 2.57%: 0.51%)
MWCM  80.97% ( 9448, 4.66%: 17098, 0.96%)         73.79% ( 5.37%: 1.79%)
MWUM  37.73% ( 16858, 3.13%: 23220, 3.42%)        31.16% ( 3.59%: 1.62%)
MWUL  12.83% ( 45275, 3.90%: 51083, 3.25%)        19.94% ( 4.19%: 1.98%)
DWTL  41.44% ( 85255, 5.01%: 120583, 9.83%)       45.07% ( 6.42%: 6.11%)
MRPL  -2.63% (11448731, 1.91%:11147179, 4.18%)    -0.56% ( 1.65%: 3.33%)
MRPM   0.29% ( 5423233, 1.77%: 5438929, 2.59%)   -10.54% (15.85%:16.42%)
MRPH  -0.49% ( 688629, 2.84%: 685266, 2.88%)     -18.99% (15.00%:24.98%)
MRDM   8.42% ( 3662627, 0.76%: 3971133, 0.45%)     4.53% ( 1.77%: 1.72%)
MRDL   6.25% ( 530518, 2.75%: 563671, 5.33%)      12.43% (25.88%:26.91%)
DRBH   7.16% ( 388144, 7.88%: 415933,17.87%)     -20.61% (29.12%:21.91%)
DRBM  -4.34% ( 381710, 5.51%: 365159, 3.15%)     -16.93% (27.15%:29.85%)
DRBL  -0.17% (46227341, 2.50%:46147935, 2.89%)    -4.03% ( 4.01%: 5.30%)

fxmark          Zhaoxin KH40000 32 cores, run on ssd (ssd, ext4)
parallel cores           2                        1
      6.6.28 vs 6.6.28-osq-dynamic     6.6.28 vs 6.6.28-osq-dynamic
item  Improve  (CV : CV)               Improve  (CV : CV)
DWAL   1.78% (0.31%:0.20%)              6.36% (2.52%: 0.67%)
DWOL   2.46% (2.26%:2.53%)              1.83% (2.69%: 3.07%)
DWOM   2.70% (2.58%:3.12%)              2.22% (2.67%: 3.79%)
DWSL   3.28% (2.90%:3.38%)              4.41% (1.32%: 1.36%)
MWRL  -0.76% (1.46%:1.94%)             -0.82% (2.04%: 2.32%)
MWRM   1.94% (4.38%:0.89%)             -2.05% (4.07%: 5.16%)
MWCL  -0.07% (1.36%:3.84%)             -2.17% (1.58%: 3.04%)
MWCM   1.85% (2.95%:4.68%)              0.28% (0.45%: 2.48%)
MWUM  -2.85% (1.48%:2.01%)             -3.06% (1.47%: 1.97%)
MWUL  -1.46% (0.58%:2.27%)             -2.98% (0.71%: 2.11%)
DWTL   0.40% (3.89%:4.35%)             -2.68% (4.04%: 3.15%)
MRPL   3.11% (1.38%:0.35%)             -4.81% (0.32%:16.52%)
MRPM   2.99% (0.29%:1.19%)              3.50% (0.56%: 0.78%)
MRPH   3.01% (1.10%:1.42%)              5.06% (1.18%: 1.73%)
MRDM  -1.67% (4.59%:5.58%)             -3.30% (0.23%: 8.01%)
MRDL   1.94% (1.56%:4.39%)             -0.55% (0.88%: 9.57%)
DRBH   7.24% (7.07%:7.10%)              3.36% (3.30%: 2.95%)
DRBM   4.40% (5.11%:0.74%)             -2.55% (0.46%: 3.28%)
DRBL   5.50% (5.58%:0.30%)             -1.00% (0.71%: 5.21%)

(Some tests show more than a 10% loss, but their CV is also above 10%,
so those results are not stable.)

perf bench      AMD EPYC 7551 32-core, 2 sockets
kernel          6.6.28                6.6.28-osq-dynamic
epoll           Average     CV        Average     CV        Improve
ADD             15258       2.30%     62160       2.40%     307.38%
WAIT            3861        4.20%     6990        16.77%    81.03%

locktorture     AMD EPYC 7551 32-core, 2 sockets
kernel              6.6.28                6.6.28-osq-dynamic
lock torture        Average     CV        Average     CV        Improve
mutex_lock Writes   10435064    3.14%     22627890    4.92%     116.84%

unixbench       AMD EPYC 7551 32-core, 2 sockets, run on ramdisk
64 copies                   6.6.28                6.6.28-osq-dynamic
System Benchmarks Partial   Average     CV        Average     CV        Improve
Execl Throughput            2677.18     0.90%     3451.76     0.22%     28.93%
File Copy 1024 bufsize 200  815.2       0.59%     1999.54     0.36%     145.28%
File Copy 256 bufsize 500   504.6       0.69%     1359.6      0.49%     169.44%
File Copy 4096 bufsize 800  1842.76     1.24%     3236.48     1.40%     75.63%
Pipe Throughput             57748.74    0.01%     57539.6     0.03%     -0.36%
Pipe-based Context Switchi  20882.18    0.57%     20525.38    0.57%     -1.71%
Process Creation            4523.98     0.20%     4784.98     0.10%     5.77%
Shell Scripts (1 concurren  13136.54    0.06%     15883.6     0.35%     20.91%
Shell Scripts (16 concurre
Shell Scripts (8 concurren  12883.82    0.14%     15640.32    0.20%     21.40%
System Call Overhead        3533.74     0.04%     3544.16     0.02%     0.29%
System Benchmarks Index Sc  4809.38     0.23%     6575.44     0.14%     36.72%

fxmark          AMD EPYC 7551 32-core, 2 sockets, run on ramdisk (mem, tmpfs)
parallel cores            64                                    32
      6.6.28 vs 6.6.28-osq-dynamic                   6.6.28 vs 6.6.28-osq-dynamic
item  Improve  (Average,  CV : Average,  CV)         Improve  (CV : CV)
DWAL   -0.22% ( 24091112, 0.31%: 24038426, 0.52%)     -0.26% (0.10%: 0.12%)
DWOL    2.21% ( 86569869, 0.36%: 88479947, 0.27%)      1.99% (0.41%: 0.29%)
DWOM  210.41% (   425986, 0.77%:  1322320, 0.28%)    128.86% (0.59%: 0.46%)
DWSL    1.27% ( 70260252, 0.39%: 71149334, 0.37%)      1.19% (0.31%: 0.22%)
MWRL    0.85% (   489865, 0.22%:   494045, 0.25%)      2.29% (0.12%: 0.33%)
MWRM   96.28% (   149042, 0.45%:   292540, 3.55%)     60.10% (2.49%: 0.38%)
MWCL   -5.44% (   772582, 2.92%:   730585, 0.80%)      0.32% (2.41%: 2.56%)
MWCM   53.89% (   153857, 1.92%:   236774, 0.46%)     23.84% (0.72%: 0.50%)
MWUM   88.20% (   214551, 3.90%:   403790, 0.41%)     62.81% (0.80%: 1.12%)
MWUL   -8.26% (   970810, 1.63%:   890615, 1.63%)     -6.73% (3.01%: 1.61%)
DWTL    5.90% (  5522297, 0.49%:  5847951, 0.18%)      5.03% (0.44%: 0.08%)
MRPL   -1.10% ( 39707577, 0.07%: 39268812, 0.03%)     -1.30% (0.18%: 0.07%)
MRPM   -0.63% ( 16446350, 0.47%: 16341936, 0.40%)      0.45% (0.15%: 0.45%)
MRPH   -0.03% (  3805484, 0.50%:  3804248, 0.12%)      3.02% (1.54%: 0.36%)
MRDM   49.41% ( 20178742, 1.89%: 30148449, 1.01%)     17.58% (1.19%: 0.85%)
MRDL   -1.95% (227253170, 0.48%:222825409, 1.34%)     -1.80% (0.32%: 0.54%)
DRBH    6.01% (  1045587, 1.91%:  1108467, 0.64%)      0.12% (0.13%: 0.30%)
DRBM    0.65% (117702744, 0.31%:118473408, 0.87%)      1.12% (0.25%: 1.18%)
DRBL    0.93% (121770444, 0.42%:122905957, 0.25%)      1.59% (0.31%: 0.40%)

fxmark          AMD EPYC 7551 32-core, 2 sockets, run on ramdisk (mem, tmpfs)
parallel cores           2                        1
      6.6.28 vs 6.6.28-osq-dynamic     6.6.28 vs 6.6.28-osq-dynamic
item  Improve  (CV : CV)               Improve  (CV : CV)
DWAL  -0.74% (0.33%: 0.19%)            -1.02% (0.19%: 0.34%)
DWOL   1.50% (0.36%: 0.44%)             1.89% (0.30%: 0.36%)
DWOM  -2.00% (0.73%: 0.38%)             2.43% (0.35%: 0.29%)
DWSL   1.03% (0.34%: 0.54%)             1.18% (0.46%: 0.61%)
MWRL   0.93% (0.39%: 0.18%)             2.25% (1.28%: 1.78%)
MWRM  -0.30% (0.60%: 0.47%)             0.17% (0.58%: 0.47%)
MWCL  -1.28% (0.41%: 0.66%)            -0.38% (0.19%: 0.44%)
MWCM  -1.23% (0.36%: 0.23%)            -1.42% (0.41%: 0.54%)
MWUM  -2.28% (0.57%: 0.75%)            -1.11% (0.82%: 0.21%)
MWUL  -1.87% (0.64%: 0.50%)            -1.75% (0.58%: 0.65%)
DWTL   0.36% (0.09%: 0.12%)             0.19% (0.09%: 0.09%)
MRPL  -1.45% (0.37%: 0.31%)            -1.35% (0.12%: 0.54%)
MRPM  -0.58% (0.30%: 0.11%)            -1.04% (0.18%: 0.31%)
MRPH   0.79% (3.92%: 0.48%)            -0.53% (0.68%: 0.33%)
MRDM  -0.55% (0.93%: 0.44%)            -0.13% (0.43%: 0.67%)
MRDL  -0.11% (0.56%: 0.19%)             0.68% (0.71%: 0.49%)
DRBH   0.09% (1.31%: 0.87%)             2.75% (0.68%: 0.45%)
DRBM   1.09% (0.19%: 1.05%)             1.60% (0.15%: 0.72%)
DRBL   3.26% (1.00%: 0.56%)             2.34% (0.36%: 0.23%)

From these results, under heavy contention the dynamic numa-aware lock
performs better than the in-box osq_lock. When not too many processes
contend for a lock, the performance is nearly the same as the in-box
osq_lock.

yongli-oc (4):
  locking/osq_lock: The Kconfig for dynamic numa-aware osq lock.
  locking/osq_lock: Turn to dynamic switch function from osq_lock/unlock.
  locking/osq_lock: From x_osq_lock/unlock to numa-aware lock/unlock.
  locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.

 include/linux/osq_lock.h     |  33 ++-
 kernel/Kconfig.numalocks     |  17 ++
 kernel/locking/Makefile      |   1 +
 kernel/locking/numa.h        |  98 +++++++
 kernel/locking/numa_osq.h    |  32 +++
 kernel/locking/osq_lock.c    |  62 +++-
 kernel/locking/x_osq_lock.c  | 332 ++++++++++++++++++++++
 kernel/locking/zx_numa.c     | 537 +++++++++++++++++++++++++++++++++++
 kernel/locking/zx_numa_osq.c | 433 ++++++++++++++++++++++++++++
 lib/Kconfig.debug            |   2 +
 10 files changed, 1543 insertions(+), 4 deletions(-)
 create mode 100644 kernel/Kconfig.numalocks
 create mode 100644 kernel/locking/numa.h
 create mode 100644 kernel/locking/numa_osq.h
 create mode 100644 kernel/locking/x_osq_lock.c
 create mode 100644 kernel/locking/zx_numa.c
 create mode 100644 kernel/locking/zx_numa_osq.c

--
2.34.1
This series changes the osq lock tail to 2 bytes when
CONFIG_LOCK_SPIN_ON_OWNER=y, fixes some coding problems, and adds more
comments. Since the 2-byte and the 4-byte osq lock both access the same
cache line with LOCK# asserted, the speed should be essentially the same.

To compare the performance of the two kinds of osq lock, I used
locktorture and set the CPU affinity of each mutex_lock writer kthread.
Each result is the average of 9 test runs.

locktorture - SET CPU AFFINITY    AMD EPYC 7551 32-core, 2 sockets
Writers   6.6.28                6.6.28-osq-dynamic disable         6.6.28-osq-dynamic enable
stress    4-byte                2-byte tail                        2-byte tail
          Average     CV        Average     CV       Improve       Average      CV      Improve
1         21047265    3.48%     21331993    12.16%   1.35%         21359519     6.76%   1.48%
2         39186677    5.66%     40348197    6.44%    2.96%         39387961     4.18%   0.51%
4         43467264    3.63%     44133849    4.95%    1.53%         38961218     7.01%   -10.37%
8         43780445    6.67%     48887433    3.31%    11.66%        41725007     5.29%   -4.69%
16        41407176    4.19%     51042178    3.45%    23.27%        71381112     2.75%   72.39%
32        46000746    6.63%     50060246    14.19%   8.82%         79361487     2.29%   72.52%
48        44235011    5.20%     44988160    7.22%    1.70%         79779501     4.88%   80.35%
64        59054128    4.02%     62233006    2.00%    5.38%         112695286    7.42%   90.83%

With 1, 2 or 4 writer threads, the 2-byte osq lock performs nearly the
same as the 4-byte lock. With 8, 16 or 32 threads it performs better
than the 4-byte lock, and with more threads the performance tends to
converge again. With dynamic switching turned on, the 2-byte lock shows
a small degradation at 4 and 8 threads, probably related to the
accumulated contention detection while the switching conditions are
never satisfied. With 16 threads or more, the improvement approaches
80 percent.
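As a side note on the 2-byte tail, here is a small user-space sketch,
not from the patch, of taking the 16-bit tail inside the 4-byte word;
it uses GCC's __atomic builtins instead of the kernel's cmpxchg(), and
the type and function names are invented:

#include <stdbool.h>
#include <stdint.h>

/* Same little-endian split as the union in include/linux/osq_lock.h. */
union osq_tail_sketch {
	uint32_t val;
	struct {
		uint16_t tail16;	/* queue tail: CPU # + 1 */
		uint8_t  index;		/* numa lock memory index */
		uint8_t  numa_enable;	/* switch state */
	};
};

/*
 * Try to replace the expected tail with ourselves. A 16-bit locked
 * cmpxchg still owns the whole cache line, so the cost is essentially
 * the same as for the 4-byte in-box osq lock.
 */
static bool osq_take_tail16(union osq_tail_sketch *lock,
			    uint16_t expected, uint16_t curr)
{
	return __atomic_compare_exchange_n(&lock->tail16, &expected, curr,
					   false, __ATOMIC_ACQ_REL,
					   __ATOMIC_ACQUIRE);
}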
---
Changes since v1 (based on Longman's reviews):
#1 Changed how the series is split into patches, compared to v1.
#2 Modified some code, such as definitions, special values in macros,
   and cpu_relax() usage.
#3 Added some comments.

yongli-oc (3):
  locking/osq_lock: The Kconfig for dynamic numa-aware osq lock.
  locking/osq_lock: Define osq by union to support dynamic numa-aware lock.
  locking/osq_lock: Turn from 2-byte osq_lock/unlock to numa lock/unlock.

 include/linux/osq_lock.h     |  33 ++-
 kernel/Kconfig.numalocks     |  17 ++
 kernel/locking/Makefile      |   3 +
 kernel/locking/numa.h        |  90 ++++++
 kernel/locking/numa_osq.h    |  29 ++
 kernel/locking/x_osq_lock.c  | 371 ++++++++++++++++++++++++
 kernel/locking/zx_numa.c     | 540 +++++++++++++++++++++++++++++++++++
 kernel/locking/zx_numa_osq.c | 497 ++++++++++++++++++++++++++++++++
 lib/Kconfig.debug            |   1 +
 9 files changed, 1580 insertions(+), 1 deletion(-)
 create mode 100644 kernel/Kconfig.numalocks
 create mode 100644 kernel/locking/numa.h
 create mode 100644 kernel/locking/numa_osq.h
 create mode 100644 kernel/locking/x_osq_lock.c
 create mode 100644 kernel/locking/zx_numa.c
 create mode 100644 kernel/locking/zx_numa_osq.c

--
2.34.1
Add a Kconfig menu to choose whether the dynamic numa-aware osq lock
is compiled in. The default is N.
Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
kernel/Kconfig.numalocks | 17 +++++++++++++++++
lib/Kconfig.debug | 1 +
2 files changed, 18 insertions(+)
create mode 100644 kernel/Kconfig.numalocks
diff --git a/kernel/Kconfig.numalocks b/kernel/Kconfig.numalocks
new file mode 100644
index 000000000000..feb1751b637c
--- /dev/null
+++ b/kernel/Kconfig.numalocks
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "NUMA Lock Supporting (OSQ)"
+
+config LOCK_SPIN_ON_OWNER_NUMA
+ bool "Enable dynamic numa-aware osq lock"
+ depends on LOCK_SPIN_ON_OWNER && X86_64
+ default n
+ help
+ According to the CPU NUMA architecture, the numa-aware lock
+ always hands the lock to a waiter on the same NUMA node first,
+ which is different from the kernel's in-box osq_lock. The
+ dynamic numa-aware osq lock switches between osq_lock and the
+ numa-aware lock automatically, based on the contention level.
+ Enable: echo 1 > /proc/zx_numa_lock/dynamic_enable.
+
+endmenu # NUMA Lock Supporting
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 27539c2626bf..cf9344eb61a4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1337,6 +1337,7 @@ config DEBUG_PREEMPT
depending on workload as it triggers debugging routines for each
this_cpu operation. It should only be used for debugging purposes.
+source "kernel/Kconfig.numalocks"
menu "Lock Debugging (spinlocks, mutexes, etc...)"
config LOCK_DEBUGGING_SUPPORT
--
2.34.1
To support the numa-aware osq lock, struct optimistic_spin_queue
is accessed as three members, numa_enable, index and tail16,
through a union. The size of the struct is the same as before.
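For illustration only, a small user-space sketch (not part of the
patch) of the little-endian layout this union implies; the field names
follow the patch, while the union name and the test values are
arbitrary:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the little-endian variant of the union in osq_lock.h. */
union osq_queue_sketch {
	uint32_t val;			/* the whole atomic_t tail word */
	struct {
		uint16_t tail16;	/* encoded CPU # of the tail node */
		uint8_t  index;		/* numa lock memory index */
		uint8_t  numa_enable;	/* lock switch state */
	};
};

int main(void)
{
	union osq_queue_sketch q = { .val = 0 };

	assert(sizeof(q) == 4);		/* same size as the in-box osq lock */

	q.tail16 = 5;			/* CPU 4 queued as tail (cpu_nr + 1) */
	q.index = 2;			/* third pre-allocated numa lock entry */
	q.numa_enable = 0xff;		/* NUMALOCKDYNAMIC in the later patches */

	printf("val = 0x%08x\n", q.val);	/* prints 0xff020005 */
	return 0;
}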
Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
include/linux/osq_lock.h | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index ea8fb31379e3..37a7bc4ab530 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -12,14 +12,42 @@ struct optimistic_spin_queue {
* Stores an encoded value of the CPU # of the tail node in the queue.
* If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
*/
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER_NUMA
+ union {
+ atomic_t tail;
+ u32 val;
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u16 tail16;
+ u8 index;
+ u8 numa_enable;
+ };
+#else
+ struct {
+ u8 numa_enable;
+ u8 index;
+ u16 tail16;
+ };
+#endif
+ };
+#else
atomic_t tail;
+#endif
};
#define OSQ_UNLOCKED_VAL (0)
/* Init macro and function. */
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER_NUMA
+
+#define OSQ_LOCK_UNLOCKED { .tail = ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+#else
+
#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+#endif
+
static inline void osq_lock_init(struct optimistic_spin_queue *lock)
{
atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
@@ -28,9 +56,12 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
extern bool osq_lock(struct optimistic_spin_queue *lock);
extern void osq_unlock(struct optimistic_spin_queue *lock);
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER_NUMA
+extern bool osq_is_locked(struct optimistic_spin_queue *lock);
+#else
static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
{
return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
}
-
+#endif
#endif
--
2.34.1
Switch from osq_lock to the numa-aware osq lock according to the
contention level.
numa.h holds the definitions for the numa-aware lock.
numa_osq.h holds the definitions for the osq-to-numa switching.
x_osq_lock.c implements the crossover from the 2-byte osq lock to
the numa-aware lock.
zx_numa_osq.c implements the numa-aware lock, a two-level osq lock.
zx_numa.c prepares the kernel memory cache for the numa-aware lock,
runs a workqueue that turns an idle numa-aware lock back into an
osq lock, and provides the /proc interface. Dynamic switching is
enabled by:
echo 1 > /proc/zx_numa_lock/dynamic_enable
The Makefile is extended for the dynamic numa-aware osq lock.
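As a reading aid, a sketch (not part of the patch) of the states stored
in optimistic_spin_queue.numa_enable and their transitions; the enum
and constant names here are invented, while the hex values and the
macro names in the comments come from numa.h and numa_osq.h:

/* States stored in optimistic_spin_queue.numa_enable. */
enum zx_osq_state_sketch {
	/* Plain 2-byte osq lock. */
	ZX_STATE_INITED   = 0x00,	/* OSQLOCKINITED */
	/*
	 * Set once an unlocker releases at the queue tail while dynamic
	 * switching is enabled: unlockers now compare their node serial
	 * with the tail's serial to measure the queue depth.
	 */
	ZX_STATE_DETECT   = 0x10,	/* OSQTONUMADETECT */
	/*
	 * The depth exceeded osq_lock_depth for more than osq_keep_times
	 * unlocks: a numa lock memory entry is reserved and new waiters
	 * spin in zx_osq_turn_numa_waiting() until it is ready.
	 */
	ZX_STATE_STOPPING = 0xfc,	/* OSQLOCKSTOPPING */
	/*
	 * tail16 has been frozen to OSQ_LOCKED_VAL and the lock runs as
	 * the two-level numa-aware lock; the cleanup workqueue reclaims
	 * the numa lock memory once all nodes stay idle, and the lock
	 * falls back to the plain osq path.
	 */
	ZX_STATE_NUMA     = 0xff,	/* NUMALOCKDYNAMIC */
};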
Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
kernel/locking/Makefile | 3 +
kernel/locking/numa.h | 90 ++++++
kernel/locking/numa_osq.h | 29 ++
kernel/locking/x_osq_lock.c | 371 ++++++++++++++++++++++++
kernel/locking/zx_numa.c | 540 +++++++++++++++++++++++++++++++++++
kernel/locking/zx_numa_osq.c | 497 ++++++++++++++++++++++++++++++++
6 files changed, 1530 insertions(+)
create mode 100644 kernel/locking/numa.h
create mode 100644 kernel/locking/numa_osq.h
create mode 100644 kernel/locking/x_osq_lock.c
create mode 100644 kernel/locking/zx_numa.c
create mode 100644 kernel/locking/zx_numa_osq.c
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 0db4093d17b8..297bfad88fda 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -21,7 +21,10 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER_NUMA) += x_osq_lock.o zx_numa_osq.o zx_numa.o
+else
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
+endif
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
diff --git a/kernel/locking/numa.h b/kernel/locking/numa.h
new file mode 100644
index 000000000000..790c27ed18e5
--- /dev/null
+++ b/kernel/locking/numa.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NUMA_LOCK_H
+#define __LINUX_NUMA_LOCK_H
+#include <linux/cache.h>
+#include "mcs_spinlock.h"
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+ u32 serial;
+};
+
+
+struct _numa_buf {
+ void *numa_ptr;
+ struct list_head list;
+ u32 lockaddr;
+ u32 highaddr;
+ u8 idle;
+ u8 type;
+ u16 index;
+};
+
+struct _numa_lock {
+ atomic_t tail ____cacheline_aligned_in_smp;
+ atomic_t addr;
+ u8 shift;
+ u8 stopping;
+ u16 numa_nodes;
+ u32 accessed;
+ uint64_t totalaccessed;
+ u32 nodeswitched;
+ atomic_t initlock;
+ atomic_t pending;
+ union {
+ struct mcs_spinlock mcs_node;
+ struct optimistic_spin_node osq_node;
+ };
+ CACHELINE_PADDING(pad);
+};
+
+struct numa_cpu_info {
+ __u8 x86_model;
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ u32 feature1;
+};
+
+#define NUMAEXPAND 1
+
+#define COHORT_START 1
+#define ACQUIRE_NUMALOCK (UINT_MAX-1)
+#define NODE_WAIT UINT_MAX
+#define LOCK_NUMALOCK 1
+#define UNLOCK_NUMALOCK 0
+
+#define NUMALOCKDYNAMIC 0xff
+#define TURNTONUMAREADY 0xa
+
+#define NUMA_LOCKED_VAL 0xffffff
+#define NUMA_UNLOCKED_VAL 0
+
+#define HIGH32BITMASK 0xffffffff00000000
+#define LOW32MASK 0xffffffff
+
+extern int numa_shift;
+extern int numa_clusters;
+extern int zx_numa_lock_total;
+extern struct _numa_buf *zx_numa_entry;
+extern atomic_t numa_count;
+extern int enable_zx_numa_osq_lock;
+extern u32 zx_numa_lock;
+extern int dynamic_enable;
+extern struct kmem_cache *zx_numa_lock_cachep;
+
+static inline u32 ptrmask(void *s)
+{
+ return (uint64_t)s & LOW32MASK;
+}
+inline void *get_numa_lock(int index);
+
+int zx_check_numa_dynamic_locked(u32 lockaddr, struct _numa_lock *_numa_lock);
+int zx_numa_lock_ptr_get(void *p);
+void numa_lock_init_data(struct _numa_lock *s, int clusters, u32 lockval,
+ u32 lockaddr);
+#endif
diff --git a/kernel/locking/numa_osq.h b/kernel/locking/numa_osq.h
new file mode 100644
index 000000000000..5c4675abc4fc
--- /dev/null
+++ b/kernel/locking/numa_osq.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NUMA_OSQ_H
+#define __LINUX_NUMA_OSQ_H
+
+#include <linux/osq_lock.h>
+#include "mcs_spinlock.h"
+
+#define OSQLOCKINITED 0
+#define OSQTONUMADETECT 0x10
+#define OSQLOCKSTOPPING 0xfc
+#define OSQ_LOCKED_VAL 0xffff
+
+extern u16 osq_keep_times;
+extern u16 osq_lock_depth;
+extern int osq_node_max;
+
+inline int encode_cpu(int cpu_nr);
+inline int node_cpu(struct optimistic_spin_node *node);
+inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val);
+
+void zx_osq_lock_stopping(struct optimistic_spin_queue *lock);
+void zx_osq_numa_start(struct optimistic_spin_queue *lock);
+void zx_osq_turn_numa_waiting(struct optimistic_spin_queue *lock);
+
+inline void zx_numa_osq_unlock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *n);
+inline bool zx_numa_osq_lock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *n);
+#endif
diff --git a/kernel/locking/x_osq_lock.c b/kernel/locking/x_osq_lock.c
new file mode 100644
index 000000000000..82cde1f6355b
--- /dev/null
+++ b/kernel/locking/x_osq_lock.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * crossing from osq_lock to numa-aware lock
+ */
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/osq_lock.h>
+#include "numa.h"
+#include "numa_osq.h"
+
+u16 osq_lock_depth = 8;
+u16 osq_keep_times = 32;
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+inline int node_cpu(struct optimistic_spin_node *node)
+{
+ return node->cpu - 1;
+}
+
+inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
+ * If the osq is in OSQLOCKSTOPPING state,
+ * the tail will be set to OSQ_LOCKED_VAL to stop the queue.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next_stop(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ int old_cpu)
+{
+ u16 curr = encode_cpu(smp_processor_id());
+ u16 old = old_cpu;
+
+ if (lock->numa_enable == OSQLOCKSTOPPING && old == OSQ_UNLOCKED_VAL)
+ old = OSQ_LOCKED_VAL;
+
+ for (;;) {
+ if (READ_ONCE(lock->tail16) == curr &&
+ cmpxchg(&lock->tail16, curr, old) == curr) {
+
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ return NULL;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ struct optimistic_spin_node *next;
+
+ next = xchg(&node->next, NULL);
+ if (next)
+ return next;
+ }
+
+ cpu_relax();
+ }
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int cpu = smp_processor_id();
+ u16 curr = encode_cpu(cpu);
+ struct optimistic_spin_queue tail;
+ u16 old;
+
+ tail.val = READ_ONCE(lock->val);
+ if (unlikely(tail.numa_enable == OSQLOCKSTOPPING)) {
+ zx_osq_turn_numa_waiting(lock);
+ return osq_lock(lock);
+ }
+
+ if (unlikely(tail.numa_enable == NUMALOCKDYNAMIC)) {
+ struct _numa_lock *_numa_lock = NULL;
+ struct _numa_lock *node_lock = NULL;
+
+ _numa_lock = get_numa_lock(tail.index);
+ node_lock = (struct _numa_lock *) _numa_lock +
+ (cpu >> numa_shift);
+
+ prefetch(node_lock);
+ return zx_numa_osq_lock(lock, _numa_lock);
+ }
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+ node->serial = 0;
+
+ /*
+ * If ss.tail16 is OSQ_LOCKED_VAL, keep the LOCKED state and wait
+ * until the numa-aware lock is ready.
+ */
+
+ if (likely(tail.numa_enable >= OSQTONUMADETECT)) {
+ struct optimistic_spin_queue ss;
+
+ while (1) {
+ ss.val = atomic_read(&lock->tail);
+ if (ss.tail16 == OSQ_LOCKED_VAL) {
+ zx_osq_turn_numa_waiting(lock);
+ return osq_lock(lock);
+ }
+ if (cmpxchg(&lock->tail16, ss.tail16, curr)
+ == ss.tail16) {
+ old = ss.tail16;
+ break;
+ }
+ cpu_relax();
+ }
+ } else
+ old = xchg(&lock->tail16, curr);
+
+ if (old == OSQ_UNLOCKED_VAL) {
+ node->serial = 1;
+ return true;
+ }
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+
+ // Record osq serial number for the lock.
+ node->serial = prev->serial + 1;
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(prev->next, node);
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
+
+ /* unqueue */
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becoming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ cpu_relax();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = READ_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next_stop(lock, node, prev->cpu);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return false;
+}
+
+/*
+ * In osq_unlock(), change the osq state for switching, then unlock the next waiter.
+ * OSQTONUMADETECT: start to detect lock contention once the osq tail is first
+ * reached after dynamic switching is enabled.
+ * OSQLOCKSTOPPING: if the queue depth stays above osq_lock_depth for more than
+ * osq_keep_times unlocks, set the osq to the STOPPING state and reserve a
+ * numa-aware lock memory entry.
+ * NUMALOCKDYNAMIC: the numa-aware lock is initialized and ready.
+ */
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int threshold = osq_lock_depth;
+ int cpu = smp_processor_id();
+ u16 curr = encode_cpu(cpu);
+ int depth = 0;
+ u32 count = 0;
+
+ if (unlikely(lock->numa_enable == NUMALOCKDYNAMIC)) {
+ struct _numa_lock *_numa_lock = get_numa_lock(lock->index);
+
+ prefetch((struct _numa_lock *) _numa_lock + (cpu >> numa_shift));
+ return zx_numa_osq_unlock(lock, _numa_lock);
+ }
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (unlikely(lock->numa_enable == OSQTONUMADETECT)) {
+ struct optimistic_spin_node *node_last = NULL;
+ u16 tail = 0;
+
+ tail = cmpxchg(&lock->tail16, curr, OSQ_UNLOCKED_VAL);
+ if (tail == curr)
+ return;
+
+ node = this_cpu_ptr(&osq_node);
+ node_last = decode_cpu(tail);
+ //Get the contention level
+ depth = node_last->serial - node->serial;
+ count = READ_ONCE(node->locked);
+ if (count > osq_keep_times && (dynamic_enable & 0x1))
+ zx_osq_lock_stopping(lock);
+ } else if (unlikely(lock->numa_enable == OSQLOCKSTOPPING)) {
+ if (cmpxchg(&lock->tail16, curr, OSQ_LOCKED_VAL)
+ == curr) {
+ //All osq stopped, start to run as numa-aware lock.
+ zx_osq_numa_start(lock);
+ return;
+ }
+ } else {
+ struct optimistic_spin_queue t;
+
+ /*
+ * After dynamic switching is enabled, when the unlocker reaches the osq
+ * tail, set the osq to DETECT mode to measure the contention.
+ */
+ t.val = 0;
+ if (dynamic_enable & 0x1) {
+ if (atomic_read(&numa_count) < zx_numa_lock_total)
+ t.numa_enable = OSQTONUMADETECT;
+ }
+ if (t.numa_enable == OSQTONUMADETECT) {
+ if (atomic_cmpxchg_release(&lock->tail, curr,
+ (t.val | OSQ_UNLOCKED_VAL)) == curr)
+ return;
+ } else if (cmpxchg(&lock->tail16, curr,
+ OSQ_UNLOCKED_VAL) == curr)
+ return;
+ }
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ if (depth > threshold)
+ WRITE_ONCE(next->locked, count + 1);
+ else
+ WRITE_ONCE(next->locked, 1);
+ return;
+ }
+
+ next = osq_wait_next_stop(lock, node, OSQ_UNLOCKED_VAL);
+ if (next) {
+ if (depth > threshold)
+ WRITE_ONCE(next->locked, count + 1);
+ else
+ WRITE_ONCE(next->locked, 1);
+ }
+}
+
+bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_queue val;
+
+ val.val = atomic_read(&lock->tail);
+ if (val.tail16 == OSQ_UNLOCKED_VAL)
+ return false;
+
+ if (val.tail16 == OSQ_LOCKED_VAL) {
+
+ //state changing
+ if (val.numa_enable != NUMALOCKDYNAMIC)
+ return true;
+
+ return zx_check_numa_dynamic_locked(ptrmask(lock),
+ get_numa_lock(val.index));
+ }
+
+ return true;
+}
diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
new file mode 100644
index 000000000000..31d247261474
--- /dev/null
+++ b/kernel/locking/zx_numa.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Crossing from numa-aware lock to osq_lock
+ * Numa lock memory initialize and /proc interface
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_para.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/reboot.h>
+
+#include "numa.h"
+#include "numa_osq.h"
+
+int enable_zx_numa_osq_lock;
+struct delayed_work zx_numa_start_work;
+struct delayed_work zx_numa_cleanup_work;
+
+atomic_t numa_count;
+struct _numa_buf *zx_numa_entry;
+int zx_numa_lock_total = 256;
+LIST_HEAD(_zx_numa_head);
+LIST_HEAD(_zx_numa_lock_head);
+
+struct kmem_cache *zx_numa_entry_cachep;
+struct kmem_cache *zx_numa_lock_cachep;
+int numa_shift;
+int numa_clusters;
+static atomic_t lockindex;
+int dynamic_enable;
+
+static const struct numa_cpu_info numa_cpu_list[] = {
+ /*feature1=1, a numa node includes two clusters*/
+ //{1, 23, X86_VENDOR_AMD, 0, 1},
+ {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
+ {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
+};
+
+inline void *get_numa_lock(int index)
+{
+ if (index >= 0 && index < zx_numa_lock_total)
+ return zx_numa_entry[index].numa_ptr;
+ else
+ return NULL;
+}
+
+static int zx_get_numa_shift(int all_cpus, int clusters)
+{
+ int cpus = (int) all_cpus/clusters;
+ int count = 0;
+
+ while (cpus) {
+ cpus >>= 1;
+ count++;
+ }
+ return count-1;
+}
+
+void numa_lock_init_data(struct _numa_lock *s, int clusters,
+ u32 lockval, u32 lockaddr)
+{
+ int j = 0;
+
+ for (j = 0; j < clusters + NUMAEXPAND; j++) {
+ atomic_set(&(s + j)->tail, lockval);
+ atomic_set(&(s + j)->addr, lockaddr);
+ (s + j)->shift = numa_shift;
+ (s + j)->stopping = 0;
+ (s + j)->numa_nodes = clusters;
+ (s + j)->accessed = 0;
+ (s + j)->totalaccessed = 0;
+ (s + j)->nodeswitched = 0;
+ atomic_set(&(s + j)->initlock, 0);
+ atomic_set(&(s + j)->pending, 0);
+ }
+}
+/*
+ * The lockaddr of zx_numa_entry is the key used to tell which index is occupied.
+ */
+int zx_numa_lock_ptr_get(void *p)
+{
+ int i = 0;
+ int index = 0;
+
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ return zx_numa_lock_total;
+
+ index = atomic_inc_return(&lockindex);
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ if (index >= zx_numa_lock_total)
+ index = 0;
+ if (cmpxchg(&zx_numa_entry[index].lockaddr,
+ 0, ptrmask(p)) == 0) {
+ while (1) {
+ struct _numa_lock *node_lock =
+ zx_numa_entry[index].numa_ptr;
+ struct _numa_lock *numa_lock = node_lock +
+ node_lock->numa_nodes;
+
+ if (atomic_read(&numa_lock->tail) ==
+ NUMA_LOCKED_VAL)
+ break;
+ cpu_relax();
+
+ }
+ atomic_inc(&numa_count);
+ zx_numa_entry[index].highaddr = ((u64)p) >> 32;
+ atomic_set(&lockindex, index);
+ return index;
+ }
+ index++;
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ break;
+ }
+ return zx_numa_lock_total;
+}
+
+int zx_check_numa_dynamic_locked(u32 lockaddr,
+ struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 s = -1;
+ int i = 0;
+
+ //in switching if the pending is not 0
+ if (atomic_read(&_numa_lock->pending) != 0)
+ return 1;
+
+ for (i = 0; i < _numa_lock->numa_nodes + 1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax();
+ s = atomic64_read((atomic64_t *) &node_lock->tail);
+ if ((s >> 32) != lockaddr)
+ continue;
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL
+ || (s & LOW32MASK) == NUMA_UNLOCKED_VAL)
+ continue;
+ break;
+ }
+
+ if (i == _numa_lock->numa_nodes + 1)
+ return 0;
+ return i+1;
+}
+
+static int zx_numa_lock64_try_to_freeze(u32 lockaddr, struct _numa_lock *_numa_lock,
+ int index)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 addr = ((u64)lockaddr) << 32;
+ u64 s = 0;
+ u64 ff = 0;
+ int i = 0;
+
+ //Check and set the tail to LOCKED from the first node to the last node;
+ //if all node tails are LOCKED, try to set the NUMA node tail to LOCKED.
+ for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax();
+
+ s = atomic64_read((atomic64_t *)&node_lock->tail);
+ if ((s & HIGH32BITMASK) != addr)
+ continue;
+
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+ continue;
+
+ if ((s & LOW32MASK) == NUMA_UNLOCKED_VAL) {
+ ff = atomic64_cmpxchg((atomic64_t *)&node_lock->tail,
+ (addr|NUMA_UNLOCKED_VAL), NUMA_LOCKED_VAL);
+ if (ff == (addr|NUMA_UNLOCKED_VAL))
+ continue;
+ }
+ break;
+ }
+ //All node tails and the numa node tail are LOCKED; mark the numa lock
+ //memory referenced by index as unoccupied.
+ if (i == _numa_lock->numa_nodes + 1) {
+ zx_numa_entry[index].idle = 0;
+ zx_numa_entry[index].type = 0;
+ zx_numa_entry[index].highaddr = 0;
+ xchg(&zx_numa_entry[index].lockaddr, 0);
+ }
+
+ return i;
+}
+
+static void zx_numa_lock_stopping(struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *node_lock = NULL;
+ int i = 0;
+
+ for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+ node_lock = _numa_lock + i;
+ WRITE_ONCE(node_lock->stopping, 1);
+ }
+}
+
+static void zx_numa_cleanup(struct work_struct *work)
+{
+ int i = 0;
+ int checktimes = 2;
+
+ //reboot or power off state
+ if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
+ return;
+
+ //If the numa-aware lock count is 0, just reschedule the workqueue
+ //while dynamic switching stays enabled.
+ if (atomic_read(&numa_count) == 0) {
+ if (READ_ONCE(dynamic_enable) != 0)
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+ return;
+ }
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ int s = 0;
+ u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
+ u32 type = zx_numa_entry[i].type;
+ struct _numa_lock *buf = zx_numa_entry[i].numa_ptr;
+ int nodes = 0;
+
+ if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
+ continue;
+ nodes = buf->numa_nodes;
+ if (zx_numa_entry[i].idle < checktimes) {
+ //check if all nodes are idle
+ s = zx_check_numa_dynamic_locked(lockaddr, buf);
+ if (s != 0) {
+ zx_numa_entry[i].idle = 1;
+ continue;
+ }
+ zx_numa_entry[i].idle++;
+ }
+
+ if (zx_numa_entry[i].idle == checktimes) {
+ //set each node to stopping mode
+ zx_numa_lock_stopping(buf);
+ zx_numa_entry[i].idle++;
+
+ }
+
+ if (zx_numa_entry[i].idle == checktimes+1) {
+ while (1) {
+ //try to freeze all nodes
+ if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
+ i) == nodes + 1) {
+ //all nodes have been locked
+ atomic_dec(&numa_count);
+ break;
+ }
+ cpu_relax();
+ }
+ }
+ }
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+}
+
+static int create_numa_buffer_list(int clusters, int len)
+{
+ int i = 0;
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ struct _numa_lock *s = (struct _numa_lock *)kmem_cache_alloc(
+ zx_numa_lock_cachep, GFP_KERNEL);
+ if (!s) {
+ while (i > 0) {
+ kmem_cache_free(zx_numa_lock_cachep,
+ zx_numa_entry[i-1].numa_ptr);
+ i--;
+ }
+ return 0;
+ }
+ memset((char *)s, 0,
+ len * L1_CACHE_BYTES * (clusters + NUMAEXPAND));
+ numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
+ zx_numa_entry[i].numa_ptr = s;
+ zx_numa_entry[i].lockaddr = 0;
+ zx_numa_entry[i].highaddr = 0;
+ zx_numa_entry[i].idle = 0;
+ zx_numa_entry[i].type = 0;
+ }
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ zx_numa_entry[i].index = i;
+ list_add_tail(&(zx_numa_entry[i].list), &_zx_numa_lock_head);
+ }
+ return 1;
+}
+
+static int zx_numa_lock_init(int numa)
+{
+ int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+ int d = 0;
+ int status = 0;
+
+ atomic_set(&lockindex, 0);
+ atomic_set(&numa_count, 0);
+
+ if (sizeof(struct _numa_lock) & 0x3f)
+ d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES) /
+ L1_CACHE_BYTES);
+ else
+ d = (int)(sizeof(struct _numa_lock) / L1_CACHE_BYTES);
+
+ zx_numa_entry_cachep = kmem_cache_create(
+ "zx_numa_entry",
+ sizeof(struct _numa_buf) * zx_numa_lock_total, align,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+ zx_numa_lock_cachep = kmem_cache_create(
+ "zx_numa_lock",
+ d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+
+ if (zx_numa_entry_cachep && zx_numa_lock_cachep) {
+ zx_numa_entry = (struct _numa_buf *)kmem_cache_alloc(
+ zx_numa_entry_cachep, GFP_KERNEL);
+ if (zx_numa_entry) {
+ memset((char *)zx_numa_entry, 0,
+ sizeof(struct _numa_buf) * zx_numa_lock_total);
+ create_numa_buffer_list(numa, d);
+ status = 1;
+ }
+ }
+
+ pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
+ numa);
+ return status;
+}
+
+
+#define numa_lock_proc_dir "zx_numa_lock"
+#define numa_entry_total 8
+struct proc_dir_entry *numa_lock_proc;
+struct proc_dir_entry *numa_lock_enable;
+struct proc_dir_entry *numa_proc_entry[numa_entry_total];
+
+static ssize_t numa_lock_proc_read(struct file *file,
+ char __user *usrbuf, size_t len, loff_t *off)
+{
+ int id = (long) pde_data(file_inode(file));
+ char kbuffer[128];
+ ssize_t retval = 0;
+ size_t n = 0;
+
+ memset(kbuffer, 0, sizeof(kbuffer));
+ if (id == 0)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(dynamic_enable));
+ else if (id == 1)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_lock_depth));
+ else if (id == 2)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_keep_times));
+ else if (id == 3)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_node_max));
+ else if (id == 4)
+ n = sprintf(kbuffer, "%d\n", atomic_read(&numa_count));
+ retval = simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
+
+ return retval;
+}
+
+static ssize_t numa_lock_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *f_pos)
+{
+ int id = (long) pde_data(file_inode(file));
+ char kbuffer[128];
+ unsigned long new = 0;
+ int err = 0;
+
+ memset(kbuffer, 0, sizeof(kbuffer));
+ if (count >= sizeof(kbuffer) || copy_from_user(kbuffer, buffer, count))
+ return count;
+ kbuffer[count] = '\0';
+ err = kstrtoul(kbuffer, 10, &new);
+ if (err != 0)
+ return count;
+
+ if (id == 0) {
+ int last = READ_ONCE(dynamic_enable);
+
+ if (new < 0 || new >= 2 || last == new)
+ return count;
+
+ if (last == 0)
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+ prefetchw(&dynamic_enable);
+ WRITE_ONCE(dynamic_enable, new);
+ return count;
+ }
+
+ if (READ_ONCE(dynamic_enable) != 0) {
+ pr_info("dynamic %d: change setting should disable dynamic\n",
+ dynamic_enable);
+ return count;
+ }
+ if (id == 1 && new > 4 && new <= 32)
+ WRITE_ONCE(osq_lock_depth, new);
+ else if (id == 2 && new >= 16 && new <= 2048)
+ WRITE_ONCE(osq_keep_times, new);
+ else if (id == 3 && new > 4 && new <= 2048)
+ WRITE_ONCE(osq_node_max, new);
+ return count;
+}
+static int numa_lock_proc_show(struct seq_file *m, void *v)
+{
+ return 0;
+}
+
+static int numa_lock_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, numa_lock_proc_show, NULL);
+}
+static const struct proc_ops numa_lock_proc_fops = {
+ .proc_open = numa_lock_proc_open,
+ .proc_read = numa_lock_proc_read,
+ .proc_write = numa_lock_proc_write
+};
+
+static int numalock_proc_init(void)
+{
+ int index = 0;
+ int i = 0;
+
+ numa_lock_proc = proc_mkdir(numa_lock_proc_dir, NULL);
+ if (numa_lock_proc == NULL) {
+ pr_info("%s proc create %s failed\n", __func__,
+ numa_lock_proc_dir);
+ return -EINVAL;
+ }
+
+ numa_lock_enable = proc_create_data("dynamic_enable", 0666,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ if (!numa_lock_enable) {
+ pr_info("%s proc_create_data %s failed!\n", __func__,
+ "dynamic_enable");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < numa_entry_total; i++)
+ numa_proc_entry[i] = NULL;
+
+ numa_proc_entry[0] = proc_create_data("osq_lock_depth", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[1] = proc_create_data("osq_keep_times", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[2] = proc_create_data("osq_node_max", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[3] = proc_create_data("numa_osq_lock", 0444,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ return 0;
+}
+
+static void numalock_proc_exit(void)
+{
+ int i = 0;
+
+ for (i = 0; i < numa_entry_total; i++) {
+ if (numa_proc_entry[i])
+ proc_remove(numa_proc_entry[i]);
+ }
+ if (numa_lock_enable)
+ proc_remove(numa_lock_enable);
+ if (numa_lock_proc)
+ remove_proc_entry(numa_lock_proc_dir, NULL);
+
+}
+
+static int numalock_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ if (READ_ONCE(enable_zx_numa_osq_lock) == 1) {
+ WRITE_ONCE(dynamic_enable, 0);
+ WRITE_ONCE(enable_zx_numa_osq_lock, 0xf);
+ }
+ return NOTIFY_DONE;
+}
+static struct notifier_block numalock_shutdown_nb = {
+ .notifier_call = numalock_shutdown_notify,
+};
+static int __init zx_numa_base_init(void)
+{
+ int cpu = num_possible_cpus();
+ int i = 0;
+
+ WRITE_ONCE(enable_zx_numa_osq_lock, 0);
+ if (kvm_para_available())
+ return 0;
+ if (cpu >= 65534 || cpu < 16 || (cpu & 0x7) != 0)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(numa_cpu_list); i++) {
+ if (boot_cpu_data.x86_vendor == numa_cpu_list[i].x86_vendor &&
+ boot_cpu_data.x86 == numa_cpu_list[i].x86 &&
+ boot_cpu_data.x86_model == numa_cpu_list[i].x86_model) {
+
+ if (numa_cpu_list[i].feature1 == 1)
+ numa_clusters = nr_node_ids + nr_node_ids;
+ numa_shift = zx_get_numa_shift(num_possible_cpus(),
+ numa_clusters);
+
+ if (zx_numa_lock_init(numa_clusters) == 0)
+ return -ENOMEM;
+ register_reboot_notifier(&numalock_shutdown_nb);
+ numalock_proc_init();
+ INIT_DELAYED_WORK(&zx_numa_cleanup_work,
+ zx_numa_cleanup);
+ prefetchw(&enable_zx_numa_osq_lock);
+ WRITE_ONCE(enable_zx_numa_osq_lock, 1);
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void __exit zx_numa_lock_exit(void)
+{
+ numalock_proc_exit();
+ prefetchw(&dynamic_enable);
+ WRITE_ONCE(dynamic_enable, 0);
+}
+
+late_initcall(zx_numa_base_init);
+module_exit(zx_numa_lock_exit);
+MODULE_AUTHOR("LiYong <yongli-oc@zhaoxin.com>");
+MODULE_DESCRIPTION("zx dynamic numa-aware osq lock");
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/locking/zx_numa_osq.c b/kernel/locking/zx_numa_osq.c
new file mode 100644
index 000000000000..9fc329f33c36
--- /dev/null
+++ b/kernel/locking/zx_numa_osq.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include "numa.h"
+#include "numa_osq.h"
+
+int osq_node_max = 256;
+
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_cpu_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int decode(int cpu_nr)
+{
+ return cpu_nr - 1;
+}
+
+static inline struct optimistic_spin_node *decode_curr(int encoded_cpu_val)
+{
+ int cpu_nr = decode(encoded_cpu_val);
+
+ return per_cpu_ptr(&osq_cpu_node, cpu_nr);
+}
+
+/*
+ * Exchange the tail to get the previous last entry in the queue.
+ * If the high 32 bits of the tail do not match the osq lock address, or
+ * the low 32 bits are NUMA_LOCKED_VAL, the queue is in the LOCKED state.
+ */
+static int atomic64_cmpxchg_notequal(void *qslock, atomic_t *tail, int curr)
+{
+ u64 ss = 0;
+ u32 addr = ptrmask(qslock);
+ u64 addrcurr = (((u64)addr) << 32) | curr;
+
+ while (1) {
+ ss = atomic64_read((atomic64_t *) tail);
+ if ((ss >> 32) != addr)
+ return NUMA_LOCKED_VAL;
+ if ((ss & LOW32MASK) == NUMA_LOCKED_VAL)
+ return NUMA_LOCKED_VAL;
+ if (atomic64_cmpxchg((atomic64_t *) tail, ss, addrcurr) == ss)
+ return ss & LOW32MASK;
+ cpu_relax();
+ }
+}
+
+/*
+ * Get numa-aware lock memory and set the osq lock to the STOPPING state.
+ * Search the zx_numa_entry array for a free slot, set each node's addr
+ * to the address of the osq lock, then store the slot number in the
+ * struct optimistic_spin_queue's index.
+ *
+ * From the index, lock()/unlock() finds the struct _numa_lock and
+ * atomically compares its addr with the osq lock's address. If they are
+ * equal, the link is valid; if not, the link has been broken by the
+ * numa cleanup workqueue because the lock went idle.
+ */
+void zx_osq_lock_stopping(struct optimistic_spin_queue *lock)
+{
+ int s = 0;
+
+ s = zx_numa_lock_ptr_get(lock);
+ if (s < zx_numa_lock_total) {
+ numa_lock_init_data(zx_numa_entry[s].numa_ptr,
+ numa_clusters, NUMA_UNLOCKED_VAL,
+ ptrmask(lock));
+
+ WRITE_ONCE(lock->index, s);
+ zx_numa_entry[s].type = 1;
+ smp_mb(); /* index and type must be set before enabling STOPPING */
+ prefetchw(&lock->numa_enable);
+ WRITE_ONCE(lock->numa_enable, OSQLOCKSTOPPING);
+ } else {
+ prefetchw(&lock->numa_enable);
+ WRITE_ONCE(lock->numa_enable, OSQLOCKINITED);
+ }
+}
+
+/*
+ * Set numa_enable to NUMALOCKDYNAMIC to start running as a numa-aware
+ * lock, then set _numa_lock->initlock to TURNTONUMAREADY, which breaks
+ * the waiting loop in zx_osq_turn_numa_waiting().
+ */
+void zx_osq_numa_start(struct optimistic_spin_queue *lock)
+{
+ struct _numa_lock *_numa_lock = get_numa_lock(lock->index);
+
+ prefetchw(&lock->numa_enable);
+ WRITE_ONCE(lock->numa_enable, NUMALOCKDYNAMIC);
+ smp_mb(); /* lock->numa_enable must be updated before initlock */
+ atomic_set(&_numa_lock->initlock, TURNTONUMAREADY);
+}
+
+/*
+ * Wait until the numa-aware lock is ready.
+ */
+void zx_osq_turn_numa_waiting(struct optimistic_spin_queue *lock)
+{
+ struct _numa_lock *_numa_lock = get_numa_lock(lock->index);
+
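+ /* pending counts the CPUs spinning here until numa mode is ready */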
+ atomic_inc(&_numa_lock->pending);
+ while (1) {
+ int s = atomic_read(&_numa_lock->initlock);
+
+ if (s == TURNTONUMAREADY)
+ break;
+ cpu_relax();
+
+ }
+ atomic_dec(&_numa_lock->pending);
+}
+
+static struct optimistic_spin_node *
+zx_numa_osq_wait_next(struct _numa_lock *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev, int cpu)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(cpu);
+ int old;
+
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
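+ /*
+ * Either we are the tail and can restore the previous tail value,
+ * or we wait for our successor to appear in node->next.
+ */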
+ for (;;) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+
+ break;
+ }
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+ cpu_relax();
+ }
+ return next;
+}
+
+/*
+ * Spin until the numa-aware lock is stopped: the numa_lock tail is in
+ * the LOCKED state, or its high 32 bits no longer match the osq lock
+ * address. Then try to restore the osq lock tail to OSQ_UNLOCKED_VAL so
+ * the caller can fall back to osq_lock().
+ */
+static void zx_numa_turn_osq_waiting(struct optimistic_spin_queue *lock,
+ struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+ int lockaddr = ptrmask(lock);
+ u64 s = 0;
+ struct optimistic_spin_queue tail;
+
+ tail.numa_enable = NUMALOCKDYNAMIC;
+ tail.index = lock->index;
+ tail.tail16 = OSQ_LOCKED_VAL;
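+ /*
+ * tail.val is the expected 4-byte osq lock value in numa mode; the
+ * cmpxchg below only clears the tail if it still holds this value.
+ */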
+ while (1) {
+ cpu_relax();
+ s = atomic64_read((atomic64_t *) &numa_lock->tail);
+ if ((s >> 32) != lockaddr)
+ break;
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+ break;
+ }
+ prefetchw(&lock->tail);
+ atomic_cmpxchg(&lock->tail, tail.val, OSQ_UNLOCKED_VAL);
+}
+
+static int _zx_node_osq_lock_internal(struct optimistic_spin_queue *qslock,
+ struct optimistic_spin_node *node, struct optimistic_spin_node *prev,
+ struct _numa_lock *node_lock, int cpu, int *cur_status)
+{
+ struct optimistic_spin_node *next = NULL;
+
+ for (;;) {
+ struct optimistic_spin_node *node_prev = NULL;
+
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
+ cmpxchg(&prev->next, node, NULL) == node) {
+ break;
+ }
+ /* load the locked state first on each iteration */
+ *cur_status = smp_load_acquire(&node->locked);
+
+ if (*cur_status != NODE_WAIT)
+ return 0; /* caller proceeds to NODE_UNLOCK */
+
+ cpu_relax();
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ node_prev = READ_ONCE(node->prev);
+ if (node_prev != prev)
+ prev = node_prev;
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+ next = zx_numa_osq_wait_next(node_lock, node, prev, cpu);
+ if (!next)
+ return -1;
+
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return -1;
+}
+
+static int _zx_node_osq_lock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *_numa_lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_cpu_node);
+ struct optimistic_spin_node *prev = NULL;
+ int cpu = smp_processor_id();
+ int curr = encode_cpu(cpu);
+ int numa = cpu >> _numa_lock->shift;
+ struct _numa_lock *node_lock = _numa_lock + numa;
+ int cur_status = 0;
+ int old = 0;
+
+ node->locked = NODE_WAIT;
+ node->next = NULL;
+ node->cpu = curr;
+
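+ /*
+ * Try to append this CPU to the queue of its node. The 64-bit tail
+ * also carries the osq lock address, so a stopped or reclaimed
+ * numa lock can be detected.
+ */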
+ old = atomic64_cmpxchg_notequal(qslock, &node_lock->tail, curr);
+
+ if (old == NUMA_LOCKED_VAL) {
+ zx_numa_turn_osq_waiting(qslock, _numa_lock);
+ return osq_lock(qslock) ? 1 : -1;
+ }
+
+ if (old == 0) {
+ node->locked = COHORT_START;
+ return ACQUIRE_NUMALOCK;
+ }
+
+ prev = decode_curr(old);
+ node->prev = prev;
+
+ smp_mb(); /* make sure node is set up before setting prev->next */
+
+ WRITE_ONCE(prev->next, node);
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ while ((cur_status = READ_ONCE(node->locked)) == NODE_WAIT) {
+ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) {
+ int ret = _zx_node_osq_lock_internal(qslock, node, prev,
+ node_lock, cpu, &cur_status);
+
+ if (cur_status != NODE_WAIT)
+ goto NODE_UNLOCK;
+ if (ret == -1)
+ return -1;
+ }
+ cpu_relax();
+ }
+NODE_UNLOCK:
+ if (cur_status == ACQUIRE_NUMALOCK)
+ node->locked = COHORT_START;
+ return cur_status;
+}
+
+static int _zx_numa_osq_lock(struct optimistic_spin_queue *qslock, int cpu,
+ struct _numa_lock *_numa_lock)
+{
+ int numacpu = cpu >> _numa_lock->shift;
+ int numacurr = encode_cpu(numacpu);
+
+ struct optimistic_spin_node *node = &(_numa_lock + numacpu)->osq_node;
+ struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+ struct optimistic_spin_node *prevnode = NULL;
+ int prev = 0;
+
+ node->next = NULL;
+ node->locked = LOCK_NUMALOCK;
+ node->cpu = numacurr;
+
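+ /*
+ * Each NUMA node contributes one representative spin node to the
+ * second-level queue; join it with the encoded node id.
+ */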
+ prev = atomic_xchg(&numa_lock->tail, numacurr);
+ if (prev == 0) {
+ node->locked = UNLOCK_NUMALOCK;
+ return 0;
+ }
+
+ prevnode = &(_numa_lock + prev - 1)->osq_node;
+ node->prev = prevnode;
+ smp_mb(); /* node->prev must be set before prevnode->next */
+ WRITE_ONCE(prevnode->next, node);
+
+ while (READ_ONCE(node->locked) == LOCK_NUMALOCK)
+ cpu_relax();
+
+ return 0;
+}
+
+/*
+ * Two-level osq_lock.
+ * Check the current node's tail first, then the numa tail; the queue
+ * runs or waits according to the state of the two tails.
+ */
+inline bool zx_numa_osq_lock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *node_lock = NULL;
+ int cpu = smp_processor_id();
+ int numa = cpu >> _numa_lock->shift;
+ int status = 0;
+
+ node_lock = _numa_lock + numa;
+
+ if (node_lock->stopping) {
+ zx_numa_turn_osq_waiting(qslock, _numa_lock);
+ return osq_lock(qslock);
+ }
+
+ status = _zx_node_osq_lock(qslock, _numa_lock);
+ if (status == ACQUIRE_NUMALOCK)
+ status = _zx_numa_osq_lock(qslock, smp_processor_id(),
+ _numa_lock);
+
+ if (status == -1)
+ return false;
+ return true;
+}
+
+/*
+ * Check whether we are at the end of the queue on the current node.
+ * Keep the addr in node_lock when setting the queue to UNLOCKED.
+ * If the node is stopping, node_lock is set to LOCKED instead.
+ */
+static int atomic64_checktail_osq(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *node_lock, int ctail)
+{
+ u64 addr = ((u64)ptrmask(qslock)) << 32;
+ u64 addrtail = addr | ctail;
+ u64 ss = 0;
+ bool mark;
+
+ ss = atomic64_read((atomic64_t *) &node_lock->tail);
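+ /*
+ * A normal release keeps the lock address in the tail so the slot
+ * stays linked; when stopping, the tail is set to NUMA_LOCKED_VAL
+ * so new waiters fall back to osq_lock().
+ */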
+ if (node_lock->stopping == 0)
+ mark = (ss == addrtail &&
+ atomic64_cmpxchg_acquire(
+ (atomic64_t *) &node_lock->tail,
+ addrtail, addr|NUMA_UNLOCKED_VAL) == addrtail);
+ else
+ mark = (ss == addrtail &&
+ atomic64_cmpxchg_acquire(
+ (atomic64_t *) &node_lock->tail,
+ addrtail, NUMA_LOCKED_VAL) == addrtail);
+ return mark;
+}
+
+/*
+ * Release the queue on the current node: pass ACQUIRE_NUMALOCK to the next
+ * waiter so it goes on to check whether the numa tail is busy or UNLOCKED,
+ * or clear the node tail if this was the last waiter.
+ */
+static void node_lock_release(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *node_lock, struct optimistic_spin_node *node,
+ int val, int cpu, int numa_end)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(cpu);
+
+ while (1) {
+ if (atomic64_checktail_osq(qslock, node_lock, curr)) {
+ if (qslock->numa_enable == NUMALOCKDYNAMIC) {
+ int index = qslock->index;
+ /* start numa_lock idle checking in the cleanup workqueue */
+ if (numa_end == OSQ_UNLOCKED_VAL &&
+ zx_numa_entry[index].idle == 0) {
+ cmpxchg(&zx_numa_entry[index].idle,
+ 0, 1);
+ }
+ }
+ return;
+ }
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, val);
+ return;
+ }
+ }
+ cpu_relax();
+ }
+}
+
+/*
+ * Unlock the queue waiting on the next NUMA node.
+ */
+static int numa_lock_release(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *numa_lock,
+ struct optimistic_spin_node *node, int cpu)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = cpu >> numa_lock->shift;
+ int numacurr = encode_cpu(curr);
+
+ while (1) {
+ if (atomic_read(&numa_lock->tail) == numacurr &&
+ atomic_cmpxchg_acquire(&numa_lock->tail, numacurr,
+ OSQ_UNLOCKED_VAL) == numacurr) {
+ return OSQ_UNLOCKED_VAL;
+ }
+
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, UNLOCK_NUMALOCK);
+ return 1;
+ }
+ }
+ cpu_relax();
+ }
+}
+
+/*
+ * Two-level osq_unlock.
+ */
+inline void zx_numa_osq_unlock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *_numa_lock)
+{
+ u32 cpu = smp_processor_id();
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_cpu_node);
+ int numa = cpu >> _numa_lock->shift;
+ struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+ struct _numa_lock *node_lock = _numa_lock + numa;
+ struct optimistic_spin_node *numa_node =
+ &(_numa_lock + numa)->osq_node;
+ struct optimistic_spin_node *next = NULL;
+ int cur_count = 0;
+ int numa_end = 0;
+
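+ /*
+ * node->locked counts how many times the lock has been handed off
+ * within the current node since COHORT_START.
+ */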
+ cur_count = READ_ONCE(node->locked);
+
+ /*
+ * Hand over to the queue waiting on the next node if the lock has
+ * been handed off more than osq_node_max times on the current node.
+ */
+ if (cur_count >= osq_node_max - 1) {
+ /* unlock the queue on the next node */
+ numa_end = numa_lock_release(qslock,
+ numa_lock, numa_node, cpu);
+ node_lock_release(qslock, node_lock, node,
+ ACQUIRE_NUMALOCK, cpu, numa_end);
+ return;
+ }
+ /*
+ * Unlock the next waiter on the current node.
+ */
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, cur_count + 1);
+ return;
+ }
+ /*
+ * The queue on the current node has reached its end.
+ */
+ numa_end = numa_lock_release(qslock, numa_lock, numa_node, cpu);
+ node_lock_release(qslock, node_lock, node, ACQUIRE_NUMALOCK,
+ cpu, numa_end);
+}
--
2.34.1