Prepare the kernel memory caches for the numa-aware lock, and add a
workqueue to turn the numa-aware lock back into an osq lock.
Add the /proc interface. Enable the dynamic switch with:
echo 1 > /proc/zx_numa_lock/dynamic_enable
Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
1 file changed, 537 insertions(+)
create mode 100644 kernel/locking/zx_numa.c
diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
new file mode 100644
index 000000000000..89df6670a024
--- /dev/null
+++ b/kernel/locking/zx_numa.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Crossing from numa-aware lock to osq_lock
+ * Numa lock memory initialize and /proc interface
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_para.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/reboot.h>
+
+#include "numa.h"
+#include "numa_osq.h"
+
+int enable_zx_numa_osq_lock;
+struct delayed_work zx_numa_start_work;
+struct delayed_work zx_numa_cleanup_work;
+
+atomic_t numa_count;
+struct _numa_buf *zx_numa_entry;
+int zx_numa_lock_total = 256;
+LIST_HEAD(_zx_numa_head);
+LIST_HEAD(_zx_numa_lock_head);
+
+struct kmem_cache *zx_numa_entry_cachep;
+struct kmem_cache *zx_numa_lock_cachep;
+int NUMASHIFT;
+int NUMACLUSTERS;
+static atomic_t lockindex;
+int dynamic_enable;
+
+static const struct numa_cpu_info numa_cpu_list[] = {
+ /*feature1=1, a numa node includes two clusters*/
+ //{1, 23, X86_VENDOR_AMD, 0, 1},
+ {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
+ {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
+};
+
+inline void *get_numa_lock(int index)
+{
+ if (index >= 0 && index < zx_numa_lock_total)
+ return zx_numa_entry[index].numa_ptr;
+ else
+ return NULL;
+}
+
+static int zx_get_numa_shift(int all_cpus, int clusters)
+{
+ int cpus = (int) all_cpus/clusters;
+ int count = 0;
+
+ while (cpus) {
+ cpus >>= 1;
+ count++;
+ }
+ return count-1;
+}
+
+void numa_lock_init_data(struct _numa_lock *s, int clusters,
+ u32 lockval, u32 lockaddr)
+{
+ int j = 0;
+
+ for (j = 0; j < clusters + NUMAEXPAND; j++) {
+ atomic_set(&(s + j)->tail, lockval);
+ atomic_set(&(s + j)->addr, lockaddr);
+ (s + j)->shift = NUMASHIFT;
+ (s + j)->stopping = 0;
+ (s + j)->numa_nodes = clusters;
+ (s + j)->accessed = 0;
+ (s + j)->totalaccessed = 0;
+ (s + j)->nodeswitched = 0;
+ atomic_set(&(s + j)->initlock, 0);
+ atomic_set(&(s + j)->pending, 0);
+ }
+}
+
+int zx_numa_lock_ptr_get(void *p)
+{
+ int i = 0;
+ int index = 0;
+
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ return zx_numa_lock_total;
+
+ index = atomic_inc_return(&lockindex);
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ if (index >= zx_numa_lock_total)
+ index = 0;
+ if (cmpxchg(&zx_numa_entry[index].lockaddr,
+ 0, ptrmask(p)) == 0) {
+ while (1) {
+ struct _numa_lock *node_lock =
+ zx_numa_entry[index].numa_ptr;
+ struct _numa_lock *numa_lock = node_lock +
+ node_lock->numa_nodes;
+
+ if (atomic_read(&numa_lock->tail) ==
+ NUMA_LOCKED_VAL)
+ break;
+ cpu_relax();
+
+ }
+ atomic_inc(&numa_count);
+ zx_numa_entry[index].highaddr = ((u64)p) >> 32;
+ atomic_set(&lockindex, index);
+ return index;
+ }
+ index++;
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ break;
+ }
+ return zx_numa_lock_total;
+}
+
+int zx_check_numa_dynamic_locked(u32 lockaddr,
+ struct _numa_lock *_numa_lock, int t)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 s = -1;
+ int i = 0;
+
+ if (atomic_read(&_numa_lock->pending) != 0)
+ return 1;
+
+ for (i = 0; i < _numa_lock->numa_nodes + 1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax(); cpu_relax(); cpu_relax(); cpu_relax();
+ s = atomic64_read((atomic64_t *) &node_lock->tail);
+ if ((s >> 32) != lockaddr)
+ continue;
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL
+ || (s & LOW32MASK) == NUMA_UNLOCKED_VAL)
+ continue;
+ break;
+ }
+
+ if (i == _numa_lock->numa_nodes + 1)
+ return 0;
+ return i+1;
+}
+
+static int zx_numa_lock64_try_to_freeze(u32 lockaddr, struct _numa_lock *_numa_lock,
+ int index)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 addr = ((u64)lockaddr) << 32;
+ u64 s = 0;
+ u64 ff = 0;
+ int i = 0;
+
+ for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax();
+
+ s = atomic64_read((atomic64_t *)&node_lock->tail);
+ if ((s & HIGH32BITMASK) != addr)
+ continue;
+
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+ continue;
+
+ if ((s & LOW32MASK) == NUMA_UNLOCKED_VAL) {
+ ff = atomic64_cmpxchg((atomic64_t *)&node_lock->tail,
+ (addr|NUMA_UNLOCKED_VAL), NUMA_LOCKED_VAL);
+ if (ff == (addr|NUMA_UNLOCKED_VAL))
+ continue;
+ }
+ break;
+ }
+
+ if (i == _numa_lock->numa_nodes + 1) {
+ zx_numa_entry[index].idle = 0;
+ zx_numa_entry[index].type = 0;
+ zx_numa_entry[index].highaddr = 0;
+ xchg(&zx_numa_entry[index].lockaddr, 0);
+ }
+
+ return i;
+}
+
+static void zx_numa_lock_stopping(struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *node_lock = NULL;
+ int i = 0;
+
+ for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+ node_lock = _numa_lock + i;
+ WRITE_ONCE(node_lock->stopping, 1);
+ }
+}
+
+static void zx_numa_cleanup(struct work_struct *work)
+{
+ int i = 0;
+ int checktimes = 2;
+
+ //reboot or power off state
+ if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
+ return;
+
+ if (atomic_read(&numa_count) == 0) {
+ if (READ_ONCE(dynamic_enable) != 0)
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+ return;
+ }
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ int s = 0;
+ u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
+ u32 type = zx_numa_entry[i].type;
+ struct _numa_lock *buf = zx_numa_entry[i].numa_ptr;
+ int nodes = 0;
+
+ if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
+ continue;
+ nodes = buf->numa_nodes;
+ if (zx_numa_entry[i].idle < checktimes) {
+
+ s = zx_check_numa_dynamic_locked(lockaddr, buf, 1);
+ if (s != 0) {
+ zx_numa_entry[i].idle = 1;
+ continue;
+ }
+ zx_numa_entry[i].idle++;
+ }
+
+ if (zx_numa_entry[i].idle == checktimes) {
+ zx_numa_lock_stopping(buf);
+ zx_numa_entry[i].idle++;
+
+ }
+
+ if (zx_numa_entry[i].idle == checktimes+1) {
+ while (1) {
+ if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
+ i) == nodes + 1) {
+ //all node has been locked
+ u32 left = 0;
+
+ left = atomic_dec_return(&numa_count);
+ break;
+ }
+ cpu_relax(); cpu_relax();
+ cpu_relax(); cpu_relax();
+ }
+ }
+ }
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+}
+
+static int create_numa_buffer_list(int clusters, int len)
+{
+ int i = 0;
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ struct _numa_lock *s = (struct _numa_lock *)kmem_cache_alloc(
+ zx_numa_lock_cachep, GFP_KERNEL);
+ if (!s) {
+ while (i > 0) {
+ kmem_cache_free(zx_numa_lock_cachep,
+ zx_numa_entry[i-1].numa_ptr);
+ i--;
+ }
+ return 0;
+ }
+ memset((char *)s, 0,
+ len * L1_CACHE_BYTES * (clusters + NUMAEXPAND));
+ numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
+ zx_numa_entry[i].numa_ptr = s;
+ zx_numa_entry[i].lockaddr = 0;
+ zx_numa_entry[i].highaddr = 0;
+ zx_numa_entry[i].idle = 0;
+ zx_numa_entry[i].type = 0;
+ }
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ zx_numa_entry[i].index = i;
+ list_add_tail(&(zx_numa_entry[i].list), &_zx_numa_lock_head);
+ }
+ return 1;
+}
+
+static int zx_numa_lock_init(int numa)
+{
+ int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+ int d = 0;
+ int status = 0;
+
+ atomic_set(&lockindex, 0);
+ atomic_set(&numa_count, 0);
+
+ if (sizeof(struct _numa_lock) & 0x3f)
+ d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES) /
+ L1_CACHE_BYTES);
+ else
+ d = (int)(sizeof(struct _numa_lock) / L1_CACHE_BYTES);
+
+ zx_numa_entry_cachep = kmem_cache_create(
+ "zx_numa_entry",
+ sizeof(struct _numa_buf) * zx_numa_lock_total, align,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+ zx_numa_lock_cachep = kmem_cache_create(
+ "zx_numa_lock",
+ d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+
+ if (zx_numa_entry_cachep && zx_numa_lock_cachep) {
+ zx_numa_entry = (struct _numa_buf *)kmem_cache_alloc(
+ zx_numa_entry_cachep, GFP_KERNEL);
+ if (zx_numa_entry) {
+ memset((char *)zx_numa_entry, 0,
+ sizeof(struct _numa_buf) * zx_numa_lock_total);
+ create_numa_buffer_list(numa, d);
+ status = 1;
+ }
+ }
+
+ pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
+ numa);
+ return status;
+}
+
+
+#define numa_lock_proc_dir "zx_numa_lock"
+#define zx_numa_enable_dir "dynamic_enable"
+#define numa_entry_total 8
+struct proc_dir_entry *numa_lock_proc;
+struct proc_dir_entry *numa_lock_enable;
+struct proc_dir_entry *numa_proc_entry[numa_entry_total];
+
+static ssize_t numa_lock_proc_read(struct file *file,
+ char __user *usrbuf, size_t len, loff_t *off)
+{
+ int id = (long) pde_data(file_inode(file));
+ char kbuffer[128];
+ ssize_t retval = 0;
+ size_t n = 0;
+
+ memset(kbuffer, 0, sizeof(kbuffer));
+ if (id == 0)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(dynamic_enable));
+ else if (id == 1)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_lock_depth));
+ else if (id == 2)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_keep_times));
+ else if (id == 3)
+ n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_node_max));
+ else if (id == 4)
+ n = sprintf(kbuffer, "%d\n", atomic_read(&numa_count));
+ retval = simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
+
+ return retval;
+}
+
+static ssize_t numa_lock_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *f_pos)
+{
+ int id = (long) pde_data(file_inode(file));
+ char kbuffer[128];
+ unsigned long new = 0;
+ int err = 0;
+
+ memset(kbuffer, 0, sizeof(kbuffer));
+ if (copy_from_user(kbuffer, buffer, count))
+ return count;
+ kbuffer[count] = '\0';
+ err = kstrtoul(kbuffer, 10, &new);
+
+ if (id == 0) {
+ int last = READ_ONCE(dynamic_enable);
+
+ if (new < 0 || new >= 2 || last == new)
+ return count;
+
+ if (last == 0) {
+ prefetchw(&enable_zx_numa_osq_lock);
+ //enable to the 2-bytes-tail osq-lock
+ prefetchw(&enable_zx_numa_osq_lock);
+ WRITE_ONCE(enable_zx_numa_osq_lock, 2);
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+ }
+ prefetchw(&dynamic_enable);
+ WRITE_ONCE(dynamic_enable, new);
+ return count;
+ }
+
+ if (READ_ONCE(dynamic_enable) != 0) {
+ pr_info("dynamic %d: change setting should disable dynamic\n",
+ dynamic_enable);
+ return count;
+ }
+ if (id == 1 && new > 4 && new <= 32)
+ WRITE_ONCE(osq_lock_depth, new);
+ else if (id == 2 && new >= 16 && new <= 2048)
+ WRITE_ONCE(osq_keep_times, new);
+ else if (id == 3 && new > 4 && new <= 2048)
+ WRITE_ONCE(osq_node_max, new);
+ return count;
+}
+static int numa_lock_proc_show(struct seq_file *m, void *v)
+{
+ return 0;
+}
+
+static int numa_lock_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, numa_lock_proc_show, NULL);
+}
+static const struct proc_ops numa_lock_proc_fops = {
+ .proc_open = numa_lock_proc_open,
+ .proc_read = numa_lock_proc_read,
+ .proc_write = numa_lock_proc_write
+};
+
+static int numalock_proc_init(void)
+{
+ int index = 0;
+ int i = 0;
+
+ numa_lock_proc = proc_mkdir(numa_lock_proc_dir, NULL);
+ if (numa_lock_proc == NULL) {
+ pr_info("%s proc create %s failed\n", __func__,
+ numa_lock_proc_dir);
+ return -EINVAL;
+ }
+
+ numa_lock_enable = proc_create_data(zx_numa_enable_dir, 0666,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ if (!numa_lock_enable) {
+ pr_info("%s proc_create_data %s failed!\n", __func__,
+ zx_numa_enable_dir);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < numa_entry_total; i++)
+ numa_proc_entry[i] = NULL;
+
+ numa_proc_entry[0] = proc_create_data("osq_lock_depth", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[1] = proc_create_data("osq_keep_times", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[2] = proc_create_data("osq_node_max", 0664,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ numa_proc_entry[3] = proc_create_data("numa_osq_lock", 0444,
+ numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+ return 0;
+}
+
+static void numalock_proc_exit(void)
+{
+ int i = 0;
+
+ for (i = 0; i < numa_entry_total; i++) {
+ if (numa_proc_entry[i])
+ proc_remove(numa_proc_entry[i]);
+ }
+ if (numa_lock_enable)
+ proc_remove(numa_lock_enable);
+ if (numa_lock_proc)
+ remove_proc_entry(numa_lock_proc_dir, NULL);
+
+}
+
+static int numalock_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ if (READ_ONCE(enable_zx_numa_osq_lock) == 2) {
+ WRITE_ONCE(dynamic_enable, 0);
+ WRITE_ONCE(enable_zx_numa_osq_lock, 0xf);
+ }
+ return NOTIFY_DONE;
+}
+static struct notifier_block numalock_shutdown_nb = {
+ .notifier_call = numalock_shutdown_notify,
+};
+static int __init zx_numa_base_init(void)
+{
+ int cpu = num_possible_cpus();
+ int i = 0;
+
+ WRITE_ONCE(enable_zx_numa_osq_lock, 0);
+ if (kvm_para_available())
+ return 0;
+ if (cpu >= 65534 || cpu < 16 || (cpu & 0x7) != 0)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(numa_cpu_list); i++) {
+ if (boot_cpu_data.x86_vendor == numa_cpu_list[i].x86_vendor &&
+ boot_cpu_data.x86 == numa_cpu_list[i].x86 &&
+ boot_cpu_data.x86_model == numa_cpu_list[i].x86_model) {
+
+ if (numa_cpu_list[i].feature1 == 1)
+ NUMACLUSTERS = nr_node_ids + nr_node_ids;
+ NUMASHIFT = zx_get_numa_shift(num_possible_cpus(),
+ NUMACLUSTERS);
+
+ if (zx_numa_lock_init(NUMACLUSTERS) == 0)
+ return -ENOMEM;
+ register_reboot_notifier(&numalock_shutdown_nb);
+ numalock_proc_init();
+ INIT_DELAYED_WORK(&zx_numa_cleanup_work,
+ zx_numa_cleanup);
+ prefetchw(&enable_zx_numa_osq_lock);
+ WRITE_ONCE(enable_zx_numa_osq_lock, 1);
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void __exit zx_numa_lock_exit(void)
+{
+ numalock_proc_exit();
+ prefetchw(&dynamic_enable);
+ WRITE_ONCE(dynamic_enable, 0);
+}
+
+late_initcall(zx_numa_base_init);
+module_exit(zx_numa_lock_exit);
+MODULE_AUTHOR("LiYong <yongli-oc@zhaoxin.com>");
+MODULE_DESCRIPTION("zx dynamic numa-aware osq lock");
+MODULE_LICENSE("GPL");
+
--
2.34.1
Hi yongli-oc,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/locking/core]
[also build test WARNING on akpm-mm/mm-nonmm-unstable linus/master v6.11-rc7 next-20240913]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/yongli-oc/locking-osq_lock-The-Kconfig-for-dynamic-numa-aware-osq-lock/20240914-172336
base: tip/locking/core
patch link: https://lore.kernel.org/r/20240914085327.32912-5-yongli-oc%40zhaoxin.com
patch subject: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20240916/202409160059.VIbC9G04-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240916/202409160059.VIbC9G04-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409160059.VIbC9G04-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> kernel/locking/zx_numa.c:250:10: warning: variable 'left' set but not used [-Wunused-but-set-variable]
250 | u32 left = 0;
| ^
>> kernel/locking/zx_numa.c:375:6: warning: variable 'err' set but not used [-Wunused-but-set-variable]
375 | int err = 0;
| ^
2 warnings generated.
vim +/left +250 kernel/locking/zx_numa.c
203
204 static void zx_numa_cleanup(struct work_struct *work)
205 {
206 int i = 0;
207 int checktimes = 2;
208
209 //reboot or power off state
210 if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
211 return;
212
213 if (atomic_read(&numa_count) == 0) {
214 if (READ_ONCE(dynamic_enable) != 0)
215 schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
216 return;
217 }
218
219 for (i = 0; i < zx_numa_lock_total; i++) {
220 int s = 0;
221 u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
222 u32 type = zx_numa_entry[i].type;
223 struct _numa_lock *buf = zx_numa_entry[i].numa_ptr;
224 int nodes = 0;
225
226 if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
227 continue;
228 nodes = buf->numa_nodes;
229 if (zx_numa_entry[i].idle < checktimes) {
230
231 s = zx_check_numa_dynamic_locked(lockaddr, buf, 1);
232 if (s != 0) {
233 zx_numa_entry[i].idle = 1;
234 continue;
235 }
236 zx_numa_entry[i].idle++;
237 }
238
239 if (zx_numa_entry[i].idle == checktimes) {
240 zx_numa_lock_stopping(buf);
241 zx_numa_entry[i].idle++;
242
243 }
244
245 if (zx_numa_entry[i].idle == checktimes+1) {
246 while (1) {
247 if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
248 i) == nodes + 1) {
249 //all node has been locked
> 250 u32 left = 0;
251
252 left = atomic_dec_return(&numa_count);
253 break;
254 }
255 cpu_relax(); cpu_relax();
256 cpu_relax(); cpu_relax();
257 }
258 }
259 }
260 schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
261 }
262
263 static int create_numa_buffer_list(int clusters, int len)
264 {
265 int i = 0;
266
267 for (i = 0; i < zx_numa_lock_total; i++) {
268 struct _numa_lock *s = (struct _numa_lock *)kmem_cache_alloc(
269 zx_numa_lock_cachep, GFP_KERNEL);
270 if (!s) {
271 while (i > 0) {
272 kmem_cache_free(zx_numa_lock_cachep,
273 zx_numa_entry[i-1].numa_ptr);
274 i--;
275 }
276 return 0;
277 }
278 memset((char *)s, 0,
279 len * L1_CACHE_BYTES * (clusters + NUMAEXPAND));
280 numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
281 zx_numa_entry[i].numa_ptr = s;
282 zx_numa_entry[i].lockaddr = 0;
283 zx_numa_entry[i].highaddr = 0;
284 zx_numa_entry[i].idle = 0;
285 zx_numa_entry[i].type = 0;
286 }
287
288 for (i = 0; i < zx_numa_lock_total; i++) {
289 zx_numa_entry[i].index = i;
290 list_add_tail(&(zx_numa_entry[i].list), &_zx_numa_lock_head);
291 }
292 return 1;
293 }
294
295 static int zx_numa_lock_init(int numa)
296 {
297 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
298 int d = 0;
299 int status = 0;
300
301 atomic_set(&lockindex, 0);
302 atomic_set(&numa_count, 0);
303
304 if (sizeof(struct _numa_lock) & 0x3f)
305 d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES) /
306 L1_CACHE_BYTES);
307 else
308 d = (int)(sizeof(struct _numa_lock) / L1_CACHE_BYTES);
309
310 zx_numa_entry_cachep = kmem_cache_create(
311 "zx_numa_entry",
312 sizeof(struct _numa_buf) * zx_numa_lock_total, align,
313 SLAB_PANIC | SLAB_ACCOUNT, NULL);
314
315 zx_numa_lock_cachep = kmem_cache_create(
316 "zx_numa_lock",
317 d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
318 SLAB_PANIC | SLAB_ACCOUNT, NULL);
319
320
321 if (zx_numa_entry_cachep && zx_numa_lock_cachep) {
322 zx_numa_entry = (struct _numa_buf *)kmem_cache_alloc(
323 zx_numa_entry_cachep, GFP_KERNEL);
324 if (zx_numa_entry) {
325 memset((char *)zx_numa_entry, 0,
326 sizeof(struct _numa_buf) * zx_numa_lock_total);
327 create_numa_buffer_list(numa, d);
328 status = 1;
329 }
330 }
331
332 pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
333 numa);
334 return status;
335 }
336
337
338 #define numa_lock_proc_dir "zx_numa_lock"
339 #define zx_numa_enable_dir "dynamic_enable"
340 #define numa_entry_total 8
341 struct proc_dir_entry *numa_lock_proc;
342 struct proc_dir_entry *numa_lock_enable;
343 struct proc_dir_entry *numa_proc_entry[numa_entry_total];
344
345 static ssize_t numa_lock_proc_read(struct file *file,
346 char __user *usrbuf, size_t len, loff_t *off)
347 {
348 int id = (long) pde_data(file_inode(file));
349 char kbuffer[128];
350 ssize_t retval = 0;
351 size_t n = 0;
352
353 memset(kbuffer, 0, sizeof(kbuffer));
354 if (id == 0)
355 n = sprintf(kbuffer, "%d\n", READ_ONCE(dynamic_enable));
356 else if (id == 1)
357 n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_lock_depth));
358 else if (id == 2)
359 n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_keep_times));
360 else if (id == 3)
361 n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_node_max));
362 else if (id == 4)
363 n = sprintf(kbuffer, "%d\n", atomic_read(&numa_count));
364 retval = simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
365
366 return retval;
367 }
368
369 static ssize_t numa_lock_proc_write(struct file *file,
370 const char __user *buffer, size_t count, loff_t *f_pos)
371 {
372 int id = (long) pde_data(file_inode(file));
373 char kbuffer[128];
374 unsigned long new = 0;
> 375 int err = 0;
376
377 memset(kbuffer, 0, sizeof(kbuffer));
378 if (copy_from_user(kbuffer, buffer, count))
379 return count;
380 kbuffer[count] = '\0';
381 err = kstrtoul(kbuffer, 10, &new);
382
383 if (id == 0) {
384 int last = READ_ONCE(dynamic_enable);
385
386 if (new < 0 || new >= 2 || last == new)
387 return count;
388
389 if (last == 0) {
390 prefetchw(&enable_zx_numa_osq_lock);
391 //enable to the 2-bytes-tail osq-lock
392 prefetchw(&enable_zx_numa_osq_lock);
393 WRITE_ONCE(enable_zx_numa_osq_lock, 2);
394 schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
395 }
396 prefetchw(&dynamic_enable);
397 WRITE_ONCE(dynamic_enable, new);
398 return count;
399 }
400
401 if (READ_ONCE(dynamic_enable) != 0) {
402 pr_info("dynamic %d: change setting should disable dynamic\n",
403 dynamic_enable);
404 return count;
405 }
406 if (id == 1 && new > 4 && new <= 32)
407 WRITE_ONCE(osq_lock_depth, new);
408 else if (id == 2 && new >= 16 && new <= 2048)
409 WRITE_ONCE(osq_keep_times, new);
410 else if (id == 3 && new > 4 && new <= 2048)
411 WRITE_ONCE(osq_node_max, new);
412 return count;
413 }
414 static int numa_lock_proc_show(struct seq_file *m, void *v)
415 {
416 return 0;
417 }
418
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 9/14/24 04:53, yongli-oc wrote:
> Prepare the kernel memory caches for the numa-aware lock, and add a
> workqueue to turn the numa-aware lock back into an osq lock.
> Add the /proc interface. Enable the dynamic switch with:
> echo 1 > /proc/zx_numa_lock/dynamic_enable
>
> Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
> ---
> kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 537 insertions(+)
> create mode 100644 kernel/locking/zx_numa.c
>
> diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
> new file mode 100644
> index 000000000000..89df6670a024
> --- /dev/null
> +++ b/kernel/locking/zx_numa.c
> @@ -0,0 +1,537 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Dynamic numa-aware osq lock
> + * Crossing from numa-aware lock to osq_lock
> + * Numa lock memory initialize and /proc interface
> + * Author: LiYong <yongli-oc@zhaoxin.com>
> + *
> + */
> +#include <linux/cpumask.h>
> +#include <asm/byteorder.h>
> +#include <asm/kvm_para.h>
> +#include <linux/percpu.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/osq_lock.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/uaccess.h>
> +#include <linux/reboot.h>
> +
> +#include "numa.h"
> +#include "numa_osq.h"
> +
> +int enable_zx_numa_osq_lock;
> +struct delayed_work zx_numa_start_work;
> +struct delayed_work zx_numa_cleanup_work;
> +
> +atomic_t numa_count;
> +struct _numa_buf *zx_numa_entry;
> +int zx_numa_lock_total = 256;
> +LIST_HEAD(_zx_numa_head);
> +LIST_HEAD(_zx_numa_lock_head);
> +
> +struct kmem_cache *zx_numa_entry_cachep;
> +struct kmem_cache *zx_numa_lock_cachep;
> +int NUMASHIFT;
> +int NUMACLUSTERS;
> +static atomic_t lockindex;
> +int dynamic_enable;
> +
> +static const struct numa_cpu_info numa_cpu_list[] = {
> + /*feature1=1, a numa node includes two clusters*/
> + //{1, 23, X86_VENDOR_AMD, 0, 1},
> + {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
> + {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
> +};
Why is this zx_*() code specific to the Zhaoxin and Centaur families of
CPUs? Are there some special hardware features that are specific to
these CPUs?
BTW, your patch series lacks performance data to justify the addition of
quite a lot of complexity to the core locking code. We are unlikely to
take this without sufficient justification.
Another question that I have is that the base osq_lock() can coexist
with your x_osq_lock(). A cpu can dynamically switch from using
osq_lock() to x_osq_lock() and vice versa. What happens if some CPUs
use osq_lock() while others use x_osq_lock()? Will that cause a
problem? Have you fully tested this scenario to make sure that nothing
breaks?
Cheers,
Longman
On 2024/9/15 01:21, Waiman Long wrote:
>
>
> [This email is from an external sender. Beware of risks.]
>
> On 9/14/24 04:53, yongli-oc wrote:
>> Prepare the kernel memory caches for the numa-aware lock, and add a
>> workqueue to turn the numa-aware lock back into an osq lock.
>> Add the /proc interface. Enable the dynamic switch with:
>> echo 1 > /proc/zx_numa_lock/dynamic_enable
>>
>> Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
>> ---
>> kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 537 insertions(+)
>> create mode 100644 kernel/locking/zx_numa.c
>>
>> diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
>> new file mode 100644
>> index 000000000000..89df6670a024
>> --- /dev/null
>> +++ b/kernel/locking/zx_numa.c
>> @@ -0,0 +1,537 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Dynamic numa-aware osq lock
>> + * Crossing from numa-aware lock to osq_lock
>> + * Numa lock memory initialize and /proc interface
>> + * Author: LiYong <yongli-oc@zhaoxin.com>
>> + *
>> + */
>> +#include <linux/cpumask.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/kvm_para.h>
>> +#include <linux/percpu.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/osq_lock.h>
>> +#include <linux/module.h>
>> +#include <linux/proc_fs.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/reboot.h>
>> +
>> +#include "numa.h"
>> +#include "numa_osq.h"
>> +
>> +int enable_zx_numa_osq_lock;
>> +struct delayed_work zx_numa_start_work;
>> +struct delayed_work zx_numa_cleanup_work;
>> +
>> +atomic_t numa_count;
>> +struct _numa_buf *zx_numa_entry;
>> +int zx_numa_lock_total = 256;
>> +LIST_HEAD(_zx_numa_head);
>> +LIST_HEAD(_zx_numa_lock_head);
>> +
>> +struct kmem_cache *zx_numa_entry_cachep;
>> +struct kmem_cache *zx_numa_lock_cachep;
>> +int NUMASHIFT;
>> +int NUMACLUSTERS;
>> +static atomic_t lockindex;
>> +int dynamic_enable;
>> +
>> +static const struct numa_cpu_info numa_cpu_list[] = {
>> + /*feature1=1, a numa node includes two clusters*/
>> + //{1, 23, X86_VENDOR_AMD, 0, 1},
>> + {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
>> + {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
>> +};
>
> Why is this zx_*() code specific to the Zhaoxin and Centaur families of
> CPUs? Are there some special hardware features that are specific to
> these CPUs?
The Zhaoxin CPU is an x86 architecture processor. It has no special
hardware features related to the dynamic numa-aware lock patch.
But since different processors have different NUMA architecture
features, I listed the Zhaoxin CPUs only.
When I tested the patch, I found the AMD EPYC 7551 behaves much like
the Zhaoxin CPU: on both, one node has two clusters, and unlocking
processes within one cluster is much faster than unlocking them across
the NUMA node.
I am not sure whether it fits the AMD CPU or not, so I commented out the
code for the AMD CPU.
> BTW, your patch series lacks performance data to justify the addition of
> quite a lot of complexity to the core locking code. We are unlikely to
> take this without sufficient justification.
>
In the cover letter, there are performance test results for the AMD EPYC
7551 and the Zhaoxin KH40000. I listed perf epoll, locktorture mutex,
unixbench and fxmark.
Which tests do you think are important for lock performance?
I will do more tests in the next submission.
> Another question that I have is that the base osq_lock() can coexist
> with your x_osq_lock(). A cpu can dynamically switch from using
> osq_lock() to x_osq_lock() and vice versa. What happens if some CPUs
> use osq_lock() while others use x_osq_lock()? Will that cause a
> problem? Have you fully tested this scenario to make sure that nothing
> breaks?
> Cheers,
> Longman
x_osq_lock() uses a 16-bit tail; before turning to the numa-aware lock,
the code path is nearly the same as osq_lock(). In my opinion, judging
from the Intel instruction set, the 32-bit atomic_xchg and the 16-bit
cmpxchg both carry the LOCK prefix, so the cacheline holding the tail is
accessed exclusively in either case.
After the dynamic switch is enabled, some processes will enter
x_osq_lock()/x_osq_unlock(); when such a process meets the queue tail,
it atomically sets numa_enable to OSQTONUMADETECT. If some processes are
still in osq_lock(), numa_enable will be cleared by the atomic_xchg and
the old &= 0xffff; it will be set again the next time x_osq_unlock()
meets the queue tail.
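Just as an illustration (the real field layout may differ; this small
user-space sketch is only my simplification, meant to show why a 32-bit
exchange plus old &= 0xffff keeps the 16-bit tail but drops the flag):

#include <stdint.h>
#include <stdio.h>

/* Illustrative layout only: a 16-bit queue tail plus a 16-bit flag word
 * packed into the 32-bit value that both lock paths touch. */
union tail_word {
	uint32_t val;
	struct {
		uint16_t tail;        /* cmpxchg'ed (16 bits) by x_osq_lock() */
		uint16_t numa_enable; /* OSQTONUMADETECT and friends live here */
	};
};

int main(void)
{
	union tail_word w = { .val = 0 };
	uint32_t old;

	w.tail = 5;          /* some CPU is queued */
	w.numa_enable = 1;   /* x_osq_unlock() met the queue tail: detect mode */

	/* A CPU still on the legacy 32-bit path exchanges the whole word and
	 * keeps only the low 16 bits, so the detect flag is dropped. */
	old = __atomic_exchange_n(&w.val, 0, __ATOMIC_ACQ_REL);
	old &= 0xffff;
	__atomic_store_n(&w.val, old, __ATOMIC_RELEASE);

	printf("tail=%u numa_enable=%u\n", w.tail, w.numa_enable);
	return 0;
}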
After numa_enable is set to OSQTONUMADETECT, x_osq_unlock() starts to
record the contention depth (the serial number in the queue tail's
optimistic_spin_node minus the one in the current unlocking CPU's node).
If the depth is more than osq_lock_depth, it starts to increase the
locked variable in struct optimistic_spin_node. Once the locked variable
exceeds osq_keep_times, it starts to switch to the numa-aware lock.
If some processes are still in osq_lock()/osq_unlock(), the locked
variable is always reset to 1.
So by the time numa_enable is set to OSQLOCKSTOPPING and the switch to
the numa-aware lock starts, so many lock()/unlock() calls have already
completed that all processes should read enable_zx_numa_osq_lock as 2
and therefore execute x_osq_lock(). Since it is unnecessary to enable
and disable the dynamic switch frequently, I did not add stopping
protection here.
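To summarize the steps above as a minimal user-space model
(osq_lock_depth, osq_keep_times, OSQTONUMADETECT and OSQLOCKSTOPPING are
the names used in the patches; struct node_model and detect_on_unlock()
are invented here only to illustrate the counting, they are not the real
optimistic_spin_node or x_osq_unlock()):

#include <stdio.h>

#define OSQTONUMADETECT	1
#define OSQLOCKSTOPPING	2

static int osq_lock_depth = 8;   /* contention depth that counts as "deep" */
static int osq_keep_times = 32;  /* how many deep unlocks before switching */

struct node_model {
	int numa_enable; /* OSQTONUMADETECT -> OSQLOCKSTOPPING */
	int locked;      /* unlocks seen while contention stayed deep */
};

/* depth = serial of the queue-tail node minus serial of the unlocking
 * CPU's node, as described above. */
static void detect_on_unlock(struct node_model *n, int depth)
{
	if (n->numa_enable != OSQTONUMADETECT)
		return;
	if (depth <= osq_lock_depth) {
		n->locked = 1;   /* legacy osq users or light contention: reset */
		return;
	}
	if (++n->locked > osq_keep_times)
		n->numa_enable = OSQLOCKSTOPPING; /* begin switch to numa-aware lock */
}

int main(void)
{
	struct node_model n = { .numa_enable = OSQTONUMADETECT, .locked = 1 };
	int i;

	for (i = 0; i < 40; i++)
		detect_on_unlock(&n, 12); /* 40 unlocks with depth 12 > 8 */
	printf("numa_enable=%d locked=%d\n", n.numa_enable, n.locked);
	return 0;
}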
I prefer to use x_osq_lock() to replace osq_lock() when
CONFIG_LOCK_SPIN_ON_OWNER_NUMA=y.
As far as I know, on x86_64 the performance of a LOCK-prefixed 32-bit
operand is nearly the same as that of a 16-bit operand. From the test
results in the cover letter, with one or two processes the performance
difference is very small. I do not know whether the same holds on other
platforms.
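For a rough idea on a given machine, a crude single-threaded sketch like
the one below can compare 16-bit and 32-bit LOCK-prefixed
compare-and-swap; it uses the GCC/Clang __atomic builtins rather than the
kernel's cmpxchg() and runs uncontended, so it only hints at the raw
instruction cost, not at queueing behaviour:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define LOOPS 100000000UL

/* Time LOOPS locked cmpxchg + store cycles on an uncontended variable of
 * the given width.  Build with: gcc -O2 cas_width.c */
#define BENCH(type, var) \
({ \
	static type var; \
	struct timespec a, b; \
	unsigned long i; \
	clock_gettime(CLOCK_MONOTONIC, &a); \
	for (i = 0; i < LOOPS; i++) { \
		type expected = 0; \
		__atomic_compare_exchange_n(&var, &expected, 1, 0, \
				__ATOMIC_SEQ_CST, __ATOMIC_RELAXED); \
		__atomic_store_n(&var, 0, __ATOMIC_RELEASE); \
	} \
	clock_gettime(CLOCK_MONOTONIC, &b); \
	(b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9; \
})

int main(void)
{
	printf("16-bit cmpxchg loop: %.3f s\n", BENCH(uint16_t, t16));
	printf("32-bit cmpxchg loop: %.3f s\n", BENCH(uint32_t, t32));
	return 0;
}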
Best regards.
Li Yong
>
On 9/19/24 05:41, yongli-os wrote:
>> BTW, your patch series lacks performance data to justify the addition of
>> quite a lot of complexity to the core locking code. We are unlikely to
>> take this without sufficient justification.
>>
> In the cover letter, there are performance test results for the AMD EPYC
> 7551 and the Zhaoxin KH40000. I listed perf epoll, locktorture mutex,
> unixbench and fxmark.
>
> Which tests do you think are important for lock performance?
>
> I will do more tests in the next submission.

Ah, I was not sent to/cc on the cover letter. I only got your patches 1-4.
Yes, you did send out a cover letter with some performance numbers after
checking the LKML list. I will take a closer look at these performance
numbers later, as I am attending the LPC conference this week.

Cheers,
Longman