[PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.

There is a newer version of this series
Posted by yongli-oc 2 months, 2 weeks ago
Prepare the kernel memory caches for the numa-aware lock, and add a
workqueue to turn the numa-aware lock back into an osq lock.
Add the /proc interface. The dynamic switch is enabled with:
echo 1 > /proc/zx_numa_lock/dynamic_enable

Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
 kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 537 insertions(+)
 create mode 100644 kernel/locking/zx_numa.c

diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
new file mode 100644
index 000000000000..89df6670a024
--- /dev/null
+++ b/kernel/locking/zx_numa.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Crossing from numa-aware lock to osq_lock
+ * Numa lock memory initialize and /proc interface
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_para.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/reboot.h>
+
+#include "numa.h"
+#include "numa_osq.h"
+
+int enable_zx_numa_osq_lock;
+struct delayed_work zx_numa_start_work;
+struct delayed_work zx_numa_cleanup_work;
+
+atomic_t numa_count;
+struct _numa_buf *zx_numa_entry;
+int zx_numa_lock_total = 256;
+LIST_HEAD(_zx_numa_head);
+LIST_HEAD(_zx_numa_lock_head);
+
+struct kmem_cache *zx_numa_entry_cachep;
+struct kmem_cache *zx_numa_lock_cachep;
+int NUMASHIFT;
+int NUMACLUSTERS;
+static atomic_t lockindex;
+int dynamic_enable;
+
+static const struct numa_cpu_info numa_cpu_list[] = {
+	/*feature1=1, a numa node includes two clusters*/
+	//{1, 23, X86_VENDOR_AMD, 0, 1},
+	{0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
+	{0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
+};
+
+inline void *get_numa_lock(int index)
+{
+	if (index >= 0 && index < zx_numa_lock_total)
+		return zx_numa_entry[index].numa_ptr;
+	else
+		return NULL;
+}
+
+static int zx_get_numa_shift(int all_cpus, int clusters)
+{
+	int cpus = (int) all_cpus/clusters;
+	int count = 0;
+
+	while (cpus) {
+		cpus >>= 1;
+		count++;
+	}
+	return count-1;
+}
+
+void numa_lock_init_data(struct _numa_lock *s, int clusters,
+			u32 lockval, u32 lockaddr)
+{
+	int j = 0;
+
+	for (j = 0; j < clusters + NUMAEXPAND; j++) {
+		atomic_set(&(s + j)->tail, lockval);
+		atomic_set(&(s + j)->addr, lockaddr);
+		(s + j)->shift = NUMASHIFT;
+		(s + j)->stopping = 0;
+		(s + j)->numa_nodes = clusters;
+		(s + j)->accessed = 0;
+		(s + j)->totalaccessed = 0;
+		(s + j)->nodeswitched = 0;
+		atomic_set(&(s + j)->initlock, 0);
+		atomic_set(&(s + j)->pending, 0);
+	}
+}
+
+int zx_numa_lock_ptr_get(void *p)
+{
+	int i = 0;
+	int index = 0;
+
+	if (atomic_read(&numa_count) >= zx_numa_lock_total)
+		return zx_numa_lock_total;
+
+	index = atomic_inc_return(&lockindex);
+
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		if (index >= zx_numa_lock_total)
+			index = 0;
+		if (cmpxchg(&zx_numa_entry[index].lockaddr,
+					0, ptrmask(p)) == 0) {
+			while (1) {
+				struct _numa_lock *node_lock =
+					zx_numa_entry[index].numa_ptr;
+				struct _numa_lock *numa_lock = node_lock +
+						node_lock->numa_nodes;
+
+				if (atomic_read(&numa_lock->tail) ==
+								NUMA_LOCKED_VAL)
+					break;
+				cpu_relax();
+
+			}
+			atomic_inc(&numa_count);
+			zx_numa_entry[index].highaddr = ((u64)p) >> 32;
+			atomic_set(&lockindex, index);
+			return index;
+		}
+		index++;
+		if (atomic_read(&numa_count) >= zx_numa_lock_total)
+			break;
+	}
+	return zx_numa_lock_total;
+}
+
+int zx_check_numa_dynamic_locked(u32 lockaddr,
+		struct _numa_lock *_numa_lock, int t)
+{
+	struct _numa_lock *node_lock = NULL;
+	u64 s = -1;
+	int i = 0;
+
+	if (atomic_read(&_numa_lock->pending) != 0)
+		return 1;
+
+	for (i = 0; i < _numa_lock->numa_nodes + 1; i++) {
+		node_lock = _numa_lock + i;
+		cpu_relax(); cpu_relax(); cpu_relax(); cpu_relax();
+		s = atomic64_read((atomic64_t *) &node_lock->tail);
+		if ((s >> 32) != lockaddr)
+			continue;
+		if ((s & LOW32MASK) == NUMA_LOCKED_VAL
+				|| (s & LOW32MASK) == NUMA_UNLOCKED_VAL)
+			continue;
+		break;
+	}
+
+	if (i == _numa_lock->numa_nodes + 1)
+		return 0;
+	return i+1;
+}
+
+static int zx_numa_lock64_try_to_freeze(u32 lockaddr, struct _numa_lock *_numa_lock,
+			int index)
+{
+	struct _numa_lock *node_lock = NULL;
+	u64 addr = ((u64)lockaddr) << 32;
+	u64 s = 0;
+	u64 ff = 0;
+	int i = 0;
+
+	for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+		node_lock = _numa_lock + i;
+		cpu_relax();
+
+		s = atomic64_read((atomic64_t *)&node_lock->tail);
+		if ((s & HIGH32BITMASK) != addr)
+			continue;
+
+		if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+			continue;
+
+		if ((s & LOW32MASK) == NUMA_UNLOCKED_VAL) {
+			ff = atomic64_cmpxchg((atomic64_t *)&node_lock->tail,
+				(addr|NUMA_UNLOCKED_VAL), NUMA_LOCKED_VAL);
+			if (ff == (addr|NUMA_UNLOCKED_VAL))
+				continue;
+		}
+		break;
+	}
+
+	if (i == _numa_lock->numa_nodes + 1) {
+		zx_numa_entry[index].idle = 0;
+		zx_numa_entry[index].type = 0;
+		zx_numa_entry[index].highaddr = 0;
+		xchg(&zx_numa_entry[index].lockaddr, 0);
+	}
+
+	return i;
+}
+
+static void zx_numa_lock_stopping(struct _numa_lock *_numa_lock)
+{
+	struct _numa_lock *node_lock = NULL;
+	int i = 0;
+
+	for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+		node_lock = _numa_lock + i;
+		WRITE_ONCE(node_lock->stopping, 1);
+	}
+}
+
+static void zx_numa_cleanup(struct work_struct *work)
+{
+	int i = 0;
+	int checktimes = 2;
+
+	//reboot or power off state
+	if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
+		return;
+
+	if (atomic_read(&numa_count) == 0) {
+		if (READ_ONCE(dynamic_enable) != 0)
+			schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+		return;
+	}
+
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		int s = 0;
+		u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
+		u32 type = zx_numa_entry[i].type;
+		struct _numa_lock *buf =  zx_numa_entry[i].numa_ptr;
+		int nodes = 0;
+
+		if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
+			continue;
+		nodes = buf->numa_nodes;
+		if (zx_numa_entry[i].idle < checktimes) {
+
+			s = zx_check_numa_dynamic_locked(lockaddr, buf, 1);
+			if (s != 0) {
+				zx_numa_entry[i].idle = 1;
+				continue;
+			}
+			zx_numa_entry[i].idle++;
+		}
+
+		if (zx_numa_entry[i].idle == checktimes) {
+			zx_numa_lock_stopping(buf);
+			zx_numa_entry[i].idle++;
+
+		}
+
+		if (zx_numa_entry[i].idle == checktimes+1) {
+			while (1) {
+				if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
+						i) == nodes + 1) {
+					//all node has been locked
+					u32 left = 0;
+
+					left = atomic_dec_return(&numa_count);
+					break;
+				}
+				cpu_relax(); cpu_relax();
+				cpu_relax(); cpu_relax();
+			}
+		}
+	}
+	schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+}
+
+static int create_numa_buffer_list(int clusters, int len)
+{
+	int i = 0;
+
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		struct _numa_lock *s = (struct _numa_lock *)kmem_cache_alloc(
+				zx_numa_lock_cachep, GFP_KERNEL);
+		if (!s) {
+			while (i > 0) {
+				kmem_cache_free(zx_numa_lock_cachep,
+						zx_numa_entry[i-1].numa_ptr);
+				i--;
+			}
+			return 0;
+		}
+		memset((char *)s, 0,
+			len * L1_CACHE_BYTES * (clusters + NUMAEXPAND));
+		numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
+		zx_numa_entry[i].numa_ptr = s;
+		zx_numa_entry[i].lockaddr = 0;
+		zx_numa_entry[i].highaddr = 0;
+		zx_numa_entry[i].idle = 0;
+		zx_numa_entry[i].type = 0;
+	}
+
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		zx_numa_entry[i].index = i;
+		list_add_tail(&(zx_numa_entry[i].list), &_zx_numa_lock_head);
+	}
+	return 1;
+}
+
+static int zx_numa_lock_init(int numa)
+{
+	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+	int d = 0;
+	int status = 0;
+
+	atomic_set(&lockindex, 0);
+	atomic_set(&numa_count, 0);
+
+	if (sizeof(struct _numa_lock) & 0x3f)
+		d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES) /
+			  L1_CACHE_BYTES);
+	else
+		d = (int)(sizeof(struct _numa_lock) / L1_CACHE_BYTES);
+
+	zx_numa_entry_cachep = kmem_cache_create(
+		"zx_numa_entry",
+		sizeof(struct _numa_buf) * zx_numa_lock_total, align,
+		SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+	zx_numa_lock_cachep = kmem_cache_create(
+		"zx_numa_lock",
+		d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
+		SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+
+	if (zx_numa_entry_cachep && zx_numa_lock_cachep) {
+		zx_numa_entry = (struct _numa_buf *)kmem_cache_alloc(
+				zx_numa_entry_cachep, GFP_KERNEL);
+		if (zx_numa_entry) {
+			memset((char *)zx_numa_entry, 0,
+				sizeof(struct _numa_buf) * zx_numa_lock_total);
+			create_numa_buffer_list(numa, d);
+			status = 1;
+		}
+	}
+
+	pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
+		numa);
+	return status;
+}
+
+
+#define numa_lock_proc_dir "zx_numa_lock"
+#define zx_numa_enable_dir "dynamic_enable"
+#define numa_entry_total 8
+struct proc_dir_entry *numa_lock_proc;
+struct proc_dir_entry *numa_lock_enable;
+struct proc_dir_entry *numa_proc_entry[numa_entry_total];
+
+static ssize_t numa_lock_proc_read(struct file *file,
+		char __user *usrbuf, size_t len, loff_t *off)
+{
+	int id = (long) pde_data(file_inode(file));
+	char kbuffer[128];
+	ssize_t retval = 0;
+	size_t n = 0;
+
+	memset(kbuffer, 0, sizeof(kbuffer));
+	if (id == 0)
+		n = sprintf(kbuffer, "%d\n", READ_ONCE(dynamic_enable));
+	else if (id == 1)
+		n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_lock_depth));
+	else if (id == 2)
+		n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_keep_times));
+	else if (id == 3)
+		n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_node_max));
+	else if (id == 4)
+		n = sprintf(kbuffer, "%d\n", atomic_read(&numa_count));
+	retval = simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
+
+	return retval;
+}
+
+static ssize_t numa_lock_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *f_pos)
+{
+	int id = (long) pde_data(file_inode(file));
+	char kbuffer[128];
+	unsigned long new = 0;
+	int err = 0;
+
+	memset(kbuffer, 0, sizeof(kbuffer));
+	if (copy_from_user(kbuffer, buffer, count))
+		return count;
+	kbuffer[count] = '\0';
+	err = kstrtoul(kbuffer, 10, &new);
+
+	if (id == 0) {
+		int last = READ_ONCE(dynamic_enable);
+
+		if (new < 0 || new >= 2 || last == new)
+			return count;
+
+		if (last == 0) {
+			prefetchw(&enable_zx_numa_osq_lock);
+			//enable to the 2-bytes-tail osq-lock
+			prefetchw(&enable_zx_numa_osq_lock);
+			WRITE_ONCE(enable_zx_numa_osq_lock, 2);
+			schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+		}
+		prefetchw(&dynamic_enable);
+		WRITE_ONCE(dynamic_enable, new);
+		return count;
+	}
+
+	if (READ_ONCE(dynamic_enable) != 0) {
+		pr_info("dynamic %d: change setting should disable dynamic\n",
+			dynamic_enable);
+		return count;
+	}
+	if (id == 1 && new > 4 && new <= 32)
+		WRITE_ONCE(osq_lock_depth, new);
+	else if (id == 2 && new >= 16 && new <= 2048)
+		WRITE_ONCE(osq_keep_times, new);
+	else if (id == 3 && new > 4 && new <= 2048)
+		WRITE_ONCE(osq_node_max, new);
+	return count;
+}
+static int numa_lock_proc_show(struct seq_file *m, void *v)
+{
+	return 0;
+}
+
+static int numa_lock_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, numa_lock_proc_show, NULL);
+}
+static const struct proc_ops numa_lock_proc_fops = {
+	.proc_open = numa_lock_proc_open,
+	.proc_read = numa_lock_proc_read,
+	.proc_write = numa_lock_proc_write
+};
+
+static int numalock_proc_init(void)
+{
+	int index = 0;
+	int i = 0;
+
+	numa_lock_proc = proc_mkdir(numa_lock_proc_dir, NULL);
+	if (numa_lock_proc == NULL) {
+		pr_info("%s proc create %s failed\n", __func__,
+				numa_lock_proc_dir);
+		return -EINVAL;
+	}
+
+	numa_lock_enable = proc_create_data(zx_numa_enable_dir, 0666,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	if (!numa_lock_enable) {
+		pr_info("%s proc_create_data %s failed!\n", __func__,
+				zx_numa_enable_dir);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < numa_entry_total; i++)
+		numa_proc_entry[i] = NULL;
+
+	numa_proc_entry[0] =  proc_create_data("osq_lock_depth", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[1] =  proc_create_data("osq_keep_times", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[2] =  proc_create_data("osq_node_max", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[3] =  proc_create_data("numa_osq_lock", 0444,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	return 0;
+}
+
+static void numalock_proc_exit(void)
+{
+	int i = 0;
+
+	for (i = 0; i < numa_entry_total; i++) {
+		if (numa_proc_entry[i])
+			proc_remove(numa_proc_entry[i]);
+	}
+	if (numa_lock_enable)
+		proc_remove(numa_lock_enable);
+	if (numa_lock_proc)
+		remove_proc_entry(numa_lock_proc_dir, NULL);
+
+}
+
+static int numalock_shutdown_notify(struct notifier_block *unused1,
+		unsigned long unused2, void *unused3)
+{
+	if (READ_ONCE(enable_zx_numa_osq_lock) == 2) {
+		WRITE_ONCE(dynamic_enable, 0);
+		WRITE_ONCE(enable_zx_numa_osq_lock, 0xf);
+	}
+	return NOTIFY_DONE;
+}
+static struct notifier_block numalock_shutdown_nb = {
+	.notifier_call = numalock_shutdown_notify,
+};
+static int __init zx_numa_base_init(void)
+{
+	int cpu = num_possible_cpus();
+	int i = 0;
+
+	WRITE_ONCE(enable_zx_numa_osq_lock, 0);
+	if (kvm_para_available())
+		return 0;
+	if (cpu >= 65534 || cpu < 16 || (cpu & 0x7) != 0)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(numa_cpu_list); i++) {
+		if (boot_cpu_data.x86_vendor == numa_cpu_list[i].x86_vendor &&
+			boot_cpu_data.x86 == numa_cpu_list[i].x86 &&
+			boot_cpu_data.x86_model == numa_cpu_list[i].x86_model) {
+
+			if (numa_cpu_list[i].feature1 == 1)
+				NUMACLUSTERS = nr_node_ids + nr_node_ids;
+			NUMASHIFT = zx_get_numa_shift(num_possible_cpus(),
+					NUMACLUSTERS);
+
+			if (zx_numa_lock_init(NUMACLUSTERS) == 0)
+				return -ENOMEM;
+			register_reboot_notifier(&numalock_shutdown_nb);
+			numalock_proc_init();
+			INIT_DELAYED_WORK(&zx_numa_cleanup_work,
+				zx_numa_cleanup);
+			prefetchw(&enable_zx_numa_osq_lock);
+			WRITE_ONCE(enable_zx_numa_osq_lock, 1);
+			return 0;
+		}
+	}
+	return 0;
+}
+
+static void __exit zx_numa_lock_exit(void)
+{
+	numalock_proc_exit();
+	prefetchw(&dynamic_enable);
+	WRITE_ONCE(dynamic_enable, 0);
+}
+
+late_initcall(zx_numa_base_init);
+module_exit(zx_numa_lock_exit);
+MODULE_AUTHOR("LiYong <yongli-oc@zhaoxin.com>");
+MODULE_DESCRIPTION("zx dynamic numa-aware osq lock");
+MODULE_LICENSE("GPL");
+
-- 
2.34.1
Re: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
Posted by kernel test robot 2 months, 2 weeks ago
Hi yongli-oc,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/locking/core]
[also build test WARNING on akpm-mm/mm-nonmm-unstable linus/master v6.11-rc7 next-20240913]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/yongli-oc/locking-osq_lock-The-Kconfig-for-dynamic-numa-aware-osq-lock/20240914-172336
base:   tip/locking/core
patch link:    https://lore.kernel.org/r/20240914085327.32912-5-yongli-oc%40zhaoxin.com
patch subject: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20240916/202409160059.VIbC9G04-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240916/202409160059.VIbC9G04-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409160059.VIbC9G04-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/locking/zx_numa.c:250:10: warning: variable 'left' set but not used [-Wunused-but-set-variable]
     250 |                                         u32 left = 0;
         |                                             ^
>> kernel/locking/zx_numa.c:375:6: warning: variable 'err' set but not used [-Wunused-but-set-variable]
     375 |         int err = 0;
         |             ^
   2 warnings generated.


vim +/left +250 kernel/locking/zx_numa.c

   203	
   204	static void zx_numa_cleanup(struct work_struct *work)
   205	{
   206		int i = 0;
   207		int checktimes = 2;
   208	
   209		//reboot or power off state
   210		if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
   211			return;
   212	
   213		if (atomic_read(&numa_count) == 0) {
   214			if (READ_ONCE(dynamic_enable) != 0)
   215				schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
   216			return;
   217		}
   218	
   219		for (i = 0; i < zx_numa_lock_total; i++) {
   220			int s = 0;
   221			u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
   222			u32 type = zx_numa_entry[i].type;
   223			struct _numa_lock *buf =  zx_numa_entry[i].numa_ptr;
   224			int nodes = 0;
   225	
   226			if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
   227				continue;
   228			nodes = buf->numa_nodes;
   229			if (zx_numa_entry[i].idle < checktimes) {
   230	
   231				s = zx_check_numa_dynamic_locked(lockaddr, buf, 1);
   232				if (s != 0) {
   233					zx_numa_entry[i].idle = 1;
   234					continue;
   235				}
   236				zx_numa_entry[i].idle++;
   237			}
   238	
   239			if (zx_numa_entry[i].idle == checktimes) {
   240				zx_numa_lock_stopping(buf);
   241				zx_numa_entry[i].idle++;
   242	
   243			}
   244	
   245			if (zx_numa_entry[i].idle == checktimes+1) {
   246				while (1) {
   247					if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
   248							i) == nodes + 1) {
   249						//all node has been locked
 > 250						u32 left = 0;
   251	
   252						left = atomic_dec_return(&numa_count);
   253						break;
   254					}
   255					cpu_relax(); cpu_relax();
   256					cpu_relax(); cpu_relax();
   257				}
   258			}
   259		}
   260		schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
   261	}
   262	
   263	static int create_numa_buffer_list(int clusters, int len)
   264	{
   265		int i = 0;
   266	
   267		for (i = 0; i < zx_numa_lock_total; i++) {
   268			struct _numa_lock *s = (struct _numa_lock *)kmem_cache_alloc(
   269					zx_numa_lock_cachep, GFP_KERNEL);
   270			if (!s) {
   271				while (i > 0) {
   272					kmem_cache_free(zx_numa_lock_cachep,
   273							zx_numa_entry[i-1].numa_ptr);
   274					i--;
   275				}
   276				return 0;
   277			}
   278			memset((char *)s, 0,
   279				len * L1_CACHE_BYTES * (clusters + NUMAEXPAND));
   280			numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
   281			zx_numa_entry[i].numa_ptr = s;
   282			zx_numa_entry[i].lockaddr = 0;
   283			zx_numa_entry[i].highaddr = 0;
   284			zx_numa_entry[i].idle = 0;
   285			zx_numa_entry[i].type = 0;
   286		}
   287	
   288		for (i = 0; i < zx_numa_lock_total; i++) {
   289			zx_numa_entry[i].index = i;
   290			list_add_tail(&(zx_numa_entry[i].list), &_zx_numa_lock_head);
   291		}
   292		return 1;
   293	}
   294	
   295	static int zx_numa_lock_init(int numa)
   296	{
   297		int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
   298		int d = 0;
   299		int status = 0;
   300	
   301		atomic_set(&lockindex, 0);
   302		atomic_set(&numa_count, 0);
   303	
   304		if (sizeof(struct _numa_lock) & 0x3f)
   305			d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES) /
   306				  L1_CACHE_BYTES);
   307		else
   308			d = (int)(sizeof(struct _numa_lock) / L1_CACHE_BYTES);
   309	
   310		zx_numa_entry_cachep = kmem_cache_create(
   311			"zx_numa_entry",
   312			sizeof(struct _numa_buf) * zx_numa_lock_total, align,
   313			SLAB_PANIC | SLAB_ACCOUNT, NULL);
   314	
   315		zx_numa_lock_cachep = kmem_cache_create(
   316			"zx_numa_lock",
   317			d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
   318			SLAB_PANIC | SLAB_ACCOUNT, NULL);
   319	
   320	
   321		if (zx_numa_entry_cachep && zx_numa_lock_cachep) {
   322			zx_numa_entry = (struct _numa_buf *)kmem_cache_alloc(
   323					zx_numa_entry_cachep, GFP_KERNEL);
   324			if (zx_numa_entry) {
   325				memset((char *)zx_numa_entry, 0,
   326					sizeof(struct _numa_buf) * zx_numa_lock_total);
   327				create_numa_buffer_list(numa, d);
   328				status = 1;
   329			}
   330		}
   331	
   332		pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
   333			numa);
   334		return status;
   335	}
   336	
   337	
   338	#define numa_lock_proc_dir "zx_numa_lock"
   339	#define zx_numa_enable_dir "dynamic_enable"
   340	#define numa_entry_total 8
   341	struct proc_dir_entry *numa_lock_proc;
   342	struct proc_dir_entry *numa_lock_enable;
   343	struct proc_dir_entry *numa_proc_entry[numa_entry_total];
   344	
   345	static ssize_t numa_lock_proc_read(struct file *file,
   346			char __user *usrbuf, size_t len, loff_t *off)
   347	{
   348		int id = (long) pde_data(file_inode(file));
   349		char kbuffer[128];
   350		ssize_t retval = 0;
   351		size_t n = 0;
   352	
   353		memset(kbuffer, 0, sizeof(kbuffer));
   354		if (id == 0)
   355			n = sprintf(kbuffer, "%d\n", READ_ONCE(dynamic_enable));
   356		else if (id == 1)
   357			n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_lock_depth));
   358		else if (id == 2)
   359			n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_keep_times));
   360		else if (id == 3)
   361			n = sprintf(kbuffer, "%d\n", READ_ONCE(osq_node_max));
   362		else if (id == 4)
   363			n = sprintf(kbuffer, "%d\n", atomic_read(&numa_count));
   364		retval = simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
   365	
   366		return retval;
   367	}
   368	
   369	static ssize_t numa_lock_proc_write(struct file *file,
   370			const char __user *buffer, size_t count, loff_t *f_pos)
   371	{
   372		int id = (long) pde_data(file_inode(file));
   373		char kbuffer[128];
   374		unsigned long new = 0;
 > 375		int err = 0;
   376	
   377		memset(kbuffer, 0, sizeof(kbuffer));
   378		if (copy_from_user(kbuffer, buffer, count))
   379			return count;
   380		kbuffer[count] = '\0';
   381		err = kstrtoul(kbuffer, 10, &new);
   382	
   383		if (id == 0) {
   384			int last = READ_ONCE(dynamic_enable);
   385	
   386			if (new < 0 || new >= 2 || last == new)
   387				return count;
   388	
   389			if (last == 0) {
   390				prefetchw(&enable_zx_numa_osq_lock);
   391				//enable to the 2-bytes-tail osq-lock
   392				prefetchw(&enable_zx_numa_osq_lock);
   393				WRITE_ONCE(enable_zx_numa_osq_lock, 2);
   394				schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
   395			}
   396			prefetchw(&dynamic_enable);
   397			WRITE_ONCE(dynamic_enable, new);
   398			return count;
   399		}
   400	
   401		if (READ_ONCE(dynamic_enable) != 0) {
   402			pr_info("dynamic %d: change setting should disable dynamic\n",
   403				dynamic_enable);
   404			return count;
   405		}
   406		if (id == 1 && new > 4 && new <= 32)
   407			WRITE_ONCE(osq_lock_depth, new);
   408		else if (id == 2 && new >= 16 && new <= 2048)
   409			WRITE_ONCE(osq_keep_times, new);
   410		else if (id == 3 && new > 4 && new <= 2048)
   411			WRITE_ONCE(osq_node_max, new);
   412		return count;
   413	}
   414	static int numa_lock_proc_show(struct seq_file *m, void *v)
   415	{
   416		return 0;
   417	}
   418	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
Posted by Waiman Long 2 months, 2 weeks ago
On 9/14/24 04:53, yongli-oc wrote:
> Prepare the kernel memory caches for the numa-aware lock, and add a
> workqueue to turn the numa-aware lock back into an osq lock.
> Add the /proc interface. The dynamic switch is enabled with:
> echo 1 > /proc/zx_numa_lock/dynamic_enable
>
> Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
> ---
>   kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
>   1 file changed, 537 insertions(+)
>   create mode 100644 kernel/locking/zx_numa.c
>
> diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
> new file mode 100644
> index 000000000000..89df6670a024
> --- /dev/null
> +++ b/kernel/locking/zx_numa.c
> @@ -0,0 +1,537 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Dynamic numa-aware osq lock
> + * Crossing from numa-aware lock to osq_lock
> + * Numa lock memory initialize and /proc interface
> + * Author: LiYong <yongli-oc@zhaoxin.com>
> + *
> + */
> +#include <linux/cpumask.h>
> +#include <asm/byteorder.h>
> +#include <asm/kvm_para.h>
> +#include <linux/percpu.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/osq_lock.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/uaccess.h>
> +#include <linux/reboot.h>
> +
> +#include "numa.h"
> +#include "numa_osq.h"
> +
> +int enable_zx_numa_osq_lock;
> +struct delayed_work zx_numa_start_work;
> +struct delayed_work zx_numa_cleanup_work;
> +
> +atomic_t numa_count;
> +struct _numa_buf *zx_numa_entry;
> +int zx_numa_lock_total = 256;
> +LIST_HEAD(_zx_numa_head);
> +LIST_HEAD(_zx_numa_lock_head);
> +
> +struct kmem_cache *zx_numa_entry_cachep;
> +struct kmem_cache *zx_numa_lock_cachep;
> +int NUMASHIFT;
> +int NUMACLUSTERS;
> +static atomic_t lockindex;
> +int dynamic_enable;
> +
> +static const struct numa_cpu_info numa_cpu_list[] = {
> +	/*feature1=1, a numa node includes two clusters*/
> +	//{1, 23, X86_VENDOR_AMD, 0, 1},
> +	{0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
> +	{0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
> +};

Why is this zx_*() code specifically for the ZhaoXin and Centaur families of
CPUs? Are there some special hardware features that are specific to 
these CPUs?

BTW, your patch series lacks performance data to justify the addition of 
quite a lot of complexity to the core locking code. We are unlikely to 
take this without sufficient justification.

Another question that I have is that the base osq_lock() can coexist 
with your xz_osq_lock(). A cpu can dynamically switch from using 
osq_lock() to xz_osq_lock() and vice versa. What happens if some CPUs 
use osq_lock() while others use xz_osq_lock()? Will that cause a 
problem? Have you fully tested this scenario to make sure that nothing
breaks?

Cheers,
Longman
Re: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
Posted by yongli-os 2 months, 1 week ago
On 2024/9/15 01:21, Waiman Long wrote:
>
>
> [This email comes from an external sender; beware of risks.]
>
> On 9/14/24 04:53, yongli-oc wrote:
>> Prepare the kernel memory caches for the numa-aware lock, and add a
>> workqueue to turn the numa-aware lock back into an osq lock.
>> Add the /proc interface. The dynamic switch is enabled with:
>> echo 1 > /proc/zx_numa_lock/dynamic_enable
>>
>> Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
>> ---
>>   kernel/locking/zx_numa.c | 537 +++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 537 insertions(+)
>>   create mode 100644 kernel/locking/zx_numa.c
>>
>> diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
>> new file mode 100644
>> index 000000000000..89df6670a024
>> --- /dev/null
>> +++ b/kernel/locking/zx_numa.c
>> @@ -0,0 +1,537 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Dynamic numa-aware osq lock
>> + * Crossing from numa-aware lock to osq_lock
>> + * Numa lock memory initialize and /proc interface
>> + * Author: LiYong <yongli-oc@zhaoxin.com>
>> + *
>> + */
>> +#include <linux/cpumask.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/kvm_para.h>
>> +#include <linux/percpu.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/osq_lock.h>
>> +#include <linux/module.h>
>> +#include <linux/proc_fs.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/reboot.h>
>> +
>> +#include "numa.h"
>> +#include "numa_osq.h"
>> +
>> +int enable_zx_numa_osq_lock;
>> +struct delayed_work zx_numa_start_work;
>> +struct delayed_work zx_numa_cleanup_work;
>> +
>> +atomic_t numa_count;
>> +struct _numa_buf *zx_numa_entry;
>> +int zx_numa_lock_total = 256;
>> +LIST_HEAD(_zx_numa_head);
>> +LIST_HEAD(_zx_numa_lock_head);
>> +
>> +struct kmem_cache *zx_numa_entry_cachep;
>> +struct kmem_cache *zx_numa_lock_cachep;
>> +int NUMASHIFT;
>> +int NUMACLUSTERS;
>> +static atomic_t lockindex;
>> +int dynamic_enable;
>> +
>> +static const struct numa_cpu_info numa_cpu_list[] = {
>> +     /*feature1=1, a numa node includes two clusters*/
>> +     //{1, 23, X86_VENDOR_AMD, 0, 1},
>> +     {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
>> +     {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
>> +};
>
> Why is this zx_*() code specifically for the ZhaoXin and Centaur families of
> CPUs? Are there some special hardware features that are specific to
> these CPUs?

Zhaoxin CPUs are x86 architecture processors. They have no special
hardware features related to the dynamic numa-aware lock patch. But
since different processors have different NUMA architectures, I listed
the Zhaoxin CPUs only.

When I tested the patch, I found the AMD EPYC 7551 behaves much like
the Zhaoxin CPU: on both, one node contains two clusters, and unlocking
processes within one cluster is much faster than unlocking them across
the NUMA node. I am not sure whether it fits the AMD CPU or not, so I
commented out the entry for the AMD CPU.

> BTW, your patch series lacks performance data to justify the addition of
> quite a lot of complexity to the core locking code. We are unlikely to
> take this without sufficient justification.
>
In the cover letter, there are performance test results for the AMD
EPYC 7551 and the Zhaoxin KH40000. I listed perf epoll, locktorture
mutex, unixbench and fxmark.

Which tests do you think are important for lock performance?

I will do more testing in the next submission.


> Another question that I have is that the base osq_lock() can coexist
> with your xz_osq_lock(). A cpu can dynamically switch from using
> osq_lock() to xz_osq_lock() and vice versa. What happens if some CPUs
> use osq_lock() while others use xz_osq_lock()? Will that cause a
> problem? Have you fully tested this scenario to make sure that nothing
> breaks?
> Cheers,
> Longman 

x_osq_lock uses a 16-bit tail; before it turns to the numa-aware lock,
the code path is nearly the same as osq_lock. In my opinion, going by
the Intel instruction set, the 32-bit atomic_xchg and the 16-bit
cmpxchg both carry the LOCK prefix, so the cache line holding the tail
is accessed exclusively in either case.

After the dynamic switch is enabled, some processes will enter
x_osq_lock/x_osq_unlock; when such a process meets the queue tail, it
atomically sets numa_enable to OSQTONUMADETECT. If some processes are
still in osq_lock, numa_enable will be cleared by the atomic_xchg and
the old &= 0xffff; it will be set again the next time x_osq_unlock
meets the queue tail.

After numa_enable is set to OSQTONUMADETECT, x_osq_unlock starts to
record the contention depth (the serial number in the queue tail's
optimistic_spin_node minus the one in the currently unlocking CPU's
node). If the depth is more than osq_lock_depth, it starts to increase
the locked variable in struct optimistic_spin_node. Once the locked
variable is more than osq_keep_times, the switch to the numa-aware lock
begins. If some processes are still in osq_lock/osq_unlock, the locked
variable is always set back to 1. Roughly, the unlock-side check looks
like the sketch below.
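
(Illustration only, not the actual code from the earlier patches in
this series: the ->serial field, the start_numa_switch() helper, and
the exact way the count rides on ->locked are assumptions made for this
sketch; osq_lock_depth and osq_keep_times are the tunables exposed via
/proc in this patch.)

static void x_osq_unlock_handoff(struct optimistic_spin_node *node,
				 struct optimistic_spin_node *next,
				 struct optimistic_spin_node *tail_node)
{
	/* How deep is the queue behind the CPU that is unlocking now? */
	int depth = tail_node->serial - node->serial;

	if (depth > osq_lock_depth && node->locked > osq_keep_times) {
		/* Deep for many consecutive handoffs: start the switch. */
		start_numa_switch();
		WRITE_ONCE(next->locked, 1);
		return;
	}

	if (depth > osq_lock_depth) {
		/* Deep queue: hand an incremented count to the successor. */
		WRITE_ONCE(next->locked, node->locked + 1);
		return;
	}

	/* Light contention (or a plain osq_unlock() ran): back to 1. */
	WRITE_ONCE(next->locked, 1);
}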

So by the time numa_enable is set to OSQLOCKSTOPPING and the switch to
the numa-aware lock starts, so many lock()/unlock() calls have
completed that all processes should already read enable_zx_numa_osq_lock
as 2 and therefore take the x_osq_lock() path; the dispatch is roughly
like the sketch below. Since there is no need to enable/disable the
dynamic switch frequently, I did not add stopping protection here.

I prefer to use x_osq_lock to replace osq_lock when
CONFIG_LOCK_SPIN_ON_OWNER_NUMA=y. As far as I know, on x86_64 with the
LOCK prefix, the performance of a 32-bit operand is nearly the same as
that of a 16-bit operand. From the test results in the cover letter,
with one or two processes the performance difference is very small. I
do not know whether it is the same on other platforms.

Best regards.

Li Yong


>
Re: [PATCH 4/4] locking/osq_lock: The numa-aware lock memory prepare, assign and cleanup.
Posted by Waiman Long 2 months, 1 week ago
On 9/19/24 05:41, yongli-os wrote:
>> BTW, your patch series lacks performance data to justify the addition of
>> quite a lot of complexity to the core locking code. We are unlikely to
>> take this without sufficient justification.
>>
> In the cover letter, there are performance test results for the AMD
> EPYC 7551 and the Zhaoxin KH40000. I listed perf epoll, locktorture
> mutex, unixbench and fxmark.
>
> Which tests do you think are important for lock performance?
>
> I will do more testing in the next submission.

Ah, I was not in the to/cc list of the cover letter; I only got your
patches 1-4. After checking the LKML archive, yes, you did send out a
cover letter with some performance numbers. I will take a closer look
at those numbers later, as I am attending the LPC conference this week.

Cheers,
Longman