Depending on the contention level, switch from osq_lock to the
numa-aware osq_lock.
numa.h holds the definitions for the numa-aware lock.
numa_osq.h holds the definitions for switching from osq to numa.
x_osq_lock.c implements the crossing from the two-byte osq lock to the
numa-aware lock.
zx_numa_osq.c implements the numa-aware lock as a two-level osq_lock.
zx_numa.c:
Prepares the kernel memory caches for the numa-aware lock and runs a
workqueue that turns a numa-aware lock back into an osq lock.
It also provides the /proc interface. Enable dynamic switching with
echo 1 > /proc/zx_numa_lock/dynamic_enable
The Makefile is updated to build the dynamic numa-aware osq lock.
Signed-off-by: yongli-oc <yongli-oc@zhaoxin.com>
---
kernel/locking/Makefile | 3 +
kernel/locking/numa.h | 90 ++++++
kernel/locking/numa_osq.h | 29 ++
kernel/locking/x_osq_lock.c | 371 ++++++++++++++++++++++++
kernel/locking/zx_numa.c | 540 +++++++++++++++++++++++++++++++++++
kernel/locking/zx_numa_osq.c | 497 ++++++++++++++++++++++++++++++++
6 files changed, 1530 insertions(+)
create mode 100644 kernel/locking/numa.h
create mode 100644 kernel/locking/numa_osq.h
create mode 100644 kernel/locking/x_osq_lock.c
create mode 100644 kernel/locking/zx_numa.c
create mode 100644 kernel/locking/zx_numa_osq.c
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 0db4093d17b8..297bfad88fda 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -21,7 +21,12 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+# Build the NUMA-aware OSQ lock in place of the plain OSQ lock when selected.
+ifeq ($(CONFIG_LOCK_SPIN_ON_OWNER_NUMA),y)
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER_NUMA) += x_osq_lock.o zx_numa_osq.o zx_numa.o
+else
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
+endif
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
diff --git a/kernel/locking/numa.h b/kernel/locking/numa.h
new file mode 100644
index 000000000000..790c27ed18e5
--- /dev/null
+++ b/kernel/locking/numa.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NUMA_LOCK_H
+#define __LINUX_NUMA_LOCK_H
+#include <linux/cache.h>
+#include "mcs_spinlock.h"
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* non-zero once the lock is handed over; may carry a count */
+ int cpu; /* encoded CPU # + 1 value */
+ u32 serial; /* queue position: 1 for the first owner, else prev->serial + 1 */
+};
+
+
+struct _numa_buf {
+ void *numa_ptr; /* the struct _numa_lock array backing this entry */
+ struct list_head list; /* linked on _zx_numa_lock_head at init */
+ u32 lockaddr; /* low 32 bits of the owning osq lock address, 0 if free */
+ u32 highaddr; /* high 32 bits of the owning osq lock address */
+ u8 idle; /* idle-pass counter driven by the cleanup work */
+ u8 type; /* the cleanup work skips entries with type 3 */
+ u16 index; /* position of this entry in zx_numa_entry[] */
+};
+
+struct _numa_lock {
+ atomic_t tail ____cacheline_aligned_in_smp; /* queue tail or NUMA_(UN)LOCKED_VAL */
+ atomic_t addr; /* low 32 bits of the osq lock address; read with tail as one 64-bit value */
+ u8 shift; /* copy of numa_shift taken at init */
+ u8 stopping; /* set to 1 by the cleanup work before freezing */
+ u16 numa_nodes; /* number of clusters passed to numa_lock_init_data() */
+ u32 accessed;
+ uint64_t totalaccessed;
+ u32 nodeswitched;
+ atomic_t initlock;
+ atomic_t pending; /* non-zero while a switch is in progress */
+ union {
+ struct mcs_spinlock mcs_node;
+ struct optimistic_spin_node osq_node;
+ };
+ CACHELINE_PADDING(pad);
+};
+
+struct numa_cpu_info {
+ __u8 x86_model; /* CPU model, matched against boot_cpu_data.x86_model */
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ u32 feature1; /* 1: a numa node includes two clusters */
+};
+
+#define NUMAEXPAND 1 /* extra struct _numa_lock slots after the per-cluster ones */
+
+#define COHORT_START 1
+#define ACQUIRE_NUMALOCK (UINT_MAX-1)
+#define NODE_WAIT UINT_MAX
+#define LOCK_NUMALOCK 1
+#define UNLOCK_NUMALOCK 0
+
+#define NUMALOCKDYNAMIC 0xff /* numa_enable: numa-aware lock initialized and ready */
+#define TURNTONUMAREADY 0xa
+
+#define NUMA_LOCKED_VAL 0xffffff /* tail value of a frozen/unused node lock */
+#define NUMA_UNLOCKED_VAL 0 /* tail value of an idle node lock */
+
+#define HIGH32BITMASK 0xffffffff00000000 /* selects the upper 32 bits of a 64-bit value */
+#define LOW32MASK 0xffffffff /* selects the lower 32 bits of a 64-bit value */
+
+extern int numa_shift;
+extern int numa_clusters;
+extern int zx_numa_lock_total;
+extern struct _numa_buf *zx_numa_entry;
+extern atomic_t numa_count;
+extern int enable_zx_numa_osq_lock;
+extern u32 zx_numa_lock;
+extern int dynamic_enable;
+extern struct kmem_cache *zx_numa_lock_cachep;
+
+static inline u32 ptrmask(void *s)
+{
+ return (uint64_t)s & LOW32MASK; /* keep only the low 32 bits of the address */
+}
+inline void *get_numa_lock(int index);
+
+int zx_check_numa_dynamic_locked(u32 lockaddr, struct _numa_lock *_numa_lock);
+int zx_numa_lock_ptr_get(void *p);
+void numa_lock_init_data(struct _numa_lock *s, int clusters, u32 lockval,
+ u32 lockaddr);
+#endif
diff --git a/kernel/locking/numa_osq.h b/kernel/locking/numa_osq.h
new file mode 100644
index 000000000000..5c4675abc4fc
--- /dev/null
+++ b/kernel/locking/numa_osq.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NUMA_OSQ_H
+#define __LINUX_NUMA_OSQ_H
+
+#include <linux/osq_lock.h>
+#include "mcs_spinlock.h"
+
+#define OSQLOCKINITED 0
+#define OSQTONUMADETECT 0x10 /* numa_enable: lock contention is being detected */
+#define OSQLOCKSTOPPING 0xfc /* numa_enable: the osq queue is being stopped */
+#define OSQ_LOCKED_VAL 0xffff /* tail16 value that stops the osq queue */
+
+extern u16 osq_keep_times;
+extern u16 osq_lock_depth;
+extern int osq_node_max;
+
+inline int encode_cpu(int cpu_nr);
+inline int node_cpu(struct optimistic_spin_node *node);
+inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val);
+
+void zx_osq_lock_stopping(struct optimistic_spin_queue *lock);
+void zx_osq_numa_start(struct optimistic_spin_queue *lock);
+void zx_osq_turn_numa_waiting(struct optimistic_spin_queue *lock);
+
+inline void zx_numa_osq_unlock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *n);
+inline bool zx_numa_osq_lock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *n);
+#endif
diff --git a/kernel/locking/x_osq_lock.c b/kernel/locking/x_osq_lock.c
new file mode 100644
index 000000000000..82cde1f6355b
--- /dev/null
+++ b/kernel/locking/x_osq_lock.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * crossing from osq_lock to numa-aware lock
+ */
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/osq_lock.h>
+#include "numa.h"
+#include "numa_osq.h"
+
+u16 osq_lock_depth = 8; /* queue-depth threshold used by osq_unlock() */
+u16 osq_keep_times = 32; /* hand-over count above which osq_unlock() starts stopping the osq */
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1; /* shifted by one so that 0 can mean "no CPU" */
+}
+
+inline int node_cpu(struct optimistic_spin_node *node)
+{
+ return node->cpu - 1; /* node->cpu holds the encoded (CPU # + 1) value */
+}
+
+inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1; /* undo the "+ 1" of encode_cpu() */
+
+ return per_cpu_ptr(&osq_node, cpu_nr); /* that CPU's per-cpu osq_node */
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
+ * If the osq is in OSQLOCKSTOPPING state and old_cpu is OSQ_UNLOCKED_VAL,
+ * the tail is set to OSQ_LOCKED_VAL instead, which stops the queue.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next_stop(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ int old_cpu)
+{
+ u16 curr = encode_cpu(smp_processor_id());
+ u16 old = old_cpu; /* value written back to tail16 if we are last */
+
+ if (lock->numa_enable == OSQLOCKSTOPPING && old == OSQ_UNLOCKED_VAL)
+ old = OSQ_LOCKED_VAL;
+
+ for (;;) {
+ if (READ_ONCE(lock->tail16) == curr &&
+ cmpxchg(&lock->tail16, curr, old) == curr) {
+
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ return NULL;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ struct optimistic_spin_node *next;
+
+ next = xchg(&node->next, NULL);
+ if (next)
+ return next;
+ }
+
+ cpu_relax();
+ }
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int cpu = smp_processor_id();
+ u16 curr = encode_cpu(cpu);
+ struct optimistic_spin_queue tail;
+ u16 old;
+
+ tail.val = READ_ONCE(lock->val); /* one snapshot of the whole lock word */
+ if (unlikely(tail.numa_enable == OSQLOCKSTOPPING)) {
+ zx_osq_turn_numa_waiting(lock); /* NOTE(review): presumably waits for the switch to finish */
+ return osq_lock(lock); /* retry with the new lock state */
+ }
+
+ if (unlikely(tail.numa_enable == NUMALOCKDYNAMIC)) { /* numa-aware lock is ready */
+ struct _numa_lock *_numa_lock = NULL;
+ struct _numa_lock *node_lock = NULL;
+
+ _numa_lock = get_numa_lock(tail.index);
+ node_lock = (struct _numa_lock *) _numa_lock +
+ (cpu >> numa_shift);
+
+ prefetch(node_lock); /* slot selected by cpu >> numa_shift */
+ return zx_numa_osq_lock(lock, _numa_lock);
+ }
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+ node->serial = 0;
+
+ /*
+ * If ss.tail16 is OSQ_LOCKED_VAL, keep the LOCKED state
+ * and wait until the numa-aware lock is ready.
+ */
+
+ if (likely(tail.numa_enable >= OSQTONUMADETECT)) {
+ struct optimistic_spin_queue ss;
+
+ while (1) {
+ ss.val = atomic_read(&lock->tail);
+ if (ss.tail16 == OSQ_LOCKED_VAL) {
+ zx_osq_turn_numa_waiting(lock);
+ return osq_lock(lock);
+ }
+ if (cmpxchg(&lock->tail16, ss.tail16, curr)
+ == ss.tail16) {
+ old = ss.tail16; /* previous tail, as xchg() would return */
+ break;
+ }
+ cpu_relax();
+ }
+ } else
+ old = xchg(&lock->tail16, curr);
+
+ if (old == OSQ_UNLOCKED_VAL) {
+ node->serial = 1; /* first in the queue */
+ return true; /* uncontended: lock taken */
+ }
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+
+ /* Record this node's serial number, one more than its predecessor's. */
+ node->serial = prev->serial + 1;
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(prev->next, node);
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
+
+ /* unqueue */
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becoming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ cpu_relax();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = READ_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next_stop(lock, node, prev->cpu);
+ if (!next)
+ return false; /* we were last: unqueued without the lock */
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return false;
+}
+
+/*
+ * Release the osq lock. Besides unlocking the next waiter, osq_unlock()
+ * changes the osq state used for switching:
+ * OSQTONUMADETECT: set when the osq tail is reached after dynamic enable;
+ * lock contention is detected while in this state.
+ * OSQLOCKSTOPPING: set once the depth stayed above osq_lock_depth for more
+ * than osq_keep_times hand-overs; also gets a numa-aware lock memory entry.
+ * NUMALOCKDYNAMIC: numa-aware lock initialized and ready.
+ */
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int threadshold = osq_lock_depth; /* depth threshold ("threshold") */
+ int cpu = smp_processor_id();
+ u16 curr = encode_cpu(cpu);
+ int depth = 0; /* serial distance between the tail node and us */
+ u32 count = 0; /* hand-over count received through node->locked */
+
+ if (unlikely(lock->numa_enable == NUMALOCKDYNAMIC)) {
+ struct _numa_lock *_numa_lock = get_numa_lock(lock->index);
+
+ prefetch((struct _numa_lock *) _numa_lock + (cpu >> numa_shift));
+ return zx_numa_osq_unlock(lock, _numa_lock);
+ }
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (unlikely(lock->numa_enable == OSQTONUMADETECT)) {
+ struct optimistic_spin_node *node_last = NULL;
+ u16 tail = 0;
+
+ tail = cmpxchg(&lock->tail16, curr, OSQ_UNLOCKED_VAL);
+ if (tail == curr)
+ return;
+
+ node = this_cpu_ptr(&osq_node);
+ node_last = decode_cpu(tail);
+ /* Get the contention level. */
+ depth = node_last->serial - node->serial;
+ count = READ_ONCE(node->locked);
+ if (count > osq_keep_times && (dynamic_enable & 0x1))
+ zx_osq_lock_stopping(lock);
+ } else if (unlikely(lock->numa_enable == OSQLOCKSTOPPING)) {
+ if (cmpxchg(&lock->tail16, curr, OSQ_LOCKED_VAL)
+ == curr) {
+ /* All osq stopped, start to run as numa-aware lock. */
+ zx_osq_numa_start(lock);
+ return;
+ }
+ } else {
+ struct optimistic_spin_queue t;
+
+ /*
+ * After dynamic enable, when osq reaches tail, set the osq
+ * to DETECT mode to detect the contention.
+ */
+ t.val = 0;
+ if (dynamic_enable & 0x1) {
+ if (atomic_read(&numa_count) < zx_numa_lock_total)
+ t.numa_enable = OSQTONUMADETECT;
+ }
+ if (t.numa_enable == OSQTONUMADETECT) {
+ if (atomic_cmpxchg_release(&lock->tail, curr,
+ (t.val | OSQ_UNLOCKED_VAL)) == curr)
+ return;
+ } else if (cmpxchg(&lock->tail16, curr,
+ OSQ_UNLOCKED_VAL) == curr)
+ return;
+ }
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ if (depth > threadshold)
+ WRITE_ONCE(next->locked, count + 1); /* pass the running count on */
+ else
+ WRITE_ONCE(next->locked, 1); /* plain hand-over, count restarts */
+ return;
+ }
+
+ next = osq_wait_next_stop(lock, node, OSQ_UNLOCKED_VAL);
+ if (next) {
+ if (depth > threadshold)
+ WRITE_ONCE(next->locked, count + 1);
+ else
+ WRITE_ONCE(next->locked, 1);
+ }
+}
+
+bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+	struct optimistic_spin_queue snap;
+
+	snap.val = atomic_read(&lock->tail);
+
+	/*
+	 * Any tail other than OSQ_LOCKED_VAL answers directly: unlocked
+	 * when it is OSQ_UNLOCKED_VAL, locked otherwise.
+	 */
+	if (snap.tail16 != OSQ_LOCKED_VAL)
+		return snap.tail16 != OSQ_UNLOCKED_VAL;
+
+	/* OSQ_LOCKED_VAL without NUMALOCKDYNAMIC: the state is changing. */
+	if (snap.numa_enable != NUMALOCKDYNAMIC)
+		return true;
+
+	return zx_check_numa_dynamic_locked(ptrmask(lock),
+					    get_numa_lock(snap.index));
+}
diff --git a/kernel/locking/zx_numa.c b/kernel/locking/zx_numa.c
new file mode 100644
index 000000000000..31d247261474
--- /dev/null
+++ b/kernel/locking/zx_numa.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Crossing from numa-aware lock to osq_lock
+ * Numa lock memory initialize and /proc interface
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_para.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/reboot.h>
+
+#include "numa.h"
+#include "numa_osq.h"
+
+int enable_zx_numa_osq_lock;
+struct delayed_work zx_numa_start_work;
+struct delayed_work zx_numa_cleanup_work;
+
+atomic_t numa_count;
+struct _numa_buf *zx_numa_entry;
+int zx_numa_lock_total = 256;
+LIST_HEAD(_zx_numa_head);
+LIST_HEAD(_zx_numa_lock_head);
+
+struct kmem_cache *zx_numa_entry_cachep;
+struct kmem_cache *zx_numa_lock_cachep;
+int numa_shift;
+int numa_clusters;
+static atomic_t lockindex;
+int dynamic_enable;
+
+static const struct numa_cpu_info numa_cpu_list[] = {
+ /* {model, family, vendor, reserved, feature1}; feature1=1: a numa node includes two clusters */
+ //{1, 23, X86_VENDOR_AMD, 0, 1},
+ {0x5b, 7, X86_VENDOR_CENTAUR, 0, 1},
+ {0x5b, 7, X86_VENDOR_ZHAOXIN, 0, 1}
+};
+
+inline void *get_numa_lock(int index)
+{
+	/* Indices outside [0, zx_numa_lock_total) have no entry. */
+	if (index < 0 || index >= zx_numa_lock_total)
+		return NULL;
+	return zx_numa_entry[index].numa_ptr;
+}
+
+static int zx_get_numa_shift(int all_cpus, int clusters)
+{
+	int cpus, count = 0;
+
+	/* Degenerate input: avoid a divide by zero and a negative shift. */
+	if (clusters <= 0 || all_cpus < clusters)
+		return 0;
+	for (cpus = all_cpus / clusters; cpus > 1; cpus >>= 1)
+		count++;	/* ends as floor(log2(CPUs per cluster)) */
+	return count;
+}
+
+void numa_lock_init_data(struct _numa_lock *s, int clusters,
+		u32 lockval, u32 lockaddr)
+{
+	int left = clusters + NUMAEXPAND;	/* slots still to initialise */
+
+	for (; left > 0; left--, s++) {
+		atomic_set(&s->tail, lockval);
+		atomic_set(&s->addr, lockaddr);
+		s->shift = numa_shift;
+		s->stopping = 0;
+		s->numa_nodes = clusters;
+		s->accessed = 0;
+		s->totalaccessed = 0;
+		s->nodeswitched = 0;
+		atomic_set(&s->initlock, 0);
+		atomic_set(&s->pending, 0);
+	}
+}
+/*
+ * Claim a free zx_numa_entry slot for lock @p; a non-zero lockaddr marks a slot as occupied.
+ */
+int zx_numa_lock_ptr_get(void *p)
+{
+ int i = 0;
+ int index = 0;
+
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ return zx_numa_lock_total; /* no free slot */
+
+ index = atomic_inc_return(&lockindex); /* start searching after the last hit */
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ if (index >= zx_numa_lock_total)
+ index = 0; /* wrap around */
+ if (cmpxchg(&zx_numa_entry[index].lockaddr,
+ 0, ptrmask(p)) == 0) { /* slot was free and is now ours */
+ while (1) {
+ struct _numa_lock *node_lock =
+ zx_numa_entry[index].numa_ptr;
+ struct _numa_lock *numa_lock = node_lock +
+ node_lock->numa_nodes; /* slot after the per-cluster ones */
+
+ if (atomic_read(&numa_lock->tail) ==
+ NUMA_LOCKED_VAL)
+ break; /* spin until that tail reads NUMA_LOCKED_VAL */
+ cpu_relax();
+
+ }
+ atomic_inc(&numa_count);
+ zx_numa_entry[index].highaddr = ((u64)p) >> 32;
+ atomic_set(&lockindex, index);
+ return index; /* index of the claimed slot */
+ }
+ index++;
+ if (atomic_read(&numa_count) >= zx_numa_lock_total)
+ break;
+ }
+ return zx_numa_lock_total; /* nothing could be claimed */
+}
+
+int zx_check_numa_dynamic_locked(u32 lockaddr,
+ struct _numa_lock *_numa_lock)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 s = -1;
+ int i = 0;
+
+ /* A switch is in progress while pending is not 0: report locked. */
+ if (atomic_read(&_numa_lock->pending) != 0)
+ return 1;
+
+ for (i = 0; i < _numa_lock->numa_nodes + 1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax();
+ s = atomic64_read((atomic64_t *) &node_lock->tail); /* 64-bit read starting at tail */
+ if ((s >> 32) != lockaddr)
+ continue; /* upper half does not match this lock */
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL
+ || (s & LOW32MASK) == NUMA_UNLOCKED_VAL)
+ continue; /* no waiter queued on this slot */
+ break; /* a queued tail was found */
+ }
+
+ if (i == _numa_lock->numa_nodes + 1)
+ return 0; /* every slot was skipped: not locked */
+ return i+1; /* non-zero: slot i holds a queued tail */
+}
+
+static int zx_numa_lock64_try_to_freeze(u32 lockaddr, struct _numa_lock *_numa_lock,
+ int index)
+{
+ struct _numa_lock *node_lock = NULL;
+ u64 addr = ((u64)lockaddr) << 32; /* lock address in the upper half, as stored */
+ u64 s = 0;
+ u64 ff = 0;
+ int i = 0;
+
+ /* Check each slot from first to last and move an unlocked tail to */
+ /* LOCKED; the loop stops at the first slot that could not be frozen. */
+ for (i = 0; i < _numa_lock->numa_nodes+1; i++) {
+ node_lock = _numa_lock + i;
+ cpu_relax();
+
+ s = atomic64_read((atomic64_t *)&node_lock->tail);
+ if ((s & HIGH32BITMASK) != addr)
+ continue; /* upper half does not match this lock */
+
+ if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+ continue; /* already frozen */
+
+ if ((s & LOW32MASK) == NUMA_UNLOCKED_VAL) {
+ ff = atomic64_cmpxchg((atomic64_t *)&node_lock->tail,
+ (addr|NUMA_UNLOCKED_VAL), NUMA_LOCKED_VAL); /* new value has a zero upper half */
+ if (ff == (addr|NUMA_UNLOCKED_VAL))
+ continue; /* frozen by us */
+ }
+ break; /* slot is in use or the cmpxchg lost a race */
+ }
+ /* All slots were passed: mark the lock memory referenced by index */
+ /* as un-occupied again. */
+ if (i == _numa_lock->numa_nodes + 1) {
+ zx_numa_entry[index].idle = 0;
+ zx_numa_entry[index].type = 0;
+ zx_numa_entry[index].highaddr = 0;
+ xchg(&zx_numa_entry[index].lockaddr, 0); /* lockaddr 0 frees the entry */
+ }
+
+ return i; /* numa_nodes + 1 on success, else the slot that stopped the scan */
+}
+
+static void zx_numa_lock_stopping(struct _numa_lock *_numa_lock)
+{
+	int idx;
+
+	/*
+	 * Flag each per-node lock and the extra slot that follows them.
+	 */
+	for (idx = 0; idx <= _numa_lock->numa_nodes; idx++)
+		WRITE_ONCE(_numa_lock[idx].stopping, 1);
+}
+
+static void zx_numa_cleanup(struct work_struct *work)
+{
+ int i = 0;
+ int checktimes = 2; /* idle value at which an entry is put into stopping mode */
+
+ /* Reboot or power-off state: do nothing and do not reschedule. */
+ if (READ_ONCE(enable_zx_numa_osq_lock) == 0xf)
+ return;
+
+ /* With no numa-aware lock in use, reschedule only while dynamic */
+ /* switching is still enabled. */
+ if (atomic_read(&numa_count) == 0) {
+ if (READ_ONCE(dynamic_enable) != 0)
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+ return;
+ }
+
+ for (i = 0; i < zx_numa_lock_total; i++) {
+ int s = 0;
+ u32 lockaddr = READ_ONCE(zx_numa_entry[i].lockaddr);
+ u32 type = zx_numa_entry[i].type;
+ struct _numa_lock *buf = zx_numa_entry[i].numa_ptr;
+ int nodes = 0;
+
+ if (lockaddr == 0 || type == 3 || zx_numa_entry[i].idle == 0)
+ continue; /* free entry, type 3, or idle counter still 0 */
+ nodes = buf->numa_nodes;
+ if (zx_numa_entry[i].idle < checktimes) {
+ /* Check whether all nodes are idle. */
+ s = zx_check_numa_dynamic_locked(lockaddr, buf);
+ if (s != 0) {
+ zx_numa_entry[i].idle = 1; /* still busy: restart the count */
+ continue;
+ }
+ zx_numa_entry[i].idle++;
+ }
+
+ if (zx_numa_entry[i].idle == checktimes) {
+ /* Set each node to stopping mode. */
+ zx_numa_lock_stopping(buf);
+ zx_numa_entry[i].idle++;
+
+ }
+
+ if (zx_numa_entry[i].idle == checktimes+1) {
+ while (1) {
+ /* Try to freeze all nodes; spins until it succeeds. */
+ if (zx_numa_lock64_try_to_freeze(lockaddr, buf,
+ i) == nodes + 1) {
+ /* All nodes have been locked. */
+ atomic_dec(&numa_count);
+ break;
+ }
+ cpu_relax();
+ }
+ }
+ }
+ schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ); /* run again in 60 s */
+}
+
+static int create_numa_buffer_list(int clusters, int len)
+{
+	size_t bytes = (size_t)len * L1_CACHE_BYTES * (clusters + NUMAEXPAND);
+	int i;
+
+	/* Give every entry a zeroed, initialised lock array; 1 on success. */
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		struct _numa_lock *s = kmem_cache_alloc(zx_numa_lock_cachep,
+							GFP_KERNEL);
+		if (!s)
+			goto unwind;
+		memset(s, 0, bytes);
+		numa_lock_init_data(s, clusters, NUMA_LOCKED_VAL, 0);
+		zx_numa_entry[i].numa_ptr = s;
+		zx_numa_entry[i].lockaddr = 0;
+		zx_numa_entry[i].highaddr = 0;
+		zx_numa_entry[i].idle = 0;
+		zx_numa_entry[i].type = 0;
+	}
+	for (i = 0; i < zx_numa_lock_total; i++) {
+		zx_numa_entry[i].index = i;
+		list_add_tail(&zx_numa_entry[i].list, &_zx_numa_lock_head);
+	}
+	return 1;
+unwind:
+	while (i-- > 0) {	/* free what was allocated, newest first */
+		kmem_cache_free(zx_numa_lock_cachep, zx_numa_entry[i].numa_ptr);
+		zx_numa_entry[i].numa_ptr = NULL;	/* no dangling pointer */
+	}
+	return 0;
+}
+
+static int zx_numa_lock_init(int numa)
+{
+	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+	/* Cache lines per struct _numa_lock, rounded up for any line size. */
+	int d = (int)((sizeof(struct _numa_lock) + L1_CACHE_BYTES - 1) /
+		      L1_CACHE_BYTES);
+
+	atomic_set(&lockindex, 0);
+	atomic_set(&numa_count, 0);
+
+	zx_numa_entry_cachep = kmem_cache_create(
+		"zx_numa_entry",
+		sizeof(struct _numa_buf) * zx_numa_lock_total, align,
+		SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+	zx_numa_lock_cachep = kmem_cache_create(
+		"zx_numa_lock",
+		d * L1_CACHE_BYTES * (numa + NUMAEXPAND), align,
+		SLAB_PANIC | SLAB_ACCOUNT, NULL);
+
+	/* Returns 1 when everything is set up, 0 on any failure. */
+	if (!zx_numa_entry_cachep || !zx_numa_lock_cachep)
+		return 0;
+
+	zx_numa_entry = kmem_cache_alloc(zx_numa_entry_cachep, GFP_KERNEL);
+	if (!zx_numa_entry)
+		return 0;
+	memset(zx_numa_entry, 0,
+	       sizeof(struct _numa_buf) * zx_numa_lock_total);
+
+	/* Without the per-entry lock arrays the feature cannot be used. */
+	if (!create_numa_buffer_list(numa, d)) {
+		kmem_cache_free(zx_numa_entry_cachep, zx_numa_entry);
+		zx_numa_entry = NULL;
+		return 0;
+	}
+
+	pr_info("enable dynamic numa-aware osq_lock, clusters %d\n",
+		numa);
+	return 1;
+}
+
+
+#define numa_lock_proc_dir "zx_numa_lock"
+#define numa_entry_total 8
+struct proc_dir_entry *numa_lock_proc;
+struct proc_dir_entry *numa_lock_enable;
+struct proc_dir_entry *numa_proc_entry[numa_entry_total];
+
+static ssize_t numa_lock_proc_read(struct file *file,
+		char __user *usrbuf, size_t len, loff_t *off)
+{
+	int id = (long) pde_data(file_inode(file));
+	char kbuffer[128];
+	size_t n = 0;
+	int val = 0;
+
+	memset(kbuffer, 0, sizeof(kbuffer));
+	/* The id stored with the proc entry selects the value to report. */
+	switch (id) {
+	case 0: val = READ_ONCE(dynamic_enable); break;
+	case 1: val = READ_ONCE(osq_lock_depth); break;
+	case 2: val = READ_ONCE(osq_keep_times); break;
+	case 3: val = READ_ONCE(osq_node_max); break;
+	case 4: val = atomic_read(&numa_count); break;
+	default:	/* unknown id: nothing to report */
+		return simple_read_from_buffer(usrbuf, len, off, kbuffer, 0);
+	}
+	n = sprintf(kbuffer, "%d\n", val);
+
+	return simple_read_from_buffer(usrbuf, len, off, kbuffer, n);
+}
+
+static ssize_t numa_lock_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *f_pos)
+{
+	int id = (long) pde_data(file_inode(file));
+	unsigned long new = 0;
+	int err;
+
+	/*
+	 * kstrtoul_from_user() bounds the copy itself, so a long write can no
+	 * longer overrun a stack buffer; copy and parse errors are returned.
+	 */
+	err = kstrtoul_from_user(buffer, count, 10, &new);
+	if (err)
+		return err;
+
+	if (id == 0) {
+		int last = READ_ONCE(dynamic_enable);
+
+		/* Only 0 and 1 are valid; rewriting the current value is a no-op. */
+		if (new >= 2 || last == new)
+			return count;
+		if (last == 0)	/* being enabled: start the cleanup work */
+			schedule_delayed_work(&zx_numa_cleanup_work, 60*HZ);
+		prefetchw(&dynamic_enable);
+		WRITE_ONCE(dynamic_enable, new);
+		return count;
+	}
+
+	/* The tunables below may only change while switching is disabled. */
+	if (READ_ONCE(dynamic_enable) != 0) {
+		pr_info("dynamic %d: change setting should disable dynamic\n",
+			dynamic_enable);
+		return count;
+	}
+	if (id == 1 && new > 4 && new <= 32)
+		WRITE_ONCE(osq_lock_depth, new);
+	else if (id == 2 && new >= 16 && new <= 2048)
+		WRITE_ONCE(osq_keep_times, new);
+	else if (id == 3 && new > 4 && new <= 2048)
+		WRITE_ONCE(osq_node_max, new);
+	return count;
+}
+static int numa_lock_proc_show(struct seq_file *m, void *v)
+{
+	return 0;	/* nothing to emit: reads go through numa_lock_proc_read() */
+}
+static int numa_lock_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, numa_lock_proc_show, NULL);
+}
+static const struct proc_ops numa_lock_proc_fops = {
+	.proc_open = numa_lock_proc_open,
+	.proc_read = numa_lock_proc_read,
+	.proc_write = numa_lock_proc_write,
+	.proc_release = single_release,	/* frees what single_open() allocated */
+};
+
+static int numalock_proc_init(void)
+{
+	int index = 0;	/* id handed to each entry, read back via pde_data() */
+
+	numa_lock_proc = proc_mkdir(numa_lock_proc_dir, NULL);
+	if (numa_lock_proc == NULL) {
+		pr_info("%s proc create %s failed\n", __func__,
+			numa_lock_proc_dir);
+		return -EINVAL;
+	}
+
+	/* Not world-writable: same mode as the other tunables below. */
+	numa_lock_enable = proc_create_data("dynamic_enable", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	if (!numa_lock_enable) {
+		pr_info("%s proc_create_data %s failed!\n", __func__,
+			"dynamic_enable");
+		/* Do not leave an empty directory behind. */
+		remove_proc_entry(numa_lock_proc_dir, NULL);
+		numa_lock_proc = NULL;
+		return -ENOMEM;
+	}
+	/* numa_proc_entry[] has static storage: unused slots are already NULL. */
+	numa_proc_entry[0] = proc_create_data("osq_lock_depth", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[1] = proc_create_data("osq_keep_times", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[2] = proc_create_data("osq_node_max", 0664,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	numa_proc_entry[3] = proc_create_data("numa_osq_lock", 0444,
+		numa_lock_proc, &numa_lock_proc_fops, (void *)(long)index++);
+	return 0;
+}
+
+static void numalock_proc_exit(void)
+{
+	struct proc_dir_entry **slot = numa_proc_entry;
+	struct proc_dir_entry **last = numa_proc_entry + numa_entry_total;
+
+	/* Drop the tunable files first, then the switch, then the directory. */
+	for (; slot < last; slot++)
+		if (*slot)
+			proc_remove(*slot);
+	if (numa_lock_enable)
+		proc_remove(numa_lock_enable);
+	if (numa_lock_proc)
+		remove_proc_entry(numa_lock_proc_dir, NULL);
+}
+
+static int numalock_shutdown_notify(struct notifier_block *unused1,
+		unsigned long unused2, void *unused3)
+{
+	if (READ_ONCE(enable_zx_numa_osq_lock) != 1)
+		return NOTIFY_DONE;	/* only acts when the state is 1 */
+	WRITE_ONCE(dynamic_enable, 0);
+	WRITE_ONCE(enable_zx_numa_osq_lock, 0xf);	/* 0xf: reboot/power off */
+	return NOTIFY_DONE;
+}
+static struct notifier_block numalock_shutdown_nb = {
+ .notifier_call = numalock_shutdown_notify,
+};
+static int __init zx_numa_base_init(void)
+{
+	int cpu = num_possible_cpus();
+	int i = 0;
+
+	WRITE_ONCE(enable_zx_numa_osq_lock, 0);
+	if (kvm_para_available())
+		return 0;
+	if (cpu >= 65534 || cpu < 16 || (cpu & 0x7) != 0)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(numa_cpu_list); i++) {
+		if (boot_cpu_data.x86_vendor != numa_cpu_list[i].x86_vendor ||
+		    boot_cpu_data.x86 != numa_cpu_list[i].x86 ||
+		    boot_cpu_data.x86_model != numa_cpu_list[i].x86_model)
+			continue;	/* not this list entry */
+
+		/* NOTE(review): numa_clusters stays 0 when feature1 != 1. */
+		if (numa_cpu_list[i].feature1 == 1)
+			numa_clusters = nr_node_ids + nr_node_ids;
+		numa_shift = zx_get_numa_shift(cpu, numa_clusters);
+
+		if (zx_numa_lock_init(numa_clusters) == 0)
+			return -ENOMEM;
+		/* Initialise the work before a /proc write can schedule it. */
+		INIT_DELAYED_WORK(&zx_numa_cleanup_work, zx_numa_cleanup);
+		register_reboot_notifier(&numalock_shutdown_nb);
+		numalock_proc_init();
+		prefetchw(&enable_zx_numa_osq_lock);
+		WRITE_ONCE(enable_zx_numa_osq_lock, 1);
+		return 0;
+	}
+	return 0;
+}
+
+static void __exit zx_numa_lock_exit(void)
+{
+ numalock_proc_exit(); /* remove the /proc/zx_numa_lock entries */
+ prefetchw(&dynamic_enable);
+ WRITE_ONCE(dynamic_enable, 0); /* NOTE(review): zx_numa_cleanup_work is not cancelled here */
+}
+
+late_initcall(zx_numa_base_init);
+module_exit(zx_numa_lock_exit);
+MODULE_AUTHOR("LiYong <yongli-oc@zhaoxin.com>");
+MODULE_DESCRIPTION("zx dynamic numa-aware osq lock");
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/locking/zx_numa_osq.c b/kernel/locking/zx_numa_osq.c
new file mode 100644
index 000000000000..9fc329f33c36
--- /dev/null
+++ b/kernel/locking/zx_numa_osq.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic numa-aware osq lock
+ * Author: LiYong <yongli-oc@zhaoxin.com>
+ *
+ */
+#include <linux/cpumask.h>
+#include <asm/byteorder.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/osq_lock.h>
+#include "numa.h"
+#include "numa_osq.h"
+
+/*
+ * Hand-off limit consulted by zx_numa_osq_unlock(): once the unlocking
+ * node's ->locked count reaches osq_node_max - 1, the unlock path calls
+ * numa_lock_release() before handing over on the local node.
+ */
+int osq_node_max = 256;
+
+
+/* Per-CPU queue node used by the node-level lock and unlock paths below. */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_cpu_node);
+
+/*
+ * CPU numbers are stored biased by one so that 0 can stand for "no CPU".
+ * Strip that bias to recover the real CPU number.
+ */
+static inline int decode(int cpu_nr)
+{
+	const int bias = 1;
+
+	return cpu_nr - bias;
+}
+
+/* Map an encoded (CPU number + 1) value to that CPU's osq_cpu_node. */
+static inline struct optimistic_spin_node *decode_curr(int encoded_cpu_val)
+{
+	return per_cpu_ptr(&osq_cpu_node, decode(encoded_cpu_val));
+}
+/*
+ * Try to install @curr as the new tail in the 64-bit word that starts at
+ * @tail; the high 32 bits of that word are expected to hold
+ * ptrmask(@qslock).
+ *
+ * Returns NUMA_LOCKED_VAL, leaving the word untouched, when the high 32
+ * bits do not match @qslock or the low 32 bits already read
+ * NUMA_LOCKED_VAL (the queue is in LOCKED state). Otherwise the cmpxchg is
+ * retried until it succeeds and the previous low 32 bits (the old tail)
+ * are returned.
+ */
+static int atomic64_cmpxchg_notequal(void *qslock, atomic_t *tail, int curr)
+{
+	atomic64_t *word = (atomic64_t *) tail;
+	u32 addr = ptrmask(qslock);
+	u64 desired = (((u64)addr) << 32) | curr;
+	u64 seen;
+
+	for (;;) {
+		seen = atomic64_read(word);
+		if ((seen >> 32) != addr ||
+		    (seen & LOW32MASK) == NUMA_LOCKED_VAL)
+			return NUMA_LOCKED_VAL;
+		if (atomic64_cmpxchg(word, seen, desired) == seen)
+			return seen & LOW32MASK;
+		cpu_relax();
+	}
+}
+/*
+ * Reserve numa-aware lock memory for @lock and move it to STOPPING state.
+ *
+ * zx_numa_lock_ptr_get() yields a slot index; an index below
+ * zx_numa_lock_total is usable. For a usable slot the _numa_lock data is
+ * initialised with ptrmask(@lock), the index is stored in @lock->index,
+ * the entry type is set to 1 and, after a full barrier,
+ * @lock->numa_enable becomes OSQLOCKSTOPPING. For any other index
+ * @lock->numa_enable is set to OSQLOCKINITED instead.
+ *
+ * lock()/unlock() later use the stored index to locate the _numa_lock and
+ * compare its addr against the osq lock's address to validate the link.
+ */
+void zx_osq_lock_stopping(struct optimistic_spin_queue *lock)
+{
+	int slot = zx_numa_lock_ptr_get(lock);
+
+	if (slot >= zx_numa_lock_total) {
+		/* Index out of range: presumably no free entry was found. */
+		prefetchw(&lock->numa_enable);
+		WRITE_ONCE(lock->numa_enable, OSQLOCKINITED);
+		return;
+	}
+
+	numa_lock_init_data(zx_numa_entry[slot].numa_ptr, numa_clusters,
+			    NUMA_UNLOCKED_VAL, ptrmask(lock));
+
+	WRITE_ONCE(lock->index, slot);
+	zx_numa_entry[slot].type = 1;
+	smp_mb(); /* everything above must be set before enabling */
+	prefetchw(&lock->numa_enable);
+	WRITE_ONCE(lock->numa_enable, OSQLOCKSTOPPING);
+}
+/*
+ * Switch @lock over to the numa-aware path.
+ *
+ * NUMALOCKDYNAMIC is stored in lock->numa_enable first; after a full
+ * barrier _numa_lock->initlock is set to TURNTONUMAREADY, which ends the
+ * spin loop in zx_osq_turn_numa_waiting().
+ */
+void zx_osq_numa_start(struct optimistic_spin_queue *lock)
+{
+ struct _numa_lock *_numa_lock = get_numa_lock(lock->index);
+
+ prefetchw(&lock->numa_enable);
+ WRITE_ONCE(lock->numa_enable, NUMALOCKDYNAMIC);
+ smp_mb(); /* lock->numa_enable must be updated before initlock */
+ atomic_set(&_numa_lock->initlock, TURNTONUMAREADY);
+}
+
+/*
+ * Spin until the numa-aware lock behind @lock reports TURNTONUMAREADY in
+ * its initlock field. The caller is counted in _numa_lock->pending for the
+ * duration of the wait.
+ */
+void zx_osq_turn_numa_waiting(struct optimistic_spin_queue *lock)
+{
+	struct _numa_lock *nlock = get_numa_lock(lock->index);
+
+	atomic_inc(&nlock->pending);
+	while (atomic_read(&nlock->initlock) != TURNTONUMAREADY)
+		cpu_relax();
+	atomic_dec(&nlock->pending);
+}
+
+/*
+ * Find the successor of @node on the node-level queue @lock, or detach
+ * @node when it is the last entry.
+ *
+ * If @lock->tail still encodes @cpu it is moved back to @prev's encoded
+ * cpu value (OSQ_UNLOCKED_VAL when @prev is NULL) and NULL is returned.
+ * Otherwise the loop spins until node->next is set, claims it with xchg()
+ * and returns it.
+ */
+static struct optimistic_spin_node *
+zx_numa_osq_wait_next(struct _numa_lock *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev, int cpu)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(cpu);
+ int old;
+
+ /* Value the tail is rolled back to when @node turns out to be last. */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
+ for (;;) {
+ /* Cheap read first; only attempt the cmpxchg when we look like the tail. */
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+
+ break;
+ }
+ /* A successor has linked itself (or is about to): take ownership of it. */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+ cpu_relax();
+ }
+ return next;
+}
+/*
+ * Wait for the numa-aware lock to stop, then hand @lock back to osq_lock().
+ *
+ * Spins on the numa-level tail word (the _numa_lock at index
+ * _numa_lock->numa_nodes) until either its high 32 bits no longer equal
+ * ptrmask(@lock) or its low 32 bits read NUMA_LOCKED_VAL. Afterwards, if
+ * @lock->tail still holds the {NUMALOCKDYNAMIC, index, OSQ_LOCKED_VAL}
+ * pattern, it is reset to OSQ_UNLOCKED_VAL so that the caller can continue
+ * with osq_lock().
+ */
+static void zx_numa_turn_osq_waiting(struct optimistic_spin_queue *lock,
+				     struct _numa_lock *_numa_lock)
+{
+	struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+	/*
+	 * Must be unsigned: a signed int with bit 31 set would be
+	 * sign-extended in the 64-bit comparison below and could never match
+	 * the zero-extended high word, ending the wait prematurely.
+	 */
+	u32 lockaddr = ptrmask(lock);
+	u64 s = 0;
+	struct optimistic_spin_queue tail;
+
+	tail.numa_enable = NUMALOCKDYNAMIC;
+	tail.index = lock->index;
+	tail.tail16 = OSQ_LOCKED_VAL;
+	while (1) {
+		cpu_relax();
+		s = atomic64_read((atomic64_t *) &numa_lock->tail);
+		if ((s >> 32) != lockaddr)
+			break;
+		if ((s & LOW32MASK) == NUMA_LOCKED_VAL)
+			break;
+	}
+	prefetchw(&lock->tail);
+	/*
+	 * The result is deliberately ignored: if the tail no longer holds the
+	 * expected pattern there is nothing to reset (presumably another CPU
+	 * already did it).
+	 */
+	atomic_cmpxchg(&lock->tail, tail.val, OSQ_UNLOCKED_VAL);
+}
+
+/*
+ * Unqueue path for a waiter on the node-level queue.
+ *
+ * Step A: try to undo prev->next = node. While that fails, reload
+ * node->locked into *cur_status; if it is no longer NODE_WAIT, return 0 so
+ * the caller can proceed with that status.
+ * Step B: stabilise the successor via zx_numa_osq_wait_next().
+ * Step C: if a successor exists, link it and @prev to each other.
+ *
+ * Returns 0 when the node left NODE_WAIT during step A, -1 once the node
+ * has been removed from the queue. @qslock is not used here.
+ */
+static int _zx_node_osq_lock_internal(struct optimistic_spin_queue *qslock,
+ struct optimistic_spin_node *node, struct optimistic_spin_node *prev,
+ struct _numa_lock *node_lock, int cpu, int *cur_status)
+{
+ struct optimistic_spin_node *next = NULL;
+
+ for (;;) {
+ struct optimistic_spin_node *node_prev = NULL;
+
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
+ cmpxchg(&prev->next, node, NULL) == node) {
+ break;
+ }
+ /* Load ->locked first on every iteration. */
+ *cur_status = smp_load_acquire(&node->locked);
+
+ if (*cur_status != NODE_WAIT)
+ return 0; /* the caller jumps to NODE_UNLOCK */
+
+ cpu_relax();
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ node_prev = READ_ONCE(node->prev);
+ if (node_prev != prev)
+ prev = node_prev;
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+ next = zx_numa_osq_wait_next(node_lock, node, prev, cpu);
+ if (!next)
+ return -1;
+
+ /* Step C: splice @node out by linking @prev and @next to each other. */
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return -1;
+}
+
+/*
+ * Node-level acquire: enqueue this CPU's osq_cpu_node on the tail of the
+ * _numa_lock selected by (cpu >> _numa_lock->shift).
+ *
+ * Return values:
+ *  - 1 or -1 when the tail reported NUMA_LOCKED_VAL: the function waits in
+ *    zx_numa_turn_osq_waiting() and falls back to osq_lock(); 1 if that
+ *    succeeded, -1 if it failed;
+ *  - ACQUIRE_NUMALOCK when the node queue was empty (old tail 0) or the
+ *    releasing CPU wrote ACQUIRE_NUMALOCK; ->locked is reset to
+ *    COHORT_START in both cases;
+ *  - -1 when the wait was abandoned through _zx_node_osq_lock_internal();
+ *  - otherwise the ->locked value written by the releasing CPU.
+ */
+static int _zx_node_osq_lock(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *_numa_lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_cpu_node);
+ struct optimistic_spin_node *prev = NULL;
+ int cpu = smp_processor_id();
+ int curr = encode_cpu(cpu);
+ int numa = cpu >> _numa_lock->shift;
+ struct _numa_lock *node_lock = _numa_lock + numa;
+ int cur_status = 0;
+ int old = 0;
+
+ node->locked = NODE_WAIT;
+ node->next = NULL;
+ node->cpu = curr;
+
+ /* Become the new tail unless the numa lock is locked or no longer ours. */
+ old = atomic64_cmpxchg_notequal(qslock, &node_lock->tail, curr);
+
+ if (old == NUMA_LOCKED_VAL) {
+ bool s = true;
+
+ /* The numa-aware lock is stopping: wait it out, then use plain osq_lock(). */
+ zx_numa_turn_osq_waiting(qslock, _numa_lock);
+ s = osq_lock(qslock);
+ if (s == true)
+ return 1;
+ else
+ return -1;
+ }
+
+ /* Empty node queue: this CPU starts a new cohort and must take the numa lock. */
+ if (old == 0) {
+ node->locked = COHORT_START;
+ return ACQUIRE_NUMALOCK;
+ }
+
+ prev = decode_curr(old);
+ node->prev = prev;
+
+ smp_mb(); /* make sure node is set up before publishing it in prev->next */
+
+ WRITE_ONCE(prev->next, node);
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ while ((cur_status = READ_ONCE(node->locked)) == NODE_WAIT) {
+ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) {
+ int ddd = _zx_node_osq_lock_internal(qslock, node, prev,
+ node_lock, cpu, &cur_status);
+
+ /* The lock was handed to us while we were trying to unqueue. */
+ if (cur_status != NODE_WAIT)
+ goto NODE_UNLOCK;
+ /* Successfully unqueued: give up. */
+ if (ddd == -1)
+ return -1;
+ }
+ cpu_relax();
+ }
+NODE_UNLOCK:
+ if (cur_status == ACQUIRE_NUMALOCK)
+ node->locked = COHORT_START;
+ return cur_status;
+}
+/*
+ * Numa-level acquire: queue the osq_node embedded in this CPU's node-level
+ * _numa_lock on the numa-level tail (the _numa_lock at index
+ * _numa_lock->numa_nodes).
+ *
+ * If the previous tail was 0 the lock is taken immediately; otherwise the
+ * node links itself behind the previous tail and spins until ->locked is
+ * changed away from LOCK_NUMALOCK. Always returns 0. @qslock is not used
+ * here.
+ */
+static int _zx_numa_osq_lock(struct optimistic_spin_queue *qslock, int cpu,
+ struct _numa_lock *_numa_lock)
+{
+ int numacpu = cpu >> _numa_lock->shift;
+ int numacurr = encode_cpu(numacpu);
+
+ struct optimistic_spin_node *node = &(_numa_lock + numacpu)->osq_node;
+ struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+ struct optimistic_spin_node *prevnode = NULL;
+ int prev = 0;
+
+ node->next = NULL;
+ node->locked = LOCK_NUMALOCK;
+ node->cpu = numacurr;
+
+ prev = atomic_xchg(&numa_lock->tail, numacurr);
+ /* Previous tail of 0: nobody was queued, the numa lock is ours. */
+ if (prev == 0) {
+ node->locked = UNLOCK_NUMALOCK;
+ return 0;
+ }
+
+ /* The tail holds (node index + 1); map it back to that node's osq_node. */
+ prevnode = &(_numa_lock + prev - 1)->osq_node;
+ node->prev = prevnode;
+ smp_mb(); /* node->prev must be set before prevnode->next */
+ WRITE_ONCE(prevnode->next, node);
+
+ while (READ_ONCE(node->locked) == LOCK_NUMALOCK)
+ cpu_relax();
+
+ return 0;
+}
+/*
+ * Two-level osq_lock.
+ *
+ * If the current CPU's node-level lock is flagged as stopping, wait for
+ * the numa-aware lock to stop and fall back to osq_lock(). Otherwise take
+ * the node-level queue first and, when that reports ACQUIRE_NUMALOCK, the
+ * numa-level queue as well.
+ *
+ * Returns false only when the final status is -1, true otherwise (or the
+ * result of osq_lock() on the fallback path).
+ */
+
+inline bool zx_numa_osq_lock(struct optimistic_spin_queue *qslock,
+			     struct _numa_lock *_numa_lock)
+{
+	int cpu = smp_processor_id();
+	struct _numa_lock *node_lock = _numa_lock + (cpu >> _numa_lock->shift);
+	int status;
+
+	if (node_lock->stopping) {
+		zx_numa_turn_osq_waiting(qslock, _numa_lock);
+		return osq_lock(qslock);
+	}
+
+	status = _zx_node_osq_lock(qslock, _numa_lock);
+	if (status == ACQUIRE_NUMALOCK)
+		status = _zx_numa_osq_lock(qslock, smp_processor_id(),
+					   _numa_lock);
+
+	return status != -1;
+}
+/*
+ * Try to close the node-level queue if @ctail is still its tail.
+ *
+ * The 64-bit tail word is expected to read {ptrmask(@qslock), @ctail}. When
+ * it does, it is replaced by {addr, NUMA_UNLOCKED_VAL} if the node is not
+ * stopping (the addr is kept), or by NUMA_LOCKED_VAL if it is stopping.
+ *
+ * Returns non-zero when the replacement happened, 0 otherwise.
+ */
+static int atomic64_checktail_osq(struct optimistic_spin_queue *qslock,
+		struct _numa_lock *node_lock, int ctail)
+{
+	atomic64_t *word = (atomic64_t *) &node_lock->tail;
+	u64 addr = ((u64)ptrmask(qslock)) << 32;
+	u64 expected = addr | ctail;
+	u64 seen = atomic64_read(word);
+	u64 desired;
+
+	if (node_lock->stopping == 0)
+		desired = addr | NUMA_UNLOCKED_VAL;
+	else
+		desired = NUMA_LOCKED_VAL;
+
+	if (seen != expected)
+		return false;
+	return atomic64_cmpxchg_acquire(word, expected, desired) == expected;
+}
+
+/*
+ * Release the node-level queue on behalf of @cpu.
+ *
+ * If this CPU is still the tail, atomic64_checktail_osq() closes the queue
+ * and the function returns. In that case, when the lock is in
+ * NUMALOCKDYNAMIC mode and the numa-level queue was also left empty
+ * (@numa_end == OSQ_UNLOCKED_VAL), the entry's idle field is moved from 0
+ * to 1 to start idle checking in the cleanup workqueue.
+ * Otherwise the function waits for a successor and passes @val to it
+ * through ->locked.
+ */
+static void node_lock_release(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *node_lock, struct optimistic_spin_node *node,
+ int val, int cpu, int numa_end)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(cpu);
+
+ while (1) {
+ if (atomic64_checktail_osq(qslock, node_lock, curr)) {
+ if (qslock->numa_enable == NUMALOCKDYNAMIC) {
+ int index = qslock->index;
+ /* Start numa_lock idle checking in the cleanup workqueue. */
+ if (numa_end == OSQ_UNLOCKED_VAL &&
+ zx_numa_entry[index].idle == 0) {
+ cmpxchg(&zx_numa_entry[index].idle,
+ 0, 1);
+ }
+ }
+ return;
+ }
+ /* Not the tail: hand @val to the successor once it has linked itself. */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, val);
+ return;
+ }
+ }
+ cpu_relax();
+ }
+}
+/*
+ * Release the numa-level queue on behalf of @cpu's node.
+ *
+ * If this node is still the numa-level tail, the tail is reset to
+ * OSQ_UNLOCKED_VAL and OSQ_UNLOCKED_VAL is returned. Otherwise the function
+ * waits for the next node's osq_node to appear in @node->next, marks it
+ * UNLOCK_NUMALOCK and returns 1. @qslock is not used here.
+ */
+static int numa_lock_release(struct optimistic_spin_queue *qslock,
+ struct _numa_lock *numa_lock,
+ struct optimistic_spin_node *node, int cpu)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = cpu >> numa_lock->shift;
+ int numacurr = encode_cpu(curr);
+
+ while (1) {
+ /* Cheap read first; only attempt the cmpxchg when we look like the tail. */
+ if (atomic_read(&numa_lock->tail) == numacurr &&
+ atomic_cmpxchg_acquire(&numa_lock->tail, numacurr,
+ OSQ_UNLOCKED_VAL) == numacurr) {
+ return OSQ_UNLOCKED_VAL;
+ }
+
+ /* A following node is queued: unlock it. */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, UNLOCK_NUMALOCK);
+ return 1;
+ }
+ }
+ cpu_relax();
+ }
+}
+/*
+ * Two-level osq_unlock.
+ *
+ * While the current node's hand-off count is below osq_node_max - 1 and a
+ * waiter is queued behind us on the same node, the lock is passed to that
+ * waiter with the count incremented. In every other case (count limit
+ * reached, or the local queue has no successor yet) the numa-level queue
+ * is released first and then the node-level queue is released with
+ * ACQUIRE_NUMALOCK.
+ */
+inline void zx_numa_osq_unlock(struct optimistic_spin_queue *qslock,
+			       struct _numa_lock *_numa_lock)
+{
+	u32 cpu = smp_processor_id();
+	struct optimistic_spin_node *node = this_cpu_ptr(&osq_cpu_node);
+	int numa = cpu >> _numa_lock->shift;
+	struct _numa_lock *numa_lock = _numa_lock + _numa_lock->numa_nodes;
+	struct _numa_lock *node_lock = _numa_lock + numa;
+	struct optimistic_spin_node *numa_node = &node_lock->osq_node;
+	int cur_count = READ_ONCE(node->locked);
+	int numa_end;
+
+	if (cur_count < osq_node_max - 1) {
+		/* Still within the local budget: try the next local waiter. */
+		struct optimistic_spin_node *next = xchg(&node->next, NULL);
+
+		if (next) {
+			WRITE_ONCE(next->locked, cur_count + 1);
+			return;
+		}
+	}
+
+	/* Budget exhausted or local queue at its end: move to the next node. */
+	numa_end = numa_lock_release(qslock, numa_lock, numa_node, cpu);
+	node_lock_release(qslock, node_lock, node, ACQUIRE_NUMALOCK,
+			  cpu, numa_end);
+}
--
2.34.1
© 2016 - 2024 Red Hat, Inc.