[PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function

Bibo Mao posted 3 patches 2 weeks, 6 days ago
There is a newer version of this series
[PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function
Posted by Bibo Mao 2 weeks, 6 days ago
vcpu_is_preempted() is performance sensitive and is called in function
osq_lock(), so implement it as a macro. That way the parameter is not
evaluated most of the time, which avoids cache line thrashing across NUMA nodes.

Here is part of the unixbench result on a 3C5000 DualWay machine with 32
cores and 2 NUMA nodes.
          origin    with patch    CONFIG_PARAVIRT disabled
execl     6871.9    7134.2        7190.8
fstime    425.5     959.9         956.1

From the test result, the macro method is almost the same as with
CONFIG_PARAVIRT disabled, and there is some improvement compared with
the function method.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
---
 arch/loongarch/include/asm/qspinlock.h | 27 +++++++++++++++++++++-----
 arch/loongarch/kernel/paravirt.c       | 15 ++------------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
index 66244801db67..b5d7a038faf1 100644
--- a/arch/loongarch/include/asm/qspinlock.h
+++ b/arch/loongarch/include/asm/qspinlock.h
@@ -5,8 +5,10 @@
 #include <linux/jump_label.h>
 
 #ifdef CONFIG_PARAVIRT
-
+#include <asm/kvm_para.h>
 DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+DECLARE_STATIC_KEY_FALSE(virt_preempt_key);
+DECLARE_PER_CPU(struct kvm_steal_time, steal_time);
 
 #define virt_spin_lock virt_spin_lock
 
@@ -34,10 +36,25 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
 	return true;
 }
 
-#define vcpu_is_preempted vcpu_is_preempted
-
-bool vcpu_is_preempted(int cpu);
-
+/*
+ * Macro is better than inline function here
+ * With inline function, parameter cpu is parsed even though it is not used.
+ * This may cause cache line thrashing across NUMA node.
+ * With macro method, parameter cpu is parsed only when it is used.
+ */
+#define vcpu_is_preempted(cpu)							\
+({										\
+	bool __val;								\
+										\
+	if (!static_branch_unlikely(&virt_preempt_key))				\
+		__val = false;							\
+	else {									\
+		struct kvm_steal_time *src;					\
+		src = &per_cpu(steal_time, cpu);				\
+		__val = !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);	\
+	}									\
+	__val;									\
+})
 #endif /* CONFIG_PARAVIRT */
 
 #include <asm-generic/qspinlock.h>
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index b74fe6db49ab..2d1206e486e2 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -10,8 +10,8 @@
 #include <asm/paravirt.h>
 
 static int has_steal_clock;
-static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
-static DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
+DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
 DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
 
 static bool steal_acc = true;
@@ -261,17 +261,6 @@ static int pv_time_cpu_down_prepare(unsigned int cpu)
 	return 0;
 }
 
-bool vcpu_is_preempted(int cpu)
-{
-	struct kvm_steal_time *src;
-
-	if (!static_branch_unlikely(&virt_preempt_key))
-		return false;
-
-	src = &per_cpu(steal_time, cpu);
-	return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-EXPORT_SYMBOL(vcpu_is_preempted);
 #endif
 
 static void pv_cpu_reboot(void *unused)
-- 
2.39.3
Re: [PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function
Posted by Huacai Chen 2 weeks, 4 days ago
Hi, Bibo,

On Tue, Mar 17, 2026 at 5:29 PM Bibo Mao <maobibo@loongson.cn> wrote:
>
> vcpu_is_preempted() is performance sensitive called in function
> osq_lock(), here set it as macro. So that parameter is not parsed
> at most time, it can avoid cache line thrashing across numa node.
>
> Here is part of unixbench result on 3C5000 DualWay machine with 32
> Cores and 2 Numa node.
>           origin    with patch    CONFIG_PARAVIRT disabled
> execl     6871.9    7134.2        7190.8
> fstime    425.5     959.9         956.1
>
> From the test result, with macro method it is almost the same with
> CONFIG_PARAVIRT disabled, and there is some improvment compared with
> function method.
>
> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
> ---
>  arch/loongarch/include/asm/qspinlock.h | 27 +++++++++++++++++++++-----
>  arch/loongarch/kernel/paravirt.c       | 15 ++------------
>  2 files changed, 24 insertions(+), 18 deletions(-)
>
> diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
> index 66244801db67..b5d7a038faf1 100644
> --- a/arch/loongarch/include/asm/qspinlock.h
> +++ b/arch/loongarch/include/asm/qspinlock.h
> @@ -5,8 +5,10 @@
>  #include <linux/jump_label.h>
>
>  #ifdef CONFIG_PARAVIRT
> -
> +#include <asm/kvm_para.h>
>  DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
> +DECLARE_STATIC_KEY_FALSE(virt_preempt_key);
> +DECLARE_PER_CPU(struct kvm_steal_time, steal_time);
>
>  #define virt_spin_lock virt_spin_lock
>
> @@ -34,10 +36,25 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
>         return true;
>  }
>
> -#define vcpu_is_preempted vcpu_is_preempted
> -
> -bool vcpu_is_preempted(int cpu);
> -
> +/*
> + * Macro is better than inline function here
> + * With inline function, parameter cpu is parsed even though it is not used.
> + * This may cause cache line thrashing across NUMA node.
> + * With macro method, parameter cpu is parsed only when it is used.
What is the test result with inline function? If it isn't
significantly worse than with macro, we usually prefer inline
functions.



Huacai

> + */
> +#define vcpu_is_preempted(cpu)                                                 \
> +({                                                                             \
> +       bool __val;                                                             \
> +                                                                               \
> +       if (!static_branch_unlikely(&virt_preempt_key))                         \
> +               __val = false;                                                  \
> +       else {                                                                  \
> +               struct kvm_steal_time *src;                                     \
> +               src = &per_cpu(steal_time, cpu);                                \
> +               __val = !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);     \
> +       }                                                                       \
> +       __val;                                                                  \
> +})
>  #endif /* CONFIG_PARAVIRT */
>
>  #include <asm-generic/qspinlock.h>
> diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
> index b74fe6db49ab..2d1206e486e2 100644
> --- a/arch/loongarch/kernel/paravirt.c
> +++ b/arch/loongarch/kernel/paravirt.c
> @@ -10,8 +10,8 @@
>  #include <asm/paravirt.h>
>
>  static int has_steal_clock;
> -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
> -static DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
> +DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
> +DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
>  DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
>
>  static bool steal_acc = true;
> @@ -261,17 +261,6 @@ static int pv_time_cpu_down_prepare(unsigned int cpu)
>         return 0;
>  }
>
> -bool vcpu_is_preempted(int cpu)
> -{
> -       struct kvm_steal_time *src;
> -
> -       if (!static_branch_unlikely(&virt_preempt_key))
> -               return false;
> -
> -       src = &per_cpu(steal_time, cpu);
> -       return !!(src->preempted & KVM_VCPU_PREEMPTED);
> -}
> -EXPORT_SYMBOL(vcpu_is_preempted);
>  #endif
>
>  static void pv_cpu_reboot(void *unused)
> --
> 2.39.3
>
Re: [PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function
Posted by Bibo Mao 2 weeks, 3 days ago

On 2026/3/19 下午8:48, Huacai Chen wrote:
> Hi, Bibo,
> 
> On Tue, Mar 17, 2026 at 5:29 PM Bibo Mao <maobibo@loongson.cn> wrote:
>>
>> vcpu_is_preempted() is performance sensitive called in function
>> osq_lock(), here set it as macro. So that parameter is not parsed
>> at most time, it can avoid cache line thrashing across numa node.
>>
>> Here is part of unixbench result on 3C5000 DualWay machine with 32
>> Cores and 2 Numa node.
>>            origin    with patch    CONFIG_PARAVIRT disabled
>> execl     6871.9    7134.2        7190.8
>> fstime    425.5     959.9         956.1
>>
>>  From the test result, with macro method it is almost the same with
>> CONFIG_PARAVIRT disabled, and there is some improvment compared with
>> function method.
>>
>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>> ---
>>   arch/loongarch/include/asm/qspinlock.h | 27 +++++++++++++++++++++-----
>>   arch/loongarch/kernel/paravirt.c       | 15 ++------------
>>   2 files changed, 24 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
>> index 66244801db67..b5d7a038faf1 100644
>> --- a/arch/loongarch/include/asm/qspinlock.h
>> +++ b/arch/loongarch/include/asm/qspinlock.h
>> @@ -5,8 +5,10 @@
>>   #include <linux/jump_label.h>
>>
>>   #ifdef CONFIG_PARAVIRT
>> -
>> +#include <asm/kvm_para.h>
>>   DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
>> +DECLARE_STATIC_KEY_FALSE(virt_preempt_key);
>> +DECLARE_PER_CPU(struct kvm_steal_time, steal_time);
>>
>>   #define virt_spin_lock virt_spin_lock
>>
>> @@ -34,10 +36,25 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
>>          return true;
>>   }
>>
>> -#define vcpu_is_preempted vcpu_is_preempted
>> -
>> -bool vcpu_is_preempted(int cpu);
>> -
>> +/*
>> + * Macro is better than inline function here
>> + * With inline function, parameter cpu is parsed even though it is not used.
>> + * This may cause cache line thrashing across NUMA node.
>> + * With macro method, parameter cpu is parsed only when it is used.
> What is the test result with inline function? If it isn't
> significantly worse than with macro, we usually prefer inline
> functions.
In the beginning, I had similar thoughts, and an inline function is 
easier to maintain; however, there is an obvious performance gap between 
the inline and macro methods.

Here is part of unixbench result on 3C5000 DualWay machine with 32
Cores and 2 Numa node, based on 7.0.0-rc4 kernel version. 
vcpu_is_preempted() is implemented with different methods.
          origin (function)   inline        macro
execl     7025.7             6991.2        7242.3
fstime    474.6              703.1         1071.1

Regards
Bibo Mao
> 
> 
> 
> Huacai
> 
>> + */
>> +#define vcpu_is_preempted(cpu)                                                 \
>> +({                                                                             \
>> +       bool __val;                                                             \
>> +                                                                               \
>> +       if (!static_branch_unlikely(&virt_preempt_key))                         \
>> +               __val = false;                                                  \
>> +       else {                                                                  \
>> +               struct kvm_steal_time *src;                                     \
>> +               src = &per_cpu(steal_time, cpu);                                \
>> +               __val = !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);     \
>> +       }                                                                       \
>> +       __val;                                                                  \
>> +})
>>   #endif /* CONFIG_PARAVIRT */
>>
>>   #include <asm-generic/qspinlock.h>
>> diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
>> index b74fe6db49ab..2d1206e486e2 100644
>> --- a/arch/loongarch/kernel/paravirt.c
>> +++ b/arch/loongarch/kernel/paravirt.c
>> @@ -10,8 +10,8 @@
>>   #include <asm/paravirt.h>
>>
>>   static int has_steal_clock;
>> -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
>> -static DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
>> +DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
>> +DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
>>   DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
>>
>>   static bool steal_acc = true;
>> @@ -261,17 +261,6 @@ static int pv_time_cpu_down_prepare(unsigned int cpu)
>>          return 0;
>>   }
>>
>> -bool vcpu_is_preempted(int cpu)
>> -{
>> -       struct kvm_steal_time *src;
>> -
>> -       if (!static_branch_unlikely(&virt_preempt_key))
>> -               return false;
>> -
>> -       src = &per_cpu(steal_time, cpu);
>> -       return !!(src->preempted & KVM_VCPU_PREEMPTED);
>> -}
>> -EXPORT_SYMBOL(vcpu_is_preempted);
>>   #endif
>>
>>   static void pv_cpu_reboot(void *unused)
>> --
>> 2.39.3
>>

Re: [PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function
Posted by Huacai Chen 1 week ago
On Fri, Mar 20, 2026 at 11:01 AM Bibo Mao <maobibo@loongson.cn> wrote:
>
>
>
> On 2026/3/19 下午8:48, Huacai Chen wrote:
> > Hi, Bibo,
> >
> > On Tue, Mar 17, 2026 at 5:29 PM Bibo Mao <maobibo@loongson.cn> wrote:
> >>
> >> vcpu_is_preempted() is performance sensitive called in function
> >> osq_lock(), here set it as macro. So that parameter is not parsed
> >> at most time, it can avoid cache line thrashing across numa node.
> >>
> >> Here is part of unixbench result on 3C5000 DualWay machine with 32
> >> Cores and 2 Numa node.
> >>            origin    with patch    CONFIG_PARAVIRT disabled
> >> execl     6871.9    7134.2        7190.8
> >> fstime    425.5     959.9         956.1
> >>
> >>  From the test result, with macro method it is almost the same with
> >> CONFIG_PARAVIRT disabled, and there is some improvment compared with
> >> function method.
> >>
> >> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
> >> ---
> >>   arch/loongarch/include/asm/qspinlock.h | 27 +++++++++++++++++++++-----
> >>   arch/loongarch/kernel/paravirt.c       | 15 ++------------
> >>   2 files changed, 24 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
> >> index 66244801db67..b5d7a038faf1 100644
> >> --- a/arch/loongarch/include/asm/qspinlock.h
> >> +++ b/arch/loongarch/include/asm/qspinlock.h
> >> @@ -5,8 +5,10 @@
> >>   #include <linux/jump_label.h>
> >>
> >>   #ifdef CONFIG_PARAVIRT
> >> -
> >> +#include <asm/kvm_para.h>
> >>   DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
> >> +DECLARE_STATIC_KEY_FALSE(virt_preempt_key);
> >> +DECLARE_PER_CPU(struct kvm_steal_time, steal_time);
> >>
> >>   #define virt_spin_lock virt_spin_lock
> >>
> >> @@ -34,10 +36,25 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
> >>          return true;
> >>   }
> >>
> >> -#define vcpu_is_preempted vcpu_is_preempted
> >> -
> >> -bool vcpu_is_preempted(int cpu);
> >> -
> >> +/*
> >> + * Macro is better than inline function here
> >> + * With inline function, parameter cpu is parsed even though it is not used.
> >> + * This may cause cache line thrashing across NUMA node.
> >> + * With macro method, parameter cpu is parsed only when it is used.
> > What is the test result with inline function? If it isn't
> > significantly worse than with macro, we usually prefer inline
> > functions.
> In the beginning, I had the similar thoughts. And inline function is
> better to maintain, however here is obvious performance gap between
> inline and macro method.
Strange, try __always_inline to replace inline? Maybe inline fails for
some cases?


Huacai

>
> Here is part of unixbench result on 3C5000 DualWay machine with 32
> Cores and 2 Numa node, based on 7.0.0-rc4 kernel version.
> vcpu_is_preempted() is implemented with different methods.
>           origin (function)   inline        macro
> execl     7025.7             6991.2        7242.3
> fstime    474.6              703.1         1071.1
>
> Regards
> Bibo Mao
> >
> >
> >
> > Huacai
> >
> >> + */
> >> +#define vcpu_is_preempted(cpu)                                                 \
> >> +({                                                                             \
> >> +       bool __val;                                                             \
> >> +                                                                               \
> >> +       if (!static_branch_unlikely(&virt_preempt_key))                         \
> >> +               __val = false;                                                  \
> >> +       else {                                                                  \
> >> +               struct kvm_steal_time *src;                                     \
> >> +               src = &per_cpu(steal_time, cpu);                                \
> >> +               __val = !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);     \
> >> +       }                                                                       \
> >> +       __val;                                                                  \
> >> +})
> >>   #endif /* CONFIG_PARAVIRT */
> >>
> >>   #include <asm-generic/qspinlock.h>
> >> diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
> >> index b74fe6db49ab..2d1206e486e2 100644
> >> --- a/arch/loongarch/kernel/paravirt.c
> >> +++ b/arch/loongarch/kernel/paravirt.c
> >> @@ -10,8 +10,8 @@
> >>   #include <asm/paravirt.h>
> >>
> >>   static int has_steal_clock;
> >> -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
> >> -static DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
> >> +DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
> >> +DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
> >>   DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
> >>
> >>   static bool steal_acc = true;
> >> @@ -261,17 +261,6 @@ static int pv_time_cpu_down_prepare(unsigned int cpu)
> >>          return 0;
> >>   }
> >>
> >> -bool vcpu_is_preempted(int cpu)
> >> -{
> >> -       struct kvm_steal_time *src;
> >> -
> >> -       if (!static_branch_unlikely(&virt_preempt_key))
> >> -               return false;
> >> -
> >> -       src = &per_cpu(steal_time, cpu);
> >> -       return !!(src->preempted & KVM_VCPU_PREEMPTED);
> >> -}
> >> -EXPORT_SYMBOL(vcpu_is_preempted);
> >>   #endif
> >>
> >>   static void pv_cpu_reboot(void *unused)
> >> --
> >> 2.39.3
> >>
>
>
Re: [PATCH v2 3/3] LoongArch: KVM: Set vcpu_is_preempted() macro rather than function
Posted by Bibo Mao 1 week ago

On 2026/3/30 下午6:00, Huacai Chen wrote:
> On Fri, Mar 20, 2026 at 11:01 AM Bibo Mao <maobibo@loongson.cn> wrote:
>>
>>
>>
>> On 2026/3/19 下午8:48, Huacai Chen wrote:
>>> Hi, Bibo,
>>>
>>> On Tue, Mar 17, 2026 at 5:29 PM Bibo Mao <maobibo@loongson.cn> wrote:
>>>>
>>>> vcpu_is_preempted() is performance sensitive called in function
>>>> osq_lock(), here set it as macro. So that parameter is not parsed
>>>> at most time, it can avoid cache line thrashing across numa node.
>>>>
>>>> Here is part of unixbench result on 3C5000 DualWay machine with 32
>>>> Cores and 2 Numa node.
>>>>             origin    with patch    CONFIG_PARAVIRT disabled
>>>> execl     6871.9    7134.2        7190.8
>>>> fstime    425.5     959.9         956.1
>>>>
>>>>   From the test result, with macro method it is almost the same with
>>>> CONFIG_PARAVIRT disabled, and there is some improvment compared with
>>>> function method.
>>>>
>>>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>>>> ---
>>>>    arch/loongarch/include/asm/qspinlock.h | 27 +++++++++++++++++++++-----
>>>>    arch/loongarch/kernel/paravirt.c       | 15 ++------------
>>>>    2 files changed, 24 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
>>>> index 66244801db67..b5d7a038faf1 100644
>>>> --- a/arch/loongarch/include/asm/qspinlock.h
>>>> +++ b/arch/loongarch/include/asm/qspinlock.h
>>>> @@ -5,8 +5,10 @@
>>>>    #include <linux/jump_label.h>
>>>>
>>>>    #ifdef CONFIG_PARAVIRT
>>>> -
>>>> +#include <asm/kvm_para.h>
>>>>    DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
>>>> +DECLARE_STATIC_KEY_FALSE(virt_preempt_key);
>>>> +DECLARE_PER_CPU(struct kvm_steal_time, steal_time);
>>>>
>>>>    #define virt_spin_lock virt_spin_lock
>>>>
>>>> @@ -34,10 +36,25 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
>>>>           return true;
>>>>    }
>>>>
>>>> -#define vcpu_is_preempted vcpu_is_preempted
>>>> -
>>>> -bool vcpu_is_preempted(int cpu);
>>>> -
>>>> +/*
>>>> + * Macro is better than inline function here
>>>> + * With inline function, parameter cpu is parsed even though it is not used.
>>>> + * This may cause cache line thrashing across NUMA node.
>>>> + * With macro method, parameter cpu is parsed only when it is used.
>>> What is the test result with inline function? If it isn't
>>> significantly worse than with macro, we usually prefer inline
>>> functions.
>> In the beginning, I had the similar thoughts. And inline function is
>> better to maintain, however here is obvious performance gap between
>> inline and macro method.
> Strange, try __always_inline to replace inline? Maybe inline fails for
> some cases?
__always_inline is the same as inline here.

Consider the following statement in osq_lock(). With the inline method, 
the parameter node_cpu(node->prev) is evaluated and placed in a temporary 
register at the beginning, before the inline function is called, so there 
is one extra memory access.

I asked the LLVM developers, and they say that the LLVM framework itself 
does this; it is not arch-specific behavior. inline is actually different from a macro.

However, with the macro method, the parameter node_cpu(node->prev) is not 
evaluated until virt_preempt_key is set to true.

if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
                                vcpu_is_preempted(node_cpu(node->prev))))

Regards
Bibo Mao
> 
> 
> Huacai
> 
>>
>> Here is part of unixbench result on 3C5000 DualWay machine with 32
>> Cores and 2 Numa node, based on 7.0.0-rc4 kernel version.
>> vcpu_is_preempted() is implemented with different methods.
>>            origin (function)   inline        macro
>> execl     7025.7             6991.2        7242.3
>> fstime    474.6              703.1         1071.1
>>
>> Regards
>> Bibo Mao
>>>
>>>
>>>
>>> Huacai
>>>
>>>> + */
>>>> +#define vcpu_is_preempted(cpu)                                                 \
>>>> +({                                                                             \
>>>> +       bool __val;                                                             \
>>>> +                                                                               \
>>>> +       if (!static_branch_unlikely(&virt_preempt_key))                         \
>>>> +               __val = false;                                                  \
>>>> +       else {                                                                  \
>>>> +               struct kvm_steal_time *src;                                     \
>>>> +               src = &per_cpu(steal_time, cpu);                                \
>>>> +               __val = !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);     \
>>>> +       }                                                                       \
>>>> +       __val;                                                                  \
>>>> +})
>>>>    #endif /* CONFIG_PARAVIRT */
>>>>
>>>>    #include <asm-generic/qspinlock.h>
>>>> diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
>>>> index b74fe6db49ab..2d1206e486e2 100644
>>>> --- a/arch/loongarch/kernel/paravirt.c
>>>> +++ b/arch/loongarch/kernel/paravirt.c
>>>> @@ -10,8 +10,8 @@
>>>>    #include <asm/paravirt.h>
>>>>
>>>>    static int has_steal_clock;
>>>> -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
>>>> -static DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
>>>> +DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
>>>> +DEFINE_STATIC_KEY_FALSE(virt_preempt_key);
>>>>    DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
>>>>
>>>>    static bool steal_acc = true;
>>>> @@ -261,17 +261,6 @@ static int pv_time_cpu_down_prepare(unsigned int cpu)
>>>>           return 0;
>>>>    }
>>>>
>>>> -bool vcpu_is_preempted(int cpu)
>>>> -{
>>>> -       struct kvm_steal_time *src;
>>>> -
>>>> -       if (!static_branch_unlikely(&virt_preempt_key))
>>>> -               return false;
>>>> -
>>>> -       src = &per_cpu(steal_time, cpu);
>>>> -       return !!(src->preempted & KVM_VCPU_PREEMPTED);
>>>> -}
>>>> -EXPORT_SYMBOL(vcpu_is_preempted);
>>>>    #endif
>>>>
>>>>    static void pv_cpu_reboot(void *unused)
>>>> --
>>>> 2.39.3
>>>>
>>
>>