Patch V3 changes the while loop to a for loop, following Dev's
suggestion. In addition, to improve efficiency, the macro no longer
defines its own local variables; since the macro is only used within
contpte_ptep_get(), this introduces no additional risk.

To verify the optimization in Patch V3, a test function has been
designed: by repeatedly calling mlock() in a loop, the kernel is made to
call contpte_ptep_get() extensively, exercising the optimized path.
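To make the discussion concrete, the shape of the optimization is
roughly the following (a minimal sketch of the early-termination idea
only, not the exact V3 hunk; contpte_align_down(), __ptep_get(),
pte_dirty()/pte_mkdirty() and pte_young()/pte_mkyoung() are the existing
arm64 helpers):

pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
{
        pte_t pte;
        int i;

        ptep = contpte_align_down(ptep);

        for (i = 0; i < CONT_PTES; i++, ptep++) {
                pte = __ptep_get(ptep);

                if (pte_dirty(pte))
                        orig_pte = pte_mkdirty(orig_pte);
                if (pte_young(pte))
                        orig_pte = pte_mkyoung(orig_pte);

                /*
                 * Once both bits have been accumulated into orig_pte,
                 * scanning the remaining PTEs of the contig range
                 * cannot change the result, so stop early.
                 */
                if (pte_dirty(orig_pte) && pte_young(orig_pte))
                        break;
        }

        return orig_pte;
}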
The function's execution time and instruction statistics have been
traced using perf; the results below were collected on a Qualcomm mobile
phone SoC:
Instruction Statistics - Before Optimization
#          count  event_name               # count / runtime
      20,814,352  branch-load-misses       # 662.244 K/sec
  41,894,986,323  branch-loads             # 1.333 G/sec
       1,957,415  iTLB-load-misses         # 62.278 K/sec
  49,872,282,100  iTLB-loads               # 1.587 G/sec
     302,808,096  L1-icache-load-misses    # 9.634 M/sec
  49,872,282,100  L1-icache-loads          # 1.587 G/sec

Total test time: 31.485237 seconds.

Instruction Statistics - After Optimization
#          count  event_name               # count / runtime
      19,340,524  branch-load-misses       # 688.753 K/sec
  38,510,185,183  branch-loads             # 1.371 G/sec
       1,812,716  iTLB-load-misses         # 64.554 K/sec
  47,673,923,151  iTLB-loads               # 1.698 G/sec
     675,853,661  L1-icache-load-misses    # 24.068 M/sec
  47,673,923,151  L1-icache-loads          # 1.698 G/sec

Total test time: 28.108048 seconds.
Function Statistics - Before Optimization
Arch: arm64
Event: cpu-cycles (type 0, config 0)
Samples: 1419716
Event count: 99618088900
Overhead Symbol
21.42% lock_release
21.26% lock_acquire
20.88% arch_counter_get_cntvct
14.32% _raw_spin_unlock_irq
6.79% contpte_ptep_get
2.20% test_contpte_perf
1.82% follow_page_pte
0.97% lock_acquired
0.97% rcu_is_watching
0.89% mlock_pte_range
0.84% sched_clock_noinstr
0.70% handle_softirqs.llvm.8218488130471452153
0.58% test_preempt_disable_long
0.57% _raw_spin_unlock_irqrestore
0.54% arch_stack_walk
0.51% vm_normal_folio
0.48% check_preemption_disabled
0.47% stackinfo_get_task
0.36% try_grab_folio
0.34% preempt_count
0.32% trace_preempt_on
0.29% trace_preempt_off
0.24% debug_smp_processor_id
Function Statistics - After Optimization
Arch: arm64
Event: cpu-cycles (type 0, config 0)
Samples: 1431006
Event count: 118856425042
Overhead Symbol
22.59% lock_release
22.13% arch_counter_get_cntvct
22.08% lock_acquire
15.32% _raw_spin_unlock_irq
2.26% test_contpte_perf
1.50% follow_page_pte
1.49% arch_stack_walk
1.30% rcu_is_watching
1.09% lock_acquired
1.07% sched_clock_noinstr
0.88% handle_softirqs.llvm.12507768597002095717
0.88% trace_preempt_off
0.76% _raw_spin_unlock_irqrestore
0.61% check_preemption_disabled
0.52% trace_preempt_on
0.50% mlock_pte_range
0.43% try_grab_folio
0.41% folio_mark_accessed
0.40% vm_normal_folio
0.38% test_preempt_disable_long
0.28% contpte_ptep_get
0.27% __traceiter_android_rvh_preempt_disable
0.26% debug_smp_processor_id
0.24% return_address
0.20% __pte_offset_map_lock
0.19% unwind_next_frame_record
If there is no problem with my test program, the results show a
significant improvement both in the overall instruction count and in the
time spent in contpte_ptep_get(): total test time drops from 31.49 s to
28.11 s (about 10.7%), and contpte_ptep_get() falls from 6.79% to 0.28%
of sampled cycles.

If any reviewers have time, please also test it on your machines for
comparison. THP and hugepages-64kB were enabled for these runs.
Test Function:
---
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define PAGE_SIZE 4096
#define CONT_PTES 16
#define TEST_SIZE (4096 * CONT_PTES * PAGE_SIZE)

/* Touch one byte per page: the write dirties each PTE and the read
 * keeps it young, which is the best case for the optimized loop. */
void rwdata(char *buf)
{
        for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
                buf[i] = 'a';
                volatile char c = buf[i];
                (void)c;
        }
}

void test_contpte_perf(void)
{
        char *buf;
        int ret = posix_memalign((void **)&buf, PAGE_SIZE, TEST_SIZE);

        if (ret != 0) {
                /* posix_memalign() returns the error instead of setting
                 * errno, so report ret directly rather than via perror(). */
                fprintf(stderr, "posix_memalign failed: %d\n", ret);
                exit(EXIT_FAILURE);
        }

        rwdata(buf);

        for (int j = 0; j < 500; j++) {
                mlock(buf, TEST_SIZE);
                rwdata(buf);
                munlock(buf, TEST_SIZE);
        }

        free(buf);
}
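
/*
 * Hypothetical driver (an addition for illustration, not part of the
 * original test) so the snippet builds and runs standalone, e.g. under
 * perf stat or perf record:
 */
int main(void)
{
        test_contpte_perf();
        return 0;
}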
---
Xavier (1):
mm/contpte: Optimize loop to reduce redundant operations
arch/arm64/mm/contpte.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
--
2.34.1
On 15/04/2025 09:22, Xavier wrote:
> Patch V3 has changed the while loop to a for loop according to the suggestions
> of Dev. Meanwhile, to improve efficiency, the definition of local variables has
> been removed. This macro is only used within the current function and there
> will be no additional risks. In order to verify the optimization performance of
> Patch V3, a test function has been designed. By repeatedly calling mlock in a
> loop, the kernel is made to call contpte_ptep_get extensively to test the
> optimization effect of this function.
> The function's execution time and instruction statistics have been traced using
> perf, and the following are the operation results on a certain Qualcomm mobile
> phone chip:
Xavier, for some reason your emails aren't hitting my inbox - I'm only seeing
the replies from others. I'll monitor lore but apologies if I'm slow to respond
- that's the reason.
Please start the first line of the commit with "arm64/mm" instead of "mm/contpte".
Also I noticed that Andrew put this into mm-new last night. I'd prefer that this
go via the arm64 tree, if we decide we want it.
>
> Instruction Statistics - Before Optimization
> # count event_name # count / runtime
> 20,814,352 branch-load-misses # 662.244 K/sec
> 41,894,986,323 branch-loads # 1.333 G/sec
> 1,957,415 iTLB-load-misses # 62.278 K/sec
> 49,872,282,100 iTLB-loads # 1.587 G/sec
> 302,808,096 L1-icache-load-misses # 9.634 M/sec
> 49,872,282,100 L1-icache-loads # 1.587 G/sec
>
> Total test time: 31.485237 seconds.
>
> Instruction Statistics - After Optimization
> # count event_name # count / runtime
> 19,340,524 branch-load-misses # 688.753 K/sec
> 38,510,185,183 branch-loads # 1.371 G/sec
> 1,812,716 iTLB-load-misses # 64.554 K/sec
> 47,673,923,151 iTLB-loads # 1.698 G/sec
> 675,853,661 L1-icache-load-misses # 24.068 M/sec
> 47,673,923,151 L1-icache-loads # 1.698 G/sec
>
> Total test time: 28.108048 seconds.
>
> Function Statistics - Before Optimization
> Arch: arm64
> Event: cpu-cycles (type 0, config 0)
> Samples: 1419716
> Event count: 99618088900
>
> Overhead Symbol
> 21.42% lock_release
> 21.26% lock_acquire
> 20.88% arch_counter_get_cntvct
> 14.32% _raw_spin_unlock_irq
> 6.79% contpte_ptep_get
> 2.20% test_contpte_perf
> 1.82% follow_page_pte
> 0.97% lock_acquired
> 0.97% rcu_is_watching
> 0.89% mlock_pte_range
> 0.84% sched_clock_noinstr
> 0.70% handle_softirqs.llvm.8218488130471452153
> 0.58% test_preempt_disable_long
> 0.57% _raw_spin_unlock_irqrestore
> 0.54% arch_stack_walk
> 0.51% vm_normal_folio
> 0.48% check_preemption_disabled
> 0.47% stackinfo_get_task
> 0.36% try_grab_folio
> 0.34% preempt_count
> 0.32% trace_preempt_on
> 0.29% trace_preempt_off
> 0.24% debug_smp_processor_id
>
> Function Statistics - After Optimization
> Arch: arm64
> Event: cpu-cycles (type 0, config 0)
> Samples: 1431006
> Event count: 118856425042
>
> Overhead Symbol
> 22.59% lock_release
> 22.13% arch_counter_get_cntvct
> 22.08% lock_acquire
> 15.32% _raw_spin_unlock_irq
> 2.26% test_contpte_perf
> 1.50% follow_page_pte
> 1.49% arch_stack_walk
> 1.30% rcu_is_watching
> 1.09% lock_acquired
> 1.07% sched_clock_noinstr
> 0.88% handle_softirqs.llvm.12507768597002095717
> 0.88% trace_preempt_off
> 0.76% _raw_spin_unlock_irqrestore
> 0.61% check_preemption_disabled
> 0.52% trace_preempt_on
> 0.50% mlock_pte_range
> 0.43% try_grab_folio
> 0.41% folio_mark_accessed
> 0.40% vm_normal_folio
> 0.38% test_preempt_disable_long
> 0.28% contpte_ptep_get
> 0.27% __traceiter_android_rvh_preempt_disable
> 0.26% debug_smp_processor_id
> 0.24% return_address
> 0.20% __pte_offset_map_lock
> 0.19% unwind_next_frame_record
>
> If there is no problem with my test program, it can be seen that there is a
> significant performance improvement both in the overall number of instructions
> and the execution time of contpte_ptep_get.
>
> If any reviewers have time, you can also test it on your machines for comparison.
> I have enabled THP and hugepages-64kB.
>
> Test Function:
> ---
> #define PAGE_SIZE 4096
> #define CONT_PTES 16
> #define TEST_SIZE (4096* CONT_PTES * PAGE_SIZE)
>
> void rwdata(char *buf)
> {
>         for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
>                 buf[i] = 'a';
>                 volatile char c = buf[i];
>         }
> }
> void test_contpte_perf()
> {
>         char *buf;
>         int ret = posix_memalign((void **)&buf, PAGE_SIZE, TEST_SIZE);
>         if (ret != 0) {
>                 perror("posix_memalign failed");
>                 exit(EXIT_FAILURE);
>         }
>
>         rwdata(buf);
>
>         for (int j = 0; j < 500; j++) {
>                 mlock(buf, TEST_SIZE);
>
>                 rwdata(buf);
>
>                 munlock(buf, TEST_SIZE);
This is a microbenchmark in a pathological case and it's showing ~11%
improvement. But in principle I'm OK with it. I have some comments on the
actual change though, which I'll send through against that email.
Thanks,
Ryan
>         }
>
>         free(buf);
> }
> ---
>
> Xavier (1):
> mm/contpte: Optimize loop to reduce redundant operations
>
> arch/arm64/mm/contpte.c | 20 ++++++++++++++++++--
> 1 file changed, 18 insertions(+), 2 deletions(-)
>
Hi Ryan,
At 2025-04-16 20:48:57, "Ryan Roberts" <ryan.roberts@arm.com> wrote:
>On 15/04/2025 09:22, Xavier wrote:
>> Patch V3 has changed the while loop to a for loop according to the suggestions
>> of Dev. Meanwhile, to improve efficiency, the definition of local variables has
>> been removed. This macro is only used within the current function and there
>> will be no additional risks. In order to verify the optimization performance of
>> Patch V3, a test function has been designed. By repeatedly calling mlock in a
>> loop, the kernel is made to call contpte_ptep_get extensively to test the
>> optimization effect of this function.
>> The function's execution time and instruction statistics have been traced using
>> perf, and the following are the operation results on a certain Qualcomm mobile
>> phone chip:
>
>Xavier, for some reason your emails aren't hitting my inbox - I'm only seeing
>the replies from others. I'll monitor lore but appologies if I'm slow to respond
>- that's the reason.
>
>Please start the first line of the commit with "arm64/mm" instead of "mm/contpte".
>
>Also I noticed that Andrew put this into mm-new last night. I'd prefer that this
>go via the arm64 tree, if we decide we want it.
OK, I will change it to "arm64/mm" in the subsequent version.
>>
>> Instruction Statistics - Before Optimization
>> # count event_name # count / runtime
>> 20,814,352 branch-load-misses # 662.244 K/sec
>> 41,894,986,323 branch-loads # 1.333 G/sec
>> 1,957,415 iTLB-load-misses # 62.278 K/sec
>> 49,872,282,100 iTLB-loads # 1.587 G/sec
>> 302,808,096 L1-icache-load-misses # 9.634 M/sec
>> 49,872,282,100 L1-icache-loads # 1.587 G/sec
>>
>> Total test time: 31.485237 seconds.
>>
>> Instruction Statistics - After Optimization
>> # count event_name # count / runtime
>> 19,340,524 branch-load-misses # 688.753 K/sec
>> 38,510,185,183 branch-loads # 1.371 G/sec
>> 1,812,716 iTLB-load-misses # 64.554 K/sec
>> 47,673,923,151 iTLB-loads # 1.698 G/sec
>> 675,853,661 L1-icache-load-misses # 24.068 M/sec
>> 47,673,923,151 L1-icache-loads # 1.698 G/sec
>>
>> Total test time: 28.108048 seconds.
>>
>> Function Statistics - Before Optimization
>> Arch: arm64
>> Event: cpu-cycles (type 0, config 0)
>> Samples: 1419716
>> Event count: 99618088900
>>
>> Overhead Symbol
>> 21.42% lock_release
>> 21.26% lock_acquire
>> 20.88% arch_counter_get_cntvct
>> 14.32% _raw_spin_unlock_irq
>> 6.79% contpte_ptep_get
>> 2.20% test_contpte_perf
>> 1.82% follow_page_pte
>> 0.97% lock_acquired
>> 0.97% rcu_is_watching
>> 0.89% mlock_pte_range
>> 0.84% sched_clock_noinstr
>> 0.70% handle_softirqs.llvm.8218488130471452153
>> 0.58% test_preempt_disable_long
>> 0.57% _raw_spin_unlock_irqrestore
>> 0.54% arch_stack_walk
>> 0.51% vm_normal_folio
>> 0.48% check_preemption_disabled
>> 0.47% stackinfo_get_task
>> 0.36% try_grab_folio
>> 0.34% preempt_count
>> 0.32% trace_preempt_on
>> 0.29% trace_preempt_off
>> 0.24% debug_smp_processor_id
>>
>> Function Statistics - After Optimization
>> Arch: arm64
>> Event: cpu-cycles (type 0, config 0)
>> Samples: 1431006
>> Event count: 118856425042
>>
>> Overhead Symbol
>> 22.59% lock_release
>> 22.13% arch_counter_get_cntvct
>> 22.08% lock_acquire
>> 15.32% _raw_spin_unlock_irq
>> 2.26% test_contpte_perf
>> 1.50% follow_page_pte
>> 1.49% arch_stack_walk
>> 1.30% rcu_is_watching
>> 1.09% lock_acquired
>> 1.07% sched_clock_noinstr
>> 0.88% handle_softirqs.llvm.12507768597002095717
>> 0.88% trace_preempt_off
>> 0.76% _raw_spin_unlock_irqrestore
>> 0.61% check_preemption_disabled
>> 0.52% trace_preempt_on
>> 0.50% mlock_pte_range
>> 0.43% try_grab_folio
>> 0.41% folio_mark_accessed
>> 0.40% vm_normal_folio
>> 0.38% test_preempt_disable_long
>> 0.28% contpte_ptep_get
>> 0.27% __traceiter_android_rvh_preempt_disable
>> 0.26% debug_smp_processor_id
>> 0.24% return_address
>> 0.20% __pte_offset_map_lock
>> 0.19% unwind_next_frame_record
>>
>> If there is no problem with my test program, it can be seen that there is a
>> significant performance improvement both in the overall number of instructions
>> and the execution time of contpte_ptep_get.
>>
>> If any reviewers have time, you can also test it on your machines for comparison.
>> I have enabled THP and hugepages-64kB.
>>
>> Test Function:
>> ---
>> #define PAGE_SIZE 4096
>> #define CONT_PTES 16
>> #define TEST_SIZE (4096* CONT_PTES * PAGE_SIZE)
>>
>> void rwdata(char *buf)
>> {
>>         for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
>>                 buf[i] = 'a';
>>                 volatile char c = buf[i];
>>         }
>> }
>> void test_contpte_perf()
>> {
>>         char *buf;
>>         int ret = posix_memalign((void **)&buf, PAGE_SIZE, TEST_SIZE);
>>         if (ret != 0) {
>>                 perror("posix_memalign failed");
>>                 exit(EXIT_FAILURE);
>>         }
>>
>>         rwdata(buf);
>>
>>         for (int j = 0; j < 500; j++) {
>>                 mlock(buf, TEST_SIZE);
>>
>>                 rwdata(buf);
>>
>>                 munlock(buf, TEST_SIZE);
>
>This is a microbenchmark in a pathological case and it's showing ~11%
>improvement. But in principle I'm ok with it. I have some comments on the actual
>change though, which I'll send through against email.
OK. Please refer to my reply to that email.
Thanks,
Xavier
On Tue, Apr 15, 2025 at 04:22:04PM +0800, Xavier wrote:
> Patch V3 has changed the while loop to a for loop according to the suggestions
> of Dev.

For some reason, my email (office365) rejected all these patches (not
even quarantined), I only got the replies. Anyway, I can get them from
the lore archive.

> Meanwhile, to improve efficiency, the definition of local variables has
> been removed. This macro is only used within the current function and there
> will be no additional risks. In order to verify the optimization performance of
> Patch V3, a test function has been designed. By repeatedly calling mlock in a
> loop, the kernel is made to call contpte_ptep_get extensively to test the
> optimization effect of this function.
> The function's execution time and instruction statistics have been traced using
> perf, and the following are the operation results on a certain Qualcomm mobile
> phone chip:
>
> Instruction Statistics - Before Optimization
> # count event_name # count / runtime
> 20,814,352 branch-load-misses # 662.244 K/sec
> 41,894,986,323 branch-loads # 1.333 G/sec
> 1,957,415 iTLB-load-misses # 62.278 K/sec
> 49,872,282,100 iTLB-loads # 1.587 G/sec
> 302,808,096 L1-icache-load-misses # 9.634 M/sec
> 49,872,282,100 L1-icache-loads # 1.587 G/sec
>
> Total test time: 31.485237 seconds.
>
> Instruction Statistics - After Optimization
> # count event_name # count / runtime
> 19,340,524 branch-load-misses # 688.753 K/sec
> 38,510,185,183 branch-loads # 1.371 G/sec
> 1,812,716 iTLB-load-misses # 64.554 K/sec
> 47,673,923,151 iTLB-loads # 1.698 G/sec
> 675,853,661 L1-icache-load-misses # 24.068 M/sec
> 47,673,923,151 L1-icache-loads # 1.698 G/sec
>
> Total test time: 28.108048 seconds.

We'd need to reproduce these numbers on other platforms as well and with
different page sizes. I hope Ryan can do some tests next week.

Purely looking at the patch, I don't like the complexity. I'd rather go
with your v1 if the numbers are fairly similar (even if slightly slower).

However, I don't trust microbenchmarks like calling mlock() in a loop.
It was hand-crafted to dirty the whole buffer (making ptes young+dirty)
before mlock() to make the best out of the rewritten contpte_ptep_get().
Are there any real world workloads that would benefit from such change?

As it stands, I think this patch needs better justification.

Thanks.

--
Catalin
Hi Catalin,

At 2025-04-16 20:47:46, "Catalin Marinas" <catalin.marinas@arm.com> wrote:
>On Tue, Apr 15, 2025 at 04:22:04PM +0800, Xavier wrote:
>> Patch V3 has changed the while loop to a for loop according to the suggestions
>> of Dev.
>
>For some reason, my email (office365) rejected all these patches (not
>even quarantined), I only got the replies. Anyway, I can get them from
>the lore archive.
>
>> Meanwhile, to improve efficiency, the definition of local variables has
>> been removed. This macro is only used within the current function and there
>> will be no additional risks. In order to verify the optimization performance of
>> Patch V3, a test function has been designed. By repeatedly calling mlock in a
>> loop, the kernel is made to call contpte_ptep_get extensively to test the
>> optimization effect of this function.
>> The function's execution time and instruction statistics have been traced using
>> perf, and the following are the operation results on a certain Qualcomm mobile
>> phone chip:
>>
>> Instruction Statistics - Before Optimization
>> # count event_name # count / runtime
>> 20,814,352 branch-load-misses # 662.244 K/sec
>> 41,894,986,323 branch-loads # 1.333 G/sec
>> 1,957,415 iTLB-load-misses # 62.278 K/sec
>> 49,872,282,100 iTLB-loads # 1.587 G/sec
>> 302,808,096 L1-icache-load-misses # 9.634 M/sec
>> 49,872,282,100 L1-icache-loads # 1.587 G/sec
>>
>> Total test time: 31.485237 seconds.
>>
>> Instruction Statistics - After Optimization
>> # count event_name # count / runtime
>> 19,340,524 branch-load-misses # 688.753 K/sec
>> 38,510,185,183 branch-loads # 1.371 G/sec
>> 1,812,716 iTLB-load-misses # 64.554 K/sec
>> 47,673,923,151 iTLB-loads # 1.698 G/sec
>> 675,853,661 L1-icache-load-misses # 24.068 M/sec
>> 47,673,923,151 L1-icache-loads # 1.698 G/sec
>>
>> Total test time: 28.108048 seconds.
>
>We'd need to reproduce these numbers on other platforms as well and with
>different page sizes. I hope Ryan can do some tests next week.

Of course, it would be even better if any reviewers could verify it with
some real-world scenarios.

>
>Purely looking at the patch, I don't like the complexity. I'd rather go
>with your v1 if the numbers are fairly similar (even if slightly slower).

The implementation of this macro is indeed a bit complicated. If it
doesn't affect the performance, I will try to make it simpler.

>
>However, I don't trust microbenchmarks like calling mlock() in a loop.
>It was hand-crafted to dirty the whole buffer (making ptes young+dirty)
>before mlock() to make the best out of the rewritten contpte_ptep_get().
>Are there any real world workloads that would benefit from such change?
>
>As it stands, I think this patch needs better justification.
>

Indeed, I used mlock() because it is an example that can simply achieve
a large number of calls to contpte_ptep_get() in user space. Of course,
there are many other scenarios where this function will be called, such
as madvise() and mprotect(). But essentially there is no difference:
all of them are aimed at verifying the performance improvement brought
by the contpte_ptep_get() function.

It's true that the previous test cases only tested the best-case
scenario by dirtying the data. Next, I will continue to test the impact
of the new patch on performance in scenarios where there are no "young"
and "dirty" flags, or where there is only a "young" flag.

--
Thanks,
Xavier
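[A hypothetical sketch of such a follow-up variant: rodata() is an
illustrative name of mine, and whether the PTEs actually end up
young-but-clean depends on how the pages were populated, so the
resulting PTE state would need to be verified on the target system.]

/* Read-only walk: touches one byte per page without writing, so it
 * should not dirty the PTEs the way rwdata() does. */
void rodata(char *buf)
{
        for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
                volatile char c = buf[i];
                (void)c;
        }
}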
Please try to avoid presentation of a [0/N] cover letter when N==1! A
simple singleton patch is better.
On Tue, 15 Apr 2025 16:22:04 +0800 Xavier <xavier_qy@163.com> wrote:
> Patch V3 has changed the while loop to a for loop according to the suggestions
> of Dev. Meanwhile, to improve efficiency, the definition of local variables has
> been removed. This macro is only used within the current function and there
which function?
> will be no additional risks. In order to verify the optimization performance of
> Patch V3, a test function has been designed. By repeatedly calling mlock in a
> loop, the kernel is made to call contpte_ptep_get extensively to test the
> optimization effect of this function.
> The function's execution time and instruction statistics have been traced using
> perf, and the following are the operation results on a certain Qualcomm mobile
> phone chip:
All the words thus far appear to be discussing changes since v2. For
the permanent kernel record, this isn't interesting or useful material.
So please present a standalone description which doesn't refer to
previous iterations.
It's great to present this what-i-changed-since-last-time material, but
that is better placed after the "^---$" separator, after the
Signed-off-by:, Reviewed-by: etc tags.
>
> ...
>
Below is what I came up with for a changelog. Please check?
Optimize contpte_ptep_get() by adding early termination logic. Check if
the dirty and young bits of orig_pte are already set and skip redundant
bit-setting operations during the loop. This reduces unnecessary
iterations and improves performance.
The function's execution time and instruction statistics have been traced
using perf, and the following are the operation results on a certain
Qualcomm mobile phone chip:
Instruction Statistics - Before Optimization
# count event_name # count / runtime
20,814,352 branch-load-misses # 662.244 K/sec
41,894,986,323 branch-loads # 1.333 G/sec
1,957,415 iTLB-load-misses # 62.278 K/sec
49,872,282,100 iTLB-loads # 1.587 G/sec
302,808,096 L1-icache-load-misses # 9.634 M/sec
49,872,282,100 L1-icache-loads # 1.587 G/sec
Total test time: 31.485237 seconds.
Instruction Statistics - After Optimization
# count event_name # count / runtime
19,340,524 branch-load-misses # 688.753 K/sec
38,510,185,183 branch-loads # 1.371 G/sec
1,812,716 iTLB-load-misses # 64.554 K/sec
47,673,923,151 iTLB-loads # 1.698 G/sec
675,853,661 L1-icache-load-misses # 24.068 M/sec
47,673,923,151 L1-icache-loads # 1.698 G/sec
Total test time: 28.108048 seconds.
Function Statistics - Before Optimization
Arch: arm64
Event: cpu-cycles (type 0, config 0)
Samples: 1419716
Event count: 99618088900
Overhead Symbol
21.42% lock_release
21.26% lock_acquire
20.88% arch_counter_get_cntvct
14.32% _raw_spin_unlock_irq
6.79% contpte_ptep_get
2.20% test_contpte_perf
1.82% follow_page_pte
0.97% lock_acquired
0.97% rcu_is_watching
0.89% mlock_pte_range
0.84% sched_clock_noinstr
0.70% handle_softirqs.llvm.8218488130471452153
0.58% test_preempt_disable_long
0.57% _raw_spin_unlock_irqrestore
0.54% arch_stack_walk
0.51% vm_normal_folio
0.48% check_preemption_disabled
0.47% stackinfo_get_task
0.36% try_grab_folio
0.34% preempt_count
0.32% trace_preempt_on
0.29% trace_preempt_off
0.24% debug_smp_processor_id
Function Statistics - After Optimization
Arch: arm64
Event: cpu-cycles (type 0, config 0)
Samples: 1431006
Event count: 118856425042
Overhead Symbol
22.59% lock_release
22.13% arch_counter_get_cntvct
22.08% lock_acquire
15.32% _raw_spin_unlock_irq
2.26% test_contpte_perf
1.50% follow_page_pte
1.49% arch_stack_walk
1.30% rcu_is_watching
1.09% lock_acquired
1.07% sched_clock_noinstr
0.88% handle_softirqs.llvm.12507768597002095717
0.88% trace_preempt_off
0.76% _raw_spin_unlock_irqrestore
0.61% check_preemption_disabled
0.52% trace_preempt_on
0.50% mlock_pte_range
0.43% try_grab_folio
0.41% folio_mark_accessed
0.40% vm_normal_folio
0.38% test_preempt_disable_long
0.28% contpte_ptep_get
0.27% __traceiter_android_rvh_preempt_disable
0.26% debug_smp_processor_id
0.24% return_address
0.20% __pte_offset_map_lock
0.19% unwind_next_frame_record
If there is no problem with the test program, it can be seen that there is
a significant performance improvement both in the overall number of
instructions and in the execution time of contpte_ptep_get().
If any reviewers have time, you can also test it on your machines for comparison.
I have enabled THP and hugepages-64kB.
Test function:
#define PAGE_SIZE 4096
#define CONT_PTES 16
#define TEST_SIZE (4096 * CONT_PTES * PAGE_SIZE)

void rwdata(char *buf)
{
        for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
                buf[i] = 'a';
                volatile char c = buf[i];
        }
}

void test_contpte_perf()
{
        char *buf;
        int ret = posix_memalign((void **)&buf, PAGE_SIZE, TEST_SIZE);
        if (ret != 0) {
                perror("posix_memalign failed");
                exit(EXIT_FAILURE);
        }

        rwdata(buf);

        for (int j = 0; j < 500; j++) {
                mlock(buf, TEST_SIZE);
                rwdata(buf);
                munlock(buf, TEST_SIZE);
        }

        free(buf);
}
Hi Andrew,
At 2025-04-16 10:10:27, "Andrew Morton" <akpm@linux-foundation.org> wrote:
>
>Please try to avoid presentation of a [0/N] cover letter when N==1! A
>simple singleton patch is better.
Got it, I'll keep this in mind for future submissions. Thanks for the reminder!
>
>On Tue, 15 Apr 2025 16:22:04 +0800 Xavier <xavier_qy@163.com> wrote:
>
>> Patch V3 has changed the while loop to a for loop according to the suggestions
>> of Dev. Meanwhile, to improve efficiency, the definition of local variables has
>> been removed. This macro is only used within the current function and there
>
>which function?
It's contpte_ptep_get().
>
>> will be no additional risks. In order to verify the optimization performance of
>> Patch V3, a test function has been designed. By repeatedly calling mlock in a
>> loop, the kernel is made to call contpte_ptep_get extensively to test the
>> optimization effect of this function.
>> The function's execution time and instruction statistics have been traced using
>> perf, and the following are the operation results on a certain Qualcomm mobile
>> phone chip:
>
>All the words thus far appear to be discussing changes since v2. For
>the permanent kernel record, this isn't interesting or useful material.
>So please present a standalone description which doesn't refer to
>previous iterations.
>
>It's great to present this what-i-changed-since-last-time material, but
>that is better placed after the "^---$" separator, after the
>Signed-off-by:, Reviewed-by: etc tags.
>
OK, I will follow this requirement for future submissions.
>>
>> ...
>>
>
>
>Below is what I came up with for a changelog. Please check?
I've reviewed it, and it looks good. Thank you for your revisions!
>
>Optimize contpte_ptep_get() by adding early termination logic. Check if
>the dirty and young bits of orig_pte are already set and skip redundant
>bit-setting operations during the loop. This reduces unnecessary
>iterations and improves performance.
>
>The function's execution time and instruction statistics have been traced
>using perf, and the following are the operation results on a certain
>Qualcomm mobile phone chip:
>
>Instruction Statistics - Before Optimization
># count event_name # count / runtime
> 20,814,352 branch-load-misses # 662.244 K/sec
> 41,894,986,323 branch-loads # 1.333 G/sec
> 1,957,415 iTLB-load-misses # 62.278 K/sec
> 49,872,282,100 iTLB-loads # 1.587 G/sec
> 302,808,096 L1-icache-load-misses # 9.634 M/sec
> 49,872,282,100 L1-icache-loads # 1.587 G/sec
>
>Total test time: 31.485237 seconds.
>
>Instruction Statistics - After Optimization
># count event_name # count / runtime
> 19,340,524 branch-load-misses # 688.753 K/sec
> 38,510,185,183 branch-loads # 1.371 G/sec
> 1,812,716 iTLB-load-misses # 64.554 K/sec
> 47,673,923,151 iTLB-loads # 1.698 G/sec
> 675,853,661 L1-icache-load-misses # 24.068 M/sec
> 47,673,923,151 L1-icache-loads # 1.698 G/sec
>
>Total test time: 28.108048 seconds.
>
>Function Statistics - Before Optimization
>Arch: arm64
>Event: cpu-cycles (type 0, config 0)
>Samples: 1419716
>Event count: 99618088900
>
>Overhead Symbol
>21.42% lock_release
>21.26% lock_acquire
>20.88% arch_counter_get_cntvct
>14.32% _raw_spin_unlock_irq
>6.79% contpte_ptep_get
>2.20% test_contpte_perf
>1.82% follow_page_pte
>0.97% lock_acquired
>0.97% rcu_is_watching
>0.89% mlock_pte_range
>0.84% sched_clock_noinstr
>0.70% handle_softirqs.llvm.8218488130471452153
>0.58% test_preempt_disable_long
>0.57% _raw_spin_unlock_irqrestore
>0.54% arch_stack_walk
>0.51% vm_normal_folio
>0.48% check_preemption_disabled
>0.47% stackinfo_get_task
>0.36% try_grab_folio
>0.34% preempt_count
>0.32% trace_preempt_on
>0.29% trace_preempt_off
>0.24% debug_smp_processor_id
>
>Function Statistics - After Optimization
>Arch: arm64
>Event: cpu-cycles (type 0, config 0)
>Samples: 1431006
>Event count: 118856425042
>
>Overhead Symbol
>22.59% lock_release
>22.13% arch_counter_get_cntvct
>22.08% lock_acquire
>15.32% _raw_spin_unlock_irq
>2.26% test_contpte_perf
>1.50% follow_page_pte
>1.49% arch_stack_walk
>1.30% rcu_is_watching
>1.09% lock_acquired
>1.07% sched_clock_noinstr
>0.88% handle_softirqs.llvm.12507768597002095717
>0.88% trace_preempt_off
>0.76% _raw_spin_unlock_irqrestore
>0.61% check_preemption_disabled
>0.52% trace_preempt_on
>0.50% mlock_pte_range
>0.43% try_grab_folio
>0.41% folio_mark_accessed
>0.40% vm_normal_folio
>0.38% test_preempt_disable_long
>0.28% contpte_ptep_get
>0.27% __traceiter_android_rvh_preempt_disable
>0.26% debug_smp_processor_id
>0.24% return_address
>0.20% __pte_offset_map_lock
>0.19% unwind_next_frame_record
>
>If there is no problem with my test program, it can be seen that there is a
>significant performance improvement both in the overall number of instructions
>and the execution time of contpte_ptep_get.
>
>If any reviewers have time, you can also test it on your machines for comparison.
>I have enabled THP and hugepages-64kB.
>
>Test function:
>
>#define PAGE_SIZE 4096
>#define CONT_PTES 16
>#define TEST_SIZE (4096* CONT_PTES * PAGE_SIZE)
>
>void rwdata(char *buf)
>{
>        for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
>                buf[i] = 'a';
>                volatile char c = buf[i];
>        }
>}
>void test_contpte_perf()
>{
>        char *buf;
>        int ret = posix_memalign((void **)&buf, PAGE_SIZE, TEST_SIZE);
>        if (ret != 0) {
>                perror("posix_memalign failed");
>                exit(EXIT_FAILURE);
>        }
>
>        rwdata(buf);
>
>        for (int j = 0; j < 500; j++) {
>                mlock(buf, TEST_SIZE);
>
>                rwdata(buf);
>
>                munlock(buf, TEST_SIZE);
>        }
>
>        free(buf);
>}
--
Thanks,
Xavier