Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old.
x86 CMPXCHG instruction returns success in ZF flag, so this change
saves a compare after cmpxchg (and related move instruction in
front of cmpxchg).
Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg
fails. There is no need to re-read the value in the loop.
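For illustration, the generic shape of the conversion in loop code is
roughly the following sketch (v and FLAG are placeholder names, not
code from this patch):

/* Before: the old value is re-read and compared on every iteration. */
do {
	old = atomic_read(&v);
	new = old | FLAG;
} while (atomic_cmpxchg(&v, old, new) != old);

/*
 * After: on failure, atomic_try_cmpxchg() stores the observed value
 * back into "old", so a single read before the loop suffices, and on
 * x86 the separate compare disappears (ZF already holds the result).
 */
old = atomic_read(&v);
do {
	new = old | FLAG;
} while (!atomic_try_cmpxchg(&v, &old, new));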
No functional change intended.
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
kernel/trace/ring_buffer.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4188af7d4cfe..8f0ef7d12ddd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1493,14 +1493,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
- unsigned long ret;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
- ret = cmpxchg(ptr, val, (unsigned long)&new->list);
-
- return ret == val;
+ return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
}
/*
@@ -2055,7 +2052,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
retries = 10;
success = false;
while (retries--) {
- struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
@@ -2073,9 +2070,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
- r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
- if (r == head_page_with_bit) {
+ if (try_cmpxchg(&prev_page->next,
+ &head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
@@ -4061,10 +4057,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd | RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
@@ -4084,10 +4080,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd & ~RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
--
2.39.2
On Tue, 28 Feb 2023 18:59:29 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old.
> x86 CMPXCHG instruction returns success in ZF flag, so this change
> saves a compare after cmpxchg (and related move instruction in
> front of cmpxchg).
>
> Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg
> fails. There is no need to re-read the value in the loop.
>
> No functional change intended.
As I mentioned in the RCU thread, I have issues with some of the changes
here.
>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> ---
> kernel/trace/ring_buffer.c | 20 ++++++++------------
> 1 file changed, 8 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 4188af7d4cfe..8f0ef7d12ddd 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -1493,14 +1493,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
> {
> unsigned long *ptr = (unsigned long *)&old->list.prev->next;
> unsigned long val;
> - unsigned long ret;
>
> val = *ptr & ~RB_FLAG_MASK;
> val |= RB_PAGE_HEAD;
>
> - ret = cmpxchg(ptr, val, (unsigned long)&new->list);
> -
> - return ret == val;
> + return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
No, val should not be updated.
> }
>
> /*
> @@ -2055,7 +2052,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> retries = 10;
> success = false;
> while (retries--) {
> - struct list_head *head_page, *prev_page, *r;
> + struct list_head *head_page, *prev_page;
> struct list_head *last_page, *first_page;
> struct list_head *head_page_with_bit;
>
> @@ -2073,9 +2070,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> last_page->next = head_page_with_bit;
> first_page->prev = prev_page;
>
> - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
> -
> - if (r == head_page_with_bit) {
> + if (try_cmpxchg(&prev_page->next,
> + &head_page_with_bit, first_page)) {
No. head_page_with_bit should not be updated.
> /*
> * yay, we replaced the page pointer to our new list,
> * now, we just have to update to head page's prev
> @@ -4061,10 +4057,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer)
> unsigned int rd;
> unsigned int new_rd;
>
> + rd = atomic_read(&buffer->record_disabled);
> do {
> - rd = atomic_read(&buffer->record_disabled);
> new_rd = rd | RB_BUFFER_OFF;
> - } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
> + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
This change is OK.
> }
> EXPORT_SYMBOL_GPL(ring_buffer_record_off);
>
> @@ -4084,10 +4080,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer)
> unsigned int rd;
> unsigned int new_rd;
>
> + rd = atomic_read(&buffer->record_disabled);
> do {
> - rd = atomic_read(&buffer->record_disabled);
> new_rd = rd & ~RB_BUFFER_OFF;
> - } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
> + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
And so is this one.
So I will not take this patch as is.
-- Steve
> }
> EXPORT_SYMBOL_GPL(ring_buffer_record_on);
>
On Tue, Feb 28, 2023 at 10:43 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 28 Feb 2023 18:59:29 +0100
> Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old.
> > x86 CMPXCHG instruction returns success in ZF flag, so this change
> > saves a compare after cmpxchg (and related move instruction in
> > front of cmpxchg).
> >
> > Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg
> > fails. There is no need to re-read the value in the loop.
> >
> > No functional change intended.
>
> As I mentioned in the RCU thread, I have issues with some of the changes
> here.
>
> >
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Masami Hiramatsu <mhiramat@kernel.org>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> > kernel/trace/ring_buffer.c | 20 ++++++++------------
> > 1 file changed, 8 insertions(+), 12 deletions(-)
> >
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 4188af7d4cfe..8f0ef7d12ddd 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -1493,14 +1493,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
> > {
> > unsigned long *ptr = (unsigned long *)&old->list.prev->next;
> > unsigned long val;
> > - unsigned long ret;
> >
> > val = *ptr & ~RB_FLAG_MASK;
> > val |= RB_PAGE_HEAD;
> >
> > - ret = cmpxchg(ptr, val, (unsigned long)&new->list);
> > -
> > - return ret == val;
> > + return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
>
> No, val should not be updated.
Please see the definition of try_cmpxchg. The definition is written in
such a way that benefits loops as well as linear code and in the latter
case depends on the compiler to eliminate the assignment to val as a dead
assignment.
The above change was done under the assumption that val is unused
after try_cmpxchg, and can be considered a temporary.
[Alternatively, the value could be copied to a local temporary and a
pointer to this local temporary could be passed to try_cmpxchg
instead. Compiler is smart enough to eliminate the assignment in any
case.]
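For reference, the generic fallback is shaped roughly as follows (a
simplified sketch, not the verbatim kernel macro):

#define try_cmpxchg(_ptr, _oldp, _new)					\
({									\
	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r;		\
	___r = cmpxchg((_ptr), ___o, (_new));				\
	if (unlikely(___r != ___o))					\
		*___op = ___r;	/* failed: report the observed value */\
	likely(___r == ___o);						\
})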
Even in the linear code, the change has considerable effect.
rb_head_page_replace is inlined in rb_get_reader_page and gcc-10.3.1
improves code from:
ef8: 48 8b 0e mov (%rsi),%rcx
efb: 48 83 e1 fc and $0xfffffffffffffffc,%rcx
eff: 48 83 c9 01 or $0x1,%rcx
f03: 48 89 c8 mov %rcx,%rax
f06: f0 48 0f b1 3e lock cmpxchg %rdi,(%rsi)
f0b: 48 39 c1 cmp %rax,%rcx
f0e: 74 3b je f4b <rb_get_reader_page+0x13b>
to:
ed8: 48 8b 01 mov (%rcx),%rax
edb: 48 83 e0 fc and $0xfffffffffffffffc,%rax
edf: 48 83 c8 01 or $0x1,%rax
ee3: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
ee8: 74 3b je f25 <rb_get_reader_page+0x135>
Again, even in linear code the change is able to eliminate the
assignment to a temporary reg and the compare. Please note that there
is no move *from* %rax register after cmpxchg, so the compiler
correctly eliminated unused assignment.
>
> > }
> >
> > /*
> > @@ -2055,7 +2052,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > retries = 10;
> > success = false;
> > while (retries--) {
> > - struct list_head *head_page, *prev_page, *r;
> > + struct list_head *head_page, *prev_page;
> > struct list_head *last_page, *first_page;
> > struct list_head *head_page_with_bit;
> >
> > @@ -2073,9 +2070,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > last_page->next = head_page_with_bit;
> > first_page->prev = prev_page;
> >
> > - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
> > -
> > - if (r == head_page_with_bit) {
> > + if (try_cmpxchg(&prev_page->next,
> > + &head_page_with_bit, first_page)) {
>
> No. head_page_with_bit should not be updated.
As above, head_page_with_bit should be considered a temporary; it
is initialized a couple of lines above cmpxchg and unused after. The
gcc-10.3.1 compiler even found some more optimization opportunities
and reordered the code from:
1364: 4d 8b 86 38 01 00 00 mov 0x138(%r14),%r8
136b: 48 83 ce 01 or $0x1,%rsi
136f: 48 89 f0 mov %rsi,%rax
1372: 49 89 30 mov %rsi,(%r8)
1375: 48 89 4f 08 mov %rcx,0x8(%rdi)
1379: f0 48 0f b1 39 lock cmpxchg %rdi,(%rcx)
137e: 48 39 c6 cmp %rax,%rsi
1381: 74 78 je 13fb <rb_insert_pages+0xdb>
to:
1343: 48 83 c8 01 or $0x1,%rax
1347: 48 8b bb 38 01 00 00 mov 0x138(%rbx),%rdi
134e: 48 89 07 mov %rax,(%rdi)
1351: 48 89 4e 08 mov %rcx,0x8(%rsi)
1355: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
135a: 41 0f 94 c7 sete %r15b
135e: 75 2f jne 138f <rb_insert_pages+0x8f>
Please also note SETE insn in the above code, this is how the
"success" variable is handled in the loop. So, besides code size
improvement, other secondary improvements can be expected from the
change, too.
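In C terms, the loop shape that lets the compiler feed the ZF result
straight into "success" is roughly this sketch (setup details elided):

bool success = false;

while (retries--) {
	/* ... recompute head_page_with_bit and first_page ... */
	success = try_cmpxchg(&prev_page->next,
			      &head_page_with_bit, first_page);
	if (success)
		break;
}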
I think that the above examples demonstrate various improvements that
can be achieved with effectively a one-line, almost mechanical change
to the code, even in linear code. It would be unfortunate to not
consider them.
Uros.
On Wed, 1 Mar 2023 10:37:28 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> > No, val should not be updated.
>
> Please see the definition of try_cmpxchg. The definition is written in
> such a way that benefits loops as well as linear code and in the latter
> case depends on the compiler to eliminate the assignment to val as a dead
> assignment.
I did read it ;-)
>
> The above change was done under the assumption that val is unused
> after try_cmpxchg, and can be considered a temporary.
> [Alternatively, the value could be copied to a local temporary and a
> pointer to this local temporary could be passed to try_cmpxchg
> instead. Compiler is smart enough to eliminate the assignment in any
> case.]
>
> Even in the linear code, the change has considerable effect.
> rb_head_page_replace is inlined in rb_get_reader_page and gcc-10.3.1
> improves code from:
>
> ef8: 48 8b 0e mov (%rsi),%rcx
> efb: 48 83 e1 fc and $0xfffffffffffffffc,%rcx
> eff: 48 83 c9 01 or $0x1,%rcx
> f03: 48 89 c8 mov %rcx,%rax
> f06: f0 48 0f b1 3e lock cmpxchg %rdi,(%rsi)
> f0b: 48 39 c1 cmp %rax,%rcx
> f0e: 74 3b je f4b <rb_get_reader_page+0x13b>
>
> to:
>
> ed8: 48 8b 01 mov (%rcx),%rax
> edb: 48 83 e0 fc and $0xfffffffffffffffc,%rax
> edf: 48 83 c8 01 or $0x1,%rax
> ee3: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
> ee8: 74 3b je f25 <rb_get_reader_page+0x135>
I'm using gcc 12.2.0 and have this:
cmpxchg:
0000000000000e50 <rb_get_reader_page>:
e50: 41 55 push %r13
e52: 41 54 push %r12
e54: 55 push %rbp
e55: 53 push %rbx
e56: 48 89 fb mov %rdi,%rbx
e59: 9c pushf
e5a: 5d pop %rbp
e5b: fa cli
e5c: 81 e5 00 02 00 00 and $0x200,%ebp
e62: 0f 85 e6 01 00 00 jne 104e <rb_get_reader_page+0x1fe>
e68: 48 8d 7b 1c lea 0x1c(%rbx),%rdi
e6c: 31 c0 xor %eax,%eax
e6e: ba 01 00 00 00 mov $0x1,%edx
e73: f0 0f b1 53 1c lock cmpxchg %edx,0x1c(%rbx)
e78: 0f 85 e5 01 00 00 jne 1063 <rb_get_reader_page+0x213>
e7e: 41 bc 03 00 00 00 mov $0x3,%r12d
e84: 4c 8b 6b 58 mov 0x58(%rbx),%r13
e88: 49 8b 55 30 mov 0x30(%r13),%rdx
e8c: 41 8b 45 18 mov 0x18(%r13),%eax
e90: 48 8b 4a 08 mov 0x8(%rdx),%rcx
e94: 39 c8 cmp %ecx,%eax
e96: 0f 82 61 01 00 00 jb ffd <rb_get_reader_page+0x1ad>
e9c: 48 8b 52 08 mov 0x8(%rdx),%rdx
try_cmpxchg:
0000000000000e70 <rb_get_reader_page>:
e70: 41 55 push %r13
e72: 41 54 push %r12
e74: 55 push %rbp
e75: 53 push %rbx
e76: 48 89 fb mov %rdi,%rbx
e79: 9c pushf
e7a: 5d pop %rbp
e7b: fa cli
e7c: 81 e5 00 02 00 00 and $0x200,%ebp
e82: 0f 85 e0 01 00 00 jne 1068 <rb_get_reader_page+0x1f8>
e88: 48 8d 7b 1c lea 0x1c(%rbx),%rdi
e8c: 31 c0 xor %eax,%eax
e8e: ba 01 00 00 00 mov $0x1,%edx
e93: f0 0f b1 53 1c lock cmpxchg %edx,0x1c(%rbx)
e98: 0f 85 df 01 00 00 jne 107d <rb_get_reader_page+0x20d>
e9e: 41 bc 03 00 00 00 mov $0x3,%r12d
ea4: 4c 8b 6b 58 mov 0x58(%rbx),%r13
ea8: 49 8b 55 30 mov 0x30(%r13),%rdx
eac: 41 8b 45 18 mov 0x18(%r13),%eax
eb0: 48 8b 4a 08 mov 0x8(%rdx),%rcx
eb4: 39 c8 cmp %ecx,%eax
eb6: 0f 82 5b 01 00 00 jb 1017 <rb_get_reader_page+0x1a7>
ebc: 48 8b 52 08 mov 0x8(%rdx),%rdx
Which has no difference :-/
>
> Again, even in linear code the change is able to eliminate the
> assignment to a temporary reg and the compare. Please note that there
> is no move *from* %rax register after cmpxchg, so the compiler
> correctly eliminated unused assignment.
>
> >
> > > }
> > >
> > > /*
> > > @@ -2055,7 +2052,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > > retries = 10;
> > > success = false;
> > > while (retries--) {
> > > - struct list_head *head_page, *prev_page, *r;
> > > + struct list_head *head_page, *prev_page;
> > > struct list_head *last_page, *first_page;
> > > struct list_head *head_page_with_bit;
> > >
> > > @@ -2073,9 +2070,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > > last_page->next = head_page_with_bit;
> > > first_page->prev = prev_page;
> > >
> > > - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
> > > -
> > > - if (r == head_page_with_bit) {
> > > + if (try_cmpxchg(&prev_page->next,
> > > + &head_page_with_bit, first_page)) {
> >
> > No. head_page_with_bit should not be updated.
>
> As above, head_page_with_bit should be considered a temporary; it
> is initialized a couple of lines above cmpxchg and unused after. The
> gcc-10.3.1 compiler even found some more optimization opportunities
> and reordered the code from:
>
> 1364: 4d 8b 86 38 01 00 00 mov 0x138(%r14),%r8
> 136b: 48 83 ce 01 or $0x1,%rsi
> 136f: 48 89 f0 mov %rsi,%rax
> 1372: 49 89 30 mov %rsi,(%r8)
> 1375: 48 89 4f 08 mov %rcx,0x8(%rdi)
> 1379: f0 48 0f b1 39 lock cmpxchg %rdi,(%rcx)
> 137e: 48 39 c6 cmp %rax,%rsi
> 1381: 74 78 je 13fb <rb_insert_pages+0xdb>
>
> to:
>
> 1343: 48 83 c8 01 or $0x1,%rax
> 1347: 48 8b bb 38 01 00 00 mov 0x138(%rbx),%rdi
> 134e: 48 89 07 mov %rax,(%rdi)
> 1351: 48 89 4e 08 mov %rcx,0x8(%rsi)
> 1355: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
> 135a: 41 0f 94 c7 sete %r15b
> 135e: 75 2f jne 138f <rb_insert_pages+0x8f>
>
> Please also note SETE insn in the above code, this is how the
> "success" variable is handled in the loop. So, besides code size
> improvement, other secondary improvements can be expected from the
> change, too.
For this gcc 12.2.0 did have a different result:
cmpxchg:
1436: 49 89 c5 mov %rax,%r13
1439: eb 41 jmp 147c <rb_update_pages+0x7c>
143b: 48 89 df mov %rbx,%rdi
143e: e8 bd ed ff ff call 200 <rb_set_head_page>
1443: 48 89 c2 mov %rax,%rdx
1446: 48 85 c0 test %rax,%rax
1449: 74 37 je 1482 <rb_update_pages+0x82>
144b: 48 8b 48 08 mov 0x8(%rax),%rcx
144f: 48 8b bb 30 01 00 00 mov 0x130(%rbx),%rdi
1456: 48 89 c6 mov %rax,%rsi
1459: 4c 8b 83 38 01 00 00 mov 0x138(%rbx),%r8
1460: 48 83 ce 01 or $0x1,%rsi
1464: 48 89 f0 mov %rsi,%rax
1467: 49 89 30 mov %rsi,(%r8)
146a: 48 89 4f 08 mov %rcx,0x8(%rdi)
146e: f0 48 0f b1 39 lock cmpxchg %rdi,(%rcx)
1473: 48 39 c6 cmp %rax,%rsi
1476: 0f 84 97 01 00 00 je 1613 <rb_update_pages+0x213>
147c: 41 83 ee 01 sub $0x1,%r14d
1480: 75 b9 jne 143b <rb_update_pages+0x3b>
1482: 48 8b 43 10 mov 0x10(%rbx),%rax
1486: f0 ff 40 08 lock incl 0x8(%rax)
try_cmpxchg:
1446: 49 89 c4 mov %rax,%r12
1449: 41 83 ee 01 sub $0x1,%r14d
144d: 0f 84 7b 01 00 00 je 15ce <rb_update_pages+0x1be>
1453: 48 89 df mov %rbx,%rdi
1456: e8 c5 ed ff ff call 220 <rb_set_head_page>
145b: 48 89 c2 mov %rax,%rdx
145e: 48 85 c0 test %rax,%rax
1461: 0f 84 67 01 00 00 je 15ce <rb_update_pages+0x1be>
1467: 48 8b 48 08 mov 0x8(%rax),%rcx
146b: 48 8b b3 30 01 00 00 mov 0x130(%rbx),%rsi
1472: 48 83 c8 01 or $0x1,%rax
1476: 48 8b bb 38 01 00 00 mov 0x138(%rbx),%rdi
147d: 48 89 07 mov %rax,(%rdi)
1480: 48 89 4e 08 mov %rcx,0x8(%rsi)
1484: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
1489: 75 be jne 1449 <rb_update_pages+0x39>
148b: 48 89 7a 08 mov %rdi,0x8(%rdx)
148f: 4c 89 e6 mov %r12,%rsi
1492: 48 89 ef mov %rbp,%rdi
1495: 4c 89 ab 30 01 00 00 mov %r13,0x130(%rbx)
149c: 4c 89 ab 38 01 00 00 mov %r13,0x138(%rbx)
14a3: e8 00 00 00 00 call 14a8 <rb_update_pages+0x98>
It's different, but I'm not so sure it's beneficial.
>
> I think that the above examples demonstrate various improvements that
> can be achieved with effectively a one-line, almost mechanical change
> to the code, even in linear code. It would be unfortunate to not
> consider them.
But with gcc 12.2.0 I don't really see the benefit. And I'm worried that
the side effect of modifying the old variable could cause a bug in the
future, if it is used after the try_cmpxchg(). At least for the second case.
-- Steve
On Wed, Mar 1, 2023 at 5:18 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 1 Mar 2023 10:37:28 +0100
> Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > > No, val should not be updated.
> >
> > Please see the definition of try_cmpxchg. The definition is written in
> > such a way that benefits loops as well as linear code and in the latter
> > case depends on the compiler to eliminate the assignment to val as a dead
> > assignment.
>
> I did read it ;-)
>
> >
> > The above change was done under the assumption that val is unused
> > after try_cmpxchg, and can be considered a temporary.
> > [Alternatively, the value could be copied to a local temporary and a
> > pointer to this local temporary could be passed to try_cmpxchg
> > instead. Compiler is smart enough to eliminate the assignment in any
> > case.]
> >
> > Even in the linear code, the change has considerable effect.
> > rb_head_page_replace is inlined in rb_get_reader_page and gcc-10.3.1
> > improves code from:
> >
> > ef8: 48 8b 0e mov (%rsi),%rcx
> > efb: 48 83 e1 fc and $0xfffffffffffffffc,%rcx
> > eff: 48 83 c9 01 or $0x1,%rcx
> > f03: 48 89 c8 mov %rcx,%rax
> > f06: f0 48 0f b1 3e lock cmpxchg %rdi,(%rsi)
> > f0b: 48 39 c1 cmp %rax,%rcx
> > f0e: 74 3b je f4b <rb_get_reader_page+0x13b>
> >
> > to:
> >
> > ed8: 48 8b 01 mov (%rcx),%rax
> > edb: 48 83 e0 fc and $0xfffffffffffffffc,%rax
> > edf: 48 83 c8 01 or $0x1,%rax
> > ee3: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
> > ee8: 74 3b je f25 <rb_get_reader_page+0x135>
>
> I'm using gcc 12.2.0 and have this:
>
> cmpxchg:
>
> 0000000000000e50 <rb_get_reader_page>:
> e50: 41 55 push %r13
> e52: 41 54 push %r12
> e54: 55 push %rbp
> e55: 53 push %rbx
> e56: 48 89 fb mov %rdi,%rbx
> e59: 9c pushf
> e5a: 5d pop %rbp
> e5b: fa cli
> e5c: 81 e5 00 02 00 00 and $0x200,%ebp
> e62: 0f 85 e6 01 00 00 jne 104e <rb_get_reader_page+0x1fe>
> e68: 48 8d 7b 1c lea 0x1c(%rbx),%rdi
> e6c: 31 c0 xor %eax,%eax
> e6e: ba 01 00 00 00 mov $0x1,%edx
> e73: f0 0f b1 53 1c lock cmpxchg %edx,0x1c(%rbx)
> e78: 0f 85 e5 01 00 00 jne 1063 <rb_get_reader_page+0x213>
> e7e: 41 bc 03 00 00 00 mov $0x3,%r12d
> e84: 4c 8b 6b 58 mov 0x58(%rbx),%r13
> e88: 49 8b 55 30 mov 0x30(%r13),%rdx
> e8c: 41 8b 45 18 mov 0x18(%r13),%eax
> e90: 48 8b 4a 08 mov 0x8(%rdx),%rcx
> e94: 39 c8 cmp %ecx,%eax
> e96: 0f 82 61 01 00 00 jb ffd <rb_get_reader_page+0x1ad>
> e9c: 48 8b 52 08 mov 0x8(%rdx),%rdx
>
>
> try_cmpxchg:
>
> 0000000000000e70 <rb_get_reader_page>:
> e70: 41 55 push %r13
> e72: 41 54 push %r12
> e74: 55 push %rbp
> e75: 53 push %rbx
> e76: 48 89 fb mov %rdi,%rbx
> e79: 9c pushf
> e7a: 5d pop %rbp
> e7b: fa cli
> e7c: 81 e5 00 02 00 00 and $0x200,%ebp
> e82: 0f 85 e0 01 00 00 jne 1068 <rb_get_reader_page+0x1f8>
> e88: 48 8d 7b 1c lea 0x1c(%rbx),%rdi
> e8c: 31 c0 xor %eax,%eax
> e8e: ba 01 00 00 00 mov $0x1,%edx
> e93: f0 0f b1 53 1c lock cmpxchg %edx,0x1c(%rbx)
> e98: 0f 85 df 01 00 00 jne 107d <rb_get_reader_page+0x20d>
> e9e: 41 bc 03 00 00 00 mov $0x3,%r12d
> ea4: 4c 8b 6b 58 mov 0x58(%rbx),%r13
> ea8: 49 8b 55 30 mov 0x30(%r13),%rdx
> eac: 41 8b 45 18 mov 0x18(%r13),%eax
> eb0: 48 8b 4a 08 mov 0x8(%rdx),%rcx
> eb4: 39 c8 cmp %ecx,%eax
> eb6: 0f 82 5b 01 00 00 jb 1017 <rb_get_reader_page+0x1a7>
> ebc: 48 8b 52 08 mov 0x8(%rdx),%rdx
>
> Which has no difference :-/
This lock cmpxchg belongs to some other locking primitive that was
already converted en masse to try_cmpxchg some time in the past. The
place we are looking for has a compare insn between lock cmpxchg and a
follow-up conditional jump in the original assembly code.
> > Again, even in linear code the change is able to eliminate the
> > assignment to a temporary reg and the compare. Please note that there
> > is no move *from* %rax register after cmpxchg, so the compiler
> > correctly eliminated unused assignment.
> >
> > >
> > > > }
> > > >
> > > > /*
> > > > @@ -2055,7 +2052,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > > > retries = 10;
> > > > success = false;
> > > > while (retries--) {
> > > > - struct list_head *head_page, *prev_page, *r;
> > > > + struct list_head *head_page, *prev_page;
> > > > struct list_head *last_page, *first_page;
> > > > struct list_head *head_page_with_bit;
> > > >
> > > > @@ -2073,9 +2070,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
> > > > last_page->next = head_page_with_bit;
> > > > first_page->prev = prev_page;
> > > >
> > > > - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
> > > > -
> > > > - if (r == head_page_with_bit) {
> > > > + if (try_cmpxchg(&prev_page->next,
> > > > + &head_page_with_bit, first_page)) {
> > >
> > > No. head_page_with_bit should not be updated.
> >
> > As above, head_page_with_bit should be considered a temporary; it
> > is initialized a couple of lines above cmpxchg and unused after. The
> > gcc-10.3.1 compiler even found some more optimization opportunities
> > and reordered the code from:
> >
> > 1364: 4d 8b 86 38 01 00 00 mov 0x138(%r14),%r8
> > 136b: 48 83 ce 01 or $0x1,%rsi
> > 136f: 48 89 f0 mov %rsi,%rax
> > 1372: 49 89 30 mov %rsi,(%r8)
> > 1375: 48 89 4f 08 mov %rcx,0x8(%rdi)
> > 1379: f0 48 0f b1 39 lock cmpxchg %rdi,(%rcx)
> > 137e: 48 39 c6 cmp %rax,%rsi
> > 1381: 74 78 je 13fb <rb_insert_pages+0xdb>
> >
> > to:
> >
> > 1343: 48 83 c8 01 or $0x1,%rax
> > 1347: 48 8b bb 38 01 00 00 mov 0x138(%rbx),%rdi
> > 134e: 48 89 07 mov %rax,(%rdi)
> > 1351: 48 89 4e 08 mov %rcx,0x8(%rsi)
> > 1355: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
> > 135a: 41 0f 94 c7 sete %r15b
> > 135e: 75 2f jne 138f <rb_insert_pages+0x8f>
> >
> > Please also note SETE insn in the above code, this is how the
> > "success" variable is handled in the loop. So, besides code size
> > improvement, other secondary improvements can be expected from the
> > change, too.
>
> For this gcc 12.2.0 did have a different result:
>
> cmpxchg:
>
> 1436: 49 89 c5 mov %rax,%r13
> 1439: eb 41 jmp 147c <rb_update_pages+0x7c>
> 143b: 48 89 df mov %rbx,%rdi
> 143e: e8 bd ed ff ff call 200 <rb_set_head_page>
> 1443: 48 89 c2 mov %rax,%rdx
> 1446: 48 85 c0 test %rax,%rax
> 1449: 74 37 je 1482 <rb_update_pages+0x82>
> 144b: 48 8b 48 08 mov 0x8(%rax),%rcx
> 144f: 48 8b bb 30 01 00 00 mov 0x130(%rbx),%rdi
> 1456: 48 89 c6 mov %rax,%rsi
> 1459: 4c 8b 83 38 01 00 00 mov 0x138(%rbx),%r8
> 1460: 48 83 ce 01 or $0x1,%rsi
> 1464: 48 89 f0 mov %rsi,%rax
> 1467: 49 89 30 mov %rsi,(%r8)
> 146a: 48 89 4f 08 mov %rcx,0x8(%rdi)
> 146e: f0 48 0f b1 39 lock cmpxchg %rdi,(%rcx)
> 1473: 48 39 c6 cmp %rax,%rsi
> 1476: 0f 84 97 01 00 00 je 1613 <rb_update_pages+0x213>
> 147c: 41 83 ee 01 sub $0x1,%r14d
> 1480: 75 b9 jne 143b <rb_update_pages+0x3b>
> 1482: 48 8b 43 10 mov 0x10(%rbx),%rax
> 1486: f0 ff 40 08 lock incl 0x8(%rax)
>
> try_cmpxchg:
>
> 1446: 49 89 c4 mov %rax,%r12
> 1449: 41 83 ee 01 sub $0x1,%r14d
> 144d: 0f 84 7b 01 00 00 je 15ce <rb_update_pages+0x1be>
> 1453: 48 89 df mov %rbx,%rdi
> 1456: e8 c5 ed ff ff call 220 <rb_set_head_page>
> 145b: 48 89 c2 mov %rax,%rdx
> 145e: 48 85 c0 test %rax,%rax
> 1461: 0f 84 67 01 00 00 je 15ce <rb_update_pages+0x1be>
> 1467: 48 8b 48 08 mov 0x8(%rax),%rcx
> 146b: 48 8b b3 30 01 00 00 mov 0x130(%rbx),%rsi
> 1472: 48 83 c8 01 or $0x1,%rax
> 1476: 48 8b bb 38 01 00 00 mov 0x138(%rbx),%rdi
> 147d: 48 89 07 mov %rax,(%rdi)
> 1480: 48 89 4e 08 mov %rcx,0x8(%rsi)
> 1484: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
> 1489: 75 be jne 1449 <rb_update_pages+0x39>
> 148b: 48 89 7a 08 mov %rdi,0x8(%rdx)
> 148f: 4c 89 e6 mov %r12,%rsi
> 1492: 48 89 ef mov %rbp,%rdi
> 1495: 4c 89 ab 30 01 00 00 mov %r13,0x130(%rbx)
> 149c: 4c 89 ab 38 01 00 00 mov %r13,0x138(%rbx)
> 14a3: e8 00 00 00 00 call 14a8 <rb_update_pages+0x98>
>
> It's different, but I'm not so sure it's beneficial.
This is the place we are looking for. Please see that a move at $1464
and a compare at $1473 are missing in the assembly from the patched
code. If it is beneficial ... well, we achieved the same result with
two fewer instructions in loopy code.
Uros.
> >
> > I think that the above examples demonstrate various improvements that
> > can be achieved with effectively a one-line, almost mechanical change
> > to the code, even in linear code. It would be unfortunate to not
> > consider them.
>
> But with gcc 12.2.0 I don't really see the benefit. And I'm worried that
> the side effect of modifying the old variable could cause a bug in the
> future, if it is used after the try_cmpxchg(). At least for the second case.
>
> -- Steve
>
On Wed, 1 Mar 2023 18:16:04 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

> > It's different, but I'm not so sure it's beneficial.
>
> This is the place we are looking for. Please see that a move at $1464
> and a compare at $1473 are missing in the assembly from the patched
> code. If it is beneficial ... well, we achieved the same result with
> two fewer instructions in loopy code.

Note, none of these locations are in fast paths. (Now if we had a
local_try_cmpxchg(), we could improve those locations!)

Thus my main concern here is correctness. Unfortunately, adding a
cmpxchg_success() would require a lot of work, as all the cmpxchg()
functions are really macros that do a lot of magic
(include/linux/atomic/atomic-instrumented.h says it's even generated!).
So that will likely not happen.

I have mixed feelings about this patch. I like the fact that you are
looking into optimizing the code, but I'm also concerned that it could
cause issues later down the road.

I am leaning toward just taking this, and hoping that it doesn't cause
problems. And I would really love to change all the local_cmpxchg() to
local_try_cmpxchg(). Hmm, I wonder how much of an effort it would be to
implement those?

-- Steve
On Wed, 1 Mar 2023 13:18:31 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> I am leaning toward just taking this, and hoping that it doesn't cause
> problems. And I would really love to change all the local_cmpxchg() to
> local_try_cmpxchg(). Hmm, I wonder how much of an effort it would be to
> implement those?

I see that you were the one that added the generic support for
try_cmpxchg64() and messed with all those generated files. Care to add
one for local_try_cmpxchg()? ;-)

-- Steve
On Wed, Mar 1, 2023 at 7:28 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 1 Mar 2023 13:18:31 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > I am leaning toward just taking this, and hoping that it doesn't cause
> > problems. And I would really love to change all the local_cmpxchg() to
> > local_try_cmpxchg(). Hmm, I wonder how much of an effort it would be to
> > implement those?
>
> I see that you were the one that added the generic support for
> try_cmpxchg64() and messed with all those generated files. Care to add
> one for local_try_cmpxchg()? ;-)

I already have a half-written patch that implements local_try_cmpxchg.
Half-written in the sense that the above-mentioned scripts generate
correct locking primitives, but the fallback code for local_cmpxchg is a
bit different from the fallback code for cmpxchg. There is an effort to
unify all the cmpxchg infrastructure (cmpxchg128 will be introduced), and
I think that local_cmpxchg should be handled in the same way.

So, if there is interest, I can find some time to finish the
infrastructure patch and convert the uses. But this is quite an
undertaking that will take some time.

Uros.
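For context, a fallback along these lines might look like the sketch
below (hypothetical, not the half-written patch itself; local_cmpxchg()
is the existing primitive from <asm/local.h>):

/* Hypothetical fallback, mirroring the generic try_cmpxchg() shape. */
static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
{
	long tmp = local_cmpxchg(l, *old, new);

	if (tmp != *old) {
		*old = tmp;	/* failed: report the observed value */
		return false;
	}
	return true;
}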
On Wed, 1 Mar 2023 11:18:50 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> But with gcc 12.2.0 I don't really see the benefit. And I'm worried that
> the side effect of modifying the old variable could cause a bug in the
> future, if it is used after the try_cmpxchg(). At least for the second case.

Actually, I like Joel's recommendation of adding a cmpxchg_succeeded()
function that does the try_cmpxchg() without needing to save the old
variable. That's my main concern, as it does have that side effect that
could be missed when updating the code.

That would be the best of both worlds ;-)

-- Steve
On Wed, Mar 1, 2023 at 5:28 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 1 Mar 2023 11:18:50 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > But with gcc 12.2.0 I don't really see the benefit. And I'm worried that
> > the side effect of modifying the old variable could cause a bug in the
> > future, if it is used after the try_cmpxchg(). At least for the second case.
>
> Actually, I like Joel's recommendation of adding a cmpxchg_succeeded()
> function that does the try_cmpxchg() without needing to save the old
> variable. That's my main concern, as it does have that side effect that
> could be missed when updating the code.

I understand your concern regarding the updating of head_page_with_bit in
the middle of rb_insert_pages. OTOH, rb_head_page_replace is a small
utility function where the update happens in the return statement, so
there is no chance of using val after the try_cmpxchg.

If we can ignore the "updating" issue in rb_head_page_replace, we can
simply define cmpxchg_success in front of rb_insert_pages (now its sole
user) and be done with it.

Uros.
On Wed, Mar 1, 2023 at 5:28 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 1 Mar 2023 11:18:50 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > But with gcc 12.2.0 I don't really see the benefit. And I'm worried that
> > the side effect of modifying the old variable could cause a bug in the
> > future, if it is used after the try_cmpxchg(). At least for the second case.
>
> Actually, I like Joel's recommendation of adding a cmpxchg_succeeded()
> function that does the try_cmpxchg() without needing to save the old
> variable. That's my main concern, as it does have that side effect that
> could be missed when updating the code.
The "controversial" part of the patch would then look like the
attached patch. As expected, the compiler again improves the generated
code from:
eb8: 48 8b 0e mov (%rsi),%rcx
ebb: 48 83 e1 fc and $0xfffffffffffffffc,%rcx
ebf: 48 83 c9 01 or $0x1,%rcx
ec3: 48 89 c8 mov %rcx,%rax
ec6: f0 48 0f b1 3e lock cmpxchg %rdi,(%rsi)
ecb: 48 39 c1 cmp %rax,%rcx
ece: 74 2d je efd <rb_get_reader_page+0x12d>
to:
eb8: 48 8b 01 mov (%rcx),%rax
ebb: 48 83 e0 fc and $0xfffffffffffffffc,%rax
ebf: 48 83 c8 01 or $0x1,%rax
ec3: f0 48 0f b1 31 lock cmpxchg %rsi,(%rcx)
ec8: 74 2d je ef7 <rb_get_reader_page+0x127>
Uros.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index af50d931b020..7ad855f54371 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -163,6 +163,12 @@ enum {
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
+#define cmpxchg_success(ptr, old, new) \
+({ \
+ typeof(*(ptr)) __tmp = (old); \
+ try_cmpxchg((ptr), &__tmp, (new)); \
+})
+
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -1495,14 +1501,11 @@ static int rb_head_page_replace(struct buffer_page *old,
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
- unsigned long ret;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
- ret = cmpxchg(ptr, val, (unsigned long)&new->list);
-
- return ret == val;
+ return cmpxchg_success(ptr, val, (unsigned long)&new->list);
}
/*
@@ -2061,7 +2064,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
retries = 10;
success = 0;
while (retries--) {
- struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
@@ -2079,9 +2082,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
- r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
- if (r == head_page_with_bit) {
+ if (cmpxchg_success(&prev_page->next,
+ head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
On Wed, Mar 1, 2023 at 4:37 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Feb 28, 2023 at 10:43 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Tue, 28 Feb 2023 18:59:29 +0100
> > Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old.
> > > x86 CMPXCHG instruction returns success in ZF flag, so this change
> > > saves a compare after cmpxchg (and related move instruction in
> > > front of cmpxchg).
> > >
> > > Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg
> > > fails. There is no need to re-read the value in the loop.
> > >
> > > No functional change intended.
> >
> > As I mentioned in the RCU thread, I have issues with some of the changes
> > here.
> >
> > >
> > > Cc: Steven Rostedt <rostedt@goodmis.org>
> > > Cc: Masami Hiramatsu <mhiramat@kernel.org>
> > > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > > ---
> > > kernel/trace/ring_buffer.c | 20 ++++++++------------
> > > 1 file changed, 8 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > > index 4188af7d4cfe..8f0ef7d12ddd 100644
> > > --- a/kernel/trace/ring_buffer.c
> > > +++ b/kernel/trace/ring_buffer.c
> > > @@ -1493,14 +1493,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
> > > {
> > > unsigned long *ptr = (unsigned long *)&old->list.prev->next;
> > > unsigned long val;
> > > - unsigned long ret;
> > >
> > > val = *ptr & ~RB_FLAG_MASK;
> > > val |= RB_PAGE_HEAD;
> > >
> > > - ret = cmpxchg(ptr, val, (unsigned long)&new->list);
> > > -
> > > - return ret == val;
> > > + return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
> >
> > No, val should not be updated.
>
> Please see the definition of try_cmpxchg. The definition is written in
> such a way that benefits loops as well as linear code and in the latter
> case depends on the compiler to eliminate the assignment to val as a dead
> assignment.
>
> The above change was done under the assumption that val is unused
> after try_cmpxchg, and can be considered a temporary.
> [Alternatively, the value could be copied to a local temporary and a
> pointer to this local temporary could be passed to try_cmpxchg
> instead. Compiler is smart enough to eliminate the assignment in any
> case.]
If I understood Steve correctly, I think the "misleading" part is that
you are passing a variable by address to try_cmpxchg() but not really
modifying it (unlike in the loop patterns).
Perhaps it is more meaningful to have an API that looks like:
bool cmpxchg_succeeded(TYPE ptr, TYPE old, TYPE new)
where old is not a pointer (unlike try_cmpxchg), and the API returns bool.
Both cleaner to read and has the optimization you want, I believe.
However, the other point is, this is useful only for slow paths, but
at least cmpxchg_succeeded() is more readable and less "misleading"
than try_cmpxchg() IMO.
- Joel
[...]
On Wed, Mar 1, 2023 at 10:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Wed, Mar 1, 2023 at 4:37 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Tue, Feb 28, 2023 at 10:43 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Tue, 28 Feb 2023 18:59:29 +0100
> > > Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > > Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old.
> > > > x86 CMPXCHG instruction returns success in ZF flag, so this change
> > > > saves a compare after cmpxchg (and related move instruction in
> > > > front of cmpxchg).
> > > >
> > > > Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg
> > > > fails. There is no need to re-read the value in the loop.
> > > >
> > > > No functional change intended.
> > >
> > > As I mentioned in the RCU thread, I have issues with some of the changes
> > > here.
> > >
> > > >
> > > > Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > Cc: Masami Hiramatsu <mhiramat@kernel.org>
> > > > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > > > ---
> > > > kernel/trace/ring_buffer.c | 20 ++++++++------------
> > > > 1 file changed, 8 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > > > index 4188af7d4cfe..8f0ef7d12ddd 100644
> > > > --- a/kernel/trace/ring_buffer.c
> > > > +++ b/kernel/trace/ring_buffer.c
> > > > @@ -1493,14 +1493,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
> > > > {
> > > > unsigned long *ptr = (unsigned long *)&old->list.prev->next;
> > > > unsigned long val;
> > > > - unsigned long ret;
> > > >
> > > > val = *ptr & ~RB_FLAG_MASK;
> > > > val |= RB_PAGE_HEAD;
> > > >
> > > > - ret = cmpxchg(ptr, val, (unsigned long)&new->list);
> > > > -
> > > > - return ret == val;
> > > > + return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
> > >
> > > No, val should not be updated.
> >
> > Please see the definition of try_cmpxchg. The definition is written in
> > such a way that benefits loops as well as linear code and in the latter
> > case depends on the compiler to eliminate the assignment to val as a dead
> > assignment.
> >
> > The above change was done under the assumption that val is unused
> > after try_cmpxchg, and can be considered a temporary.
> > [Alternatively, the value could be copied to a local temporary and a
> > pointer to this local temporary could be passed to try_cmpxchg
> > instead. Compiler is smart enough to eliminate the assignment in any
> > case.]
Ah I need to be more careful how I type.
> If I understood Steve correctly, I think the "misleading" part is that
> you are passing a variable by address to try_cmpxchg() but not really
> modifying it (unlike in the loop patterns).
It does modify it, but I meant it does not use it.
> Perhaps it is more meaningful to have an API that looks like:
> bool cmpxchg_succeeded(TYPE ptr, TYPE old, TYPE new)
> where old is not a pointer (unlike try_cmpxchg), and the API returns bool.
>
> Both cleaner to read and has the optimization you want, I believe.
>
> However, the other point is, this is useful only for slow paths, but
Useful only for fast paths...
> at least cmpxchg_succeeded() is more readable and less "misleading"
> than try_cmpxchg() IMO.
>
Proofreading emails properly from here on! Not after the fact!
- Joel