[PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Andrew Cooper posted 1 patch 2 weeks, 3 days ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/xen tags/patchew/20201111124512.2268-1-andrew.cooper3@citrix.com
xen/include/asm-x86/guest/xen-hcall.h | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)

[PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Posted by Andrew Cooper 2 weeks, 3 days ago
Clang 9 and later don't handle the clobber of %r10 correctly in
_hypercall64_4().  See https://bugs.llvm.org/show_bug.cgi?id=48122

Rewrite the logic to use the "+r" form, rather than using separate input and
output specifiers, which works around the issue.  For consistency, adjust all
operand handling.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 xen/include/asm-x86/guest/xen-hcall.h | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/xen/include/asm-x86/guest/xen-hcall.h b/xen/include/asm-x86/guest/xen-hcall.h
index 03d5868a9e..3240d9807b 100644
--- a/xen/include/asm-x86/guest/xen-hcall.h
+++ b/xen/include/asm-x86/guest/xen-hcall.h
@@ -39,53 +39,50 @@
 
 #define _hypercall64_1(type, hcall, a1)                                 \
     ({                                                                  \
-        long res, tmp__;                                                \
+        long res, _a1 = (long)(a1);                                     \
         asm volatile (                                                  \
             "call hypercall_page + %c[offset]"                          \
-            : "=a" (res), "=D" (tmp__) ASM_CALL_CONSTRAINT              \
-            : [offset] "i" (hcall * 32),                                \
-              "1" ((long)(a1))                                          \
+            : "=a" (res), "+D" (_a1) ASM_CALL_CONSTRAINT                \
+            : [offset] "i" (hcall * 32)                                 \
             : "memory" );                                               \
         (type)res;                                                      \
     })
 
 #define _hypercall64_2(type, hcall, a1, a2)                             \
     ({                                                                  \
-        long res, tmp__;                                                \
+        long res, _a1 = (long)(a1), _a2 = (long)(a2);                   \
         asm volatile (                                                  \
             "call hypercall_page + %c[offset]"                          \
-            : "=a" (res), "=D" (tmp__), "=S" (tmp__)                    \
+            : "=a" (res), "+D" (_a1), "+S" (_a2)                        \
               ASM_CALL_CONSTRAINT                                       \
-            : [offset] "i" (hcall * 32),                                \
-              "1" ((long)(a1)), "2" ((long)(a2))                        \
+            : [offset] "i" (hcall * 32)                                 \
             : "memory" );                                               \
         (type)res;                                                      \
     })
 
 #define _hypercall64_3(type, hcall, a1, a2, a3)                         \
     ({                                                                  \
-        long res, tmp__;                                                \
+        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
+            _a3 = (long)(a3);                                           \
         asm volatile (                                                  \
             "call hypercall_page + %c[offset]"                          \
-            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__)      \
+            : "=a" (res), "+D" (_a1), "+S" (_a2), "+d" (_a3)            \
               ASM_CALL_CONSTRAINT                                       \
-            : [offset] "i" (hcall * 32),                                \
-              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3))      \
+            : [offset] "i" (hcall * 32)                                 \
             : "memory" );                                               \
         (type)res;                                                      \
     })
 
 #define _hypercall64_4(type, hcall, a1, a2, a3, a4)                     \
     ({                                                                  \
-        long res, tmp__;                                                \
-        register long _a4 asm ("r10") = ((long)(a4));                   \
+        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
+            _a3 = (long)(a3);                                           \
+        register long _a4 asm ("r10") = (long)(a4);                     \
         asm volatile (                                                  \
             "call hypercall_page + %c[offset]"                          \
-            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__),     \
-              "=&r" (tmp__) ASM_CALL_CONSTRAINT                         \
-            : [offset] "i" (hcall * 32),                                \
-              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)),     \
-              "4" (_a4)                                                 \
+            : "=a" (res), "+D" (_a1), "+S" (_a2), "+d" (_a3)            \
+              "+r" (_a4) ASM_CALL_CONSTRAINT                            \
+            : [offset] "i" (hcall * 32)                                 \
             : "memory" );                                               \
         (type)res;                                                      \
     })
-- 
2.11.0


Re: [PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Posted by Jan Beulich 2 weeks, 3 days ago
On 11.11.2020 13:45, Andrew Cooper wrote:
> Clang 9 and later don't handle the clobber of %r10 correctly in
> _hypercall64_4().  See https://bugs.llvm.org/show_bug.cgi?id=48122

Are you sure this is a bug? With ...

>  #define _hypercall64_4(type, hcall, a1, a2, a3, a4)                     \
>      ({                                                                  \
> -        long res, tmp__;                                                \
> -        register long _a4 asm ("r10") = ((long)(a4));                   \
> +        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
> +            _a3 = (long)(a3);                                           \
> +        register long _a4 asm ("r10") = (long)(a4);                     \
>          asm volatile (                                                  \
>              "call hypercall_page + %c[offset]"                          \
> -            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__),     \
> -              "=&r" (tmp__) ASM_CALL_CONSTRAINT                         \

... this we've requested "any register", while with ...

> -            : [offset] "i" (hcall * 32),                                \
> -              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)),     \
> -              "4" (_a4)                                                 \

... this we've asked for that specific register to be initialized
from r10 (and without telling the compiler that r10 is going to
change).

Also, by what I would have hoped is convention meanwhile, the new
macro local variables' names shouldn't start with an underscore,
but instead perhaps end in one. But to be honest, despite knowing
of the latent (albeit highly hypothetical) issue, each time I
find myself making such a comment I'm one tiny step closer to
giving up.

Anyway, with at least title and description changed (or your view
clarified verbally)
Reviewed-by: Jan Beulich <jbeulich@suse.com>

Jan

Re: [PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Posted by Andrew Cooper 2 weeks, 3 days ago
On 11/11/2020 15:11, Jan Beulich wrote:
> On 11.11.2020 13:45, Andrew Cooper wrote:
>> Clang 9 and later don't handle the clobber of %r10 correctly in
>> _hypercall64_4().  See https://bugs.llvm.org/show_bug.cgi?id=48122
> Are you sure this is a bug?

Yes.

>  With ...
>
>>  #define _hypercall64_4(type, hcall, a1, a2, a3, a4)                     \
>>      ({                                                                  \
>> -        long res, tmp__;                                                \
>> -        register long _a4 asm ("r10") = ((long)(a4));                   \
>> +        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
>> +            _a3 = (long)(a3);                                           \
>> +        register long _a4 asm ("r10") = (long)(a4);                     \
>>          asm volatile (                                                  \
>>              "call hypercall_page + %c[offset]"                          \
>> -            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__),     \
>> -              "=&r" (tmp__) ASM_CALL_CONSTRAINT                         \
> ... this we've requested "any register", while with ...
>
>> -            : [offset] "i" (hcall * 32),                                \
>> -              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)),     \
>> -              "4" (_a4)                                                 \
> ... this we've asked for that specific register to be initialized
> from r10 (and without telling the compiler that r10 is going to
> change).

Consider applying that same reasoning to "1" instead of "4".  In that
case, a1 would no longer be bound to %rdi.

The use of "4" explicitly binds the input and the output, which includes
requiring them to be the same register.

Furthermore, LLVM tends to consider "not behaving in the same was as
GCC" a bug.

> Also, by what I would have hoped is convention meanwhile, the new
> macro local variables' names shouldn't start with an underscore,
> but instead perhaps end in one. But to be honest, despite knowing
> of the latent (albeit highly hypothetical) issue, each time I
> find myself making such a comment I'm one tiny step closer to
> giving up.

Convention is not created by you getting irritated at others getting
irritated at you for requesting bizarre and unusual changes in submitted
code, or by continuing to point things out, knowing that others
specifically disagree with your reasoning in this case.

Convention is created when there is general agreement over the technical
merits of writing code in one particular way vs the alternatives, *and*
its written down somewhere, so this argument does not continue any further.

There is no restrictions or guidance in the coding style to prefer one
form over another, therefore anything is acceptable.

~Andrew

Re: [PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Posted by Jan Beulich 2 weeks, 2 days ago
On 11.11.2020 21:01, Andrew Cooper wrote:
> On 11/11/2020 15:11, Jan Beulich wrote:
>> On 11.11.2020 13:45, Andrew Cooper wrote:
>>> Clang 9 and later don't handle the clobber of %r10 correctly in
>>> _hypercall64_4().  See https://bugs.llvm.org/show_bug.cgi?id=48122
>> Are you sure this is a bug?
> 
> Yes.
> 
>>  With ...
>>
>>>  #define _hypercall64_4(type, hcall, a1, a2, a3, a4)                     \
>>>      ({                                                                  \
>>> -        long res, tmp__;                                                \
>>> -        register long _a4 asm ("r10") = ((long)(a4));                   \
>>> +        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
>>> +            _a3 = (long)(a3);                                           \
>>> +        register long _a4 asm ("r10") = (long)(a4);                     \
>>>          asm volatile (                                                  \
>>>              "call hypercall_page + %c[offset]"                          \
>>> -            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__),     \
>>> -              "=&r" (tmp__) ASM_CALL_CONSTRAINT                         \
>> ... this we've requested "any register", while with ...
>>
>>> -            : [offset] "i" (hcall * 32),                                \
>>> -              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)),     \
>>> -              "4" (_a4)                                                 \
>> ... this we've asked for that specific register to be initialized
>> from r10 (and without telling the compiler that r10 is going to
>> change).
> 
> Consider applying that same reasoning to "1" instead of "4".  In that
> case, a1 would no longer be bound to %rdi.

That's different: "=D" specifies the register, and "1" says "use
the same register as input". Whereas, as said, "=&r" says "use
any register" with "1" saying "use the same register" and (_a4)
specifying where the value is to come from.

> The use of "4" explicitly binds the input and the output, which includes
> requiring them to be the same register.
> 
> Furthermore, LLVM tends to consider "not behaving in the same was as
> GCC" a bug.

That's a fair statement, but then still the description wants
re-wording. Plus of course future gcc is free to change their
behavior to that currently observed with clang.

Consider the following example (on an arch where "f" is a
floating point register and there are ways to copy directly
between GPR and floating point registers:

   int i;
   register float f asm("f7") = <input>;
   asm("..." : "=r" (i) : "0" (f));

In this case obviously f7 can't be used for i (as it doesn't
match "r"). It's merely that the initial value of i is to come
from f7. In fact for Arm64 this

extern float flt;

int test(void) {
	int i;
	register float f asm("s7") = flt;
	asm("add %0,%0,5" : "=r" (i) : "0" (f));
	return i;
}

behaves exactly as described:

test:
        adrp    x0, flt
        ldr     s7, [x0, @lo12(flt)]
        fmov    w0, s7
        add     x0, x0, #5
        ret

(Whether fmov is a sensible choice here is a different question;
I'd have expected some fcvt*.)

Similarly a constant initial value would demonstrate this (or
one necessarily coming from memory), albeit in a less obvious
way: It doesn't say "this constant lives in the register", but
"this constant is to be loaded into the register".

>> Also, by what I would have hoped is convention meanwhile, the new
>> macro local variables' names shouldn't start with an underscore,
>> but instead perhaps end in one. But to be honest, despite knowing
>> of the latent (albeit highly hypothetical) issue, each time I
>> find myself making such a comment I'm one tiny step closer to
>> giving up.
> 
> Convention is not created by you getting irritated at others getting
> irritated at you for requesting bizarre and unusual changes in submitted
> code, or by continuing to point things out, knowing that others
> specifically disagree with your reasoning in this case.
> 
> Convention is created when there is general agreement over the technical
> merits of writing code in one particular way vs the alternatives, *and*
> its written down somewhere, so this argument does not continue any further.
> 
> There is no restrictions or guidance in the coding style to prefer one
> form over another, therefore anything is acceptable.

Our coding style document imo ought to describe things not already
specified by the standard. What scopes certain identifiers are to
be considered reserved in is, however, written down there already.

Jan

Re: [PATCH] xen/x86: Work around Clang code generation bug with asm parameters

Posted by Jan Beulich 1 day, 12 hours ago
On 12.11.2020 09:14, Jan Beulich wrote:
> On 11.11.2020 21:01, Andrew Cooper wrote:
>> On 11/11/2020 15:11, Jan Beulich wrote:
>>> On 11.11.2020 13:45, Andrew Cooper wrote:
>>>> Clang 9 and later don't handle the clobber of %r10 correctly in
>>>> _hypercall64_4().  See https://bugs.llvm.org/show_bug.cgi?id=48122
>>> Are you sure this is a bug?
>>
>> Yes.
>>
>>>  With ...
>>>
>>>>  #define _hypercall64_4(type, hcall, a1, a2, a3, a4)                     \
>>>>      ({                                                                  \
>>>> -        long res, tmp__;                                                \
>>>> -        register long _a4 asm ("r10") = ((long)(a4));                   \
>>>> +        long res, _a1 = (long)(a1), _a2 = (long)(a2),                   \
>>>> +            _a3 = (long)(a3);                                           \
>>>> +        register long _a4 asm ("r10") = (long)(a4);                     \
>>>>          asm volatile (                                                  \
>>>>              "call hypercall_page + %c[offset]"                          \
>>>> -            : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__),     \
>>>> -              "=&r" (tmp__) ASM_CALL_CONSTRAINT                         \
>>> ... this we've requested "any register", while with ...
>>>
>>>> -            : [offset] "i" (hcall * 32),                                \
>>>> -              "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)),     \
>>>> -              "4" (_a4)                                                 \
>>> ... this we've asked for that specific register to be initialized
>>> from r10 (and without telling the compiler that r10 is going to
>>> change).
>>
>> Consider applying that same reasoning to "1" instead of "4".  In that
>> case, a1 would no longer be bound to %rdi.
> 
> That's different: "=D" specifies the register, and "1" says "use
> the same register as input". Whereas, as said, "=&r" says "use
> any register" with "1" saying "use the same register" and (_a4)
> specifying where the value is to come from.
> 
>> The use of "4" explicitly binds the input and the output, which includes
>> requiring them to be the same register.
>>
>> Furthermore, LLVM tends to consider "not behaving in the same was as
>> GCC" a bug.
> 
> That's a fair statement, but then still the description wants
> re-wording. Plus of course future gcc is free to change their
> behavior to that currently observed with clang.
> 
> Consider the following example (on an arch where "f" is a
> floating point register and there are ways to copy directly
> between GPR and floating point registers:
> 
>    int i;
>    register float f asm("f7") = <input>;
>    asm("..." : "=r" (i) : "0" (f));
> 
> In this case obviously f7 can't be used for i (as it doesn't
> match "r"). It's merely that the initial value of i is to come
> from f7. In fact for Arm64 this
> 
> extern float flt;
> 
> int test(void) {
> 	int i;
> 	register float f asm("s7") = flt;
> 	asm("add %0,%0,5" : "=r" (i) : "0" (f));
> 	return i;
> }
> 
> behaves exactly as described:
> 
> test:
>         adrp    x0, flt
>         ldr     s7, [x0, @lo12(flt)]
>         fmov    w0, s7
>         add     x0, x0, #5
>         ret
> 
> (Whether fmov is a sensible choice here is a different question;
> I'd have expected some fcvt*.)

Meanwhile I've realized that I neither need to resort to Arm here,
nor to floating point, e.g.

int test2(int in) {
	int i;
	register int ri asm("ecx") = in;
	asm("nop %0" : "=r" (i) : "0" (ri));
	return i;
}

You'll find that the resulting code (at -O2; gcc 10.2.0) doesn't
use %ecx at all - %edi gets moved directly to %eax.

Jan