[PATCH] x86/hvm: Simplify stdvga_mem_accept() further

Andrew Cooper posted 1 patch 1 year, 1 month ago
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20240912120602.1677194-1-andrew.cooper3@citrix.com
xen/arch/x86/hvm/stdvga.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
[PATCH] x86/hvm: Simplify stdvga_mem_accept() further
Posted by Andrew Cooper 1 year, 1 month ago
stdvga_mem_accept() is called on almost all IO emulations, and the
overwhelming likely answer is to reject the ioreq.  Simply rearranging the
expression yields an improvement:

  add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-57 (-57)
  Function                                     old     new   delta
  stdvga_mem_accept                            109      52     -57

which is best explained looking at the disassembly:

  Before:                                                    After:
  f3 0f 1e fa           endbr64                              f3 0f 1e fa           endbr64
  0f b6 4e 1e           movzbl 0x1e(%rsi),%ecx            |  0f b6 46 1e           movzbl 0x1e(%rsi),%eax
  48 8b 16              mov    (%rsi),%rdx                |  31 d2                 xor    %edx,%edx
  f6 c1 40              test   $0x40,%cl                  |  a8 30                 test   $0x30,%al
  75 38                 jne    <stdvga_mem_accept+0x48>   |  75 23                 jne    <stdvga_mem_accept+0x31>
  31 c0                 xor    %eax,%eax                  <
  48 81 fa ff ff 09 00  cmp    $0x9ffff,%rdx              <
  76 26                 jbe    <stdvga_mem_accept+0x41>   <
  8b 46 14              mov    0x14(%rsi),%eax            <
  8b 7e 10              mov    0x10(%rsi),%edi            <
  48 0f af c7           imul   %rdi,%rax                  <
  48 8d 54 02 ff        lea    -0x1(%rdx,%rax,1),%rdx     <
  31 c0                 xor    %eax,%eax                  <
  48 81 fa ff ff 0b 00  cmp    $0xbffff,%rdx              <
  77 0c                 ja     <stdvga_mem_accept+0x41>   <
  83 e1 30              and    $0x30,%ecx                 <
  75 07                 jne    <stdvga_mem_accept+0x41>   <
  83 7e 10 01           cmpl   $0x1,0x10(%rsi)               83 7e 10 01           cmpl   $0x1,0x10(%rsi)
  0f 94 c0              sete   %al                        |  75 1d                 jne    <stdvga_mem_accept+0x31>
  c3                    ret                               |  48 8b 0e              mov    (%rsi),%rcx
  66 0f 1f 44 00 00     nopw   0x0(%rax,%rax,1)           |  48 81 f9 ff ff 09 00  cmp    $0x9ffff,%rcx
  8b 46 10              mov    0x10(%rsi),%eax            |  76 11                 jbe    <stdvga_mem_accept+0x31>
  8b 7e 14              mov    0x14(%rsi),%edi            |  8b 46 14              mov    0x14(%rsi),%eax
  49 89 d0              mov    %rdx,%r8                   |  48 8d 44 01 ff        lea    -0x1(%rcx,%rax,1),%rax
  48 83 e8 01           sub    $0x1,%rax                  |  48 3d ff ff 0b 00     cmp    $0xbffff,%rax
  48 8d 54 3a ff        lea    -0x1(%rdx,%rdi,1),%rdx     |  0f 96 c2              setbe  %dl
  48 0f af c7           imul   %rdi,%rax                  |  89 d0                 mov    %edx,%eax
  49 29 c0              sub    %rax,%r8                   <
  31 c0                 xor    %eax,%eax                  <
  49 81 f8 ff ff 09 00  cmp    $0x9ffff,%r8               <
  77 be                 ja     <stdvga_mem_accept+0x2a>   <
  c3                    ret                                  c3                    ret

By moving the "p->count != 1" check ahead of the
ioreq_mmio_{first,last}_byte() calls, both multiplies disappear along with a
lot of surrounding logic.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/hvm/stdvga.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c
index d38d30affbff..c3c43f59eead 100644
--- a/xen/arch/x86/hvm/stdvga.c
+++ b/xen/arch/x86/hvm/stdvga.c
@@ -69,18 +69,14 @@ static int cf_check stdvga_mem_write(
 static bool cf_check stdvga_mem_accept(
     const struct hvm_io_handler *handler, const ioreq_t *p)
 {
-    if ( (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
+    /*
+     * Only accept single direct writes, as that's the only thing we can
+     * accelerate using buffered ioreq handling.
+     */
+    if ( p->dir != IOREQ_WRITE || p->data_is_ptr || p->count != 1 ||
+         (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
          (ioreq_mmio_last_byte(p) >= (VGA_MEM_BASE + VGA_MEM_SIZE)) )
-        return 0;
-
-    if ( p->dir != IOREQ_WRITE || p->data_is_ptr || p->count != 1 )
-    {
-        /*
-         * Only accept single direct writes, as that's the only thing we can
-         * accelerate using buffered ioreq handling.
-         */
         return false;
-    }
 
     return true;
 }

base-commit: 6e7f7a0c16c4d406bda6d4a900252ff63a7c5fad
-- 
2.39.2


Re: [PATCH] x86/hvm: Simplify stdvga_mem_accept() further
Posted by Jan Beulich 1 year, 1 month ago
On 12.09.2024 14:06, Andrew Cooper wrote:
> stdvga_mem_accept() is called on almost all IO emulations, and the
> overwhelming likely answer is to reject the ioreq.  Simply rearranging the
> expression yields an improvement:
> 
>   add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-57 (-57)
>   Function                                     old     new   delta
>   stdvga_mem_accept                            109      52     -57
> 
> which is best explained looking at the disassembly:
> 
>   Before:                                                    After:
>   f3 0f 1e fa           endbr64                              f3 0f 1e fa           endbr64
>   0f b6 4e 1e           movzbl 0x1e(%rsi),%ecx            |  0f b6 46 1e           movzbl 0x1e(%rsi),%eax
>   48 8b 16              mov    (%rsi),%rdx                |  31 d2                 xor    %edx,%edx
>   f6 c1 40              test   $0x40,%cl                  |  a8 30                 test   $0x30,%al
>   75 38                 jne    <stdvga_mem_accept+0x48>   |  75 23                 jne    <stdvga_mem_accept+0x31>
>   31 c0                 xor    %eax,%eax                  <
>   48 81 fa ff ff 09 00  cmp    $0x9ffff,%rdx              <
>   76 26                 jbe    <stdvga_mem_accept+0x41>   <
>   8b 46 14              mov    0x14(%rsi),%eax            <
>   8b 7e 10              mov    0x10(%rsi),%edi            <
>   48 0f af c7           imul   %rdi,%rax                  <
>   48 8d 54 02 ff        lea    -0x1(%rdx,%rax,1),%rdx     <
>   31 c0                 xor    %eax,%eax                  <
>   48 81 fa ff ff 0b 00  cmp    $0xbffff,%rdx              <
>   77 0c                 ja     <stdvga_mem_accept+0x41>   <
>   83 e1 30              and    $0x30,%ecx                 <
>   75 07                 jne    <stdvga_mem_accept+0x41>   <
>   83 7e 10 01           cmpl   $0x1,0x10(%rsi)               83 7e 10 01           cmpl   $0x1,0x10(%rsi)
>   0f 94 c0              sete   %al                        |  75 1d                 jne    <stdvga_mem_accept+0x31>
>   c3                    ret                               |  48 8b 0e              mov    (%rsi),%rcx
>   66 0f 1f 44 00 00     nopw   0x0(%rax,%rax,1)           |  48 81 f9 ff ff 09 00  cmp    $0x9ffff,%rcx
>   8b 46 10              mov    0x10(%rsi),%eax            |  76 11                 jbe    <stdvga_mem_accept+0x31>
>   8b 7e 14              mov    0x14(%rsi),%edi            |  8b 46 14              mov    0x14(%rsi),%eax
>   49 89 d0              mov    %rdx,%r8                   |  48 8d 44 01 ff        lea    -0x1(%rcx,%rax,1),%rax
>   48 83 e8 01           sub    $0x1,%rax                  |  48 3d ff ff 0b 00     cmp    $0xbffff,%rax
>   48 8d 54 3a ff        lea    -0x1(%rdx,%rdi,1),%rdx     |  0f 96 c2              setbe  %dl
>   48 0f af c7           imul   %rdi,%rax                  |  89 d0                 mov    %edx,%eax
>   49 29 c0              sub    %rax,%r8                   <
>   31 c0                 xor    %eax,%eax                  <
>   49 81 f8 ff ff 09 00  cmp    $0x9ffff,%r8               <
>   77 be                 ja     <stdvga_mem_accept+0x2a>   <
>   c3                    ret                                  c3                    ret
> 
> By moving the "p->count != 1" check ahead of the
> ioreq_mmio_{first,last}_byte() calls, both multiplies disappear along with a
> lot of surrounding logic.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>

> --- a/xen/arch/x86/hvm/stdvga.c
> +++ b/xen/arch/x86/hvm/stdvga.c
> @@ -69,18 +69,14 @@ static int cf_check stdvga_mem_write(
>  static bool cf_check stdvga_mem_accept(
>      const struct hvm_io_handler *handler, const ioreq_t *p)
>  {
> -    if ( (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
> +    /*
> +     * Only accept single direct writes, as that's the only thing we can
> +     * accelerate using buffered ioreq handling.
> +     */
> +    if ( p->dir != IOREQ_WRITE || p->data_is_ptr || p->count != 1 ||
> +         (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
>           (ioreq_mmio_last_byte(p) >= (VGA_MEM_BASE + VGA_MEM_SIZE)) )

Arguably the function calls are then pointless (as generated code proves),
but maybe keeping them for doc purposes is indeed worthwhile.

Jan
Re: [PATCH] x86/hvm: Simplify stdvga_mem_accept() further
Posted by Andrew Cooper 1 year, 1 month ago
On 12/09/2024 1:10 pm, Jan Beulich wrote:
> On 12.09.2024 14:06, Andrew Cooper wrote:
>> stdvga_mem_accept() is called on almost all IO emulations, and the
>> overwhelming likely answer is to reject the ioreq.  Simply rearranging the
>> expression yields an improvement:
>>
>>   add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-57 (-57)
>>   Function                                     old     new   delta
>>   stdvga_mem_accept                            109      52     -57
>>
>> which is best explained looking at the disassembly:
>>
>>   Before:                                                    After:
>>   f3 0f 1e fa           endbr64                              f3 0f 1e fa           endbr64
>>   0f b6 4e 1e           movzbl 0x1e(%rsi),%ecx            |  0f b6 46 1e           movzbl 0x1e(%rsi),%eax
>>   48 8b 16              mov    (%rsi),%rdx                |  31 d2                 xor    %edx,%edx
>>   f6 c1 40              test   $0x40,%cl                  |  a8 30                 test   $0x30,%al
>>   75 38                 jne    <stdvga_mem_accept+0x48>   |  75 23                 jne    <stdvga_mem_accept+0x31>
>>   31 c0                 xor    %eax,%eax                  <
>>   48 81 fa ff ff 09 00  cmp    $0x9ffff,%rdx              <
>>   76 26                 jbe    <stdvga_mem_accept+0x41>   <
>>   8b 46 14              mov    0x14(%rsi),%eax            <
>>   8b 7e 10              mov    0x10(%rsi),%edi            <
>>   48 0f af c7           imul   %rdi,%rax                  <
>>   48 8d 54 02 ff        lea    -0x1(%rdx,%rax,1),%rdx     <
>>   31 c0                 xor    %eax,%eax                  <
>>   48 81 fa ff ff 0b 00  cmp    $0xbffff,%rdx              <
>>   77 0c                 ja     <stdvga_mem_accept+0x41>   <
>>   83 e1 30              and    $0x30,%ecx                 <
>>   75 07                 jne    <stdvga_mem_accept+0x41>   <
>>   83 7e 10 01           cmpl   $0x1,0x10(%rsi)               83 7e 10 01           cmpl   $0x1,0x10(%rsi)
>>   0f 94 c0              sete   %al                        |  75 1d                 jne    <stdvga_mem_accept+0x31>
>>   c3                    ret                               |  48 8b 0e              mov    (%rsi),%rcx
>>   66 0f 1f 44 00 00     nopw   0x0(%rax,%rax,1)           |  48 81 f9 ff ff 09 00  cmp    $0x9ffff,%rcx
>>   8b 46 10              mov    0x10(%rsi),%eax            |  76 11                 jbe    <stdvga_mem_accept+0x31>
>>   8b 7e 14              mov    0x14(%rsi),%edi            |  8b 46 14              mov    0x14(%rsi),%eax
>>   49 89 d0              mov    %rdx,%r8                   |  48 8d 44 01 ff        lea    -0x1(%rcx,%rax,1),%rax
>>   48 83 e8 01           sub    $0x1,%rax                  |  48 3d ff ff 0b 00     cmp    $0xbffff,%rax
>>   48 8d 54 3a ff        lea    -0x1(%rdx,%rdi,1),%rdx     |  0f 96 c2              setbe  %dl
>>   48 0f af c7           imul   %rdi,%rax                  |  89 d0                 mov    %edx,%eax
>>   49 29 c0              sub    %rax,%r8                   <
>>   31 c0                 xor    %eax,%eax                  <
>>   49 81 f8 ff ff 09 00  cmp    $0x9ffff,%r8               <
>>   77 be                 ja     <stdvga_mem_accept+0x2a>   <
>>   c3                    ret                                  c3                    ret
>>
>> By moving the "p->count != 1" check ahead of the
>> ioreq_mmio_{first,last}_byte() calls, both multiplies disappear along with a
>> lot of surrounding logic.
>>
>> No functional change.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Thanks.

>
>> --- a/xen/arch/x86/hvm/stdvga.c
>> +++ b/xen/arch/x86/hvm/stdvga.c
>> @@ -69,18 +69,14 @@ static int cf_check stdvga_mem_write(
>>  static bool cf_check stdvga_mem_accept(
>>      const struct hvm_io_handler *handler, const ioreq_t *p)
>>  {
>> -    if ( (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
>> +    /*
>> +     * Only accept single direct writes, as that's the only thing we can
>> +     * accelerate using buffered ioreq handling.
>> +     */
>> +    if ( p->dir != IOREQ_WRITE || p->data_is_ptr || p->count != 1 ||
>> +         (ioreq_mmio_first_byte(p) < VGA_MEM_BASE) ||
>>           (ioreq_mmio_last_byte(p) >= (VGA_MEM_BASE + VGA_MEM_SIZE)) )
> Arguably the function calls are then pointless (as generated code proves),
> but maybe keeping them for doc purposes is indeed worthwhile.

They're static inlines, but the code is more readable this way IMO.

One thing that did occur to me though.  The compiler doesn't know that
p->df is only relevant for REP instructions, and the p->count == 1 case
is ambiguous.

I don't think we can do any better, considering that ioreq is a public type.

~Andrew