[PATCH 1/4] x86/hvmloader: SMP improvements

Andrew Cooper posted 4 patches 3 years, 5 months ago
[PATCH 1/4] x86/hvmloader: SMP improvements
Posted by Andrew Cooper 3 years, 5 months ago
 * Use MOV CR instead of LMSW.  LMSW has no decode assist at all on AMD CPUs,
   forcing us to fully emulate the instruction.
 * Use __attribute__((used)) to fix the comment about ap_start().
 * Have ap_start() perform a self-INIT for APs, rather than having boot_cpu()
   do it.  This is marginally more parallel, and reduces the amount of remote
   vCPU management that Xen has to do on behalf of the guest.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 tools/firmware/hvmloader/smp.c | 46 ++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/tools/firmware/hvmloader/smp.c b/tools/firmware/hvmloader/smp.c
index 082b17f13818..80154950ac32 100644
--- a/tools/firmware/hvmloader/smp.c
+++ b/tools/firmware/hvmloader/smp.c
@@ -35,9 +35,9 @@ asm (
     "    mov   %cs,%ax               \n"
     "    mov   %ax,%ds               \n"
     "    lgdt  gdt_desr-ap_boot_start\n"
-    "    xor   %ax, %ax              \n"
-    "    inc   %ax                   \n"
-    "    lmsw  %ax                   \n"
+    "    mov   %cr0, %eax            \n"
+    "    or    $1, %al               \n"
+    "    mov   %eax, %cr0            \n"
     "    ljmpl $0x08,$1f             \n"
     "gdt_desr:                       \n"
     "    .word gdt_end - gdt - 1     \n"
@@ -50,8 +50,6 @@ asm (
     "    movl  $stack_top,%esp       \n"
     "    movl  %esp,%ebp             \n"
     "    call  ap_start              \n"
-    "1:  hlt                         \n"
-    "    jmp  1b                     \n"
     "                                \n"
     "    .align 8                    \n"
     "gdt:                            \n"
@@ -68,14 +66,37 @@ asm (
     "    .text                       \n"
     );
 
-void ap_start(void); /* non-static avoids unused-function compiler warning */
-/*static*/ void ap_start(void)
+static void __attribute__((used)) ap_start(void)
 {
-    printf(" - CPU%d ... ", ap_cpuid);
+    unsigned int cpu = ap_cpuid;
+
+    printf(" - CPU%d ... ", cpu);
     cacheattr_init();
     printf("done.\n");
-    wmb();
-    ap_callin = 1;
+
+    /*
+     * Call in to the BSP.  For APs, take ourselves offline.
+     *
+     * We must not use the stack after calling in to the BSP.
+     */
+    asm volatile (
+        "    movb $1, ap_callin          \n"
+
+        "    test %[icr2], %[icr2]       \n"
+        "    jz   .Lbsp                  \n"
+
+        "    movl %[icr2], %[ICR2]       \n"
+        "    movl %[init], %[ICR1]       \n"
+        "1:  hlt                         \n"
+        "    jmp  1b                     \n"
+
+        ".Lbsp:                          \n"
+        :
+        : [icr2] "r" (SET_APIC_DEST_FIELD(LAPIC_ID(cpu))),
+          [init] "i" (APIC_DM_INIT),
+          [ICR1] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR)),
+          [ICR2] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR2))
+        : "memory" );
 }
 
 static void lapic_wait_ready(void)
@@ -111,11 +132,6 @@ static void boot_cpu(unsigned int cpu)
      */
     while ( !ap_callin )
         cpu_relax();
-
-    /* Take the secondary processor offline. */
-    lapic_write(APIC_ICR2, icr2);
-    lapic_write(APIC_ICR, APIC_DM_INIT);
-    lapic_wait_ready();    
 }
 
 void smp_initialise(void)
-- 
2.11.0


Re: [PATCH 1/4] x86/hvmloader: SMP improvements
Posted by Jan Beulich 3 years, 5 months ago
On 24.08.2022 12:59, Andrew Cooper wrote:
> --- a/tools/firmware/hvmloader/smp.c
> +++ b/tools/firmware/hvmloader/smp.c
> @@ -35,9 +35,9 @@ asm (
>      "    mov   %cs,%ax               \n"
>      "    mov   %ax,%ds               \n"
>      "    lgdt  gdt_desr-ap_boot_start\n"
> -    "    xor   %ax, %ax              \n"
> -    "    inc   %ax                   \n"
> -    "    lmsw  %ax                   \n"
> +    "    mov   %cr0, %eax            \n"
> +    "    or    $1, %al               \n"
> +    "    mov   %eax, %cr0            \n"

Hmm, yes, read-modify-write should probably have been used from
the beginning, irrespective of using 286 or 386 insns.

> @@ -68,14 +66,37 @@ asm (
>      "    .text                       \n"
>      );
>  
> -void ap_start(void); /* non-static avoids unused-function compiler warning */
> -/*static*/ void ap_start(void)
> +static void __attribute__((used)) ap_start(void)
>  {
> -    printf(" - CPU%d ... ", ap_cpuid);
> +    unsigned int cpu = ap_cpuid;
> +
> +    printf(" - CPU%d ... ", cpu);
>      cacheattr_init();
>      printf("done.\n");
> -    wmb();

Is there a reason you remove this barrier but not the one in boot_cpu()?

> -    ap_callin = 1;
> +
> +    /*
> +     * Call in to the BSP.  For APs, take ourselves offline.
> +     *
> +     * We must not use the stack after calling in to the BSP.
> +     */
> +    asm volatile (
> +        "    movb $1, ap_callin          \n"
> +
> +        "    test %[icr2], %[icr2]       \n"
> +        "    jz   .Lbsp                  \n"

Are we intending to guarantee going forward that the BSP always has
APIC ID zero?

> +        "    movl %[icr2], %[ICR2]       \n"
> +        "    movl %[init], %[ICR1]       \n"
> +        "1:  hlt                         \n"
> +        "    jmp  1b                     \n"

The use of the function for the BSP is questionable anyway. What is
really needed is the call to cacheattr_init(). I'm inclined to
suggest to move to something like

void smp_initialise(void)
{
    unsigned int i, nr_cpus = hvm_info->nr_vcpus;

    cacheattr_init();

    if ( nr_cpus <= 1 )
        return;

    memcpy((void *)AP_BOOT_EIP, ap_boot_start, ap_boot_end - ap_boot_start);

    printf("Multiprocessor initialisation:\n");
    for ( i = 1; i < nr_cpus; i++ )
        boot_cpu(i);
}

thus eliminating bogus output when there's just one vCPU. 

Then the function here can become noreturn (which I was about to suggest
until spotting that for the BSP the function actually does return).

> +        ".Lbsp:                          \n"
> +        :
> +        : [icr2] "r" (SET_APIC_DEST_FIELD(LAPIC_ID(cpu))),
> +          [init] "i" (APIC_DM_INIT),
> +          [ICR1] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR)),
> +          [ICR2] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR2))
> +        : "memory" );

Can't you use APIC_DEST_SELF now, avoiding the need to fiddle
with ICR2?

Jan
Re: [PATCH 1/4] x86/hvmloader: SMP improvements
Posted by Andrew Cooper 2 years, 10 months ago
On 24/08/2022 3:21 pm, Jan Beulich wrote:
> On 24.08.2022 12:59, Andrew Cooper wrote:
>
>> -    ap_callin = 1;
>> +
>> +    /*
>> +     * Call in to the BSP.  For APs, take ourselves offline.
>> +     *
>> +     * We must not use the stack after calling in to the BSP.
>> +     */
>> +    asm volatile (
>> +        "    movb $1, ap_callin          \n"
>> +
>> +        "    test %[icr2], %[icr2]       \n"
>> +        "    jz   .Lbsp                  \n"
>
> Are we intending to guarantee going forward that the BSP always has
> APIC ID zero?

It's currently true, and I doubt that will change, but I prefer the
suggestion to not call this at all on the BSP.

>
>> +        "    movl %[icr2], %[ICR2]       \n"
>> +        "    movl %[init], %[ICR1]       \n"
>> +        "1:  hlt                         \n"
>> +        "    jmp  1b                     \n"
>
> The use of the function for the BSP is questionable anyway. What is
> really needed is the call to cacheattr_init(). I'm inclined to
> suggest to move to something like
>
> void smp_initialise(void)
> {
>    unsigned int i, nr_cpus = hvm_info->nr_vcpus;
>
>    cacheattr_init();
>
>    if ( nr_cpus <= 1 )
>        return;
>
>    memcpy((void *)AP_BOOT_EIP, ap_boot_start, ap_boot_end -
> ap_boot_start);
>
>    printf("Multiprocessor initialisation:\n");
>    for ( i = 1; i < nr_cpus; i++ )
>        boot_cpu(i);
> }
>
> thus eliminating bogus output when there's just one vCPU.
> Then the function here can become noreturn (which I was about to suggest
> until spotting that for the BSP the function actually does return).

Dropping the printk() isn't nice, because you'll then get unqualified
information from cacheattr_init().

I'll see if I can rearrange this a bit more nicely.

>
>> +        ".Lbsp:                          \n"
>> +        :
>> +        : [icr2] "r" (SET_APIC_DEST_FIELD(LAPIC_ID(cpu))),
>> +          [init] "i" (APIC_DM_INIT),
>> +          [ICR1] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR)),
>> +          [ICR2] "m" (*(uint32_t *)(LAPIC_BASE_ADDRESS + APIC_ICR2))
>> +        : "memory" );
>
> Can't you use APIC_DEST_SELF now, avoiding the need to fiddle
> with ICR2?

No.  Fixed is the only message type which can use self or all-inc-self. 
All others are only permitted to use the all-excluding-self.

This makes sense as a consequence of likely shortcuts taking when
integrating the LAPIC into the core.  Either way, it's documented
behaviour now, however inconvenient this is for this case.

~Andrew