[PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Andrew Cooper posted 1 patch 2 weeks ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/xen tags/patchew/20200722101809.8389-1-andrew.cooper3@citrix.com
xen/arch/x86/hvm/hvm.c         | 16 ++--------------
xen/arch/x86/msr.c             | 16 ++++++++++++++++
xen/arch/x86/pv/emul-priv-op.c | 15 ---------------
3 files changed, 18 insertions(+), 29 deletions(-)

[PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Andrew Cooper 2 weeks ago
... rather than from the default clauses of the PV and HVM MSR handlers.

This means that we no longer take the vmce lock for any unknown MSR, and
accesses to architectural MCE banks outside of the subset implemented for the
guest no longer fall further through the unknown MSR path.

With the vmce calls removed, the hvm alternative_call()'s expression can be
simplified substantially.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wl@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/hvm/hvm.c         | 16 ++--------------
 xen/arch/x86/msr.c             | 16 ++++++++++++++++
 xen/arch/x86/pv/emul-priv-op.c | 15 ---------------
 3 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 5bb47583b3..a9d1685549 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3560,13 +3560,7 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
          break;
 
     default:
-        if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 )
-            goto gp_fault;
-        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
-        ret = ((ret == 0)
-               ? alternative_call(hvm_funcs.msr_read_intercept,
-                                  msr, msr_content)
-               : X86EMUL_OKAY);
+        ret = alternative_call(hvm_funcs.msr_read_intercept, msr, msr_content);
         break;
     }
 
@@ -3696,13 +3690,7 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
         break;
 
     default:
-        if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 )
-            goto gp_fault;
-        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
-        ret = ((ret == 0)
-               ? alternative_call(hvm_funcs.msr_write_intercept,
-                                  msr, msr_content)
-               : X86EMUL_OKAY);
+        ret = alternative_call(hvm_funcs.msr_write_intercept, msr, msr_content);
         break;
     }
 
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 22f921cc71..ca4307e19f 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -227,6 +227,14 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
         *val = msrs->misc_features_enables.raw;
         break;
 
+    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
+    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
+    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
+    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
+        if ( vmce_rdmsr(msr, val) < 0 )
+            goto gp_fault;
+        break;
+
     case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
         if ( !is_hvm_domain(d) || v != curr )
             goto gp_fault;
@@ -436,6 +444,14 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
         break;
     }
 
+    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
+    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
+    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
+    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
+        if ( vmce_wrmsr(msr, val) < 0 )
+            goto gp_fault;
+        break;
+
     case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
         if ( !is_hvm_domain(d) || v != curr )
             goto gp_fault;
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 254da2b849..f14552cb4b 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -855,8 +855,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
 
     switch ( reg )
     {
-        int rc;
-
     case MSR_FS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
@@ -955,12 +953,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
         }
         /* fall through */
     default:
-        rc = vmce_rdmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-        /* fall through */
     normal:
         /* Everyone can read the MSR space. */
         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -991,7 +983,6 @@ static int write_msr(unsigned int reg, uint64_t val,
     switch ( reg )
     {
         uint64_t temp;
-        int rc;
 
     case MSR_FS_BASE:
         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
@@ -1122,12 +1113,6 @@ static int write_msr(unsigned int reg, uint64_t val,
         }
         /* fall through */
     default:
-        rc = vmce_wrmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-
         if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
     invalid:
             gdprintk(XENLOG_WARNING,
-- 
2.11.0


Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Jan Beulich 2 weeks ago
On 22.07.2020 12:18, Andrew Cooper wrote:
> --- a/xen/arch/x86/msr.c
> +++ b/xen/arch/x86/msr.c
> @@ -227,6 +227,14 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
>          *val = msrs->misc_features_enables.raw;
>          break;
>  
> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
> +        if ( vmce_rdmsr(msr, val) < 0 )
> +            goto gp_fault;
> +        break;
> +
>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>          if ( !is_hvm_domain(d) || v != curr )
>              goto gp_fault;
> @@ -436,6 +444,14 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>          break;
>      }
>  
> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
> +        if ( vmce_wrmsr(msr, val) < 0 )
> +            goto gp_fault;
> +        break;
> +
>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>          if ( !is_hvm_domain(d) || v != curr )
>              goto gp_fault;

With this, the two functions also possibly returning 0 or 1 becomes
meaningless. Do you think you can make them return bool on this
occasion, or would you prefer to leave this to whenever someone
gets to clean up this resulting anomaly? (I'm fine either way, but
would prefer to not see the then meaningless tristate return values
left in place.)

Jan

Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Andrew Cooper 2 weeks ago
On 23/07/2020 11:37, Jan Beulich wrote:
> On 22.07.2020 12:18, Andrew Cooper wrote:
>> --- a/xen/arch/x86/msr.c
>> +++ b/xen/arch/x86/msr.c
>> @@ -227,6 +227,14 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
>>          *val = msrs->misc_features_enables.raw;
>>          break;
>>  
>> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
>> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
>> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
>> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
>> +        if ( vmce_rdmsr(msr, val) < 0 )
>> +            goto gp_fault;
>> +        break;
>> +
>>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>>          if ( !is_hvm_domain(d) || v != curr )
>>              goto gp_fault;
>> @@ -436,6 +444,14 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>>          break;
>>      }
>>  
>> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
>> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
>> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
>> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
>> +        if ( vmce_wrmsr(msr, val) < 0 )
>> +            goto gp_fault;
>> +        break;
>> +
>>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>>          if ( !is_hvm_domain(d) || v != curr )
>>              goto gp_fault;
> With this, the two functions also possibly returning 0 or 1 becomes
> meaningless. Do you think you can make them return bool on this
> occasion, or would you prefer to leave this to whenever someone
> gets to clean up this resulting anomaly? (I'm fine either way, but
> would prefer to not see the then meaningless tristate return values
> left in place.)

The entire internals of vmce_{wr,rd}msr() need an overhaul.

I tried switching them to use X86EMUL_* (at which point they will match
all the other subsystems we hand off MSR blocks to), but that quickly
turned into a larger mess than I have time for right now.  I've still
got the partial work so far, and will finish it at some point.

~Andrew

Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Roger Pau Monné 2 weeks ago
On Wed, Jul 22, 2020 at 11:18:09AM +0100, Andrew Cooper wrote:
> ... rather than from the default clauses of the PV and HVM MSR handlers.
> 
> This means that we no longer take the vmce lock for any unknown MSR, and
> accesses to architectural MCE banks outside of the subset implemented for the
> guest no longer fall further through the unknown MSR path.
> 
> With the vmce calls removed, the hvm alternative_call()'s expression can be
> simplified substantially.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

LGTM, I just have one question below regarding the ranges.

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Wei Liu <wl@xen.org>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> ---
>  xen/arch/x86/hvm/hvm.c         | 16 ++--------------
>  xen/arch/x86/msr.c             | 16 ++++++++++++++++
>  xen/arch/x86/pv/emul-priv-op.c | 15 ---------------
>  3 files changed, 18 insertions(+), 29 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 5bb47583b3..a9d1685549 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -3560,13 +3560,7 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
>           break;
>  
>      default:
> -        if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 )
> -            goto gp_fault;
> -        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
> -        ret = ((ret == 0)
> -               ? alternative_call(hvm_funcs.msr_read_intercept,
> -                                  msr, msr_content)
> -               : X86EMUL_OKAY);
> +        ret = alternative_call(hvm_funcs.msr_read_intercept, msr, msr_content);
>          break;
>      }
>  
> @@ -3696,13 +3690,7 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
>          break;
>  
>      default:
> -        if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 )
> -            goto gp_fault;
> -        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
> -        ret = ((ret == 0)
> -               ? alternative_call(hvm_funcs.msr_write_intercept,
> -                                  msr, msr_content)
> -               : X86EMUL_OKAY);
> +        ret = alternative_call(hvm_funcs.msr_write_intercept, msr, msr_content);
>          break;
>      }
>  
> diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
> index 22f921cc71..ca4307e19f 100644
> --- a/xen/arch/x86/msr.c
> +++ b/xen/arch/x86/msr.c
> @@ -227,6 +227,14 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
>          *val = msrs->misc_features_enables.raw;
>          break;
>  
> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */

Where do you get the ranges from 0 to 31? It seems like the count
field in the CAP register is 8 bits, which could allow for up to 256
banks?

I'm quite sure this would then overlap with other MSRs?

> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
> +        if ( vmce_rdmsr(msr, val) < 0 )
> +            goto gp_fault;
> +        break;
> +
>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>          if ( !is_hvm_domain(d) || v != curr )
>              goto gp_fault;
> @@ -436,6 +444,14 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>          break;
>      }
>  
> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
> +        if ( vmce_wrmsr(msr, val) < 0 )
> +            goto gp_fault;
> +        break;
> +
>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>          if ( !is_hvm_domain(d) || v != curr )
>              goto gp_fault;
> diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
> index 254da2b849..f14552cb4b 100644
> --- a/xen/arch/x86/pv/emul-priv-op.c
> +++ b/xen/arch/x86/pv/emul-priv-op.c
> @@ -855,8 +855,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
>  
>      switch ( reg )
>      {
> -        int rc;
> -
>      case MSR_FS_BASE:
>          if ( is_pv_32bit_domain(currd) )
>              break;
> @@ -955,12 +953,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
>          }
>          /* fall through */
>      default:
> -        rc = vmce_rdmsr(reg, val);
> -        if ( rc < 0 )
> -            break;
> -        if ( rc )
> -            return X86EMUL_OKAY;
> -        /* fall through */
>      normal:

We could remove the 'normal' label and just use the default one
instead.

Thanks, Roger.

Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Andrew Cooper 2 weeks ago
On 23/07/2020 11:07, Roger Pau Monné wrote:
> On Wed, Jul 22, 2020 at 11:18:09AM +0100, Andrew Cooper wrote:
>> ... rather than from the default clauses of the PV and HVM MSR handlers.
>>
>> This means that we no longer take the vmce lock for any unknown MSR, and
>> accesses to architectural MCE banks outside of the subset implemented for the
>> guest no longer fall further through the unknown MSR path.
>>
>> With the vmce calls removed, the hvm alternative_call()'s expression can be
>> simplified substantially.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> LGTM, I just have one question below regarding the ranges.
>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Wei Liu <wl@xen.org>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> ---
>>  xen/arch/x86/hvm/hvm.c         | 16 ++--------------
>>  xen/arch/x86/msr.c             | 16 ++++++++++++++++
>>  xen/arch/x86/pv/emul-priv-op.c | 15 ---------------
>>  3 files changed, 18 insertions(+), 29 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
>> index 5bb47583b3..a9d1685549 100644
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -3560,13 +3560,7 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
>>           break;
>>  
>>      default:
>> -        if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 )
>> -            goto gp_fault;
>> -        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
>> -        ret = ((ret == 0)
>> -               ? alternative_call(hvm_funcs.msr_read_intercept,
>> -                                  msr, msr_content)
>> -               : X86EMUL_OKAY);
>> +        ret = alternative_call(hvm_funcs.msr_read_intercept, msr, msr_content);
>>          break;
>>      }
>>  
>> @@ -3696,13 +3690,7 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
>>          break;
>>  
>>      default:
>> -        if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 )
>> -            goto gp_fault;
>> -        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
>> -        ret = ((ret == 0)
>> -               ? alternative_call(hvm_funcs.msr_write_intercept,
>> -                                  msr, msr_content)
>> -               : X86EMUL_OKAY);
>> +        ret = alternative_call(hvm_funcs.msr_write_intercept, msr, msr_content);
>>          break;
>>      }
>>  
>> diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
>> index 22f921cc71..ca4307e19f 100644
>> --- a/xen/arch/x86/msr.c
>> +++ b/xen/arch/x86/msr.c
>> @@ -227,6 +227,14 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
>>          *val = msrs->misc_features_enables.raw;
>>          break;
>>  
>> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
>> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
>> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
> Where do you get the ranges from 0 to 31? It seems like the count
> field in the CAP register is 8 bits, which could allow for up to 256
> banks?
>
> I'm quite sure this would then overlap with other MSRs?

Irritatingly, nothing I can find actually states an upper architectural
limit.

SDM Vol4, Table 2-2 which enumerates the Architectural MSRs.

0x280 thru 0x29f are explicitly reserved for MCx_CTL2, which is a limit of
32 banks.  There are gaps after this in the architectural table, but
IceLake has PRMRR_BASE_0 at 0x2a0.

The main banks of MCx_{CTL,STATUS,ADDR,MISC} start at 0x400 and are
listed in the table up to 0x473, which is a limit of 29 banks.  The
Model specific table for SandyBridge fills in the remaining 3 banks up
to MSR 0x47f, which is the previous limit of 32 banks.  (These MSRs have
package scope rather than core/thread scope, but they are still
enumerated architecturally so I'm not sure why they are in the model
specific tables.)

More importantly however, the VMX MSR range starts at 0x480, immediately
above bank 31, which puts an architectural hard limit on the number of
banks.

>> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
>> +        if ( vmce_rdmsr(msr, val) < 0 )
>> +            goto gp_fault;
>> +        break;
>> +
>>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>>          if ( !is_hvm_domain(d) || v != curr )
>>              goto gp_fault;
>> @@ -436,6 +444,14 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>>          break;
>>      }
>>  
>> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
>> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
>> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
>> +    case MSR_IA32_MCG_EXT_CTL:                           /* 0x4d0 */
>> +        if ( vmce_wrmsr(msr, val) < 0 )
>> +            goto gp_fault;
>> +        break;
>> +
>>      case MSR_X2APIC_FIRST ... MSR_X2APIC_LAST:
>>          if ( !is_hvm_domain(d) || v != curr )
>>              goto gp_fault;
>> diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
>> index 254da2b849..f14552cb4b 100644
>> --- a/xen/arch/x86/pv/emul-priv-op.c
>> +++ b/xen/arch/x86/pv/emul-priv-op.c
>> @@ -855,8 +855,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
>>  
>>      switch ( reg )
>>      {
>> -        int rc;
>> -
>>      case MSR_FS_BASE:
>>          if ( is_pv_32bit_domain(currd) )
>>              break;
>> @@ -955,12 +953,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
>>          }
>>          /* fall through */
>>      default:
>> -        rc = vmce_rdmsr(reg, val);
>> -        if ( rc < 0 )
>> -            break;
>> -        if ( rc )
>> -            return X86EMUL_OKAY;
>> -        /* fall through */
>>      normal:
> We could remove the 'normal' label and just use the default one
> instead.

You can't "goto default;" which is what a number of paths between these
two hunks do.

I do however have a plan to clean this up in due course.

~Andrew

Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Roger Pau Monné 2 weeks ago
On Thu, Jul 23, 2020 at 12:00:53PM +0100, Andrew Cooper wrote:
> On 23/07/2020 11:07, Roger Pau Monné wrote:
> > On Wed, Jul 22, 2020 at 11:18:09AM +0100, Andrew Cooper wrote:
> >> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
> >> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
> >> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
> > Where do you get the ranges from 0 to 31? It seems like the count
> > field in the CAP register is 8 bits, which could allow for up to 256
> > banks?
> >
> > I'm quite sure this would then overlap with other MSRs?
> 
> Irritatingly, nothing I can find actually states an upper architectural
> limit.
> 
> SDM Vol4, Table 2-2 which enumerates the Architectural MSRs.
> 
> 0x280 thru 0x29f are explicitly reserved MCx_CTL2, which is a limit of
> 32 banks.  There are gaps after this in the architectural table, but
> IceLake has PRMRR_BASE_0 at 0x2a0.
> 
> The main bank of MCx_{CTL,STATUS,ADDR,MISC} start at 0x400 and are
> listed in the table up to 0x473, which is a limit of 29 banks.  The
> Model specific table for SandyBridge fills in the remaining 3 banks up
> to MSR 0x47f, which is the previous limit of 32 banks.  (These MSRs have
> package scope rather than core/thread scope, but they are still
> enumerated architecturally so I'm not sure why they are in the model
> specific tables.)
> 
> More importantly however, the VMX MSR range starts at 0x480, immediately
> above bank 31, which puts an architectural hard limit on the number of
> banks.

Yes, I realized that the VMX MSRs start at 0x480, which limits the
number of banks. Maybe add a small comment noting that, although the
count in the CAP register could go up to 256, 32 is the actual limit
due to how the MSRs are arranged?

Note there's also GUEST_MC_BANK_NUM which is the actual implementation
limit in Xen AFAICT, maybe using it here would be clearer? (and limit
the ranges forwarded to vmce_rdmsr)

Thanks, Roger.

Re: [PATCH] x86/vmce: Dispatch vmce_{rd,wr}msr() from guest_{rd,wr}msr()

Posted by Andrew Cooper 2 weeks ago
On 23/07/2020 12:30, Roger Pau Monné wrote:
> On Thu, Jul 23, 2020 at 12:00:53PM +0100, Andrew Cooper wrote:
>> On 23/07/2020 11:07, Roger Pau Monné wrote:
>>> On Wed, Jul 22, 2020 at 11:18:09AM +0100, Andrew Cooper wrote:
>>>> +    case MSR_IA32_MCG_CAP     ... MSR_IA32_MCG_CTL:      /* 0x179 -> 0x17b */
>>>> +    case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
>>>> +    case MSR_IA32_MCx_CTL(0)  ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
>>> Where do you get the ranges from 0 to 31? It seems like the count
>>> field in the CAP register is 8 bits, which could allow for up to 256
>>> banks?
>>>
>>> I'm quite sure this would then overlap with other MSRs?
>> Irritatingly, nothing I can find actually states an upper architectural
>> limit.
>>
>> SDM Vol4, Table 2-2 which enumerates the Architectural MSRs.
>>
>> 0x280 thru 0x29f are explicitly reserved MCx_CTL2, which is a limit of
>> 32 banks.  There are gaps after this in the architectural table, but
>> IceLake has PRMRR_BASE_0 at 0x2a0.
>>
>> The main bank of MCx_{CTL,STATUS,ADDR,MISC} start at 0x400 and are
>> listed in the table up to 0x473, which is a limit of 29 banks.  The
>> Model specific table for SandyBridge fills in the remaining 3 banks up
>> to MSR 0x47f, which is the previous limit of 32 banks.  (These MSRs have
>> package scope rather than core/thread scope, but they are still
>> enumerated architecturally so I'm not sure why they are in the model
>> specific tables.)
>>
>> More importantly however, the VMX MSR range starts at 0x480, immediately
>> above bank 31, which puts an architectural hard limit on the number of
>> banks.
> Yes, realized about the VMX MSRs starting at 0x480, which limits the
> number of banks. Maybe a small comment about the fact that albeit the
> count in the CAP register could go up to 256 32 is the actual limit
> due to how MSRs are arranged?

Ok.  I've added:

The bank limit of 32 isn't stated anywhere I can locate, but is a
consequence of the MSR layout described in SDM Volume 4.

as another paragraph to the commit message.

> Note there's also GUEST_MC_BANK_NUM which is the actual implementation
> limit in Xen AFAICT, maybe using it here would be clearer? (and limit
> the ranges forwarded to vmce_rdmsr)

First, there is a note saying that older versions of Xen advertised more
than 2 banks to guests (and therefore we might see such a guest migrated
in), and second, capturing all MSRs is a bug I'm specifically fixing
with this change, and was called out in the commit message.

These MSRs, even beyond the banks implemented by the guest, are still
MCE banks, and need handling appropriately as "out of range banks",
which isn't necessarily the same as falling into the general default MSR
path.

~Andrew