[PATCH v6 04/20] x86/cea: Export an API to get per CPU exception stacks for KVM to use

Xin Li (Intel) posted 20 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH v6 04/20] x86/cea: Export an API to get per CPU exception stacks for KVM to use
Posted by Xin Li (Intel) 1 month, 1 week ago
FRED introduced new fields in the host-state area of the VMCS for
stack levels 1->3 (HOST_IA32_FRED_RSP[123]), each respectively
corresponding to per CPU exception stacks for #DB, NMI and #DF.
KVM must populate these each time a vCPU is loaded onto a CPU.

Convert the __this_cpu_ist_{top,bottom}_va() macros into real
functions and export __this_cpu_ist_top_va().

Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
---

Change in v5:
* Export accessor instead of data (Christoph Hellwig).
* Add TB from Xuelian Guo.

Change in v4:
* Rewrite the change log and add comments to the export (Dave Hansen).
---
 arch/x86/coco/sev/sev-nmi.c           |  4 ++--
 arch/x86/coco/sev/vc-handle.c         |  2 +-
 arch/x86/include/asm/cpu_entry_area.h | 17 ++++-------------
 arch/x86/kernel/cpu/common.c          | 10 +++++-----
 arch/x86/kernel/fred.c                |  6 +++---
 arch/x86/kernel/traps.c               |  2 +-
 arch/x86/mm/cpu_entry_area.c          | 21 +++++++++++++++++++++
 arch/x86/mm/fault.c                   |  2 +-
 8 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c
index d8dfaddfb367..73e34ad7a1a9 100644
--- a/arch/x86/coco/sev/sev-nmi.c
+++ b/arch/x86/coco/sev/sev-nmi.c
@@ -30,7 +30,7 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
 	if (ip_within_syscall_gap(regs))
 		return false;
 
-	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+	return ((sp >= __this_cpu_ist_bottom_va(ESTACK_VC)) && (sp < __this_cpu_ist_top_va(ESTACK_VC)));
 }
 
 /*
@@ -82,7 +82,7 @@ void noinstr __sev_es_ist_exit(void)
 	/* Read IST entry */
 	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
 
-	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+	if (WARN_ON(ist == __this_cpu_ist_top_va(ESTACK_VC)))
 		return;
 
 	/* Read back old IST entry and write it to the TSS */
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index c3b4acbde0d8..88b6bc518a5a 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -859,7 +859,7 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
 
 static __always_inline bool is_vc2_stack(unsigned long sp)
 {
-	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
+	return (sp >= __this_cpu_ist_bottom_va(ESTACK_VC2) && sp < __this_cpu_ist_top_va(ESTACK_VC2));
 }
 
 static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 462fc34f1317..8e17f0ca74e6 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -46,7 +46,7 @@ struct cea_exception_stacks {
  * The exception stack ordering in [cea_]exception_stacks
  */
 enum exception_stack_ordering {
-	ESTACK_DF,
+	ESTACK_DF = 0,
 	ESTACK_NMI,
 	ESTACK_DB,
 	ESTACK_MCE,
@@ -58,18 +58,15 @@ enum exception_stack_ordering {
 #define CEA_ESTACK_SIZE(st)					\
 	sizeof(((struct cea_exception_stacks *)0)->st## _stack)
 
-#define CEA_ESTACK_BOT(ceastp, st)				\
-	((unsigned long)&(ceastp)->st## _stack)
-
-#define CEA_ESTACK_TOP(ceastp, st)				\
-	(CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
-
 #define CEA_ESTACK_OFFS(st)					\
 	offsetof(struct cea_exception_stacks, st## _stack)
 
 #define CEA_ESTACK_PAGES					\
 	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
 
+extern unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack);
+extern unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack);
+
 #endif
 
 #ifdef CONFIG_X86_32
@@ -144,10 +141,4 @@ static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
 	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
 }
 
-#define __this_cpu_ist_top_va(name)					\
-	CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
-
-#define __this_cpu_ist_bottom_va(name)					\
-	CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
-
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 34a054181c4d..cb14919f92da 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2307,12 +2307,12 @@ static inline void setup_getcpu(int cpu)
 static inline void tss_setup_ist(struct tss_struct *tss)
 {
 	/* Set up the per-CPU TSS IST stacks */
-	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
-	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
-	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
-	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(ESTACK_DF);
+	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(ESTACK_NMI);
+	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(ESTACK_DB);
+	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(ESTACK_MCE);
 	/* Only mapped when SEV-ES is active */
-	tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
+	tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(ESTACK_VC);
 }
 #else /* CONFIG_X86_64 */
 static inline void tss_setup_ist(struct tss_struct *tss) { }
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
index 816187da3a47..06d944a3d051 100644
--- a/arch/x86/kernel/fred.c
+++ b/arch/x86/kernel/fred.c
@@ -87,7 +87,7 @@ void cpu_init_fred_rsps(void)
 	       FRED_STKLVL(X86_TRAP_DF,  FRED_DF_STACK_LEVEL));
 
 	/* The FRED equivalents to IST stacks... */
-	wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
-	wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
-	wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+	wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(ESTACK_DB));
+	wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(ESTACK_NMI));
+	wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(ESTACK_DF));
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 36354b470590..5c9c5ebf5e73 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -954,7 +954,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
 
 	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
 	    info.type > STACK_TYPE_EXCEPTION_LAST)
-		sp = __this_cpu_ist_top_va(VC2);
+		sp = __this_cpu_ist_top_va(ESTACK_VC2);
 
 sync:
 	/*
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 575f863f3c75..eedaf103c8ad 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -18,6 +18,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage)
 static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
 DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
 
+/*
+ * FRED introduced new fields in the host-state area of the VMCS for
+ * stack levels 1->3 (HOST_IA32_FRED_RSP[123]), each respectively
+ * corresponding to per CPU stacks for #DB, NMI and #DF.  KVM must
+ * populate these each time a vCPU is loaded onto a CPU.
+ *
+ * Called from entry code, so must be noinstr.
+ */
+noinstr unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack)
+{
+	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
+	return base + EXCEPTION_STKSZ + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
+}
+EXPORT_SYMBOL(__this_cpu_ist_top_va);
+
+noinstr unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack)
+{
+	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
+	return base + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
+}
+
 static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
 
 static __always_inline unsigned int cea_offset(unsigned int cpu)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 998bd807fc7b..1804eb86cc14 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -671,7 +671,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		 * and then double-fault, though, because we're likely to
 		 * break the console driver and lose most of the stack dump.
 		 */
-		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
+		call_on_stack(__this_cpu_ist_top_va(ESTACK_DF) - sizeof(void*),
 			      handle_stack_overflow,
 			      ASM_CALL_ARG3,
 			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
-- 
2.50.1
Re: [PATCH v6 04/20] x86/cea: Export an API to get per CPU exception stacks for KVM to use
Posted by Dave Hansen 1 month, 1 week ago
On 8/21/25 15:36, Xin Li (Intel) wrote:
> FRED introduced new fields in the host-state area of the VMCS for
> stack levels 1->3 (HOST_IA32_FRED_RSP[123]), each respectively
> corresponding to per CPU exception stacks for #DB, NMI and #DF.
> KVM must populate these each time a vCPU is loaded onto a CPU.
> 
> Convert the __this_cpu_ist_{top,bottom}_va() macros into real
> functions and export __this_cpu_ist_top_va().
> 
> Suggested-by: Christoph Hellwig <hch@infradead.org>
> Suggested-by: Dave Hansen <dave.hansen@intel.com>

Nit: I wouldn't use Suggested-by unless the person basically asked for
the *entire* patch. Christoph and I were asking for specific bits of
this, but neither of us asked for this patch as a whole.

> diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c
> index d8dfaddfb367..73e34ad7a1a9 100644
> --- a/arch/x86/coco/sev/sev-nmi.c
> +++ b/arch/x86/coco/sev/sev-nmi.c
> @@ -30,7 +30,7 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
>  	if (ip_within_syscall_gap(regs))
>  		return false;
>  
> -	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
> +	return ((sp >= __this_cpu_ist_bottom_va(ESTACK_VC)) && (sp < __this_cpu_ist_top_va(ESTACK_VC)));
>  }

This rename is one of those things that had me scratching my head for a
minute. It wasn't obvious at _all_ why the VC=>ESTACK_VC "rename" is
necessary.

This needs to have been mentioned in the changelog.

Better yet would have been to do this in a separate patch because a big
chunk of this patch is just rename noise.

>  /*
> @@ -82,7 +82,7 @@ void noinstr __sev_es_ist_exit(void)
>  	/* Read IST entry */
>  	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
>  
> -	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
> +	if (WARN_ON(ist == __this_cpu_ist_top_va(ESTACK_VC)))
>  		return;
>  
>  	/* Read back old IST entry and write it to the TSS */
> diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
> index c3b4acbde0d8..88b6bc518a5a 100644
> --- a/arch/x86/coco/sev/vc-handle.c
> +++ b/arch/x86/coco/sev/vc-handle.c
> @@ -859,7 +859,7 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
>  
>  static __always_inline bool is_vc2_stack(unsigned long sp)
>  {
> -	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
> +	return (sp >= __this_cpu_ist_bottom_va(ESTACK_VC2) && sp < __this_cpu_ist_top_va(ESTACK_VC2));
>  }
>  
>  static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
> diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
> index 462fc34f1317..8e17f0ca74e6 100644
> --- a/arch/x86/include/asm/cpu_entry_area.h
> +++ b/arch/x86/include/asm/cpu_entry_area.h
> @@ -46,7 +46,7 @@ struct cea_exception_stacks {
>   * The exception stack ordering in [cea_]exception_stacks
>   */
>  enum exception_stack_ordering {
> -	ESTACK_DF,
> +	ESTACK_DF = 0,
>  	ESTACK_NMI,
>  	ESTACK_DB,
>  	ESTACK_MCE,

Is this really required? I thought the first enum was always 0? Is this
just trying to ensure that ESTACKS_MEMBERS() defines a matching number
of N_EXCEPTION_STACKS stacks?

If that's the case, shouldn't this be represented with a BUILD_BUG_ON()?

> @@ -58,18 +58,15 @@ enum exception_stack_ordering {
>  #define CEA_ESTACK_SIZE(st)					\
>  	sizeof(((struct cea_exception_stacks *)0)->st## _stack)
>  
> -#define CEA_ESTACK_BOT(ceastp, st)				\
> -	((unsigned long)&(ceastp)->st## _stack)
> -
> -#define CEA_ESTACK_TOP(ceastp, st)				\
> -	(CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
> -
>  #define CEA_ESTACK_OFFS(st)					\
>  	offsetof(struct cea_exception_stacks, st## _stack)
>  
>  #define CEA_ESTACK_PAGES					\
>  	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
>  
> +extern unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack);
> +extern unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack);
> +
>  #endif
>  
>  #ifdef CONFIG_X86_32
> @@ -144,10 +141,4 @@ static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
>  	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
>  }
>  
> -#define __this_cpu_ist_top_va(name)					\
> -	CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
> -
> -#define __this_cpu_ist_bottom_va(name)					\
> -	CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
> -
>  #endif
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 34a054181c4d..cb14919f92da 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2307,12 +2307,12 @@ static inline void setup_getcpu(int cpu)
>  static inline void tss_setup_ist(struct tss_struct *tss)
>  {
>  	/* Set up the per-CPU TSS IST stacks */
> -	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
> -	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
> -	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
> -	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
> +	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(ESTACK_DF);
> +	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(ESTACK_NMI);
> +	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(ESTACK_DB);
> +	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(ESTACK_MCE);

If you respin this, please vertically align these.

> +/*
> + * FRED introduced new fields in the host-state area of the VMCS for
> + * stack levels 1->3 (HOST_IA32_FRED_RSP[123]), each respectively
> + * corresponding to per CPU stacks for #DB, NMI and #DF.  KVM must
> + * populate these each time a vCPU is loaded onto a CPU.
> + *
> + * Called from entry code, so must be noinstr.
> + */
> +noinstr unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack)
> +{
> +	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
> +	return base + EXCEPTION_STKSZ + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
> +}
> +EXPORT_SYMBOL(__this_cpu_ist_top_va);
> +
> +noinstr unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack)
> +{
> +	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
> +	return base + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
> +}

These are basically treating 'struct exception_stacks' like an array.
There's no type safety or anything here. It's just an open-coded array
access.

Also, starting with ->DF_stack is a bit goofy looking. It's not obvious
(or enforced) that it is stack #0 or at the beginning of the structure.

Shouldn't we be _trying_ to make this look like:

	struct cea_exception_stacks *s;
	s = __this_cpu_read(cea_exception_stacks);

	return &s[stack_nr].stack;

?

Where 'cea_exception_stacks' is an actual array:

	struct cea_exception_stacks[N_EXCEPTION_STACKS];

which might need to be embedded in a larger structure to get the
'IST_top_guard' without wasting allocating space for an extra full stack.

>  static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
>  
>  static __always_inline unsigned int cea_offset(unsigned int cpu)
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 998bd807fc7b..1804eb86cc14 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -671,7 +671,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  		 * and then double-fault, though, because we're likely to
>  		 * break the console driver and lose most of the stack dump.
>  		 */
> -		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
> +		call_on_stack(__this_cpu_ist_top_va(ESTACK_DF) - sizeof(void*),
>  			      handle_stack_overflow,
>  			      ASM_CALL_ARG3,
>  			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
Re: [PATCH v6 04/20] x86/cea: Export an API to get per CPU exception stacks for KVM to use
Posted by Xin Li 1 month, 1 week ago
>> Suggested-by: Christoph Hellwig <hch@infradead.org>
>> Suggested-by: Dave Hansen <dave.hansen@intel.com>
> 
> Nit: I wouldn't use Suggested-by unless the person basically asked for
> the *entire* patch. Christoph and I were asking for specific bits of
> this, but neither of us asked for this patch as a whole.

I did it because the patch is almost rewritten to export accessors instead
of raw data, IOW, the way of doing it is completely changed.

But I will remove Suggested-by.

> 
>> diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c
>> index d8dfaddfb367..73e34ad7a1a9 100644
>> --- a/arch/x86/coco/sev/sev-nmi.c
>> +++ b/arch/x86/coco/sev/sev-nmi.c
>> @@ -30,7 +30,7 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
>>   	if (ip_within_syscall_gap(regs))
>>   		return false;
>>   
>> -	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
>> +	return ((sp >= __this_cpu_ist_bottom_va(ESTACK_VC)) && (sp < __this_cpu_ist_top_va(ESTACK_VC)));
>>   }
> 
> This rename is one of those things that had me scratching my head for a
> minute. It wasn't obvious at _all_ why the VC=>ESTACK_VC "rename" is
> necessary.
> 
> This needs to have been mentioned in the changelog.
> 
> Better yet would have been to do this in a separate patch because a big
> chunk of this patch is just rename noise.

Sure, will do.

>> diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
>> index 462fc34f1317..8e17f0ca74e6 100644
>> --- a/arch/x86/include/asm/cpu_entry_area.h
>> +++ b/arch/x86/include/asm/cpu_entry_area.h
>> @@ -46,7 +46,7 @@ struct cea_exception_stacks {
>>    * The exception stack ordering in [cea_]exception_stacks
>>    */
>>   enum exception_stack_ordering {
>> -	ESTACK_DF,
>> +	ESTACK_DF = 0,
>>   	ESTACK_NMI,
>>   	ESTACK_DB,
>>   	ESTACK_MCE,
> 
> Is this really required? I thought the first enum was always 0? Is this
> just trying to ensure that ESTACKS_MEMBERS() defines a matching number
> of N_EXCEPTION_STACKS stacks?
> 
> If that's the case, shouldn't this be represented with a BUILD_BUG_ON()?

Will do BUILD_BUG_ON().

> 
>> @@ -58,18 +58,15 @@ enum exception_stack_ordering {
>>   #define CEA_ESTACK_SIZE(st)					\
>>   	sizeof(((struct cea_exception_stacks *)0)->st## _stack)
>>   
>> -#define CEA_ESTACK_BOT(ceastp, st)				\
>> -	((unsigned long)&(ceastp)->st## _stack)
>> -
>> -#define CEA_ESTACK_TOP(ceastp, st)				\
>> -	(CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
>> -
>>   #define CEA_ESTACK_OFFS(st)					\
>>   	offsetof(struct cea_exception_stacks, st## _stack)
>>   
>>   #define CEA_ESTACK_PAGES					\
>>   	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
>>   
>> +extern unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack);
>> +extern unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack);
>> +
>>   #endif
>>   
>>   #ifdef CONFIG_X86_32
>> @@ -144,10 +141,4 @@ static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
>>   	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
>>   }
>>   
>> -#define __this_cpu_ist_top_va(name)					\
>> -	CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
>> -
>> -#define __this_cpu_ist_bottom_va(name)					\
>> -	CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
>> -
>>   #endif
>> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>> index 34a054181c4d..cb14919f92da 100644
>> --- a/arch/x86/kernel/cpu/common.c
>> +++ b/arch/x86/kernel/cpu/common.c
>> @@ -2307,12 +2307,12 @@ static inline void setup_getcpu(int cpu)
>>   static inline void tss_setup_ist(struct tss_struct *tss)
>>   {
>>   	/* Set up the per-CPU TSS IST stacks */
>> -	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
>> -	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
>> -	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
>> -	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
>> +	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(ESTACK_DF);
>> +	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(ESTACK_NMI);
>> +	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(ESTACK_DB);
>> +	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(ESTACK_MCE);
> 
> If you respin this, please vertically align these.

NP.

> 
>> +/*
>> + * FRED introduced new fields in the host-state area of the VMCS for
>> + * stack levels 1->3 (HOST_IA32_FRED_RSP[123]), each respectively
>> + * corresponding to per CPU stacks for #DB, NMI and #DF.  KVM must
>> + * populate these each time a vCPU is loaded onto a CPU.
>> + *
>> + * Called from entry code, so must be noinstr.
>> + */
>> +noinstr unsigned long __this_cpu_ist_top_va(enum exception_stack_ordering stack)
>> +{
>> +	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
>> +	return base + EXCEPTION_STKSZ + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
>> +}
>> +EXPORT_SYMBOL(__this_cpu_ist_top_va);
>> +
>> +noinstr unsigned long __this_cpu_ist_bottom_va(enum exception_stack_ordering stack)
>> +{
>> +	unsigned long base = (unsigned long)&(__this_cpu_read(cea_exception_stacks)->DF_stack);
>> +	return base + stack * (EXCEPTION_STKSZ + PAGE_SIZE);
>> +}
> 
> These are basically treating 'struct exception_stacks' like an array.
> There's no type safety or anything here. It's just an open-coded array
> access.
> 
> Also, starting with ->DF_stack is a bit goofy looking. It's not obvious
> (or enforced) that it is stack #0 or at the beginning of the structure.
> 
> Shouldn't we be _trying_ to make this look like:
> 
> 	struct cea_exception_stacks *s;
> 	s = __this_cpu_read(cea_exception_stacks);
> 
> 	return &s[stack_nr].stack;
> 
> ?
> 
> Where 'cea_exception_stacks' is an actual array:
> 
> 	struct cea_exception_stacks[N_EXCEPTION_STACKS];
> 
> which might need to be embedded in a larger structure to get the
> 'IST_top_guard' without wasting allocating space for an extra full stack.
> 

Good suggestion!

Thanks!
     Xin