[RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level

Yan Zhao posted 21 patches 7 months, 3 weeks ago
Only 20 patches received!
There is a newer version of this series
[RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Yan Zhao 7 months, 3 weeks ago
Determine the max mapping level of a private GFN according to the vCPU's
ACCEPT level specified in the TDCALL TDG.MEM.PAGE.ACCEPT.

When an EPT violation occurs due to a vCPU invoking TDG.MEM.PAGE.ACCEPT
before any actual memory access, the vCPU's ACCEPT level is available in
the extended exit qualification. Set the vCPU's ACCEPT level as the max
mapping level for the faulting GFN. This is necessary because if KVM
specifies a mapping level greater than the vCPU's ACCEPT level, and no
other vCPUs are accepting at KVM's mapping level, TDG.MEM.PAGE.ACCEPT will
produce another EPT violation on the vCPU after re-entering the TD, with
the vCPU's ACCEPT level indicated in the extended exit qualification.

Introduce "violation_gfn_start", "violation_gfn_end", and
"violation_request_level" in "struct vcpu_tdx" to pass the vCPU's ACCEPT
level to TDX's private_max_mapping_level hook for determining the max
mapping level.

Instead of taking some bits of the error_code passed to
kvm_mmu_page_fault() and requiring KVM MMU core to check the error_code for
a fault's max_level, having TDX's private_max_mapping_level hook check for
request level avoids changes to the KVM MMU core. This approach also
accommodates future scenarios where the requested mapping level is unknown
at the start of tdx_handle_ept_violation() (i.e., before invoking
kvm_mmu_page_fault()).

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/kvm/vmx/tdx.c      | 36 +++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.h      |  4 ++++
 arch/x86/kvm/vmx/tdx_arch.h |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 86775af85cd8..dd63a634e633 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1859,10 +1859,34 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
 }
 
+static inline void tdx_get_accept_level(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	int level = -1;
+
+	u64 eeq_type = tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
+
+	u32 eeq_info = (tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_INFO_MASK) >>
+			TDX_EXT_EXIT_QUAL_INFO_SHIFT;
+
+	if (eeq_type == TDX_EXT_EXIT_QUAL_TYPE_ACCEPT) {
+		level = (eeq_info & GENMASK(2, 0)) + 1;
+
+		tdx->violation_gfn_start = gfn_round_for_level(gpa_to_gfn(gpa), level);
+		tdx->violation_gfn_end = tdx->violation_gfn_start + KVM_PAGES_PER_HPAGE(level);
+		tdx->violation_request_level = level;
+	} else {
+		tdx->violation_gfn_start = -1;
+		tdx->violation_gfn_end = -1;
+		tdx->violation_request_level = -1;
+	}
+}
+
 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qual;
-	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	gpa_t gpa = tdx->exit_gpa;
 	bool local_retry = false;
 	int ret;
 
@@ -1884,6 +1908,8 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
 		 */
 		exit_qual = EPT_VIOLATION_ACC_WRITE;
 
+		tdx_get_accept_level(vcpu, gpa);
+
 		/* Only private GPA triggers zero-step mitigation */
 		local_retry = true;
 	} else {
@@ -2917,6 +2943,9 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
 
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
+	tdx->violation_gfn_start = -1;
+	tdx->violation_gfn_end = -1;
+	tdx->violation_request_level = -1;
 	return 0;
 
 free_tdcx:
@@ -3260,9 +3289,14 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 
 int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn)
 {
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
 	if (unlikely(to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
 		return PG_LEVEL_4K;
 
+	if (gfn >= tdx->violation_gfn_start && gfn < tdx->violation_gfn_end)
+		return tdx->violation_request_level;
+
 	return PG_LEVEL_2M;
 }
 
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 51f98443e8a2..6e13895813c5 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -70,6 +70,10 @@ struct vcpu_tdx {
 
 	u64 map_gpa_next;
 	u64 map_gpa_end;
+
+	u64 violation_gfn_start;
+	u64 violation_gfn_end;
+	int violation_request_level;
 };
 
 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
index a30e880849e3..af006a73ee05 100644
--- a/arch/x86/kvm/vmx/tdx_arch.h
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -82,7 +82,10 @@ struct tdx_cpuid_value {
 #define TDX_TD_ATTR_PERFMON		BIT_ULL(63)
 
 #define TDX_EXT_EXIT_QUAL_TYPE_MASK	GENMASK(3, 0)
+#define TDX_EXT_EXIT_QUAL_TYPE_ACCEPT  1
 #define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION  6
+#define TDX_EXT_EXIT_QUAL_INFO_MASK	GENMASK(63, 32)
+#define TDX_EXT_EXIT_QUAL_INFO_SHIFT	32
 /*
  * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B.
  */
-- 
2.43.2
Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Edgecombe, Rick P 7 months ago
On Thu, 2025-04-24 at 11:07 +0800, Yan Zhao wrote:
> Determine the max mapping level of a private GFN according to the vCPU's
> ACCEPT level specified in the TDCALL TDG.MEM.PAGE.ACCEPT.
> 
> When an EPT violation occurs due to a vCPU invoking TDG.MEM.PAGE.ACCEPT
> before any actual memory access, the vCPU's ACCEPT level is available in
> the extended exit qualification. Set the vCPU's ACCEPT level as the max
> mapping level for the faulting GFN. This is necessary because if KVM
> specifies a mapping level greater than the vCPU's ACCEPT level, and no
> other vCPUs are accepting at KVM's mapping level, TDG.MEM.PAGE.ACCEPT will
> produce another EPT violation on the vCPU after re-entering the TD, with
> the vCPU's ACCEPT level indicated in the extended exit qualification.

Maybe a little more info would help. It's because the TDX module wants to
"accept" the smaller size in the real S-EPT, but KVM created a huge page. It
can't demote to do this without help from KVM.

> 
> Introduce "violation_gfn_start", "violation_gfn_end", and
> "violation_request_level" in "struct vcpu_tdx" to pass the vCPU's ACCEPT
> level to TDX's private_max_mapping_level hook for determining the max
> mapping level.
> 
> Instead of taking some bits of the error_code passed to
> kvm_mmu_page_fault() and requiring KVM MMU core to check the error_code for
> a fault's max_level, having TDX's private_max_mapping_level hook check for
> request level avoids changes to the KVM MMU core. This approach also
> accommodates future scenarios where the requested mapping level is unknown
> at the start of tdx_handle_ept_violation() (i.e., before invoking
> kvm_mmu_page_fault()).
> 
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
>  arch/x86/kvm/vmx/tdx.c      | 36 +++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.h      |  4 ++++
>  arch/x86/kvm/vmx/tdx_arch.h |  3 +++
>  3 files changed, 42 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 86775af85cd8..dd63a634e633 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1859,10 +1859,34 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
>  	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
>  }
>  
> +static inline void tdx_get_accept_level(struct kvm_vcpu *vcpu, gpa_t gpa)
> +{
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	int level = -1;
> +
> +	u64 eeq_type = tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
> +
> +	u32 eeq_info = (tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_INFO_MASK) >>
> +			TDX_EXT_EXIT_QUAL_INFO_SHIFT;
> +
> +	if (eeq_type == TDX_EXT_EXIT_QUAL_TYPE_ACCEPT) {
> +		level = (eeq_info & GENMASK(2, 0)) + 1;
> +
> +		tdx->violation_gfn_start = gfn_round_for_level(gpa_to_gfn(gpa), level);
> +		tdx->violation_gfn_end = tdx->violation_gfn_start + KVM_PAGES_PER_HPAGE(level);
> +		tdx->violation_request_level = level;
> +	} else {
> +		tdx->violation_gfn_start = -1;
> +		tdx->violation_gfn_end = -1;
> +		tdx->violation_request_level = -1;

We had some internal conversations on how KVM used to stuff a bunch of fault
stuff in the vcpu so it didn't have to pass it around, but now uses the fault
struct for this. The point was (IIRC) to prevent stale data from getting
confused on future faults, and it being hard to track what came from where.

In the TDX case, I think the potential for confusion is still there. The MMU
code could use stale data if an accept EPT violation happens and control returns
to userspace, at which point userspace does a KVM_PRE_FAULT_MEMORY. Then it will
see the stale tdx->violation_*. Not exactly a common case, but better to not
have loose ends if we can avoid it.

Looking more closely, I don't see why it's too hard to pass in a max_fault_level
into the fault struct. Totally untested rough idea, what do you think?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index faae82eefd99..3dc476da6391 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -282,7 +282,11 @@ enum x86_intercept_stage;
  * when the guest was accessing private memory.
  */
 #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
-#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
+
+#define PFERR_FAULT_LEVEL_MASK (BIT_ULL(50) | BIT_ULL(51) | BIT_ULL(52))
+#define PFERR_FAULT_LEVEL_SHIFT 50
+
+#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS | PFERR_FAULT_LEVEL_MASK)
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC   0
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1c1764f46e66..bdb1b0eabd67 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -361,7 +361,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu
*vcpu, gpa_t cr2_or_gpa,
                .nx_huge_page_workaround_enabled =
                        is_nx_huge_page_enabled(vcpu->kvm),
 
-               .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+               .max_level = (err & PFERR_FAULT_LEVEL_MASK) >>
PFERR_FAULT_LEVEL_SHIFT,
                .req_level = PG_LEVEL_4K,
                .goal_level = PG_LEVEL_4K,
                .is_private = err & PFERR_PRIVATE_ACCESS,
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 8f46a06e2c44..2f22b294ef8b 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -83,7 +83,8 @@ static inline bool vt_is_tdx_private_gpa(struct kvm *kvm,
gpa_t gpa)
 }
 
 static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
-                                            unsigned long exit_qualification)
+                                            unsigned long exit_qualification,
+                                            u8 max_fault_level)
 {
        u64 error_code;
 
@@ -107,6 +108,10 @@ static inline int __vmx_handle_ept_violation(struct
kvm_vcpu *vcpu, gpa_t gpa,
        if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
                error_code |= PFERR_PRIVATE_ACCESS;
 
+       BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL >= (1 << hweight64(PFERR_FAULT_LEVEL_MASK)));
+
+       error_code |= (u64)max_fault_level << PFERR_FAULT_LEVEL_SHIFT;
+
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index e994a6c08a75..19047de4d98d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2027,7 +2027,7 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
         * handle retries locally in their EPT violation handlers.
         */
        while (1) {
-               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
+               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual, KVM_MAX_HUGEPAGE_LEVEL);
 
                if (ret != RET_PF_RETRY || !local_retry)
                        break;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ef2d7208dd20..b70a2ff35884 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5782,7 +5782,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu,
gpa)))
                return kvm_emulate_instruction(vcpu, 0);
 
-       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
+       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification, KVM_MAX_HUGEPAGE_LEVEL);
 }
 
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)


Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Yan Zhao 7 months ago
On Wed, May 14, 2025 at 05:20:01AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:07 +0800, Yan Zhao wrote:
> > Determine the max mapping level of a private GFN according to the vCPU's
> > ACCEPT level specified in the TDCALL TDG.MEM.PAGE.ACCEPT.
> > 
> > When an EPT violation occurs due to a vCPU invoking TDG.MEM.PAGE.ACCEPT
> > before any actual memory access, the vCPU's ACCEPT level is available in
> > the extended exit qualification. Set the vCPU's ACCEPT level as the max
> > mapping level for the faulting GFN. This is necessary because if KVM
> > specifies a mapping level greater than the vCPU's ACCEPT level, and no
> > other vCPUs are accepting at KVM's mapping level, TDG.MEM.PAGE.ACCEPT will
> > produce another EPT violation on the vCPU after re-entering the TD, with
> > the vCPU's ACCEPT level indicated in the extended exit qualification.
> 
> Maybe a little more info would help. It's because the TDX module wants to
> "accept" the smaller size in the real S-EPT, but KVM created a huge page. It
> can't demote to do this without help from KVM.
Ok. Right, the TDX module cannot set the entire 2MB mapping to the accepted
state because the guest only specifies 4KB acceptance. The TDX module cannot
perform demotion without a request from KVM. Therefore, the requested level must
be passed to KVM to ensure the mirror page table faults at the expected level.

> > Introduce "violation_gfn_start", "violation_gfn_end", and
> > "violation_request_level" in "struct vcpu_tdx" to pass the vCPU's ACCEPT
> > level to TDX's private_max_mapping_level hook for determining the max
> > mapping level.
> > 
> > Instead of taking some bits of the error_code passed to
> > kvm_mmu_page_fault() and requiring KVM MMU core to check the error_code for
> > a fault's max_level, having TDX's private_max_mapping_level hook check for
> > request level avoids changes to the KVM MMU core. This approach also
> > accommodates future scenarios where the requested mapping level is unknown
> > at the start of tdx_handle_ept_violation() (i.e., before invoking
> > kvm_mmu_page_fault()).
> > 
> > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> >  arch/x86/kvm/vmx/tdx.c      | 36 +++++++++++++++++++++++++++++++++++-
> >  arch/x86/kvm/vmx/tdx.h      |  4 ++++
> >  arch/x86/kvm/vmx/tdx_arch.h |  3 +++
> >  3 files changed, 42 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 86775af85cd8..dd63a634e633 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -1859,10 +1859,34 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
> >  	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
> >  }
> >  
> > +static inline void tdx_get_accept_level(struct kvm_vcpu *vcpu, gpa_t gpa)
> > +{
> > +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> > +	int level = -1;
> > +
> > +	u64 eeq_type = tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
> > +
> > +	u32 eeq_info = (tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_INFO_MASK) >>
> > +			TDX_EXT_EXIT_QUAL_INFO_SHIFT;
> > +
> > +	if (eeq_type == TDX_EXT_EXIT_QUAL_TYPE_ACCEPT) {
> > +		level = (eeq_info & GENMASK(2, 0)) + 1;
> > +
> > +		tdx->violation_gfn_start = gfn_round_for_level(gpa_to_gfn(gpa), level);
> > +		tdx->violation_gfn_end = tdx->violation_gfn_start + KVM_PAGES_PER_HPAGE(level);
> > +		tdx->violation_request_level = level;
> > +	} else {
> > +		tdx->violation_gfn_start = -1;
> > +		tdx->violation_gfn_end = -1;
> > +		tdx->violation_request_level = -1;
> 
> We had some internal conversations on how KVM used to stuff a bunch of fault
> stuff in the vcpu so it didn't have to pass it around, but now uses the fault
> struct for this. The point was (IIRC) to prevent stale data from getting
> confused on future faults, and it being hard to track what came from where.
> 
> In the TDX case, I think the potential for confusion is still there. The MMU
> code could use stale data if an accept EPT violation happens and control returns
> to userspace, at which point userspace does a KVM_PRE_FAULT_MEMORY. Then it will
> see the stale tdx->violation_*. Not exactly a common case, but better to not
> have loose ends if we can avoid it.
> 
> Looking more closely, I don't see why it's too hard to pass in a max_fault_level
> into the fault struct. Totally untested rough idea, what do you think?
Thanks for bringing this up and providing the idea below. In the previous TDX
huge page v8, there's a similar implementation [1] [2].

This series did not adopt that approach because that approach requires
tdx_handle_ept_violation() to pass in max_fault_level, which is not always
available at that stage. e.g.

In patch 19, when vCPU 1 faults on a GFN at 2MB level and then vCPU 2 faults on
the same GFN at 4KB level, TDX wants to ignore the demotion request caused by
vCPU 2's 4KB level fault. So, patch 19 sets tdx->violation_request_level to 2MB
in vCPU 2's split callback and fails the split. vCPU 2's
__vmx_handle_ept_violation() will see RET_PF_RETRY and either do local retry (or
return to the guest).

If it retries locally, tdx_gmem_private_max_mapping_level() will return
tdx->violation_request_level, causing KVM to fault at 2MB level for vCPU 2,
resulting in a spurious fault, eventually returning to the guest.

As tdx->violation_request_level is per-vCPU and it resets in
tdx_get_accept_level() in tdx_handle_ept_violation() (meaning it resets after
each invocation of tdx_handle_ept_violation() and only affects the TDX local
retry loop), it should not hold any stale value.

Alternatively, instead of having tdx_gmem_private_max_mapping_level() to return
tdx->violation_request_level, tdx_handle_ept_violation() could grab
tdx->violation_request_level as the max_fault_level to pass to
__vmx_handle_ept_violation().

This series chose to use tdx_gmem_private_max_mapping_level() to avoid
modification to the KVM MMU core.

[1] https://lore.kernel.org/all/4d61104bff388a081ff8f6ae4ac71e05a13e53c3.1708933624.git.isaku.yamahata@intel.com/
[2] https://lore.kernel.org/all/3d2a6bfb033ee1b51f7b875360bd295376c32b54.1708933624.git.isaku.yamahata@intel.com/

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index faae82eefd99..3dc476da6391 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -282,7 +282,11 @@ enum x86_intercept_stage;
>   * when the guest was accessing private memory.
>   */
>  #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
> -#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
> +
> +#define PFERR_FAULT_LEVEL_MASK (BIT_ULL(50) | BIT_ULL(51) | BIT_ULL(52))
> +#define PFERR_FAULT_LEVEL_SHIFT 50
> +
> +#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS |
> PFERR_FAULT_LEVEL_MASK)
>  
>  /* apic attention bits */
>  #define KVM_APIC_CHECK_VAPIC   0
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1c1764f46e66..bdb1b0eabd67 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -361,7 +361,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu
> *vcpu, gpa_t cr2_or_gpa,
>                 .nx_huge_page_workaround_enabled =
>                         is_nx_huge_page_enabled(vcpu->kvm),
>  
> -               .max_level = KVM_MAX_HUGEPAGE_LEVEL,
> +               .max_level = (err & PFERR_FAULT_LEVEL_MASK) >>
> PFERR_FAULT_LEVEL_SHIFT,
>                 .req_level = PG_LEVEL_4K,
>                 .goal_level = PG_LEVEL_4K,
>                 .is_private = err & PFERR_PRIVATE_ACCESS,
> diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
> index 8f46a06e2c44..2f22b294ef8b 100644
> --- a/arch/x86/kvm/vmx/common.h
> +++ b/arch/x86/kvm/vmx/common.h
> @@ -83,7 +83,8 @@ static inline bool vt_is_tdx_private_gpa(struct kvm *kvm,
> gpa_t gpa)
>  }
>  
>  static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
> -                                            unsigned long exit_qualification)
> +                                            unsigned long exit_qualification,
> +                                            u8 max_fault_level)
>  {
>         u64 error_code;
>  
> @@ -107,6 +108,10 @@ static inline int __vmx_handle_ept_violation(struct
> kvm_vcpu *vcpu, gpa_t gpa,
>         if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
>                 error_code |= PFERR_PRIVATE_ACCESS;
>  
> +       BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL >= (1 <<
> hweight64(PFERR_FAULT_LEVEL_MASK)));
> +
> +       error_code |= (u64)max_fault_level << PFERR_FAULT_LEVEL_SHIFT;
> +
>         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
>  }
>  
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index e994a6c08a75..19047de4d98d 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2027,7 +2027,7 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
>          * handle retries locally in their EPT violation handlers.
>          */
>         while (1) {
> -               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
> +               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual,
> KVM_MAX_HUGEPAGE_LEVEL);
>  
>                 if (ret != RET_PF_RETRY || !local_retry)
>                         break;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index ef2d7208dd20..b70a2ff35884 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5782,7 +5782,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
>         if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu,
> gpa)))
>                 return kvm_emulate_instruction(vcpu, 0);
>  
> -       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
> +       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification,
> KVM_MAX_HUGEPAGE_LEVEL);
>  }
>  
>  static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
> 
>
Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Edgecombe, Rick P 7 months ago
On Fri, 2025-05-16 at 14:30 +0800, Yan Zhao wrote:
> > Looking more closely, I don't see why it's too hard to pass in a
> > max_fault_level
> > into the fault struct. Totally untested rough idea, what do you think?
> Thanks for bringing this up and providing the idea below. In the previous TDX
> huge page v8, there's a similar implementation [1] [2].
> 
> This series did not adopt that approach because that approach requires
> tdx_handle_ept_violation() to pass in max_fault_level, which is not always
> available at that stage. e.g.
> 
> In patch 19, when vCPU 1 faults on a GFN at 2MB level and then vCPU 2 faults
> on
> the same GFN at 4KB level, TDX wants to ignore the demotion request caused by
> vCPU 2's 4KB level fault. So, patch 19 sets tdx->violation_request_level to
> 2MB
> in vCPU 2's split callback and fails the split. vCPU 2's
> __vmx_handle_ept_violation() will see RET_PF_RETRY and either do local retry
> (or
> return to the guest).

I think you mean patch 20 "KVM: x86: Force a prefetch fault's max mapping level
to 4KB for TDX"?

> 
> If it retries locally, tdx_gmem_private_max_mapping_level() will return
> tdx->violation_request_level, causing KVM to fault at 2MB level for vCPU 2,
> resulting in a spurious fault, eventually returning to the guest.
> 
> As tdx->violation_request_level is per-vCPU and it resets in
> tdx_get_accept_level() in tdx_handle_ept_violation() (meaning it resets after
> each invocation of tdx_handle_ept_violation() and only affects the TDX local
> retry loop), it should not hold any stale value.
> 
> Alternatively, instead of having tdx_gmem_private_max_mapping_level() to
> return
> tdx->violation_request_level, tdx_handle_ept_violation() could grab
> tdx->violation_request_level as the max_fault_level to pass to
> __vmx_handle_ept_violation().
> 
> This series chose to use tdx_gmem_private_max_mapping_level() to avoid
> modification to the KVM MMU core.

It sounds like Kirill is suggesting we do have to have demotion in the fault
path. IIRC it adds a lock, but the cost to skip fault path demotion seems to be
adding up.

> 
> [1]
> https://lore.kernel.org/all/4d61104bff388a081ff8f6ae4ac71e05a13e53c3.1708933624.git.isaku.yamahata@intel.com/
> [2] https://lore.kernel.org/all/3d2a6bfb033ee1b51f7b875360bd295376c32b54.1708933624.git.isaku.yamahata@intel.com/

Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Yan Zhao 7 months ago
On Sat, May 17, 2025 at 06:02:14AM +0800, Edgecombe, Rick P wrote:
> On Fri, 2025-05-16 at 14:30 +0800, Yan Zhao wrote:
> > > Looking more closely, I don't see why it's too hard to pass in a
> > > max_fault_level
> > > into the fault struct. Totally untested rough idea, what do you think?
> > Thanks for bringing this up and providing the idea below. In the previous TDX
> > huge page v8, there's a similar implementation [1] [2].
> > 
> > This series did not adopt that approach because that approach requires
> > tdx_handle_ept_violation() to pass in max_fault_level, which is not always
> > available at that stage. e.g.
> > 
> > In patch 19, when vCPU 1 faults on a GFN at 2MB level and then vCPU 2 faults
> > on
> > the same GFN at 4KB level, TDX wants to ignore the demotion request caused by
> > vCPU 2's 4KB level fault. So, patch 19 sets tdx->violation_request_level to
> > 2MB
> > in vCPU 2's split callback and fails the split. vCPU 2's
> > __vmx_handle_ept_violation() will see RET_PF_RETRY and either do local retry
> > (or
> > return to the guest).
> 
> I think you mean patch 20 "KVM: x86: Force a prefetch fault's max mapping level
> to 4KB for TDX"?
Sorry. It's patch 21 "KVM: x86: Ignore splitting huge pages in fault path for
TDX"

> > 
> > If it retries locally, tdx_gmem_private_max_mapping_level() will return
> > tdx->violation_request_level, causing KVM to fault at 2MB level for vCPU 2,
> > resulting in a spurious fault, eventually returning to the guest.
> > 
> > As tdx->violation_request_level is per-vCPU and it resets in
> > tdx_get_accept_level() in tdx_handle_ept_violation() (meaning it resets after
> > each invocation of tdx_handle_ept_violation() and only affects the TDX local
> > retry loop), it should not hold any stale value.
> > 
> > Alternatively, instead of having tdx_gmem_private_max_mapping_level() to
> > return
> > tdx->violation_request_level, tdx_handle_ept_violation() could grab
> > tdx->violation_request_level as the max_fault_level to pass to
> > __vmx_handle_ept_violation().
> > 
> > This series chose to use tdx_gmem_private_max_mapping_level() to avoid
> > modification to the KVM MMU core.
> 
> It sounds like Kirill is suggesting we do have to have demotion in the fault
> path. IIRC it adds a lock, but the cost to skip fault path demotion seems to be
> adding up.
Yes, though Kirill is suggesting to support demotion in the fault path, I still
think that using tdx_gmem_private_max_mapping_level() might be more friendly to
other potential scenarios, such as when the KVM core MMU requests TDX to perform
page promotion, and TDX finds that promotion would consistently fail on a GFN.

Another important reason for not passing a max_fault_level into the fault struct
is that the KVM MMU now has the hook private_max_mapping_level to determine a
private fault's maximum level, which was introduced by commit f32fb32820b1
("KVM: x86: Add hook for determining max NPT mapping level"). We'd better not
introduce another mechanism if the same job can be accomplished via the
private_max_mapping_level hook.

The code in TDX huge page v8 [1][2] simply inherited the old implementation from
its v1 [3], where the private_max_mapping_level hook had not yet been introduced
for private faults.

[1] https://lore.kernel.org/all/4d61104bff388a081ff8f6ae4ac71e05a13e53c3.1708933624.git.isaku.yamahata@intel.com/
[2] https://lore.kernel.org/all/3d2a6bfb033ee1b51f7b875360bd295376c32b54.1708933624.git.isaku.yamahata@intel.com/
[3] https://lore.kernel.org/all/cover.1659854957.git.isaku.yamahata@intel.com/
Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Edgecombe, Rick P 7 months ago
On Mon, 2025-05-19 at 14:39 +0800, Yan Zhao wrote:
> > It sounds like Kirill is suggesting we do have to have demotion in the fault
> > path. IIRC it adds a lock, but the cost to skip fault path demotion seems to
> > be
> > adding up.
> Yes, though Kirill is suggesting to support demotion in the fault path, I
> still
> think that using tdx_gmem_private_max_mapping_level() might be more friendly
> to
> other potential scenarios, such as when the KVM core MMU requests TDX to
> perform
> page promotion, and TDX finds that promotion would consistently fail on a GFN.
> 
> Another important reason for not passing a max_fault_level into the fault
> struct
> is that the KVM MMU now has the hook private_max_mapping_level to determine a
> private fault's maximum level, which was introduced by commit f32fb32820b1
> ("KVM: x86: Add hook for determining max NPT mapping level"). We'd better not
> to
> introduce another mechanism if the same job can be accomplished via the
> private_max_mapping_level hook.

How about the alternative discussed on the thread with Kai? I don't think Kirill
was suggesting #VE based TDs need huge pages, just that they need to work with
4k accepts. Let's continue the discussion on that thread, because I think they
are all related. Once we conclude there, we can iron out any remaining issues on
this specific patch.

> 
> The code in TDX huge page v8 [1][2] simply inherited the old implementation
> from
> its v1 [3], where the private_max_mapping_level hook had not yet been
> introduced
> for private faults.

Re: [RFC PATCH 12/21] KVM: TDX: Determine max mapping level according to vCPU's ACCEPT level
Posted by Xiaoyao Li 7 months ago
On 5/14/2025 5:20 AM, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:07 +0800, Yan Zhao wrote:
>> Determine the max mapping level of a private GFN according to the vCPU's
>> ACCEPT level specified in the TDCALL TDG.MEM.PAGE.ACCEPT.
>>
>> When an EPT violation occurs due to a vCPU invoking TDG.MEM.PAGE.ACCEPT
>> before any actual memory access, the vCPU's ACCEPT level is available in
>> the extended exit qualification. Set the vCPU's ACCEPT level as the max
>> mapping level for the faulting GFN. This is necessary because if KVM
>> specifies a mapping level greater than the vCPU's ACCEPT level, and no
>> other vCPUs are accepting at KVM's mapping level, TDG.MEM.PAGE.ACCEPT will
>> produce another EPT violation on the vCPU after re-entering the TD, with
>> the vCPU's ACCEPT level indicated in the extended exit qualification.
> 
> Maybe a little more info would help. It's because the TDX module wants to
> "accept" the smaller size in the real S-EPT, but KVM created a huge page. It
> can't demote to do this without help from KVM.
> 
>>
>> Introduce "violation_gfn_start", "violation_gfn_end", and
>> "violation_request_level" in "struct vcpu_tdx" to pass the vCPU's ACCEPT
>> level to TDX's private_max_mapping_level hook for determining the max
>> mapping level.
>>
>> Instead of taking some bits of the error_code passed to
>> kvm_mmu_page_fault() and requiring KVM MMU core to check the error_code for
>> a fault's max_level, having TDX's private_max_mapping_level hook check for
>> request level avoids changes to the KVM MMU core. This approach also
>> accommodates future scenarios where the requested mapping level is unknown
>> at the start of tdx_handle_ept_violation() (i.e., before invoking
>> kvm_mmu_page_fault()).
>>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
>> ---
>>   arch/x86/kvm/vmx/tdx.c      | 36 +++++++++++++++++++++++++++++++++++-
>>   arch/x86/kvm/vmx/tdx.h      |  4 ++++
>>   arch/x86/kvm/vmx/tdx_arch.h |  3 +++
>>   3 files changed, 42 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>> index 86775af85cd8..dd63a634e633 100644
>> --- a/arch/x86/kvm/vmx/tdx.c
>> +++ b/arch/x86/kvm/vmx/tdx.c
>> @@ -1859,10 +1859,34 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
>>   	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
>>   }
>>   
>> +static inline void tdx_get_accept_level(struct kvm_vcpu *vcpu, gpa_t gpa)
>> +{
>> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
>> +	int level = -1;
>> +
>> +	u64 eeq_type = tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
>> +
>> +	u32 eeq_info = (tdx->ext_exit_qualification & TDX_EXT_EXIT_QUAL_INFO_MASK) >>
>> +			TDX_EXT_EXIT_QUAL_INFO_SHIFT;
>> +
>> +	if (eeq_type == TDX_EXT_EXIT_QUAL_TYPE_ACCEPT) {
>> +		level = (eeq_info & GENMASK(2, 0)) + 1;
>> +
>> +		tdx->violation_gfn_start = gfn_round_for_level(gpa_to_gfn(gpa), level);
>> +		tdx->violation_gfn_end = tdx->violation_gfn_start + KVM_PAGES_PER_HPAGE(level);
>> +		tdx->violation_request_level = level;
>> +	} else {
>> +		tdx->violation_gfn_start = -1;
>> +		tdx->violation_gfn_end = -1;
>> +		tdx->violation_request_level = -1;
> 
> We had some internal conversations on how KVM used to stuff a bunch of fault
> stuff in the vcpu so it didn't have to pass it around, but now uses the fault
> struct for this. The point was (IIRC) to prevent stale data from getting
> confused on future faults, and it being hard to track what came from where.
> 
> In the TDX case, I think the potential for confusion is still there. The MMU
> code could use stale data if an accept EPT violation happens and control returns
> to userspace, at which point userspace does a KVM_PRE_FAULT_MEMORY. Then it will
> see the stale  tdx->violation_*. Not exactly a common case, but better to not
> have loose ends if we can avoid it.
> 
> Looking more closely, I don't see why it's too hard to pass in a max_fault_level
> into the fault struct. Totally untested rough idea, what do you think?

The original huge page support patch did encode the level info in
error_code, so it has my vote.

https://lore.kernel.org/all/4d61104bff388a081ff8f6ae4ac71e05a13e53c3.1708933624.git.isaku.yamahata@intel.com/

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index faae82eefd99..3dc476da6391 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -282,7 +282,11 @@ enum x86_intercept_stage;
>    * when the guest was accessing private memory.
>    */
>   #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
> -#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
> +
> +#define PFERR_FAULT_LEVEL_MASK (BIT_ULL(50) | BIT_ULL(51) | BIT_ULL(52))
> +#define PFERR_FAULT_LEVEL_SHIFT 50
> +
> +#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS | PFERR_FAULT_LEVEL_MASK)
>   
>   /* apic attention bits */
>   #define KVM_APIC_CHECK_VAPIC   0
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1c1764f46e66..bdb1b0eabd67 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -361,7 +361,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu
> *vcpu, gpa_t cr2_or_gpa,
>                  .nx_huge_page_workaround_enabled =
>                          is_nx_huge_page_enabled(vcpu->kvm),
>   
> -               .max_level = KVM_MAX_HUGEPAGE_LEVEL,
> +               .max_level = (err & PFERR_FAULT_LEVEL_MASK) >>
> PFERR_FAULT_LEVEL_SHIFT,
>                  .req_level = PG_LEVEL_4K,
>                  .goal_level = PG_LEVEL_4K,
>                  .is_private = err & PFERR_PRIVATE_ACCESS,
> diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
> index 8f46a06e2c44..2f22b294ef8b 100644
> --- a/arch/x86/kvm/vmx/common.h
> +++ b/arch/x86/kvm/vmx/common.h
> @@ -83,7 +83,8 @@ static inline bool vt_is_tdx_private_gpa(struct kvm *kvm,
> gpa_t gpa)
>   }
>   
>   static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
> -                                            unsigned long exit_qualification)
> +                                            unsigned long exit_qualification,
> +                                            u8 max_fault_level)
>   {
>          u64 error_code;
>   
> @@ -107,6 +108,10 @@ static inline int __vmx_handle_ept_violation(struct
> kvm_vcpu *vcpu, gpa_t gpa,
>          if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
>                  error_code |= PFERR_PRIVATE_ACCESS;
>   
> +       BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL >= (1 << hweight64(PFERR_FAULT_LEVEL_MASK)));
> +
> +       error_code |= (u64)max_fault_level << PFERR_FAULT_LEVEL_SHIFT;
> +
>          return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
>   }
>   
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index e994a6c08a75..19047de4d98d 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2027,7 +2027,7 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
>           * handle retries locally in their EPT violation handlers.
>           */
>          while (1) {
> -               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
> +               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual,
> KVM_MAX_HUGEPAGE_LEVEL);
>   
>                  if (ret != RET_PF_RETRY || !local_retry)
>                          break;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index ef2d7208dd20..b70a2ff35884 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5782,7 +5782,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
>          if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu,
> gpa)))
>                  return kvm_emulate_instruction(vcpu, 0);
>   
> -       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
> +       return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification,
> KVM_MAX_HUGEPAGE_LEVEL);
>   }
>   
>   static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
> 
>