[RFC PATCH v4 05/16] KVM: TDX: Pass size to reclaim_page()

isaku.yamahata@intel.com posted 16 patches 2 years, 6 months ago
There is a newer version of this series
[RFC PATCH v4 05/16] KVM: TDX: Pass size to reclaim_page()
Posted by isaku.yamahata@intel.com 2 years, 6 months ago
From: Xiaoyao Li <xiaoyao.li@intel.com>

A 2MB large page can be added to a TD directly via tdh_mem_page_aug().  In
that case, the page must be reclaimed and cleared as a 2MB page.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/vmx/tdx.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 3522ee232eda..86cfbf435671 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -198,12 +198,13 @@ static void tdx_disassociate_vp_on_cpu(struct kvm_vcpu *vcpu)
 	smp_call_function_single(cpu, tdx_disassociate_vp_arg, vcpu, 1);
 }
 
-static void tdx_clear_page(unsigned long page_pa)
+static void tdx_clear_page(unsigned long page_pa, int size)
 {
 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
 	void *page = __va(page_pa);
 	unsigned long i;
 
+	WARN_ON_ONCE(size % PAGE_SIZE);
 	/*
 	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
 	 * required to clear/write the page with new keyid to prevent integrity
@@ -212,7 +213,7 @@ static void tdx_clear_page(unsigned long page_pa)
 	 * clflush doesn't flush cache with HKID set.  The cache line could be
 	 * poisoned (even without MKTME-i), clear the poison bit.
 	 */
-	for (i = 0; i < PAGE_SIZE; i += 64)
+	for (i = 0; i < size; i += 64)
 		movdir64b(page + i, zero_page);
 	/*
 	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
@@ -221,7 +222,8 @@ static void tdx_clear_page(unsigned long page_pa)
 	__mb();
 }
 
-static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
+static int tdx_reclaim_page(hpa_t pa, enum pg_level level,
+			    bool do_wb, u16 hkid)
 {
 	struct tdx_module_output out;
 	u64 err;
@@ -239,8 +241,10 @@ static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
 		pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
 		return -EIO;
 	}
+	/* out.r8 == tdx sept page level */
+	WARN_ON_ONCE(out.r8 != pg_level_to_tdx_sept_level(level));
 
-	if (do_wb) {
+	if (do_wb && level == PG_LEVEL_4K) {
 		/*
 		 * Only TDR page gets into this path.  No contention is expected
 		 * because of the last page of TD.
@@ -252,7 +256,7 @@ static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
 		}
 	}
 
-	tdx_clear_page(pa);
+	tdx_clear_page(pa, KVM_HPAGE_SIZE(level));
 	return 0;
 }
 
@@ -266,7 +270,7 @@ static void tdx_reclaim_td_page(unsigned long td_page_pa)
 	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
 	 * cache doesn't need to be flushed again.
 	 */
-	if (tdx_reclaim_page(td_page_pa, false, 0))
+	if (tdx_reclaim_page(td_page_pa, PG_LEVEL_4K, false, 0))
 		/*
 		 * Leak the page on failure:
 		 * tdx_reclaim_page() returns an error if and only if there's an
@@ -474,7 +478,7 @@ void tdx_vm_free(struct kvm *kvm)
 	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with
 	 * TDX global HKID is needed.
 	 */
-	if (tdx_reclaim_page(kvm_tdx->tdr_pa, true, tdx_global_keyid))
+	if (tdx_reclaim_page(kvm_tdx->tdr_pa, PG_LEVEL_4K, true, tdx_global_keyid))
 		return;
 
 	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
@@ -1468,7 +1472,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 		 * The HKID assigned to this TD was already freed and cache
 		 * was already flushed. We don't have to flush again.
 		 */
-		err = tdx_reclaim_page(hpa, false, 0);
+		err = tdx_reclaim_page(hpa, level, false, 0);
 		if (KVM_BUG_ON(err, kvm))
 			return -EIO;
 		tdx_unpin(kvm, pfn);
@@ -1501,7 +1505,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
 		return -EIO;
 	}
-	tdx_clear_page(hpa);
+	tdx_clear_page(hpa, PAGE_SIZE);
 	tdx_unpin(kvm, pfn);
 	return 0;
 }
@@ -1612,7 +1616,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 	 * already flushed. We don't have to flush again.
 	 */
 	if (!is_hkid_assigned(kvm_tdx))
-		return tdx_reclaim_page(__pa(private_spt), false, 0);
+		return tdx_reclaim_page(__pa(private_spt), PG_LEVEL_4K, false, 0);
 
 	/*
 	 * free_private_spt() is (obviously) called when a shadow page is being
-- 
2.25.1
Re: [RFC PATCH v4 05/16] KVM: TDX: Pass size to reclaim_page()
Posted by Binbin Wu 2 years, 5 months ago

On 7/26/2023 6:23 AM, isaku.yamahata@intel.com wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
>
> A 2MB large page can be tdh_mem_page_aug()'ed to TD directly. In this case,
> it needs to reclaim and clear the page as 2MB size.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/vmx/tdx.c | 24 ++++++++++++++----------
>   1 file changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 3522ee232eda..86cfbf435671 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -198,12 +198,13 @@ static void tdx_disassociate_vp_on_cpu(struct kvm_vcpu *vcpu)
>   	smp_call_function_single(cpu, tdx_disassociate_vp_arg, vcpu, 1);
>   }
>   
> -static void tdx_clear_page(unsigned long page_pa)
> +static void tdx_clear_page(unsigned long page_pa, int size)
>   {
>   	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
>   	void *page = __va(page_pa);
>   	unsigned long i;
>   
> +	WARN_ON_ONCE(size % PAGE_SIZE);
>   	/*
>   	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
>   	 * required to clear/write the page with new keyid to prevent integrity
> @@ -212,7 +213,7 @@ static void tdx_clear_page(unsigned long page_pa)
>   	 * clflush doesn't flush cache with HKID set.  The cache line could be
>   	 * poisoned (even without MKTME-i), clear the poison bit.
>   	 */
> -	for (i = 0; i < PAGE_SIZE; i += 64)
> +	for (i = 0; i < size; i += 64)
>   		movdir64b(page + i, zero_page);
>   	/*
>   	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
> @@ -221,7 +222,8 @@ static void tdx_clear_page(unsigned long page_pa)
>   	__mb();
>   }
>   
> -static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
> +static int tdx_reclaim_page(hpa_t pa, enum pg_level level,
> +			    bool do_wb, u16 hkid)
>   {
>   	struct tdx_module_output out;
>   	u64 err;
> @@ -239,8 +241,10 @@ static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
>   		pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
>   		return -EIO;
>   	}
> +	/* out.r8 == tdx sept page level */
> +	WARN_ON_ONCE(out.r8 != pg_level_to_tdx_sept_level(level));
>   
> -	if (do_wb) {
> +	if (do_wb && level == PG_LEVEL_4K) {
I was wondering whether it would be better to add a WARN_ON_ONCE() to ensure
that level is PG_LEVEL_4K, instead of silently skipping the do_wb path. But I
later realized that the WARN_ON_ONCE() comparing out.r8 with level already
guarantees a warning if there is a mismatch between do_wb and level.

>   		/*
>   		 * Only TDR page gets into this path.  No contention is expected
>   		 * because of the last page of TD.
> @@ -252,7 +256,7 @@ static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
>   		}
>   	}
>   
> -	tdx_clear_page(pa);
> +	tdx_clear_page(pa, KVM_HPAGE_SIZE(level));
>   	return 0;
>   }
>   
> @@ -266,7 +270,7 @@ static void tdx_reclaim_td_page(unsigned long td_page_pa)
>   	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
>   	 * cache doesn't need to be flushed again.
>   	 */
> -	if (tdx_reclaim_page(td_page_pa, false, 0))
> +	if (tdx_reclaim_page(td_page_pa, PG_LEVEL_4K, false, 0))
>   		/*
>   		 * Leak the page on failure:
>   		 * tdx_reclaim_page() returns an error if and only if there's an
> @@ -474,7 +478,7 @@ void tdx_vm_free(struct kvm *kvm)
>   	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with
>   	 * TDX global HKID is needed.
>   	 */
> -	if (tdx_reclaim_page(kvm_tdx->tdr_pa, true, tdx_global_keyid))
> +	if (tdx_reclaim_page(kvm_tdx->tdr_pa, PG_LEVEL_4K, true, tdx_global_keyid))
>   		return;
>   
>   	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
> @@ -1468,7 +1472,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
>   		 * The HKID assigned to this TD was already freed and cache
>   		 * was already flushed. We don't have to flush again.
>   		 */
> -		err = tdx_reclaim_page(hpa, false, 0);
> +		err = tdx_reclaim_page(hpa, level, false, 0);
>   		if (KVM_BUG_ON(err, kvm))
>   			return -EIO;
>   		tdx_unpin(kvm, pfn);
> @@ -1501,7 +1505,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
>   		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
>   		return -EIO;
>   	}
> -	tdx_clear_page(hpa);
> +	tdx_clear_page(hpa, PAGE_SIZE);
>   	tdx_unpin(kvm, pfn);
>   	return 0;
>   }
> @@ -1612,7 +1616,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
>   	 * already flushed. We don't have to flush again.
>   	 */
>   	if (!is_hkid_assigned(kvm_tdx))
> -		return tdx_reclaim_page(__pa(private_spt), false, 0);
> +		return tdx_reclaim_page(__pa(private_spt), PG_LEVEL_4K, false, 0);
>   
>   	/*
>   	 * free_private_spt() is (obviously) called when a shadow page is being