From nobody Mon Feb  9 09:28:48 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id B7A13FA373D
	for <linux-kernel@archiver.kernel.org>; Sun, 30 Oct 2022 06:26:28 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230355AbiJ3G00 (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Sun, 30 Oct 2022 02:26:26 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:47114 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229902AbiJ3GYJ (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Sun, 30 Oct 2022 02:24:09 -0400
Received: from mga05.intel.com (mga05.intel.com [192.55.52.43])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 16FEA109;
        Sat, 29 Oct 2022 23:24:08 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1667111048; x=1698647048;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=U9taNEK+Sd3FCL0cWVbcEHkBfRflbcnLImJXgQmB6kY=;
  b=kzICDaKjl6x20djopy8LQzZUe23Ko2ecYjergso9toBT04KajTYY1U6M
   30MO5hDmiNLjONRhhnZrw6iWywEs7xz3hDZRv01M41Cu9+cdP4F81CeGH
   n04KtK1GLl+VBgVJj/qCsF0m1Ptk/2QqgidejL2WcAXa14LGs32m6QqfO
   Kynx3fQbfLad2RrOD+ick7qaOMxlPs+rYTAf+ilm2gRFFTm/Gn5OlZEkD
   qVWwNxN5/hbdY7d8S8PMDlxsLbH919ahbXtF7hvsrAzhFDI7UekRUhIcT
   MZ811XPJzi8BGHlT8w8U0rAg9gM/sUAaG6vk7gLUjX7RnXsKVl3X3t7b9
   g==;
X-IronPort-AV: E=McAfee;i="6500,9779,10515"; a="395037143"
X-IronPort-AV: E=Sophos;i="5.95,225,1661842800";
   d="scan'208";a="395037143"
Received: from fmsmga006.fm.intel.com ([10.253.24.20])
  by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 29 Oct 2022 23:24:02 -0700
X-IronPort-AV: E=McAfee;i="6500,9779,10515"; a="878392942"
X-IronPort-AV: E=Sophos;i="5.95,225,1661842800";
   d="scan'208";a="878392942"
Received: from ls.sc.intel.com (HELO localhost) ([143.183.96.54])
  by fmsmga006-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 29 Oct 2022 23:24:01 -0700
From: isaku.yamahata@intel.com
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: isaku.yamahata@intel.com, isaku.yamahata@gmail.com,
        Paolo Bonzini <pbonzini@redhat.com>, erdemaktas@google.com,
        Sean Christopherson <seanjc@google.com>,
        Sagi Shahar <sagis@google.com>,
        David Matlack <dmatlack@google.com>,
        Sean Christopherson <sean.j.christopherson@intel.com>
Subject: [PATCH v10 031/108] KVM: x86/mmu: Replace hardcoded value 0 for the
 initial value for SPTE
Date: Sat, 29 Oct 2022 23:22:32 -0700
Message-Id: 
 <0de1d5dfbce49b5e9d4f93289296b726180b8dd0.1667110240.git.isaku.yamahata@intel.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <cover.1667110240.git.isaku.yamahata@intel.com>
References: <cover.1667110240.git.isaku.yamahata@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

From: Isaku Yamahata <isaku.yamahata@intel.com>

The TDX support will need the "suppress #VE" bit (bit 63) set as the
initial value for SPTE.  To reduce code change size, introduce a new macro
SHADOW_NONPRESENT_VALUE for the initial value for the shadow page table
entry (SPTE) and replace hard-coded value 0 for it.  Initialize shadow page
tables with their value.

The plan is to unconditionally set the "suppress #VE" bit for both AMD and
Intel as: 1) AMD hardware doesn't use this bit; 2) for conventional VMX
guests, KVM never enables the "EPT-violation #VE" in VMCS control and
"suppress #VE" bit is ignored by hardware.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu/mmu.c     | 50 +++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/mmu/spte.h    |  2 ++
 arch/x86/kvm/mmu/tdp_mmu.c | 15 ++++++------
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 10017a9f26ee..e7e11f51f8b4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -538,9 +538,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u=
64 *sptep)
=20
 	if (!is_shadow_present_pte(old_spte) ||
 	    !spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
 	else
-		old_spte =3D __update_clear_spte_slow(sptep, 0ull);
+		old_spte =3D __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
=20
 	if (!is_shadow_present_pte(old_spte))
 		return old_spte;
@@ -574,7 +574,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u=
64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__update_clear_spte_fast(sptep, 0ull);
+	__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
 }
=20
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -642,6 +642,39 @@ static void walk_shadow_page_lockless_end(struct kvm_v=
cpu *vcpu)
 	}
 }
=20
+#ifdef CONFIG_X86_64
+static inline void kvm_init_shadow_page(void *page)
+{
+	memset64(page, SHADOW_NONPRESENT_VALUE, 4096 / 8);
+}
+
+static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *mc =3D &vcpu->arch.mmu_shadow_page_cache;
+	int start, end, i, r;
+
+	start =3D kvm_mmu_memory_cache_nr_free_objects(mc);
+	r =3D kvm_mmu_topup_memory_cache(mc, PT64_ROOT_MAX_LEVEL);
+
+	/*
+	 * Note, topup may have allocated objects even if it failed to allocate
+	 * the minimum number of objects required to make forward progress _at
+	 * this time_.  Initialize newly allocated objects even on failure, as
+	 * userspace can free memory and rerun the vCPU in response to -ENOMEM.
+	 */
+	end =3D kvm_mmu_memory_cache_nr_free_objects(mc);
+	for (i =3D start; i < end; i++)
+		kvm_init_shadow_page(mc->objects[i]);
+	return r;
+}
+#else
+static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
+{
+	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+					  PT64_ROOT_MAX_LEVEL);
+}
+#endif /* CONFIG_X86_64 */
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indir=
ect)
 {
 	int r;
@@ -651,8 +684,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcp=
u, bool maybe_indirect)
 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 	if (r)
 		return r;
-	r =3D kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				       PT64_ROOT_MAX_LEVEL);
+	r =3D mmu_topup_shadow_page_cache(vcpu);
 	if (r)
 		return r;
 	if (maybe_indirect) {
@@ -5870,7 +5902,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu_page_header_cache.kmem_cache =3D mmu_page_header_cache;
 	vcpu->arch.mmu_page_header_cache.gfp_zero =3D __GFP_ZERO;
=20
-	vcpu->arch.mmu_shadow_page_cache.gfp_zero =3D __GFP_ZERO;
+	/*
+	 * When X86_64, initial SEPT entries are initialized with
+	 * SHADOW_NONPRESENT_VALUE.  Otherwise zeroed.  See
+	 * mmu_topup_shadow_page_cache().
+	 */
+	if (!IS_ENABLED(CONFIG_X86_64))
+		vcpu->arch.mmu_shadow_page_cache.gfp_zero =3D __GFP_ZERO;
=20
 	vcpu->arch.mmu =3D &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu =3D &vcpu->arch.root_mmu;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..42ecaa75da15 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -148,6 +148,8 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS =3D=3D 8 && MMIO_S=
PTE_GEN_HIGH_BITS =3D=3D 11);
=20
 #define MMIO_SPTE_GEN_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE=
_GEN_HIGH_BITS - 1, 0)
=20
+#define SHADOW_NONPRESENT_VALUE	0ULL
+
 extern u64 __read_mostly shadow_host_writable_mask;
 extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index eab765442d0b..38bc4c2f0f1f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -694,7 +694,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *k=
vm,
 	 * here since the SPTE is going from non-present to non-present.  Use
 	 * the raw write helper to avoid an unnecessary check on volatile bits.
 	 */
-	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
+	__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
=20
 	return 0;
 }
@@ -871,8 +871,8 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct =
kvm_mmu_page *root,
 			continue;
=20
 		if (!shared)
-			tdp_mmu_set_spte(kvm, &iter, 0);
-		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+			tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
 			goto retry;
 	}
 }
@@ -928,8 +928,9 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu=
_page *sp)
 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
 		return false;
=20
-	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
-			   sp->gfn, sp->role.level + 1, true, true);
+	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+			   SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1,
+			   true, true);
=20
 	return true;
 }
@@ -963,7 +964,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct k=
vm_mmu_page *root,
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
=20
-		tdp_mmu_set_spte(kvm, &iter, 0);
+		tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
 		flush =3D true;
 	}
=20
@@ -1328,7 +1329,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_=
iter *iter,
 	 * invariant that the PFN of a present * leaf SPTE can never change.
 	 * See __handle_changed_spte().
 	 */
-	tdp_mmu_set_spte(kvm, iter, 0);
+	tdp_mmu_set_spte(kvm, iter, SHADOW_NONPRESENT_VALUE);
=20
 	if (!pte_write(range->pte)) {
 		new_spte =3D kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
--=20
2.25.1