From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: griffoul@gmail.com, Fred Griffoul, Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org, "H. Peter Anvin", Shuah Khan, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH 1/5] KVM: nVMX: Implement cache for L1 MSR bitmap
Date: Mon, 8 Sep 2025 22:32:26 +0100
Message-ID: <20250908213241.3189113-2-griffoul@infradead.org>

From: Fred Griffoul

Optimize L1 MSR bitmap access by replacing map/unmap operations with a
persistent gfn_to_pfn_cache.

This optimization reduces overhead during L2 VM-entry, where
nested_vmx_prepare_msr_bitmap() merges L1's MSR intercepts with L0's
requirements. The current implementation, based on kvm_vcpu_map_readonly()
and kvm_vcpu_unmap(), incurs a significant performance cost, particularly
with unmanaged guest memory.
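To make the change concrete, here is a condensed before/after sketch of
nested_vmx_prepare_msr_bitmap() (abridged from the diff below; the
intercept-merging logic is elided):

	/* Before: map and unmap the L1 bitmap around every merge. */
	struct kvm_host_map map;

	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;
	msr_bitmap_l1 = (unsigned long *)map.hva;
	/* ... merge L1's intercepts with L0's ... */
	kvm_vcpu_unmap(vcpu, &map);

	/* After: lock a persistent cache, (re)activated only on a miss. */
	gpc = &vmx->nested.msr_bitmap_cache;
	if (nested_gpc_lock(gpc, vmcs12->msr_bitmap))
		return false;
	msr_bitmap_l1 = (unsigned long *)gpc->khva;
	/* ... merge L1's intercepts with L0's ... */
	nested_gpc_unlock(gpc);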
New implementation:

- Initializes a pfn cache when entering VMX operation.
- Maintains persistent access throughout the operation.
- Deactivates the cache when VMX operation ends.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/nested.c | 42 +++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/vmx/vmx.h    |  2 ++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b8ea1969113d..aa4fe1fe571d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -315,6 +315,34 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	vcpu->arch.regs_dirty = 0;
 }
 
+/*
+ * Map a single guest page starting at @gpa and lock the cache for access.
+ */
+static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+{
+	int err;
+
+	if (WARN_ON_ONCE(!PAGE_ALIGNED(gpa)))
+		return -EINVAL;
+retry:
+	read_lock(&gpc->lock);
+	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
+		read_unlock(&gpc->lock);
+		err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+		if (err)
+			return err;
+
+		goto retry;
+	}
+
+	return 0;
+}
+
+static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
+{
+	read_unlock(&gpc->lock);
+}
+
 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -344,6 +372,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
+
+	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
+
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = INVALID_GPA;
@@ -625,7 +656,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	int msr;
 	unsigned long *msr_bitmap_l1;
 	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
-	struct kvm_host_map map;
+	struct gfn_to_pfn_cache *gpc;
 
 	/* Nothing to do if the MSR bitmap is not in use. */
 	if (!cpu_has_vmx_msr_bitmap() ||
@@ -648,10 +679,11 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 		return true;
 	}
 
-	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
+	gpc = &vmx->nested.msr_bitmap_cache;
+	if (nested_gpc_lock(gpc, vmcs12->msr_bitmap))
 		return false;
 
-	msr_bitmap_l1 = (unsigned long *)map.hva;
+	msr_bitmap_l1 = (unsigned long *)gpc->khva;
 
 	/*
 	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -721,7 +753,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_MPERF, MSR_TYPE_R);
 
-	kvm_vcpu_unmap(vcpu, &map);
+	nested_gpc_unlock(gpc);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
 
@@ -5352,6 +5384,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 	vmx->nested.vpid02 = allocate_vpid();
 
+	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
+
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d3389baf3ab3..3a6983222841 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -152,6 +152,8 @@ struct nested_vmx {
 
 	struct loaded_vmcs vmcs02;
 
+	struct gfn_to_pfn_cache msr_bitmap_cache;
+
 	/*
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
-- 
2.51.0
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: griffoul@gmail.com, Fred Griffoul, Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org, "H. Peter Anvin", Shuah Khan, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH 2/5] KVM: pfncache: Restore guest-uses-pfn support
Date: Mon, 8 Sep 2025 22:32:27 +0100
Message-ID: <20250908213241.3189113-3-griffoul@infradead.org>

From: Fred Griffoul

Restore guest page access tracking in pfncache, enabling automatic vCPU
request generation when cache invalidation occurs through MMU notifier
events. This feature is critical for nested VMX operations, where both
KVM and the L2 guest access guest-provided pages such as APIC pages and
posted interrupt descriptors.

This change:

- Reverts commit eefb85b3f031 ("KVM: Drop unused @may_block param from
  gfn_to_pfn_cache_invalidate_start()").

- Partially reverts commit a4bff3df5147 ("KVM: pfncache: remove
  KVM_GUEST_USES_PFN usage"), adding kvm_gpc_init_for_vcpu() to
  initialize a pfncache for guest-mode access in place of the old
  usage-specific flag.
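For illustration, a minimal usage sketch of the vCPU-bound variant (a
hypothetical caller; the helpers are the ones introduced below):

	struct gfn_to_pfn_cache gpc = {};	/* must be zeroed before init */

	kvm_gpc_init_for_vcpu(&gpc, vcpu);	/* binds gpc->vcpu = vcpu */
	if (!kvm_gpc_activate(&gpc, gpa, PAGE_SIZE)) {
		/*
		 * From now on, if an MMU notifier invalidates this range,
		 * gfn_to_pfn_cache_invalidate_start() kicks the bound vCPU
		 * with KVM_REQ_OUTSIDE_GUEST_MODE so it cannot keep using
		 * the stale physical address.
		 */
	}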
Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 include/linux/kvm_host.h  | 29 +++++++++++++++++++++++++-
 include/linux/kvm_types.h |  1 +
 virt/kvm/kvm_main.c       |  3 ++-
 virt/kvm/kvm_mm.h         |  6 ++++--
 virt/kvm/pfncache.c       | 43 ++++++++++++++++++++++++++++++++++++---
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8b47891adca1..2eb551a11818 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1397,6 +1397,9 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+		    struct kvm_vcpu *vcpu);
+
 /**
  * kvm_gpc_init - initialize gfn_to_pfn_cache.
  *
@@ -1407,7 +1410,11 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
  * immutable attributes. Note, the cache must be zero-allocated (or zeroed by
  * the caller before init).
  */
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);
+static inline void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
+{
+	__kvm_gpc_init(gpc, kvm, NULL);
+}
 
 /**
  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
@@ -1489,6 +1496,26 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len);
  */
 void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);
 
+/**
+ * kvm_gpc_init_for_vcpu - initialize gfn_to_pfn_cache for guest-mode usage
+ *
+ * @gpc:  struct gfn_to_pfn_cache object.
+ * @vcpu: vCPU that will pin and directly access this cache.
+ *
+ * This sets up a gfn_to_pfn_cache for use by a vCPU that will directly access
+ * the cached physical address. If the cache is invalidated while in use, a
+ * request is sent to the associated vCPU to force it out of guest mode and
+ * refresh the cache.
+ *
+ * Note, the cache must be zero-allocated (or zeroed by the caller before init).
+ */
+static inline void kvm_gpc_init_for_vcpu(struct gfn_to_pfn_cache *gpc,
+					 struct kvm_vcpu *vcpu)
+{
+	__kvm_gpc_init(gpc, vcpu->kvm, vcpu);
+}
+
 static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc)
 {
 	return gpc->active && !kvm_is_error_gpa(gpc->gpa);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..1c4bbf9947a4 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -64,6 +64,7 @@ struct gfn_to_pfn_cache {
 	struct kvm_memory_slot *memslot;
 	struct kvm *kvm;
 	struct list_head list;
+	struct kvm_vcpu *vcpu;
 	rwlock_t lock;
 	struct mutex refresh_lock;
 	void *khva;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 18f29ef93543..f42cfd524ecf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -760,7 +760,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * mn_active_invalidate_count (see above) instead of
 	 * mmu_invalidate_in_progress.
 	 */
-	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
+	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
+					  hva_range.may_block);
 
 	/*
 	 * If one or more memslots were found and thus zapped, notify arch code
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 31defb08ccba..f1ba02084bd9 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -58,11 +58,13 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp);
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 				       unsigned long start,
-				       unsigned long end);
+				       unsigned long end,
+				       bool may_block);
 #else
 static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool may_block)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 728d2c1b488a..543466ff40a0 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -23,9 +23,11 @@
  * MMU notifier 'invalidate_range_start' hook.
  */
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-				       unsigned long end)
+				       unsigned long end, bool may_block)
 {
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
 	struct gfn_to_pfn_cache *gpc;
+	bool evict_vcpus = false;
 
 	spin_lock(&kvm->gpc_lock);
 	list_for_each_entry(gpc, &kvm->gpc_list, list) {
@@ -46,8 +48,21 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 
 		write_lock_irq(&gpc->lock);
 		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-		    gpc->uhva >= start && gpc->uhva < end)
+		    gpc->uhva >= start && gpc->uhva < end) {
 			gpc->valid = false;
+
+			/*
+			 * If a guest vCPU could be using the physical address,
+			 * it needs to be forced out of guest mode.
+			 */
+			if (gpc->vcpu) {
+				if (!evict_vcpus) {
+					evict_vcpus = true;
+					bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+				}
+				__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
+			}
+		}
 		write_unlock_irq(&gpc->lock);
 		continue;
 	}
@@ -55,6 +70,27 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 		read_unlock_irq(&gpc->lock);
 	}
 	spin_unlock(&kvm->gpc_lock);
+
+	if (evict_vcpus) {
+		/*
+		 * KVM needs to ensure the vCPU is fully out of guest context
+		 * before allowing the invalidation to continue.
+		 */
+		unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE;
+		bool called;
+
+		/*
+		 * If the OOM reaper is active, then all vCPUs should have
+		 * been stopped already, so perform the request without
+		 * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
+		 */
+		if (!may_block)
+			req &= ~KVM_REQUEST_WAIT;
+
+		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+
+		WARN_ON_ONCE(called && !may_block);
+	}
 }
 
 static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva,
@@ -382,7 +418,7 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	return __kvm_gpc_refresh(gpc, gpc->gpa, uhva);
 }
 
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
+void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
 	rwlock_init(&gpc->lock);
 	mutex_init(&gpc->refresh_lock);
@@ -391,6 +427,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 	gpc->gpa = INVALID_GPA;
 	gpc->uhva = KVM_HVA_ERR_BAD;
+	gpc->vcpu = vcpu;
 	gpc->active = gpc->valid = false;
 }
 
-- 
2.51.0
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: griffoul@gmail.com, Fred Griffoul, Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org, "H. Peter Anvin", Shuah Khan, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH 3/5] KVM: x86: Add nested state validation for pfncache support
Date: Mon, 8 Sep 2025 22:32:28 +0100
Message-ID: <20250908213241.3189113-4-griffoul@infradead.org>

From: Fred Griffoul

Implement state validation for nested virtualization to enable pfncache
support for L1 guest pages.

This adds a new nested_ops callback, is_nested_state_invalid(), that
detects when KVM needs to reload nested virtualization state. When it
reports invalid state, a KVM_REQ_GET_NESTED_STATE_PAGES request is made
to reload the affected pages before L2 execution.

The callback monitors L1 guest pages during guest entry/exit while the
vCPU runs in IN_GUEST_MODE. For now, the VMX implementation returns
false; full support comes in the next patch.
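The resulting interaction with the entry path is roughly the following
(a simplified sketch of the vcpu_enter_guest() flow, not the literal
upstream code):

	vcpu->mode = IN_GUEST_MODE;
	local_irq_disable();
	if (kvm_vcpu_exit_request(vcpu)) {
		/*
		 * is_nested_state_invalid() observed a stale page, so
		 * KVM_REQ_GET_NESTED_STATE_PAGES is now pending: abort the
		 * entry, let the request path reload the pages, and retry.
		 */
		vcpu->mode = OUTSIDE_GUEST_MODE;
		local_irq_enable();
		return 1;
	}
	/* ... enter the guest ... */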
Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/nested.c       |  6 ++++++
 arch/x86/kvm/x86.c              | 14 +++++++++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c56cc54d682a..97df7cac14e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1939,6 +1939,7 @@ struct kvm_x86_nested_ops {
 			 struct kvm_nested_state __user *user_kvm_nested_state,
 			 struct kvm_nested_state *kvm_state);
 	bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
+	bool (*is_nested_state_invalid)(struct kvm_vcpu *vcpu);
 	int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
 
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index aa4fe1fe571d..06187b8baa19 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3472,6 +3472,11 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
 	struct vmcs12 *vmcs12;
@@ -7366,6 +7371,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.get_state = vmx_get_nested_state,
 	.set_state = vmx_set_nested_state,
 	.get_nested_state_pages = vmx_get_nested_state_pages,
+	.is_nested_state_invalid = vmx_is_nested_state_invalid,
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 #ifdef CONFIG_KVM_HYPERV
 	.enable_evmcs = nested_enable_evmcs,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33fba801b205..db4a6b62f81f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2125,12 +2125,24 @@ int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
 
+static inline bool kvm_invalid_nested_state(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops.nested_ops->is_nested_state_invalid &&
+	    kvm_x86_ops.nested_ops->is_nested_state_invalid(vcpu)) {
+		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+		return true;
+	}
+	return false;
+}
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
 	xfer_to_guest_mode_prepare();
 
 	return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
-	       kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending();
+	       kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending() ||
+	       kvm_invalid_nested_state(vcpu);
 }
 
 /*
-- 
2.51.0
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: griffoul@gmail.com, Fred Griffoul, Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org, "H. Peter Anvin", Shuah Khan, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH 4/5] KVM: nVMX: Implement cache for L1 APIC pages
Date: Mon, 8 Sep 2025 22:32:29 +0100
Message-ID: <20250908213241.3189113-5-griffoul@infradead.org>

From: Fred Griffoul

Replace kvm_host_map usage with persistent gfn_to_pfn_caches for the L1
APIC virtualization pages (APIC access, virtual APIC, and posted
interrupt descriptor pages) to improve performance with unmanaged guest
memory.

The conversion involves several key changes:

- Page loading in nested_get_vmcs12_pages(): load vmcs02 fields with
  pfncache PFNs after each cache has been checked and, if necessary,
  activated or refreshed, while the vCPU is in OUTSIDE_GUEST_MODE.

- Invalidation window handling: since nested_get_vmcs12_pages() runs in
  OUTSIDE_GUEST_MODE, there is a window in which the caches can be
  invalidated by MMU notifications before the vCPU enters IN_GUEST_MODE.
  Implement the is_nested_state_invalid() callback to monitor cache
  validity across the OUTSIDE_GUEST_MODE to IN_GUEST_MODE transition;
  it triggers KVM_REQ_GET_NESTED_STATE_PAGES when needed.

- Cache access in event callbacks: the virtual APIC and posted
  interrupt descriptor pages are accessed by KVM in the has_events()
  and check_events() nested_ops callbacks. These use the kernel HVA
  following the pfncache check/refresh pattern (see the sketch after
  this list), with both callbacks able to sleep if a cache refresh is
  required.
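The check/refresh access pattern referenced above is roughly the
following (a condensed sketch of the nested_gpc_lock_if_active() helper
added in this patch; see the diff for the exact code):

	read_lock(&gpc->lock);
	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock(&gpc->lock);
		/* May sleep, hence only usable in sleepable callbacks. */
		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
			return NULL;
		read_lock(&gpc->lock);
	}
	/* ... use gpc->khva while holding gpc->lock for read ... */
	read_unlock(&gpc->lock);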
This eliminates expensive memremap/memunmap cycles on each L2 VM
entry/exit, providing substantial performance improvements when using
unmanaged memory such as guest_memfd or memory passed with the mem=
kernel parameter. The persistent caching approach maintains correctness
through proper invalidation detection while avoiding the overhead of
repeated mapping operations.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/nested.c | 169 +++++++++++++++++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |   8 +-
 include/linux/kvm_host.h  |   5 ++
 3 files changed, 139 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 06187b8baa19..0cb66314d58b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -329,8 +329,18 @@ static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
 	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
 		read_unlock(&gpc->lock);
 		err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
-		if (err)
+		if (err) {
+			/*
+			 * Deactivate nested state caches to prevent
+			 * kvm_gpc_invalid() from returning true in subsequent
+			 * is_nested_state_invalid() calls. This prevents an
+			 * infinite loop while entering guest mode.
+			 */
+			if (gpc->vcpu)
+				kvm_gpc_deactivate(gpc);
+
 			return err;
+		}
 
 		goto retry;
 	}
@@ -343,14 +353,17 @@ static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
 	read_unlock(&gpc->lock);
 }
 
-static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+static int nested_gpc_hpa(struct gfn_to_pfn_cache *gpc, gpa_t gpa, hpa_t *hpa)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int err;
+
+	err = nested_gpc_lock(gpc, gpa);
+	if (err)
+		return err;
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
-	vmx->nested.pi_desc = NULL;
+	*hpa = pfn_to_hpa(gpc->pfn);
+	nested_gpc_unlock(gpc);
+	return 0;
 }
 
 /*
@@ -373,6 +386,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
 
+	kvm_gpc_deactivate(&vmx->nested.pi_desc_cache);
+	kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
+	kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
 	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
 
 	free_vpid(vmx->nested.vpid02);
@@ -389,8 +405,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_shadow_vmcs12);
 	vmx->nested.cached_shadow_vmcs12 = NULL;
 
-	nested_put_vmcs12_pages(vcpu);
-
 	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
 	nested_release_evmcs(vcpu);
@@ -3361,7 +3375,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_host_map *map;
+	struct gfn_to_pfn_cache *gpc;
+	hpa_t hpa;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
 	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3376,10 +3391,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		map = &vmx->nested.apic_access_page_map;
+		gpc = &vmx->nested.apic_access_page_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
-			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->apic_access_addr, &hpa)) {
+			vmcs_write64(APIC_ACCESS_ADDR, hpa);
 		} else {
 			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
 					     __func__);
@@ -3392,10 +3407,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		map = &vmx->nested.virtual_apic_map;
+		gpc = &vmx->nested.virtual_apic_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->virtual_apic_page_addr, &hpa)) {
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
 			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3418,14 +3433,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		map = &vmx->nested.pi_desc_map;
+		gpc = &vmx->nested.pi_desc_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
-			vmx->nested.pi_desc =
-				(struct pi_desc *)(((void *)map->hva) +
-				offset_in_page(vmcs12->posted_intr_desc_addr));
+		if (!nested_gpc_hpa(gpc, vmcs12->posted_intr_desc_addr & PAGE_MASK, &hpa)) {
+			vmx->nested.pi_desc_offset = offset_in_page(vmcs12->posted_intr_desc_addr);
 			vmcs_write64(POSTED_INTR_DESC_ADDR,
-				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+				     hpa + offset_in_page(vmcs12->posted_intr_desc_addr));
 		} else {
 			/*
 			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
@@ -3433,7 +3446,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 			 * descriptor. (Note that KVM may do this when it
 			 * should not, per the architectural specification.)
 			 */
-			vmx->nested.pi_desc = NULL;
 			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
 		}
 	}
@@ -3474,7 +3486,16 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
 {
-	return false;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * @vcpu is in IN_GUEST_MODE, eliminating the need for individual gpc
+	 * locks. Since kvm_gpc_invalid() doesn't verify the gpc memslot
+	 * generation, we can also skip acquiring the srcu lock.
+	 */
+	return kvm_gpc_invalid(&vmx->nested.apic_access_page_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.virtual_apic_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.pi_desc_cache);
 }
 
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
@@ -3969,9 +3990,55 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
+{
+retry:
+	read_lock(&gpc->lock);
+	if (!gpc->active) {
+		read_unlock(&gpc->lock);
+		return NULL;
+	}
+
+	if (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+		read_unlock(&gpc->lock);
+		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
+			return NULL;
+		goto retry;
+	}
+
+	return gpc->khva;
+}
+
+static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
+{
+	u8 *pi_desc_page;
+
+	pi_desc_page = nested_gpc_lock_if_active(&vmx->nested.pi_desc_cache);
+	if (!pi_desc_page)
+		return NULL;
+
+	return (struct pi_desc *)(pi_desc_page + vmx->nested.pi_desc_offset);
+}
+
+static void nested_unlock_pi_desc(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.pi_desc_cache);
+}
+
+static void *nested_lock_vapic(struct vcpu_vmx *vmx)
+{
+	return nested_gpc_lock_if_active(&vmx->nested.virtual_apic_cache);
+}
+
+static void nested_unlock_vapic(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.virtual_apic_cache);
+}
+
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct pi_desc *pi_desc;
 	int max_irr;
 	void *vapic_page;
 	u16 status;
@@ -3979,22 +4046,29 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.pi_pending)
 		return 0;
 
-	if (!vmx->nested.pi_desc)
+	pi_desc = nested_lock_pi_desc(vmx);
+	if (!pi_desc)
 		goto mmio_needed;
 
 	vmx->nested.pi_pending = false;
 
-	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+	if (!pi_test_and_clear_on(pi_desc)) {
+		nested_unlock_pi_desc(vmx);
 		return 0;
+	}
 
-	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+	max_irr = pi_find_highest_vector(pi_desc);
 	if (max_irr > 0) {
-		vapic_page = vmx->nested.virtual_apic_map.hva;
-		if (!vapic_page)
+		vapic_page = nested_lock_vapic(vmx);
+		if (!vapic_page) {
+			nested_unlock_pi_desc(vmx);
 			goto mmio_needed;
+		}
+
+		__kvm_apic_update_irr(pi_desc->pir, vapic_page, &max_irr);
+
+		nested_unlock_vapic(vmx);
 
-		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
-				      vapic_page, &max_irr);
 		status = vmcs_read16(GUEST_INTR_STATUS);
 		if ((u8)max_irr > ((u8)status & 0xff)) {
 			status &= ~0xff;
@@ -4003,6 +4077,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	nested_unlock_pi_desc(vmx);
 	nested_mark_vmcs12_pages_dirty(vcpu);
 	return 0;
 
@@ -4122,8 +4197,10 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic = vmx->nested.virtual_apic_map.hva;
+	struct pi_desc *pi_desc;
 	int max_irr, vppr;
+	void *vapic;
+	bool res = false;
 
 	if (nested_vmx_preemption_timer_pending(vcpu) ||
 	    vmx->nested.mtf_pending)
@@ -4142,23 +4219,33 @@ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 	    __vmx_interrupt_blocked(vcpu))
 		return false;
 
+	vapic = nested_lock_vapic(vmx);
 	if (!vapic)
 		return false;
 
 	vppr = *((u32 *)(vapic + APIC_PROCPRI));
 
+	nested_unlock_vapic(vmx);
+
 	max_irr = vmx_get_rvi();
 	if ((max_irr & 0xf0) > (vppr & 0xf0))
 		return true;
 
-	if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
-	    pi_test_on(vmx->nested.pi_desc)) {
-		max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
-		if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
-			return true;
+	if (vmx->nested.pi_pending) {
+		pi_desc = nested_lock_pi_desc(vmx);
+		if (!pi_desc)
+			return false;
+
+		if (pi_test_on(pi_desc)) {
+			max_irr = pi_find_highest_vector(pi_desc);
+			if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+				res = true;
+		}
+
+		nested_unlock_pi_desc(vmx);
 	}
 
-	return false;
+	return res;
 }
 
 /*
@@ -5106,7 +5193,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		vmx_update_cpu_dirty_logging(vcpu);
 	}
 
-	nested_put_vmcs12_pages(vcpu);
+	nested_mark_vmcs12_pages_dirty(vcpu);
 
 	if (vmx->nested.reload_vmcs01_apic_access_page) {
 		vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -5391,6 +5478,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
 
+	kvm_gpc_init_for_vcpu(&vmx->nested.apic_access_page_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
+
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 3a6983222841..2c74c65d3383 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -158,11 +158,11 @@ struct nested_vmx {
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
-	struct kvm_host_map apic_access_page_map;
-	struct kvm_host_map virtual_apic_map;
-	struct kvm_host_map pi_desc_map;
+	struct gfn_to_pfn_cache apic_access_page_cache;
+	struct gfn_to_pfn_cache virtual_apic_cache;
+	struct gfn_to_pfn_cache pi_desc_cache;
 
-	struct pi_desc *pi_desc;
+	u64 pi_desc_offset;
 	bool pi_pending;
 	u16 posted_intr_nv;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2eb551a11818..dc622adb561f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1526,6 +1526,11 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
 	return gpc->active && kvm_is_error_gpa(gpc->gpa);
 }
 
+static inline bool kvm_gpc_invalid(struct gfn_to_pfn_cache *gpc)
+{
+	return gpc->active && !gpc->valid;
+}
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
-- 
2.51.0
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: griffoul@gmail.com, Fred Griffoul, Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org, "H. Peter Anvin", Shuah Khan, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH 5/5] KVM: selftests: Add nested VMX APIC cache invalidation test
Date: Mon, 8 Sep 2025 22:32:30 +0100
Message-ID: <20250908213241.3189113-6-griffoul@infradead.org>

From: Fred Griffoul

Introduce a selftest to verify the nested VMX APIC virtualization page
cache invalidation and refresh mechanisms of the pfncache
implementation.

The test exercises the nested VMX APIC cache invalidation path through:

- L2 guest setup: creates a nested environment where L2 accesses the
  APIC access page that is cached by KVM using a pfncache.

- Cache invalidation triggers: a separate update thread periodically
  invalidates the cached pages using either:
  - madvise(MADV_DONTNEED) to trigger MMU notifications, or
  - vm_mem_region_move() to trigger memslot changes.

The test validates that:

- L2 can successfully access the APIC page before and after
  invalidation.
- KVM properly handles cache refresh without guest-visible errors.
- Both the MMU notification and memslot change invalidation paths work
  correctly.
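For reference, the test's command-line options (taken from the getopt
loop in main() below); with no arguments, both passes are run:

	./vmx_apic_update_test		# madvise pass, then memslot-move pass
	./vmx_apic_update_test -a	# madvise(MADV_DONTNEED) pass only
	./vmx_apic_update_test -m	# memslot-move pass only
	./vmx_apic_update_test -p 50	# update period of 50 ms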
Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/x86/vmx_apic_update_test.c  | 302 ++++++++++++++++++
 2 files changed, 303 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/vmx_apic_update_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 90f03f00cb04..5d4505c7f6f0 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -136,6 +136,7 @@ TEST_GEN_PROGS_x86 += x86/max_vcpuid_cap_test
 TEST_GEN_PROGS_x86 += x86/triple_fault_event_test
 TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test
 TEST_GEN_PROGS_x86 += x86/aperfmperf_test
+TEST_GEN_PROGS_x86 += x86/vmx_apic_update_test
 TEST_GEN_PROGS_x86 += access_tracking_perf_test
 TEST_GEN_PROGS_x86 += coalesced_io_test
 TEST_GEN_PROGS_x86 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/x86/vmx_apic_update_test.c b/tools/testing/selftests/kvm/x86/vmx_apic_update_test.c
new file mode 100644
index 000000000000..22f82cf6dd0c
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/vmx_apic_update_test.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_apic_update_test
+ *
+ * Copyright (C) 2025, Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Test L2 guest APIC access page writes with concurrent MMU
+ * notifications and memslot move updates.
+ */
+#include <pthread.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VAPIC_GPA	0xc0000000
+#define VAPIC_SLOT	1
+
+#define L2_GUEST_STACK_SIZE 64
+
+#define L2_DELAY (100)
+
+static void l2_guest_code(void)
+{
+	uint32_t *vapic_addr = (uint32_t *)(VAPIC_GPA + 0x80);
+
+	/* Unroll the loop to avoid any compiler side effect */
+
+	WRITE_ONCE(*vapic_addr, 1 << 0);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 1);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 2);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 3);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 4);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 5);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 1 << 6);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	WRITE_ONCE(*vapic_addr, 0);
+	udelay(msecs_to_usecs(L2_DELAY));
+
+	/* Exit to L1 */
+	vmcall();
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	uint32_t control, exit_reason;
+
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	/* Enable APIC access */
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+	control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+	control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+	vmwrite(APIC_ACCESS_ADDR, VAPIC_GPA);
+
+	GUEST_SYNC1(0);
+	GUEST_ASSERT(!vmlaunch());
+again:
+	exit_reason = vmreadz(VM_EXIT_REASON);
+	if (exit_reason == EXIT_REASON_APIC_ACCESS) {
+		uint64_t guest_rip = vmreadz(GUEST_RIP);
+		uint64_t instr_len = vmreadz(VM_EXIT_INSTRUCTION_LEN);
+
+		vmwrite(GUEST_RIP, guest_rip + instr_len);
+		GUEST_ASSERT(!vmresume());
+		goto again;
+	}
+
+	GUEST_SYNC1(exit_reason);
+	GUEST_ASSERT(exit_reason == EXIT_REASON_VMCALL);
+	GUEST_DONE();
+}
+
+static const char *progname;
+static int update_period_ms = L2_DELAY / 4;
+
+struct update_control {
+	pthread_mutex_t mutex;
+	pthread_cond_t start_cond;
+	struct kvm_vm *vm;
+	bool running;
+	bool started;
+	int updates;
+};
+
+static void wait_for_start_signal(struct update_control *ctrl)
+{
+	pthread_mutex_lock(&ctrl->mutex);
+	while (!ctrl->started)
+		pthread_cond_wait(&ctrl->start_cond, &ctrl->mutex);
+
+	pthread_mutex_unlock(&ctrl->mutex);
+	printf("%s: starting update\n", progname);
+}
+
+static bool is_running(struct update_control *ctrl)
+{
+	return READ_ONCE(ctrl->running);
+}
+
+static void set_running(struct update_control *ctrl, bool running)
+{
+	WRITE_ONCE(ctrl->running, running);
+}
+
+static void signal_thread_start(struct update_control *ctrl)
+{
+	pthread_mutex_lock(&ctrl->mutex);
+	if (!ctrl->started) {
+		ctrl->started = true;
+		pthread_cond_signal(&ctrl->start_cond);
+	}
+	pthread_mutex_unlock(&ctrl->mutex);
+}
+
+static void *update_madvise(void *arg)
+{
+	struct update_control *ctrl = arg;
+	void *hva;
+
+	wait_for_start_signal(ctrl);
+
+	hva = addr_gpa2hva(ctrl->vm, VAPIC_GPA);
+	memset(hva, 0x45, ctrl->vm->page_size);
+
+	while (is_running(ctrl)) {
+		usleep(update_period_ms * 1000);
+		madvise(hva, ctrl->vm->page_size, MADV_DONTNEED);
+		ctrl->updates++;
+	}
+
+	return NULL;
+}
+
+static void *update_move_memslot(void *arg)
+{
+	struct update_control *ctrl = arg;
+	uint64_t gpa = VAPIC_GPA;
+
+	wait_for_start_signal(ctrl);
+
+	while (is_running(ctrl)) {
+		usleep(update_period_ms * 1000);
+		gpa += 0x10000;
+		vm_mem_region_move(ctrl->vm, VAPIC_SLOT, gpa);
+		ctrl->updates++;
+	}
+
+	return NULL;
+}
+
+static void run(void *(*update)(void *), const char *name)
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+	struct vmx_pages *vmx;
+	struct update_control ctrl;
+	struct ucall uc;
+	vm_vaddr_t vmx_pages_gva;
+	pthread_t update_thread;
+	bool done = false;
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+	/* Allocate VMX pages */
+	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+
+	/* Allocate memory and create VAPIC memslot */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, VAPIC_GPA,
+				    VAPIC_SLOT, 1, 0);
+
+	/* Allocate guest page table */
+	virt_map(vm, VAPIC_GPA, VAPIC_GPA, 1);
+
+	/* Set up nested EPT */
+	prepare_eptp(vmx, vm, 0);
+	nested_map_memslot(vmx, vm, 0);
+	nested_map_memslot(vmx, vm, VAPIC_SLOT);
+	nested_map(vmx, vm, VAPIC_GPA, VAPIC_GPA, vm->page_size);
+
+	vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+	pthread_mutex_init(&ctrl.mutex, NULL);
+	pthread_cond_init(&ctrl.start_cond, NULL);
+	ctrl.vm = vm;
+	ctrl.running = true;
+	ctrl.started = false;
+	ctrl.updates = 0;
+
+	pthread_create(&update_thread, NULL, update, &ctrl);
+
+	printf("%s: running %s (tsc_khz %lu)\n", progname, name, guest_tsc_khz);
+
+	while (!done) {
+		vcpu_run(vcpu);
+
+		switch (vcpu->run->exit_reason) {
+		case KVM_EXIT_IO:
+			switch (get_ucall(vcpu, &uc)) {
+			case UCALL_SYNC:
+				printf("%s: sync(%ld)\n", progname, uc.args[0]);
+				if (uc.args[0] == 0)
+					signal_thread_start(&ctrl);
+				break;
+			case UCALL_ABORT:
+				REPORT_GUEST_ASSERT(uc);
+				/* NOT REACHED */
+			case UCALL_DONE:
+				done = true;
+				break;
+			default:
+				TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+			}
+			break;
+		case KVM_EXIT_MMIO:
+			/* Handle APIC MMIO access after memslot move */
+			printf("%s: APIC MMIO access at 0x%llx (memslot move effect)\n",
+			       progname, vcpu->run->mmio.phys_addr);
+			break;
+		default:
+			TEST_FAIL("%s: Unexpected exit reason: %d (flags 0x%x)",
+				  progname,
+				  vcpu->run->exit_reason, vcpu->run->flags);
+		}
+	}
+
+	set_running(&ctrl, false);
+	if (!ctrl.started)
+		signal_thread_start(&ctrl);
+	pthread_join(update_thread, NULL);
+	printf("%s: completed with %d updates\n", progname, ctrl.updates);
+
+	pthread_mutex_destroy(&ctrl.mutex);
+	pthread_cond_destroy(&ctrl.start_cond);
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	int opt_madvise = 0;
+	int opt_memslot_move = 0;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+	TEST_REQUIRE(kvm_cpu_has_ept());
+
+	if (argc == 1) {
+		opt_madvise = 1;
+		opt_memslot_move = 1;
+	} else {
+		int opt;
+
+		while ((opt = getopt(argc, argv, "amp:")) != -1) {
+			switch (opt) {
+			case 'a':
+				opt_madvise = 1;
+				break;
+			case 'm':
+				opt_memslot_move = 1;
+				break;
+			case 'p':
+				update_period_ms = atoi(optarg);
+				break;
+			default:
+				exit(1);
+			}
+		}
+	}
+
+	TEST_ASSERT(opt_madvise || opt_memslot_move,
+		    "No update test configured");
+
+	progname = argv[0];
+
+	if (opt_madvise)
+		run(update_madvise, "madvise");
+
+	if (opt_memslot_move)
+		run(update_move_memslot, "move memslot");
+
+	return 0;
+}
-- 
2.51.0