From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com,
	shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org,
	linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 01/10] KVM: nVMX: Implement cache for L1 MSR bitmap
Date: Fri, 21 Nov 2025 11:11:04 +0000
Message-ID: <20251121111113.456628-2-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

From: Fred Griffoul

Introduce a gfn_to_pfn_cache to optimize L1 MSR bitmap access,
replacing the map/unmap operations. This reduces overhead during L2
VM-entry, where nested_vmx_prepare_msr_bitmap() merges L1's MSR
intercepts with L0's requirements. The current implementation, based
on kvm_vcpu_map_readonly() and kvm_vcpu_unmap(), incurs a significant
performance cost, mostly with unmanaged guest memory.

The cache is initialized when entering VMX operation and deactivated
when VMX operation ends.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/nested.c | 42 +++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/vmx/vmx.h    |  2 ++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8b131780e981..0de84b30c41d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -315,6 +315,34 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	vcpu->arch.regs_dirty = 0;
 }
 
+/*
+ * Map a single guest page starting at @gpa and lock the cache for access.
+ */
+static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+{
+	int err;
+
+	if (!PAGE_ALIGNED(gpa))
+		return -EINVAL;
+retry:
+	read_lock(&gpc->lock);
+	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
+		read_unlock(&gpc->lock);
+		err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+		if (err)
+			return err;
+
+		goto retry;
+	}
+
+	return 0;
+}
+
+static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
+{
+	read_unlock(&gpc->lock);
+}
+
 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -344,6 +372,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
+
+	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
+
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = INVALID_GPA;
@@ -625,7 +656,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	int msr;
 	unsigned long *msr_bitmap_l1;
 	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
-	struct kvm_host_map map;
+	struct gfn_to_pfn_cache *gpc;
 
 	/* Nothing to do if the MSR bitmap is not in use. */
 	if (!cpu_has_vmx_msr_bitmap() ||
@@ -648,10 +679,11 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 		return true;
 	}
 
-	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
+	gpc = &vmx->nested.msr_bitmap_cache;
+	if (nested_gpc_lock(gpc, vmcs12->msr_bitmap))
 		return false;
 
-	msr_bitmap_l1 = (unsigned long *)map.hva;
+	msr_bitmap_l1 = (unsigned long *)gpc->khva;
 
 	/*
 	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -739,7 +771,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_PL3_SSP, MSR_TYPE_RW);
 
-	kvm_vcpu_unmap(vcpu, &map);
+	nested_gpc_unlock(gpc);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
 
@@ -5490,6 +5522,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 	vmx->nested.vpid02 = allocate_vpid();
 
+	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
+
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index ea93121029f9..d76621403c28 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -152,6 +152,8 @@ struct nested_vmx {
 
 	struct loaded_vmcs vmcs02;
 
+	struct gfn_to_pfn_cache msr_bitmap_cache;
+
 	/*
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
-- 
2.43.0
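
For readers unfamiliar with the pfncache API used above, the check/activate/retry
protocol implemented by nested_gpc_lock() can be summarized in a minimal,
hypothetical sketch; only kvm_gpc_check(), kvm_gpc_activate() and the gpc read
lock from this series are assumed:

	/*
	 * Sketch of the generic pfncache read protocol (illustration only,
	 * not part of the patch).  kvm_gpc_activate() may sleep, and
	 * kvm_gpc_check() must be re-run under the read lock because
	 * activation can race with an MMU-notifier invalidation.
	 */
	static int gpc_lock_sketch(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
	{
	retry:
		read_lock(&gpc->lock);
		if (!kvm_gpc_check(gpc, PAGE_SIZE)) {
			read_unlock(&gpc->lock);
			if (kvm_gpc_activate(gpc, gpa, PAGE_SIZE))
				return -EFAULT;	/* no backing page */
			goto retry;		/* re-validate under the lock */
		}
		/* gpc->khva and gpc->pfn are stable until read_unlock(). */
		return 0;
	}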
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com,
	shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org,
	linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 02/10] KVM: pfncache: Restore guest-uses-pfn support
Date: Fri, 21 Nov 2025 11:11:05 +0000
Message-ID: <20251121111113.456628-3-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

From: Fred Griffoul

Restore guest-page access tracking in pfncache, so that a vCPU request
is generated automatically when a cached page is invalidated by an MMU
notifier event. This is critical for nested VMX, where both KVM and the
L2 guest access guest-provided pages such as APIC pages and posted
interrupt descriptors.

This change:

- Reverts commit eefb85b3f031 ("KVM: Drop unused @may_block param from
  gfn_to_pfn_cache_invalidate_start()").

- Partially reverts commit a4bff3df5147 ("KVM: pfncache: remove
  KVM_GUEST_USES_PFN usage"), adding kvm_gpc_init_for_vcpu() to
  initialize a pfncache for guest-mode access instead of the
  usage-specific flag approach.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 include/linux/kvm_host.h  | 29 +++++++++++++++++++++++++-
 include/linux/kvm_types.h |  1 +
 virt/kvm/kvm_main.c       |  3 ++-
 virt/kvm/kvm_mm.h         |  6 ++++--
 virt/kvm/pfncache.c       | 43 ++++++++++++++++++++++++++++++++++++---
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 19b8c4bebb9c..6253cf1c38c1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1402,6 +1402,9 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+		    struct kvm_vcpu *vcpu);
+
 /**
  * kvm_gpc_init - initialize gfn_to_pfn_cache.
  *
@@ -1412,7 +1415,11 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
  * immutable attributes.  Note, the cache must be zero-allocated (or zeroed by
  * the caller before init).
  */
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);
+
+static inline void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
+{
+	__kvm_gpc_init(gpc, kvm, NULL);
+}
 
 /**
  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
@@ -1494,6 +1501,26 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len);
  */
 void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);
 
+/**
+ * kvm_gpc_init_for_vcpu - initialize gfn_to_pfn_cache for pin/unpin usage
+ *
+ * @gpc:  struct gfn_to_pfn_cache object.
+ * @vcpu: vCPU that will pin and directly access this cache.
+ *
+ * This sets up a gfn_to_pfn_cache for use by a vCPU that will directly access
+ * the cached physical address.  When the cache is invalidated while pinned,
+ * a request is sent to the associated vCPU to force a cache refresh.
+ *
+ * Note, the cache must be zero-allocated (or zeroed by the caller before
+ * init).
+ */
+static inline void kvm_gpc_init_for_vcpu(struct gfn_to_pfn_cache *gpc,
+					 struct kvm_vcpu *vcpu)
+{
+	__kvm_gpc_init(gpc, vcpu->kvm, vcpu);
+}
+
 static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc)
 {
 	return gpc->active && !kvm_is_error_gpa(gpc->gpa);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 490464c205b4..445170ea23e4 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -74,6 +74,7 @@ struct gfn_to_pfn_cache {
 	struct kvm_memory_slot *memslot;
 	struct kvm *kvm;
 	struct list_head list;
+	struct kvm_vcpu *vcpu;
 	rwlock_t lock;
 	struct mutex refresh_lock;
 	void *khva;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 226faeaa8e56..88de1eac5baf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -760,7 +760,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * mn_active_invalidate_count (see above) instead of
 	 * mmu_invalidate_in_progress.
 	 */
-	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
+	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
+					  hva_range.may_block);
 
 	/*
 	 * If one or more memslots were found and thus zapped, notify arch code
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 31defb08ccba..f1ba02084bd9 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -58,11 +58,13 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp);
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 				       unsigned long start,
-				       unsigned long end);
+				       unsigned long end,
+				       bool may_block);
 #else
 static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool may_block)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 728d2c1b488a..543466ff40a0 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -23,9 +23,11 @@
  * MMU notifier 'invalidate_range_start' hook.
  */
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-				       unsigned long end)
+				       unsigned long end, bool may_block)
 {
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
 	struct gfn_to_pfn_cache *gpc;
+	bool evict_vcpus = false;
 
 	spin_lock(&kvm->gpc_lock);
 	list_for_each_entry(gpc, &kvm->gpc_list, list) {
@@ -46,8 +48,21 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 
 			write_lock_irq(&gpc->lock);
 			if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-			    gpc->uhva >= start && gpc->uhva < end)
+			    gpc->uhva >= start && gpc->uhva < end) {
 				gpc->valid = false;
+
+				/*
+				 * If a guest vCPU could be using the physical address,
+				 * it needs to be forced out of guest mode.
+				 */
+				if (gpc->vcpu) {
+					if (!evict_vcpus) {
+						evict_vcpus = true;
+						bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+					}
+					__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
+				}
+			}
 			write_unlock_irq(&gpc->lock);
 			continue;
 		}
@@ -55,6 +70,27 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 		read_unlock_irq(&gpc->lock);
 	}
 	spin_unlock(&kvm->gpc_lock);
+
+	if (evict_vcpus) {
+		/*
+		 * KVM needs to ensure the vCPU is fully out of guest context
+		 * before allowing the invalidation to continue.
+		 */
+		unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE;
+		bool called;
+
+		/*
+		 * If the OOM reaper is active, then all vCPUs should have
+		 * been stopped already, so perform the request without
+		 * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
+		 */
+		if (!may_block)
+			req &= ~KVM_REQUEST_WAIT;
+
+		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+
+		WARN_ON_ONCE(called && !may_block);
+	}
 }
 
 static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva,
@@ -382,7 +418,7 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	return __kvm_gpc_refresh(gpc, gpc->gpa, uhva);
 }
 
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
+void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
 	rwlock_init(&gpc->lock);
 	mutex_init(&gpc->refresh_lock);
@@ -391,6 +427,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 	gpc->gpa = INVALID_GPA;
 	gpc->uhva = KVM_HVA_ERR_BAD;
+	gpc->vcpu = vcpu;
 	gpc->active = gpc->valid = false;
 }
 
-- 
2.43.0
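
A minimal sketch of how a caller would use the restored vCPU association
(hypothetical setup helper; only the API added in this patch is assumed):

	/*
	 * Sketch (not from the patch): a per-vCPU cache whose invalidation
	 * forces the vCPU out of guest mode.
	 */
	static int guest_used_cache_setup(struct kvm_vcpu *vcpu,
					  struct gfn_to_pfn_cache *gpc, gpa_t gpa)
	{
		/* Records gpc->vcpu so the invalidation path can find it. */
		kvm_gpc_init_for_vcpu(gpc, vcpu);

		/*
		 * Once active, an MMU-notifier hit on the mapped range clears
		 * gpc->valid and sends KVM_REQ_OUTSIDE_GUEST_MODE to this
		 * vCPU via kvm_make_vcpus_request_mask().
		 */
		return kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
	}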
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com,
	shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org,
	linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 03/10] KVM: x86: Add nested state validation for pfncache support
Date: Fri, 21 Nov 2025 11:11:06 +0000
Message-ID: <20251121111113.456628-4-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

From: Fred Griffoul

Implement state validation for nested virtualization to enable pfncache
support for L1 guest pages. This adds a new nested_ops callback,
is_nested_state_invalid(), that detects when KVM needs to reload nested
virtualization state. When invalid state is detected, a
KVM_REQ_GET_NESTED_STATE_PAGES request is raised so that the affected
pages are reloaded before L2 execution.

The callback monitors L1 guest pages during guest entry/exit while the
vCPU runs in IN_GUEST_MODE. For now the VMX implementation returns
false; full support is added in the next patch.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/nested.c       |  6 ++++++
 arch/x86/kvm/x86.c              | 14 +++++++++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..4675e71b33a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1960,6 +1960,7 @@ struct kvm_x86_nested_ops {
 			 struct kvm_nested_state __user *user_kvm_nested_state,
 			 struct kvm_nested_state *kvm_state);
 	bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
+	bool (*is_nested_state_invalid)(struct kvm_vcpu *vcpu);
 	int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
 
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0de84b30c41d..627a6c24625d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3588,6 +3588,11 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
 	struct vmcs12 *vmcs12;
@@ -7527,6 +7532,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.get_state = vmx_get_nested_state,
 	.set_state = vmx_set_nested_state,
 	.get_nested_state_pages = vmx_get_nested_state_pages,
+	.is_nested_state_invalid = vmx_is_nested_state_invalid,
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 #ifdef CONFIG_KVM_HYPERV
 	.enable_evmcs = nested_enable_evmcs,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b8138bd4857..1a9c1171df49 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2262,12 +2262,24 @@ int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor);
 
+static inline bool kvm_invalid_nested_state(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops.nested_ops->is_nested_state_invalid &&
+	    kvm_x86_ops.nested_ops->is_nested_state_invalid(vcpu)) {
+		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+		return true;
+	}
+	return false;
+}
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
 	xfer_to_guest_mode_prepare();
 
 	return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
-	       kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending();
+	       kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending() ||
+	       kvm_invalid_nested_state(vcpu);
 }
 
 static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-- 
2.43.0
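
For orientation, the race this check closes can be sketched as a simplified
entry flow (hypothetical helper, not the actual vcpu_enter_guest(); only
names from this series are used):

	/*
	 * Simplified ordering sketch: why a cache invalidated after reload
	 * is still caught before VM entry.
	 */
	static int entry_flow_sketch(struct kvm_vcpu *vcpu)
	{
		/* 1. Reload caches while still in OUTSIDE_GUEST_MODE. */
		if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu) &&
		    !kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))
			return -EFAULT;

		/* 2. Transition; invalidations now kick this vCPU. */
		WRITE_ONCE(vcpu->mode, IN_GUEST_MODE);

		/*
		 * 3. Re-check: an invalidation racing with step 1 left some
		 *    cache with valid == false, which
		 *    is_nested_state_invalid() reports, so
		 *    kvm_vcpu_exit_request() bounces back to step 1.
		 */
		if (kvm_vcpu_exit_request(vcpu)) {
			WRITE_ONCE(vcpu->mode, OUTSIDE_GUEST_MODE);
			return -EAGAIN;	/* retry entry */
		}

		return 0;	/* safe to enter L2 */
	}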
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com,
	shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org,
	linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 04/10] KVM: nVMX: Implement cache for L1 APIC pages
Date: Fri, 21 Nov 2025 11:11:07 +0000
Message-ID: <20251121111113.456628-5-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

From: Fred Griffoul

Replace kvm_host_map usage with gfn_to_pfn_cache for the L1 APIC
virtualization pages (APIC access, virtual APIC, and posted interrupt
descriptor pages) to improve performance with unmanaged guest memory.

The conversion involves several changes:

- Page loading in nested_get_vmcs12_pages(): load vmcs02 fields with
  pfncache PFNs after each cache has been checked and, if necessary,
  activated or refreshed, while the vCPU is in OUTSIDE_GUEST_MODE.

- Invalidation window handling: since nested_get_vmcs12_pages() runs in
  OUTSIDE_GUEST_MODE, there is a window in which the caches can be
  invalidated by MMU notifications before the vCPU enters IN_GUEST_MODE.
  Implement the is_nested_state_invalid() callback to monitor cache
  validity across the OUTSIDE_GUEST_MODE to IN_GUEST_MODE transition;
  it triggers KVM_REQ_GET_NESTED_STATE_PAGES when needed.

- Cache access in event callbacks: the virtual APIC and posted
  interrupt descriptor pages are accessed by KVM in the has_events()
  and check_events() nested_ops callbacks. These use the kernel HVA
  following the pfncache check/refresh pattern, with both callbacks
  able to sleep if a cache refresh is required.

This eliminates expensive memremap/memunmap cycles on each L2 VM
entry/exit, providing substantial performance improvements when using
unmanaged memory.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/nested.c | 169 +++++++++++++++++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |   8 +-
 include/linux/kvm_host.h  |   5 ++
 3 files changed, 139 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 627a6c24625d..1f58b380585b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -329,8 +329,18 @@ static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
 	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
 		read_unlock(&gpc->lock);
 		err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
-		if (err)
+		if (err) {
+			/*
+			 * Deactivate nested state caches to prevent
+			 * kvm_gpc_invalid() from returning true in subsequent
+			 * is_nested_state_invalid() calls.  This prevents an
+			 * infinite loop while entering guest mode.
+			 */
+			if (gpc->vcpu)
+				kvm_gpc_deactivate(gpc);
+
 			return err;
+		}
 
 		goto retry;
 	}
@@ -343,14 +353,17 @@ static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
 	read_unlock(&gpc->lock);
 }
 
-static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+static int nested_gpc_hpa(struct gfn_to_pfn_cache *gpc, gpa_t gpa, hpa_t *hpa)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int err;
+
+	err = nested_gpc_lock(gpc, gpa);
+	if (err)
+		return err;
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
-	vmx->nested.pi_desc = NULL;
+	*hpa = pfn_to_hpa(gpc->pfn);
+	nested_gpc_unlock(gpc);
+	return 0;
 }
 
 /*
@@ -373,6 +386,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
 
+	kvm_gpc_deactivate(&vmx->nested.pi_desc_cache);
+	kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
+	kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
 	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
 
 	free_vpid(vmx->nested.vpid02);
@@ -389,8 +405,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_shadow_vmcs12);
 	vmx->nested.cached_shadow_vmcs12 = NULL;
 
-	nested_put_vmcs12_pages(vcpu);
-
 	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
 	nested_release_evmcs(vcpu);
@@ -3477,7 +3491,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_host_map *map;
+	struct gfn_to_pfn_cache *gpc;
+	hpa_t hpa;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
 	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3492,10 +3507,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		map = &vmx->nested.apic_access_page_map;
+		gpc = &vmx->nested.apic_access_page_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
-			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->apic_access_addr, &hpa)) {
+			vmcs_write64(APIC_ACCESS_ADDR, hpa);
 		} else {
 			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
 					     __func__);
@@ -3508,10 +3523,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		map = &vmx->nested.virtual_apic_map;
+		gpc = &vmx->nested.virtual_apic_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->virtual_apic_page_addr, &hpa)) {
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
 			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3534,14 +3549,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		map = &vmx->nested.pi_desc_map;
+		gpc = &vmx->nested.pi_desc_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
-			vmx->nested.pi_desc =
-				(struct pi_desc *)(((void *)map->hva) +
-				offset_in_page(vmcs12->posted_intr_desc_addr));
+		if (!nested_gpc_hpa(gpc, vmcs12->posted_intr_desc_addr & PAGE_MASK, &hpa)) {
+			vmx->nested.pi_desc_offset = offset_in_page(vmcs12->posted_intr_desc_addr);
 			vmcs_write64(POSTED_INTR_DESC_ADDR,
-				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+				     hpa + offset_in_page(vmcs12->posted_intr_desc_addr));
 		} else {
 			/*
 			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
@@ -3549,7 +3562,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 			 * descriptor. (Note that KVM may do this when it
 			 * should not, per the architectural specification.)
 			 */
-			vmx->nested.pi_desc = NULL;
 			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
 		}
 	}
@@ -3590,7 +3602,16 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
 {
-	return false;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * @vcpu is in IN_GUEST_MODE, eliminating the need for individual gpc
+	 * locks.  Since kvm_gpc_invalid() doesn't verify the gpc memslot
+	 * generation, we can also skip acquiring the srcu lock.
+	 */
+	return kvm_gpc_invalid(&vmx->nested.apic_access_page_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.virtual_apic_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.pi_desc_cache);
 }
 
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
@@ -4091,9 +4112,55 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
+{
+retry:
+	read_lock(&gpc->lock);
+	if (!gpc->active) {
+		read_unlock(&gpc->lock);
+		return NULL;
+	}
+
+	if (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+		read_unlock(&gpc->lock);
+		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
+			return NULL;
+		goto retry;
+	}
+
+	return gpc->khva;
+}
+
+static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
+{
+	u8 *pi_desc_page;
+
+	pi_desc_page = nested_gpc_lock_if_active(&vmx->nested.pi_desc_cache);
+	if (!pi_desc_page)
+		return NULL;
+
+	return (struct pi_desc *)(pi_desc_page + vmx->nested.pi_desc_offset);
+}
+
+static void nested_unlock_pi_desc(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.pi_desc_cache);
+}
+
+static void *nested_lock_vapic(struct vcpu_vmx *vmx)
+{
+	return nested_gpc_lock_if_active(&vmx->nested.virtual_apic_cache);
+}
+
+static void nested_unlock_vapic(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.virtual_apic_cache);
+}
+
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct pi_desc *pi_desc;
 	int max_irr;
 	void *vapic_page;
 	u16 status;
@@ -4101,22 +4168,29 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.pi_pending)
 		return 0;
 
-	if (!vmx->nested.pi_desc)
+	pi_desc = nested_lock_pi_desc(vmx);
+	if (!pi_desc)
 		goto mmio_needed;
 
 	vmx->nested.pi_pending = false;
 
-	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+	if (!pi_test_and_clear_on(pi_desc)) {
+		nested_unlock_pi_desc(vmx);
 		return 0;
+	}
 
-	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+	max_irr = pi_find_highest_vector(pi_desc);
 	if (max_irr > 0) {
-		vapic_page = vmx->nested.virtual_apic_map.hva;
-		if (!vapic_page)
+		vapic_page = nested_lock_vapic(vmx);
+		if (!vapic_page) {
+			nested_unlock_pi_desc(vmx);
 			goto mmio_needed;
+		}
+
+		__kvm_apic_update_irr(pi_desc->pir, vapic_page, &max_irr);
+
+		nested_unlock_vapic(vmx);
 
-		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
-				      vapic_page, &max_irr);
 		status = vmcs_read16(GUEST_INTR_STATUS);
 		if ((u8)max_irr > ((u8)status & 0xff)) {
 			status &= ~0xff;
@@ -4125,6 +4199,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	nested_unlock_pi_desc(vmx);
 	nested_mark_vmcs12_pages_dirty(vcpu);
 	return 0;
 
@@ -4244,8 +4319,10 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic = vmx->nested.virtual_apic_map.hva;
+	struct pi_desc *pi_desc;
 	int max_irr, vppr;
+	void *vapic;
+	bool res = false;
 
 	if (nested_vmx_preemption_timer_pending(vcpu) ||
 	    vmx->nested.mtf_pending)
@@ -4264,23 +4341,33 @@ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 	    __vmx_interrupt_blocked(vcpu))
 		return false;
 
+	vapic = nested_lock_vapic(vmx);
 	if (!vapic)
 		return false;
 
 	vppr = *((u32 *)(vapic + APIC_PROCPRI));
 
+	nested_unlock_vapic(vmx);
+
 	max_irr = vmx_get_rvi();
 	if ((max_irr & 0xf0) > (vppr & 0xf0))
 		return true;
 
-	if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
-	    pi_test_on(vmx->nested.pi_desc)) {
-		max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
-		if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
-			return true;
+	if (vmx->nested.pi_pending) {
+		pi_desc = nested_lock_pi_desc(vmx);
+		if (!pi_desc)
+			return false;
+
+		if (pi_test_on(pi_desc)) {
+			max_irr = pi_find_highest_vector(pi_desc);
+			if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+				res = true;
+		}
+
+		nested_unlock_pi_desc(vmx);
 	}
 
-	return false;
+	return res;
 }
 
 /*
@@ -5244,7 +5331,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		vmx_update_cpu_dirty_logging(vcpu);
 	}
 
-	nested_put_vmcs12_pages(vcpu);
+	nested_mark_vmcs12_pages_dirty(vcpu);
 
 	if (vmx->nested.reload_vmcs01_apic_access_page) {
 		vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -5529,6 +5616,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
 
+	kvm_gpc_init_for_vcpu(&vmx->nested.apic_access_page_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
+
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d76621403c28..9a285834ccda 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -158,11 +158,11 @@ struct nested_vmx {
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
-	struct kvm_host_map apic_access_page_map;
-	struct kvm_host_map virtual_apic_map;
-	struct kvm_host_map pi_desc_map;
+	struct gfn_to_pfn_cache apic_access_page_cache;
+	struct gfn_to_pfn_cache virtual_apic_cache;
+	struct gfn_to_pfn_cache pi_desc_cache;
 
-	struct pi_desc *pi_desc;
+	u64 pi_desc_offset;
 	bool pi_pending;
 	u16 posted_intr_nv;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6253cf1c38c1..b05aace9e295 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1531,6 +1531,11 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
 	return gpc->active && kvm_is_error_gpa(gpc->gpa);
 }
 
+static inline bool kvm_gpc_invalid(struct gfn_to_pfn_cache *gpc)
+{
+	return gpc->active && !gpc->valid;
+}
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
-- 
2.43.0
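
The event-callback access pattern introduced above can be condensed into a
hypothetical reader (sketch only; nested_lock_vapic()/nested_unlock_vapic()
are the helpers from this patch, the register offset is illustrative):

	/*
	 * Sketch: read one 32-bit field from the cached virtual-APIC page.
	 * nested_lock_vapic() may sleep in kvm_gpc_refresh(), so this
	 * pattern is only legal in sleepable context (e.g. has_events()).
	 */
	static u32 vapic_read32_sketch(struct vcpu_vmx *vmx, unsigned int reg)
	{
		void *vapic;
		u32 val;

		vapic = nested_lock_vapic(vmx);
		if (!vapic)
			return 0;	/* cache inactive or unbackable */

		val = *(u32 *)(vapic + reg);
		nested_unlock_vapic(vmx);	/* drops the gpc read lock */
		return val;
	}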
[52.49.196.232]) by smtp.gmail.com with ESMTPSA id ffacd0b85a97d-42cb7f363e4sm10484180f8f.12.2025.11.21.03.11.29 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 21 Nov 2025 03:11:29 -0800 (PST) From: Fred Griffoul To: kvm@vger.kernel.org Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com, shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Fred Griffoul Subject: [PATCH v3 05/10] KVM: selftests: Add nested VMX APIC cache invalidation test Date: Fri, 21 Nov 2025 11:11:08 +0000 Message-ID: <20251121111113.456628-6-griffoul@gmail.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com> References: <20251121111113.456628-1-griffoul@gmail.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Fred Griffoul Introduce selftest to verify nested VMX APIC virtualization page cache invalidation and refresh mechanisms for pfncache implementation. The test exercises the nested VMX APIC cache invalidation path through: - L2 guest setup: creates a nested environment where L2 accesses the APIC access page that is cached by KVM using pfncache. - Cache invalidation triggers: a separate update thread periodically invalidates the cached pages using either: - madvise(MADV_DONTNEED) to trigger MMU notifications. - vm_mem_region_move() to trigger memslot changes. The test validates that: - L2 can successfully access APIC page before and after invalidation. - KVM properly handles cache refresh without guest-visible errors. - Both MMU notification and memslot change invalidation paths work correctly. Signed-off-by: Fred Griffoul Suggested-by: dwmw@amazon.co.uk --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/vmx_apic_update_test.c | 302 ++++++++++++++++++ 2 files changed, 303 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/vmx_apic_update_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selft= ests/kvm/Makefile.kvm index 148d427ff24b..3431568d837e 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -137,6 +137,7 @@ TEST_GEN_PROGS_x86 +=3D x86/max_vcpuid_cap_test TEST_GEN_PROGS_x86 +=3D x86/triple_fault_event_test TEST_GEN_PROGS_x86 +=3D x86/recalc_apic_map_test TEST_GEN_PROGS_x86 +=3D x86/aperfmperf_test +TEST_GEN_PROGS_x86 +=3D x86/vmx_apic_update_test TEST_GEN_PROGS_x86 +=3D access_tracking_perf_test TEST_GEN_PROGS_x86 +=3D coalesced_io_test TEST_GEN_PROGS_x86 +=3D dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/x86/vmx_apic_update_test.c b/tools= /testing/selftests/kvm/x86/vmx_apic_update_test.c new file mode 100644 index 000000000000..1b5b69627a01 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_apic_update_test.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_apic_update_test + * + * Copyright (C) 2025, Amazon.com, Inc. or its affiliates. All Rights Rese= rved. + * + * Test L2 guest APIC access page writes with concurrent MMU + * notification and memslot move updates. 
+ */ +#include +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VAPIC_GPA 0xc0000000 +#define VAPIC_SLOT 1 + +#define L2_GUEST_STACK_SIZE 64 + +#define L2_DELAY (100) + +static void l2_guest_code(void) +{ + uint32_t *vapic_addr =3D (uint32_t *) (VAPIC_GPA + 0x80); + + /* Unroll the loop to avoid any compiler side effect */ + + WRITE_ONCE(*vapic_addr, 1 << 0); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 1); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 2); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 3); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 4); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 5); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 1 << 6); + udelay(msecs_to_usecs(L2_DELAY)); + + WRITE_ONCE(*vapic_addr, 0); + udelay(msecs_to_usecs(L2_DELAY)); + + /* Exit to L1 */ + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control, exit_reason; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Enable APIC access */ + control =3D vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |=3D CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control =3D vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |=3D SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + vmwrite(APIC_ACCESS_ADDR, VAPIC_GPA); + + GUEST_SYNC1(0); + GUEST_ASSERT(!vmlaunch()); +again: + exit_reason =3D vmreadz(VM_EXIT_REASON); + if (exit_reason =3D=3D EXIT_REASON_APIC_ACCESS) { + uint64_t guest_rip =3D vmreadz(GUEST_RIP); + uint64_t instr_len =3D vmreadz(VM_EXIT_INSTRUCTION_LEN); + + vmwrite(GUEST_RIP, guest_rip + instr_len); + GUEST_ASSERT(!vmresume()); + goto again; + } + + GUEST_SYNC1(exit_reason); + GUEST_ASSERT(exit_reason =3D=3D EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static const char *progname; +static int update_period_ms =3D L2_DELAY / 4; + +struct update_control { + pthread_mutex_t mutex; + pthread_cond_t start_cond; + struct kvm_vm *vm; + bool running; + bool started; + int updates; +}; + +static void wait_for_start_signal(struct update_control *ctrl) +{ + pthread_mutex_lock(&ctrl->mutex); + while (!ctrl->started) + pthread_cond_wait(&ctrl->start_cond, &ctrl->mutex); + + pthread_mutex_unlock(&ctrl->mutex); + printf("%s: starting update\n", progname); +} + +static bool is_running(struct update_control *ctrl) +{ + return READ_ONCE(ctrl->running); +} + +static void set_running(struct update_control *ctrl, bool running) +{ + WRITE_ONCE(ctrl->running, running); +} + +static void signal_thread_start(struct update_control *ctrl) +{ + pthread_mutex_lock(&ctrl->mutex); + if (!ctrl->started) { + ctrl->started =3D true; + pthread_cond_signal(&ctrl->start_cond); + } + pthread_mutex_unlock(&ctrl->mutex); +} + +static void *update_madvise(void *arg) +{ + struct update_control *ctrl =3D arg; + void *hva; + + wait_for_start_signal(ctrl); + + hva =3D addr_gpa2hva(ctrl->vm, VAPIC_GPA); + memset(hva, 0x45, ctrl->vm->page_size); + + while (is_running(ctrl)) { + usleep(update_period_ms * 1000); + madvise(hva, ctrl->vm->page_size, MADV_DONTNEED); + ctrl->updates++; + } + + return NULL; +} + +static void *update_move_memslot(void *arg) +{ + struct update_control *ctrl 
+	uint64_t gpa = VAPIC_GPA;
+
+	wait_for_start_signal(ctrl);
+
+	while (is_running(ctrl)) {
+		usleep(update_period_ms * 1000);
+		gpa += 0x10000;
+		vm_mem_region_move(ctrl->vm, VAPIC_SLOT, gpa);
+		ctrl->updates++;
+	}
+
+	return NULL;
+}
+
+static void run(void *(*update)(void *), const char *name)
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+	struct vmx_pages *vmx;
+	struct update_control ctrl;
+	struct ucall uc;
+	vm_vaddr_t vmx_pages_gva;
+	pthread_t update_thread;
+	bool done = false;
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+	/* Allocate VMX pages */
+	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+
+	/* Allocate memory and create VAPIC memslot */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, VAPIC_GPA,
+				    VAPIC_SLOT, 1, 0);
+
+	/* Allocate guest page table */
+	virt_map(vm, VAPIC_GPA, VAPIC_GPA, 1);
+
+	/* Set up nested EPT */
+	prepare_eptp(vmx, vm, 0);
+	nested_map_memslot(vmx, vm, 0);
+	nested_map_memslot(vmx, vm, VAPIC_SLOT);
+	nested_map(vmx, vm, VAPIC_GPA, VAPIC_GPA, vm->page_size);
+
+	vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+	pthread_mutex_init(&ctrl.mutex, NULL);
+	pthread_cond_init(&ctrl.start_cond, NULL);
+	ctrl.vm = vm;
+	ctrl.running = true;
+	ctrl.started = false;
+	ctrl.updates = 0;
+
+	pthread_create(&update_thread, NULL, update, &ctrl);
+
+	printf("%s: running %s (tsc_khz %lu)\n", progname, name, guest_tsc_khz);
+
+	while (!done) {
+		vcpu_run(vcpu);
+
+		switch (vcpu->run->exit_reason) {
+		case KVM_EXIT_IO:
+			switch (get_ucall(vcpu, &uc)) {
+			case UCALL_SYNC:
+				printf("%s: sync(%ld)\n", progname, uc.args[0]);
+				if (uc.args[0] == 0)
+					signal_thread_start(&ctrl);
+				break;
+			case UCALL_ABORT:
+				REPORT_GUEST_ASSERT(uc);
+				/* NOT REACHED */
+			case UCALL_DONE:
+				done = true;
+				break;
+			default:
+				TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+			}
+			break;
+		case KVM_EXIT_MMIO:
+			/* Handle APIC MMIO access after memslot move */
+			printf("%s: APIC MMIO access at 0x%llx (memslot move effect)\n",
+			       progname, vcpu->run->mmio.phys_addr);
+			break;
+		default:
+			TEST_FAIL("%s: Unexpected exit reason: %d (flags 0x%x)",
+				  progname, vcpu->run->exit_reason,
+				  vcpu->run->flags);
+		}
+	}
+
+	set_running(&ctrl, false);
+	if (!ctrl.started)
+		signal_thread_start(&ctrl);
+	pthread_join(update_thread, NULL);
+	printf("%s: completed with %d updates\n", progname, ctrl.updates);
+
+	pthread_mutex_destroy(&ctrl.mutex);
+	pthread_cond_destroy(&ctrl.start_cond);
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	int opt_madvise = 0;
+	int opt_memslot_move = 0;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+	TEST_REQUIRE(kvm_cpu_has_ept());
+
+	if (argc == 1) {
+		opt_madvise = 1;
+		opt_memslot_move = 1;
+	} else {
+		int opt;
+
+		while ((opt = getopt(argc, argv, "amp:")) != -1) {
+			switch (opt) {
+			case 'a':
+				opt_madvise = 1;
+				break;
+			case 'm':
+				opt_memslot_move = 1;
+				break;
+			case 'p':
+				update_period_ms = atoi(optarg);
+				break;
+			default:
+				exit(1);
+			}
+		}
+	}
+
+	TEST_ASSERT(opt_madvise || opt_memslot_move,
+		    "No update test configured");
+
+	progname = argv[0];
+
+	if (opt_madvise)
+		run(update_madvise, "madvise");
+
+	if (opt_memslot_move)
+		run(update_move_memslot, "move memslot");
+
+	return 0;
+}
-- 
2.43.0

From nobody Tue Dec 2 01:33:05 2025
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com, shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 06/10] KVM: nVMX: Cache evmcs fields to ensure consistency during VM-entry
Date: Fri, 21 Nov 2025 11:11:09 +0000
Message-ID: <20251121111113.456628-7-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

Cache enlightened VMCS control fields to prevent TOCTOU races where the
guest could modify hv_clean_fields or hv_enlightenments_control between
multiple accesses during nested VM-entry.

The cached values ensure consistent behavior across:

- the evmcs-to-vmcs12 copy operations
- MSR bitmap validation
- clean-field checks in prepare_vmcs02_rare()

This eliminates potential guest-induced inconsistencies in nested
virtualization state management.
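The race being closed is the classic double-fetch pattern: the guest can
flip a shared field between two host reads, so related host decisions can
disagree. A minimal user-space sketch of the snapshot approach (not KVM
code; all names here are illustrative stand-ins for the eVMCS fields):

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative stand-in for a guest-writable eVMCS control field. */
struct shared_ctrl {
	_Atomic unsigned int clean_fields;	/* guest may write at any time */
};

struct host_snapshot {
	unsigned int clean_fields;		/* host-private copy */
};

/*
 * Double fetch (racy): the two loads may observe different
 * guest-written values, so the two checks can disagree.
 */
static bool racy_uses_field(struct shared_ctrl *s, unsigned int mask)
{
	if (!(atomic_load(&s->clean_fields) & mask))		/* fetch #1 */
		return false;
	return (atomic_load(&s->clean_fields) & mask) != 0;	/* fetch #2 */
}

/*
 * Snapshot once, then decide from the private copy: every later
 * check sees the same value, as with the cached hv_clean_fields.
 */
static bool safe_uses_field(struct shared_ctrl *s, struct host_snapshot *snap,
			    unsigned int mask)
{
	snap->clean_fields = atomic_load(&s->clean_fields);	/* single fetch */
	return (snap->clean_fields & mask) != 0;
}

int main(void)
{
	struct shared_ctrl s = { .clean_fields = 0x2 };
	struct host_snapshot snap;

	return !(safe_uses_field(&s, &snap, 0x2) && racy_uses_field(&s, 0x2));
}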
Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/hyperv.c |  5 ++--
 arch/x86/kvm/vmx/hyperv.h | 20 +++++++++++++
 arch/x86/kvm/vmx/nested.c | 62 ++++++++++++++++++++++++---------------
 arch/x86/kvm/vmx/vmx.h    |  5 +++-
 4 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index fa41d036acd4..961b91b9bd64 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -213,12 +213,11 @@ bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
 
-	if (!hv_vcpu || !evmcs)
+	if (!hv_vcpu || !nested_vmx_is_evmptr12_valid(vmx))
 		return false;
 
-	if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+	if (!vmx->nested.hv_flush_hypercall)
 		return false;
 
 	return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 11a339009781..3c7fea501ca5 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -52,6 +52,16 @@ static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
 		to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
 }
 
+static inline u32 nested_evmcs_clean_fields(struct vcpu_vmx *vmx)
+{
+	return vmx->nested.hv_clean_fields;
+}
+
+static inline bool nested_evmcs_msr_bitmap(struct vcpu_vmx *vmx)
+{
+	return vmx->nested.hv_msr_bitmap;
+}
+
 u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
 int nested_enable_evmcs(struct kvm_vcpu *vcpu,
@@ -85,6 +95,16 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
 {
 	return NULL;
 }
+
+static inline u32 nested_evmcs_clean_fields(struct vcpu_vmx *vmx)
+{
+	return 0;
+}
+
+static inline bool nested_evmcs_msr_bitmap(struct vcpu_vmx *vmx)
+{
+	return false;
+}
 #endif
 
 #endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1f58b380585b..aec150612818 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -235,6 +235,9 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
 	vmx->nested.hv_evmcs = NULL;
 	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+	vmx->nested.hv_clean_fields = 0;
+	vmx->nested.hv_msr_bitmap = false;
+	vmx->nested.hv_flush_hypercall = false;
 
 	if (hv_vcpu) {
 		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
@@ -686,10 +689,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	 * and tells KVM (L0) there were no changes in MSR bitmap for L2.
 	 */
 	if (!vmx->nested.force_msr_bitmap_recalc) {
-		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
-
-		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
-		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+		if (nested_vmx_is_evmptr12_valid(vmx) &&
+		    nested_evmcs_msr_bitmap(vmx) &&
+		    (nested_evmcs_clean_fields(vmx)
+		     & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))
 			return true;
 	}
 
@@ -2163,10 +2166,11 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
  * instruction.
  */
 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
-	struct kvm_vcpu *vcpu, bool from_launch)
+	struct kvm_vcpu *vcpu, bool from_launch, bool copy)
 {
 #ifdef CONFIG_KVM_HYPERV
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_enlightened_vmcs *evmcs;
 	bool evmcs_gpa_changed = false;
 	u64 evmcs_gpa;
 
@@ -2246,6 +2250,22 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 		vmx->nested.force_msr_bitmap_recalc = true;
 	}
 
+	/* Cache evmcs fields to avoid reading evmcs after copy to vmcs12 */
+	evmcs = vmx->nested.hv_evmcs;
+	vmx->nested.hv_clean_fields = evmcs->hv_clean_fields;
+	vmx->nested.hv_flush_hypercall = evmcs->hv_enlightenments_control.nested_flush_hypercall;
+	vmx->nested.hv_msr_bitmap = evmcs->hv_enlightenments_control.msr_bitmap;
+
+	if (copy) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+		if (likely(!vmcs12->hdr.shadow_vmcs)) {
+			copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_clean_fields);
+			/* Enlightened VMCS doesn't have launch state */
+			vmcs12->launch_state = !from_launch;
+		}
+	}
+
 	return EVMPTRLD_SUCCEEDED;
 #else
 	return EVMPTRLD_DISABLED;
@@ -2613,10 +2633,12 @@ static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
 
 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 {
-	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
+	u32 hv_clean_fields = 0;
 
-	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
-			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+	if (nested_vmx_is_evmptr12_valid(vmx))
+		hv_clean_fields = nested_evmcs_clean_fields(vmx);
+
+	if (!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
 
 		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -2658,8 +2680,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		vmx_segment_cache_clear(vmx);
 	}
 
-	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
-			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+	if (!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
 		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 			    vmcs12->guest_pending_dbg_exceptions);
@@ -2750,7 +2771,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			  enum vm_entry_failure_code *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+	struct hv_enlightened_vmcs *evmcs;
 	bool load_guest_pdptrs_vmcs12 = false;
 
 	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
@@ -2758,7 +2779,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmx->nested.dirty_vmcs12 = false;
 
 		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
-			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+			!(nested_evmcs_clean_fields(vmx)
+			  & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
 	}
 
 	if (vmx->nested.nested_run_pending &&
@@ -2887,7 +2909,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	 * bits when it changes a field in eVMCS. Mark all fields as clean
 	 * here.
 	 */
-	if (nested_vmx_is_evmptr12_valid(vmx))
+	evmcs = nested_vmx_evmcs(vmx);
+	if (evmcs)
 		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
 	return 0;
@@ -3470,7 +3493,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
 	if (guest_cpu_cap_has_evmcs(vcpu) &&
 	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
 		enum nested_evmptrld_status evmptrld_status =
-			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+			nested_vmx_handle_enlightened_vmptrld(vcpu, false, false);
 
 		if (evmptrld_status == EVMPTRLD_VMFAIL ||
 		    evmptrld_status == EVMPTRLD_ERROR)
@@ -3864,7 +3887,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch, true);
 	if (evmptrld_status == EVMPTRLD_ERROR) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
@@ -3890,15 +3913,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (CC(vmcs12->hdr.shadow_vmcs))
 		return nested_vmx_failInvalid(vcpu);
 
-	if (nested_vmx_is_evmptr12_valid(vmx)) {
-		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
-
-		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
-		/* Enlightened VMCS doesn't have launch state */
-		vmcs12->launch_state = !launch;
-	} else if (enable_shadow_vmcs) {
+	if (!nested_vmx_is_evmptr12_valid(vmx) && enable_shadow_vmcs)
 		copy_shadow_to_vmcs12(vmx);
-	}
 
 	/*
 	 * The nested entry process starts with enforcing various prerequisites
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9a285834ccda..87708af502f3 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -205,8 +205,11 @@ struct nested_vmx {
 
 #ifdef CONFIG_KVM_HYPERV
 	gpa_t hv_evmcs_vmptr;
-	struct kvm_host_map hv_evmcs_map;
+	u32 hv_clean_fields;
+	bool hv_msr_bitmap;
+	bool hv_flush_hypercall;
 	struct hv_enlightened_vmcs *hv_evmcs;
+	struct kvm_host_map hv_evmcs_map;
 #endif
 };
 
-- 
2.43.0

From nobody Tue Dec 2 01:33:05 2025
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com, shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 07/10] KVM: nVMX: Replace evmcs kvm_host_map with pfncache
Date: Fri, 21 Nov 2025 11:11:10 +0000
Message-ID: <20251121111113.456628-8-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

Replace the eVMCS kvm_host_map with a gfn_to_pfn_cache to properly
handle memslot changes and unify with other pfncaches in nVMX.

The change introduces proper locking/unlocking semantics for eVMCS
access through nested_lock_evmcs() and nested_unlock_evmcs() helpers.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/hyperv.h |  21 +++----
 arch/x86/kvm/vmx/nested.c | 115 ++++++++++++++++++++++++++------------
 arch/x86/kvm/vmx/vmx.h    |   3 +-
 3 files changed, 90 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 3c7fea501ca5..3b6fcf8dff64 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -37,11 +37,6 @@ static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
 	return evmptr_is_set(vmx->nested.hv_evmcs_vmptr);
 }
 
-static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
-{
-	return vmx->nested.hv_evmcs;
-}
-
 static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -70,6 +65,8 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
 int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
 bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
 void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+struct hv_enlightened_vmcs *nested_lock_evmcs(struct vcpu_vmx *vmx);
+void nested_unlock_evmcs(struct vcpu_vmx *vmx);
 #else
 static inline bool evmptr_is_valid(u64 evmptr)
 {
@@ -91,11 +88,6 @@ static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
 	return false;
 }
 
-static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
-{
-	return NULL;
-}
-
 static inline u32 nested_evmcs_clean_fields(struct vcpu_vmx *vmx)
 {
 	return 0;
@@ -105,6 +97,15 @@ static inline bool nested_evmcs_msr_bitmap(struct vcpu_vmx *vmx)
 {
 	return false;
 }
+
+static inline struct hv_enlightened_vmcs *nested_lock_evmcs(struct vcpu_vmx *vmx)
+{
+	return NULL;
+}
+
+static inline void nested_unlock_evmcs(struct vcpu_vmx *vmx)
+{
+}
 #endif
 
 #endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index aec150612818..207780ef0926 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -232,8 +232,6 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
-	vmx->nested.hv_evmcs = NULL;
 	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
 	vmx->nested.hv_clean_fields = 0;
 	vmx->nested.hv_msr_bitmap = false;
@@ -265,7 +263,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
 	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
 		return false;
 
-	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
+	if (vmptr == vmx->nested.hv_evmcs_vmptr)
 		nested_release_evmcs(vcpu);
 
 	return true;
@@ -393,6 +391,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
 	kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
 	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
+#ifdef CONFIG_KVM_HYPERV
+	kvm_gpc_deactivate(&vmx->nested.hv_evmcs_cache);
+#endif
 
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
@@ -1735,11 +1736,12 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 	vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
-static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
-{
 #ifdef CONFIG_KVM_HYPERV
+static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx,
+				       struct hv_enlightened_vmcs *evmcs,
+				       u32 hv_clean_fields)
+{
 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
-	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
 
 	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
@@ -1978,16 +1980,14 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
 	 */
 
 	return;
-#else /* CONFIG_KVM_HYPERV */
-	KVM_BUG_ON(1, vmx->vcpu.kvm);
-#endif /* CONFIG_KVM_HYPERV */
 }
+#endif /* CONFIG_KVM_HYPERV */
 
 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
 {
 #ifdef CONFIG_KVM_HYPERV
 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
-	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+	struct hv_enlightened_vmcs *evmcs = nested_lock_evmcs(vmx);
 
 	/*
 	 * Should not be changed by KVM:
@@ -2155,6 +2155,7 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
 
 	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
 
+	nested_unlock_evmcs(vmx);
 	return;
 #else /* CONFIG_KVM_HYPERV */
 	KVM_BUG_ON(1, vmx->vcpu.kvm);
@@ -2171,6 +2172,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct hv_enlightened_vmcs *evmcs;
+	struct gfn_to_pfn_cache *gpc;
+	enum nested_evmptrld_status status = EVMPTRLD_SUCCEEDED;
 	bool evmcs_gpa_changed = false;
 	u64 evmcs_gpa;
 
@@ -2183,17 +2186,19 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 		return EVMPTRLD_DISABLED;
 	}
 
+	gpc = &vmx->nested.hv_evmcs_cache;
+	if (nested_gpc_lock(gpc, evmcs_gpa)) {
+		nested_release_evmcs(vcpu);
+		return EVMPTRLD_ERROR;
+	}
+
+	evmcs = gpc->khva;
+
 	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
 		vmx->nested.current_vmptr = INVALID_GPA;
 
 		nested_release_evmcs(vcpu);
 
-		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
-				 &vmx->nested.hv_evmcs_map))
-			return EVMPTRLD_ERROR;
-
-		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
-
 		/*
 		 * Currently, KVM only supports eVMCS version 1
 		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
@@ -2216,10 +2221,11 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 		 * eVMCS version or VMCS12 revision_id as valid values for first
 		 * u32 field of eVMCS.
 		 */
-		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
-		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
+		if ((evmcs->revision_id != KVM_EVMCS_VERSION) &&
+		    (evmcs->revision_id != VMCS12_REVISION)) {
 			nested_release_evmcs(vcpu);
-			return EVMPTRLD_VMFAIL;
+			status = EVMPTRLD_VMFAIL;
+			goto unlock;
 		}
 
 		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
@@ -2244,14 +2250,11 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
 	 */
 	if (from_launch || evmcs_gpa_changed) {
-		vmx->nested.hv_evmcs->hv_clean_fields &=
-			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
-
+		evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 		vmx->nested.force_msr_bitmap_recalc = true;
 	}
 
 	/* Cache evmcs fields to avoid reading evmcs after copy to vmcs12 */
-	evmcs = vmx->nested.hv_evmcs;
 	vmx->nested.hv_clean_fields = evmcs->hv_clean_fields;
 	vmx->nested.hv_flush_hypercall = evmcs->hv_enlightenments_control.nested_flush_hypercall;
 	vmx->nested.hv_msr_bitmap = evmcs->hv_enlightenments_control.msr_bitmap;
@@ -2260,13 +2263,15 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 		if (likely(!vmcs12->hdr.shadow_vmcs)) {
-			copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_clean_fields);
+			copy_enlightened_to_vmcs12(vmx, evmcs, vmx->nested.hv_clean_fields);
 			/* Enlightened VMCS doesn't have launch state */
 			vmcs12->launch_state = !from_launch;
 		}
 	}
 
-	return EVMPTRLD_SUCCEEDED;
+unlock:
+	nested_gpc_unlock(gpc);
+	return status;
 #else
 	return EVMPTRLD_DISABLED;
 #endif
@@ -2771,7 +2776,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			  enum vm_entry_failure_code *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct hv_enlightened_vmcs *evmcs;
 	bool load_guest_pdptrs_vmcs12 = false;
 
 	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
@@ -2909,9 +2913,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	 * bits when it changes a field in eVMCS. Mark all fields as clean
 	 * here.
 	 */
-	evmcs = nested_vmx_evmcs(vmx);
-	if (evmcs)
+	if (nested_vmx_is_evmptr12_valid(vmx)) {
+		struct hv_enlightened_vmcs *evmcs;
+
+		evmcs = nested_lock_evmcs(vmx);
 		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+		nested_unlock_evmcs(vmx);
+	}
 
 	return 0;
 }
@@ -4147,6 +4155,18 @@ static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
 	return gpc->khva;
 }
 
+#ifdef CONFIG_KVM_HYPERV
+struct hv_enlightened_vmcs *nested_lock_evmcs(struct vcpu_vmx *vmx)
+{
+	return nested_gpc_lock_if_active(&vmx->nested.hv_evmcs_cache);
+}
+
+void nested_unlock_evmcs(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.hv_evmcs_cache);
+}
+#endif
+
 static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
 {
 	u8 *pi_desc_page;
@@ -5636,6 +5656,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
 	kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
 
+#ifdef CONFIG_KVM_HYPERV
+	kvm_gpc_init(&vmx->nested.hv_evmcs_cache, vcpu->kvm);
+#endif
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
@@ -5887,6 +5910,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		/* Read the field, zero-extended to a u64 value */
 		value = vmcs12_read_any(vmcs12, field, offset);
 	} else {
+		struct hv_enlightened_vmcs *evmcs;
+
 		/*
 		 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
 		 * enlightened VMCS is active VMREAD/VMWRITE instructions are
@@ -5905,7 +5930,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
 		/* Read the field, zero-extended to a u64 value */
-		value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
+		evmcs = nested_lock_evmcs(vmx);
+		value = evmcs_read_any(evmcs, field, offset);
+		nested_unlock_evmcs(vmx);
 	}
 
 	/*
@@ -6935,6 +6962,27 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static void vmx_get_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+{
+#ifdef CONFIG_KVM_HYPERV
+	struct hv_enlightened_vmcs *evmcs;
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+	kvm_vcpu_srcu_read_lock(vcpu);
+	evmcs = nested_lock_evmcs(vmx);
+	/*
+	 * L1 hypervisor is not obliged to keep eVMCS
+	 * clean fields data always up-to-date while
+	 * not in guest mode, 'hv_clean_fields' is only
+	 * supposed to be actual upon vmentry so we need
+	 * to ignore it here and do full copy.
+	 */
+	copy_enlightened_to_vmcs12(vmx, evmcs, 0);
+	nested_unlock_evmcs(vmx);
+	kvm_vcpu_srcu_read_unlock(vcpu);
+#endif /* CONFIG_KVM_HYPERV */
+}
+
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 				struct kvm_nested_state __user *user_kvm_nested_state,
 				u32 user_data_size)
@@ -7025,14 +7073,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
 		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
 			if (nested_vmx_is_evmptr12_valid(vmx))
-				/*
-				 * L1 hypervisor is not obliged to keep eVMCS
-				 * clean fields data always up-to-date while
-				 * not in guest mode, 'hv_clean_fields' is only
-				 * supposed to be actual upon vmentry so we need
-				 * to ignore it here and do full copy.
-				 */
-				copy_enlightened_to_vmcs12(vmx, 0);
+				vmx_get_enlightened_to_vmcs12(vmx);
 			else if (enable_shadow_vmcs)
 				copy_shadow_to_vmcs12(vmx);
 		}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 87708af502f3..4da5a42b0c60 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -208,8 +208,7 @@ struct nested_vmx {
 	u32 hv_clean_fields;
 	bool hv_msr_bitmap;
 	bool hv_flush_hypercall;
-	struct hv_enlightened_vmcs *hv_evmcs;
-	struct kvm_host_map hv_evmcs_map;
+	struct gfn_to_pfn_cache hv_evmcs_cache;
 #endif
 };
 
-- 
2.43.0

From nobody Tue Dec 2 01:33:05 2025
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com, shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 08/10] KVM: x86: Add nested context management
Date: Fri, 21 Nov 2025 11:11:11 +0000
Message-ID: <20251121111113.456628-9-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

Add infrastructure to persist nested virtualization state when L2 vCPUs
are switched on an L1 vCPU or migrated between L1 vCPUs.

The nested context table uses a hash table for fast lookup by nested
control block GPA (VMPTR for VMX, VMCB for SVM) and maintains a free
list for context management.

The kvm_nested_context_load() function searches for a context indexed
by the target GPA; if not found, it allocates a new context up to the
configured maximum. If at capacity, it recycles the oldest context from
the free list. The oversubscription is hardcoded to support up to 8 L2
vCPUs per L1 vCPU.

The kvm_nested_context_clear() function moves the context to the free
list while keeping it in the hash table for potential reuse.

This allows nested hypervisors to multiplex multiple L2 vCPUs on L1
vCPUs without losing cached nested state, significantly improving
performance for workloads with frequent L2 context switches.

This patch adds the basic infrastructure. Subsequent patches will add
the nested VMX and SVM specific support to populate and utilize the
cached nested state.
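The lookup/recycle policy is easiest to see in isolation. Below is a
compact user-space sketch of the same idea (hash lookup by key, parked
contexts kept hashed on an LRU free list, oldest parked entry recycled
at capacity); the names ctx_load/ctx_clear/MAX_CTX are illustrative,
not the kernel API, and locking plus the vendor callbacks are omitted:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define NBUCKETS 256
#define MAX_CTX  8	/* stand-in for the oversubscription limit */

struct ctx {
	uint64_t key;		/* control-block GPA in the real code */
	struct ctx *hnext;	/* hash-chain link */
	struct ctx *lru_next;	/* free-list link, valid only when parked */
	int parked;		/* on the free list: reusable or recyclable */
};

static struct ctx *bucket[NBUCKETS];
static struct ctx *lru_head;
static struct ctx **lru_tail = &lru_head;
static int count;

static unsigned int hashfn(uint64_t key) { return (key >> 12) % NBUCKETS; }

static struct ctx *hash_find(uint64_t key)
{
	struct ctx *c;

	for (c = bucket[hashfn(key)]; c; c = c->hnext)
		if (c->key == key)
			return c;
	return NULL;
}

static void hash_add_ctx(struct ctx *c, uint64_t key)
{
	unsigned int b = hashfn(key);

	c->key = key;
	c->hnext = bucket[b];
	bucket[b] = c;
}

static void hash_del_ctx(struct ctx *c)
{
	struct ctx **p;

	for (p = &bucket[hashfn(c->key)]; *p; p = &(*p)->hnext)
		if (*p == c) {
			*p = c->hnext;
			return;
		}
}

static void lru_push(struct ctx *c)	/* park at the tail */
{
	c->lru_next = NULL;
	*lru_tail = c;
	lru_tail = &c->lru_next;
	c->parked = 1;
}

static struct ctx *lru_pop(void)	/* recycle the oldest entry */
{
	struct ctx *c = lru_head;

	if (!c)
		return NULL;
	lru_head = c->lru_next;
	if (!lru_head)
		lru_tail = &lru_head;
	c->parked = 0;
	return c;
}

static void lru_unlink(struct ctx *victim)	/* reuse of a parked ctx */
{
	struct ctx **p;

	for (p = &lru_head; *p; p = &(*p)->lru_next)
		if (*p == victim) {
			*p = victim->lru_next;
			if (!*p)
				lru_tail = p;
			victim->parked = 0;
			return;
		}
}

struct ctx *ctx_load(uint64_t key)
{
	struct ctx *c = hash_find(key);

	if (c) {			/* hit: take it off the free list */
		if (c->parked)
			lru_unlink(c);
		return c;
	}
	if (count < MAX_CTX) {		/* below capacity: allocate */
		c = calloc(1, sizeof(*c));
		if (!c)
			return NULL;
		count++;
	} else {			/* at capacity: recycle oldest parked */
		c = lru_pop();
		if (!c)
			return NULL;	/* everything is in use */
		hash_del_ctx(c);
		memset(c, 0, sizeof(*c));	/* "reset_context" */
	}
	hash_add_ctx(c, key);
	return c;
}

void ctx_clear(uint64_t key)
{
	struct ctx *c = hash_find(key);

	if (c && !c->parked)		/* keep hashed for possible reuse */
		lru_push(c);
}

int main(void)
{
	struct ctx *a = ctx_load(0x1000);	/* allocate */

	ctx_clear(0x1000);			/* park */
	return !(ctx_load(0x1000) == a);	/* reuse the same context */
}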
Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/include/asm/kvm_host.h |  31 +++++
 arch/x86/include/uapi/asm/kvm.h |   2 +
 arch/x86/kvm/Makefile           |   2 +-
 arch/x86/kvm/nested.c           | 199 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   5 +-
 5 files changed, 237 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kvm/nested.c

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4675e71b33a7..75f3cd82a073 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1379,6 +1379,28 @@ enum kvm_mmu_type {
 	KVM_NR_MMU_TYPES,
 };
 
+struct kvm_nested_context {
+	gpa_t gpa;
+	struct hlist_node hnode;
+	struct list_head lru_link;
+	struct kvm_vcpu *vcpu;
+};
+
+struct kvm_nested_context_table {
+	spinlock_t lock;
+	u32 count;
+	struct list_head lru_list;
+	DECLARE_HASHTABLE(hash, 8);
+};
+
+void kvm_nested_context_clear(struct kvm_vcpu *vcpu, gpa_t gpa);
+struct kvm_nested_context *kvm_nested_context_load(struct kvm_vcpu *vcpu,
+						   gpa_t gpa);
+
+int kvm_nested_context_table_init(struct kvm *kvm);
+void kvm_nested_context_table_destroy(struct kvm *kvm);
+
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
@@ -1618,6 +1640,9 @@ struct kvm_arch {
 	 * current VM.
 	 */
 	int cpu_dirty_log_size;
+
+	/* Cache for nested contexts */
+	struct kvm_nested_context_table *nested_context_table;
 };
 
 struct kvm_vm_stat {
@@ -1640,6 +1665,8 @@ struct kvm_vm_stat {
 	u64 nx_lpage_splits;
 	u64 max_mmu_page_hash_collisions;
 	u64 max_mmu_rmap_size;
+	u64 nested_context_recycle;
+	u64 nested_context_reuse;
 };
 
 struct kvm_vcpu_stat {
@@ -1967,6 +1994,10 @@ struct kvm_x86_nested_ops {
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
 	void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
+
+	struct kvm_nested_context *(*alloc_context)(struct kvm_vcpu *vcpu);
+	void (*free_context)(struct kvm_nested_context *ctx);
+	void (*reset_context)(struct kvm_nested_context *ctx);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d420c9c066d4..637ed9286f8e 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -1042,4 +1042,6 @@ struct kvm_tdx_init_mem_region {
 	__u64 nr_pages;
 };
 
+#define KVM_NESTED_OVERSUB_RATIO 8
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c4b8950c7abe..2a5289cb5bd1 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,7 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
 include $(srctree)/virt/kvm/Makefile.kvm
 
 kvm-y			+= x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
-			   debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
+			   debugfs.o nested.o mmu/mmu.o mmu/page_track.o mmu/spte.o
 
 kvm-$(CONFIG_X86_64)	+= mmu/tdp_iter.o mmu/tdp_mmu.o
 kvm-$(CONFIG_KVM_IOAPIC)	+= i8259.o i8254.o ioapic.o
diff --git a/arch/x86/kvm/nested.c b/arch/x86/kvm/nested.c
new file mode 100644
index 000000000000..986820cb525f
--- /dev/null
+++ b/arch/x86/kvm/nested.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include
+
+static struct kvm_nested_context_table *kvm_nested_context_table_alloc(void)
+{
+	struct kvm_nested_context_table *table;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
+	if (!table)
+		return NULL;
+
+	spin_lock_init(&table->lock);
+	INIT_LIST_HEAD(&table->lru_list);
+	hash_init(table->hash);
+	return table;
+}
+
+static void kvm_nested_context_table_free(struct kvm_nested_context_table *table)
+{
+	kfree(table);
+}
+
+int kvm_nested_context_table_init(struct kvm *kvm)
+{
+	struct kvm_nested_context_table *table;
+
+	if (!kvm_x86_ops.nested_ops->alloc_context ||
+	    !kvm_x86_ops.nested_ops->free_context ||
+	    !kvm_x86_ops.nested_ops->reset_context)
+		return -EINVAL;
+
+	table = kvm_nested_context_table_alloc();
+	if (!table)
+		return -ENOMEM;
+
+	kvm->arch.nested_context_table = table;
+	return 0;
+}
+
+void kvm_nested_context_table_destroy(struct kvm *kvm)
+{
+	struct kvm_nested_context_table *table;
+	struct kvm_nested_context *ctx;
+	struct hlist_node *tmp;
+	int bkt;
+
+	table = kvm->arch.nested_context_table;
+	if (!table)
+		return;
+
+	hash_for_each_safe(table->hash, bkt, tmp, ctx, hnode) {
+		hash_del(&ctx->hnode);
+		kvm_x86_ops.nested_ops->free_context(ctx);
+	}
+
+	kvm_nested_context_table_free(table);
+}
+
+static unsigned int kvm_nested_context_max(struct kvm *kvm)
+{
+	return KVM_NESTED_OVERSUB_RATIO * atomic_read(&kvm->online_vcpus);
+}
+
+static struct kvm_nested_context *
+__kvm_nested_context_find(struct kvm_nested_context_table *table, gpa_t gpa)
+{
+	struct kvm_nested_context *ctx;
+
+	hash_for_each_possible(table->hash, ctx, hnode, gpa) {
+		if (ctx->gpa == gpa)
+			return ctx;
+	}
+
+	return NULL;
+}
+
+static struct kvm_nested_context *
+kvm_nested_context_find(struct kvm_nested_context_table *table,
+			struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	struct kvm_nested_context *ctx;
+
+	ctx = __kvm_nested_context_find(table, gpa);
+	if (!ctx)
+		return NULL;
+
+	WARN_ON_ONCE(ctx->vcpu && ctx->vcpu != vcpu);
+
+	/* Remove from the LRU list if not attached to a vcpu */
+	if (!ctx->vcpu)
+		list_del(&ctx->lru_link);
+
+	return ctx;
+}
+
+static struct kvm_nested_context *
+kvm_nested_context_recycle(struct kvm_nested_context_table *table)
+{
+	struct kvm_nested_context *ctx;
+
+	if (list_empty(&table->lru_list))
+		return NULL;
+
+	ctx = list_first_entry(&table->lru_list, struct kvm_nested_context,
+			       lru_link);
+	list_del(&ctx->lru_link);
+	hash_del(&ctx->hnode);
+	return ctx;
+}
+
+static void kvm_nested_context_insert(struct kvm_nested_context_table *table,
+				      struct kvm_nested_context *ctx, gpa_t gpa)
+{
+	hash_add(table->hash, &ctx->hnode, gpa);
+	ctx->gpa = gpa;
+}
+
+struct kvm_nested_context *kvm_nested_context_load(struct kvm_vcpu *vcpu,
+						   gpa_t gpa)
+{
+	struct kvm_nested_context_table *table;
+	struct kvm_nested_context *ctx, *new_ctx = NULL;
+	struct kvm *vm = vcpu->kvm;
+	bool reset = false;
+
+	table = vcpu->kvm->arch.nested_context_table;
+	if (WARN_ON_ONCE(!table))
+		return NULL;
+retry:
+	spin_lock(&table->lock);
+	ctx = kvm_nested_context_find(table, vcpu, gpa);
+	if (!ctx) {
+		/* At capacity? Recycle the LRU context */
+		if (table->count >= kvm_nested_context_max(vcpu->kvm)) {
+			ctx = kvm_nested_context_recycle(table);
+			if (unlikely(!ctx))
+				goto finish;
+
+			kvm_nested_context_insert(table, ctx, gpa);
+			++vm->stat.nested_context_recycle;
+			reset = true;
+
+		} else if (new_ctx) {
+			++table->count;
+			ctx = new_ctx;
+			kvm_nested_context_insert(table, ctx, gpa);
+			new_ctx = NULL;
+
+		} else {
+			/* Allocate a new context without holding the lock */
+			spin_unlock(&table->lock);
+			new_ctx = kvm_x86_ops.nested_ops->alloc_context(vcpu);
+			if (unlikely(!new_ctx))
+				return NULL;
+
+			goto retry;
+		}
+	} else
+		++vm->stat.nested_context_reuse;
+
+	ctx->vcpu = vcpu;
+finish:
+	spin_unlock(&table->lock);
+
+	if (new_ctx)
+		kvm_x86_ops.nested_ops->free_context(new_ctx);
+
+	if (reset)
+		kvm_x86_ops.nested_ops->reset_context(ctx);
+
+	return ctx;
+}
+
+void kvm_nested_context_clear(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	struct kvm_nested_context_table *table;
+	struct kvm_nested_context *ctx;
+
+	table = vcpu->kvm->arch.nested_context_table;
+	if (WARN_ON_ONCE(!table))
+		return;
+
+	spin_lock(&table->lock);
+	ctx = __kvm_nested_context_find(table, gpa);
+	if (ctx && ctx->vcpu) {
+		/*
+		 * Move to LRU list but keep it in the hash table for possible
+		 * future reuse.
+		 */
+		list_add_tail(&ctx->lru_link, &table->lru_list);
+		ctx->vcpu = NULL;
+	}
+	spin_unlock(&table->lock);
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a9c1171df49..db13b1921aff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -255,7 +255,9 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 	STATS_DESC_ICOUNTER(VM, pages_1g),
 	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
 	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
-	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions),
+	STATS_DESC_COUNTER(VM, nested_context_recycle),
+	STATS_DESC_COUNTER(VM, nested_context_reuse)
 };
 
 const struct kvm_stats_header kvm_vm_stats_header = {
@@ -13311,6 +13313,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
 	kvm_hv_destroy_vm(kvm);
+	kvm_nested_context_table_destroy(kvm);
 	kvm_x86_call(vm_destroy)(kvm);
 }
 
-- 
2.43.0

From nobody Tue Dec 2 01:33:05 2025
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com, shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 09/10] KVM: nVMX: Use nested context for pfncache persistence
Date: Fri, 21 Nov 2025 11:11:12 +0000
Message-ID: <20251121111113.456628-10-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

Extend the nested context infrastructure to preserve gfn_to_pfn_cache
objects for nested VMX using the kvm_nested_context_load() and
kvm_nested_context_clear() functions.

The VMX nested context stores gfn_to_pfn_cache structs for:

- MSR permission bitmaps
- APIC access page
- Virtual APIC page
- Posted interrupt descriptor
- Enlightened VMCS

For traditional nested VMX, those pfn caches are loaded upon 'vmptrld'
instruction emulation and the context is cleared upon 'vmclear'. This
follows the normal L2 vCPU migration sequence of
'vmclear/vmptrld/vmlaunch'.

For enlightened VMCS (eVMCS) support, both functions are called when
detecting a change in the eVMCS GPA, ensuring proper context management
for Hyper-V nested scenarios.

By preserving the gfn_to_pfn_cache objects across L2 context switches,
we avoid costly cache refresh operations, significantly improving
nested virtualization performance for workloads with frequent L2 vCPU
multiplexing on an L1 vCPU or L2 vCPU migrations between L1 vCPUs.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 arch/x86/kvm/vmx/nested.c | 155 +++++++++++++++++++++++++++++---------
 arch/x86/kvm/vmx/vmx.c    |   8 ++
 arch/x86/kvm/vmx/vmx.h    |  10 +--
 include/linux/kvm_host.h  |   2 +-
 4 files changed, 134 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 207780ef0926..bd600de29031 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -226,6 +226,93 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 	vmx->nested.need_vmcs12_to_shadow_sync = false;
 }
 
+struct vmx_nested_context {
+	struct kvm_nested_context base;
+	struct gfn_to_pfn_cache msr_bitmap_cache;
+	struct gfn_to_pfn_cache apic_access_page_cache;
+	struct gfn_to_pfn_cache virtual_apic_cache;
+	struct gfn_to_pfn_cache pi_desc_cache;
+#ifdef CONFIG_KVM_HYPERV
+	struct gfn_to_pfn_cache evmcs_cache;
+#endif
+};
+
+static inline struct vmx_nested_context *
+to_vmx_nested_context(struct kvm_nested_context *base)
+{
+	return base ? container_of(base, struct vmx_nested_context, base) : NULL;
+		container_of(base, struct vmx_nested_context, base) : NULL;
+}
+
+static struct kvm_nested_context *vmx_nested_context_alloc(struct kvm_vcpu *vcpu)
+{
+	struct vmx_nested_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+	if (!ctx)
+		return NULL;
+
+	kvm_gpc_init(&ctx->msr_bitmap_cache, vcpu->kvm);
+	kvm_gpc_init_for_vcpu(&ctx->apic_access_page_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&ctx->virtual_apic_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&ctx->pi_desc_cache, vcpu);
+#ifdef CONFIG_KVM_HYPERV
+	kvm_gpc_init(&ctx->evmcs_cache, vcpu->kvm);
+#endif
+	return &ctx->base;
+}
+
+static void vmx_nested_context_reset(struct kvm_nested_context *base)
+{
+	/*
+	 * Skip pfncache reinitialization: active ones will be refreshed on
+	 * access.
+	 */
+}
+
+static void vmx_nested_context_free(struct kvm_nested_context *base)
+{
+	struct vmx_nested_context *ctx = to_vmx_nested_context(base);
+
+	kvm_gpc_deactivate(&ctx->pi_desc_cache);
+	kvm_gpc_deactivate(&ctx->virtual_apic_cache);
+	kvm_gpc_deactivate(&ctx->apic_access_page_cache);
+	kvm_gpc_deactivate(&ctx->msr_bitmap_cache);
+#ifdef CONFIG_KVM_HYPERV
+	kvm_gpc_deactivate(&ctx->evmcs_cache);
+#endif
+	kfree(ctx);
+}
+
+static void vmx_nested_context_load(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	struct vmx_nested_context *ctx;
+
+	ctx = to_vmx_nested_context(kvm_nested_context_load(&vmx->vcpu, vmptr));
+	if (!ctx) {
+		/*
+		 * The cache could not be allocated. In the unlikely case of no
+		 * available memory, an error will be returned to L1 when
+		 * mapping the vmcs12 pages. More likely the current pfncaches
+		 * will be reused (and refreshed since their GPAs do not
+		 * match).
+		 */
+		return;
+	}
+
+	vmx->nested.msr_bitmap_cache = &ctx->msr_bitmap_cache;
+	vmx->nested.apic_access_page_cache = &ctx->apic_access_page_cache;
+	vmx->nested.virtual_apic_cache = &ctx->virtual_apic_cache;
+	vmx->nested.pi_desc_cache = &ctx->pi_desc_cache;
+#ifdef CONFIG_KVM_HYPERV
+	vmx->nested.hv_evmcs_cache = &ctx->evmcs_cache;
+#endif
+}
+
+static void vmx_nested_context_clear(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	kvm_nested_context_clear(&vmx->vcpu, vmptr);
+}
+
 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_KVM_HYPERV
@@ -325,6 +412,9 @@ static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
 
 	if (!PAGE_ALIGNED(gpa))
 		return -EINVAL;
+
+	if (WARN_ON_ONCE(!gpc))
+		return -ENOENT;
 retry:
 	read_lock(&gpc->lock);
 	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
@@ -387,14 +477,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
 
-	kvm_gpc_deactivate(&vmx->nested.pi_desc_cache);
-	kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
-	kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
-	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
-#ifdef CONFIG_KVM_HYPERV
-	kvm_gpc_deactivate(&vmx->nested.hv_evmcs_cache);
-#endif
-
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = INVALID_GPA;
@@ -697,7 +779,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 		return true;
 	}
 
-	gpc = &vmx->nested.msr_bitmap_cache;
+	gpc = vmx->nested.msr_bitmap_cache;
 	if (nested_gpc_lock(gpc, vmcs12->msr_bitmap))
 		return false;
 
@@ -2186,7 +2268,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 		return EVMPTRLD_DISABLED;
 	}
 
-	gpc = &vmx->nested.hv_evmcs_cache;
+	if (evmcs_gpa != vmx->nested.hv_evmcs_vmptr) {
+		vmx_nested_context_clear(vmx, vmx->nested.hv_evmcs_vmptr);
+		vmx_nested_context_load(vmx, evmcs_gpa);
+		evmcs_gpa_changed = true;
+	}
+
+	gpc = vmx->nested.hv_evmcs_cache;
 	if (nested_gpc_lock(gpc, evmcs_gpa)) {
 		nested_release_evmcs(vcpu);
 		return EVMPTRLD_ERROR;
@@ -2194,9 +2282,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 
 	evmcs = gpc->khva;
 
-	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+	if (evmcs_gpa_changed) {
 		vmx->nested.current_vmptr = INVALID_GPA;
-		nested_release_evmcs(vcpu);
 
 		/*
@@ -2230,7 +2317,6 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 
 	vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
 
-	evmcs_gpa_changed = true;
 	/*
 	 * Unlike normal vmcs12, enlightened vmcs12 is not fully
 	 * reloaded from guest's memory (read only fields, fields not
@@ -3538,7 +3624,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		gpc = &vmx->nested.apic_access_page_cache;
+		gpc = vmx->nested.apic_access_page_cache;
 
 		if (!nested_gpc_hpa(gpc, vmcs12->apic_access_addr, &hpa)) {
 			vmcs_write64(APIC_ACCESS_ADDR, hpa);
@@ -3554,7 +3640,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		gpc = &vmx->nested.virtual_apic_cache;
+		gpc = vmx->nested.virtual_apic_cache;
 
 		if (!nested_gpc_hpa(gpc, vmcs12->virtual_apic_page_addr, &hpa)) {
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
@@ -3580,7 +3666,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		gpc = &vmx->nested.pi_desc_cache;
+		gpc = vmx->nested.pi_desc_cache;
 
 		if (!nested_gpc_hpa(gpc, vmcs12->posted_intr_desc_addr & PAGE_MASK, &hpa)) {
 			vmx->nested.pi_desc_offset = offset_in_page(vmcs12->posted_intr_desc_addr);
@@ -3640,9 +3726,9 @@ static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
 	 * locks. Since kvm_gpc_invalid() doesn't verify gpc memslot
 	 * generation, we can also skip acquiring the srcu lock.
 	 */
-	return kvm_gpc_invalid(&vmx->nested.apic_access_page_cache) ||
-	       kvm_gpc_invalid(&vmx->nested.virtual_apic_cache) ||
-	       kvm_gpc_invalid(&vmx->nested.pi_desc_cache);
+	return kvm_gpc_invalid(vmx->nested.apic_access_page_cache) ||
+	       kvm_gpc_invalid(vmx->nested.virtual_apic_cache) ||
+	       kvm_gpc_invalid(vmx->nested.pi_desc_cache);
 }
 
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
@@ -4138,6 +4224,8 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 
 static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
 {
+	if (!gpc)
+		return NULL;
 retry:
 	read_lock(&gpc->lock);
 	if (!gpc->active) {
@@ -4158,12 +4246,12 @@ static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
 #ifdef CONFIG_KVM_HYPERV
 struct hv_enlightened_vmcs *nested_lock_evmcs(struct vcpu_vmx *vmx)
 {
-	return nested_gpc_lock_if_active(&vmx->nested.hv_evmcs_cache);
+	return nested_gpc_lock_if_active(vmx->nested.hv_evmcs_cache);
 }
 
 void nested_unlock_evmcs(struct vcpu_vmx *vmx)
 {
-	nested_gpc_unlock(&vmx->nested.hv_evmcs_cache);
+	nested_gpc_unlock(vmx->nested.hv_evmcs_cache);
 }
 #endif
 
@@ -4171,7 +4259,7 @@ static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
 {
 	u8 *pi_desc_page;
 
-	pi_desc_page = nested_gpc_lock_if_active(&vmx->nested.pi_desc_cache);
+	pi_desc_page = nested_gpc_lock_if_active(vmx->nested.pi_desc_cache);
 	if (!pi_desc_page)
 		return NULL;
 
@@ -4180,17 +4268,17 @@ static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
 
 static void nested_unlock_pi_desc(struct vcpu_vmx *vmx)
 {
-	nested_gpc_unlock(&vmx->nested.pi_desc_cache);
+	nested_gpc_unlock(vmx->nested.pi_desc_cache);
 }
 
 static void *nested_lock_vapic(struct vcpu_vmx *vmx)
 {
-	return nested_gpc_lock_if_active(&vmx->nested.virtual_apic_cache);
+	return nested_gpc_lock_if_active(vmx->nested.virtual_apic_cache);
 }
 
 static void nested_unlock_vapic(struct vcpu_vmx *vmx)
 {
-	nested_gpc_unlock(&vmx->nested.virtual_apic_cache);
+	nested_gpc_unlock(vmx->nested.virtual_apic_cache);
 }
 
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5649,16 +5737,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 			     HRTIMER_MODE_ABS_PINNED);
 
 	vmx->nested.vpid02 = allocate_vpid();
-
-	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
-
-	kvm_gpc_init_for_vcpu(&vmx->nested.apic_access_page_cache, vcpu);
-	kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
-	kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
-
-#ifdef CONFIG_KVM_HYPERV
-	kvm_gpc_init(&vmx->nested.hv_evmcs_cache, vcpu->kvm);
-#endif
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
@@ -5854,6 +5932,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 					   &zero, sizeof(zero));
 	}
 
+	vmx_nested_context_clear(vmx, vmptr);
+
 	return nested_vmx_succeed(vcpu);
 }
 
@@ -6098,6 +6178,8 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
 	}
 	vmx->nested.dirty_vmcs12 = true;
 	vmx->nested.force_msr_bitmap_recalc = true;
+
+	vmx_nested_context_load(vmx, vmptr);
 }
 
 /* Emulate the VMPTRLD instruction */
@@ -7687,4 +7769,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.get_evmcs_version = nested_get_evmcs_version,
 	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
 #endif
+	.alloc_context = vmx_nested_context_alloc,
+	.free_context = vmx_nested_context_free,
+	.reset_context = vmx_nested_context_reset,
 };
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 546272a5d34d..30b13241ae45 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7666,6 +7666,14 @@ int vmx_vm_init(struct kvm *kvm)
 
 	if (enable_pml)
 		kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
+
+	if (nested) {
+		int err;
+
+		err = kvm_nested_context_table_init(kvm);
+		if (err)
+			return err;
+	}
 	return 0;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4da5a42b0c60..56b96e50290f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -152,15 +152,15 @@ struct nested_vmx {
 
 	struct loaded_vmcs vmcs02;
 
-	struct gfn_to_pfn_cache msr_bitmap_cache;
+	struct gfn_to_pfn_cache *msr_bitmap_cache;
 
 	/*
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
-	struct gfn_to_pfn_cache apic_access_page_cache;
-	struct gfn_to_pfn_cache virtual_apic_cache;
-	struct gfn_to_pfn_cache pi_desc_cache;
+	struct gfn_to_pfn_cache *apic_access_page_cache;
+	struct gfn_to_pfn_cache *virtual_apic_cache;
+	struct gfn_to_pfn_cache *pi_desc_cache;
 
 	u64 pi_desc_offset;
 	bool pi_pending;
@@ -208,7 +208,7 @@ struct nested_vmx {
 	u32 hv_clean_fields;
 	bool hv_msr_bitmap;
 	bool hv_flush_hypercall;
-	struct gfn_to_pfn_cache hv_evmcs_cache;
+	struct gfn_to_pfn_cache *hv_evmcs_cache;
 #endif
 };
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b05aace9e295..97e0b949e412 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1533,7 +1533,7 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
 
 static inline bool kvm_gpc_invalid(struct gfn_to_pfn_cache *gpc)
 {
-	return gpc->active && !gpc->valid;
+	return gpc && gpc->active && !gpc->valid;
 }
 
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
-- 
2.43.0
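The generic nested context table that provides kvm_nested_context_load(),
kvm_nested_context_clear() and the alloc/free/reset ops wired up above is
introduced earlier in the series and is not part of this patch. As a rough,
self-contained model of the lifecycle it implements - a table keyed by the
vmcs12 GPA, where a repeat load reuses the matching entry and an
over-subscribed table recycles the least-recently-used one - the sketch
below compiles and runs standalone. The table size, the names, and the
counter handling here are illustrative assumptions, not the series' actual
implementation:

/*
 * Standalone model (not kernel code) of a vmptr-keyed nested context
 * table with LRU recycling. All names and the table size are
 * illustrative; the counters are simplified and do not reproduce the
 * exact accounting asserted by the selftest in the next patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_CTXS 4	/* stand-in for the oversubscription limit */

struct ctx {
	uint64_t vmptr;	/* guest-physical VMCS address, 0 = free slot */
	uint64_t lru;	/* last-use stamp, for picking a victim */
	/* the real context would embed the gfn_to_pfn_cache objects */
};

static struct ctx table[MAX_CTXS];
static uint64_t stamp, reuse, recycle;

/* "vmptrld": reuse a matching context, else take over the LRU slot. */
static struct ctx *ctx_load(uint64_t vmptr)
{
	struct ctx *victim = &table[0];
	int i;

	for (i = 0; i < MAX_CTXS; i++) {
		if (table[i].vmptr == vmptr) {
			reuse++;
			table[i].lru = ++stamp;
			return &table[i];
		}
		if (table[i].lru < victim->lru)
			victim = &table[i];
	}

	if (victim->vmptr)
		recycle++;	/* caches must be refreshed for the new GPA */
	victim->vmptr = vmptr;
	victim->lru = ++stamp;
	return victim;
}

/* "vmclear": the slot becomes free; nothing is left to reuse. */
static void ctx_clear(uint64_t vmptr)
{
	int i;

	for (i = 0; i < MAX_CTXS; i++)
		if (table[i].vmptr == vmptr)
			memset(&table[i], 0, sizeof(table[i]));
}

int main(void)
{
	uint64_t vmptr;
	int round;

	/* Two passes over six "L2 vCPUs" without vmclear in between. */
	for (round = 0; round < 2; round++)
		for (vmptr = 0x1000; vmptr <= 0x6000; vmptr += 0x1000)
			ctx_load(vmptr);

	ctx_clear(0x6000);
	printf("reuse=%llu recycle=%llu\n",
	       (unsigned long long)reuse, (unsigned long long)recycle);
	return 0;
}

Run as-is (two passes over six vmptrs against a four-entry table) this
prints reuse=0 recycle=8: every entry is evicted before its vmptr comes
around again. With four or fewer vmptrs every second-pass load would be a
reuse instead.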
From nobody Tue Dec 2 01:33:05 2025
From: Fred Griffoul
To: kvm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, vkuznets@redhat.com,
	shuah@kernel.org, dwmw@amazon.co.uk, linux-kselftest@vger.kernel.org,
	linux-kernel@vger.kernel.org, Fred Griffoul
Subject: [PATCH v3 10/10] KVM: selftests: Add L2 vcpu context switch test
Date: Fri, 21 Nov 2025 11:11:13 +0000
Message-ID: <20251121111113.456628-11-griffoul@gmail.com>
In-Reply-To: <20251121111113.456628-1-griffoul@gmail.com>
References: <20251121111113.456628-1-griffoul@gmail.com>

From: Fred Griffoul

Add a selftest that validates nested VMX context switching between
multiple L2 vCPUs running on the same L1 vCPU. The test exercises both
the direct VMX interface (using vmptrld/vmclear operations) and the
enlightened VMCS (eVMCS) interface for Hyper-V nested scenarios.

The test creates multiple VMCS structures and switches between them to
verify that the KVM nested_context counters are correct, given the
number of L2 vCPUs and the number of switches.

Signed-off-by: Fred Griffoul
Suggested-by: dwmw@amazon.co.uk
---
 tools/testing/selftests/kvm/Makefile.kvm     |   1 +
 .../selftests/kvm/x86/vmx_l2_switch_test.c   | 416 ++++++++++++++++++
 2 files changed, 417 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/vmx_l2_switch_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 3431568d837e..5d47afa5789b 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -138,6 +138,7 @@ TEST_GEN_PROGS_x86 += x86/triple_fault_event_test
 TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test
 TEST_GEN_PROGS_x86 += x86/aperfmperf_test
 TEST_GEN_PROGS_x86 += x86/vmx_apic_update_test
+TEST_GEN_PROGS_x86 += x86/vmx_l2_switch_test
 TEST_GEN_PROGS_x86 += access_tracking_perf_test
 TEST_GEN_PROGS_x86 += coalesced_io_test
 TEST_GEN_PROGS_x86 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/x86/vmx_l2_switch_test.c b/tools/testing/selftests/kvm/x86/vmx_l2_switch_test.c
new file mode 100644
index 000000000000..5ec0da2f8386
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/vmx_l2_switch_test.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test nested VMX context switching between multiple VMCS
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define L2_GUEST_STACK_SIZE 64
+#define L2_VCPU_MAX 16
+
+struct l2_vcpu_config {
+	vm_vaddr_t hv_pages_gva;	/* Guest VA for eVMCS */
+	vm_vaddr_t vmx_pages_gva;	/* Guest VA for VMX pages */
+	unsigned long stack[L2_GUEST_STACK_SIZE];
+	uint16_t vpid;
+};
+
+struct l1_test_config {
+	struct l2_vcpu_config l2_vcpus[L2_VCPU_MAX];
+	uint64_t hypercall_gpa;
+	uint32_t nr_l2_vcpus;
+	uint32_t nr_switches;
+	bool enable_vpid;
+	bool use_evmcs;
+	bool sched_only;
+};
+
+static void l2_guest(void)
+{
+	while (1)
+		vmcall();
+}
+
+static void run_l2_guest_evmcs(struct hyperv_test_pages *hv_pages,
+			       struct vmx_pages *vmx,
+			       void *guest_rip,
+			       void *guest_rsp,
+			       uint16_t vpid)
+{
+	GUEST_ASSERT(load_evmcs(hv_pages));
+	prepare_vmcs(vmx, guest_rip, guest_rsp);
+	current_evmcs->hv_enlightenments_control.msr_bitmap = 1;
+	vmwrite(VIRTUAL_PROCESSOR_ID, vpid);
+
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+	current_evmcs->guest_rip += 3; /* vmcall */
+
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+}
+
+static void run_l2_guest_vmx_migrate(struct vmx_pages *vmx,
+				     void *guest_rip,
+				     void *guest_rsp,
+				     uint16_t vpid,
+				     bool start)
+{
+	uint32_t control;
+
+	/*
+	 * Emulate L2 vCPU migration: vmptrld/vmlaunch/vmclear
+	 */
+
+	if (start)
+		GUEST_ASSERT(load_vmcs(vmx));
+	else
+		GUEST_ASSERT(!vmptrld(vmx->vmcs_gpa));
+
+	prepare_vmcs(vmx, guest_rip, guest_rsp);
+
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_USE_MSR_BITMAPS;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+	vmwrite(VIRTUAL_PROCESSOR_ID, vpid);
+
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+
+	GUEST_ASSERT(vmptrstz() == vmx->vmcs_gpa);
+	GUEST_ASSERT(!vmclear(vmx->vmcs_gpa));
+}
+
+static void run_l2_guest_vmx_sched(struct vmx_pages *vmx,
+				   void *guest_rip,
+				   void *guest_rsp,
+				   uint16_t vpid,
+				   bool start)
+{
+	/*
+	 * Emulate L2 vCPU multiplexing: vmptrld/vmresume
+	 */
+
+	if (start) {
+		uint32_t control;
+
+		GUEST_ASSERT(load_vmcs(vmx));
+		prepare_vmcs(vmx, guest_rip, guest_rsp);
+
+		control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+		control |= CPU_BASED_USE_MSR_BITMAPS;
+		vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+		vmwrite(VIRTUAL_PROCESSOR_ID, vpid);
+
+		GUEST_ASSERT(!vmlaunch());
+	} else {
+		GUEST_ASSERT(!vmptrld(vmx->vmcs_gpa));
+		GUEST_ASSERT(!vmresume());
+	}
+
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+
+	vmwrite(GUEST_RIP,
+		vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN));
+}
+
+static void l1_guest_evmcs(struct l1_test_config *config)
+{
+	struct hyperv_test_pages *hv_pages;
+	struct vmx_pages *vmx_pages;
+	uint32_t i, j;
+
+	/* Initialize Hyper-V MSRs */
+	wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+	wrmsr(HV_X64_MSR_HYPERCALL, config->hypercall_gpa);
+
+	/* Enable VP assist page */
+	hv_pages = (struct hyperv_test_pages *)config->l2_vcpus[0].hv_pages_gva;
+	enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist);
+
+	/* Enable evmcs */
+	evmcs_enable();
+
+	vmx_pages = (struct vmx_pages *)config->l2_vcpus[0].vmx_pages_gva;
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+
+	for (i = 0; i < config->nr_switches; i++) {
+		for (j = 0; j < config->nr_l2_vcpus; j++) {
+			struct l2_vcpu_config *l2 = &config->l2_vcpus[j];
+
+			hv_pages = (struct hyperv_test_pages *)l2->hv_pages_gva;
+			vmx_pages = (struct vmx_pages *)l2->vmx_pages_gva;
+
+			run_l2_guest_evmcs(hv_pages, vmx_pages, l2_guest,
+					   &l2->stack[L2_GUEST_STACK_SIZE],
+					   l2->vpid);
+		}
+	}
+
+	GUEST_DONE();
+}
+
+static void l1_guest_vmx(struct l1_test_config *config)
+{
+	struct vmx_pages *vmx_pages;
+	uint32_t i, j;
+
+	vmx_pages = (struct vmx_pages *)config->l2_vcpus[0].vmx_pages_gva;
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+
+	for (i = 0; i < config->nr_switches; i++) {
+		for (j = 0; j < config->nr_l2_vcpus; j++) {
+			struct l2_vcpu_config *l2 = &config->l2_vcpus[j];
+
+			vmx_pages = (struct vmx_pages *)l2->vmx_pages_gva;
+
+			if (config->sched_only)
+				run_l2_guest_vmx_sched(vmx_pages, l2_guest,
+						       &l2->stack[L2_GUEST_STACK_SIZE],
+						       l2->vpid, i == 0);
+			else
+				run_l2_guest_vmx_migrate(vmx_pages, l2_guest,
+							 &l2->stack[L2_GUEST_STACK_SIZE],
+							 l2->vpid, i == 0);
+		}
+	}
+
+	if (config->sched_only) {
+		for (j = 0; j < config->nr_l2_vcpus; j++) {
+			struct l2_vcpu_config *l2 = &config->l2_vcpus[j];
+
+			vmx_pages = (struct vmx_pages *)l2->vmx_pages_gva;
+			vmclear(vmx_pages->vmcs_gpa);
+		}
+	}
+
+	GUEST_DONE();
+}
+
+static void vcpu_clone_hyperv_test_pages(struct kvm_vm *vm,
+					 vm_vaddr_t src_gva,
+					 vm_vaddr_t *dst_gva)
+{
+	struct hyperv_test_pages *src, *dst;
+	vm_vaddr_t evmcs_gva;
+
+	*dst_gva = vm_vaddr_alloc_page(vm);
+
+	src = addr_gva2hva(vm, src_gva);
+	dst = addr_gva2hva(vm, *dst_gva);
+	memcpy(dst, src, sizeof(*dst));
+
+	/* Allocate a new evmcs page */
+	evmcs_gva = vm_vaddr_alloc_page(vm);
+	dst->enlightened_vmcs = (void *)evmcs_gva;
+	dst->enlightened_vmcs_hva = addr_gva2hva(vm, evmcs_gva);
+	dst->enlightened_vmcs_gpa = addr_gva2gpa(vm, evmcs_gva);
+}
+
+static void prepare_vcpu(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+			 uint32_t nr_l2_vcpus, uint32_t nr_switches,
+			 bool enable_vpid, bool use_evmcs,
+			 bool sched_only)
+{
+	vm_vaddr_t config_gva;
+	struct l1_test_config *config;
+	vm_vaddr_t hypercall_page_gva = 0;
+	uint32_t i;
+
+	TEST_ASSERT(nr_l2_vcpus <= L2_VCPU_MAX,
+		    "Too many L2 vCPUs: %u (max %u)", nr_l2_vcpus, L2_VCPU_MAX);
+
+	/* Allocate config structure in guest memory */
+	config_gva = vm_vaddr_alloc(vm, sizeof(*config), 0x1000);
+	config = addr_gva2hva(vm, config_gva);
+	memset(config, 0, sizeof(*config));
+
+	if (use_evmcs) {
+		/* Allocate hypercall page */
+		hypercall_page_gva = vm_vaddr_alloc_page(vm);
+		memset(addr_gva2hva(vm, hypercall_page_gva), 0, getpagesize());
+		config->hypercall_gpa = addr_gva2gpa(vm, hypercall_page_gva);
+
+		/* Enable Hyper-V enlightenments */
+		vcpu_set_hv_cpuid(vcpu);
+		vcpu_enable_evmcs(vcpu);
+	}
+
+	/* Allocate resources for each L2 vCPU */
+	for (i = 0; i < nr_l2_vcpus; i++) {
+		vm_vaddr_t vmx_pages_gva;
+
+		/* Allocate VMX pages (needed for both VMX and eVMCS) */
+		vcpu_alloc_vmx(vm, &vmx_pages_gva);
+		config->l2_vcpus[i].vmx_pages_gva = vmx_pages_gva;
+
+		if (use_evmcs) {
+			vm_vaddr_t hv_pages_gva;
+
+			/* Allocate or clone hyperv_test_pages */
+			if (i == 0) {
+				vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
+			} else {
+				vm_vaddr_t first_hv_gva =
+					config->l2_vcpus[0].hv_pages_gva;
+				vcpu_clone_hyperv_test_pages(vm, first_hv_gva,
+							     &hv_pages_gva);
+			}
+			config->l2_vcpus[i].hv_pages_gva = hv_pages_gva;
+		}
+
+		/* Set VPID */
+		config->l2_vcpus[i].vpid = enable_vpid ? (i + 3) : 0;
+	}
+
+	config->nr_l2_vcpus = nr_l2_vcpus;
+	config->nr_switches = nr_switches;
+	config->enable_vpid = enable_vpid;
+	config->use_evmcs = use_evmcs;
+	config->sched_only = use_evmcs ? false : sched_only;
+
+	/* Pass single pointer to config structure */
+	vcpu_args_set(vcpu, 1, config_gva);
+
+	if (use_evmcs)
+		vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id);
+}
+
+static bool opt_enable_vpid = true;
+static const char *progname;
+
+static void check_stats(struct kvm_vm *vm,
+			uint32_t nr_l2_vcpus,
+			uint32_t nr_switches,
+			bool use_evmcs,
+			bool sched_only)
+{
+	uint64_t reuse = 0;
+	uint64_t recycle = 0;
+
+	reuse = vm_get_stat(vm, nested_context_reuse);
+	recycle = vm_get_stat(vm, nested_context_recycle);
+
+	if (nr_l2_vcpus <= KVM_NESTED_OVERSUB_RATIO) {
+		TEST_ASSERT_EQ(reuse, nr_l2_vcpus * (nr_switches - 1));
+		TEST_ASSERT_EQ(recycle, 0);
+	} else {
+		if (sched_only) {
+			/*
+			 * When scheduling only, no L2 vCPU vmcs is cleared,
+			 * so we reuse up to the max.
+			 * number of contexts, but we cannot recycle any of
+			 * them.
+			 */
+			TEST_ASSERT_EQ(reuse,
+				       KVM_NESTED_OVERSUB_RATIO *
+				       (nr_switches - 1));
+			TEST_ASSERT_EQ(recycle, 0);
+		} else {
+			/*
+			 * When migrating, we cycle in LRU order, so no
+			 * context can be reused: they are all recycled.
+			 */
+			TEST_ASSERT_EQ(reuse, 0);
+			TEST_ASSERT_EQ(recycle,
+				       (nr_l2_vcpus * nr_switches) -
+				       KVM_NESTED_OVERSUB_RATIO);
+		}
+	}
+
+	printf("%s %u switches with %u L2 vCPUs (%s) reuse %" PRIu64
+	       " recycle %" PRIu64 "\n", progname, nr_switches, nr_l2_vcpus,
+	       use_evmcs ? "evmcs" : (sched_only ? "vmx sched" : "vmx migrate"),
+	       reuse, recycle);
+}
+
+static void run_test(uint32_t nr_l2_vcpus, uint32_t nr_switches,
+		     bool use_evmcs, bool sched_only)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+
+	vm = vm_create_with_one_vcpu(&vcpu, use_evmcs
+				     ? l1_guest_evmcs : l1_guest_vmx);
+
+	prepare_vcpu(vm, vcpu, nr_l2_vcpus, nr_switches,
+		     opt_enable_vpid, use_evmcs, sched_only);
+
+	for (;;) {
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_DONE:
+			goto done;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		default:
+			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+		}
+	}
+
+done:
+	check_stats(vm, nr_l2_vcpus, nr_switches, use_evmcs, sched_only);
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	uint32_t opt_nr_l2_vcpus = 0;
+	uint32_t opt_nr_switches = 0;
+	bool opt_sched_only = true;
+	int opt;
+	int i;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+	progname = argv[0];
+
+	while ((opt = getopt(argc, argv, "c:rs:v")) != -1) {
+		switch (opt) {
+		case 'c':
+			opt_nr_l2_vcpus = atoi_paranoid(optarg);
+			break;
+		case 'r':
+			opt_sched_only = false;
+			break;
+		case 's':
+			opt_nr_switches = atoi_paranoid(optarg);
+			break;
+		case 'v':
+			opt_enable_vpid = false;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (opt_nr_l2_vcpus && opt_nr_switches) {
+		run_test(opt_nr_l2_vcpus, opt_nr_switches, false,
+			 opt_sched_only);
+
+		if (kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS))
+			run_test(opt_nr_l2_vcpus, opt_nr_switches,
+				 true, false);
+	} else {
+		/* VMX vmlaunch */
+		for (i = 2; i <= 16; i++)
+			run_test(i, 4, false, false);
+
+		/* VMX vmresume */
+		for (i = 2; i <= 16; i++)
+			run_test(i, 4, false, true);
+
+		/* eVMCS */
+		if (kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+			for (i = 2; i <= 16; i++)
+				run_test(i, 4, true, false);
+		}
+	}
+
+	return 0;
+}
-- 
2.43.0
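For reference, the reuse/recycle accounting that check_stats() asserts
reduces to a few closed-form expressions. The sketch below computes them
standalone; since KVM_NESTED_OVERSUB_RATIO is defined elsewhere in the
series, its value is passed in as a parameter here (the example assumes 4):

/*
 * Standalone sketch of the expected-counter arithmetic asserted by
 * check_stats() above. The oversubscription ratio is a parameter
 * because its actual value is defined elsewhere in the series.
 */
#include <stdint.h>
#include <stdio.h>

struct expected { uint64_t reuse, recycle; };

static struct expected expect(uint32_t nr_l2, uint32_t nr_switches,
			      uint32_t ratio, int sched_only)
{
	struct expected e = { 0, 0 };

	if (nr_l2 <= ratio) {
		/* every context survives: reused on all but the first pass */
		e.reuse = (uint64_t)nr_l2 * (nr_switches - 1);
	} else if (sched_only) {
		/* no vmclear: the table keeps 'ratio' live contexts */
		e.reuse = (uint64_t)ratio * (nr_switches - 1);
	} else {
		/* vmclear each round: pure LRU cycling, everything recycles */
		e.recycle = (uint64_t)nr_l2 * nr_switches - ratio;
	}
	return e;
}

int main(void)
{
	/* e.g. 8 L2 vCPUs, 4 switches, assuming a ratio of 4 */
	struct expected m = expect(8, 4, 4, 0), s = expect(8, 4, 4, 1);

	printf("migrate: reuse=%llu recycle=%llu\n",
	       (unsigned long long)m.reuse, (unsigned long long)m.recycle);
	printf("sched:   reuse=%llu recycle=%llu\n",
	       (unsigned long long)s.reuse, (unsigned long long)s.recycle);
	return 0;
}

With 8 L2 vCPUs, 4 switches and an assumed ratio of 4 this prints
reuse=0 recycle=28 for the migrate variant and reuse=12 recycle=0 for the
sched variant, matching the assertions above. The test itself can also be
pinned to one configuration from the command line: -c sets the number of
L2 vCPUs, -s the number of switches, -r selects the migrate variant, and
-v disables VPIDs.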