From: Fred Griffoul <fgriffo@amazon.co.uk>
Add infrastructure to persist nested virtualization state when L2 vCPUs
are switched on an L1 vCPU or migrated between L1 vCPUs.
The nested context table uses a hash table for fast lookup by nested
control block GPA (VMPTR for VMX, VMCB for SVM) and maintains an LRU
list of detached contexts for recycling.
kvm_nested_context_load() searches for a context indexed by the target
GPA; if none is found, it allocates a new context, up to the configured
maximum. At capacity, it instead recycles the least recently used
detached context from the LRU list.
The oversubscription ratio is hardcoded to support up to 8 L2 vCPUs per
L1 vCPU, i.e. the table holds at most KVM_NESTED_OVERSUB_RATIO times
the number of online vCPUs contexts.
kvm_nested_context_clear() detaches the context from its vCPU and moves
it to the LRU list, while keeping it in the hash table for potential
reuse.
This allows nested hypervisors to multiplex multiple L2 vCPUs on L1
vCPUs without losing cached nested state, significantly improving
performance for workloads with frequent L2 context switches.
This patch adds the basic infrastructure. Subsequent patches will add
the nested VMX and SVM specific support to populate and utilize the
cached nested state.
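For illustration only (the real VMX and SVM hooks arrive with the later
patches in this series), a vendor implementation might embed the common
context in its own structure and wire the new callbacks roughly as
sketched below; 'struct vmx_nested_context' and the vmx_* helpers are
hypothetical names, not code from this series:

	/* Hypothetical vendor-side wrapper around the common context. */
	struct vmx_nested_context {
		struct kvm_nested_context base;
		/* cached vmcs12/shadow state would live here */
	};

	static struct kvm_nested_context *vmx_alloc_context(struct kvm_vcpu *vcpu)
	{
		struct vmx_nested_context *vctx;

		vctx = kzalloc(sizeof(*vctx), GFP_KERNEL_ACCOUNT);
		if (!vctx)
			return NULL;
		return &vctx->base;
	}

	static void vmx_free_context(struct kvm_nested_context *ctx)
	{
		kfree(container_of(ctx, struct vmx_nested_context, base));
	}

	static void vmx_reset_context(struct kvm_nested_context *ctx)
	{
		/* invalidate anything cached for the context's previous GPA */
	}

	/*
	 * Wired into the vendor's kvm_x86_nested_ops alongside the
	 * existing callbacks:
	 *	.alloc_context = vmx_alloc_context,
	 *	.free_context  = vmx_free_context,
	 *	.reset_context = vmx_reset_context,
	 */

Callers would presumably invoke kvm_nested_context_load() when a nested
control block becomes current (e.g. on VMPTRLD/VMRUN) and
kvm_nested_context_clear() when it is released (e.g. on VMCLEAR), but
the exact call sites are left to the follow-up patches.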
Signed-off-by: Fred Griffoul <fgriffo@amazon.co.uk>
---
arch/x86/include/asm/kvm_host.h | 31 +++++
arch/x86/include/uapi/asm/kvm.h | 2 +
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/nested.c | 199 ++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 5 +-
5 files changed, 237 insertions(+), 2 deletions(-)
create mode 100644 arch/x86/kvm/nested.c
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4675e71b33a7..75f3cd82a073 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1379,6 +1379,28 @@ enum kvm_mmu_type {
KVM_NR_MMU_TYPES,
};
+struct kvm_nested_context {
+ gpa_t gpa;
+ struct hlist_node hnode;
+ struct list_head lru_link;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_nested_context_table {
+ spinlock_t lock;
+ u32 count;
+ struct list_head lru_list;
+ DECLARE_HASHTABLE(hash, 8);
+};
+
+void kvm_nested_context_clear(struct kvm_vcpu *vcpu, gpa_t gpa);
+struct kvm_nested_context *kvm_nested_context_load(
+ struct kvm_vcpu *vcpu,
+ gpa_t gpa);
+
+int kvm_nested_context_table_init(struct kvm *kvm);
+void kvm_nested_context_table_destroy(struct kvm *kvm);
+
struct kvm_arch {
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
@@ -1618,6 +1640,9 @@ struct kvm_arch {
* current VM.
*/
int cpu_dirty_log_size;
+
+ /* Cache for nested contexts */
+ struct kvm_nested_context_table *nested_context_table;
};
struct kvm_vm_stat {
@@ -1640,6 +1665,8 @@ struct kvm_vm_stat {
u64 nx_lpage_splits;
u64 max_mmu_page_hash_collisions;
u64 max_mmu_rmap_size;
+ u64 nested_context_recycle;
+ u64 nested_context_reuse;
};
struct kvm_vcpu_stat {
@@ -1967,6 +1994,10 @@ struct kvm_x86_nested_ops {
uint16_t *vmcs_version);
uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
+
+ struct kvm_nested_context *(*alloc_context)(struct kvm_vcpu *vcpu);
+ void (*free_context)(struct kvm_nested_context *ctx);
+ void (*reset_context)(struct kvm_nested_context *ctx);
};
struct kvm_x86_init_ops {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d420c9c066d4..637ed9286f8e 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -1042,4 +1042,6 @@ struct kvm_tdx_init_mem_region {
__u64 nr_pages;
};
+#define KVM_NESTED_OVERSUB_RATIO 8
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c4b8950c7abe..2a5289cb5bd1 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,7 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
+ debugfs.o nested.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
diff --git a/arch/x86/kvm/nested.c b/arch/x86/kvm/nested.c
new file mode 100644
index 000000000000..6e4e95567427
--- /dev/null
+++ b/arch/x86/kvm/nested.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+static struct kvm_nested_context_table *kvm_nested_context_table_alloc(void)
+{
+ struct kvm_nested_context_table *table;
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
+ if (!table)
+ return NULL;
+
+ spin_lock_init(&table->lock);
+ INIT_LIST_HEAD(&table->lru_list);
+ hash_init(table->hash);
+ return table;
+}
+
+static void kvm_nested_context_table_free(struct kvm_nested_context_table
+ *table)
+{
+ kfree(table);
+}
+
+int kvm_nested_context_table_init(struct kvm *kvm)
+{
+ struct kvm_nested_context_table *table;
+
+ if (!kvm_x86_ops.nested_ops->alloc_context ||
+ !kvm_x86_ops.nested_ops->free_context ||
+ !kvm_x86_ops.nested_ops->reset_context)
+ return -EINVAL;
+
+ table = kvm_nested_context_table_alloc();
+ if (!table)
+ return -ENOMEM;
+
+ kvm->arch.nested_context_table = table;
+ return 0;
+}
+
+void kvm_nested_context_table_destroy(struct kvm *kvm)
+{
+ struct kvm_nested_context_table *table;
+ struct kvm_nested_context *ctx;
+ struct hlist_node *tmp;
+ int bkt;
+
+ table = kvm->arch.nested_context_table;
+ if (!table)
+ return;
+
+ hash_for_each_safe(table->hash, bkt, tmp, ctx, hnode) {
+ hash_del(&ctx->hnode);
+ kvm_x86_ops.nested_ops->free_context(ctx);
+ }
+
+ kvm_nested_context_table_free(table);
+}
+
+static unsigned int kvm_nested_context_max(struct kvm *kvm)
+{
+ return KVM_NESTED_OVERSUB_RATIO * atomic_read(&kvm->online_vcpus);
+}
+
+static struct kvm_nested_context *__kvm_nested_context_find(struct kvm_nested_context_table
+ *table, gpa_t gpa)
+{
+ struct kvm_nested_context *ctx;
+
+ hash_for_each_possible(table->hash, ctx, hnode, gpa) {
+ if (ctx->gpa == gpa)
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct kvm_nested_context *kvm_nested_context_find(struct
+ kvm_nested_context_table
+ *table,
+ struct kvm_vcpu *vcpu,
+ gpa_t gpa)
+{
+ struct kvm_nested_context *ctx;
+
+ ctx = __kvm_nested_context_find(table, gpa);
+ if (!ctx)
+ return NULL;
+
+ WARN_ON_ONCE(ctx->vcpu && ctx->vcpu != vcpu);
+
+ /* Remove from the LRU list if not attached to a vcpu */
+ if (!ctx->vcpu)
+ list_del(&ctx->lru_link);
+
+ return ctx;
+}
+
+static struct kvm_nested_context *kvm_nested_context_recycle(struct
+ kvm_nested_context_table
+ *table)
+{
+ struct kvm_nested_context *ctx;
+
+ if (list_empty(&table->lru_list))
+ return NULL;
+
+ ctx =
+ list_first_entry(&table->lru_list, struct kvm_nested_context,
+ lru_link);
+ list_del(&ctx->lru_link);
+ hash_del(&ctx->hnode);
+ return ctx;
+}
+
+static void kvm_nested_context_insert(struct kvm_nested_context_table *table,
+ struct kvm_nested_context *ctx, gpa_t gpa)
+{
+ hash_add(table->hash, &ctx->hnode, gpa);
+ ctx->gpa = gpa;
+}
+
+struct kvm_nested_context *kvm_nested_context_load(struct kvm_vcpu *vcpu,
+ gpa_t gpa)
+{
+ struct kvm_nested_context_table *table;
+ struct kvm_nested_context *ctx, *new_ctx = NULL;
+ struct kvm *vm = vcpu->kvm;
+ bool reset = false;
+
+ table = vcpu->kvm->arch.nested_context_table;
+ if (WARN_ON_ONCE(!table))
+ return false;
+retry:
+ spin_lock(&table->lock);
+ ctx = kvm_nested_context_find(table, vcpu, gpa);
+ if (!ctx) {
+ /* At capacity? Recycle the LRU context */
+ if (table->count >= kvm_nested_context_max(vcpu->kvm)) {
+ ctx = kvm_nested_context_recycle(table);
+ if (unlikely(!ctx))
+ goto finish;
+
+ kvm_nested_context_insert(table, ctx, gpa);
+ ++vm->stat.nested_context_recycle;
+ reset = true;
+
+ } else if (new_ctx) {
+ ++table->count;
+ ctx = new_ctx;
+ kvm_nested_context_insert(table, ctx, gpa);
+ new_ctx = NULL;
+
+ } else {
+ /* Allocate a new context without holding the lock */
+ spin_unlock(&table->lock);
+ new_ctx = kvm_x86_ops.nested_ops->alloc_context(vcpu);
+ if (unlikely(!new_ctx))
+ return NULL;
+
+ goto retry;
+ }
+ } else
+ ++vm->stat.nested_context_reuse;
+
+ ctx->vcpu = vcpu;
+finish:
+ spin_unlock(&table->lock);
+
+ if (new_ctx)
+ kvm_x86_ops.nested_ops->free_context(new_ctx);
+
+ if (reset)
+ kvm_x86_ops.nested_ops->reset_context(ctx);
+
+ return ctx;
+}
+
+void kvm_nested_context_clear(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct kvm_nested_context_table *table;
+ struct kvm_nested_context *ctx;
+
+ table = vcpu->kvm->arch.nested_context_table;
+ if (WARN_ON_ONCE(!table))
+ return;
+
+ spin_lock(&table->lock);
+ ctx = __kvm_nested_context_find(table, gpa);
+ if (ctx && ctx->vcpu) {
+ /*
+ * Move to LRU list but keep it in the hash table for possible future
+ * reuse.
+ */
+ list_add_tail(&ctx->lru_link, &table->lru_list);
+ ctx->vcpu = NULL;
+ }
+ spin_unlock(&table->lock);
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a9c1171df49..db13b1921aff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -255,7 +255,9 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_ICOUNTER(VM, pages_1g),
STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
- STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions),
+ STATS_DESC_COUNTER(VM, nested_context_recycle),
+ STATS_DESC_COUNTER(VM, nested_context_reuse)
};
const struct kvm_stats_header kvm_vm_stats_header = {
@@ -13311,6 +13313,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
+ kvm_nested_context_table_destroy(kvm);
kvm_x86_call(vm_destroy)(kvm);
}
--
2.43.0
Hi,
kernel test robot noticed the following build warnings:
[auto build test WARNING on kvm/queue]
[also build test WARNING on kvm/next mst-vhost/linux-next linus/master v6.18-rc6 next-20251121]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/griffoul-gmail-com/KVM-nVMX-Implement-cache-for-L1-MSR-bitmap/20251119-012332
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20251118171113.363528-9-griffoul%40gmail.org
patch subject: [PATCH v2 08/10] KVM: x86: Add nested context management
config: i386-randconfig-062-20251121 (https://download.01.org/0day-ci/archive/20251122/202511220448.n0QXrANz-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251122/202511220448.n0QXrANz-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511220448.n0QXrANz-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
>> arch/x86/kvm/nested.c:133:24: sparse: sparse: Using plain integer as NULL pointer
vim +133 arch/x86/kvm/nested.c
122
123 struct kvm_nested_context *kvm_nested_context_load(struct kvm_vcpu *vcpu,
124 gpa_t gpa)
125 {
126 struct kvm_nested_context_table *table;
127 struct kvm_nested_context *ctx, *new_ctx = NULL;
128 struct kvm *vm = vcpu->kvm;
129 bool reset = false;
130
131 table = vcpu->kvm->arch.nested_context_table;
132 if (WARN_ON_ONCE(!table))
> 133 return false;
134 retry:
135 spin_lock(&table->lock);
136 ctx = kvm_nested_context_find(table, vcpu, gpa);
137 if (!ctx) {
138 /* At capacity? Recycle the LRU context */
139 if (table->count >= kvm_nested_context_max(vcpu->kvm)) {
140 ctx = kvm_nested_context_recycle(table);
141 if (unlikely(!ctx))
142 goto finish;
143
144 kvm_nested_context_insert(table, ctx, gpa);
145 ++vm->stat.nested_context_recycle;
146 reset = true;
147
148 } else if (new_ctx) {
149 ++table->count;
150 ctx = new_ctx;
151 kvm_nested_context_insert(table, ctx, gpa);
152 new_ctx = NULL;
153
154 } else {
155 /* Allocate a new context without holding the lock */
156 spin_unlock(&table->lock);
157 new_ctx = kvm_x86_ops.nested_ops->alloc_context(vcpu);
158 if (unlikely(!new_ctx))
159 return NULL;
160
161 goto retry;
162 }
163 } else
164 ++vm->stat.nested_context_reuse;
165
166 ctx->vcpu = vcpu;
167 finish:
168 spin_unlock(&table->lock);
169
170 if (new_ctx)
171 kvm_x86_ops.nested_ops->free_context(new_ctx);
172
173 if (reset)
174 kvm_x86_ops.nested_ops->reset_context(ctx);
175
176 return ctx;
177 }
178
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi,
kernel test robot noticed the following build warnings:
[auto build test WARNING on kvm/queue]
[also build test WARNING on kvm/next mst-vhost/linux-next linus/master v6.18-rc6 next-20251120]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/griffoul-gmail-com/KVM-nVMX-Implement-cache-for-L1-MSR-bitmap/20251119-012332
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20251118171113.363528-9-griffoul%40gmail.org
patch subject: [PATCH v2 08/10] KVM: x86: Add nested context management
config: i386-randconfig-141-20251120 (https://download.01.org/0day-ci/archive/20251121/202511210515.8L9NBb1R-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251121/202511210515.8L9NBb1R-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511210515.8L9NBb1R-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> arch/x86/kvm/nested.c:133:10: warning: expression which evaluates to zero treated as a null pointer constant of type 'struct kvm_nested_context *' [-Wnon-literal-null-conversion]
133 | return false;
| ^~~~~
1 warning generated.
vim +133 arch/x86/kvm/nested.c
122
123 struct kvm_nested_context *kvm_nested_context_load(struct kvm_vcpu *vcpu,
124 gpa_t gpa)
125 {
126 struct kvm_nested_context_table *table;
127 struct kvm_nested_context *ctx, *new_ctx = NULL;
128 struct kvm *vm = vcpu->kvm;
129 bool reset = false;
130
131 table = vcpu->kvm->arch.nested_context_table;
132 if (WARN_ON_ONCE(!table))
> 133 return false;
134 retry:
135 spin_lock(&table->lock);
136 ctx = kvm_nested_context_find(table, vcpu, gpa);
137 if (!ctx) {
138 /* At capacity? Recycle the LRU context */
139 if (table->count >= kvm_nested_context_max(vcpu->kvm)) {
140 ctx = kvm_nested_context_recycle(table);
141 if (unlikely(!ctx))
142 goto finish;
143
144 kvm_nested_context_insert(table, ctx, gpa);
145 ++vm->stat.nested_context_recycle;
146 reset = true;
147
148 } else if (new_ctx) {
149 ++table->count;
150 ctx = new_ctx;
151 kvm_nested_context_insert(table, ctx, gpa);
152 new_ctx = NULL;
153
154 } else {
155 /* Allocate a new context without holding the lock */
156 spin_unlock(&table->lock);
157 new_ctx = kvm_x86_ops.nested_ops->alloc_context(vcpu);
158 if (unlikely(!new_ctx))
159 return NULL;
160
161 goto retry;
162 }
163 } else
164 ++vm->stat.nested_context_reuse;
165
166 ctx->vcpu = vcpu;
167 finish:
168 spin_unlock(&table->lock);
169
170 if (new_ctx)
171 kvm_x86_ops.nested_ops->free_context(new_ctx);
172
173 if (reset)
174 kvm_x86_ops.nested_ops->reset_context(ctx);
175
176 return ctx;
177 }
178
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
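Both reports flag the same issue: kvm_nested_context_load() is declared
to return a pointer but returns 'false' when the context table is
missing. The likely fix (an assumption, not posted in this thread) is a
one-line change in arch/x86/kvm/nested.c:

	 	if (WARN_ON_ONCE(!table))
	-		return false;
	+		return NULL;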