From: Fred Griffoul <fgriffo@amazon.co.uk>
Replace kvm_host_map usage with persistent gfn_to_pfn_cache structures
for the L1 APIC virtualization pages (APIC-access, virtual-APIC, and
posted-interrupt descriptor pages) to improve performance with
unmanaged guest memory.
The conversion involves several key changes:
- Page loading in nested_get_vmcs12_pages(): vmcs02 fields are loaded
  with pfncache PFNs after each cache has been checked and, if needed,
  activated or refreshed, while the vCPU is in OUTSIDE_GUEST_MODE.
- Invalidation window handling: since nested_get_vmcs12_pages() runs in
  OUTSIDE_GUEST_MODE, there is a window in which the caches can be
  invalidated by MMU notifier events before the vCPU enters
  IN_GUEST_MODE. Implement the is_nested_state_invalid() callback to
  monitor cache validity across the OUTSIDE_GUEST_MODE to IN_GUEST_MODE
  transition; it triggers KVM_REQ_GET_NESTED_STATE_PAGES when needed
  (see the call-site sketch after this list).
- Cache access in event callbacks: the virtual APIC and posted
  interrupt descriptor pages are accessed by KVM in the has_events()
  and check_events() nested_ops callbacks. These accesses use the
  cache's kernel HVA, following the usual pfncache check/refresh
  pattern (see the sketch right after this list); both callbacks may
  sleep if a cache refresh is required.
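For illustration, the helper below is a minimal sketch of that
check/refresh access pattern. It is not the helper added by this patch
(that is nested_gpc_lock_if_active() in the diff below) and the u32
read is an arbitrary example; it only uses the existing pfncache API:
gpc->lock, kvm_gpc_check(), kvm_gpc_refresh() and gpc->khva.

#include <linux/kvm_host.h>

/*
 * Illustrative only: read a 32-bit value at @offset within a cached
 * guest page. The check is done under gpc->lock; a refresh (which may
 * sleep) is done with the lock dropped, then the check is retried.
 */
static int nested_gpc_read_u32(struct gfn_to_pfn_cache *gpc,
			       unsigned int offset, u32 *val)
{
	read_lock(&gpc->lock);

	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock(&gpc->lock);

		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
			return -EFAULT;

		read_lock(&gpc->lock);
	}

	/* gpc->khva is the kernel mapping of the cached guest page. */
	*val = *(u32 *)(gpc->khva + offset);

	read_unlock(&gpc->lock);
	return 0;
}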
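The invalidation check itself is consumed outside this patch; the
sketch below only illustrates, as an assumption, how a caller on the
entry path could use is_nested_state_invalid() to trigger
KVM_REQ_GET_NESTED_STATE_PAGES. The function name and placement are
hypothetical.

/*
 * Hypothetical call-site sketch (not part of this patch): run after
 * the vCPU has been marked IN_GUEST_MODE but before VM entry, so that
 * a cache invalidated by an MMU notifier event in the window described
 * above forces the vmcs12 pages to be reloaded.
 */
static bool nested_state_needs_reload(struct kvm_vcpu *vcpu)
{
	if (!kvm_x86_ops.nested_ops->is_nested_state_invalid(vcpu))
		return false;

	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	return true;
}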
This eliminates expensive memremap()/memunmap() cycles on each L2 VM
entry/exit, providing substantial performance improvements when using
unmanaged memory such as guest_memfd or memory passed with the mem=
kernel parameter.
The persistent caching approach maintains correctness through proper
invalidation detection while avoiding the overhead of repeated mapping
operations.
Signed-off-by: Fred Griffoul <fgriffo@amazon.co.uk>
---
arch/x86/kvm/vmx/nested.c | 169 +++++++++++++++++++++++++++++---------
arch/x86/kvm/vmx/vmx.h | 8 +-
include/linux/kvm_host.h | 5 ++
3 files changed, 139 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 06187b8baa19..0cb66314d58b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -329,8 +329,18 @@ static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
read_unlock(&gpc->lock);
err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
- if (err)
+ if (err) {
+ /*
+ * Deactivate nested state caches to prevent
+ * kvm_gpc_invalid() from returning true in subsequent
+ * is_nested_state_invalid() calls. This prevents an
+ * infinite loop while entering guest mode.
+ */
+ if (gpc->vcpu)
+ kvm_gpc_deactivate(gpc);
+
return err;
+ }
goto retry;
}
@@ -343,14 +353,17 @@ static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
read_unlock(&gpc->lock);
}
-static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+static int nested_gpc_hpa(struct gfn_to_pfn_cache *gpc, gpa_t gpa, hpa_t *hpa)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int err;
+
+ err = nested_gpc_lock(gpc, gpa);
+ if (err)
+ return err;
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
- vmx->nested.pi_desc = NULL;
+ *hpa = pfn_to_hpa(gpc->pfn);
+ nested_gpc_unlock(gpc);
+ return 0;
}
/*
@@ -373,6 +386,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.smm.vmxon = false;
vmx->nested.vmxon_ptr = INVALID_GPA;
+ kvm_gpc_deactivate(&vmx->nested.pi_desc_cache);
+ kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
+ kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
free_vpid(vmx->nested.vpid02);
@@ -389,8 +405,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_shadow_vmcs12);
vmx->nested.cached_shadow_vmcs12 = NULL;
- nested_put_vmcs12_pages(vcpu);
-
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
nested_release_evmcs(vcpu);
@@ -3361,7 +3375,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_host_map *map;
+ struct gfn_to_pfn_cache *gpc;
+ hpa_t hpa;
if (!vcpu->arch.pdptrs_from_userspace &&
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3376,10 +3391,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- map = &vmx->nested.apic_access_page_map;
+ gpc = &vmx->nested.apic_access_page_cache;
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
- vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+ if (!nested_gpc_hpa(gpc, vmcs12->apic_access_addr, &hpa)) {
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
__func__);
@@ -3392,10 +3407,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- map = &vmx->nested.virtual_apic_map;
+ gpc = &vmx->nested.virtual_apic_cache;
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+ if (!nested_gpc_hpa(gpc, vmcs12->virtual_apic_page_addr, &hpa)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3418,14 +3433,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- map = &vmx->nested.pi_desc_map;
+ gpc = &vmx->nested.pi_desc_cache;
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
- vmx->nested.pi_desc =
- (struct pi_desc *)(((void *)map->hva) +
- offset_in_page(vmcs12->posted_intr_desc_addr));
+ if (!nested_gpc_hpa(gpc, vmcs12->posted_intr_desc_addr & PAGE_MASK, &hpa)) {
+ vmx->nested.pi_desc_offset = offset_in_page(vmcs12->posted_intr_desc_addr);
vmcs_write64(POSTED_INTR_DESC_ADDR,
- pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ hpa + offset_in_page(vmcs12->posted_intr_desc_addr));
} else {
/*
* Defer the KVM_INTERNAL_EXIT until KVM tries to
@@ -3433,7 +3446,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* descriptor. (Note that KVM may do this when it
* should not, per the architectural specification.)
*/
- vmx->nested.pi_desc = NULL;
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
}
}
@@ -3474,7 +3486,16 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
{
- return false;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * @vcpu is in IN_GUEST_MODE, eliminating the need for individual gpc
+ * locks. Since kvm_gpc_invalid() doesn't verify gpc memslot
+ * generation, we can also skip acquiring the srcu lock.
+ */
+ return kvm_gpc_invalid(&vmx->nested.apic_access_page_cache) ||
+ kvm_gpc_invalid(&vmx->nested.virtual_apic_cache) ||
+ kvm_gpc_invalid(&vmx->nested.pi_desc_cache);
}
static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
@@ -3969,9 +3990,55 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
}
}
+static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
+{
+retry:
+ read_lock(&gpc->lock);
+ if (!gpc->active) {
+ read_unlock(&gpc->lock);
+ return NULL;
+ }
+
+ if (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock(&gpc->lock);
+ if (kvm_gpc_refresh(gpc, PAGE_SIZE))
+ return NULL;
+ goto retry;
+ }
+
+ return gpc->khva;
+}
+
+static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
+{
+ u8 *pi_desc_page;
+
+ pi_desc_page = nested_gpc_lock_if_active(&vmx->nested.pi_desc_cache);
+ if (!pi_desc_page)
+ return NULL;
+
+ return (struct pi_desc *)(pi_desc_page + vmx->nested.pi_desc_offset);
+}
+
+static void nested_unlock_pi_desc(struct vcpu_vmx *vmx)
+{
+ nested_gpc_unlock(&vmx->nested.pi_desc_cache);
+}
+
+static void *nested_lock_vapic(struct vcpu_vmx *vmx)
+{
+ return nested_gpc_lock_if_active(&vmx->nested.virtual_apic_cache);
+}
+
+static void nested_unlock_vapic(struct vcpu_vmx *vmx)
+{
+ nested_gpc_unlock(&vmx->nested.virtual_apic_cache);
+}
+
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc *pi_desc;
int max_irr;
void *vapic_page;
u16 status;
@@ -3979,22 +4046,29 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!vmx->nested.pi_pending)
return 0;
- if (!vmx->nested.pi_desc)
+ pi_desc = nested_lock_pi_desc(vmx);
+ if (!pi_desc)
goto mmio_needed;
vmx->nested.pi_pending = false;
- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ if (!pi_test_and_clear_on(pi_desc)) {
+ nested_unlock_pi_desc(vmx);
return 0;
+ }
- max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ max_irr = pi_find_highest_vector(pi_desc);
if (max_irr > 0) {
- vapic_page = vmx->nested.virtual_apic_map.hva;
- if (!vapic_page)
+ vapic_page = nested_lock_vapic(vmx);
+ if (!vapic_page) {
+ nested_unlock_pi_desc(vmx);
goto mmio_needed;
+ }
+
+ __kvm_apic_update_irr(pi_desc->pir, vapic_page, &max_irr);
+
+ nested_unlock_vapic(vmx);
- __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
- vapic_page, &max_irr);
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -4003,6 +4077,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
}
}
+ nested_unlock_pi_desc(vmx);
nested_mark_vmcs12_pages_dirty(vcpu);
return 0;
@@ -4122,8 +4197,10 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic = vmx->nested.virtual_apic_map.hva;
+ struct pi_desc *pi_desc;
int max_irr, vppr;
+ void *vapic;
+ bool res = false;
if (nested_vmx_preemption_timer_pending(vcpu) ||
vmx->nested.mtf_pending)
@@ -4142,23 +4219,33 @@ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
__vmx_interrupt_blocked(vcpu))
return false;
+ vapic = nested_lock_vapic(vmx);
if (!vapic)
return false;
vppr = *((u32 *)(vapic + APIC_PROCPRI));
+ nested_unlock_vapic(vmx);
+
max_irr = vmx_get_rvi();
if ((max_irr & 0xf0) > (vppr & 0xf0))
return true;
- if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
- pi_test_on(vmx->nested.pi_desc)) {
- max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
- if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
- return true;
+ if (vmx->nested.pi_pending) {
+ pi_desc = nested_lock_pi_desc(vmx);
+ if (!pi_desc)
+ return false;
+
+ if (pi_test_on(pi_desc)) {
+ max_irr = pi_find_highest_vector(pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ res = true;
+ }
+
+ nested_unlock_pi_desc(vmx);
}
- return false;
+ return res;
}
/*
@@ -5106,7 +5193,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_update_cpu_dirty_logging(vcpu);
}
- nested_put_vmcs12_pages(vcpu);
+ nested_mark_vmcs12_pages_dirty(vcpu);
if (vmx->nested.reload_vmcs01_apic_access_page) {
vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -5391,6 +5478,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
+ kvm_gpc_init_for_vcpu(&vmx->nested.apic_access_page_cache, vcpu);
+ kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
+ kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
+
vmx->nested.vmcs02_initialized = false;
vmx->nested.vmxon = true;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 3a6983222841..2c74c65d3383 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -158,11 +158,11 @@ struct nested_vmx {
* Guest pages referred to in the vmcs02 with host-physical
* pointers, so we must keep them pinned while L2 runs.
*/
- struct kvm_host_map apic_access_page_map;
- struct kvm_host_map virtual_apic_map;
- struct kvm_host_map pi_desc_map;
+ struct gfn_to_pfn_cache apic_access_page_cache;
+ struct gfn_to_pfn_cache virtual_apic_cache;
+ struct gfn_to_pfn_cache pi_desc_cache;
- struct pi_desc *pi_desc;
+ u64 pi_desc_offset;
bool pi_pending;
u16 posted_intr_nv;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2eb551a11818..dc622adb561f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1526,6 +1526,11 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
return gpc->active && kvm_is_error_gpa(gpc->gpa);
}
+static inline bool kvm_gpc_invalid(struct gfn_to_pfn_cache *gpc)
+{
+ return gpc->active && !gpc->valid;
+}
+
void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
--
2.51.0