Date: Wed, 21 Dec 2022 18:34:55 -0800
From: Vipin Sharma <vipinsh@google.com>
To: seanjc@google.com, pbonzini@redhat.com, bgardon@google.com, dmatlack@google.com
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, Vipin Sharma <vipinsh@google.com>
Subject: [Patch v3 7/9] KVM: x86/mmu: Allocate page table's pages on NUMA node of the underlying pages
Message-ID: <20221222023457.1764-8-vipinsh@google.com>
In-Reply-To: <20221222023457.1764-1-vipinsh@google.com>
References: <20221222023457.1764-1-vipinsh@google.com>

Page table pages of a VM are currently allocated based on the current
task's NUMA node or its mempolicy.
This can cause suboptimal remote accesses by a vCPU if it is accessing
physical pages local to its NUMA node while the page table pages mapping
those physical pages were created by some other vCPU that was on a
different NUMA node or had a different mempolicy.

Allocate page table pages on the same NUMA node where the underlying
physical page exists. Page tables at levels 5, 4, and 3 might not end up
on the same NUMA node as the memory they map, since they can span
multiple NUMA nodes.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          | 63 ++++++++++++++++++++++-----------
 arch/x86/kvm/mmu/paging_tmpl.h  |  4 +--
 arch/x86/kvm/mmu/tdp_mmu.c      | 11 +++---
 virt/kvm/kvm_main.c             |  2 +-
 5 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 293994fabae3..b1f319ad6f89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -782,7 +782,7 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu *walk_mmu;
 
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
-	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
+	struct kvm_mmu_memory_cache mmu_shadow_page_cache[MAX_NUMNODES];
 	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 23a3b82b2384..511c6ef265ee 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -677,24 +677,29 @@ static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
 
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
-	int r;
+	int r, nid;
 
 	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 	if (r)
 		return r;
-	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				      &vcpu->arch.mmu_shadow_page_cache_lock,
-				      PT64_ROOT_MAX_LEVEL);
-	if (r)
-		return r;
+
+	for_each_online_node(nid) {
+		r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid],
+					      &vcpu->arch.mmu_shadow_page_cache_lock,
+					      PT64_ROOT_MAX_LEVEL);
+		if (r)
+			return r;
+	}
+
 	if (maybe_indirect) {
 		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
 					       PT64_ROOT_MAX_LEVEL);
 		if (r)
 			return r;
 	}
+
 	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 					  PT64_ROOT_MAX_LEVEL);
 }
@@ -715,9 +720,14 @@ static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
+	int nid;
+
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
-	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				 &vcpu->arch.mmu_shadow_page_cache_lock);
+
+	for_each_node(nid)
+		mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid],
+					 &vcpu->arch.mmu_shadow_page_cache_lock);
+
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 }
@@ -2256,11 +2266,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
 
 static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
 						    gfn_t gfn,
-						    union kvm_mmu_page_role role)
+						    union kvm_mmu_page_role role,
+						    int nid)
 {
 	struct shadow_page_caches caches = {
 		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
-		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
+		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache[nid],
 		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
 		.shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
 	};
@@ -2316,15 +2327,19 @@ static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
 
 static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
 						 u64 *sptep, gfn_t gfn,
-						 bool direct, unsigned int access)
+						 bool direct, unsigned int access,
+						 kvm_pfn_t pfn)
 {
 	union kvm_mmu_page_role role;
+	int nid;
 
 	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
 		return ERR_PTR(-EEXIST);
 
 	role = kvm_mmu_child_role(sptep, direct, access);
-	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
+	nid = kvm_pfn_to_page_table_nid(pfn);
+
+	return kvm_mmu_get_shadow_page(vcpu, gfn, role, nid);
 }
 
 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
@@ -3208,7 +3223,8 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		if (it.level == fault->goal_level)
 			break;
 
-		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true,
+					  ACC_ALL, fault->pfn);
 		if (sp == ERR_PTR(-EEXIST))
 			continue;
 
@@ -3636,7 +3652,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
 	WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
 	WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
 
-	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
+	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role, numa_mem_id());
 	++sp->root_count;
 
 	return __pa(sp->spt);
@@ -5952,7 +5968,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
-	int ret;
+	int ret, nid;
 
 	INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_pte_list_desc_cache,
 				  pte_list_desc_cache, NUMA_NO_NODE);
@@ -5960,8 +5976,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_page_header_cache,
 				  mmu_page_header_cache, NUMA_NO_NODE);
 
-	INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_shadow_page_cache,
-				  NULL, NUMA_NO_NODE);
+	for_each_node(nid)
+		INIT_KVM_MMU_MEMORY_CACHE(&vcpu->arch.mmu_shadow_page_cache[nid],
+					  NULL, nid);
 	spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
@@ -6692,13 +6709,17 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 }
 
 static unsigned long mmu_shrink_cache(struct kvm_mmu_memory_cache *cache,
+				      int cache_count,
 				      spinlock_t *cache_lock)
 {
 	unsigned long freed = 0;
+	int nid;
 
 	spin_lock(cache_lock);
-	if (cache->nobjs)
-		freed = kvm_mmu_empty_memory_cache(cache);
+	for (nid = 0; nid < cache_count; nid++) {
+		if (node_online(nid) && cache[nid].nobjs)
+			freed += kvm_mmu_empty_memory_cache(&cache[nid]);
+	}
 	spin_unlock(cache_lock);
 	return freed;
 }
@@ -6721,13 +6742,15 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		list_move_tail(&kvm->vm_list, &vm_list);
 
 		freed += mmu_shrink_cache(&kvm->arch.split_shadow_page_cache,
+					  1,
 					  &kvm->arch.split_shadow_page_cache_lock);
 
 		if (freed >= sc->nr_to_scan)
 			break;
 
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			freed += mmu_shrink_cache(&vcpu->arch.mmu_shadow_page_cache,
+			freed += mmu_shrink_cache(vcpu->arch.mmu_shadow_page_cache,
+						  MAX_NUMNODES,
 						  &vcpu->arch.mmu_shadow_page_cache_lock);
 		}
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index e5662dbd519c..1ceca62ec4cf 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -652,7 +652,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		table_gfn = gw->table_gfn[it.level - 2];
 		access = gw->pt_access[it.level - 2];
 		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
-					  false, access);
+					  false, access, fault->pfn);
 
 		if (sp != ERR_PTR(-EEXIST)) {
 			/*
@@ -708,7 +708,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		validate_direct_spte(vcpu, it.sptep, direct_access);
 
 		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
-					  true, direct_access);
+					  true, direct_access, fault->pfn);
 		if (sp == ERR_PTR(-EEXIST))
 			continue;
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 376b8dceb3f9..b5abae2366dd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -259,12 +259,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 		 kvm_mmu_page_as_id(_root) != _as_id) {		\
 		} else
 
-static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu, int nid)
 {
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-	sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
+	sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache[nid],
 						&vcpu->arch.mmu_shadow_page_cache_lock);
 
 	return sp;
@@ -317,7 +317,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	root = tdp_mmu_alloc_sp(vcpu);
+	root = tdp_mmu_alloc_sp(vcpu, numa_mem_id());
 	tdp_mmu_init_sp(root, NULL, 0, role);
 
 	refcount_set(&root->tdp_mmu_root_count, 1);
@@ -1149,7 +1149,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	struct kvm *kvm = vcpu->kvm;
 	struct tdp_iter iter;
 	struct kvm_mmu_page *sp;
-	int ret = RET_PF_RETRY;
+	int ret = RET_PF_RETRY, nid;
 
 	kvm_mmu_hugepage_adjust(vcpu, fault);
 
@@ -1178,11 +1178,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		    !is_large_pte(iter.old_spte))
 			continue;
 
+		nid = kvm_pfn_to_page_table_nid(fault->pfn);
 		/*
 		 * The SPTE is either non-present or points to a huge page that
 		 * needs to be split.
 		 */
-		sp = tdp_mmu_alloc_sp(vcpu);
+		sp = tdp_mmu_alloc_sp(vcpu, nid);
 		tdp_mmu_init_child_sp(sp, &iter);
 
 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d96c8146e9ba..4f3db7ffeba8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -415,7 +415,7 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 	if (mc->kmem_cache)
 		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 	else
-		return (void *)__get_free_page(gfp_flags);
+		return kvm_mmu_get_free_page(mc->node, gfp_flags);
 }
 
 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
-- 
2.39.0.314.g84b9a713c41-goog
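
[Editor's note] The patch relies on helpers introduced earlier in this
series, kvm_pfn_to_page_table_nid() and kvm_mmu_get_free_page(), which are
not part of this diff. A minimal sketch of what such helpers could look
like (an illustration under assumed semantics, not the series' actual
code):

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/numa.h>
	#include <linux/kvm_types.h>

	/*
	 * Illustrative only: resolve the NUMA node of the page backing a
	 * guest pfn, falling back to the local memory node when the pfn
	 * has no struct page.
	 */
	static int example_pfn_to_page_table_nid(kvm_pfn_t pfn)
	{
		if (!pfn_valid(pfn))
			return numa_mem_id();

		return page_to_nid(pfn_to_page(pfn));
	}

	/*
	 * Illustrative only: allocate one page-table page on the requested
	 * node; the page allocator may still fall back to another node.
	 */
	static void *example_mmu_get_free_page(int nid, gfp_t gfp)
	{
		struct page *page = alloc_pages_node(nid, gfp, 0);

		return page ? page_address(page) : NULL;
	}

With helpers along these lines, the kvm_mmu_get_child_sp() and
tdp_mmu_alloc_sp() changes above only need to pass the node derived from
fault->pfn down to the matching per-node mmu_shadow_page_cache[] entry.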