From: Isaku Yamahata <isaku.yamahata@intel.com>
Intel TDX protects guest VMs from malicious host and certain physical
attacks. The TDX module uses pages provided by the host for both control
structures and for TD guest pages. These pages are encrypted using the
MK-TME encryption engine, with its special requirements around cache
invalidation. For its own security, the TDX module ensures pages are
flushed properly and tracks which usage they are currently assigned. For
creating and tearing down TD VMs and vCPUs KVM will need to use the
TDH.PHYMEM.PAGE.RECLAIM, TDH.PHYMEM.CACHE.WB, and TDH.PHYMEM.PAGE.WBINVD
SEAMCALLs.
Add tdh_phymem_page_reclaim() to enable KVM to call
TDH.PHYMEM.PAGE.RECLAIM to reclaim the page for use by the host kernel.
This effectively resets its state in the TDX module's page tracking
(PAMT), if the page is available to be reclaimed. This will be used by KVM
to reclaim the various types of pages owned by the TDX module. It will
have a small wrapper in KVM that retries in the case of a relevant error
code. Don't implement this wrapper in arch/x86 because KVM's solution
around retrying SEAMCALLs will be better located in a single place.
Add tdh_phymem_cache_wb() to enable KVM to call TDH.PHYMEM.CACHE.WB to do
a cache write back in a way that the TDX module can verify, before it
allows a KeyID to be freed. The KVM code will use this to have a small
wrapper that handles retries. Since the TDH.PHYMEM.CACHE.WB operation is
interruptible, have tdh_phymem_cache_wb() take a resume argument to pass
this info to the TDX module for restarts. It is worth noting that this
SEAMCALL uses a SEAM specific MSR to do the write back in sections. In
this way it does export some new functionality that affects CPU state.
Add tdh_phymem_page_wbinvd_tdr() to enable KVM to call
TDH.PHYMEM.PAGE.WBINVD to do a cache write back and invalidate of a TDR,
using the global KeyID. The underlying TDH.PHYMEM.PAGE.WBINVD SEAMCALL
requires the related KeyID to be encoded into the SEAMCALL args. Since the
global KeyID is not exposed to KVM, a dedicated wrapper is needed for TDR
focused TDH.PHYMEM.PAGE.WBINVD operations.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Yuan Yao <yuan.yao@intel.com>
---
uAPI breakout v2:
- Change to use 'u64' as function parameter to prepare to move
SEAMCALL wrappers to arch/x86. (Kai)
- Split to separate patch
- Move SEAMCALL wrappers from KVM to x86 core;
- Move TDH_xx macros from KVM to x86 core;
- Re-write log
uAPI breakout v1:
- Make argument to C wrapper function struct kvm_tdx * or
struct vcpu_tdx *. (Sean)
- Drop unused helpers (Kai)
- Fix bisectability issues in headers (Kai)
- Updates from seamcall overhaul (Kai)
v19:
- Update the commit message to match the patch by Yuan
- Use seamcall() and seamcall_ret() by Paolo
v18:
- removed stub functions for __seamcall{,_ret}()
- Added Reviewed-by Binbin
- Make tdx_seamcall() use struct tdx_module_args instead of taking
each inputs.
v16:
- use struct tdx_module_args instead of struct tdx_module_output
- Add tdh_mem_sept_rd() for SEPT_VE_DISABLE=1.
---
arch/x86/include/asm/tdx.h | 3 +++
arch/x86/virt/vmx/tdx/tdx.c | 44 +++++++++++++++++++++++++++++++++++++
arch/x86/virt/vmx/tdx/tdx.h | 4 +++-
3 files changed, 50 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 6951faa37031..0cf8975759de 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -132,6 +132,9 @@ u64 tdh_mng_key_freeid(u64 tdr);
u64 tdh_mng_init(u64 tdr, u64 td_params, u64 *rcx);
u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx);
u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid);
+u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8);
+u64 tdh_phymem_cache_wb(bool resume);
+u64 tdh_phymem_page_wbinvd_tdr(u64 tdr);
#else
static inline void tdx_init(void) { }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index b3003031e0fe..7e7c2e2360af 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1670,3 +1670,47 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid)
return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_init_apicid);
+
+u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
+{
+ struct tdx_module_args args = {
+ .rcx = page,
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
+
+ /*
+ * Additional error information:
+ *
+ * - RCX: page type
+ * - RDX: owner
+ * - R8: page size (4K, 2M or 1G)
+ */
+ *rcx = args.rcx;
+ *rdx = args.rdx;
+ *r8 = args.r8;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+u64 tdh_phymem_cache_wb(bool resume)
+{
+ struct tdx_module_args args = {
+ .rcx = resume ? 1 : 0,
+ };
+
+ return seamcall(TDH_PHYMEM_CACHE_WB, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
+
+u64 tdh_phymem_page_wbinvd_tdr(u64 tdr)
+{
+ struct tdx_module_args args = {};
+
+ args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits);
+
+ return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 64b6504791e1..191bdd1e571d 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -26,14 +26,16 @@
#define TDH_MNG_INIT 21
#define TDH_VP_INIT 22
#define TDH_PHYMEM_PAGE_RDMD 24
+#define TDH_PHYMEM_PAGE_RECLAIM 28
#define TDH_SYS_KEY_CONFIG 31
#define TDH_SYS_INIT 33
#define TDH_SYS_RD 34
#define TDH_SYS_LP_INIT 35
#define TDH_SYS_TDMR_INIT 36
+#define TDH_PHYMEM_CACHE_WB 40
+#define TDH_PHYMEM_PAGE_WBINVD 41
#define TDH_SYS_CONFIG 45
-
/*
* SEAMCALL leaf:
*
--
2.47.0
On 10/30/24 12:00, Rick Edgecombe wrote: > +u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) > +{ > + struct tdx_module_args args = { > + .rcx = page, > + }; > + u64 ret; This isn't quite what I'm looking for in these wrappers. For instance: > + /* > + * Additional error information: > + * > + * - RCX: page type > + * - RDX: owner > + * - R8: page size (4K, 2M or 1G) > + */ > + *rcx = args.rcx; > + *rdx = args.rdx; > + *r8 = args.r8; If this were, instead: u64 tdh_phymem_page_reclaim(u64 page, u64 *type, u64 *owner, u64 *size) { ... *type = args.rcx; *owner = args.rdx; *size = args.r8; Then you wouldn't need the comment in the first place. Then you could also be thinking about adding _some_ kind of type safety to the arguments. The 'size' or the 'type' could totally be enums. There's really zero value in having wrappers like these. They don't have any type safety or add any readability or make the seamcall easier to use. There's almost no value in having these versus just exporting seamcall_ret() itself.
On Tue, 2024-11-12 at 16:20 -0800, Dave Hansen wrote: > If this were, instead: > > u64 tdh_phymem_page_reclaim(u64 page, u64 *type, u64 *owner, u64 *size) > { > ... > *type = args.rcx; > *owner = args.rdx; > *size = args.r8; > > Then you wouldn't need the comment in the first place. Then you could > also be thinking about adding _some_ kind of type safety to the > arguments. The 'size' or the 'type' could totally be enums. Yes, *rcx and *rdx stand out. > > There's really zero value in having wrappers like these. They don't > have any type safety or add any readability or make the seamcall easier > to use. There's almost no value in having these versus just exporting > seamcall_ret() itself. Hoping to solicit some more thoughts on the value question... I thought the main thing was to not export *all* SEAMCALLs. Future TDX modules could add new leafs that do who-knows-what. For this SEAMCALL wrapper, the only use of the out args is printing them in an error message (based on other logic). So turning them into enums would just add a layer of translation to be decoded. A developer would have to translate them back into the registers they came from to try to extract meaning from the TDX docs. However, some future user of TDH.PHYMEM.PAGE.RECLAIM might want to do something else where the enums could add code clarity. But this goes down the road of building things that are not needed today. Is there value in maintaining a sensible looking API to be exported, even if it is not needed today?
On 11/13/24 12:51, Edgecombe, Rick P wrote: > However, some future user of TDH.PHYMEM.PAGE.RECLAIM might want to do something > else where the enums could add code clarity. But this goes down the road of > building things that are not needed today. Here's why the current code is a bit suboptimal: > +/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ > +static int __tdx_reclaim_page(hpa_t pa) > +{ ... > + for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { > + err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8); ... > +out: > + if (WARN_ON_ONCE(err)) { > + pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); > + return -EIO; > + } > + return 0; > +} Let's say I see the error get spit out on the console. I can't make any sense out of it from this spot. I need to go over to the TDX docs or tdh_phymem_page_reclaim() to look at the *comment* to figure out what these the registers are named. The code as proposed has zero self-documenting properties. It's actually completely non-self-documenting. It isn't _any_ better for readability than just doing: struct tdx_module_args args = {}; for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { args.rcx = pa; err = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); ... } pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, args.rcx, args.rdx, args.r8); Also, this is also showing a lack of naming discipline where things are named. The first argument is 'pa' in here but 'page' on the other side: > +u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) > +{ > + struct tdx_module_args args = { > + .rcx = page, I can't tell you how many recompiles it's cost me when I got lazy about physical addr vs. virtual addr vs. struct page vs. pfn. So, yeah, I'd rather not export seamcall_ret(), but I'd rather do that than have a layer of abstraction that's adding little value while it also brings obfuscation.
On Wed, 2024-11-13 at 13:08 -0800, Dave Hansen wrote: > Let's say I see the error get spit out on the console. I can't make any > sense out of it from this spot. I need to go over to the TDX docs or > tdh_phymem_page_reclaim() to look at the *comment* to figure out what > these the registers are named. > > The code as proposed has zero self-documenting properties. It's > actually completely non-self-documenting. It isn't _any_ better for > readability than just doing: > > struct tdx_module_args args = {}; > > for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { > args.rcx = pa; > err = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); > ... > } > > pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, > args.rcx, args.rdx, args.r8); If we extracted meaning from the registers and printed those, then we would not have any new bits that popped up in there. For example, currently r8 has bits 63:3 described as reserved. While expectations around TDX module behavior changes are still settling, I'd rather have the full register for debugging than an easy to read error message. But we have actually gone down this road a little bit already when we adjusted the KVM calling code to stop manually loading the struct tdx_module_args. > > Also, this is also showing a lack of naming discipline where things are > named. The first argument is 'pa' in here but 'page' on the other side: > > > +u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) > > +{ > > + struct tdx_module_args args = { > > + .rcx = page, > > I can't tell you how many recompiles it's cost me when I got lazy about > physical addr vs. virtual addr vs. struct page vs. pfn. Standardizing on VAs for the SEAMCALL wrappers seems like a good idea. I haven't checked them all, but seems to be promising so far. > > So, yeah, I'd rather not export seamcall_ret(), but I'd rather do that > than have a layer of abstraction that's adding little value while it > also brings obfuscation. In KVM these types can get even more confusing. 
There are guest physical address and virtual addresses as well as the host physical and virtual. So in KVM there is a typedef for host physical addresses: hpa_t. Previously these wrappers used it because they are in KVM code. It was: +static inline u64 tdh_phymem_page_reclaim(hpa_t page, u64 *rcx, u64 *rdx, + u64 *r8) +{ + struct tdx_module_args in = { + .rcx = page, + }; + u64 ret; + + ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &in); + + *rcx = in.rcx; + *rdx = in.rdx; + *r8 = in.r8; + + return ret; +} Moving them to arch/x86 means we need to translate some things between KVM's parlance and the rest of the kernels. This is extra wrapping. Another example that was used in the old SEAMCALL wrappers was gpa_t, which KVM uses to refers to a guest physical address. void * to the host direct map doesn't fit, so we are back to u64 or a new gpa struct (like in the other thread) to speak to the arch/x86 layers. So I think we will need some light layers of abstraction if we keep the wrappers in arch/x86.
On 11/13/24 13:44, Edgecombe, Rick P wrote: > Moving them to arch/x86 means we need to translate some things between KVM's > parlance and the rest of the kernels. This is extra wrapping. Another example > that was used in the old SEAMCALL wrappers was gpa_t, which KVM uses to refers > to a guest physical address. void * to the host direct map doesn't fit, so we > are back to u64 or a new gpa struct (like in the other thread) to speak to the > arch/x86 layers. I have zero issues with non-core x86 code doing a #include <linux/kvm_types.h>. Why not just use the KVM types?
On Wed, 2024-11-13 at 13:50 -0800, Dave Hansen wrote: > On 11/13/24 13:44, Edgecombe, Rick P wrote: > > Moving them to arch/x86 means we need to translate some things between KVM's > > parlance and the rest of the kernels. This is extra wrapping. Another example > > that was used in the old SEAMCALL wrappers was gpa_t, which KVM uses to refers > > to a guest physical address. void * to the host direct map doesn't fit, so we > > are back to u64 or a new gpa struct (like in the other thread) to speak to the > > arch/x86 layers. > > I have zero issues with non-core x86 code doing a #include > <linux/kvm_types.h>. Why not just use the KVM types? You know...I assumed it wouldn't work because of some internal headers. But yea. Nevermind, we can just do that. Probably because the old code also referred to struct kvm_tdx, it just got fully separated. Kai did you attempt this path at all? I think, hand-waving in a general way, having the SEAMCALL wrappers in KVM code will result in at least more marshaling of structs members into function args. But I can't point to any specific problem in our current SEAMCALLs.
On 14/11/2024 11:00 am, Edgecombe, Rick P wrote: > On Wed, 2024-11-13 at 13:50 -0800, Dave Hansen wrote: >> On 11/13/24 13:44, Edgecombe, Rick P wrote: >>> Moving them to arch/x86 means we need to translate some things between KVM's >>> parlance and the rest of the kernels. This is extra wrapping. Another example >>> that was used in the old SEAMCALL wrappers was gpa_t, which KVM uses to refers >>> to a guest physical address. void * to the host direct map doesn't fit, so we >>> are back to u64 or a new gpa struct (like in the other thread) to speak to the >>> arch/x86 layers. >> >> I have zero issues with non-core x86 code doing a #include >> <linux/kvm_types.h>. Why not just use the KVM types? > > You know...I assumed it wouldn't work because of some internal headers. But yea. > Nevermind, we can just do that. Probably because the old code also referred to > struct kvm_tdx, it just got fully separated. Kai did you attempt this path at > all? 'struct kvm_tdx' is a KVM internal structure so we cannot use that in SEAMCALL wrappers in the x86 core. If you are talking about just use KVM types like 'gfn_t/hpa_t' etc (by including <linux/kvm_types.h>) perhaps this is fine. But I didn't try to do in this way. We can try if that's better, but I suppose we should get Sean/Paolo's feedback here?
On Thu, 2024-11-14 at 13:21 +1300, Huang, Kai wrote: > > On 14/11/2024 11:00 am, Edgecombe, Rick P wrote: > > On Wed, 2024-11-13 at 13:50 -0800, Dave Hansen wrote: > > > On 11/13/24 13:44, Edgecombe, Rick P wrote: > > > > Moving them to arch/x86 means we need to translate some things between KVM's > > > > parlance and the rest of the kernels. This is extra wrapping. Another example > > > > that was used in the old SEAMCALL wrappers was gpa_t, which KVM uses to refers > > > > to a guest physical address. void * to the host direct map doesn't fit, so we > > > > are back to u64 or a new gpa struct (like in the other thread) to speak to the > > > > arch/x86 layers. > > > > > > I have zero issues with non-core x86 code doing a #include > > > <linux/kvm_types.h>. Why not just use the KVM types? > > > > You know...I assumed it wouldn't work because of some internal headers. But yea. > > Nevermind, we can just do that. Probably because the old code also referred to > > struct kvm_tdx, it just got fully separated. Kai did you attempt this path at > > all? > > 'struct kvm_tdx' is a KVM internal structure so we cannot use that in > SEAMCALL wrappers in the x86 core. > Yea, makes sense. > If you are talking about just use > KVM types like 'gfn_t/hpa_t' etc (by including <linux/kvm_types.h>) > perhaps this is fine. > > But I didn't try to do in this way. We can try if that's better, but I > suppose we should get Sean/Paolo's feedback here? There are certainly a lot of style considerations here. I'm thinking to post like an RFC. Like a fork to look at Dave's suggestions.
> > So, yeah, I'd rather not export seamcall_ret(), but I'd rather do that > than have a layer of abstraction that's adding little value while it > also brings obfuscation. Just want to provide one more information: Peter posted a series to allow us to export one symbol _only_ for a particular module: https://lore.kernel.org/lkml/20241111105430.575636482@infradead.org/ IIUC we can use that to only export __seamcall*() for KVM. I am not sure whether this addresses the concern of "the exported symbol could be potentially abused by other modules like out-of-tree ones"?
On Thu, 2024-11-14 at 10:25 +1300, Huang, Kai wrote: > > > > So, yeah, I'd rather not export seamcall_ret(), but I'd rather do that > > than have a layer of abstraction that's adding little value while it > > also brings obfuscation. > > Just want to provide one more information: > > Peter posted a series to allow us to export one symbol _only_ for a > particular module: > > https://lore.kernel.org/lkml/20241111105430.575636482@infradead.org/ > > IIUC we can use that to only export __seamcall*() for KVM. > > I am not sure whether this addresses the concern of "the exported symbol > could be potentially abused by other modules like out-of-tree ones"? I think so. It's too bad it's an RFC v1. But maybe we could point to it for the future, if we move the wrappers back into KVM. The other small thing the export does is move the KVM disliked code generation into arch/x86. This is a silly non-technical reason though.
On Wed, Oct 30, 2024 at 12:00:21PM -0700, Rick Edgecombe wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > Add tdh_phymem_page_reclaim() to enable KVM to call > TDH.PHYMEM.PAGE.RECLAIM to reclaim the page for use by the host kernel. > This effectively resets its state in the TDX module's page tracking > (PAMT), if the page is available to be reclaimed. This will be used by KVM > to reclaim the various types of pages owned by the TDX module. It will > have a small wrapper in KVM that retries in the case of a relevant error > code. Don't implement this wrapper in arch/x86 because KVM's solution > around retrying SEAMCALLs will be better located in a single place. With the current KVM code, it looks that KVM may not need the wrapper to retry tdh_phymem_page_reclaim(). The logic of SEAMCALL TDH_PHYMEM_PAGE_RECLAIM is like this: SEAMCALL TDH_PHYMEM_PAGE_RECLAIM: 1.pamt_walk case (a):if to reclaim TDR: get shared lock of 1gb and 2mb pamt entries of TDR page, get exclusive lock of 4k pamt entry of TDR page. case (b):if to reclaim non-TDR & non-TD pages, get shared lock of 1gb and 2mb pamt entries of the page to reclaim, get exclusive lock of 4k pamt entry of the page to reclaim. case (c):if to reclaim TD pages, get exclusive lock of 1gb or 2mb or 4k pamt entry of the page to reclaim, depending on the page size of page to reclaim, get shared lock of pamt entries above the page size. 2.check the exclusively locked pamt entry of page to reclaim (e.g. page type, alignment) 3:case (a):if to reclaim TDR, map and check TDR page case (b)(c):if to reclaim non-TDR pages or TD pages, get shared lock of 4k pamt entry of TDR page, map, check of TDR page, atomically update TDR child cnt. 4.set page type to NDA to the exclusively locked pamt entry of the page to reclaim. 
In summary, ------------------------------------------------------------------------------ page to reclaim | locks --------------------|--------------------------------------------------------- TDR | exclusive lock of 4k pamt entry of TDR page --------------------|--------------------------------------------------------- non-TDR and non-TD | shared lock of 4k pamt entry of TDR page | exclusive lock of 4k pamt entry of page to reclaim --------------------|--------------------------------------------------------- TD page | shared lock of 4k pamt entry of TDR page | exclusive lock of pamt entry of size of page to reclaim ------------------------------------------------------------------------------ When TD is tearing down, - TD pages are removed and freed when hkid is assigned, so tdh_phymem_page_reclaim() will not be called for them. - after vt_vm_destroy() releasing the hkid, kvm_arch_destroy_vm() calls kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim TDCX/TDVPR/EPT/TDR pages sequentially in a single thread. So, there should be no contentions expected for current KVM to call tdh_phymem_page_reclaim(). > +u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) > +{ > + struct tdx_module_args args = { > + .rcx = page, > + }; > + u64 ret; > + > + ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); > + > + /* > + * Additional error information: > + * > + * - RCX: page type > + * - RDX: owner > + * - R8: page size (4K, 2M or 1G) > + */ > + *rcx = args.rcx; > + *rdx = args.rdx; > + *r8 = args.r8; > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
On Thu, 2024-10-31 at 11:57 +0800, Yan Zhao wrote: > When TD is tearing down, > - TD pages are removed and freed when hkid is assigned, so > tdh_phymem_page_reclaim() will not be called for them. > - after vt_vm_destroy() releasing the hkid, kvm_arch_destroy_vm() calls > kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim > TDCX/TDVPR/EPT/TDR pages sequentially in a single thread. > > So, there should be no contentions expected for current KVM to call > tdh_phymem_page_reclaim(). This links into the question of how much of the wrappers should be in KVM code vs arch/x86. I got the impression Dave would like to not see SEAMCALLs just getting wrapped on KVM's side with what it really needs. Towards that, it could be tempting to move tdx_reclaim_page() (see "[PATCH v2 17/25] KVM: TDX: create/destroy VM structure") into arch/x86 and have arch/x86 handle the tdx_clear_page() part too. That would also be more symmetric with what arch/x86 already does for clflush on the calls that hand pages to the TDX module. But the analysis of why we don't need to worry about TDX_OPERAND_BUSY is based on KVM's current use of tdh_phymem_page_reclaim(). So KVM still has to be the one to reason about TDX_OPERAND_BUSY, and the more we wrap the low level SEAMCALLs, the more brittle and spread out the solution to dance around the TDX module locks becomes. 
I took a look at dropping the retry loop and moving tdx_reclaim_page() into arch/x86 anyway: arch/x86/include/asm/tdx.h | 3 +-- arch/x86/kvm/vmx/tdx.c | 74 ++++------------------------------------------ ---------------------------- arch/x86/virt/vmx/tdx/tdx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 55 insertions(+), 85 deletions(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 051465261155..790d6d99d895 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -145,13 +145,12 @@ u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx); u64 tdh_vp_rd(u64 tdvpr, u64 field, u64 *data); u64 tdh_vp_wr(u64 tdvpr, u64 field, u64 data, u64 mask); u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid); -u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8); +u64 tdx_reclaim_page(u64 pa, bool wbind); u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx); u64 tdh_mem_sept_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx); u64 tdh_mem_track(u64 tdr); u64 tdh_mem_range_unblock(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx); u64 tdh_phymem_cache_wb(bool resume); -u64 tdh_phymem_page_wbinvd_tdr(u64 tdr); u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid); #else static inline void tdx_init(void) { } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 0ee8ec86d02a..aca73d942344 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -291,67 +291,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_clear_page(unsigned long page_pa) -{ - const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); - void *page = __va(page_pa); - unsigned long i; - - /* - * The page could have been poisoned. MOVDIR64B also clears - * the poison bit so the kernel can safely use the page again. 
- */ - for (i = 0; i < PAGE_SIZE; i += 64) - movdir64b(page + i, zero_page); - /* - * MOVDIR64B store uses WC buffer. Prevent following memory reads - * from seeing potentially poisoned cache. - */ - __mb(); -} - -/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ -static int __tdx_reclaim_page(hpa_t pa) -{ - u64 err, rcx, rdx, r8; - int i; - - for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { - err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8); - - /* - * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown. - * state. i.e. destructing TD. - * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page. - * Because we're destructing TD, it's rare to contend with TDR. - */ - switch (err) { - case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX: - case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR: - cond_resched(); - continue; - default: - goto out; - } - } - -out: - if (WARN_ON_ONCE(err)) { - pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); - return -EIO; - } - return 0; -} - -static int tdx_reclaim_page(hpa_t pa) -{ - int r; - - r = __tdx_reclaim_page(pa); - if (!r) - tdx_clear_page(pa); - return r; -} /* @@ -365,7 +304,7 @@ static void tdx_reclaim_control_page(unsigned long ctrl_page_pa) * Leak the page if the kernel failed to reclaim the page. * The kernel cannot use it safely anymore. */ - if (tdx_reclaim_page(ctrl_page_pa)) + if (tdx_reclaim_page(ctrl_page_pa, false)) return; free_page((unsigned long)__va(ctrl_page_pa)); @@ -581,20 +520,16 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) if (!kvm_tdx->tdr_pa) return; - if (__tdx_reclaim_page(kvm_tdx->tdr_pa)) - return; - /* * Use a SEAMCALL to ask the TDX module to flush the cache based on the * KeyID. TDX module may access TDR while operating on TD (Especially * when it is reclaiming TDCS). 
*/ - err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa); + err = tdx_reclaim_page(kvm_tdx->tdr_pa, true); if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + pr_tdx_error(tdx_reclaim_page, err); return; } - tdx_clear_page(kvm_tdx->tdr_pa); free_page((unsigned long)__va(kvm_tdx->tdr_pa)); kvm_tdx->tdr_pa = 0; @@ -1694,7 +1629,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return -EIO; } - tdx_clear_page(hpa); tdx_unpin(kvm, pfn); return 0; } @@ -1805,7 +1739,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, * The HKID assigned to this TD was already freed and cache was * already flushed. We don't have to flush again. */ - return tdx_reclaim_page(__pa(private_spt)); + return tdx_reclaim_page(__pa(private_spt), false); } int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index bad83f6a3b0c..bb7cdb867581 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1892,7 +1892,7 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid) } EXPORT_SYMBOL_GPL(tdh_vp_init_apicid); -u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) +static u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) { struct tdx_module_args args = { .rcx = page, @@ -1914,7 +1914,49 @@ u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8) return ret; } -EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); + +static void tdx_clear_page(unsigned long page_pa) +{ + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + void *page = __va(page_pa); + unsigned long i; + + /* + * The page could have been poisoned. MOVDIR64B also clears + * the poison bit so the kernel can safely use the page again. + */ + for (i = 0; i < PAGE_SIZE; i += 64) + movdir64b(page + i, zero_page); + /* + * MOVDIR64B store uses WC buffer. 
Prevent following memory reads + * from seeing potentially poisoned cache. + */ + __mb(); +} + +/* + * tdx_reclaim_page() calls tdh_phymem_page_reclaim() internally. Callers should + * be prepared to handle TDX_OPERAND_BUSY. + * If return code is not an error, page has been cleared with MOVDIR64. + */ +u64 tdx_reclaim_page(u64 pa, bool wbind_global_key) +{ + u64 rcx, rdx, r8; + u64 r; + + r = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8); + if (r) + return r; + + /* tdh_phymem_page_wbinvd_hkid() will do tdx_clear_page() */ + if (wbind_global_key) + return tdh_phymem_page_wbinvd_hkid(pa, tdx_global_keyid); + + tdx_clear_page(pa); + + return r; +} +EXPORT_SYMBOL_GPL(tdx_reclaim_page); u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx) { @@ -1987,22 +2029,17 @@ u64 tdh_phymem_cache_wb(bool resume) } EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); -u64 tdh_phymem_page_wbinvd_tdr(u64 tdr) -{ - struct tdx_module_args args = {}; - - args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits); - - return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); -} -EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); - u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid) { struct tdx_module_args args = {}; + u64 err; args.rcx = hpa | (hkid << boot_cpu_data.x86_phys_bits); - return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); + err = seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); + if (!err) + tdx_clear_page(hpa); + + return err; } EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
On 1/11/2024 7:57 am, Edgecombe, Rick P wrote: > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c > index bad83f6a3b0c..bb7cdb867581 100644 > --- a/arch/x86/virt/vmx/tdx/tdx.c > +++ b/arch/x86/virt/vmx/tdx/tdx.c ... > +static void tdx_clear_page(unsigned long page_pa) > +{ > + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); > + void *page = __va(page_pa); > + unsigned long i; > + > + /* > + * The page could have been poisoned. MOVDIR64B also clears > + * the poison bit so the kernel can safely use the page again. > + */ > + for (i = 0; i < PAGE_SIZE; i += 64) > + movdir64b(page + i, zero_page); > + /* > + * MOVDIR64B store uses WC buffer. Prevent following memory reads > + * from seeing potentially poisoned cache. > + */ > + __mb(); > +} Just FYI there's already one reset_tdx_pages() doing the same thing in x86 tdx.c: /* * Convert TDX private pages back to normal by using MOVDIR64B to * clear these pages. Note this function doesn't flush cache of * these TDX private pages. The caller should make sure of that. */ static void reset_tdx_pages(unsigned long base, unsigned long size) { const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); unsigned long phys, end; end = base + size; for (phys = base; phys < end; phys += 64) movdir64b(__va(phys), zero_page); /* * MOVDIR64B uses WC protocol. Use memory barrier to * make sure any later user of these pages sees the * updated data. */ mb(); }
© 2016 - 2024 Red Hat, Inc.