From: Xiaoyao Li <xiaoyao.li@intel.com>
Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE
to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the
TDX module only supports demotion of a 2M huge leaf entry. After a
successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a
non-leaf entry, linking to the newly-added page table page. The newly
linked page table page then contains 512 leaf entries, pointing to the 2M
guest private pages.
The "gpa" and "level" direct the TDX module to search and find the old
huge leaf entry.
As the new non-leaf entry points to a page table page, callers need to
pass in the page table page in parameter "page".
In case of S-EPT walk failure, the entry, level and state where the error
was detected are returned in ext_err1 and ext_err2.
On interrupt pending, SEAMCALL TDH_MEM_PAGE_DEMOTE returns error
TDX_INTERRUPTED_RESTARTABLE.
[Yan: Rebased and split patch, wrote changelog]
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
arch/x86/include/asm/tdx.h | 2 ++
arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++
arch/x86/virt/vmx/tdx/tdx.h | 1 +
3 files changed, 23 insertions(+)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 26ffc792e673..08eff4b2f5e7 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -177,6 +177,8 @@ u64 tdh_mng_key_config(struct tdx_td *td);
u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
+u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
+ u64 *ext_err1, u64 *ext_err2);
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mr_finalize(struct tdx_td *td);
u64 tdh_vp_flush(struct tdx_vp *vp);
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index a66d501b5677..5699dfe500d9 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1684,6 +1684,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
}
EXPORT_SYMBOL_GPL(tdh_mng_rd);
+u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
+ u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa | level,
+ .rdx = tdx_tdr_pa(td),
+ .r8 = page_to_phys(page),
+ };
+ u64 ret;
+
+ tdx_clflush_page(page);
+ ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_page_demote);
+
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 82bb82be8567..b4dc6b86d40a 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -24,6 +24,7 @@
#define TDH_MNG_KEY_CONFIG 8
#define TDH_MNG_CREATE 9
#define TDH_MNG_RD 11
+#define TDH_MEM_PAGE_DEMOTE 15
#define TDH_MR_EXTEND 16
#define TDH_MR_FINALIZE 17
#define TDH_VP_FLUSH 18
--
2.43.2
On Thu, 2025-04-24 at 11:04 +0800, Yan Zhao wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
>
> Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE
> to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the
> TDX module only supports demotion of a 2M huge leaf entry. After a
> successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a
> non-leaf entry, linking to the newly-added page table page. The newly
> linked page table page then contains 512 leaf entries, pointing to the 2M
> guest private pages.
>
> The "gpa" and "level" direct the TDX module to search and find the old
> huge leaf entry.
>
> As the new non-leaf entry points to a page table page, callers need to
> pass in the page table page in parameter "page".
>
> In case of S-EPT walk failure, the entry, level and state where the error
> was detected are returned in ext_err1 and ext_err2.
>
> On interrupt pending, SEAMCALL TDH_MEM_PAGE_DEMOTE returns error
> TDX_INTERRUPTED_RESTARTABLE.
>
> [Yan: Rebased and split patch, wrote changelog]
We should add the level of detail here like we did for the base series ones.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
> arch/x86/include/asm/tdx.h | 2 ++
> arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++
> arch/x86/virt/vmx/tdx/tdx.h | 1 +
> 3 files changed, 23 insertions(+)
>
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 26ffc792e673..08eff4b2f5e7 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -177,6 +177,8 @@ u64 tdh_mng_key_config(struct tdx_td *td);
> u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
> u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
> u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
> +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> + u64 *ext_err1, u64 *ext_err2);
> u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
> u64 tdh_mr_finalize(struct tdx_td *td);
> u64 tdh_vp_flush(struct tdx_vp *vp);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index a66d501b5677..5699dfe500d9 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1684,6 +1684,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
> }
> EXPORT_SYMBOL_GPL(tdh_mng_rd);
>
> +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> + u64 *ext_err1, u64 *ext_err2)
> +{
> + struct tdx_module_args args = {
> + .rcx = gpa | level,
This will only ever be level 2MB, how about dropping the arg?
> + .rdx = tdx_tdr_pa(td),
> + .r8 = page_to_phys(page),
> + };
> + u64 ret;
> +
> + tdx_clflush_page(page);
> + ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
> +
> + *ext_err1 = args.rcx;
> + *ext_err2 = args.rdx;
How about we just call these entry and level_state, like the caller.
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(tdh_mem_page_demote);
Looking in the docs, TDX module gives some somewhat constrained guidance:
1. TDH.MEM.PAGE.DEMOTE should be invoked in a loop until it terminates
successfully.
2. The host VMM should be designed to avoid cases where interrupt storms prevent
successful completion of TDH.MEM.PAGE.DEMOTE.
The caller looks like:
do {
err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
&entry, &level_state);
} while (err == TDX_INTERRUPTED_RESTARTABLE);
if (unlikely(tdx_operand_busy(err))) {
tdx_no_vcpus_enter_start(kvm);
err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
&entry, &level_state);
tdx_no_vcpus_enter_stop(kvm);
}
The brute force second case could also be subjected to a
TDX_INTERRUPTED_RESTARTABLE and is not handled. As for interrupt storms, I guess
we could disable interrupts while we do the second brute force case. So the
TDX_INTERRUPTED_RESTARTABLE loop could have a max retries, and the brute force
case could also disable interrupts.
Hmm, how to pick the max retries count. It's a tradeoff between interrupt
latency and DOS/code complexity. Do we have any idea how long demote might take?
On Wed, May 14, 2025 at 02:19:56AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:04 +0800, Yan Zhao wrote:
> > From: Xiaoyao Li <xiaoyao.li@intel.com>
> >
> > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE
> > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the
> > TDX module only supports demotion of a 2M huge leaf entry. After a
> > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a
> > non-leaf entry, linking to the newly-added page table page. The newly
> > linked page table page then contains 512 leaf entries, pointing to the 2M
> > guest private pages.
> >
> > The "gpa" and "level" direct the TDX module to search and find the old
> > huge leaf entry.
> >
> > As the new non-leaf entry points to a page table page, callers need to
> > pass in the page table page in parameter "page".
> >
> > In case of S-EPT walk failure, the entry, level and state where the error
> > was detected are returned in ext_err1 and ext_err2.
> >
> > On interrupt pending, SEAMCALL TDH_MEM_PAGE_DEMOTE returns error
> > TDX_INTERRUPTED_RESTARTABLE.
> >
> > [Yan: Rebased and split patch, wrote changelog]
>
> We should add the level of detail here like we did for the base series ones.
I'll provide changelog details under "---" of each patch in the next version.
> >
> > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> > arch/x86/include/asm/tdx.h | 2 ++
> > arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++
> > arch/x86/virt/vmx/tdx/tdx.h | 1 +
> > 3 files changed, 23 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > index 26ffc792e673..08eff4b2f5e7 100644
> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -177,6 +177,8 @@ u64 tdh_mng_key_config(struct tdx_td *td);
> > u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
> > u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
> > u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
> > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > + u64 *ext_err1, u64 *ext_err2);
> > u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
> > u64 tdh_mr_finalize(struct tdx_td *td);
> > u64 tdh_vp_flush(struct tdx_vp *vp);
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index a66d501b5677..5699dfe500d9 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -1684,6 +1684,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
> > }
> > EXPORT_SYMBOL_GPL(tdh_mng_rd);
> >
> > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > + u64 *ext_err1, u64 *ext_err2)
> > +{
> > + struct tdx_module_args args = {
> > + .rcx = gpa | level,
>
> This will only ever be level 2MB, how about dropping the arg?
Do you mean hardcoding level to be 2MB in tdh_mem_page_demote()?
The SEAMCALL TDH_MEM_PAGE_DEMOTE supports level of 1GB in current TDX module.
> > + .rdx = tdx_tdr_pa(td),
> > + .r8 = page_to_phys(page),
> > + };
> > + u64 ret;
> > +
> > + tdx_clflush_page(page);
> > + ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
> > +
> > + *ext_err1 = args.rcx;
> > + *ext_err2 = args.rdx;
>
> How about we just call these entry and level_state, like the caller.
Not sure, but I feel that ext_err* might be better, because
- according to the spec,
a) the args.rcx, args.rdx is unmodified (i.e. still hold the passed-in value)
in case of error TDX_INTERRUPTED_RESTARTABLE.
b) args.rcx, args.rdx can only be interpreted as entry and level_state in case
of EPT walk error.
c) in other cases, they are 0.
- consistent with tdh_mem_page_aug(), tdh_mem_range_block()...
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(tdh_mem_page_demote);
>
> Looking in the docs, TDX module gives some somewhat constrained guidance:
> 1. TDH.MEM.PAGE.DEMOTE should be invoked in a loop until it terminates
> successfully.
> 2. The host VMM should be designed to avoid cases where interrupt storms prevent
> successful completion of TDH.MEM.PAGE.DEMOTE.
>
> The caller looks like:
> do {
> err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> &entry, &level_state);
> } while (err == TDX_INTERRUPTED_RESTARTABLE);
>
> if (unlikely(tdx_operand_busy(err))) {
> tdx_no_vcpus_enter_start(kvm);
> err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> &entry, &level_state);
> tdx_no_vcpus_enter_stop(kvm);
> }
>
> The brute force second case could also be subjected to a
> TDX_INTERRUPTED_RESTARTABLE and is not handled. As for interrupt storms, I guess
You are right.
> we could disable interrupts while we do the second brute force case. So the
> TDX_INTERRUPTED_RESTARTABLE loop could have a max retries, and the brute force
> case could also disable interrupts.
Good idea.
> Hmm, how to pick the max retries count. It's a tradeoff between interrupt
> latency and DOS/code complexity. Do we have any idea how long demote might take?
I did a brief test on my SPR, where the host was not busy :
tdh_mem_page_demote() was called 142 times, with each invocation taking around
10us.
2 invocations were due to TDX_INTERRUPTED_RESTARTABLE.
For each GFN, at most 1 retry was performed.
I will do more investigations.
On Thu, 2025-05-15 at 16:26 +0800, Yan Zhao wrote:
> On Wed, May 14, 2025 at 02:19:56AM +0800, Edgecombe, Rick P wrote:
> > On Thu, 2025-04-24 at 11:04 +0800, Yan Zhao wrote:
> > > From: Xiaoyao Li <xiaoyao.li@intel.com>
> > >
> > > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE
> > > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the
> > > TDX module only supports demotion of a 2M huge leaf entry. After a
> > > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a
> > > non-leaf entry, linking to the newly-added page table page. The newly
> > > linked page table page then contains 512 leaf entries, pointing to the 2M
> > > guest private pages.
> > >
> > > The "gpa" and "level" direct the TDX module to search and find the old
> > > huge leaf entry.
> > >
> > > As the new non-leaf entry points to a page table page, callers need to
> > > pass in the page table page in parameter "page".
> > >
> > > In case of S-EPT walk failure, the entry, level and state where the error
> > > was detected are returned in ext_err1 and ext_err2.
> > >
> > > On interrupt pending, SEAMCALL TDH_MEM_PAGE_DEMOTE returns error
> > > TDX_INTERRUPTED_RESTARTABLE.
> > >
> > > [Yan: Rebased and split patch, wrote changelog]
> >
> > We should add the level of detail here like we did for the base series ones.
> I'll provide changelog details under "---" of each patch in the next version.
I mean the commit log (above the "---") needs the same tip style treatment as
the other SEAMCALL wrapper patches.
>
> > >
> > > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > > ---
> > > arch/x86/include/asm/tdx.h | 2 ++
> > > arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++
> > > arch/x86/virt/vmx/tdx/tdx.h | 1 +
> > > 3 files changed, 23 insertions(+)
> > >
> > > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > > index 26ffc792e673..08eff4b2f5e7 100644
> > > --- a/arch/x86/include/asm/tdx.h
> > > +++ b/arch/x86/include/asm/tdx.h
> > > @@ -177,6 +177,8 @@ u64 tdh_mng_key_config(struct tdx_td *td);
> > > u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
> > > u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
> > > u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
> > > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > > + u64 *ext_err1, u64 *ext_err2);
> > > u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
> > > u64 tdh_mr_finalize(struct tdx_td *td);
> > > u64 tdh_vp_flush(struct tdx_vp *vp);
> > > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > > index a66d501b5677..5699dfe500d9 100644
> > > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > > @@ -1684,6 +1684,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
> > > }
> > > EXPORT_SYMBOL_GPL(tdh_mng_rd);
> > >
> > > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > > + u64 *ext_err1, u64 *ext_err2)
> > > +{
> > > + struct tdx_module_args args = {
> > > + .rcx = gpa | level,
> >
> > This will only ever be level 2MB, how about dropping the arg?
> Do you mean hardcoding level to be 2MB in tdh_mem_page_demote()?
Yea, we don't support 1GB, so the level arg on the wrapper is superfluous.
>
> The SEAMCALL TDH_MEM_PAGE_DEMOTE supports level of 1GB in current TDX module.
>
> > > + .rdx = tdx_tdr_pa(td),
> > > + .r8 = page_to_phys(page),
> > > + };
> > > + u64 ret;
> > > +
> > > + tdx_clflush_page(page);
> > > + ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
> > > +
> > > + *ext_err1 = args.rcx;
> > > + *ext_err2 = args.rdx;
> >
> > How about we just call these entry and level_state, like the caller.
> Not sure, but I feel that ext_err* might be better, because
> - according to the spec,
> a) the args.rcx, args.rdx is unmodified (i.e. still hold the passed-in value)
> in case of error TDX_INTERRUPTED_RESTARTABLE.
> b) args.rcx, args.rdx can only be interpreted as entry and level_state in case
> of EPT walk error.
> c) in other cases, they are 0.
> - consistent with tdh_mem_page_aug(), tdh_mem_range_block()...
Yea, it's consistent. I'm ok leaving it as is.
>
>
> > > +
> > > + return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(tdh_mem_page_demote);
> >
> > Looking in the docs, TDX module gives some somewhat constrained guidance:
> > 1. TDH.MEM.PAGE.DEMOTE should be invoked in a loop until it terminates
> > successfully.
> > 2. The host VMM should be designed to avoid cases where interrupt storms prevent
> > successful completion of TDH.MEM.PAGE.DEMOTE.
> >
> > The caller looks like:
> > do {
> > err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> > &entry, &level_state);
> > } while (err == TDX_INTERRUPTED_RESTARTABLE);
> >
> > if (unlikely(tdx_operand_busy(err))) {
> > tdx_no_vcpus_enter_start(kvm);
> > err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> > &entry, &level_state);
> > tdx_no_vcpus_enter_stop(kvm);
> > }
> >
> > The brute force second case could also be subjected to a
> > TDX_INTERRUPTED_RESTARTABLE and is not handled. As for interrupt storms, I guess
> You are right.
>
> > we could disable interrupts while we do the second brute force case. So the
> > TDX_INTERRUPTED_RESTARTABLE loop could have a max retries, and the brute force
> > case could also disable interrupts.
> Good idea.
>
> > Hmm, how to pick the max retries count. It's a tradeoff between interrupt
> > latency and DOS/code complexity. Do we have any idea how long demote might take?
> I did a brief test on my SPR, where the host was not busy :
> tdh_mem_page_demote() was called 142 times, with each invocation taking around
> 10us.
10us doesn't seem too bad? Makes me think to not loop and instead just do a
single retry with interrupts disabled. We should definitely add the data based
reasoning to the log.
The counter point is that the SEAMCALL must be supporting
TDX_INTERRUPTED_RESTARTABLE for a reason. And the reason probably is that it
sometimes takes longer than someone that was reasonable. Maybe we should ask TDX
module folks if there is any history.
> 2 invocations were due to TDX_INTERRUPTED_RESTARTABLE.
> For each GFN, at most 1 retry was performed.
>
> I will do more investigations.
On Thu, 2025-05-15 at 10:28 -0700, Rick Edgecombe wrote: > > I did a brief test on my SPR, where the host was not busy : > > tdh_mem_page_demote() was called 142 times, with each invocation taking > > around > > 10us. > > 10us doesn't seem too bad? Makes me think to not loop and instead just do a > single retry with interrupts disabled. We should definitely add the data based > reasoning to the log. > > The counter point is that the SEAMCALL must be supporting > TDX_INTERRUPTED_RESTARTABLE for a reason. And the reason probably is that it > sometimes takes longer than someone that was reasonable. Maybe we should ask > TDX > module folks if there is any history. Circling back here. After some research/discussion it seems demote should not take too long such that it should need the option to return TDX_INTERRUPTED_RESTARTABLE. Even in the dynamic PAMT case. The details of how to get this changed and documented are still ongoing, but for v2 I say we close this by expecting it to never return TDX_INTERRUPTED_RESTARTABLE. For now it can be a VM_BUG_ON() case, with the expectation that TDX module will update to make the logic valid. Sound good?
On Fri, May 16, 2025 at 01:28:52AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-05-15 at 16:26 +0800, Yan Zhao wrote:
> > On Wed, May 14, 2025 at 02:19:56AM +0800, Edgecombe, Rick P wrote:
> > > On Thu, 2025-04-24 at 11:04 +0800, Yan Zhao wrote:
> > > > From: Xiaoyao Li <xiaoyao.li@intel.com>
> > > >
> > > > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE
> > > > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the
> > > > TDX module only supports demotion of a 2M huge leaf entry. After a
> > > > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a
> > > > non-leaf entry, linking to the newly-added page table page. The newly
> > > > linked page table page then contains 512 leaf entries, pointing to the 2M
> > > > guest private pages.
> > > >
> > > > The "gpa" and "level" direct the TDX module to search and find the old
> > > > huge leaf entry.
> > > >
> > > > As the new non-leaf entry points to a page table page, callers need to
> > > > pass in the page table page in parameter "page".
> > > >
> > > > In case of S-EPT walk failure, the entry, level and state where the error
> > > > was detected are returned in ext_err1 and ext_err2.
> > > >
> > > > On interrupt pending, SEAMCALL TDH_MEM_PAGE_DEMOTE returns error
> > > > TDX_INTERRUPTED_RESTARTABLE.
> > > >
> > > > [Yan: Rebased and split patch, wrote changelog]
> > >
> > > We should add the level of detail here like we did for the base series ones.
> > I'll provide changelog details under "---" of each patch in the next version.
>
> I mean the commit log (above the "---") needs the same tip style treatment as
> the other SEAMCALL wrapper patches.
I thought I have followed the style.
Sorry that if you think the commit msg is too simple without showing details
of this SEAMCALL. I can provide a detailed on in the next version if that's the
concern you mentioned above.
> >
> > > >
> > > > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > > > ---
> > > > arch/x86/include/asm/tdx.h | 2 ++
> > > > arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++
> > > > arch/x86/virt/vmx/tdx/tdx.h | 1 +
> > > > 3 files changed, 23 insertions(+)
> > > >
> > > > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > > > index 26ffc792e673..08eff4b2f5e7 100644
> > > > --- a/arch/x86/include/asm/tdx.h
> > > > +++ b/arch/x86/include/asm/tdx.h
> > > > @@ -177,6 +177,8 @@ u64 tdh_mng_key_config(struct tdx_td *td);
> > > > u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
> > > > u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
> > > > u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
> > > > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > > > + u64 *ext_err1, u64 *ext_err2);
> > > > u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
> > > > u64 tdh_mr_finalize(struct tdx_td *td);
> > > > u64 tdh_vp_flush(struct tdx_vp *vp);
> > > > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > > > index a66d501b5677..5699dfe500d9 100644
> > > > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > > > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > > > @@ -1684,6 +1684,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
> > > > }
> > > > EXPORT_SYMBOL_GPL(tdh_mng_rd);
> > > >
> > > > +u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *page,
> > > > + u64 *ext_err1, u64 *ext_err2)
> > > > +{
> > > > + struct tdx_module_args args = {
> > > > + .rcx = gpa | level,
> > >
> > > This will only ever be level 2MB, how about dropping the arg?
> > Do you mean hardcoding level to be 2MB in tdh_mem_page_demote()?
>
> Yea, we don't support 1GB, so the level arg on the wrapper is superfluous.
I'm not sure. It's not like tdh_mem_page_add() where the TDX module just only
supports 4KB.
But your point that permitting 1GB in tdh_mem_page_demote() in x86 code until
after KVM TDX code adds 1GB support also makes sense.
> > The SEAMCALL TDH_MEM_PAGE_DEMOTE supports level of 1GB in current TDX module.
> >
> > > > + .rdx = tdx_tdr_pa(td),
> > > > + .r8 = page_to_phys(page),
> > > > + };
> > > > + u64 ret;
> > > > +
> > > > + tdx_clflush_page(page);
> > > > + ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
> > > > +
> > > > + *ext_err1 = args.rcx;
> > > > + *ext_err2 = args.rdx;
On 4/24/2025 11:04 AM, Yan Zhao wrote: > From: Xiaoyao Li <xiaoyao.li@intel.com> > > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the > TDX module only supports demotion of a 2M huge leaf entry. After a > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a > non-leaf entry, linking to the newly-added page table page. The newly > linked page table page then contains 512 leaf entries, pointing to the 2M 2M or 4K? > guest private pages. [...]
On Fri, Apr 25, 2025 at 03:12:32PM +0800, Binbin Wu wrote: > > > On 4/24/2025 11:04 AM, Yan Zhao wrote: > > From: Xiaoyao Li <xiaoyao.li@intel.com> > > > > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE > > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the > > TDX module only supports demotion of a 2M huge leaf entry. After a > > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a > > non-leaf entry, linking to the newly-added page table page. The newly > > linked page table page then contains 512 leaf entries, pointing to the 2M > > 2M or 4K? The 512 leaf entries point to 2M guest private pages together, each pointing to 4K. > > guest private pages.
On 4/25/2025 3:17 PM, Yan Zhao wrote: > On Fri, Apr 25, 2025 at 03:12:32PM +0800, Binbin Wu wrote: >> >> On 4/24/2025 11:04 AM, Yan Zhao wrote: >>> From: Xiaoyao Li <xiaoyao.li@intel.com> >>> >>> Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE >>> to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the >>> TDX module only supports demotion of a 2M huge leaf entry. After a >>> successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a >>> non-leaf entry, linking to the newly-added page table page. The newly >>> linked page table page then contains 512 leaf entries, pointing to the 2M >> 2M or 4K? > The 512 leaf entries point to 2M guest private pages together, If this, it should be 2M range, since it's not a huge page after demotion. Also, the plural "pages" is confusing. > each pointing to > 4K. > >>> guest private pages.
On Fri, Apr 25, 2025 at 03:25:12PM +0800, Binbin Wu wrote: > > > On 4/25/2025 3:17 PM, Yan Zhao wrote: > > On Fri, Apr 25, 2025 at 03:12:32PM +0800, Binbin Wu wrote: > > > > > > On 4/24/2025 11:04 AM, Yan Zhao wrote: > > > > From: Xiaoyao Li <xiaoyao.li@intel.com> > > > > > > > > Add a wrapper tdh_mem_page_demote() to invoke SEAMCALL TDH_MEM_PAGE_DEMOTE > > > > to demote a huge leaf entry to a non-leaf entry in S-EPT. Currently, the > > > > TDX module only supports demotion of a 2M huge leaf entry. After a > > > > successful demotion, the old 2M huge leaf entry in S-EPT is replaced with a > > > > non-leaf entry, linking to the newly-added page table page. The newly > > > > linked page table page then contains 512 leaf entries, pointing to the 2M > > > 2M or 4K? > > The 512 leaf entries point to 2M guest private pages together, > If this, it should be 2M range, since it's not a huge page after demotion. > Also, the plural "pages" is confusing. Ah, indeed, plural "pages" is confiusing :) Maybe below is better: The newly linked page table now contains 512 leaf entries, each pointing to a 4K guest private page within the 2M range. > > each pointing to > > 4K. > > > > > > guest private pages. >
© 2016 - 2025 Red Hat, Inc.