[RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers

Posted by Kirill A. Shutemov 9 months, 1 week ago
Introduce a pair of helpers to allocate and free PAMT memory for a given
2M range. The range is identified by a struct page for any memory within
it, and the PAMT memory itself is passed around as a list of pages.

Use per-2M refcounts to detect when PAMT memory has to be allocated and
when it can be freed.

The pamt_lock spinlock serializes races between multiple tdx_pamt_add()
calls, as well as between tdx_pamt_add() and tdx_pamt_put().

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/tdx.h   |   2 +
 arch/x86/kvm/vmx/tdx.c       | 123 +++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/tdx_errno.h |   1 +
 3 files changed, 126 insertions(+)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 8091bf5b43cc..42449c054938 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -135,6 +135,8 @@ static inline int tdx_nr_pamt_pages(const struct tdx_sys_info *sysinfo)
 	return sysinfo->tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
 }
 
+atomic_t *tdx_get_pamt_refcount(unsigned long hpa);
+
 int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..ea7e2d93fb44 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -207,6 +207,10 @@ static bool tdx_operand_busy(u64 err)
 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
 }
 
+static bool tdx_hpa_range_not_free(u64 err)
+{
+	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_HPA_RANGE_NOT_FREE;
+}
 
 /*
  * A per-CPU list of TD vCPUs associated with a given CPU.
@@ -276,6 +280,125 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
+static DEFINE_SPINLOCK(pamt_lock);
+
+static void tdx_free_pamt_pages(struct list_head *pamt_pages)
+{
+	struct page *page;
+
+	while ((page = list_first_entry_or_null(pamt_pages, struct page, lru))) {
+		list_del(&page->lru);
+		__free_page(page);
+	}
+}
+
+static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
+{
+	for (int i = 0; i < tdx_nr_pamt_pages(tdx_sysinfo); i++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+		if (!page)
+			goto fail;
+		list_add(&page->lru, pamt_pages);
+	}
+	return 0;
+fail:
+	tdx_free_pamt_pages(pamt_pages);
+	return -ENOMEM;
+}
+
+static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
+			struct list_head *pamt_pages)
+{
+	u64 err;
+
+	hpa = ALIGN_DOWN(hpa, SZ_2M);
+
+	spin_lock(&pamt_lock);
+
+	/* Lost race to other tdx_pamt_add() */
+	if (atomic_read(pamt_refcount) != 0) {
+		atomic_inc(pamt_refcount);
+		spin_unlock(&pamt_lock);
+		tdx_free_pamt_pages(pamt_pages);
+		return 0;
+	}
+
+	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
+
+	if (err)
+		tdx_free_pamt_pages(pamt_pages);
+
+	/*
+	 * tdx_hpa_range_not_free() is true if current task won race
+	 * against tdx_pamt_put().
+	 */
+	if (err && !tdx_hpa_range_not_free(err)) {
+		spin_unlock(&pamt_lock);
+		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
+		return -EIO;
+	}
+
+	atomic_set(pamt_refcount, 1);
+	spin_unlock(&pamt_lock);
+	return 0;
+}
+
+static int tdx_pamt_get(struct page *page)
+{
+	unsigned long hpa = page_to_phys(page);
+	atomic_t *pamt_refcount;
+	LIST_HEAD(pamt_pages);
+
+	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
+		return 0;
+
+	pamt_refcount = tdx_get_pamt_refcount(hpa);
+	WARN_ON_ONCE(atomic_read(pamt_refcount) < 0);
+
+	if (atomic_inc_not_zero(pamt_refcount))
+		return 0;
+
+	if (tdx_alloc_pamt_pages(&pamt_pages))
+		return -ENOMEM;
+
+	return tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
+}
+
+static void tdx_pamt_put(struct page *page)
+{
+	unsigned long hpa = page_to_phys(page);
+	atomic_t *pamt_refcount;
+	LIST_HEAD(pamt_pages);
+	u64 err;
+
+	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
+		return;
+
+	hpa = ALIGN_DOWN(hpa, SZ_2M);
+
+	pamt_refcount = tdx_get_pamt_refcount(hpa);
+	if (!atomic_dec_and_test(pamt_refcount))
+		return;
+
+	spin_lock(&pamt_lock);
+
+	/* Lost race against tdx_pamt_add()? */
+	if (atomic_read(pamt_refcount) != 0) {
+		spin_unlock(&pamt_lock);
+		return;
+	}
+
+	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
+	spin_unlock(&pamt_lock);
+
+	if (err) {
+		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
+		return;
+	}
+
+	tdx_free_pamt_pages(&pamt_pages);
+}
+
 static void tdx_clear_page(struct page *page)
 {
 	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
index 6ff4672c4181..c8a471d6b991 100644
--- a/arch/x86/kvm/vmx/tdx_errno.h
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -18,6 +18,7 @@
 #define TDX_OPERAND_BUSY			0x8000020000000000ULL
 #define TDX_PREVIOUS_TLB_EPOCH_BUSY		0x8000020100000000ULL
 #define TDX_PAGE_METADATA_INCORRECT		0xC000030000000000ULL
+#define TDX_HPA_RANGE_NOT_FREE			0xC000030400000000ULL
 #define TDX_VCPU_NOT_ASSOCIATED			0x8000070200000000ULL
 #define TDX_KEY_GENERATION_FAILED		0x8000080000000000ULL
 #define TDX_KEY_STATE_INCORRECT			0xC000081100000000ULL
-- 
2.47.2
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Chao Gao 9 months ago
>+static void tdx_pamt_put(struct page *page)
>+{
>+	unsigned long hpa = page_to_phys(page);
>+	atomic_t *pamt_refcount;
>+	LIST_HEAD(pamt_pages);
>+	u64 err;
>+
>+	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
>+		return;
>+
>+	hpa = ALIGN_DOWN(hpa, SZ_2M);
>+
>+	pamt_refcount = tdx_get_pamt_refcount(hpa);
>+	if (!atomic_dec_and_test(pamt_refcount))
>+		return;
>+
>+	spin_lock(&pamt_lock);
>+
>+	/* Lost race against tdx_pamt_add()? */
>+	if (atomic_read(pamt_refcount) != 0) {
>+		spin_unlock(&pamt_lock);
>+		return;
>+	}
>+
>+	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
>+	spin_unlock(&pamt_lock);
>+
>+	if (err) {
>+		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);

Should the refcount be increased here, since the PAMT pages are not removed?

>+		return;
>+	}
>+
>+	tdx_free_pamt_pages(&pamt_pages);
>+}
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Kirill A. Shutemov 9 months ago
On Wed, May 14, 2025 at 01:33:38PM +0800, Chao Gao wrote:
> >+static void tdx_pamt_put(struct page *page)
> >+{
> >+	unsigned long hpa = page_to_phys(page);
> >+	atomic_t *pamt_refcount;
> >+	LIST_HEAD(pamt_pages);
> >+	u64 err;
> >+
> >+	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> >+		return;
> >+
> >+	hpa = ALIGN_DOWN(hpa, SZ_2M);
> >+
> >+	pamt_refcount = tdx_get_pamt_refcount(hpa);
> >+	if (!atomic_dec_and_test(pamt_refcount))
> >+		return;
> >+
> >+	spin_lock(&pamt_lock);
> >+
> >+	/* Lost race against tdx_pamt_add()? */
> >+	if (atomic_read(pamt_refcount) != 0) {
> >+		spin_unlock(&pamt_lock);
> >+		return;
> >+	}
> >+
> >+	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> >+	spin_unlock(&pamt_lock);
> >+
> >+	if (err) {
> >+		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
> 
> Should the refcount be increased here, since the PAMT pages are not removed?

Right. Thanks.
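
Something along these lines for the next version (untested sketch):

	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
	if (err) {
		/*
		 * PAMT pages were not removed and are still in use by
		 * the TDX module: keep the refcount elevated so the
		 * accounting stays consistent.
		 */
		atomic_inc(pamt_refcount);
		spin_unlock(&pamt_lock);
		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
		return;
	}

	spin_unlock(&pamt_lock);
	tdx_free_pamt_pages(&pamt_pages);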

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Chao Gao 9 months ago
>+static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
>+			struct list_head *pamt_pages)
>+{
>+	u64 err;
>+
>+	hpa = ALIGN_DOWN(hpa, SZ_2M);

Nit: it is better to use SZ_2M or PMD_SIZE consistently.

e.g., patch 2 uses PMD_SIZE:
 
+atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
+{
+	return &pamt_refcounts[hpa / PMD_SIZE];
+}
+EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);

>+
>+	spin_lock(&pamt_lock);
>+
>+	/* Lost race to other tdx_pamt_add() */
>+	if (atomic_read(pamt_refcount) != 0) {
>+		atomic_inc(pamt_refcount);
>+		spin_unlock(&pamt_lock);
>+		tdx_free_pamt_pages(pamt_pages);
>+		return 0;
>+	}
>+
>+	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
>+
>+	if (err)
>+		tdx_free_pamt_pages(pamt_pages);
>+

>+	/*
>+	 * tdx_hpa_range_not_free() is true if current task won race
>+	 * against tdx_pamt_put().
>+	 */
>+	if (err && !tdx_hpa_range_not_free(err)) {
>+		spin_unlock(&pamt_lock);
>+		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
>+		return -EIO;
>+	}

IIUC, this chunk is needed because tdx_pamt_put() decreases the refcount
without holding the pamt_lock. Why not move that decrease inside the lock?

And I suggest that all accesses to the pamt_refcount should be performed with
the pamt_lock held. This can make the code much clearer. It's similar to how
kvm_usage_count is managed, where transitions from 0 to 1 or 1 to 0 require
extra work, but other cases simply increase or decrease the refcount.
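
Something like the below for tdx_pamt_put() (untested, just to illustrate
the idea):

	hpa = ALIGN_DOWN(hpa, SZ_2M);
	pamt_refcount = tdx_get_pamt_refcount(hpa);

	spin_lock(&pamt_lock);

	/* Not the last user: just drop the reference */
	if (!atomic_dec_and_test(pamt_refcount)) {
		spin_unlock(&pamt_lock);
		return;
	}

	/* 1 -> 0 transition: reclaim the PAMT pages */
	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
	spin_unlock(&pamt_lock);

	if (err) {
		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
		return;
	}

	tdx_free_pamt_pages(&pamt_pages);

With the 1 -> 0 transition done under pamt_lock, tdx_pamt_add() holding the
lock can no longer observe a removal still in progress, so the
TDX_HPA_RANGE_NOT_FREE special case should not be needed anymore.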
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Kirill A. Shutemov 8 months, 3 weeks ago
On Wed, May 14, 2025 at 01:25:37PM +0800, Chao Gao wrote:
> >+static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> >+			struct list_head *pamt_pages)
> >+{
> >+	u64 err;
> >+
> >+	hpa = ALIGN_DOWN(hpa, SZ_2M);
> 
> Nit: it is better to use SZ_2M or PMD_SIZE consistently.
> 
> e.g., patch 2 uses PMD_SIZE:
>  
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> +{
> +	return &pamt_refcounts[hpa / PMD_SIZE];
> +}
> +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
> 
> >+
> >+	spin_lock(&pamt_lock);
> >+
> >+	/* Lost race to other tdx_pamt_add() */
> >+	if (atomic_read(pamt_refcount) != 0) {
> >+		atomic_inc(pamt_refcount);
> >+		spin_unlock(&pamt_lock);
> >+		tdx_free_pamt_pages(pamt_pages);
> >+		return 0;
> >+	}
> >+
> >+	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
> >+
> >+	if (err)
> >+		tdx_free_pamt_pages(pamt_pages);
> >+
> 
> >+	/*
> >+	 * tdx_hpa_range_not_free() is true if current task won race
> >+	 * against tdx_pamt_put().
> >+	 */
> >+	if (err && !tdx_hpa_range_not_free(err)) {
> >+		spin_unlock(&pamt_lock);
> >+		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
> >+		return -EIO;
> >+	}
> 
> IIUC, this chunk is needed because tdx_pamt_put() decreases the refcount
> without holding the pamt_lock. Why not move that decrease inside the lock?
> 
> And I suggest that all accesses to the pamt_refcount should be performed with
> the pamt_lock held. This can make the code much clearer. It's similar to how
> kvm_usage_count is managed, where transitions from 0 to 1 or 1 to 0 require
> extra work, but other cases simply increase or decrease the refcount.

The vast majority of cases will take the fast path, which requires a single
atomic operation. We could move it under the lock, but that would double the
number of atomics. I don't see a strong reason to do that.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Huang, Kai 9 months, 1 week ago
On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> Introduce a pair of helpers to allocate and free PAMT memory for a given
> 2M range. The range is identified by a struct page for any memory within
> it, and the PAMT memory itself is passed around as a list of pages.
> 
> Use per-2M refcounts to detect when PAMT memory has to be allocated and
> when it can be freed.
> 
> The pamt_lock spinlock serializes races between multiple tdx_pamt_add()
> calls, as well as between tdx_pamt_add() and tdx_pamt_put().

Maybe elaborate a little bit on _why_ a spinlock is used?

> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/tdx.h   |   2 +
>  arch/x86/kvm/vmx/tdx.c       | 123 +++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/tdx_errno.h |   1 +
>  3 files changed, 126 insertions(+)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 8091bf5b43cc..42449c054938 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -135,6 +135,8 @@ static inline int tdx_nr_pamt_pages(const struct tdx_sys_info *sysinfo)
>  	return sysinfo->tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
>  }
>  
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa);
> +

This at least needs to be in the same patch which exports it.  But as replied to
patch 2, I think we should just move the code in this patch to TDX core code.

>  int tdx_guest_keyid_alloc(void);
>  u32 tdx_get_nr_guest_keyids(void);
>  void tdx_guest_keyid_free(unsigned int keyid);
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index b952bc673271..ea7e2d93fb44 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -207,6 +207,10 @@ static bool tdx_operand_busy(u64 err)
>  	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
>  }
>  
> +static bool tdx_hpa_range_not_free(u64 err)
> +{
> +	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_HPA_RANGE_NOT_FREE;
> +}
>  
>  /*
>   * A per-CPU list of TD vCPUs associated with a given CPU.
> @@ -276,6 +280,125 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
>  	vcpu->cpu = -1;
>  }
>  
> +static DEFINE_SPINLOCK(pamt_lock);
> +
> +static void tdx_free_pamt_pages(struct list_head *pamt_pages)
> +{
> +	struct page *page;
> +
> +	while ((page = list_first_entry_or_null(pamt_pages, struct page, lru))) {
> +		list_del(&page->lru);
> +		__free_page(page);
> +	}
> +}
> +
> +static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
> +{
> +	for (int i = 0; i < tdx_nr_pamt_pages(tdx_sysinfo); i++) {
> +		struct page *page = alloc_page(GFP_KERNEL);
> +		if (!page)
> +			goto fail;
> +		list_add(&page->lru, pamt_pages);
> +	}
> +	return 0;
> +fail:
> +	tdx_free_pamt_pages(pamt_pages);
> +	return -ENOMEM;
> +}
> +
> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> +			struct list_head *pamt_pages)
> +{
> +	u64 err;
> +
> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> +
> +	spin_lock(&pamt_lock);

Just curious, Can the lock be per-2M-range?

> +
> +	/* Lost race to other tdx_pamt_add() */
> +	if (atomic_read(pamt_refcount) != 0) {
> +		atomic_inc(pamt_refcount);
> +		spin_unlock(&pamt_lock);
> +		tdx_free_pamt_pages(pamt_pages);

It's unfortunate that multiple callers of tdx_pamt_add() each need to allocate
PAMT pages outside the spinlock first and then free them here.

I am thinking if we make tdx_pamt_add() return:

	* > 0: PAMT pages already added (another tdx_pamt_add() won)
	* = 0: PAMT pages added successfully
	* < 0: error code

.. then we at least could move tdx_free_pamt_pages() to the caller too.

> +		return 0;
> +	}
> +
> +	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
> +
> +	if (err)
> +		tdx_free_pamt_pages(pamt_pages);

Seems we are calling tdx_free_pamt_pages() within the spinlock here, which is
not consistent with the case above where another tdx_pamt_add() has won the race.

> +
> +	/*
> +	 * tdx_hpa_range_not_free() is true if current task won race
> +	 * against tdx_pamt_put().
> +	 */
> +	if (err && !tdx_hpa_range_not_free(err)) {
> +		spin_unlock(&pamt_lock);
> +		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
> +		return -EIO;
> +	}

I had a hard time figuring out why we need to handle tdx_hpa_range_not_free()
explicitly.  IIUC, it is because atomic_dec_and_test() is used in
tdx_pamt_put(), in which case the atomic_t can reach 0 outside of the
spinlock, and thus tdh_phymem_pamt_add() can be called while there are still
PAMT pages populated.

But ...

> +
> +	atomic_set(pamt_refcount, 1);
> +	spin_unlock(&pamt_lock);
> +	return 0;
> +}
> +
> +static int tdx_pamt_get(struct page *page)
> +{
> +	unsigned long hpa = page_to_phys(page);
> +	atomic_t *pamt_refcount;
> +	LIST_HEAD(pamt_pages);
> +
> +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> +		return 0;
> +
> +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> +	WARN_ON_ONCE(atomic_read(pamt_refcount) < 0);
> +
> +	if (atomic_inc_not_zero(pamt_refcount))
> +		return 0;

... if we set the initial value of pamt_refcount to -1, and use
atomic_inc_unless_negative() here:

	if (atomic_inc_unless_negative(pamt_refcount))
		return 0;

	if (tdx_alloc_pamt_pages(&pamt_pages))
		return -ENOMEM;

	spin_lock(&pamt_lock);
	ret = tdx_pamt_add(hpa, &pamt_pages);
	if (ret >= 0)
		atomic_inc(pamt_refcount, 0);
	spin_unlock(&pamt_lock);
	
	/*
	 * If another tdx_pamt_get() won the race, or in case of
	 * error, PAMT pages are not used and can be freed.
	 */
	if (ret)
		tdx_free_pamt_pages(&pamt_pages);

	return ret >= 0 ? 0 : ret;

and ...

> +
> +	if (tdx_alloc_pamt_pages(&pamt_pages))
> +		return -ENOMEM;
> +
> +	return tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
> +}
> +
> +static void tdx_pamt_put(struct page *page)
> +{
> +	unsigned long hpa = page_to_phys(page);
> +	atomic_t *pamt_refcount;
> +	LIST_HEAD(pamt_pages);
> +	u64 err;
> +
> +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> +		return;
> +
> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> +
> +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> +	if (!atomic_dec_and_test(pamt_refcount))
> +		return;

... use atomic_dec_if_positive() here, we should be able to avoid the special
handling of tdx_hpa_range_not_free() in tdx_pamt_get().  Something like:

	if (atomic_dec_if_positive(pamt_refcount) >= 0)
		return;

	spin_lock(&pamt_lock);
	
	/* tdx_pamt_get() called more than once */
	if (atomic_read(pamt_refcount) > 0) {
		spin_unlock(&pamt_lock);
		return;
	}

	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
	atomic_set(pamt_refcount, -1);
	spin_unlock(&pamt_lock);

	tdx_free_pamt_pages(&pamt_pages);

Hmm.. am I missing anything?
			
> +
> +	spin_lock(&pamt_lock);
> +
> +	/* Lost race against tdx_pamt_add()? */
> +	if (atomic_read(pamt_refcount) != 0) {
> +		spin_unlock(&pamt_lock);
> +		return;
> +	}
> +
> +	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> +	spin_unlock(&pamt_lock);
> +
> +	if (err) {
> +		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
> +		return;
> +	}
> +
> +	tdx_free_pamt_pages(&pamt_pages);
> +}
> +

Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by kirill.shutemov@linux.intel.com 8 months, 3 weeks ago
On Mon, May 05, 2025 at 12:44:26PM +0000, Huang, Kai wrote:
> On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > Introduce a pair of helpers to allocate and free PAMT memory for a given
> > 2M range. The range is identified by a struct page for any memory within
> > it, and the PAMT memory itself is passed around as a list of pages.
> > 
> > Use per-2M refcounts to detect when PAMT memory has to be allocated and
> > when it can be freed.
> > 
> > The pamt_lock spinlock serializes races between multiple tdx_pamt_add()
> > calls, as well as between tdx_pamt_add() and tdx_pamt_put().
> 
> Maybe elaborate a little bit on _why_ a spinlock is used?
> 
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  arch/x86/include/asm/tdx.h   |   2 +
> >  arch/x86/kvm/vmx/tdx.c       | 123 +++++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/tdx_errno.h |   1 +
> >  3 files changed, 126 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > index 8091bf5b43cc..42449c054938 100644
> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -135,6 +135,8 @@ static inline int tdx_nr_pamt_pages(const struct tdx_sys_info *sysinfo)
> >  	return sysinfo->tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
> >  }
> >  
> > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa);
> > +
> 
> This at least needs to be in the same patch which exports it.  But as replied to
> patch 2, I think we should just move the code in this patch to TDX core code.
> 
> >  int tdx_guest_keyid_alloc(void);
> >  u32 tdx_get_nr_guest_keyids(void);
> >  void tdx_guest_keyid_free(unsigned int keyid);
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index b952bc673271..ea7e2d93fb44 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -207,6 +207,10 @@ static bool tdx_operand_busy(u64 err)
> >  	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
> >  }
> >  
> > +static bool tdx_hpa_range_not_free(u64 err)
> > +{
> > +	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_HPA_RANGE_NOT_FREE;
> > +}
> >  
> >  /*
> >   * A per-CPU list of TD vCPUs associated with a given CPU.
> > @@ -276,6 +280,125 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> >  	vcpu->cpu = -1;
> >  }
> >  
> > +static DEFINE_SPINLOCK(pamt_lock);
> > +
> > +static void tdx_free_pamt_pages(struct list_head *pamt_pages)
> > +{
> > +	struct page *page;
> > +
> > +	while ((page = list_first_entry_or_null(pamt_pages, struct page, lru))) {
> > +		list_del(&page->lru);
> > +		__free_page(page);
> > +	}
> > +}
> > +
> > +static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
> > +{
> > +	for (int i = 0; i < tdx_nr_pamt_pages(tdx_sysinfo); i++) {
> > +		struct page *page = alloc_page(GFP_KERNEL);
> > +		if (!page)
> > +			goto fail;
> > +		list_add(&page->lru, pamt_pages);
> > +	}
> > +	return 0;
> > +fail:
> > +	tdx_free_pamt_pages(pamt_pages);
> > +	return -ENOMEM;
> > +}
> > +
> > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > +			struct list_head *pamt_pages)
> > +{
> > +	u64 err;
> > +
> > +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> > +
> > +	spin_lock(&pamt_lock);
> 
> Just curious, Can the lock be per-2M-range?
> 
> > +
> > +	/* Lost race to other tdx_pamt_add() */
> > +	if (atomic_read(pamt_refcount) != 0) {
> > +		atomic_inc(pamt_refcount);
> > +		spin_unlock(&pamt_lock);
> > +		tdx_free_pamt_pages(pamt_pages);
> 
> It's unfortunate that multiple callers of tdx_pamt_add() each need to allocate
> PAMT pages outside the spinlock first and then free them here.
> 
> I am thinking if we make tdx_pamt_add() return:
> 
> 	* > 0: PAMT pages already added (another tdx_pamt_add() won)
> 	* = 0: PAMT pages added successfully
> 	* < 0: error code
> 
> .. then we at least could move tdx_free_pamt_pages() to the caller too.
> 
> > +		return 0;
> > +	}
> > +
> > +	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
> > +
> > +	if (err)
> > +		tdx_free_pamt_pages(pamt_pages);
> 
> Seems we are calling tdx_free_pamt_pages() within the spinlock here, which is
> not consistent with the case above where another tdx_pamt_add() has won the race.
> 
> > +
> > +	/*
> > +	 * tdx_hpa_range_not_free() is true if current task won race
> > +	 * against tdx_pamt_put().
> > +	 */
> > +	if (err && !tdx_hpa_range_not_free(err)) {
> > +		spin_unlock(&pamt_lock);
> > +		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
> > +		return -EIO;
> > +	}
> 
> I had a hard time figuring out why we need to handle tdx_hpa_range_not_free()
> explicitly.  IIUC, it is because atomic_dec_and_test() is used in
> tdx_pamt_put(), in which case the atomic_t can reach 0 outside of the
> spinlock, and thus tdh_phymem_pamt_add() can be called while there are still
> PAMT pages populated.
> 
> But ...
> 
> > +
> > +	atomic_set(pamt_refcount, 1);
> > +	spin_unlock(&pamt_lock);
> > +	return 0;
> > +}
> > +
> > +static int tdx_pamt_get(struct page *page)
> > +{
> > +	unsigned long hpa = page_to_phys(page);
> > +	atomic_t *pamt_refcount;
> > +	LIST_HEAD(pamt_pages);
> > +
> > +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> > +		return 0;
> > +
> > +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> > +	WARN_ON_ONCE(atomic_read(pamt_refcount) < 0);
> > +
> > +	if (atomic_inc_not_zero(pamt_refcount))
> > +		return 0;
> 
> ... if we set the initial value of pamt_refcount to -1, and use
> atomic_inc_unless_negative() here:
> 
> 	if (atomic_inc_unless_negative(pamt_refcount))
> 		return 0;
> 
> 	if (tdx_alloc_pamt_pages(&pamt_pages))
> 		return -ENOMEM;
> 
> 	spin_lock(&pamt_lock);
> 	ret = tdx_pamt_add(hpa, &pamt_pages);
> 	if (ret >= 0)
> 		atomic_inc(pamt_refcount, 0);
> 	spin_unlock(&pamt_lock);
> 	
> 	/*
> 	 * If another tdx_pamt_get() won the race, or in case of
> 	 * error, PAMT pages are not used and can be freed.
> 	 */
> 	if (ret)
> 		tdx_free_pamt_pages(&pamt_pages);
> 
> 	return ret >= 0 ? 0 : ret;
> 
> and ...
> 
> > +
> > +	if (tdx_alloc_pamt_pages(&pamt_pages))
> > +		return -ENOMEM;
> > +
> > +	return tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
> > +}
> > +
> > +static void tdx_pamt_put(struct page *page)
> > +{
> > +	unsigned long hpa = page_to_phys(page);
> > +	atomic_t *pamt_refcount;
> > +	LIST_HEAD(pamt_pages);
> > +	u64 err;
> > +
> > +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> > +		return;
> > +
> > +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> > +
> > +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> > +	if (!atomic_dec_and_test(pamt_refcount))
> > +		return;
> 
> ... use atomic_dec_if_positive() here, we should be able to avoid the special
> handling of tdx_hpa_range_not_free() in tdx_pamt_get().  Something like:
> 
> 	if (atomic_dec_if_positive(pamt_refcount) >= 0)
> 		return;
> 
> 	spin_lock(&pamt_lock);
> 	
> 	/* tdx_pamt_get() called more than once */
> 	if (atomic_read(pamt_refcount) > 0) {

This check would do nothing to protect you against a parallel increase of
the counter: since we get here with pamt_refcount == 0, a parallel
atomic_inc_unless_negative() is free to bump the counter in the fast path,
without taking the lock, just after this condition.

So, the code below will free PAMT memory while there is still a user.
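
I.e. something like this interleaving with the proposed scheme:

	CPU A: tdx_pamt_put()                     CPU B: tdx_pamt_get()
	---------------------                     ---------------------
	atomic_dec_if_positive() returns < 0
	  (pamt_refcount is already 0)
	spin_lock(&pamt_lock)
	atomic_read(pamt_refcount) == 0
	                                          atomic_inc_unless_negative()
	                                            0 -> 1, fast path, no lock
	tdh_phymem_pamt_remove()
	  frees PAMT under CPU B's feet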

> 		spin_unlock(&pamt_lock);
> 		return;
> 	}
> 
> 	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> 	atomic_set(pamt_refcount, -1);
> 	spin_unlock(&pamt_lock);
> 
> 	tdx_free_pamt_pages(&pamt_pages);
> 
> Hmm.. am I missing anything?
> 			
> > +
> > +	spin_lock(&pamt_lock);
> > +
> > +	/* Lost race against tdx_pamt_add()? */
> > +	if (atomic_read(pamt_refcount) != 0) {
> > +		spin_unlock(&pamt_lock);
> > +		return;
> > +	}
> > +
> > +	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> > +	spin_unlock(&pamt_lock);
> > +
> > +	if (err) {
> > +		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
> > +		return;
> > +	}
> > +
> > +	tdx_free_pamt_pages(&pamt_pages);
> > +}
> > +
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Dave Hansen 9 months, 1 week ago
On 5/5/25 05:44, Huang, Kai wrote:
>> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
>> +			struct list_head *pamt_pages)
>> +{
>> +	u64 err;
>> +
>> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
>> +
>> +	spin_lock(&pamt_lock);
> Just curious, Can the lock be per-2M-range?

Folks, please keep it simple.

If there's lock contention on this, we'll fix the lock contention, or
hash the physical address into a fixed number of locks. But having it be
per-2M-range sounds awful. Then you have to size it, and allocate it and
then resize it if there's ever hotplug, etc...

Kirill, could you put together some kind of torture test for this,
please? I would imagine a workload which is sitting in a loop setting up
and tearing down VMs on a bunch of CPUs would do it.

That ^ would be the worst possible case, I think. If you don't see lock
contention there, you'll hopefully never see it on real systems.

I *suspect* that real systems will get bottlenecked somewhere in the
page conversion process rather than on this lock. But it should be a
pretty simple experiment to run.
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by kirill.shutemov@linux.intel.com 9 months ago
On Wed, May 07, 2025 at 09:31:22AM -0700, Dave Hansen wrote:
> On 5/5/25 05:44, Huang, Kai wrote:
> >> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> >> +			struct list_head *pamt_pages)
> >> +{
> >> +	u64 err;
> >> +
> >> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> >> +
> >> +	spin_lock(&pamt_lock);
> > Just curious, Can the lock be per-2M-range?
> 
> Folks, please keep it simple.
> 
> If there's lock contention on this, we'll fix the lock contention, or
> hash the physical address into a fixed number of locks.

I had this idea in mind as well.
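
Something like this if contention ever shows up (untested sketch; names and
the number of locks are arbitrary):

	#define NR_PAMT_LOCKS	64	/* arbitrary, must be a power of 2 */

	/* Initialized with spin_lock_init() during TDX module init */
	static spinlock_t pamt_locks[NR_PAMT_LOCKS];

	static spinlock_t *pamt_lock_for(unsigned long hpa)
	{
		/* Hash the 2M range index into a fixed set of locks */
		return &pamt_locks[hash_long(hpa / PMD_SIZE,
					     ilog2(NR_PAMT_LOCKS))];
	}

But I don't expect it to be needed in practice.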

> But having it be
> per-2M-range sounds awful. Then you have to size it, and allocate it and
> then resize it if there's ever hotplug, etc...
> 
> Kirill, could you put together some kind of torture test for this,
> please? I would imagine a workload which is sitting in a loop setting up
> and tearing down VMs on a bunch of CPUs would do it.

It has to be multiple parallel creation/teardown loops. With a single TD we
won't see much concurrency: most PAMT allocations come from a single vCPU.

And it makes sense to do it with huge pages, as that cuts the amount of PAMT
memory allocated on TD creation by a factor of 10 in my setup.

JFYI, booting a TD with huge pages consumes 1-2MB of PAMT memory. I doubt
any optimization here is justifiable.

> That ^ would be the worst possible case, I think. If you don't see lock
> contention there, you'll hopefully never see it on real systems.
> 
> I *suspect* that real systems will get bottlenecked somewhere in the
> page conversion process rather than on this lock. But it should be a
> pretty simple experiment to run.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Yan Zhao 9 months, 1 week ago
On Wed, May 07, 2025 at 09:31:22AM -0700, Dave Hansen wrote:
> On 5/5/25 05:44, Huang, Kai wrote:
> >> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> >> +			struct list_head *pamt_pages)
> >> +{
> >> +	u64 err;
> >> +
> >> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> >> +
> >> +	spin_lock(&pamt_lock);
> > Just curious, Can the lock be per-2M-range?
> 
> Folks, please keep it simple.
> 
> If there's lock contention on this, we'll fix the lock contention, or
> hash the physical address into a fixed number of locks. But having it be
> per-2M-range sounds awful. Then you have to size it, and allocate it and
> then resize it if there's ever hotplug, etc...
In patch 2, there're per-2M-range pamt_refcounts. Could the per-2M-range
lock be implemented in a similar way?

+static atomic_t *pamt_refcounts;
+atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
+{
+       return &pamt_refcounts[hpa / PMD_SIZE];
+}


> Kirill, could you put together some kind of torture test for this,
> please? I would imagine a workload which is sitting in a loop setting up
> and tearing down VMs on a bunch of CPUs would do it.
> 
> That ^ would be the worst possible case, I think. If you don't see lock
> contention there, you'll hopefully never see it on real systems.
When one vCPU is trying to install a guest page at HPA A while another vCPU
is trying to install a guest page at HPA B, theoretically they may contend for
the global pamt_lock even if HPA A and B belong to different PAMT 2M blocks.

> I *suspect* that real systems will get bottlenecked somewhere in the
> page conversion process rather than on this lock. But it should be a
> pretty simple experiment to run.
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by kirill.shutemov@linux.intel.com 9 months ago
On Thu, May 08, 2025 at 10:08:32AM +0800, Yan Zhao wrote:
> On Wed, May 07, 2025 at 09:31:22AM -0700, Dave Hansen wrote:
> > On 5/5/25 05:44, Huang, Kai wrote:
> > >> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > >> +			struct list_head *pamt_pages)
> > >> +{
> > >> +	u64 err;
> > >> +
> > >> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> > >> +
> > >> +	spin_lock(&pamt_lock);
> > > Just curious, Can the lock be per-2M-range?
> > 
> > Folks, please keep it simple.
> > 
> > If there's lock contention on this, we'll fix the lock contention, or
> > hash the physical address into a fixed number of locks. But having it be
> > per-2M-range sounds awful. Then you have to size it, and allocate it and
> > then resize it if there's ever hotplug, etc...
> In patch 2, there're per-2M-range pamt_refcounts. Could the per-2M-range
> lock be implemented in a similar way?
> 
> +static atomic_t *pamt_refcounts;
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> +{
> +       return &pamt_refcounts[hpa / PMD_SIZE];
> +}

But why? If there is no contention, it is just wasteful.

> > Kirill, could you put together some kind of torture test for this,
> > please? I would imagine a workload which is sitting in a loop setting up
> > and tearing down VMs on a bunch of CPUs would do it.
> > 
> > That ^ would be the worst possible case, I think. If you don't see lock
> > contention there, you'll hopefully never see it on real systems.
> When one vCPU is trying to install a guest page at HPA A while another vCPU
> is trying to install a guest page at HPA B, theoretically they may contend for
> the global pamt_lock even if HPA A and B belong to different PAMT 2M blocks.

This contention will be momentary, if it ever happens.

> > I *suspect* that real systems will get bottlenecked somewhere in the
> > page conversion process rather than on this lock. But it should be a
> > pretty simple experiment to run.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Yan Zhao 9 months, 1 week ago
On Mon, May 05, 2025 at 08:44:26PM +0800, Huang, Kai wrote:
> On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > +			struct list_head *pamt_pages)
> > +{
> > +	u64 err;
> > +
> > +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> > +
> > +	spin_lock(&pamt_lock);
> 
> Just curious, Can the lock be per-2M-range?
Me too.
Could we introduce smaller locks each covering a 2M range?

And could we deposit 2 pamt pages per-2M hpa range no matter if it's finally
mapped as a huge page or not?
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Vishal Annapurve 9 months, 1 week ago
On Tue, May 6, 2025 at 6:04 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Mon, May 05, 2025 at 08:44:26PM +0800, Huang, Kai wrote:
> > On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > > +                   struct list_head *pamt_pages)
> > > +{
> > > +   u64 err;
> > > +
> > > +   hpa = ALIGN_DOWN(hpa, SZ_2M);
> > > +
> > > +   spin_lock(&pamt_lock);
> >
> > Just curious, Can the lock be per-2M-range?
> Me too.
> Could we introduce smaller locks each covering a 2M range?
>
> And could we deposit 2 pamt pages per-2M hpa range no matter if it's finally
> mapped as a huge page or not?
>

Are you suggesting to keep 2 PAMT pages allocated for each private 2M
page even if it's mapped as a hugepage? It will lead to wastage of
memory of 4 MB per 1GB of guest memory range. For large VM sizes that
will amount to high values.
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by Yan Zhao 9 months, 1 week ago
On Tue, May 06, 2025 at 06:15:40PM -0700, Vishal Annapurve wrote:
> On Tue, May 6, 2025 at 6:04 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >
> > On Mon, May 05, 2025 at 08:44:26PM +0800, Huang, Kai wrote:
> > > On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > > > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > > > +                   struct list_head *pamt_pages)
> > > > +{
> > > > +   u64 err;
> > > > +
> > > > +   hpa = ALIGN_DOWN(hpa, SZ_2M);
> > > > +
> > > > +   spin_lock(&pamt_lock);
> > >
> > > Just curious, Can the lock be per-2M-range?
> > Me too.
> > Could we introduce smaller locks each covering a 2M range?
> >
> > And could we deposit 2 pamt pages per-2M hpa range no matter if it's finally
> > mapped as a huge page or not?
> >
> 
> Are you suggesting to keep 2 PAMT pages allocated for each private 2M
> page even if it's mapped as a hugepage? It will lead to wastage of
> memory of 4 MB per 1GB of guest memory range. For large VM sizes that
> will amount to high values.
Ok. I'm thinking of the possibility of aligning the time of PAMT page allocation
to that of physical page allocation.
Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
Posted by kirill.shutemov@linux.intel.com 9 months ago
On Wed, May 07, 2025 at 10:42:25AM +0800, Yan Zhao wrote:
> On Tue, May 06, 2025 at 06:15:40PM -0700, Vishal Annapurve wrote:
> > On Tue, May 6, 2025 at 6:04 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >
> > > On Mon, May 05, 2025 at 08:44:26PM +0800, Huang, Kai wrote:
> > > > On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > > > > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > > > > +                   struct list_head *pamt_pages)
> > > > > +{
> > > > > +   u64 err;
> > > > > +
> > > > > +   hpa = ALIGN_DOWN(hpa, SZ_2M);
> > > > > +
> > > > > +   spin_lock(&pamt_lock);
> > > >
> > > > Just curious, Can the lock be per-2M-range?
> > > Me too.
> > > Could we introduce smaller locks each covering a 2M range?
> > >
> > > And could we deposit 2 pamt pages per-2M hpa range no matter if it's finally
> > > mapped as a huge page or not?
> > >
> > 
> > Are you suggesting to keep 2 PAMT pages allocated for each private 2M
> > page even if it's mapped as a hugepage? It will lead to wastage of
> > memory of 4 MB per 1GB of guest memory range. For large VM sizes that
> > will amount to high values.
> Ok. I'm thinking of the possibility of aligning the time of PAMT page allocation
> to that of physical page allocation.

No. That's mostly wasted memory. We need to aim to allocate memory only as
needed. With huge pages the vast majority of such allocations will never be
needed.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov