From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
init_pamt_metadata() allocates PAMT refcounters for all physical memory up
to max_pfn. This is suboptimal when the physical memory layout is
discontiguous and has large holes.
The refcount allocation is a vmalloc allocation. This is necessary to
support a large allocation size. The virtually contiguous property also
makes it easy to find a specific 2MB range's refcount, since the array can
simply be indexed.
Since vmalloc mappings support remapping during normal kernel runtime,
switch to an approach that only populates refcount pages for the vmalloc
mapping when there is actually memory backing that range. This way, any
holes in the physical address space won't consume actual physical memory.
The validity of this memory optimization is based on a couple of
assumptions:
1. Physical holes in the RAM layout are commonly large enough for the
   savings to be worth it.
2. An alternative approach that looks up the refcounts via some more
   layered data structure would complicate the lookups at least as much
   as managing the vmalloc mapping does.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
[Add feedback, update log]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
v3:
- Split from "x86/virt/tdx: Allocate reference counters for
PAMT memory" (Dave)
- Rename tdx_get_pamt_refcount()->tdx_find_pamt_refcount() (Dave)
- Drop duplicate pte_none() check (Dave)
- Align assignments in alloc_pamt_refcount() (Kai)
- Add comment in pamt_refcount_depopulate() to clarify teardown
logic (Dave)
- Drop the __va(PFN_PHYS(pte_pfn(ptep_get()))) pile-on for a simpler
  method. (Dave)
- Improve log
---
arch/x86/virt/vmx/tdx/tdx.c | 120 ++++++++++++++++++++++++++++++++----
1 file changed, 109 insertions(+), 11 deletions(-)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 0ce4181ca352..d4b01656759a 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -194,30 +194,119 @@ int tdx_cpu_enable(void)
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
-/*
- * Allocate PAMT reference counters for all physical memory.
- *
- * It consumes 2MiB for every 1TiB of physical memory.
- */
-static int init_pamt_metadata(void)
+/* Find PAMT refcount for a given physical address */
+static atomic_t *tdx_find_pamt_refcount(unsigned long hpa)
{
- size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ return &pamt_refcounts[hpa / PMD_SIZE];
+}
- if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
- return 0;
+/* Map a page into the PAMT refcount vmalloc region */
+static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t entry;
- pamt_refcounts = vmalloc(size);
- if (!pamt_refcounts)
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
return -ENOMEM;
+ entry = mk_pte(page, PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ /*
+ * PAMT refcount populations can overlap due to rounding of the
+ * start/end pfn. Make sure another PAMT range didn't already
+ * populate it.
+ */
+ if (pte_none(ptep_get(pte)))
+ set_pte_at(&init_mm, addr, pte, entry);
+ else
+ __free_page(page);
+ spin_unlock(&init_mm.page_table_lock);
+
return 0;
}
+/*
+ * Allocate PAMT reference counters for the given PFN range.
+ *
+ * It consumes 2MiB for every 1TiB of physical memory.
+ */
+static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long start, end;
+
+ start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
+ end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn + 1));
+ start = round_down(start, PAGE_SIZE);
+ end = round_up(end, PAGE_SIZE);
+
+ return apply_to_page_range(&init_mm, start, end - start,
+ pamt_refcount_populate, NULL);
+}
+
+/*
+ * Reserve vmalloc range for PAMT reference counters. It covers all physical
+ * address space up to max_pfn. It is going to be populated from
+ * build_tdx_memlist() only for present memory that is available for TDX use.
+ *
+ * It reserves 2MiB of virtual address space for every 1TiB of physical memory.
+ */
+static int init_pamt_metadata(void)
+{
+ struct vm_struct *area;
+ size_t size;
+
+ if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
+ return 0;
+
+ size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ size = round_up(size, PAGE_SIZE);
+
+ area = get_vm_area(size, VM_SPARSE);
+ if (!area)
+ return -ENOMEM;
+
+ pamt_refcounts = area->addr;
+ return 0;
+}
+
+/* Unmap a page from the PAMT refcount vmalloc region */
+static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t entry;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ entry = ptep_get(pte);
+ /* refount allocation is sparse, may not be populated */
+ if (!pte_none(entry)) {
+ pte_clear(&init_mm, addr, pte);
+ page = pte_page(entry);
+ __free_page(page);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/* Unmap all PAMT refcount pages and free vmalloc range */
static void free_pamt_metadata(void)
{
+ size_t size;
+
if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
return;
+ size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ size = round_up(size, PAGE_SIZE);
+
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)pamt_refcounts,
+ size, pamt_refcount_depopulate,
+ NULL);
vfree(pamt_refcounts);
pamt_refcounts = NULL;
}
@@ -288,10 +377,19 @@ static int build_tdx_memlist(struct list_head *tmb_list)
ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
if (ret)
goto err;
+
+ /* Allocate PAMT refcounts for the memblock */
+ ret = alloc_pamt_refcount(start_pfn, end_pfn);
+ if (ret)
+ goto err;
}
return 0;
err:
+ /*
+ * Only free TDX memory blocks here, PAMT refcount pages
+ * will be freed in the init_tdx_module() error path.
+ */
free_tdx_memlist(tmb_list);
return ret;
}
--
2.51.0
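
As a quick sanity check of the "2MiB for every 1TiB" figure in the comments
above: each refcount is one 4-byte atomic_t covering a 2MiB range, i.e. one
entry per PTRS_PER_PTE == 512 pages. The userspace sketch below is
illustrative only and not part of the patch; it assumes 4KiB pages and
reproduces the size calculation from init_pamt_metadata():

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* 4KiB pages assumed */
	unsigned long ptrs_per_pte = 512;	/* 4KiB pages per 2MiB range */
	unsigned long max_pfn = (1UL << 40) / page_size;	/* 1TiB of RAM */
	unsigned long size = max_pfn / ptrs_per_pte * 4;	/* sizeof(atomic_t) */

	/* Prints "2048 KiB": 2MiB of refcounters per 1TiB of memory. */
	printf("%lu KiB\n", size / 1024);
	return 0;
}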
> +/* Map a page into the PAMT refcount vmalloc region */
> +static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
> +{
> +	struct page *page;
> +	pte_t entry;
>
> -	pamt_refcounts = vmalloc(size);
> -	if (!pamt_refcounts)
> +	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +	if (!page)
>  		return -ENOMEM;
>
> +	entry = mk_pte(page, PAGE_KERNEL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	/*
> +	 * PAMT refcount populations can overlap due to rounding of the
> +	 * start/end pfn. Make sure another PAMT range didn't already
> +	 * populate it.
> +	 */

Make sure the same range only gets populated once ?

> +	if (pte_none(ptep_get(pte)))
> +		set_pte_at(&init_mm, addr, pte, entry);
> +	else
> +		__free_page(page);
> +	spin_unlock(&init_mm.page_table_lock);
> +
>  	return 0;
>  }
>
> +/*
> + * Allocate PAMT reference counters for the given PFN range.
> + *
> + * It consumes 2MiB for every 1TiB of physical memory.
> + */
> +static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	unsigned long start, end;
> +
> +	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> +	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn + 1));

(sorry didn't notice this in last version)

I don't quite follow why we need "end_pfn + 1" instead of just "end_pfn"?

IIUC this could result in an additional 2M range being populated
unnecessarily when the end_pfn is 2M aligned.

And ...

> +	start = round_down(start, PAGE_SIZE);
> +	end = round_up(end, PAGE_SIZE);
> +
> +	return apply_to_page_range(&init_mm, start, end - start,
> +				   pamt_refcount_populate, NULL);
> +}

[...]

> @@ -288,10 +377,19 @@ static int build_tdx_memlist(struct list_head *tmb_list)
>  		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
>  		if (ret)
>  			goto err;
> +
> +		/* Allocate PAMT refcounts for the memblock */
> +		ret = alloc_pamt_refcount(start_pfn, end_pfn);
> +		if (ret)
> +			goto err;
>  	}

... when max_pfn == end_pfn of the last TDX memory block, this could
result in an additional page of @pamt_refcounts being allocated, but it
will never be freed since free_pamt_metadata() will only free mapping up
to max_pfn.

Am I missing anything?
On 9/19/2025 3:25 PM, Huang, Kai wrote:

[...]

>> +static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
>> +{
>> +	unsigned long start, end;
>> +
>> +	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
>> +	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn + 1));
> (sorry didn't notice this in last version)
>
> I don't quite follow why we need "end_pfn + 1" instead of just "end_pfn"?
>
> IIUC this could result in an additional 2M range being populated
> unnecessarily when the end_pfn is 2M aligned.

IIUC, this will not happen.
The +1 page will be converted to 4KB, and will be ignored since in
tdx_find_pamt_refcount() the address is divided by 2M.

To handle the address unaligned to 2M, +511 should be used instead of +1?

[...]

>> +/* Unmap a page from the PAMT refcount vmalloc region */
>> +static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr, void *data)
>> +{
>> +	struct page *page;
>> +	pte_t entry;
>> +
>> +	spin_lock(&init_mm.page_table_lock);
>> +
>> +	entry = ptep_get(pte);
>> +	/* refount allocation is sparse, may not be populated */

refount -> refcount

>> +	if (!pte_none(entry)) {
>> +		pte_clear(&init_mm, addr, pte);
>> +		page = pte_page(entry);
>> +		__free_page(page);
>> +	}
>> +
>> +	spin_unlock(&init_mm.page_table_lock);
>> +
>> +	return 0;
>> +}

[...]

> ... when max_pfn == end_pfn of the last TDX memory block, this could
> result in an additional page of @pamt_refcounts being allocated, but it
> will never be freed since free_pamt_metadata() will only free mapping up
> to max_pfn.
>
> Am I missing anything?
On Tue, 2025-09-23 at 17:38 +0800, Binbin Wu wrote:
> > > +/*
> > > + * Allocate PAMT reference counters for the given PFN range.
> > > + *
> > > + * It consumes 2MiB for every 1TiB of physical memory.
> > > + */
> > > +static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
> > > +{
> > > +	unsigned long start, end;
> > > +
> > > +	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> > > +	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn + 1));
> >
> > (sorry didn't notice this in last version)
> >
> > I don't quite follow why we need "end_pfn + 1" instead of just "end_pfn"?
> >
> > IIUC this could result in an additional 2M range being populated
> > unnecessarily when the end_pfn is 2M aligned.
>
> IIUC, this will not happen.
> The +1 page will be converted to 4KB, and will be ignored since in
> tdx_find_pamt_refcount() the address is divided by 2M.
>
> To handle the address unaligned to 2M, +511 should be used instead of +1?

OK. Thanks for catching. But I still don't get why we need end_pfn + 1.

Also, when end_pfn == 511, would this result in the second refcount being
returned for the @end, while the intention should be the first refcount?

For example, assuming we have a range [0, 2M), we only need one refcount.
And the PFN range (which comes from for_each_mem_pfn_range()) would be:

	start_pfn == 0
	end_pfn == 512

This will result in @start pointing to the first refcount and @end
pointing to the second, IIUC.

So it seems we need:

	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
	start = round_down(start, PAGE_SIZE);
	end = round_up(end, PAGE_SIZE);

?
On 9/24/2025 2:50 PM, Huang, Kai wrote:
> On Tue, 2025-09-23 at 17:38 +0800, Binbin Wu wrote:

[...]

> OK. Thanks for catching. But I still don't get why we need end_pfn + 1.
>
> Also, when end_pfn == 511, would this result in the second refcount being
> returned for the @end, while the intention should be the first refcount?
>
> For example, assuming we have a range [0, 2M), we only need one refcount.
> And the PFN range (which comes from for_each_mem_pfn_range()) would be:
>
> 	start_pfn == 0
> 	end_pfn == 512
>
> This will result in @start pointing to the first refcount and @end
> pointing to the second, IIUC.
>
> So it seems we need:
>
> 	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> 	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
> 	start = round_down(start, PAGE_SIZE);
> 	end = round_up(end, PAGE_SIZE);

Checked again, this seems to be the right version.

>
> ?
On Wed, 2025-09-24 at 16:57 +0800, Binbin Wu wrote:
> > This will result in @start pointing to the first refcount and @end
> > pointing to the second, IIUC.
> >
> > So it seems we need:
> >
> > 	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> > 	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
> > 	start = round_down(start, PAGE_SIZE);
> > 	end = round_up(end, PAGE_SIZE);
>
> Checked again, this seems to be the right version.

Thanks both for the analysis. I lazily created a test program to check some
edge cases and found the original and this version were both buggy. Clearly
this code needs to be clearer (as actually Dave pointed out in v2 and I
failed to address). Example (synthetic failure):

	start_pfn = 0x80000
	end_pfn = 0x80001

	Original code:	start = 0xff76ba4f9e034000
			end   = 0xff76ba4f9e034000

	Above fix:	start = 0xff76ba4f9e034000
			end   = 0xff76ba4f9e034000

Part of the problem is that tdx_find_pamt_refcount() expects the hpa passed in
to be PMD aligned. The other callers of tdx_find_pamt_refcount() also make sure
that the PA passed in is 2MB aligned before calling, and compute this starting
with a PFN. So to try to make it easier to read and be correct, what do you
think about the below:

static atomic_t *tdx_find_pamt_refcount(unsigned long pfn)
{
	unsigned long hpa = ALIGN_DOWN(pfn, PMD_SIZE);

	return &pamt_refcounts[hpa / PMD_SIZE];
}

	/*
	 * 'start_pfn' is inclusive and 'end_pfn' is exclusive. Compute the
	 * page range to be inclusive of the start and end refcount
	 * addresses and at least a page in size. The teardown logic needs
	 * to handle potentially overlapping refcounts mappings resulting
	 * from this.
	 */
	start = (unsigned long)tdx_find_pamt_refcount(start_pfn);
	end = (unsigned long)tdx_find_pamt_refcount(end_pfn - 1);
	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = ALIGN_DOWN(end, PAGE_SIZE) + PAGE_SIZE;
On Wed, 2025-10-01 at 00:32 +0000, Edgecombe, Rick P wrote:
> On Wed, 2025-09-24 at 16:57 +0800, Binbin Wu wrote:
> > > This will result in @start pointing to the first refcount and @end
> > > pointing to the second, IIUC.
> > >
> > > So it seems we need:
> > >
> > > 	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> > > 	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
> > > 	start = round_down(start, PAGE_SIZE);
> > > 	end = round_up(end, PAGE_SIZE);
> >
> > Checked again, this seems to be the right version.
>
> Thanks both for the analysis. I lazily created a test program to check some
> edge cases and found the original and this version were both buggy. Clearly
> this code needs to be clearer (as actually Dave pointed out in v2 and I
> failed to address). Example (synthetic failure):
>
> 	start_pfn = 0x80000
> 	end_pfn = 0x80001
>
> 	Original code:	start = 0xff76ba4f9e034000
> 			end   = 0xff76ba4f9e034000
>
> 	Above fix:	start = 0xff76ba4f9e034000
> 			end   = 0xff76ba4f9e034000

Oh I think the problem of the "Above fix" is it fails when @start and @end
are the same and both are already page aligned.

>
> Part of the problem is that tdx_find_pamt_refcount() expects the hpa passed in
> to be PMD aligned. The other callers of tdx_find_pamt_refcount() also make sure
> that the PA passed in is 2MB aligned before calling, and compute this starting
> with a PFN. So to try to make it easier to read and be correct, what do you
> think about the below:
>
> static atomic_t *tdx_find_pamt_refcount(unsigned long pfn)
> {
> 	unsigned long hpa = ALIGN_DOWN(pfn, PMD_SIZE);

Shouldn't it be:

	hpa = ALIGN_DOWN(PFN_PHYS(pfn), PMD_SIZE));

?

>
> 	return &pamt_refcounts[hpa / PMD_SIZE];
> }
>
> 	/*
> 	 * 'start_pfn' is inclusive and 'end_pfn' is exclusive.
>

I think 'end_pfn' is exclusive is a little bit confusing?  It sounds like
the physical range from PFN_PHYS(end_pfn - 1) to PFN_PHYS(end_pfn) is also
exclusive, but it is actually not?  To me it's more like only the physical
address PFN_PHYS(end_pfn) is exclusive.

> 	 * Compute the
> 	 * page range to be inclusive of the start and end refcount
> 	 * addresses and at least a page in size. The teardown logic needs
> 	 * to handle potentially overlapping refcounts mappings resulting
> 	 * from this.
> 	 */
> 	start = (unsigned long)tdx_find_pamt_refcount(start_pfn);
> 	end = (unsigned long)tdx_find_pamt_refcount(end_pfn - 1);
> 	start = ALIGN_DOWN(start, PAGE_SIZE);
> 	end = ALIGN_DOWN(end, PAGE_SIZE) + PAGE_SIZE;

This looks fine to me.  I mean the result should be correct, but the
'end_pfn - 1' (due to 'end_pfn' is exclusive) is a bit confusing to me as
said above, but maybe it's only me, so feel free to ignore.

Or, as said above, I think the problem of the "Above fix" is when
calculating the @end we didn't consider the case where it equals to @start
and is already page aligned.  Does below work (assuming
tdx_find_pamt_refcount() still takes 'hpa')?

	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
	start = PAGE_ALIGN_DOWN(start);
	end = PAGE_ALIGN_DOWN(end) + PAGE_SIZE;

Anyway, don't have strong opinion here, so up to you.
On Wed, 2025-10-01 at 10:40 +0000, Huang, Kai wrote:
> On Wed, 2025-10-01 at 00:32 +0000, Edgecombe, Rick P wrote:
> >
> > Part of the problem is that tdx_find_pamt_refcount() expects the hpa passed in
> > to be PMD aligned. The other callers of tdx_find_pamt_refcount() also make sure
> > that the PA passed in is 2MB aligned before calling, and compute this starting
> > with a PFN. So to try to make it easier to read and be correct, what do you
> > think about the below:
> >
> > static atomic_t *tdx_find_pamt_refcount(unsigned long pfn)
> > {
> > 	unsigned long hpa = ALIGN_DOWN(pfn, PMD_SIZE);
>
> Shouldn't it be:
>
> 	hpa = ALIGN_DOWN(PFN_PHYS(pfn), PMD_SIZE));
>
> ?

Err, yes.

> >
> > 	return &pamt_refcounts[hpa / PMD_SIZE];
> > }
> >
> > 	/*
> > 	 * 'start_pfn' is inclusive and 'end_pfn' is exclusive.
> >
>
> I think 'end_pfn' is exclusive is a little bit confusing?

It's trying to say that end_pfn is an exclusive range value, so the range
in question does not actually include end_pfn. That is how the calling code
looked to me, but please check. Maybe I can clarify that start_pfn and
end_pfn are a range? It seems redundant.

> It sounds like
> the physical range from PFN_PHYS(end_pfn - 1) to PFN_PHYS(end_pfn) is also
> exclusive, but it is actually not?

Not sure what you mean. For clarity, when describing a range sometimes the
start or end include that value, and sometimes they don't. So exclusive
here means that the end_pfn is not included in the range. As in we don't
need a refcount allocation for end_pfn.

> To me it's more like only the physical
> address PFN_PHYS(end_pfn) is exclusive.
>
> > 	 * Compute the
> > 	 * page range to be inclusive of the start and end refcount
> > 	 * addresses and at least a page in size. The teardown logic needs
> > 	 * to handle potentially overlapping refcounts mappings resulting
> > 	 * from this.
> > 	 */
> > 	start = (unsigned long)tdx_find_pamt_refcount(start_pfn);
> > 	end = (unsigned long)tdx_find_pamt_refcount(end_pfn - 1);
> > 	start = ALIGN_DOWN(start, PAGE_SIZE);
> > 	end = ALIGN_DOWN(end, PAGE_SIZE) + PAGE_SIZE;
>
> This looks fine to me.  I mean the result should be correct, but the
> 'end_pfn - 1' (due to 'end_pfn' is exclusive) is a bit confusing to me as
> said above, but maybe it's only me, so feel free to ignore.

Sorry, I'm not following the confusion. Maybe we can have a quick chat when
you come online.

>
> Or, as said above, I think the problem of the "Above fix" is when
> calculating the @end we didn't consider the case where it equals to @start
> and is already page aligned.  Does below work (assuming
> tdx_find_pamt_refcount() still takes 'hpa')?
>
> 	start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
> 	end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn) - 1));
> 	start = PAGE_ALIGN_DOWN(start);
> 	end = PAGE_ALIGN_DOWN(end) + PAGE_SIZE;

The refcounts are actually per page/pfn, not per PA. So I think introducing
the concept of a refcount for a PA, especially as part of the interfaces,
is adding confusion. The math happens to work out, but it's a layer of
indirection.

The other problem this collides with is that tdx_find_pamt_refcount()
callers all have to convert PFN to PA. This should be fixed.

I guess we are really doing two separate calculations. First calculate the
range of refcounts we need, and then calculate the range of vmalloc space
we need. How about this, is it clearer to you? It is very specific about
what/why we actually are doing, but at the expense of minimizing
operations. In this slow path, I think clarity is the priority.

static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long refcount_first, refcount_last;
	unsigned long mapping_start, mapping_end;

	/*
	 * 'start_pfn' is inclusive and 'end_pfn' is exclusive. Find the
	 * range of refcounts the pfn range will need.
	 */
	refcount_first = (unsigned long)tdx_find_pamt_refcount(start_pfn);
	refcount_last = (unsigned long)tdx_find_pamt_refcount(end_pfn - 1);

	/*
	 * Calculate the page aligned range that includes the refcounts. The
	 * teardown logic needs to handle potentially overlapping refcount
	 * mappings resulting from the alignments.
	 */
	mapping_start = round_down(refcount_first, PAGE_SIZE);
	mapping_end = round_up(refcount_last + sizeof(*pamt_refcounts),
			       PAGE_SIZE);

	return apply_to_page_range(&init_mm, mapping_start,
				   mapping_end - mapping_start,
				   pamt_refcount_populate, NULL);
}

>
> Anyway, don't have strong opinion here, so up to you.
On Wed, 2025-10-01 at 19:00 +0000, Edgecombe, Rick P wrote:
> I guess we are really doing two separate calculations. First calculate the
> range of refcounts we need, and then calculate the range of vmalloc space
> we need. How about this, is it clearer to you? It is very specific about
> what/why we actually are doing, but at the expense of minimizing
> operations. In this slow path, I think clarity is the priority.
>
> static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
> {
> 	unsigned long refcount_first, refcount_last;
> 	unsigned long mapping_start, mapping_end;
>
> 	/*
> 	 * 'start_pfn' is inclusive and 'end_pfn' is exclusive. Find the
> 	 * range of refcounts the pfn range will need.
> 	 */
> 	refcount_first = (unsigned long)tdx_find_pamt_refcount(start_pfn);
> 	refcount_last = (unsigned long)tdx_find_pamt_refcount(end_pfn - 1);
>
> 	/*
> 	 * Calculate the page aligned range that includes the refcounts. The
> 	 * teardown logic needs to handle potentially overlapping refcount
> 	 * mappings resulting from the alignments.
> 	 */
> 	mapping_start = round_down(refcount_first, PAGE_SIZE);
> 	mapping_end = round_up(refcount_last + sizeof(*pamt_refcounts),
> 			       PAGE_SIZE);
>
> 	return apply_to_page_range(&init_mm, mapping_start,
> 				   mapping_end - mapping_start,
> 				   pamt_refcount_populate, NULL);
> }

Yeah it's better and clearer. LGTM. Thanks.
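
For reference, below is a minimal userspace sketch (not the actual test
program mentioned earlier in the thread) that models the range math of the
final alloc_pamt_refcount() above against the edge cases discussed. The
PAGE_SIZE/PMD_SIZE macros and the page-aligned pamt_refcounts base address
are synthetic stand-ins, and the kernel helpers are reimplemented as plain
macros:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL			/* 4KiB pages assumed */
#define PMD_SIZE	(2UL << 20)		/* 2MiB */
#define PFN_PHYS(pfn)	((pfn) << 12)
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ROUND_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* Synthetic, page-aligned stand-in for the vmalloc area base. */
static const unsigned long pamt_refcounts = 0xffffc90000000000UL;
#define REFCOUNT_SIZE	4UL			/* sizeof(atomic_t) */

/* Models the final tdx_find_pamt_refcount(), returning an address. */
static unsigned long find_refcount(unsigned long pfn)
{
	unsigned long hpa = ALIGN_DOWN(PFN_PHYS(pfn), PMD_SIZE);

	return pamt_refcounts + hpa / PMD_SIZE * REFCOUNT_SIZE;
}

/* Models the mapping range math in the final alloc_pamt_refcount(). */
static void check(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long first = find_refcount(start_pfn);
	unsigned long last = find_refcount(end_pfn - 1);
	unsigned long mstart = ALIGN_DOWN(first, PAGE_SIZE);
	unsigned long mend = ROUND_UP(last + REFCOUNT_SIZE, PAGE_SIZE);

	printf("pfns [%#lx, %#lx) -> map [%#lx, %#lx)\n",
	       start_pfn, end_pfn, mstart, mend);

	/* The mapping must be non-empty and cover both refcounts. */
	assert(mend > mstart);
	assert(mstart <= first && last + REFCOUNT_SIZE <= mend);
}

int main(void)
{
	check(0x80000, 0x80001);	/* the synthetic failure case */
	check(0, 512);			/* [0, 2M): a single refcount */
	check(0, 1024);			/* [0, 4M): 2M-aligned end_pfn */
	check(511, 513);		/* straddles a 2M boundary */
	return 0;
}

With the original "end_pfn + 1" math, the first case computed start == end
and so populated nothing, matching the synthetic failure shown above; the
final version maps exactly one page for it.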