Hi,
When profiling a VM migration I noticed that on the sender side a significant (~45%) time is spent in PV Dom0 Linux taking an emulation fault in unmap_page_range [1]:
The call comes from zap_pte_range (pgtable_64.h, inlined):
ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
This has 2 implementations: native and a generic one where pte_clear is implemented using set_pte_at with a Xen pvop.
As a proof of concept I’ve deleted the native implementation [2], which makes it fall back to the generic implementation [3].
This is not necessarily safe on SMP (it reads and clears as 2 separate steps), but shows that a paravirt op is faster than emulation.
To fix this we may need to partially revert https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cdd9c8931767e1c56a51a1078d33a8c340f4405
To test this more easily I’ve written a small test program that maps/unmaps pages from a domain in Dom0 [4].
Before: 3.26932 +- 0.00503 seconds time elapsed ( +- 0.15% )
After: 0.75642 +- 0.00202 seconds time elapsed ( +- 0.27% )
It is more than 4x faster to use the paravirt ops than trapping and emulating.
From a functional point of view the above commit is correct, Xen doesn’t need a dedicated PV operation: trap and emulation works.
But from a performance point of view I’d say that Xen does need it. This is a hot-path during migration, and it’d be worthwhile to optimise it.
Just deleting the native implementation is probably not the solution, since we also want a value returned, and the existing PV operation is void.
It probably needs a new PV operation (re)introduced with the desired semantics?
Best regards,
--Edwin
[1]: https://cdn.jsdelivr.net/gh/edwintorok/xen@pmustack-coverletter/docs/tmp/migrate-send.svg?x=950.6&y=2197
[2]:
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 690c0307afed..ab6318bb5676 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1097,13 +1097,9 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep);
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- pte_t pte = native_ptep_get_and_clear(ptep);
- return pte;
-}
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
[3]:
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic fallback used when the arch does not provide its own
 * ptep_get_and_clear(): read the old PTE, then clear it.
 *
 * NOTE(review): the read and the clear are two separate steps, not one
 * atomic xchg — a concurrent hardware A/D-bit update between them can be
 * lost on SMP. This is exactly the caveat called out above: it is only a
 * proof of concept, not a safe replacement for the native version.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
pte_t *ptep)
{
pte_t pte = *ptep;
pte_clear(mm, address, ptep);
return pte;
}
#endif
/*
 * pte_clear() goes through set_pte_at(), which under Xen PV is a pvop
 * (hypercall-backed write) instead of a trapped-and-emulated store —
 * this is why the fallback path avoids the emulation fault.
 */
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte_at(mm, addr, ptep, __pte(0)); }
[4]:
$ cat >main.c <<EOF
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <xenctrl.h>
#include <xenforeignmemory.h>
/*
 * Map and then unmap every guest frame of a domain in batches of
 * <batchsize> pages, to exercise the Dom0 foreign-mapping unmap path
 * (and hence the PTE get-and-clear path being profiled).
 *
 * Usage: xfm_scale <domid> <batchsize>
 *
 * Exit codes are kept distinct and stable so a harness can tell which
 * step failed: 1 usage, 2 bad integers, 3 xenforeignmemory_open,
 * 4 xc_interface_open, 5 xc_domain_nr_gpfns, 6 allocation,
 * 7 map failure, 8 unmap failure, 0 success.
 */
int main(int argc, char *argv[]) {
  int rc;
  xenforeignmemory_handle *handle = NULL;
  xc_interface *xc = NULL;
  void **mappings = NULL;
  xen_pfn_t *arr = NULL;
  int *err = NULL;

  if (argc != 3) {
    fprintf(stderr, "Usage: %s <domid> <batchsize>\n", argv[0]);
    return 1;
  }

  uint32_t domid = atoi(argv[1]);
  uint32_t batch_size = atoi(argv[2]);
  if (!domid || !batch_size) {
    /* Fix: message previously lacked a trailing newline. */
    fprintf(stderr, "Invalid arguments, expected 2 integers\n");
    return 2;
  }

  handle = xenforeignmemory_open(NULL, 0);
  if (!handle) {
    rc = 3;
    goto out;
  }

  xc = xc_interface_open(NULL, NULL, 0);
  if (!xc) {
    rc = 4;
    goto out;
  }

  xen_pfn_t nr_pfns;
  if (xc_domain_nr_gpfns(xc, domid, &nr_pfns) < 0) {
    rc = 5;
    goto out;
  }

  /* One mapping slot per batch; +1 covers a trailing partial batch. */
  unsigned mappings_size = nr_pfns / batch_size + 1;
  mappings = calloc(mappings_size, sizeof(void *));
  /*
   * Heap-allocate the per-batch scratch arrays: batch_size comes from
   * the command line, so a VLA here could overflow the stack.
   */
  arr = calloc(batch_size, sizeof(*arr));
  err = calloc(batch_size, sizeof(*err));
  if (!mappings || !arr || !err) {
    perror("calloc");
    rc = 6;
    goto out;
  }

  for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
    for (unsigned j = 0; j < batch_size; j++)
      arr[j] = i + j;

    unsigned long idx = i / batch_size;
    assert(idx < mappings_size);
    mappings[idx] = xenforeignmemory_map(handle, domid, PROT_READ,
                                         batch_size, arr, err);
    if (!mappings[idx]) {
      /* Earlier batches stay mapped; process exit reclaims them. */
      rc = 7;
      goto out;
    }
  }

  for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
    unsigned long idx = i / batch_size;
    assert(idx < mappings_size);
    if (xenforeignmemory_unmap(handle, mappings[idx], batch_size)) {
      rc = 8;
      goto out;
    }
  }

  rc = EXIT_SUCCESS;

out:
  /* Single cleanup path: previously every early return leaked handle,
   * xc and mappings. free(NULL)/close-if-NULL guards make it safe to
   * reach from any point. */
  free(err);
  free(arr);
  free(mappings);
  if (xc)
    xc_interface_close(xc);
  if (handle)
    xenforeignmemory_close(handle);
  return rc;
}
EOF
$ cat >meson.build <<EOF
project('xfm_scale', 'c', default_options:['c_std=gnu11'])
xfm = dependency('xenforeignmemory')
xc = dependency('xencontrol')
executable('xfm_scale', 'main.c', dependencies: [xfm, xc])
EOF
Adding Juergen
Le 12/02/2026 à 18:50, Edwin Torok a écrit :
> Hi,
>
> When profiling a VM migration I noticed that on the sender side a significant (~45%) time is spent in PV Dom0 Linux taking an emulation fault in unmap_page_range [1]:
>
> The call comes from zap_pte_range (pgtable_64.h, inlined):
> ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>
> This has 2 implementations: native and a generic one where pte_clear is implemented using set_pte_at with a Xen pvop.
>
> As a proof of concept I’ve deleted the native implementation [2], which makes it fall back to the generic implementation [3].
> This is not necessarily safe on SMP (it reads and clears as 2 separate steps), but shows that a paravirt op is faster than emulation.
>
> To fix this we may need to partially revert https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cdd9c8931767e1c56a51a1078d33a8c340f4405
>
> To test this more easily I’ve written a small test program that maps/unmaps pages from a domain in Dom0 [4].
> Before: 3.26932 +- 0.00503 seconds time elapsed ( +- 0.15% )
> After: 0.75642 +- 0.00202 seconds time elapsed ( +- 0.27% )
>
> It is more than 4x faster to use the paravirt ops than trapping and emulating.
> From a functional point of view the above commit is correct, Xen doesn’t need a dedicated PV operation: trap and emulation works.
> But from a performance point of view I’d say that Xen does need it. This is a hot-path during migration, and it’d be worthwhile to optimise it.
>
> Just deleting the native implementation is probably not the solution, since we also want a value returned, and the existing PV operation is void.
> It probably needs a new PV operation (re)introduced with the desired semantics?
>
I can't find an MMUEXT hypercall which does roughly an xchg on the
pagetable. You can write an entry through MMU_NORMAL_PT_UPDATE, but you
cannot atomically read it before writing it (xchg).
Maybe there is a way to achieve it, but otherwise, it would need a new
dedicated hypercall and related code for Linux.
> Best regards,
> --Edwin
>
> [1]: https://cdn.jsdelivr.net/gh/edwintorok/xen@pmustack-coverletter/docs/tmp/migrate-send.svg?x=950.6&y=2197
>
> [2]:
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 690c0307afed..ab6318bb5676 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1097,13 +1097,9 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
> extern int ptep_clear_flush_young(struct vm_area_struct *vma,
> unsigned long address, pte_t *ptep);
>
> -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
> - pte_t *ptep)
> -{
> - pte_t pte = native_ptep_get_and_clear(ptep);
> - return pte;
> -}
> +static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
> + unsigned long address,
> + pte_t *ptep);
>
> #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
> static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
>
> [3]:
> #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
> static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
> unsigned long address,
> pte_t *ptep)
> {
> pte_t pte = *ptep;
> pte_clear(mm, address, ptep);
> return pte;
> }
> #endif
> static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte_at(mm, addr, ptep, __pte(0)); }
>
> [4]:
> $ cat >main.c <<EOF
> #include <assert.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <xenctrl.h>
> #include <xenforeignmemory.h>
>
> int main(int argc, char *argv[]) {
> if (argc != 3) {
> fprintf(stderr, "Usage: %s <domid> <batchsize>\n", argv[0]);
> return 1;
> }
> uint32_t domid = atoi(argv[1]);
> uint32_t batch_size = atoi(argv[2]);
>
> if (!domid || !batch_size) {
> fprintf(stderr, "Invalid arguments, expected 2 integers");
> return 2;
> }
>
> xenforeignmemory_handle *handle = xenforeignmemory_open(NULL, 0);
> if (!handle)
> return 3;
>
> xc_interface *xc = xc_interface_open(NULL, NULL, 0);
> if (!xc)
> return 4;
>
> xen_pfn_t nr_pfns;
> if (xc_domain_nr_gpfns(xc, domid, &nr_pfns) < 0)
> return 5;
>
> unsigned mappings_size = nr_pfns / batch_size + 1;
> void **mappings = calloc(mappings_size, sizeof(void *));
> if (!mappings) {
> perror("calloc");
> return 6;
> }
>
> for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
> xen_pfn_t arr[batch_size];
> int err[batch_size];
>
> for (unsigned j = 0; j < batch_size; j++)
> arr[j] = i + j;
>
> unsigned long idx = i / batch_size;
> assert(idx < mappings_size);
> mappings[idx] = xenforeignmemory_map(
> handle, domid, PROT_READ, sizeof(arr) / sizeof(arr[0]), arr, err);
> if (!mappings[idx])
> return 7;
> }
>
> for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
> unsigned long idx = i / batch_size;
> assert(idx < mappings_size);
> if (xenforeignmemory_unmap(handle, mappings[idx], batch_size))
> return 8;
> }
>
> free(mappings);
> xc_interface_close(xc);
> xenforeignmemory_close(handle);
>
> return EXIT_SUCCESS;
> }
> EOF
>
> $ cat >meson.build <<EOF
> project('xfm_scale', 'c', default_options:['c_std=gnu11'])
> xfm = dependency('xenforeignmemory')
> xc = dependency('xencontrol')
> executable('xfm_scale', 'main.c', dependencies: [xfm, xc])
> EOF
Teddy
--
Teddy Astie | Vates XCP-ng Developer
XCP-ng & Xen Orchestra - Vates solutions
web: https://vates.tech
© 2016 - 2026 Vates