[v1] arm: Implement ARM DEN 0057A PV time interface

[RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Koichiro Den 7 months, 3 weeks ago

The VCPUOP_register_runstate_memory_area hypercall is still actively
used, e.g., in the Linux arm64 codebase. When KPTI is enabled, the area
was not registered from the beginning due to the VA not always being
valid. In such cases, Linux falls back to using the standard PV time
interface (ARM DEN 0057A), but this interface has not been implemented
in Xen for ARM64 (until this commit).

The VCPUOP_register_runstate_phys_area was introduced, though it's
unclear whether this would be used in Linux arm64 and when it will be
prevalent amongst every possible downstream domain Linux variant. And of
course Linux is not an only option for the Xen arm64 domains.

Therefore, implementing the standard way of sharing PV time is
generically beneficial, avoiding reliance on specially crafted
hypercalls, the usage of which by guest VMs is not always guaranteed.
Note that the PV_TIME_ST interface communicates with IPA (GPA), not GVA.

Add the PV time interface according to ARM DEN 0057A.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 xen/arch/arm/domain.c                   | 30 +++++++++
 xen/arch/arm/domain_build.c             | 87 ++++++++++++++++++++++++-
 xen/arch/arm/include/asm/domain.h       | 17 +++++
 xen/arch/arm/include/asm/smccc.h        | 12 ++++
 xen/arch/arm/vsmc.c                     | 38 +++++++++++
 xen/common/device-tree/dom0less-build.c |  2 +-
 xen/include/xen/fdt-domain-build.h      |  2 +-
 7 files changed, 183 insertions(+), 5 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index be58a23dd725..e895e4111f1b 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -277,6 +277,18 @@ static void ctxt_switch_to(struct vcpu *n)
     WRITE_SYSREG(n->arch.mdcr_el2, MDCR_EL2);
 }
 
+static void update_stolen_time(struct vcpu *n)
+{
+    uint64_t tot_stolen;
+
+    if ( is_idle_vcpu(current) )
+        return;
+
+    tot_stolen = n->runstate.time[RUNSTATE_runnable] +
+                 n->runstate.time[RUNSTATE_offline];
+    write_atomic(&n->arch.pv_time_region->stolen_time, tot_stolen);
+}
+
 static void schedule_tail(struct vcpu *prev)
 {
     ASSERT(prev != current);
@@ -291,6 +303,8 @@ static void schedule_tail(struct vcpu *prev)
 
     update_runstate_area(current);
 
+    update_stolen_time(current);
+
     /* Ensure that the vcpu has an up-to-date time base. */
     update_vcpu_system_time(current);
 }
@@ -586,6 +600,8 @@ int arch_vcpu_create(struct vcpu *v)
     if ( get_ssbd_state() == ARM_SSBD_RUNTIME )
         v->arch.cpu_info->flags |= CPUINFO_WORKAROUND_2_FLAG;
 
+    v->arch.pv_time_region = &v->domain->arch.pv_time_regions[v->vcpu_id];
+
     return rc;
 
 fail:
@@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
                        unsigned int flags)
 {
     unsigned int count = 0;
+    int order;
     int rc;
 
     BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
@@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
     d->arch.sve_vl = config->arch.sve_vl;
 #endif
 
+    /*
+     * Preallocate the stolen time shared memory regions for all the
+     * possible vCPUs.
+     */
+    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));
+    d->arch.pv_time_regions_gfn = INVALID_GFN;
+    d->arch.pv_time_regions = alloc_xenheap_pages(order, 0);
+    if ( !d->arch.pv_time_regions ) {
+        rc = -ENOMEM;
+        goto fail;
+    }
+    memset(d->arch.pv_time_regions, 0, PAGE_SIZE << order);
+
     return 0;
 
 fail:
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 85b6909e2b0e..1c51b53d9c6b 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -1561,8 +1561,80 @@ int __init make_chosen_node(const struct kernel_info *kinfo)
     return res;
 }
 
-int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
-                                 int sizecells)
+int __init make_pv_time_resv_memory_node(struct domain *d,
+                                         const struct kernel_info *kinfo,
+                                         int addrcells, int sizecells)
+{
+    __be32 reg[GUEST_ROOT_ADDRESS_CELLS + GUEST_ROOT_SIZE_CELLS];
+    unsigned int len = (addrcells + sizecells) * sizeof(__be32);
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct membanks *unused_banks = NULL;
+    void *fdt = kinfo->fdt;
+    unsigned regions_len;
+    gfn_t pv_time_gfn;
+    mfn_t pv_time_mfn;
+    paddr_t aligned;
+    paddr_t avail;
+    __be32 *cells;
+    int res;
+    int i;
+
+    /* Find unused regions */
+    regions_len = PAGE_ALIGN(d->max_vcpus * 64);
+    unused_banks = membanks_xzalloc(NR_MEM_BANKS, MEMORY);
+    if ( !unused_banks )
+        return -ENOMEM;
+
+    res = find_unused_regions(d, kinfo, unused_banks);
+    if ( res ) {
+        printk(XENLOG_WARNING "%pd: failed to find unused regions\n", d);
+        goto fail;
+    }
+    for ( i = 0; i < unused_banks->nr_banks; i++ ) {
+        const struct membank *bank = &unused_banks->bank[i];
+        aligned = PAGE_ALIGN(bank->start);
+        avail = bank->size - (aligned - bank->start);
+        if ( avail >= regions_len )
+            break;
+    }
+    if ( i == unused_banks->nr_banks ) {
+        res = -ENOSPC;
+        goto fail;
+    }
+
+    /* Insert P2M entry */
+    pv_time_mfn = virt_to_mfn(d->arch.pv_time_regions);
+    pv_time_gfn = gaddr_to_gfn(aligned);
+    p2m_write_lock(p2m);
+    res = p2m_set_entry(p2m, pv_time_gfn, regions_len / PAGE_SIZE,
+                        pv_time_mfn, p2m_ram_rw, p2m_access_r);
+    p2m_write_unlock(p2m);
+    if ( res ) {
+        printk(XENLOG_WARNING "%pd: failed to set P2M entry for PV_TIME\n", d);
+        goto fail;
+    }
+    d->arch.pv_time_regions_gfn = pv_time_gfn;
+
+    /* Reserve the selected GFN */
+    res = domain_fdt_begin_node(fdt, "pv-time", gfn_x(pv_time_gfn));
+    if ( res )
+        goto fail;
+
+    cells = reg;
+    dt_child_set_range(&cells, addrcells, sizecells, gfn_x(pv_time_gfn), regions_len);
+    res = fdt_property(fdt, "reg", reg, len);
+    if ( res )
+        goto fail;
+
+    res = fdt_end_node(fdt);
+
+  fail:
+    xfree(unused_banks);
+    return res;
+}
+
+int __init make_resv_memory_node(struct domain *d, const struct kernel_info *kinfo,
+                                 int addrcells, int sizecells)
 {
     const struct membanks *mem = kernel_info_get_shm_mem_const(kinfo);
     void *fdt = kinfo->fdt;
@@ -1596,6 +1668,10 @@ int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
     if ( res )
         return res;
 
+    res = make_pv_time_resv_memory_node(d, kinfo, addrcells, sizecells);
+    if ( res )
+        return res;
+
     res = fdt_end_node(fdt);
 
     return res;
@@ -1744,6 +1820,11 @@ static int __init handle_node(struct domain *d, struct kernel_info *kinfo,
                                         dt_n_size_cells(node));
         if ( res )
             return res;
+
+        res = make_pv_time_resv_memory_node(d, kinfo, dt_n_addr_cells(node),
+                                            dt_n_size_cells(node));
+        if ( res )
+            return res;
     }
 
     for ( child = node->child; child != NULL; child = child->sibling )
@@ -1791,7 +1872,7 @@ static int __init handle_node(struct domain *d, struct kernel_info *kinfo,
 
         if ( !res_mem_node_found )
         {
-            res = make_resv_memory_node(kinfo, addrcells, sizecells);
+            res = make_resv_memory_node(d, kinfo, addrcells, sizecells);
             if ( res )
                 return res;
         }
diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
index a3487ca71303..c231c45fe40f 100644
--- a/xen/arch/arm/include/asm/domain.h
+++ b/xen/arch/arm/include/asm/domain.h
@@ -59,6 +59,18 @@ struct paging_domain {
     unsigned long p2m_total_pages;
 };
 
+/* Stolen time shared memory region (ARM DEN 0057A.b) */
+struct pv_time_region {
+    /* This field must be 0 as per ARM DEN 0057A.b */
+    uint32_t revision;
+
+    /* This field must be 0 as per ARM DEN 0057A.b */
+    uint32_t attribute;
+
+    /* Total stolen time in nanoseconds */
+    uint64_t stolen_time;
+} __aligned(64);
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -121,6 +133,9 @@ struct arch_domain
     void *tee;
 #endif
 
+    struct pv_time_region *pv_time_regions;
+    gfn_t pv_time_regions_gfn;
+
 }  __cacheline_aligned;
 
 struct arch_vcpu
@@ -243,6 +258,8 @@ struct arch_vcpu
      */
     bool need_flush_to_ram;
 
+    struct pv_time_region *pv_time_region;
+
 }  __cacheline_aligned;
 
 void vcpu_show_registers(struct vcpu *v);
diff --git a/xen/arch/arm/include/asm/smccc.h b/xen/arch/arm/include/asm/smccc.h
index a289c48b7ffd..6207ac74b715 100644
--- a/xen/arch/arm/include/asm/smccc.h
+++ b/xen/arch/arm/include/asm/smccc.h
@@ -380,6 +380,18 @@ void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args,
                        ARM_SMCCC_OWNER_ARCH,        \
                        0x3FFF)
 
+#define ARM_SMCCC_HYP_PV_TIME_FEATURES              \
+    ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,         \
+                       ARM_SMCCC_CONV_64,           \
+                       ARM_SMCCC_OWNER_HYPERVISOR,  \
+                       0x20)
+
+#define ARM_SMCCC_HYP_PV_TIME_ST                    \
+    ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,         \
+                       ARM_SMCCC_CONV_64,           \
+                       ARM_SMCCC_OWNER_HYPERVISOR,  \
+                       0x21)
+
 /* SMCCC error codes */
 #define ARM_SMCCC_NOT_REQUIRED          (-2)
 #define ARM_SMCCC_ERR_UNKNOWN_FUNCTION  (-1)
diff --git a/xen/arch/arm/vsmc.c b/xen/arch/arm/vsmc.c
index 6081f14ed0c1..1e2fbc1a62b4 100644
--- a/xen/arch/arm/vsmc.c
+++ b/xen/arch/arm/vsmc.c
@@ -12,6 +12,7 @@
 #include <public/arch-arm/smccc.h>
 #include <asm/cpuerrata.h>
 #include <asm/cpufeature.h>
+#include <asm/domain.h>
 #include <asm/monitor.h>
 #include <asm/regs.h>
 #include <asm/smccc.h>
@@ -127,6 +128,10 @@ static bool handle_arch(struct cpu_user_regs *regs)
             if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
                 ret = ARM_SMCCC_SUCCESS;
             break;
+        case ARM_SMCCC_HYP_PV_TIME_FEATURES:
+            if ( !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
+                ret = ARM_SMCCC_SUCCESS;
+            break;
         }
 
         set_user_reg(regs, 0, ret);
@@ -162,6 +167,35 @@ static bool handle_arch(struct cpu_user_regs *regs)
     return false;
 }
 
+static bool fill_pv_time_features(struct cpu_user_regs *regs)
+{
+    uint32_t arch_func_id = get_user_reg(regs, 1);
+    int ret = ARM_SMCCC_NOT_SUPPORTED;
+
+    if ( arch_func_id == ARM_SMCCC_HYP_PV_TIME_ST &&
+         !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
+        ret = ARM_SMCCC_SUCCESS;
+
+    set_user_reg(regs, 0, ret);
+
+    return true;
+}
+
+static bool fill_pv_time_st(struct cpu_user_regs *regs)
+{
+    register_t ret = ARM_SMCCC_NOT_SUPPORTED;
+    paddr_t gaddr;
+
+    if ( !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
+    {
+        gaddr = gfn_to_gaddr(current->domain->arch.pv_time_regions_gfn);
+        ret = gaddr + current->vcpu_id * sizeof(struct pv_time_region);
+    }
+
+    set_user_reg(regs, 0, ret);
+    return true;
+}
+
 /* SMCCC interface for hypervisor. Tell about itself. */
 static bool handle_hypervisor(struct cpu_user_regs *regs)
 {
@@ -176,6 +210,10 @@ static bool handle_hypervisor(struct cpu_user_regs *regs)
     case ARM_SMCCC_REVISION_FID(HYPERVISOR):
         return fill_revision(regs, XEN_SMCCC_MAJOR_REVISION,
                              XEN_SMCCC_MINOR_REVISION);
+    case ARM_SMCCC_HYP_PV_TIME_FEATURES:
+        return fill_pv_time_features(regs);
+    case ARM_SMCCC_HYP_PV_TIME_ST:
+        return fill_pv_time_st(regs);
     default:
         return false;
     }
diff --git a/xen/common/device-tree/dom0less-build.c b/xen/common/device-tree/dom0less-build.c
index 3d503c697337..fa31b1733388 100644
--- a/xen/common/device-tree/dom0less-build.c
+++ b/xen/common/device-tree/dom0less-build.c
@@ -502,7 +502,7 @@ static int __init prepare_dtb_domU(struct domain *d, struct kernel_info *kinfo)
     if ( ret )
         goto err;
 
-    ret = make_resv_memory_node(kinfo, addrcells, sizecells);
+    ret = make_resv_memory_node(d, kinfo, addrcells, sizecells);
     if ( ret )
         goto err;
 
diff --git a/xen/include/xen/fdt-domain-build.h b/xen/include/xen/fdt-domain-build.h
index e9418857e386..645e7d0a54aa 100644
--- a/xen/include/xen/fdt-domain-build.h
+++ b/xen/include/xen/fdt-domain-build.h
@@ -25,7 +25,7 @@ int construct_domain(struct domain *d, struct kernel_info *kinfo);
 int construct_hwdom(struct kernel_info *kinfo,
                     const struct dt_device_node *node);
 int make_chosen_node(const struct kernel_info *kinfo);
-int make_resv_memory_node(const struct kernel_info *kinfo,
+int make_resv_memory_node(struct domain *d, const struct kernel_info *kinfo,
                           int addrcells, int sizecells);
 int make_cpus_node(const struct domain *d, void *fdt);
 int make_hypervisor_node(struct domain *d, const struct kernel_info *kinfo,
-- 
2.48.1

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Julien Grall 7 months, 3 weeks ago

Hi Koichiro,

I haven't checked all the details. I will mainly comment on the Xen 
internals.

On 21/06/2025 16:12, Koichiro Den wrote:
> The VCPUOP_register_runstate_memory_area hypercall is still actively
> used, e.g., in the Linux arm64 codebase. When KPTI is enabled, the area
> was not registered from the beginning due to the VA not always being
> valid. In such cases, Linux falls back to using the standard PV time
> interface (ARM DEN 0057A), but this interface has not been implemented
> in Xen for ARM64 (until this commit).
> 
> The VCPUOP_register_runstate_phys_area was introduced, though it's
> unclear whether this would be used in Linux arm64 and when it will be
> prevalent amongst every possible downstream domain Linux variant. And of
> course Linux is not an only option for the Xen arm64 domains.
> 
> Therefore, implementing the standard way of sharing PV time is
> generically beneficial, avoiding reliance on specially crafted
> hypercalls, the usage of which by guest VMs is not always guaranteed.
> Note that the PV_TIME_ST interface communicates with IPA (GPA), not GVA.
> 
> Add the PV time interface according to ARM DEN 0057A.
> 
> Signed-off-by: Koichiro Den <den@valinux.co.jp>
> ---
>   xen/arch/arm/domain.c                   | 30 +++++++++
>   xen/arch/arm/domain_build.c             | 87 ++++++++++++++++++++++++-
>   xen/arch/arm/include/asm/domain.h       | 17 +++++
>   xen/arch/arm/include/asm/smccc.h        | 12 ++++
>   xen/arch/arm/vsmc.c                     | 38 +++++++++++
>   xen/common/device-tree/dom0less-build.c |  2 +-
>   xen/include/xen/fdt-domain-build.h      |  2 +-
>   7 files changed, 183 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index be58a23dd725..e895e4111f1b 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -277,6 +277,18 @@ static void ctxt_switch_to(struct vcpu *n)
>       WRITE_SYSREG(n->arch.mdcr_el2, MDCR_EL2);
>   }
>   
> +static void update_stolen_time(struct vcpu *n)
> +{
> +    uint64_t tot_stolen;
> +
> +    if ( is_idle_vcpu(current) )
> +        return;
> +
> +    tot_stolen = n->runstate.time[RUNSTATE_runnable] +
> +                 n->runstate.time[RUNSTATE_offline];
> +    write_atomic(&n->arch.pv_time_region->stolen_time, tot_stolen);
> +}
> +
>   static void schedule_tail(struct vcpu *prev)
>   {
>       ASSERT(prev != current);
> @@ -291,6 +303,8 @@ static void schedule_tail(struct vcpu *prev)
>   
>       update_runstate_area(current);
>   
> +    update_stolen_time(current);
> +
>       /* Ensure that the vcpu has an up-to-date time base. */
>       update_vcpu_system_time(current);
>   }
> @@ -586,6 +600,8 @@ int arch_vcpu_create(struct vcpu *v)
>       if ( get_ssbd_state() == ARM_SSBD_RUNTIME )
>           v->arch.cpu_info->flags |= CPUINFO_WORKAROUND_2_FLAG;
>   
> +    v->arch.pv_time_region = &v->domain->arch.pv_time_regions[v->vcpu_id];
> +
>       return rc;
>   
>   fail:
> @@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
>                          unsigned int flags)
>   {
>       unsigned int count = 0;
> +    int order;
>       int rc;
>   
>       BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
> @@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
>       d->arch.sve_vl = config->arch.sve_vl;
>   #endif
>   
> +    /*
> +     * Preallocate the stolen time shared memory regions for all the
> +     * possible vCPUs.
> +     */
> +    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));

As this is an order, we could end up to waste memory fairly quickly. So 
we should try to free the unused pages from the order. That said, the 
maximum number of virtual CPUs we currently support is 128. If I am not 
mistaken, this could fit in 2 4KB pages. So I would also be ok with a 
BUILD_BUG_ON(MAX_VIRT_CPUS <= 128) and we defer this work.

> +    d->arch.pv_time_regions_gfn = INVALID_GFN;

Does this mean PV time is optional? If so, shouldn't we allocate the 
memory conditionally?

Also, looking at the code below, you seem to cater domains created via 
dom0less but not from the toolstack. I think both should be supported 
for the PV time.

Lastly, you probably only want to allocate the memory for 64-bit domain 
as this is unusable for 32-bit domains.

> +    d->arch.pv_time_regions = alloc_xenheap_pages(order, 0);
 > +    if ( !d->arch.pv_time_regions ) {> +        rc = -ENOMEM;
> +        goto fail;
> +    }
> +    memset(d->arch.pv_time_regions, 0, PAGE_SIZE << order);
 > +>       return 0;
>   
>   fail:
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index 85b6909e2b0e..1c51b53d9c6b 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -1561,8 +1561,80 @@ int __init make_chosen_node(const struct kernel_info *kinfo)
>       return res;
>   }
>   
> -int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
> -                                 int sizecells)
> +int __init make_pv_time_resv_memory_node(struct domain *d,
> +                                         const struct kernel_info *kinfo,
> +                                         int addrcells, int sizecells)
> +{
> +    __be32 reg[GUEST_ROOT_ADDRESS_CELLS + GUEST_ROOT_SIZE_CELLS];
> +    unsigned int len = (addrcells + sizecells) * sizeof(__be32);
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    struct membanks *unused_banks = NULL;
> +    void *fdt = kinfo->fdt;
> +    unsigned regions_len;
> +    gfn_t pv_time_gfn;
> +    mfn_t pv_time_mfn;
> +    paddr_t aligned;
> +    paddr_t avail;
> +    __be32 *cells;
> +    int res;
> +    int i;
> +
> +    /* Find unused regions */
> +    regions_len = PAGE_ALIGN(d->max_vcpus * 64);

I would be best to avoid hardcoding the size of the region and use the 
size of struct pv_time_region.

> +    unused_banks = membanks_xzalloc(NR_MEM_BANKS, MEMORY);
> +    if ( !unused_banks )
> +        return -ENOMEM;
> +
> +    res = find_unused_regions(d, kinfo, unused_banks);
> +    if ( res ) {
> +        printk(XENLOG_WARNING "%pd: failed to find unused regions\n", d);
> +        goto fail;
> +    }
> +    for ( i = 0; i < unused_banks->nr_banks; i++ ) {
> +        const struct membank *bank = &unused_banks->bank[i];
> +        aligned = PAGE_ALIGN(bank->start);
> +        avail = bank->size - (aligned - bank->start);
> +        if ( avail >= regions_len )
> +            break;
> +    }
> +    if ( i == unused_banks->nr_banks ) {
> +        res = -ENOSPC;
> +        goto fail;
> +    }
> +
> +    /* Insert P2M entry */
> +    pv_time_mfn = virt_to_mfn(d->arch.pv_time_regions);
> +    pv_time_gfn = gaddr_to_gfn(aligned);
> +    p2m_write_lock(p2m);
> +    res = p2m_set_entry(p2m, pv_time_gfn, regions_len / PAGE_SIZE,
> +                        pv_time_mfn, p2m_ram_rw, p2m_access_r);

p2m_access_* are used for restricting temporarily permission by 
memaccess. So on a data abort, we will call p2m_mem_access_check() which 
will then forward the fault to the memaccess agent.

If you want to restrict permission forever, then you need to use a 
different p2m_type_t. In this case, I am guessing you want p2m_ram_ro.

[...]

> diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
> index a3487ca71303..c231c45fe40f 100644
> --- a/xen/arch/arm/include/asm/domain.h
> +++ b/xen/arch/arm/include/asm/domain.h
> @@ -59,6 +59,18 @@ struct paging_domain {
>       unsigned long p2m_total_pages;
>   };
>   
> +/* Stolen time shared memory region (ARM DEN 0057A.b) */
> +struct pv_time_region {
> +    /* This field must be 0 as per ARM DEN 0057A.b */
> +    uint32_t revision;
> +
> +    /* This field must be 0 as per ARM DEN 0057A.b */
> +    uint32_t attribute;
> +
> +    /* Total stolen time in nanoseconds */
> +    uint64_t stolen_time;
> +} __aligned(64);
> +
>   struct arch_domain
>   {
>   #ifdef CONFIG_ARM_64
> @@ -121,6 +133,9 @@ struct arch_domain
>       void *tee;
>   #endif
>   
> +    struct pv_time_region *pv_time_regions;
> +    gfn_t pv_time_regions_gfn;

Given the feature is 32-bit specific, shouldn't the field be protected 
with #define CONFIG_ARM_32?

Cheers,

-- 
Julien Grall

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Koichiro Den 7 months, 2 weeks ago

On Mon, Jun 23, 2025 at 09:41:47AM +0100, Julien Grall wrote:
> Hi Koichiro,
> 
> I haven't checked all the details. I will mainly comment on the Xen
> internals.

Hi Julien,

Thank you for the review, and apologies for my late response.

> 
---(snip)---
> > @@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
> >                          unsigned int flags)
> >   {
> >       unsigned int count = 0;
> > +    int order;
> >       int rc;
> >       BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
> > @@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
> >       d->arch.sve_vl = config->arch.sve_vl;
> >   #endif
> > +    /*
> > +     * Preallocate the stolen time shared memory regions for all the
> > +     * possible vCPUs.
> > +     */
> > +    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));
> 
> As this is an order, we could end up to waste memory fairly quickly. So we
> should try to free the unused pages from the order. That said, the maximum
> number of virtual CPUs we currently support is 128. If I am not mistaken,
> this could fit in 2 4KB pages. So I would also be ok with a
> BUILD_BUG_ON(MAX_VIRT_CPUS <= 128) and we defer this work.

I'll go with the former in the next iteration. Thanks!

> 
> > +    d->arch.pv_time_regions_gfn = INVALID_GFN;
> 
> Does this mean PV time is optional? If so, shouldn't we allocate the memory
> conditionally?
> 
> Also, looking at the code below, you seem to cater domains created via
> dom0less but not from the toolstack. I think both should be supported for
> the PV time.

Yes, that was intentional. I should've mentioned that this RFC series only
caters the dom0less case. For domains created dynamically via xl create, the
only viable solution I've found so far is to reserve PFN range(s) large enough
to cover the maximum possible number of toolstack-created domains on boot,
which I think would be too wasteful. Do you have any recommendations how to
reliably and dynamically allocating the shared region PFN(s) when using xl?

In any case, I agree that conditional allocation would be preferable.

> 
> Lastly, you probably only want to allocate the memory for 64-bit domain as
> this is unusable for 32-bit domains.
> 
> > +    d->arch.pv_time_regions = alloc_xenheap_pages(order, 0);
> > +    if ( !d->arch.pv_time_regions ) {> +        rc = -ENOMEM;
> > +        goto fail;
> > +    }
> > +    memset(d->arch.pv_time_regions, 0, PAGE_SIZE << order);
> > +>       return 0;
> >   fail:
> > diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> > index 85b6909e2b0e..1c51b53d9c6b 100644
> > --- a/xen/arch/arm/domain_build.c
> > +++ b/xen/arch/arm/domain_build.c
> > @@ -1561,8 +1561,80 @@ int __init make_chosen_node(const struct kernel_info *kinfo)
> >       return res;
> >   }
> > -int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
> > -                                 int sizecells)
> > +int __init make_pv_time_resv_memory_node(struct domain *d,
> > +                                         const struct kernel_info *kinfo,
> > +                                         int addrcells, int sizecells)
> > +{
> > +    __be32 reg[GUEST_ROOT_ADDRESS_CELLS + GUEST_ROOT_SIZE_CELLS];
> > +    unsigned int len = (addrcells + sizecells) * sizeof(__be32);
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> > +    struct membanks *unused_banks = NULL;
> > +    void *fdt = kinfo->fdt;
> > +    unsigned regions_len;
> > +    gfn_t pv_time_gfn;
> > +    mfn_t pv_time_mfn;
> > +    paddr_t aligned;
> > +    paddr_t avail;
> > +    __be32 *cells;
> > +    int res;
> > +    int i;
> > +
> > +    /* Find unused regions */
> > +    regions_len = PAGE_ALIGN(d->max_vcpus * 64);
> 
> I would be best to avoid hardcoding the size of the region and use the size
> of struct pv_time_region.

Thanks for catching it, I'll fix this in v2.

> 
> > +    unused_banks = membanks_xzalloc(NR_MEM_BANKS, MEMORY);
> > +    if ( !unused_banks )
> > +        return -ENOMEM;
> > +
> > +    res = find_unused_regions(d, kinfo, unused_banks);
> > +    if ( res ) {
> > +        printk(XENLOG_WARNING "%pd: failed to find unused regions\n", d);
> > +        goto fail;
> > +    }
> > +    for ( i = 0; i < unused_banks->nr_banks; i++ ) {
> > +        const struct membank *bank = &unused_banks->bank[i];
> > +        aligned = PAGE_ALIGN(bank->start);
> > +        avail = bank->size - (aligned - bank->start);
> > +        if ( avail >= regions_len )
> > +            break;
> > +    }
> > +    if ( i == unused_banks->nr_banks ) {
> > +        res = -ENOSPC;
> > +        goto fail;
> > +    }
> > +
> > +    /* Insert P2M entry */
> > +    pv_time_mfn = virt_to_mfn(d->arch.pv_time_regions);
> > +    pv_time_gfn = gaddr_to_gfn(aligned);
> > +    p2m_write_lock(p2m);
> > +    res = p2m_set_entry(p2m, pv_time_gfn, regions_len / PAGE_SIZE,
> > +                        pv_time_mfn, p2m_ram_rw, p2m_access_r);
> 
> p2m_access_* are used for restricting temporarily permission by memaccess.
> So on a data abort, we will call p2m_mem_access_check() which will then
> forward the fault to the memaccess agent.
> 
> If you want to restrict permission forever, then you need to use a different
> p2m_type_t. In this case, I am guessing you want p2m_ram_ro.

Thanks for pointing that out, I'll fix this in v2.

> 
> [...]
> 
> > diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
> > index a3487ca71303..c231c45fe40f 100644
> > --- a/xen/arch/arm/include/asm/domain.h
> > +++ b/xen/arch/arm/include/asm/domain.h
> > @@ -59,6 +59,18 @@ struct paging_domain {
> >       unsigned long p2m_total_pages;
> >   };
> > +/* Stolen time shared memory region (ARM DEN 0057A.b) */
> > +struct pv_time_region {
> > +    /* This field must be 0 as per ARM DEN 0057A.b */
> > +    uint32_t revision;
> > +
> > +    /* This field must be 0 as per ARM DEN 0057A.b */
> > +    uint32_t attribute;
> > +
> > +    /* Total stolen time in nanoseconds */
> > +    uint64_t stolen_time;
> > +} __aligned(64);
> > +
> >   struct arch_domain
> >   {
> >   #ifdef CONFIG_ARM_64
> > @@ -121,6 +133,9 @@ struct arch_domain
> >       void *tee;
> >   #endif
> > +    struct pv_time_region *pv_time_regions;
> > +    gfn_t pv_time_regions_gfn;
> 
> Given the feature is 32-bit specific, shouldn't the field be protected with
> #define CONFIG_ARM_32?

Is this typo s/32/64/? Assuming so, I'll do so (=protect them with #ifdef
CONFIG_ARM_64) in the next iteration. Thanks!

> 
> Cheers,
> 
> -- 
> Julien Grall
>

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Julien Grall 7 months, 2 weeks ago

Hi,

On 28/06/2025 14:58, Koichiro Den wrote:
> On Mon, Jun 23, 2025 at 09:41:47AM +0100, Julien Grall wrote:
> ---(snip)---
>>> @@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
>>>                           unsigned int flags)
>>>    {
>>>        unsigned int count = 0;
>>> +    int order;
>>>        int rc;
>>>        BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
>>> @@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
>>>        d->arch.sve_vl = config->arch.sve_vl;
>>>    #endif
>>> +    /*
>>> +     * Preallocate the stolen time shared memory regions for all the
>>> +     * possible vCPUs.
>>> +     */
>>> +    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));
>>
>> As this is an order, we could end up to waste memory fairly quickly. So we
>> should try to free the unused pages from the order. That said, the maximum
>> number of virtual CPUs we currently support is 128. If I am not mistaken,
>> this could fit in 2 4KB pages. So I would also be ok with a
>> BUILD_BUG_ON(MAX_VIRT_CPUS <= 128) and we defer this work.
> 
> I'll go with the former in the next iteration. Thanks!
> 
>>
>>> +    d->arch.pv_time_regions_gfn = INVALID_GFN;
>>
>> Does this mean PV time is optional? If so, shouldn't we allocate the memory
>> conditionally?
>>
>> Also, looking at the code below, you seem to cater domains created via
>> dom0less but not from the toolstack. I think both should be supported for
>> the PV time.
> 
> Yes, that was intentional. I should've mentioned that this RFC series only
> caters the dom0less case. For domains created dynamically via xl create, the
> only viable solution I've found so far is to reserve PFN range(s) large enough
> to cover the maximum possible number of toolstack-created domains on boot,
> which I think would be too wasteful.

AFAICT, with current MAX_VIRT_CPUS (128), we would only need to reserve 
8KB in the guest address space. We still have plenty of space that we 
can afford reserve 8KB (the layout is described in 
xen/include/public/arch-arm.h). I would suggest to allocate the region 
just before the grant-table (see GUEST_GNTTAB_BASE).

> In any case, I agree that conditional allocation would be preferable.

For XL, I would suggest to introduce a field flags in 
xen_arch_domainconfig and use one bit for enabling the PV time 
interface. The other bits would be reserved for the future (so we would 
need to check they are zeroes). You can have a look how "flags" in 
xen_domctl_createdomain is handled.

[...]

>>> diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
>>> index a3487ca71303..c231c45fe40f 100644
>>> --- a/xen/arch/arm/include/asm/domain.h
>>> +++ b/xen/arch/arm/include/asm/domain.h
>>> @@ -59,6 +59,18 @@ struct paging_domain {
>>>        unsigned long p2m_total_pages;
>>>    };
>>> +/* Stolen time shared memory region (ARM DEN 0057A.b) */
>>> +struct pv_time_region {
>>> +    /* This field must be 0 as per ARM DEN 0057A.b */
>>> +    uint32_t revision;
>>> +
>>> +    /* This field must be 0 as per ARM DEN 0057A.b */
>>> +    uint32_t attribute;
>>> +
>>> +    /* Total stolen time in nanoseconds */
>>> +    uint64_t stolen_time;
>>> +} __aligned(64);
>>> +
>>>    struct arch_domain
>>>    {
>>>    #ifdef CONFIG_ARM_64
>>> @@ -121,6 +133,9 @@ struct arch_domain
>>>        void *tee;
>>>    #endif
>>> +    struct pv_time_region *pv_time_regions;
>>> +    gfn_t pv_time_regions_gfn;
>>
>> Given the feature is 32-bit specific, shouldn't the field be protected with
>> #define CONFIG_ARM_32?
> 
> Is this typo s/32/64/? Assuming so, I'll do so (=protect them with #ifdef
> CONFIG_ARM_64) in the next iteration. Thanks!

Yes this is a typo.

Cheers,

-- 
Julien Grall

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Koichiro Den 7 months, 2 weeks ago

On Sat, Jun 28, 2025 at 06:36:02PM +0100, Julien Grall wrote:
> Hi,
> 
> On 28/06/2025 14:58, Koichiro Den wrote:
> > On Mon, Jun 23, 2025 at 09:41:47AM +0100, Julien Grall wrote:
> > ---(snip)---
> > > > @@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
> > > >                           unsigned int flags)
> > > >    {
> > > >        unsigned int count = 0;
> > > > +    int order;
> > > >        int rc;
> > > >        BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
> > > > @@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
> > > >        d->arch.sve_vl = config->arch.sve_vl;
> > > >    #endif
> > > > +    /*
> > > > +     * Preallocate the stolen time shared memory regions for all the
> > > > +     * possible vCPUs.
> > > > +     */
> > > > +    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));
> > > 
> > > As this is an order, we could end up to waste memory fairly quickly. So we
> > > should try to free the unused pages from the order. That said, the maximum
> > > number of virtual CPUs we currently support is 128. If I am not mistaken,
> > > this could fit in 2 4KB pages. So I would also be ok with a
> > > BUILD_BUG_ON(MAX_VIRT_CPUS <= 128) and we defer this work.
> > 
> > I'll go with the former in the next iteration. Thanks!
> > 
> > > 
> > > > +    d->arch.pv_time_regions_gfn = INVALID_GFN;
> > > 
> > > Does this mean PV time is optional? If so, shouldn't we allocate the memory
> > > conditionally?
> > > 
> > > Also, looking at the code below, you seem to cater domains created via
> > > dom0less but not from the toolstack. I think both should be supported for
> > > the PV time.
> > 
> > Yes, that was intentional. I should've mentioned that this RFC series only
> > caters the dom0less case. For domains created dynamically via xl create, the
> > only viable solution I've found so far is to reserve PFN range(s) large enough
> > to cover the maximum possible number of toolstack-created domains on boot,
> > which I think would be too wasteful.
> 
> AFAICT, with current MAX_VIRT_CPUS (128), we would only need to reserve 8KB
> in the guest address space. We still have plenty of space that we can afford
> reserve 8KB (the layout is described in xen/include/public/arch-arm.h). I
> would suggest to allocate the region just before the grant-table (see
> GUEST_GNTTAB_BASE).

That makes sense. So I'm now thinking of adding XENMAPSPACE_pv_time and a
new call site of xc_domain_add_to_physmap() to kick MFN allocations + P2M
setup for PV time shared regions of dynamically created domains.
Thank you for the review.

Koichiro Den

> 
> > In any case, I agree that conditional allocation would be preferable.
> 
> For XL, I would suggest to introduce a field flags in xen_arch_domainconfig
> and use one bit for enabling the PV time interface. The other bits would be
> reserved for the future (so we would need to check they are zeroes). You can
> have a look how "flags" in xen_domctl_createdomain is handled.
> 
> [...]
> 
> > > > diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
> > > > index a3487ca71303..c231c45fe40f 100644
> > > > --- a/xen/arch/arm/include/asm/domain.h
> > > > +++ b/xen/arch/arm/include/asm/domain.h
> > > > @@ -59,6 +59,18 @@ struct paging_domain {
> > > >        unsigned long p2m_total_pages;
> > > >    };
> > > > +/* Stolen time shared memory region (ARM DEN 0057A.b) */
> > > > +struct pv_time_region {
> > > > +    /* This field must be 0 as per ARM DEN 0057A.b */
> > > > +    uint32_t revision;
> > > > +
> > > > +    /* This field must be 0 as per ARM DEN 0057A.b */
> > > > +    uint32_t attribute;
> > > > +
> > > > +    /* Total stolen time in nanoseconds */
> > > > +    uint64_t stolen_time;
> > > > +} __aligned(64);
> > > > +
> > > >    struct arch_domain
> > > >    {
> > > >    #ifdef CONFIG_ARM_64
> > > > @@ -121,6 +133,9 @@ struct arch_domain
> > > >        void *tee;
> > > >    #endif
> > > > +    struct pv_time_region *pv_time_regions;
> > > > +    gfn_t pv_time_regions_gfn;
> > > 
> > > Given the feature is 32-bit specific, shouldn't the field be protected with
> > > #define CONFIG_ARM_32?
> > 
> > Is this typo s/32/64/? Assuming so, I'll do so (=protect them with #ifdef
> > CONFIG_ARM_64) in the next iteration. Thanks!
> 
> Yes this is a typo.
> 
> Cheers,
> 
> -- 
> Julien Grall
>

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Stefano Stabellini 7 months, 3 weeks ago

On Sun, 22 Jun 2025, Koichiro Den wrote:
> The VCPUOP_register_runstate_memory_area hypercall is still actively
> used, e.g., in the Linux arm64 codebase. When KPTI is enabled, the area
> was not registered from the beginning due to the VA not always being
> valid. In such cases, Linux falls back to using the standard PV time
> interface (ARM DEN 0057A), but this interface has not been implemented
> in Xen for ARM64 (until this commit).
> 
> The VCPUOP_register_runstate_phys_area was introduced, though it's
> unclear whether this would be used in Linux arm64 and when it will be
> prevalent amongst every possible downstream domain Linux variant. And of
> course Linux is not an only option for the Xen arm64 domains.
> 
> Therefore, implementing the standard way of sharing PV time is
> generically beneficial, avoiding reliance on specially crafted
> hypercalls, the usage of which by guest VMs is not always guaranteed.
> Note that the PV_TIME_ST interface communicates with IPA (GPA), not GVA.
> 
> Add the PV time interface according to ARM DEN 0057A.
> 
> Signed-off-by: Koichiro Den <den@valinux.co.jp>
> ---
>  xen/arch/arm/domain.c                   | 30 +++++++++
>  xen/arch/arm/domain_build.c             | 87 ++++++++++++++++++++++++-
>  xen/arch/arm/include/asm/domain.h       | 17 +++++
>  xen/arch/arm/include/asm/smccc.h        | 12 ++++
>  xen/arch/arm/vsmc.c                     | 38 +++++++++++
>  xen/common/device-tree/dom0less-build.c |  2 +-
>  xen/include/xen/fdt-domain-build.h      |  2 +-
>  7 files changed, 183 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index be58a23dd725..e895e4111f1b 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -277,6 +277,18 @@ static void ctxt_switch_to(struct vcpu *n)
>      WRITE_SYSREG(n->arch.mdcr_el2, MDCR_EL2);
>  }
>  
> +static void update_stolen_time(struct vcpu *n)
> +{
> +    uint64_t tot_stolen;
> +
> +    if ( is_idle_vcpu(current) )
> +        return;
> +
> +    tot_stolen = n->runstate.time[RUNSTATE_runnable] +
> +                 n->runstate.time[RUNSTATE_offline];
> +    write_atomic(&n->arch.pv_time_region->stolen_time, tot_stolen);
> +}
> +
>  static void schedule_tail(struct vcpu *prev)
>  {
>      ASSERT(prev != current);
> @@ -291,6 +303,8 @@ static void schedule_tail(struct vcpu *prev)
>  
>      update_runstate_area(current);
>  
> +    update_stolen_time(current);
> +
>      /* Ensure that the vcpu has an up-to-date time base. */
>      update_vcpu_system_time(current);
>  }
> @@ -586,6 +600,8 @@ int arch_vcpu_create(struct vcpu *v)
>      if ( get_ssbd_state() == ARM_SSBD_RUNTIME )
>          v->arch.cpu_info->flags |= CPUINFO_WORKAROUND_2_FLAG;
>  
> +    v->arch.pv_time_region = &v->domain->arch.pv_time_regions[v->vcpu_id];
> +
>      return rc;
>  
>  fail:
> @@ -707,6 +723,7 @@ int arch_domain_create(struct domain *d,
>                         unsigned int flags)
>  {
>      unsigned int count = 0;
> +    int order;

unsigned int


>      int rc;
>  
>      BUILD_BUG_ON(GUEST_MAX_VCPUS < MAX_VIRT_CPUS);
> @@ -791,6 +808,19 @@ int arch_domain_create(struct domain *d,
>      d->arch.sve_vl = config->arch.sve_vl;
>  #endif
>  
> +    /*
> +     * Preallocate the stolen time shared memory regions for all the
> +     * possible vCPUs.
> +     */
> +    order = get_order_from_bytes(d->max_vcpus * sizeof(struct pv_time_region));
> +    d->arch.pv_time_regions_gfn = INVALID_GFN;
> +    d->arch.pv_time_regions = alloc_xenheap_pages(order, 0);
> +    if ( !d->arch.pv_time_regions ) {
> +        rc = -ENOMEM;
> +        goto fail;
> +    }
> +    memset(d->arch.pv_time_regions, 0, PAGE_SIZE << order);
> +
>      return 0;
>  
>  fail:
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index 85b6909e2b0e..1c51b53d9c6b 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -1561,8 +1561,80 @@ int __init make_chosen_node(const struct kernel_info *kinfo)
>      return res;
>  }
>  
> -int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
> -                                 int sizecells)
> +int __init make_pv_time_resv_memory_node(struct domain *d,
> +                                         const struct kernel_info *kinfo,
> +                                         int addrcells, int sizecells)
> +{
> +    __be32 reg[GUEST_ROOT_ADDRESS_CELLS + GUEST_ROOT_SIZE_CELLS];
> +    unsigned int len = (addrcells + sizecells) * sizeof(__be32);
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    struct membanks *unused_banks = NULL;
> +    void *fdt = kinfo->fdt;
> +    unsigned regions_len;
> +    gfn_t pv_time_gfn;
> +    mfn_t pv_time_mfn;
> +    paddr_t aligned;
> +    paddr_t avail;
> +    __be32 *cells;
> +    int res;
> +    int i;

unsigned int i


> +    /* Find unused regions */
> +    regions_len = PAGE_ALIGN(d->max_vcpus * 64);
> +    unused_banks = membanks_xzalloc(NR_MEM_BANKS, MEMORY);
> +    if ( !unused_banks )
> +        return -ENOMEM;
> +
> +    res = find_unused_regions(d, kinfo, unused_banks);
> +    if ( res ) {
> +        printk(XENLOG_WARNING "%pd: failed to find unused regions\n", d);
> +        goto fail;
> +    }
> +    for ( i = 0; i < unused_banks->nr_banks; i++ ) {
> +        const struct membank *bank = &unused_banks->bank[i];
> +        aligned = PAGE_ALIGN(bank->start);
> +        avail = bank->size - (aligned - bank->start);
> +        if ( avail >= regions_len )
> +            break;
> +    }
> +    if ( i == unused_banks->nr_banks ) {
> +        res = -ENOSPC;
> +        goto fail;
> +    }
> +
> +    /* Insert P2M entry */
> +    pv_time_mfn = virt_to_mfn(d->arch.pv_time_regions);
> +    pv_time_gfn = gaddr_to_gfn(aligned);
> +    p2m_write_lock(p2m);
> +    res = p2m_set_entry(p2m, pv_time_gfn, regions_len / PAGE_SIZE,

PFN_DOW


> +                        pv_time_mfn, p2m_ram_rw, p2m_access_r);
> +    p2m_write_unlock(p2m);
> +    if ( res ) {
> +        printk(XENLOG_WARNING "%pd: failed to set P2M entry for PV_TIME\n", d);
> +        goto fail;
> +    }
> +    d->arch.pv_time_regions_gfn = pv_time_gfn;
> +
> +    /* Reserve the selected GFN */
> +    res = domain_fdt_begin_node(fdt, "pv-time", gfn_x(pv_time_gfn));
> +    if ( res )
> +        goto fail;
> +
> +    cells = reg;
> +    dt_child_set_range(&cells, addrcells, sizecells, gfn_x(pv_time_gfn), regions_len);
> +    res = fdt_property(fdt, "reg", reg, len);
> +    if ( res )
> +        goto fail;
> +
> +    res = fdt_end_node(fdt);
> +
> +  fail:
> +    xfree(unused_banks);
> +    return res;
> +}
> +
> +int __init make_resv_memory_node(struct domain *d, const struct kernel_info *kinfo,
> +                                 int addrcells, int sizecells)
>  {
>      const struct membanks *mem = kernel_info_get_shm_mem_const(kinfo);
>      void *fdt = kinfo->fdt;
> @@ -1596,6 +1668,10 @@ int __init make_resv_memory_node(const struct kernel_info *kinfo, int addrcells,
>      if ( res )
>          return res;
>  
> +    res = make_pv_time_resv_memory_node(d, kinfo, addrcells, sizecells);
> +    if ( res )
> +        return res;
> +
>      res = fdt_end_node(fdt);
>  
>      return res;
> @@ -1744,6 +1820,11 @@ static int __init handle_node(struct domain *d, struct kernel_info *kinfo,
>                                          dt_n_size_cells(node));
>          if ( res )
>              return res;
> +
> +        res = make_pv_time_resv_memory_node(d, kinfo, dt_n_addr_cells(node),
> +                                            dt_n_size_cells(node));
> +        if ( res )
> +            return res;
>      }
>  
>      for ( child = node->child; child != NULL; child = child->sibling )
> @@ -1791,7 +1872,7 @@ static int __init handle_node(struct domain *d, struct kernel_info *kinfo,
>  
>          if ( !res_mem_node_found )
>          {
> -            res = make_resv_memory_node(kinfo, addrcells, sizecells);
> +            res = make_resv_memory_node(d, kinfo, addrcells, sizecells);
>              if ( res )
>                  return res;
>          }
> diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
> index a3487ca71303..c231c45fe40f 100644
> --- a/xen/arch/arm/include/asm/domain.h
> +++ b/xen/arch/arm/include/asm/domain.h
> @@ -59,6 +59,18 @@ struct paging_domain {
>      unsigned long p2m_total_pages;
>  };
>  
> +/* Stolen time shared memory region (ARM DEN 0057A.b) */
> +struct pv_time_region {
> +    /* This field must be 0 as per ARM DEN 0057A.b */
> +    uint32_t revision;
> +
> +    /* This field must be 0 as per ARM DEN 0057A.b */
> +    uint32_t attribute;
> +
> +    /* Total stolen time in nanoseconds */
> +    uint64_t stolen_time;
> +} __aligned(64);
> +
>  struct arch_domain
>  {
>  #ifdef CONFIG_ARM_64
> @@ -121,6 +133,9 @@ struct arch_domain
>      void *tee;
>  #endif
>  
> +    struct pv_time_region *pv_time_regions;
> +    gfn_t pv_time_regions_gfn;
> +
>  }  __cacheline_aligned;
>  
>  struct arch_vcpu
> @@ -243,6 +258,8 @@ struct arch_vcpu
>       */
>      bool need_flush_to_ram;
>  
> +    struct pv_time_region *pv_time_region;
> +
>  }  __cacheline_aligned;
>  
>  void vcpu_show_registers(struct vcpu *v);
> diff --git a/xen/arch/arm/include/asm/smccc.h b/xen/arch/arm/include/asm/smccc.h
> index a289c48b7ffd..6207ac74b715 100644
> --- a/xen/arch/arm/include/asm/smccc.h
> +++ b/xen/arch/arm/include/asm/smccc.h
> @@ -380,6 +380,18 @@ void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args,
>                         ARM_SMCCC_OWNER_ARCH,        \
>                         0x3FFF)
>  
> +#define ARM_SMCCC_HYP_PV_TIME_FEATURES              \
> +    ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,         \
> +                       ARM_SMCCC_CONV_64,           \
> +                       ARM_SMCCC_OWNER_HYPERVISOR,  \
> +                       0x20)
> +
> +#define ARM_SMCCC_HYP_PV_TIME_ST                    \
> +    ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,         \
> +                       ARM_SMCCC_CONV_64,           \
> +                       ARM_SMCCC_OWNER_HYPERVISOR,  \
> +                       0x21)
> +
>  /* SMCCC error codes */
>  #define ARM_SMCCC_NOT_REQUIRED          (-2)
>  #define ARM_SMCCC_ERR_UNKNOWN_FUNCTION  (-1)
> diff --git a/xen/arch/arm/vsmc.c b/xen/arch/arm/vsmc.c
> index 6081f14ed0c1..1e2fbc1a62b4 100644
> --- a/xen/arch/arm/vsmc.c
> +++ b/xen/arch/arm/vsmc.c
> @@ -12,6 +12,7 @@
>  #include <public/arch-arm/smccc.h>
>  #include <asm/cpuerrata.h>
>  #include <asm/cpufeature.h>
> +#include <asm/domain.h>
>  #include <asm/monitor.h>
>  #include <asm/regs.h>
>  #include <asm/smccc.h>
> @@ -127,6 +128,10 @@ static bool handle_arch(struct cpu_user_regs *regs)
>              if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
>                  ret = ARM_SMCCC_SUCCESS;
>              break;
> +        case ARM_SMCCC_HYP_PV_TIME_FEATURES:
> +            if ( !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
> +                ret = ARM_SMCCC_SUCCESS;
> +            break;
>          }
>  
>          set_user_reg(regs, 0, ret);
> @@ -162,6 +167,35 @@ static bool handle_arch(struct cpu_user_regs *regs)
>      return false;
>  }
>  
> +static bool fill_pv_time_features(struct cpu_user_regs *regs)
> +{
> +    uint32_t arch_func_id = get_user_reg(regs, 1);
> +    int ret = ARM_SMCCC_NOT_SUPPORTED;
> +
> +    if ( arch_func_id == ARM_SMCCC_HYP_PV_TIME_ST &&
> +         !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
> +        ret = ARM_SMCCC_SUCCESS;
> +
> +    set_user_reg(regs, 0, ret);
> +
> +    return true;
> +}
> +
> +static bool fill_pv_time_st(struct cpu_user_regs *regs)
> +{
> +    register_t ret = ARM_SMCCC_NOT_SUPPORTED;
> +    paddr_t gaddr;
> +
> +    if ( !gfn_eq(current->domain->arch.pv_time_regions_gfn, INVALID_GFN) )
> +    {
> +        gaddr = gfn_to_gaddr(current->domain->arch.pv_time_regions_gfn);
> +        ret = gaddr + current->vcpu_id * sizeof(struct pv_time_region);
> +    }
> +
> +    set_user_reg(regs, 0, ret);
> +    return true;
> +}
> +
>  /* SMCCC interface for hypervisor. Tell about itself. */
>  static bool handle_hypervisor(struct cpu_user_regs *regs)
>  {
> @@ -176,6 +210,10 @@ static bool handle_hypervisor(struct cpu_user_regs *regs)
>      case ARM_SMCCC_REVISION_FID(HYPERVISOR):
>          return fill_revision(regs, XEN_SMCCC_MAJOR_REVISION,
>                               XEN_SMCCC_MINOR_REVISION);
> +    case ARM_SMCCC_HYP_PV_TIME_FEATURES:
> +        return fill_pv_time_features(regs);
> +    case ARM_SMCCC_HYP_PV_TIME_ST:
> +        return fill_pv_time_st(regs);

This file is common across 32-bit and 64-bit guests. Looking at the
implementation of fill_pv_time_st, which returns a 64-bit address on a
potentially 32-bit register, it looks like this interface is only meant
for 64-bit guests?

If so, then we should have a check here and return "not supported" for
32-bit callers.


>      default:
>          return false;
>      }
> diff --git a/xen/common/device-tree/dom0less-build.c b/xen/common/device-tree/dom0less-build.c
> index 3d503c697337..fa31b1733388 100644
> --- a/xen/common/device-tree/dom0less-build.c
> +++ b/xen/common/device-tree/dom0less-build.c
> @@ -502,7 +502,7 @@ static int __init prepare_dtb_domU(struct domain *d, struct kernel_info *kinfo)
>      if ( ret )
>          goto err;
>  
> -    ret = make_resv_memory_node(kinfo, addrcells, sizecells);
> +    ret = make_resv_memory_node(d, kinfo, addrcells, sizecells);
>      if ( ret )
>          goto err;
>  
> diff --git a/xen/include/xen/fdt-domain-build.h b/xen/include/xen/fdt-domain-build.h
> index e9418857e386..645e7d0a54aa 100644
> --- a/xen/include/xen/fdt-domain-build.h
> +++ b/xen/include/xen/fdt-domain-build.h
> @@ -25,7 +25,7 @@ int construct_domain(struct domain *d, struct kernel_info *kinfo);
>  int construct_hwdom(struct kernel_info *kinfo,
>                      const struct dt_device_node *node);
>  int make_chosen_node(const struct kernel_info *kinfo);
> -int make_resv_memory_node(const struct kernel_info *kinfo,
> +int make_resv_memory_node(struct domain *d, const struct kernel_info *kinfo,
>                            int addrcells, int sizecells);
>  int make_cpus_node(const struct domain *d, void *fdt);
>  int make_hypervisor_node(struct domain *d, const struct kernel_info *kinfo,
> -- 
> 2.48.1
>

Re: [RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A

Posted by Julien Grall 7 months, 3 weeks ago

Hi Stefano,

On 23/06/2025 00:26, Stefano Stabellini wrote:
> On Sun, 22 Jun 2025, Koichiro Den wrote:
  > If so, then we should have a check here and return "not supported" for
> 32-bit callers.

We already have a generic check to confirm 32-bit domain are not using 
the 64-bit convention. See [1]. So no need for a per-call one.

Cheers,

[1] 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/arm/vsmc.c;h=6081f14ed0c195306029c5aba7309bee44193fa4;hb=HEAD#l272

-- 
Julien Grall

[RFC PATCH 1/4] xen/arm: Add wrapper find_unused_regions
[RFC PATCH 2/4] xen/arm: Move make_hypervisor_node()
[RFC PATCH 3/4] xen/arm: Move make_resv_memory_node()
[RFC PATCH 4/4] xen/arm: Implement standard PV time interface as per ARM DEN 0057A