[v3] xen/riscv: introduce p2m functionality

[PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Oleksii Kurochko 3 months ago

Current implementation is based on x86's way to allocate VMIDs:
  VMIDs partition the physical TLB. In the current implementation VMIDs are
  introduced to reduce the number of TLB flushes.  Each time the guest's
  virtual address space changes, instead of flushing the TLB, a new VMID is
  assigned.  This reduces the number of TLB flushes to at most 1/#VMIDs.
  The biggest advantage is that hot parts of the hypervisor's code and data
  retain in the TLB.

  VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
  VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
  VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
  64-bit generation. Only on a generation overflow the code needs to
  invalidate all VMID information stored at the VCPUs with are run on the
  specific physical processor.  This overflow appears after about 2^80
  host processor cycles, so we do not optimize this case, but simply disable
  VMID useage to retain correctness.

Only minor changes are made compared to the x86 implementation.
These include using RISC-V-specific terminology, adding a check to ensure
the type used for storing the VMID has enough bits to hold VMIDLEN,
and introducing a new function vmidlen_detect() to clarify the VMIDLEN
value.

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V3:
 - Reimplemnt VMID allocation similar to what x86 has implemented.
---
Changes in V2:
 - New patch.
---
 xen/arch/riscv/Makefile               |   1 +
 xen/arch/riscv/include/asm/domain.h   |   6 +
 xen/arch/riscv/include/asm/flushtlb.h |   5 +
 xen/arch/riscv/include/asm/vmid.h     |   8 ++
 xen/arch/riscv/setup.c                |   3 +
 xen/arch/riscv/vmid.c                 | 165 ++++++++++++++++++++++++++
 6 files changed, 188 insertions(+)
 create mode 100644 xen/arch/riscv/include/asm/vmid.h
 create mode 100644 xen/arch/riscv/vmid.c

diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
index e2b8aa42c8..745a85e116 100644
--- a/xen/arch/riscv/Makefile
+++ b/xen/arch/riscv/Makefile
@@ -16,6 +16,7 @@ obj-y += smpboot.o
 obj-y += stubs.o
 obj-y += time.o
 obj-y += traps.o
+obj-y += vmid.o
 obj-y += vm_event.o
 
 $(TARGET): $(TARGET)-syms
diff --git a/xen/arch/riscv/include/asm/domain.h b/xen/arch/riscv/include/asm/domain.h
index c3d965a559..aac1040658 100644
--- a/xen/arch/riscv/include/asm/domain.h
+++ b/xen/arch/riscv/include/asm/domain.h
@@ -5,6 +5,11 @@
 #include <xen/xmalloc.h>
 #include <public/hvm/params.h>
 
+struct vcpu_vmid {
+    uint64_t generation;
+    uint16_t vmid;
+};
+
 struct hvm_domain
 {
     uint64_t              params[HVM_NR_PARAMS];
@@ -14,6 +19,7 @@ struct arch_vcpu_io {
 };
 
 struct arch_vcpu {
+    struct vcpu_vmid vmid;
 };
 
 struct arch_domain {
diff --git a/xen/arch/riscv/include/asm/flushtlb.h b/xen/arch/riscv/include/asm/flushtlb.h
index 51c8f753c5..f391ae6eb7 100644
--- a/xen/arch/riscv/include/asm/flushtlb.h
+++ b/xen/arch/riscv/include/asm/flushtlb.h
@@ -7,6 +7,11 @@
 
 #include <asm/sbi.h>
 
+static inline void local_hfence_gvma_all(void)
+{
+    asm volatile ( "hfence.gvma zero, zero" ::: "memory" );
+}
+
 /* Flush TLB of local processor for address va. */
 static inline void flush_tlb_one_local(vaddr_t va)
 {
diff --git a/xen/arch/riscv/include/asm/vmid.h b/xen/arch/riscv/include/asm/vmid.h
new file mode 100644
index 0000000000..2f1f7ec9a2
--- /dev/null
+++ b/xen/arch/riscv/include/asm/vmid.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef ASM_RISCV_VMID_H
+#define ASM_RISCV_VMID_H
+
+void vmid_init(void);
+
+#endif /* ASM_RISCV_VMID_H */
diff --git a/xen/arch/riscv/setup.c b/xen/arch/riscv/setup.c
index 483cdd7e17..549228d73f 100644
--- a/xen/arch/riscv/setup.c
+++ b/xen/arch/riscv/setup.c
@@ -25,6 +25,7 @@
 #include <asm/sbi.h>
 #include <asm/setup.h>
 #include <asm/traps.h>
+#include <asm/vmid.h>
 
 /* Xen stack for bringing up the first CPU. */
 unsigned char __initdata cpu0_boot_stack[STACK_SIZE]
@@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
 
     console_init_postirq();
 
+    vmid_init();
+
     printk("All set up\n");
 
     machine_halt();
diff --git a/xen/arch/riscv/vmid.c b/xen/arch/riscv/vmid.c
new file mode 100644
index 0000000000..7ad1b91ee2
--- /dev/null
+++ b/xen/arch/riscv/vmid.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/domain.h>
+#include <xen/init.h>
+#include <xen/sections.h>
+#include <xen/lib.h>
+#include <xen/param.h>
+#include <xen/percpu.h>
+
+#include <asm/atomic.h>
+#include <asm/csr.h>
+#include <asm/flushtlb.h>
+
+/* Xen command-line option to enable VMIDs */
+static bool __read_mostly opt_vmid_enabled = true;
+boolean_param("vmid", opt_vmid_enabled);
+
+/*
+ * VMIDs partition the physical TLB. In the current implementation VMIDs are
+ * introduced to reduce the number of TLB flushes.  Each time the guest's
+ * virtual address space changes, instead of flushing the TLB, a new VMID is
+ * assigned. This reduces the number of TLB flushes to at most 1/#VMIDs.
+ * The biggest advantage is that hot parts of the hypervisor's code and data
+ * retain in the TLB.
+ *
+ * Sketch of the Implementation:
+ *
+ * VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
+ * VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
+ * VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
+ * 64-bit generation. Only on a generation overflow the code needs to
+ * invalidate all VMID information stored at the VCPUs with are run on the
+ * specific physical processor.  This overflow appears after about 2^80
+ * host processor cycles, so we do not optimize this case, but simply disable
+ * VMID useage to retain correctness.
+ */
+
+/* Per-Hart VMID management. */
+struct vmid_data {
+   uint64_t hart_vmid_generation;
+   uint16_t next_vmid;
+   uint16_t max_vmid;
+   bool disabled;
+};
+
+static DEFINE_PER_CPU(struct vmid_data, vmid_data);
+
+static unsigned long vmidlen_detect(void)
+{
+    unsigned long vmid_bits;
+    unsigned long old;
+
+    /* Figure-out number of VMID bits in HW */
+    old = csr_read(CSR_HGATP);
+
+    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
+    vmid_bits = csr_read(CSR_HGATP);
+    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
+    vmid_bits = flsl(vmid_bits);
+    csr_write(CSR_HGATP, old);
+
+    /*
+     * We polluted local TLB so flush all guest TLB as
+     * a speculative access can happen at any time.
+     */
+    local_hfence_gvma_all();
+
+    return vmid_bits;
+}
+
+void vmid_init(void)
+{
+    static bool g_disabled = false;
+
+    unsigned long vmid_len = vmidlen_detect();
+    struct vmid_data *data = &this_cpu(vmid_data);
+    unsigned long max_availalbe_bits = sizeof(data->max_vmid) << 3;
+
+    if ( vmid_len > max_availalbe_bits )
+        panic("%s: VMIDLEN is bigger then a type which represent VMID: %lu(%lu)\n",
+              __func__, vmid_len, max_availalbe_bits);
+
+    data->max_vmid = BIT(vmid_len, U) - 1;
+    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
+
+    if ( g_disabled != data->disabled )
+    {
+        printk("%s: VMIDs %sabled.\n", __func__,
+               data->disabled ? "dis" : "en");
+        if ( !g_disabled )
+            g_disabled = data->disabled;
+    }
+
+    /* Zero indicates 'invalid generation', so we start the count at one. */
+    data->hart_vmid_generation = 1;
+
+    /* Zero indicates 'VMIDs disabled', so we start the count at one. */
+    data->next_vmid = 1;
+}
+
+void vcpu_vmid_flush_vcpu(struct vcpu *v)
+{
+    write_atomic(&v->arch.vmid.generation, 0);
+}
+
+void vmid_flush_hart(void)
+{
+    struct vmid_data *data = &this_cpu(vmid_data);
+
+    if ( data->disabled )
+        return;
+
+    if ( likely(++data->hart_vmid_generation != 0) )
+        return;
+
+    /*
+     * VMID generations are 64 bit.  Overflow of generations never happens.
+     * For safety, we simply disable ASIDs, so correctness is established; it
+     * only runs a bit slower.
+     */
+    printk("%s: VMID generation overrun. Disabling VMIDs.\n", __func__);
+    data->disabled = 1;
+}
+
+bool vmid_handle_vmenter(struct vcpu_vmid *vmid)
+{
+    struct vmid_data *data = &this_cpu(vmid_data);
+
+    /* Test if VCPU has valid VMID. */
+    if ( read_atomic(&vmid->generation) == data->hart_vmid_generation )
+        return 0;
+
+    /* If there are no free VMIDs, need to go to a new generation. */
+    if ( unlikely(data->next_vmid > data->max_vmid) )
+    {
+        vmid_flush_hart();
+        data->next_vmid = 1;
+        if ( data->disabled )
+            goto disabled;
+    }
+
+    /* Now guaranteed to be a free VMID. */
+    vmid->vmid = data->next_vmid++;
+    write_atomic(&vmid->generation, data->hart_vmid_generation);
+
+    /*
+     * When we assign VMID 1, flush all TLB entries as we are starting a new
+     * generation, and all old VMID allocations are now stale.
+     */
+    return (vmid->vmid == 1);
+
+ disabled:
+    vmid->vmid = 0;
+    return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
2.50.1

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Jan Beulich 2 months, 3 weeks ago

On 31.07.2025 17:58, Oleksii Kurochko wrote:
> Current implementation is based on x86's way to allocate VMIDs:
>   VMIDs partition the physical TLB. In the current implementation VMIDs are
>   introduced to reduce the number of TLB flushes.  Each time the guest's
>   virtual address space changes,

virtual?

> instead of flushing the TLB, a new VMID is
>   assigned.  This reduces the number of TLB flushes to at most 1/#VMIDs.
>   The biggest advantage is that hot parts of the hypervisor's code and data
>   retain in the TLB.
> 
>   VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
>   VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
>   VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
>   64-bit generation. Only on a generation overflow the code needs to
>   invalidate all VMID information stored at the VCPUs with are run on the
>   specific physical processor.  This overflow appears after about 2^80
>   host processor cycles,

Where's this number coming from? (If you provide numbers, I think they will
want to be "reproducible" by the reader. Which I fear isn't the case here.)

> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>  
>      console_init_postirq();
>  
> +    vmid_init();

This lives here only temporarily, I assume? Every hart will need to execute
it, and hence (like we have it on x86) this may want to be a central place
elsewhere.

> --- /dev/null
> +++ b/xen/arch/riscv/vmid.c
> @@ -0,0 +1,165 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <xen/domain.h>
> +#include <xen/init.h>
> +#include <xen/sections.h>
> +#include <xen/lib.h>
> +#include <xen/param.h>
> +#include <xen/percpu.h>
> +
> +#include <asm/atomic.h>
> +#include <asm/csr.h>
> +#include <asm/flushtlb.h>
> +
> +/* Xen command-line option to enable VMIDs */
> +static bool __read_mostly opt_vmid_enabled = true;

__ro_after_init ?

> +boolean_param("vmid", opt_vmid_enabled);
> +
> +/*
> + * VMIDs partition the physical TLB. In the current implementation VMIDs are
> + * introduced to reduce the number of TLB flushes.  Each time the guest's
> + * virtual address space changes, instead of flushing the TLB, a new VMID is

The same odd "virtual" again? All the code here is about guest-physical, isn't
it?

> + * assigned. This reduces the number of TLB flushes to at most 1/#VMIDs.
> + * The biggest advantage is that hot parts of the hypervisor's code and data
> + * retain in the TLB.
> + *
> + * Sketch of the Implementation:
> + *
> + * VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
> + * VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
> + * VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
> + * 64-bit generation. Only on a generation overflow the code needs to
> + * invalidate all VMID information stored at the VCPUs with are run on the
> + * specific physical processor.  This overflow appears after about 2^80

And the same interesting number again.

> + * host processor cycles, so we do not optimize this case, but simply disable
> + * VMID useage to retain correctness.
> + */
> +
> +/* Per-Hart VMID management. */
> +struct vmid_data {
> +   uint64_t hart_vmid_generation;

Any reason not to simply use "generation"?

> +   uint16_t next_vmid;
> +   uint16_t max_vmid;
> +   bool disabled;
> +};
> +
> +static DEFINE_PER_CPU(struct vmid_data, vmid_data);
> +
> +static unsigned long vmidlen_detect(void)

__init ? Or wait, are you (deliberately) permitting different VMIDLEN
across harts?

> +{
> +    unsigned long vmid_bits;

Why "long" (also for the function return type)?

> +    unsigned long old;
> +
> +    /* Figure-out number of VMID bits in HW */
> +    old = csr_read(CSR_HGATP);
> +
> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
> +    vmid_bits = csr_read(CSR_HGATP);
> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);

Nit: Stray blank.

> +    vmid_bits = flsl(vmid_bits);
> +    csr_write(CSR_HGATP, old);
> +
> +    /*
> +     * We polluted local TLB so flush all guest TLB as
> +     * a speculative access can happen at any time.
> +     */
> +    local_hfence_gvma_all();

There's no guest running. If you wrote hgat.MODE as zero, as per my
understanding now new TLB entries could even purely theoretically appear.
In fact, with no guest running (yet) I'm having a hard time seeing why
you shouldn't be able to simply write the register with just
HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
whether "old" needs restoring; writing plain zero afterwards ought to
suffice. You're in charcge of the register, after all.

> +    return vmid_bits;
> +}
> +
> +void vmid_init(void)
> +{
> +    static bool g_disabled = false;

> +    unsigned long vmid_len = vmidlen_detect();
> +    struct vmid_data *data = &this_cpu(vmid_data);
> +    unsigned long max_availalbe_bits = sizeof(data->max_vmid) << 3;

Nit: Typo in "available". Also now that we have it, better use
BITS_PER_BYTE here?

> +    if ( vmid_len > max_availalbe_bits )
> +        panic("%s: VMIDLEN is bigger then a type which represent VMID: %lu(%lu)\n",
> +              __func__, vmid_len, max_availalbe_bits);

This shouldn't be a runtime check imo. What you want to check (at build
time) is that the bits set in HGATP_VMID_MASK can be held in ->max_vmid.

> +    data->max_vmid = BIT(vmid_len, U) - 1;
> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);

Actually, what exactly does it mean that "VMIDs are disabled"? There's
no enable bit that I could find anywhere. Isn't it rather that in this
case you need to arrange to flush always on VM entry (or always after a
P2M change, depending how the TLB is split between guest and host use)?
If you look at vmx_vmenter_helper(), its flipping of
SECONDARY_EXEC_ENABLE_VPID tweaks CPU behavior, such that the flush
would be implicit (when the bit is off). I don't expect RISC-V has any
such "implicit" flushing behavior?

> +    if ( g_disabled != data->disabled )
> +    {
> +        printk("%s: VMIDs %sabled.\n", __func__,
> +               data->disabled ? "dis" : "en");
> +        if ( !g_disabled )
> +            g_disabled = data->disabled;

This doesn't match x86 code. g_disabled is a tristate there, which only
the boot CPU would ever write to.

A clear shortcoming of the x86 code (that you copied) is that the log
message doesn't identify the CPU in question. A sequence of "disabled"
and "enabled" could thus result, without the last one (or in fact any
one) making clear what the overall state is. I think you want to avoid
this from the beginning.

Jan

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Oleksii Kurochko 2 months, 3 weeks ago

On 8/4/25 5:19 PM, Jan Beulich wrote:
> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>> Current implementation is based on x86's way to allocate VMIDs:
>>    VMIDs partition the physical TLB. In the current implementation VMIDs are
>>    introduced to reduce the number of TLB flushes.  Each time the guest's
>>    virtual address space changes,
> virtual?

I assumed that originally it meant that from Xen point of view it could be
called guest's virtual as guest doesn't really work with real physical address,
but it seems like it would be more clear to use guest-physical as you suggested
below.

>> instead of flushing the TLB, a new VMID is
>>    assigned.  This reduces the number of TLB flushes to at most 1/#VMIDs.
>>    The biggest advantage is that hot parts of the hypervisor's code and data
>>    retain in the TLB.
>>
>>    VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
>>    VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
>>    VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
>>    64-bit generation. Only on a generation overflow the code needs to
>>    invalidate all VMID information stored at the VCPUs with are run on the
>>    specific physical processor.  This overflow appears after about 2^80
>>    host processor cycles,
> Where's this number coming from? (If you provide numbers, I think they will
> want to be "reproducible" by the reader. Which I fear isn't the case here.)

The 2^80 cycles (based on x86-related numbers) result from:
   1. And VM-Entry/-Exit cycle takes at least 1800 cycles (approximated by 2^10)
   2. We have 64 ASIDs (2^6)
   3. 2^64 generations.
I removed this part of the comment earlier because I assumed that the first
item is quite CPU-dependent, even for x86, let alone for other architectures,
which may have a different number (?).
However, this part of the comment was reintroduced during one of the merges.

>> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>>   
>>       console_init_postirq();
>>   
>> +    vmid_init();
> This lives here only temporarily, I assume? Every hart will need to execute
> it, and hence (like we have it on x86) this may want to be a central place
> elsewhere.

I haven’t checked how it is done on x86; I probably should.

I planned to call it for each hart separately during secondary hart bring-up,
since accessing the|hgatp| register of a hart is required to detect|VMIDLEN|.
Therefore,|vmid_init()| should be called for secondary harts when their
initialization code starts executing.

>> --- /dev/null
>> +++ b/xen/arch/riscv/vmid.c
>> @@ -0,0 +1,165 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +
>> +#include <xen/domain.h>
>> +#include <xen/init.h>
>> +#include <xen/sections.h>
>> +#include <xen/lib.h>
>> +#include <xen/param.h>
>> +#include <xen/percpu.h>
>> +
>> +#include <asm/atomic.h>
>> +#include <asm/csr.h>
>> +#include <asm/flushtlb.h>
>> +
>> +/* Xen command-line option to enable VMIDs */
>> +static bool __read_mostly opt_vmid_enabled = true;
> __ro_after_init ?

Agree, __ro_afer_init would be better.

>> +boolean_param("vmid", opt_vmid_enabled);
>> +
>> +/*
>> + * VMIDs partition the physical TLB. In the current implementation VMIDs are
>> + * introduced to reduce the number of TLB flushes.  Each time the guest's
>> + * virtual address space changes, instead of flushing the TLB, a new VMID is
> The same odd "virtual" again? All the code here is about guest-physical, isn't
> it?

Answered above.

>> + * assigned. This reduces the number of TLB flushes to at most 1/#VMIDs.
>> + * The biggest advantage is that hot parts of the hypervisor's code and data
>> + * retain in the TLB.
>> + *
>> + * Sketch of the Implementation:
>> + *
>> + * VMIDs are a hart-local resource.  As preemption of VMIDs is not possible,
>> + * VMIDs are assigned in a round-robin scheme.  To minimize the overhead of
>> + * VMID invalidation, at the time of a TLB flush, VMIDs are tagged with a
>> + * 64-bit generation. Only on a generation overflow the code needs to
>> + * invalidate all VMID information stored at the VCPUs with are run on the
>> + * specific physical processor.  This overflow appears after about 2^80
> And the same interesting number again.

Answered above.

>> + * host processor cycles, so we do not optimize this case, but simply disable
>> + * VMID useage to retain correctness.
>> + */
>> +
>> +/* Per-Hart VMID management. */
>> +struct vmid_data {
>> +   uint64_t hart_vmid_generation;
> Any reason not to simply use "generation"?

No specific reason for that, it could be renamed to "generation".

>> +   uint16_t next_vmid;
>> +   uint16_t max_vmid;
>> +   bool disabled;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct vmid_data, vmid_data);
>> +
>> +static unsigned long vmidlen_detect(void)
> __init ? Or wait, are you (deliberately) permitting different VMIDLEN
> across harts?

All what I was able in RISC-V spec is that:
   The number of VMID bits is UNSPECIFIED and may be zero. The number of
   implemented VMID bits, termed VMIDLEN, may be determined by writing one
   to every bit position in the VMID field, then reading back the value in
   hgatp to see which bit positions in the VMID field hold a one. The least-
   significant bits of VMID are implemented first: that is, if VMIDLEN > 0,
   VMID[VMIDLEN-1:0] is writable. The maximal value of VMIDLEN, termed
   VMIDMAX, is 7 for Sv32x4 or 14 for Sv39x4, Sv48x4, and Sv57x4..
And I couldn't find explicitly that VMIDLEN will be the same across harts.

Therefore, IMO, while the specification doesn't guarantee VMID will be
different, the "unspecified" nature and the per-hart discovery mechanism
of VMIDLEN in the hgatp CSR allows for VMIDLEN to be different on
different harts in an implementation without violating the
RISC-V privileged specification.

>
>> +{
>> +    unsigned long vmid_bits;
> Why "long" (also for the function return type)?

Because csr_read() returns unsigned long as HGATP register has
'unsigned long' length.

But it could be done in this way:
     csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
     vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
     vmid_bits = ffs_g(vmid_bits);
     csr_write(CSR_HGATP, old);
And then use uint16_t for vmid_bits and use uin16_t as a return type.

>
>> +    unsigned long old;
>> +
>> +    /* Figure-out number of VMID bits in HW */
>> +    old = csr_read(CSR_HGATP);
>> +
>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>> +    vmid_bits = csr_read(CSR_HGATP);
>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
> Nit: Stray blank.
>
>> +    vmid_bits = flsl(vmid_bits);
>> +    csr_write(CSR_HGATP, old);
>> +
>> +    /*
>> +     * We polluted local TLB so flush all guest TLB as
>> +     * a speculative access can happen at any time.
>> +     */
>> +    local_hfence_gvma_all();
> There's no guest running. If you wrote hgat.MODE as zero, as per my
> understanding now new TLB entries could even purely theoretically appear.

It could be an issue (or, at least, it is recommended) when hgatp.MODE is
changed:
  If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
  (and rs2 set to either x0 or the VMID) must be executed to order subsequent
  guest translations with the MODE change—even if the old MODE or new MODE
  is Bare.
On other hand it is guaranteed that, at least, on Reset (and so I assume
for power on) that:
  If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
  fields are reset to 0.

So it seems like if no guest is ran then there is no need even to write
hgatp.MODE as zero, but it might be sense to do that explicitly just to
be sure.

I thought it was possible to have a running guest and perform a CPU hotplug.
In that case, I expect that during the hotplug,|vmidlen_detect()| will be
called and return the|vmid_bits| value, which is used as the active VMID.
At that moment, the local TLB could be speculatively polluted, I think.
Likely, it makes sense to call vmidlen_detect() only once for each hart
during initial bringup.

> In fact, with no guest running (yet) I'm having a hard time seeing why
> you shouldn't be able to simply write the register with just
> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
> whether "old" needs restoring; writing plain zero afterwards ought to
> suffice. You're in charcge of the register, after all.

It make sense (but I don't know if it is a possible case) to be sure that
HGATP.MODE remains the same, so there is no need to have TLB flush. If
HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
above.

If we agreed to keep local_hfence_gvma_all() then I think it isn't really
any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.

Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
to check that in vmidlen_detect() and panic if it isn't zero) and if
vmidlen_detect() function will be called before any guest domain(s) will
be ran then I could agree that we don't need local_hfence_gvma_all() here.

As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
set to zero.

Does it make sense?

>
>> +    return vmid_bits;
>> +}
>> +
>> +void vmid_init(void)
>> +{
>> +    static bool g_disabled = false;
>> +    unsigned long vmid_len = vmidlen_detect();
>> +    struct vmid_data *data = &this_cpu(vmid_data);
>> +    unsigned long max_availalbe_bits = sizeof(data->max_vmid) << 3;
> Nit: Typo in "available". Also now that we have it, better use
> BITS_PER_BYTE here?

Sure, I will use BITS_PER_BYTE.

>
>> +    if ( vmid_len > max_availalbe_bits )
>> +        panic("%s: VMIDLEN is bigger then a type which represent VMID: %lu(%lu)\n",
>> +              __func__, vmid_len, max_availalbe_bits);
> This shouldn't be a runtime check imo. What you want to check (at build
> time) is that the bits set in HGATP_VMID_MASK can be held in ->max_vmid.

Oh, I just noticed that this check isn't even really correct because of
data->max_vmid is inited after this check.

Anyway, build time check would be better.

>
>> +    data->max_vmid = BIT(vmid_len, U) - 1;
>> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
> Actually, what exactly does it mean that "VMIDs are disabled"? There's
> no enable bit that I could find anywhere. Isn't it rather that in this
> case you need to arrange to flush always on VM entry (or always after a
> P2M change, depending how the TLB is split between guest and host use)?

"VMIDs are disabled" here means that TLB flush will happen each time p2m
is changed.

> If you look at vmx_vmenter_helper(), its flipping of
> SECONDARY_EXEC_ENABLE_VPID tweaks CPU behavior, such that the flush
> would be implicit (when the bit is off). I don't expect RISC-V has any
> such "implicit" flushing behavior?

RISC-V relies on explicit software-managed fence instructions for TLB
flushing.

It seems like vmid_handle_vmenter() should be updated then to return
true if VMIDs are disabled:
   bool vmid_handle_vmenter(struct vcpu_vmid *vmid)
   {
       struct vmid_data *data = &this_cpu(vmid_data);

   ...

       /*
        * When we assign VMID 1, flush all TLB entries as we are starting a new
        * generation, and all old VMID allocations are now stale.
        */
       return (vmid->vmid == 1);

    disabled:
       vmid->vmid = 0;
-      return 0;
+      return true;
   }

>
>> +    if ( g_disabled != data->disabled )
>> +    {
>> +        printk("%s: VMIDs %sabled.\n", __func__,
>> +               data->disabled ? "dis" : "en");
>> +        if ( !g_disabled )
>> +            g_disabled = data->disabled;
> This doesn't match x86 code. g_disabled is a tristate there, which only
> the boot CPU would ever write to.

Why g_disabled is written only by boot CPU? Does x86 have only two options
or VMIDs are enabled for all CPUs or it is disabled for all of them?

For RISC-V as I mentioned above it is needed to check all harts as the spec.
doesn't explicitly mention that VMIDLEN is equal for all harts...

>
> A clear shortcoming of the x86 code (that you copied) is that the log
> message doesn't identify the CPU in question. A sequence of "disabled"
> and "enabled" could thus result, without the last one (or in fact any
> one) making clear what the overall state is. I think you want to avoid
> this from the beginning.

... Thereby it seems like declaration of g_disabled should be moved outside
vmid_init() function and add a new function which will return g_disabled
value (or just make g_disabled not static and rename to something like
g_vmids_disabled).

And the print message once after all harts will be initialized, somewhere
in setup.c in start_xen() after:

for_each_present_cpu ( i )
{
if( (num_online_cpus()< nr_cpu_ids)&& !cpu_online(i))
{
intret = cpu_up(i);
if( ret != 0)
printk("Failed to bring up CPU %u (error %d)\n", i, ret);
}
}
(RISC-V doesn't have such code at the moment)

~ Oleksii

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Jan Beulich 2 months, 3 weeks ago

On 06.08.2025 13:33, Oleksii Kurochko wrote:
> On 8/4/25 5:19 PM, Jan Beulich wrote:
>> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>>> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>>>   
>>>       console_init_postirq();
>>>   
>>> +    vmid_init();
>> This lives here only temporarily, I assume? Every hart will need to execute
>> it, and hence (like we have it on x86) this may want to be a central place
>> elsewhere.
> 
> I haven’t checked how it is done on x86; I probably should.
> 
> I planned to call it for each hart separately during secondary hart bring-up,
> since accessing the|hgatp| register of a hart is required to detect|VMIDLEN|.
> Therefore,|vmid_init()| should be called for secondary harts when their
> initialization code starts executing.

But is this going to be the only per-hart thing that will need doing? Otherwise
the same larger "container" function may want calling instead.

>>> +static unsigned long vmidlen_detect(void)
>> __init ? Or wait, are you (deliberately) permitting different VMIDLEN
>> across harts?
> 
> All what I was able in RISC-V spec is that:
>    The number of VMID bits is UNSPECIFIED and may be zero. The number of
>    implemented VMID bits, termed VMIDLEN, may be determined by writing one
>    to every bit position in the VMID field, then reading back the value in
>    hgatp to see which bit positions in the VMID field hold a one. The least-
>    significant bits of VMID are implemented first: that is, if VMIDLEN > 0,
>    VMID[VMIDLEN-1:0] is writable. The maximal value of VMIDLEN, termed
>    VMIDMAX, is 7 for Sv32x4 or 14 for Sv39x4, Sv48x4, and Sv57x4..
> And I couldn't find explicitly that VMIDLEN will be the same across harts.
> 
> Therefore, IMO, while the specification doesn't guarantee VMID will be
> different, the "unspecified" nature and the per-hart discovery mechanism
> of VMIDLEN in the hgatp CSR allows for VMIDLEN to be different on
> different harts in an implementation without violating the
> RISC-V privileged specification.

Okay, since that's easily feasible with the present implementation, why not
keep it like that then.

>>> +{
>>> +    unsigned long vmid_bits;
>> Why "long" (also for the function return type)?
> 
> Because csr_read() returns unsigned long as HGATP register has
> 'unsigned long' length.

Oh, right, I should have commented on the function return type only.
Yet then I also can't resist stating that this kind of use of a variable,
which initially is assigned a value that doesn't really fit its name, is
easily misleading towards giving such comments.

> But it could be done in this way:
>      csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>      vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
>      vmid_bits = ffs_g(vmid_bits);
>      csr_write(CSR_HGATP, old);
> And then use uint16_t for vmid_bits and use uin16_t as a return type.

Please check ./CODING_STYLE again as to the use of fixed-width types.

>>> +    unsigned long old;
>>> +
>>> +    /* Figure-out number of VMID bits in HW */
>>> +    old = csr_read(CSR_HGATP);
>>> +
>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>> +    vmid_bits = csr_read(CSR_HGATP);
>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>> Nit: Stray blank.
>>
>>> +    vmid_bits = flsl(vmid_bits);
>>> +    csr_write(CSR_HGATP, old);
>>> +
>>> +    /*
>>> +     * We polluted local TLB so flush all guest TLB as
>>> +     * a speculative access can happen at any time.
>>> +     */
>>> +    local_hfence_gvma_all();
>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>> understanding now new TLB entries could even purely theoretically appear.
> 
> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
> changed:
>   If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>   (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>   guest translations with the MODE change—even if the old MODE or new MODE
>   is Bare.
> On other hand it is guaranteed that, at least, on Reset (and so I assume
> for power on) that:
>   If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>   fields are reset to 0.
> 
> So it seems like if no guest is ran then there is no need even to write
> hgatp.MODE as zero, but it might be sense to do that explicitly just to
> be sure.
> 
> I thought it was possible to have a running guest and perform a CPU hotplug.

But that guest will run on another hart.

> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
> called and return the|vmid_bits| value, which is used as the active VMID.
> At that moment, the local TLB could be speculatively polluted, I think.
> Likely, it makes sense to call vmidlen_detect() only once for each hart
> during initial bringup.

That may bring you more problems than it solves. You'd need to stash away
the value originally read somewhere. And that somewhere isn't per-CPU data.

>> In fact, with no guest running (yet) I'm having a hard time seeing why
>> you shouldn't be able to simply write the register with just
>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>> whether "old" needs restoring; writing plain zero afterwards ought to
>> suffice. You're in charcge of the register, after all.
> 
> It make sense (but I don't know if it is a possible case) to be sure that
> HGATP.MODE remains the same, so there is no need to have TLB flush. If
> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
> above.
> 
> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
> 
> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
> to check that in vmidlen_detect() and panic if it isn't zero) and if
> vmidlen_detect() function will be called before any guest domain(s) will
> be ran then I could agree that we don't need local_hfence_gvma_all() here.
> 
> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
> set to zero.
> 
> Does it make sense?

Well - I'd like the pre-conditions to be understood better. For example, can
a hart really speculate into guest mode, when the hart is only in the
process of being brought up?

>>> +    data->max_vmid = BIT(vmid_len, U) - 1;
>>> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
>> Actually, what exactly does it mean that "VMIDs are disabled"? There's
>> no enable bit that I could find anywhere. Isn't it rather that in this
>> case you need to arrange to flush always on VM entry (or always after a
>> P2M change, depending how the TLB is split between guest and host use)?
> 
> "VMIDs are disabled" here means that TLB flush will happen each time p2m
> is changed.

That's better described as "VMIDs aren't used" then?

>>> +    if ( g_disabled != data->disabled )
>>> +    {
>>> +        printk("%s: VMIDs %sabled.\n", __func__,
>>> +               data->disabled ? "dis" : "en");
>>> +        if ( !g_disabled )
>>> +            g_disabled = data->disabled;
>> This doesn't match x86 code. g_disabled is a tristate there, which only
>> the boot CPU would ever write to.
> 
> Why g_disabled is written only by boot CPU? Does x86 have only two options
> or VMIDs are enabled for all CPUs or it is disabled for all of them?

Did you look at the x86 code again, or the patch that I sent for it?

> For RISC-V as I mentioned above it is needed to check all harts as the spec.
> doesn't explicitly mention that VMIDLEN is equal for all harts...

Even if in practice x86 systems are symmetric in this regard, you will
have seen that we support varying values there as well. Up to and
including ASIDs being in use on some CPUs, but not on others. So that
code can serve as a reference for you, I think.

>> A clear shortcoming of the x86 code (that you copied) is that the log
>> message doesn't identify the CPU in question. A sequence of "disabled"
>> and "enabled" could thus result, without the last one (or in fact any
>> one) making clear what the overall state is. I think you want to avoid
>> this from the beginning.
> 
> ... Thereby it seems like declaration of g_disabled should be moved outside
> vmid_init() function and add a new function which will return g_disabled
> value (or just make g_disabled not static and rename to something like
> g_vmids_disabled).

No, why? While I didn't Cc you on my patch submission, I specifically
replied to it with you (alone) on the To: list, just so you can look
there first before suggesting (sorry) odd things.

Jan

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Oleksii Kurochko 2 months, 3 weeks ago

On 8/6/25 2:05 PM, Jan Beulich wrote:
> On 06.08.2025 13:33, Oleksii Kurochko wrote:
>> On 8/4/25 5:19 PM, Jan Beulich wrote:
>>> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>>>> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>>>>    
>>>>        console_init_postirq();
>>>>    
>>>> +    vmid_init();
>>> This lives here only temporarily, I assume? Every hart will need to execute
>>> it, and hence (like we have it on x86) this may want to be a central place
>>> elsewhere.
>> I haven’t checked how it is done on x86; I probably should.
>>
>> I planned to call it for each hart separately during secondary hart bring-up,
>> since accessing the|hgatp| register of a hart is required to detect|VMIDLEN|.
>> Therefore,|vmid_init()| should be called for secondary harts when their
>> initialization code starts executing.
> But is this going to be the only per-hart thing that will need doing? Otherwise
> the same larger "container" function may want calling instead.

Yes, it is going to be the only per-hart operation.

There is|__cpu_up()| (not yet upstreamed [1]), which calls
|sbi_hsm_hart_start(hartid, boot_addr, hsm_data)| to start a hart, and I planned
to place|vmid_init()| somewhere in the code executed at|boot_addr|.

[1]https://gitlab.com/xen-project/people/olkur/xen/-/blob/latest/xen/arch/riscv/smpboot.c#L40

>>>> +{
>>>> +    unsigned long vmid_bits;
>>> Why "long" (also for the function return type)?
>> Because csr_read() returns unsigned long as HGATP register has
>> 'unsigned long' length.
> Oh, right, I should have commented on the function return type only.
> Yet then I also can't resist stating that this kind of use of a variable,
> which initially is assigned a value that doesn't really fit its name, is
> easily misleading towards giving such comments.
>
>> But it could be done in this way:
>>       csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>       vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
>>       vmid_bits = ffs_g(vmid_bits);
>>       csr_write(CSR_HGATP, old);
>> And then use uint16_t for vmid_bits and use uin16_t as a return type.
> Please check ./CODING_STYLE again as to the use of fixed-width types.

I meant unsigned short, uint16_t was just short to write. I'll try to be
more specific.

>
>>>> +    unsigned long old;
>>>> +
>>>> +    /* Figure-out number of VMID bits in HW */
>>>> +    old = csr_read(CSR_HGATP);
>>>> +
>>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>> +    vmid_bits = csr_read(CSR_HGATP);
>>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>>> Nit: Stray blank.
>>>
>>>> +    vmid_bits = flsl(vmid_bits);
>>>> +    csr_write(CSR_HGATP, old);
>>>> +
>>>> +    /*
>>>> +     * We polluted local TLB so flush all guest TLB as
>>>> +     * a speculative access can happen at any time.
>>>> +     */
>>>> +    local_hfence_gvma_all();
>>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>>> understanding now new TLB entries could even purely theoretically appear.
>> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
>> changed:
>>    If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>>    (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>>    guest translations with the MODE change—even if the old MODE or new MODE
>>    is Bare.
>> On other hand it is guaranteed that, at least, on Reset (and so I assume
>> for power on) that:
>>    If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>>    fields are reset to 0.
>>
>> So it seems like if no guest is ran then there is no need even to write
>> hgatp.MODE as zero, but it might be sense to do that explicitly just to
>> be sure.
>>
>> I thought it was possible to have a running guest and perform a CPU hotplug.
> But that guest will run on another hart.
>
>> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
>> called and return the|vmid_bits| value, which is used as the active VMID.
>> At that moment, the local TLB could be speculatively polluted, I think.
>> Likely, it makes sense to call vmidlen_detect() only once for each hart
>> during initial bringup.
> That may bring you more problems than it solves. You'd need to stash away
> the value originally read somewhere. And that somewhere isn't per-CPU data.
>
>>> In fact, with no guest running (yet) I'm having a hard time seeing why
>>> you shouldn't be able to simply write the register with just
>>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>>> whether "old" needs restoring; writing plain zero afterwards ought to
>>> suffice. You're in charcge of the register, after all.
>> It make sense (but I don't know if it is a possible case) to be sure that
>> HGATP.MODE remains the same, so there is no need to have TLB flush. If
>> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
>> above.
>>
>> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
>> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
>>
>> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
>> to check that in vmidlen_detect() and panic if it isn't zero) and if
>> vmidlen_detect() function will be called before any guest domain(s) will
>> be ran then I could agree that we don't need local_hfence_gvma_all() here.
>>
>> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
>> set to zero.
>>
>> Does it make sense?
> Well - I'd like the pre-conditions to be understood better. For example, can
> a hart really speculate into guest mode, when the hart is only in the
> process of being brought up?

I couldn't explicit words that a hart can't speculate into guest mode
either on bring up or during its work.

But there are some moments in the spec which tells:
   Implementations with virtual memory are permitted to perform address
   translations speculatively and earlier than required by an explicit
   memory access, and are permitted to cache them in address translation
   cache structures—including possibly caching the identity mappings from
   effective address to physical address used in Bare translation modes and
   M-mode.
And here:
   Implementations may also execute the address-translation algorithm
   speculatively at any time, for any virtual address, as long as satp is
   active (as defined in Section 10.1.11). Such speculative executions have
   the effect of pre-populating the address-translation cache.
Where it is explicitly mentioned that speculation can happen in *any time*.
And at the same time:
   Speculative executions of the address-translation algorithm behave as
   non-speculative executions of the algorithm do, except that they must
   not set the dirty bit for a PTE, they must not trigger an exception,
   and they must not create address-translation cache entries if those
   entries would have been invalidated by any SFENCE.VMA instruction
   executed by the hart since the speculative execution of the algorithm began.
What I read as if TLB was empty before it will stay empty.

Also, despite of the fact here it is mentioned that when V=0 two-stage address
translation is inactivated:
   The current virtualization mode, denoted V, indicates whether the hart is
   currently executing in a guest. When V=1, the hart is either in virtual
   S-mode (VS-mode), or in virtual U-mode (VU-mode) atop a guest OS running
   in VS-mode. When V=0, the hart is either in M-mode, in HS-mode, or in
   U-mode atop an OS running in HS-mode. The virtualization mode also
   indicates whether two-stage address translation is active (V=1) or
   inactive (V=0).
But on the same side, writing to hgatp register activates it:
   The hgatp register is considered active for the purposes of
   the address-translation algorithm unless the effective privilege mode
   is U and hstatus.HU=0.
And if so + considering that speculation could happen at any time, and
we are in HS-mode, not it U mode then I would say that it could really
speculate into guest mode.


>
>>>> +    data->max_vmid = BIT(vmid_len, U) - 1;
>>>> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
>>> Actually, what exactly does it mean that "VMIDs are disabled"? There's
>>> no enable bit that I could find anywhere. Isn't it rather that in this
>>> case you need to arrange to flush always on VM entry (or always after a
>>> P2M change, depending how the TLB is split between guest and host use)?
>> "VMIDs are disabled" here means that TLB flush will happen each time p2m
>> is changed.
> That's better described as "VMIDs aren't used" then?

It sounds a little bit just like an opposite to "disabled" (i.e. means
basically the same), but I am okay to use "used" instead.

>
>>>> +    if ( g_disabled != data->disabled )
>>>> +    {
>>>> +        printk("%s: VMIDs %sabled.\n", __func__,
>>>> +               data->disabled ? "dis" : "en");
>>>> +        if ( !g_disabled )
>>>> +            g_disabled = data->disabled;
>>> This doesn't match x86 code. g_disabled is a tristate there, which only
>>> the boot CPU would ever write to.
>> Why g_disabled is written only by boot CPU? Does x86 have only two options
>> or VMIDs are enabled for all CPUs or it is disabled for all of them?
> Did you look at the x86 code again, or the patch that I sent for it?
>
>> For RISC-V as I mentioned above it is needed to check all harts as the spec.
>> doesn't explicitly mention that VMIDLEN is equal for all harts...
> Even if in practice x86 systems are symmetric in this regard, you will
> have seen that we support varying values there as well. Up to and
> including ASIDs being in use on some CPUs, but not on others. So that
> code can serve as a reference for you, I think.
>
>>> A clear shortcoming of the x86 code (that you copied) is that the log
>>> message doesn't identify the CPU in question. A sequence of "disabled"
>>> and "enabled" could thus result, without the last one (or in fact any
>>> one) making clear what the overall state is. I think you want to avoid
>>> this from the beginning.
>> ... Thereby it seems like declaration of g_disabled should be moved outside
>> vmid_init() function and add a new function which will return g_disabled
>> value (or just make g_disabled not static and rename to something like
>> g_vmids_disabled).
> No, why? While I didn't Cc you on my patch submission, I specifically
> replied to it with you (alone) on the To: list, just so you can look
> there first before suggesting (sorry) odd things.

I haven't received this e-mail, but I was able to find it on lore.kernel.org.
Thanks.

~ Oleksii

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Jan Beulich 2 months, 3 weeks ago

On 06.08.2025 18:24, Oleksii Kurochko wrote:
> On 8/6/25 2:05 PM, Jan Beulich wrote:
>> On 06.08.2025 13:33, Oleksii Kurochko wrote:
>>> On 8/4/25 5:19 PM, Jan Beulich wrote:
>>>> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>>>>> +{
>>>>> +    unsigned long vmid_bits;
>>>> Why "long" (also for the function return type)?
>>> Because csr_read() returns unsigned long as HGATP register has
>>> 'unsigned long' length.
>> Oh, right, I should have commented on the function return type only.
>> Yet then I also can't resist stating that this kind of use of a variable,
>> which initially is assigned a value that doesn't really fit its name, is
>> easily misleading towards giving such comments.
>>
>>> But it could be done in this way:
>>>       csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>       vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
>>>       vmid_bits = ffs_g(vmid_bits);
>>>       csr_write(CSR_HGATP, old);
>>> And then use uint16_t for vmid_bits and use uin16_t as a return type.
>> Please check ./CODING_STYLE again as to the use of fixed-width types.
> 
> I meant unsigned short, uint16_t was just short to write. I'll try to be
> more specific.

I'd also recommend against unsigned short when there are no space concerns.
unsigned int is what wants using in the general case.

>>>>> +    unsigned long old;
>>>>> +
>>>>> +    /* Figure-out number of VMID bits in HW */
>>>>> +    old = csr_read(CSR_HGATP);
>>>>> +
>>>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>>> +    vmid_bits = csr_read(CSR_HGATP);
>>>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>>>> Nit: Stray blank.
>>>>
>>>>> +    vmid_bits = flsl(vmid_bits);
>>>>> +    csr_write(CSR_HGATP, old);
>>>>> +
>>>>> +    /*
>>>>> +     * We polluted local TLB so flush all guest TLB as
>>>>> +     * a speculative access can happen at any time.
>>>>> +     */
>>>>> +    local_hfence_gvma_all();
>>>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>>>> understanding now new TLB entries could even purely theoretically appear.
>>> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
>>> changed:
>>>    If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>>>    (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>>>    guest translations with the MODE change—even if the old MODE or new MODE
>>>    is Bare.
>>> On other hand it is guaranteed that, at least, on Reset (and so I assume
>>> for power on) that:
>>>    If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>>>    fields are reset to 0.
>>>
>>> So it seems like if no guest is ran then there is no need even to write
>>> hgatp.MODE as zero, but it might be sense to do that explicitly just to
>>> be sure.
>>>
>>> I thought it was possible to have a running guest and perform a CPU hotplug.
>> But that guest will run on another hart.
>>
>>> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
>>> called and return the|vmid_bits| value, which is used as the active VMID.
>>> At that moment, the local TLB could be speculatively polluted, I think.
>>> Likely, it makes sense to call vmidlen_detect() only once for each hart
>>> during initial bringup.
>> That may bring you more problems than it solves. You'd need to stash away
>> the value originally read somewhere. And that somewhere isn't per-CPU data.
>>
>>>> In fact, with no guest running (yet) I'm having a hard time seeing why
>>>> you shouldn't be able to simply write the register with just
>>>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>>>> whether "old" needs restoring; writing plain zero afterwards ought to
>>>> suffice. You're in charcge of the register, after all.
>>> It make sense (but I don't know if it is a possible case) to be sure that
>>> HGATP.MODE remains the same, so there is no need to have TLB flush. If
>>> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
>>> above.
>>>
>>> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
>>> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
>>>
>>> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
>>> to check that in vmidlen_detect() and panic if it isn't zero) and if
>>> vmidlen_detect() function will be called before any guest domain(s) will
>>> be ran then I could agree that we don't need local_hfence_gvma_all() here.
>>>
>>> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
>>> set to zero.
>>>
>>> Does it make sense?
>> Well - I'd like the pre-conditions to be understood better. For example, can
>> a hart really speculate into guest mode, when the hart is only in the
>> process of being brought up?
> 
> I couldn't explicit words that a hart can't speculate into guest mode
> either on bring up or during its work.
> 
> But there are some moments in the spec which tells:
>    Implementations with virtual memory are permitted to perform address
>    translations speculatively and earlier than required by an explicit
>    memory access, and are permitted to cache them in address translation
>    cache structures—including possibly caching the identity mappings from
>    effective address to physical address used in Bare translation modes and
>    M-mode.
> And here:
>    Implementations may also execute the address-translation algorithm
>    speculatively at any time, for any virtual address, as long as satp is
>    active (as defined in Section 10.1.11). Such speculative executions have
>    the effect of pre-populating the address-translation cache.

That's satp though, not hgatp.

> Where it is explicitly mentioned that speculation can happen in *any time*.
> And at the same time:
>    Speculative executions of the address-translation algorithm behave as
>    non-speculative executions of the algorithm do, except that they must
>    not set the dirty bit for a PTE, they must not trigger an exception,
>    and they must not create address-translation cache entries if those
>    entries would have been invalidated by any SFENCE.VMA instruction
>    executed by the hart since the speculative execution of the algorithm began.
> What I read as if TLB was empty before it will stay empty.
> 
> Also, despite of the fact here it is mentioned that when V=0 two-stage address
> translation is inactivated:
>    The current virtualization mode, denoted V, indicates whether the hart is
>    currently executing in a guest. When V=1, the hart is either in virtual
>    S-mode (VS-mode), or in virtual U-mode (VU-mode) atop a guest OS running
>    in VS-mode. When V=0, the hart is either in M-mode, in HS-mode, or in
>    U-mode atop an OS running in HS-mode. The virtualization mode also
>    indicates whether two-stage address translation is active (V=1) or
>    inactive (V=0).
> But on the same side, writing to hgatp register activates it:
>    The hgatp register is considered active for the purposes of
>    the address-translation algorithm unless the effective privilege mode
>    is U and hstatus.HU=0.
> And if so + considering that speculation could happen at any time, and
> we are in HS-mode, not it U mode then I would say that it could really
> speculate into guest mode.

Hmm, that leaves some things to be desired. What I'm particularly puzzled
by is that there's nothing said either way towards speculation through SRET.
That's the important aspect here aiui, because without that the hart can't
speculate into guest mode.

But yes, in the absence of any clear indication to the contrary, I think
you want to keep the local_hfence_gvma_all() (with a suitable comment).

>>>>> +    data->max_vmid = BIT(vmid_len, U) - 1;
>>>>> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
>>>> Actually, what exactly does it mean that "VMIDs are disabled"? There's
>>>> no enable bit that I could find anywhere. Isn't it rather that in this
>>>> case you need to arrange to flush always on VM entry (or always after a
>>>> P2M change, depending how the TLB is split between guest and host use)?
>>> "VMIDs are disabled" here means that TLB flush will happen each time p2m
>>> is changed.
>> That's better described as "VMIDs aren't used" then?
> 
> It sounds a little bit just like an opposite to "disabled" (i.e. means
> basically the same), but I am okay to use "used" instead.

If you want to stick to using "disabled", then how about "VMID use is
disabled"? (You probably meanwhile understood that what I'm after is it
becoming clear that this is a software decision, not something you can
enforce in hardware.)

Jan

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Oleksii Kurochko 2 months, 3 weeks ago

On 8/7/25 12:11 PM, Jan Beulich wrote:
>>>>>> +    unsigned long old;
>>>>>> +
>>>>>> +    /* Figure-out number of VMID bits in HW */
>>>>>> +    old = csr_read(CSR_HGATP);
>>>>>> +
>>>>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>>>> +    vmid_bits = csr_read(CSR_HGATP);
>>>>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>>>>> Nit: Stray blank.
>>>>>
>>>>>> +    vmid_bits = flsl(vmid_bits);
>>>>>> +    csr_write(CSR_HGATP, old);
>>>>>> +
>>>>>> +    /*
>>>>>> +     * We polluted local TLB so flush all guest TLB as
>>>>>> +     * a speculative access can happen at any time.
>>>>>> +     */
>>>>>> +    local_hfence_gvma_all();
>>>>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>>>>> understanding now new TLB entries could even purely theoretically appear.
>>>> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
>>>> changed:
>>>>     If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>>>>     (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>>>>     guest translations with the MODE change—even if the old MODE or new MODE
>>>>     is Bare.
>>>> On other hand it is guaranteed that, at least, on Reset (and so I assume
>>>> for power on) that:
>>>>     If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>>>>     fields are reset to 0.
>>>>
>>>> So it seems like if no guest is ran then there is no need even to write
>>>> hgatp.MODE as zero, but it might be sense to do that explicitly just to
>>>> be sure.
>>>>
>>>> I thought it was possible to have a running guest and perform a CPU hotplug.
>>> But that guest will run on another hart.
>>>
>>>> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
>>>> called and return the|vmid_bits| value, which is used as the active VMID.
>>>> At that moment, the local TLB could be speculatively polluted, I think.
>>>> Likely, it makes sense to call vmidlen_detect() only once for each hart
>>>> during initial bringup.
>>> That may bring you more problems than it solves. You'd need to stash away
>>> the value originally read somewhere. And that somewhere isn't per-CPU data.
>>>
>>>>> In fact, with no guest running (yet) I'm having a hard time seeing why
>>>>> you shouldn't be able to simply write the register with just
>>>>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>>>>> whether "old" needs restoring; writing plain zero afterwards ought to
>>>>> suffice. You're in charcge of the register, after all.
>>>> It make sense (but I don't know if it is a possible case) to be sure that
>>>> HGATP.MODE remains the same, so there is no need to have TLB flush. If
>>>> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
>>>> above.
>>>>
>>>> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
>>>> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
>>>>
>>>> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
>>>> to check that in vmidlen_detect() and panic if it isn't zero) and if
>>>> vmidlen_detect() function will be called before any guest domain(s) will
>>>> be ran then I could agree that we don't need local_hfence_gvma_all() here.
>>>>
>>>> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
>>>> set to zero.
>>>>
>>>> Does it make sense?
>>> Well - I'd like the pre-conditions to be understood better. For example, can
>>> a hart really speculate into guest mode, when the hart is only in the
>>> process of being brought up?
>> I couldn't explicit words that a hart can't speculate into guest mode
>> either on bring up or during its work.
>>
>> But there are some moments in the spec which tells:
>>     Implementations with virtual memory are permitted to perform address
>>     translations speculatively and earlier than required by an explicit
>>     memory access, and are permitted to cache them in address translation
>>     cache structures—including possibly caching the identity mappings from
>>     effective address to physical address used in Bare translation modes and
>>     M-mode.
>> And here:
>>     Implementations may also execute the address-translation algorithm
>>     speculatively at any time, for any virtual address, as long as satp is
>>     active (as defined in Section 10.1.11). Such speculative executions have
>>     the effect of pre-populating the address-translation cache.
> That's satp though, not hgatp.
>
>> Where it is explicitly mentioned that speculation can happen in*any time*.
>> And at the same time:
>>     Speculative executions of the address-translation algorithm behave as
>>     non-speculative executions of the algorithm do, except that they must
>>     not set the dirty bit for a PTE, they must not trigger an exception,
>>     and they must not create address-translation cache entries if those
>>     entries would have been invalidated by any SFENCE.VMA instruction
>>     executed by the hart since the speculative execution of the algorithm began.
>> What I read as if TLB was empty before it will stay empty.
>>
>> Also, despite of the fact here it is mentioned that when V=0 two-stage address
>> translation is inactivated:
>>     The current virtualization mode, denoted V, indicates whether the hart is
>>     currently executing in a guest. When V=1, the hart is either in virtual
>>     S-mode (VS-mode), or in virtual U-mode (VU-mode) atop a guest OS running
>>     in VS-mode. When V=0, the hart is either in M-mode, in HS-mode, or in
>>     U-mode atop an OS running in HS-mode. The virtualization mode also
>>     indicates whether two-stage address translation is active (V=1) or
>>     inactive (V=0).
>> But on the same side, writing to hgatp register activates it:
>>     The hgatp register is considered active for the purposes of
>>     the address-translation algorithm unless the effective privilege mode
>>     is U and hstatus.HU=0.
>> And if so + considering that speculation could happen at any time, and
>> we are in HS-mode, not it U mode then I would say that it could really
>> speculate into guest mode.
> Hmm, that leaves some things to be desired. What I'm particularly puzzled
> by is that there's nothing said either way towards speculation through SRET.
> That's the important aspect here aiui, because without that the hart can't
> speculate into guest mode.
>
> But yes, in the absence of any clear indication to the contrary, I think
> you want to keep the local_hfence_gvma_all() (with a suitable comment).

Interesting that for VS-stage translation is explicitly mention that it is
possible to stop speculation:
   No mechanism is provided to atomically change vsatp and hgatp together.
   Hence, to prevent speculative execution causing one guest’s VS-stage
   translations to be cached under another guest’s VMID, world-switch code
   should zero vsatp, then swap hgatp, then finally write the new vsatp value.
   Similarly, if henvcfg.PBMTE need be world-switched, it should be switched
   after zeroing vsatp but before writing the new vsatp value, obviating
   the need to execute an HFENCE.VVMA instruction.
So if VSATP is 0 then there is no speculation as there is no need to execute
HFENCE.VVMA.

>
>>>>>> +    data->max_vmid = BIT(vmid_len, U) - 1;
>>>>>> +    data->disabled = !opt_vmid_enabled || (vmid_len <= 1);
>>>>> Actually, what exactly does it mean that "VMIDs are disabled"? There's
>>>>> no enable bit that I could find anywhere. Isn't it rather that in this
>>>>> case you need to arrange to flush always on VM entry (or always after a
>>>>> P2M change, depending how the TLB is split between guest and host use)?
>>>> "VMIDs are disabled" here means that TLB flush will happen each time p2m
>>>> is changed.
>>> That's better described as "VMIDs aren't used" then?
>> It sounds a little bit just like an opposite to "disabled" (i.e. means
>> basically the same), but I am okay to use "used" instead.
> If you want to stick to using "disabled", then how about "VMID use is
> disabled"? (You probably meanwhile understood that what I'm after is it
> becoming clear that this is a software decision, not something you can
> enforce in hardware.)

"VMID use is disabled" really sounds more clear. Thanks.

~ Oleksii

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Demi Marie Obenour 2 months, 3 weeks ago

On 8/6/25 12:24, Oleksii Kurochko wrote:
> 
> On 8/6/25 2:05 PM, Jan Beulich wrote:
>> On 06.08.2025 13:33, Oleksii Kurochko wrote:
>>> On 8/4/25 5:19 PM, Jan Beulich wrote:
>>>> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>>>>> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>>>>>    
>>>>>        console_init_postirq();
>>>>>    
>>>>> +    vmid_init();
>>>> This lives here only temporarily, I assume? Every hart will need to execute
>>>> it, and hence (like we have it on x86) this may want to be a central place
>>>> elsewhere.
>>> I haven’t checked how it is done on x86; I probably should.
>>>
>>> I planned to call it for each hart separately during secondary hart bring-up,
>>> since accessing the|hgatp| register of a hart is required to detect|VMIDLEN|.
>>> Therefore,|vmid_init()| should be called for secondary harts when their
>>> initialization code starts executing.
>> But is this going to be the only per-hart thing that will need doing? Otherwise
>> the same larger "container" function may want calling instead.
> 
> Yes, it is going to be the only per-hart operation.
> 
> There is|__cpu_up()| (not yet upstreamed [1]), which calls
> |sbi_hsm_hart_start(hartid, boot_addr, hsm_data)| to start a hart, and I planned
> to place|vmid_init()| somewhere in the code executed at|boot_addr|.
> 
> [1]https://gitlab.com/xen-project/people/olkur/xen/-/blob/latest/xen/arch/riscv/smpboot.c#L40
> 
>>>>> +{
>>>>> +    unsigned long vmid_bits;
>>>> Why "long" (also for the function return type)?
>>> Because csr_read() returns unsigned long as HGATP register has
>>> 'unsigned long' length.
>> Oh, right, I should have commented on the function return type only.
>> Yet then I also can't resist stating that this kind of use of a variable,
>> which initially is assigned a value that doesn't really fit its name, is
>> easily misleading towards giving such comments.
>>
>>> But it could be done in this way:
>>>       csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>       vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
>>>       vmid_bits = ffs_g(vmid_bits);
>>>       csr_write(CSR_HGATP, old);
>>> And then use uint16_t for vmid_bits and use uin16_t as a return type.
>> Please check ./CODING_STYLE again as to the use of fixed-width types.
> 
> I meant unsigned short, uint16_t was just short to write. I'll try to be
> more specific.
> 
>>
>>>>> +    unsigned long old;
>>>>> +
>>>>> +    /* Figure-out number of VMID bits in HW */
>>>>> +    old = csr_read(CSR_HGATP);
>>>>> +
>>>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>>> +    vmid_bits = csr_read(CSR_HGATP);
>>>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>>>> Nit: Stray blank.
>>>>
>>>>> +    vmid_bits = flsl(vmid_bits);
>>>>> +    csr_write(CSR_HGATP, old);
>>>>> +
>>>>> +    /*
>>>>> +     * We polluted local TLB so flush all guest TLB as
>>>>> +     * a speculative access can happen at any time.
>>>>> +     */
>>>>> +    local_hfence_gvma_all();
>>>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>>>> understanding now new TLB entries could even purely theoretically appear.
>>> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
>>> changed:
>>>    If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>>>    (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>>>    guest translations with the MODE change—even if the old MODE or new MODE
>>>    is Bare.
>>> On other hand it is guaranteed that, at least, on Reset (and so I assume
>>> for power on) that:
>>>    If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>>>    fields are reset to 0.
>>>
>>> So it seems like if no guest is ran then there is no need even to write
>>> hgatp.MODE as zero, but it might be sense to do that explicitly just to
>>> be sure.
>>>
>>> I thought it was possible to have a running guest and perform a CPU hotplug.
>> But that guest will run on another hart.
>>
>>> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
>>> called and return the|vmid_bits| value, which is used as the active VMID.
>>> At that moment, the local TLB could be speculatively polluted, I think.
>>> Likely, it makes sense to call vmidlen_detect() only once for each hart
>>> during initial bringup.
>> That may bring you more problems than it solves. You'd need to stash away
>> the value originally read somewhere. And that somewhere isn't per-CPU data.
>>
>>>> In fact, with no guest running (yet) I'm having a hard time seeing why
>>>> you shouldn't be able to simply write the register with just
>>>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>>>> whether "old" needs restoring; writing plain zero afterwards ought to
>>>> suffice. You're in charcge of the register, after all.
>>> It make sense (but I don't know if it is a possible case) to be sure that
>>> HGATP.MODE remains the same, so there is no need to have TLB flush. If
>>> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
>>> above.
>>>
>>> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
>>> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
>>>
>>> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
>>> to check that in vmidlen_detect() and panic if it isn't zero) and if
>>> vmidlen_detect() function will be called before any guest domain(s) will
>>> be ran then I could agree that we don't need local_hfence_gvma_all() here.
>>>
>>> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
>>> set to zero.
>>>
>>> Does it make sense?
>> Well - I'd like the pre-conditions to be understood better. For example, can
>> a hart really speculate into guest mode, when the hart is only in the
>> process of being brought up?
> 
> I couldn't explicit words that a hart can't speculate into guest mode
> either on bring up or during its work.
> 
> But there are some moments in the spec which tells:
>    Implementations with virtual memory are permitted to perform address
>    translations speculatively and earlier than required by an explicit
>    memory access, and are permitted to cache them in address translation
>    cache structures—including possibly caching the identity mappings from
>    effective address to physical address used in Bare translation modes and
>    M-mode.
> And here:
>    Implementations may also execute the address-translation algorithm
>    speculatively at any time, for any virtual address, as long as satp is
>    active (as defined in Section 10.1.11). Such speculative executions have
>    the effect of pre-populating the address-translation cache.
> Where it is explicitly mentioned that speculation can happen in *any time*.
> And at the same time:
>    Speculative executions of the address-translation algorithm behave as
>    non-speculative executions of the algorithm do, except that they must
>    not set the dirty bit for a PTE, they must not trigger an exception,
>    and they must not create address-translation cache entries if those
>    entries would have been invalidated by any SFENCE.VMA instruction
>    executed by the hart since the speculative execution of the algorithm began.
> What I read as if TLB was empty before it will stay empty.

I read that as "flushing the TLB invalidates entries created by speculative
execution before the TLB flush".  That is the bare minimum needed for TLB
flushing to work.  You have to do the TLB flush *after* changing the PTEs,
not before.

This is true on at least x86 but I expect it to hold in general.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)

Re: [PATCH v3 03/20] xen/riscv: introduce VMID allocation and manegement

Posted by Oleksii Kurochko 2 months, 3 weeks ago

On 8/6/25 6:50 PM, Demi Marie Obenour wrote:
> On 8/6/25 12:24, Oleksii Kurochko wrote:
>> On 8/6/25 2:05 PM, Jan Beulich wrote:
>>> On 06.08.2025 13:33, Oleksii Kurochko wrote:
>>>> On 8/4/25 5:19 PM, Jan Beulich wrote:
>>>>> On 31.07.2025 17:58, Oleksii Kurochko wrote:
>>>>>> @@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>>>>>>     
>>>>>>         console_init_postirq();
>>>>>>     
>>>>>> +    vmid_init();
>>>>> This lives here only temporarily, I assume? Every hart will need to execute
>>>>> it, and hence (like we have it on x86) this may want to be a central place
>>>>> elsewhere.
>>>> I haven’t checked how it is done on x86; I probably should.
>>>>
>>>> I planned to call it for each hart separately during secondary hart bring-up,
>>>> since accessing the|hgatp| register of a hart is required to detect|VMIDLEN|.
>>>> Therefore,|vmid_init()| should be called for secondary harts when their
>>>> initialization code starts executing.
>>> But is this going to be the only per-hart thing that will need doing? Otherwise
>>> the same larger "container" function may want calling instead.
>> Yes, it is going to be the only per-hart operation.
>>
>> There is|__cpu_up()| (not yet upstreamed [1]), which calls
>> |sbi_hsm_hart_start(hartid, boot_addr, hsm_data)| to start a hart, and I planned
>> to place|vmid_init()| somewhere in the code executed at|boot_addr|.
>>
>> [1]https://gitlab.com/xen-project/people/olkur/xen/-/blob/latest/xen/arch/riscv/smpboot.c#L40
>>
>>>>>> +{
>>>>>> +    unsigned long vmid_bits;
>>>>> Why "long" (also for the function return type)?
>>>> Because csr_read() returns unsigned long as HGATP register has
>>>> 'unsigned long' length.
>>> Oh, right, I should have commented on the function return type only.
>>> Yet then I also can't resist stating that this kind of use of a variable,
>>> which initially is assigned a value that doesn't really fit its name, is
>>> easily misleading towards giving such comments.
>>>
>>>> But it could be done in this way:
>>>>        csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>>        vmid_bits =  MASK_EXTR(csr_read(CSR_HGATP), HGATP_VMID_MASK);
>>>>        vmid_bits = ffs_g(vmid_bits);
>>>>        csr_write(CSR_HGATP, old);
>>>> And then use uint16_t for vmid_bits and use uin16_t as a return type.
>>> Please check ./CODING_STYLE again as to the use of fixed-width types.
>> I meant unsigned short, uint16_t was just short to write. I'll try to be
>> more specific.
>>
>>>>>> +    unsigned long old;
>>>>>> +
>>>>>> +    /* Figure-out number of VMID bits in HW */
>>>>>> +    old = csr_read(CSR_HGATP);
>>>>>> +
>>>>>> +    csr_write(CSR_HGATP, old | HGATP_VMID_MASK);
>>>>>> +    vmid_bits = csr_read(CSR_HGATP);
>>>>>> +    vmid_bits =  MASK_EXTR(vmid_bits, HGATP_VMID_MASK);
>>>>> Nit: Stray blank.
>>>>>
>>>>>> +    vmid_bits = flsl(vmid_bits);
>>>>>> +    csr_write(CSR_HGATP, old);
>>>>>> +
>>>>>> +    /*
>>>>>> +     * We polluted local TLB so flush all guest TLB as
>>>>>> +     * a speculative access can happen at any time.
>>>>>> +     */
>>>>>> +    local_hfence_gvma_all();
>>>>> There's no guest running. If you wrote hgat.MODE as zero, as per my
>>>>> understanding now new TLB entries could even purely theoretically appear.
>>>> It could be an issue (or, at least, it is recommended) when hgatp.MODE is
>>>> changed:
>>>>     If hgatp.MODE is changed for a given VMID, an HFENCE.GVMA with rs1=x0
>>>>     (and rs2 set to either x0 or the VMID) must be executed to order subsequent
>>>>     guest translations with the MODE change—even if the old MODE or new MODE
>>>>     is Bare.
>>>> On other hand it is guaranteed that, at least, on Reset (and so I assume
>>>> for power on) that:
>>>>     If the hypervisor extension is implemented, the hgatp.MODE and vsatp.MODE
>>>>     fields are reset to 0.
>>>>
>>>> So it seems like if no guest is ran then there is no need even to write
>>>> hgatp.MODE as zero, but it might be sense to do that explicitly just to
>>>> be sure.
>>>>
>>>> I thought it was possible to have a running guest and perform a CPU hotplug.
>>> But that guest will run on another hart.
>>>
>>>> In that case, I expect that during the hotplug,|vmidlen_detect()| will be
>>>> called and return the|vmid_bits| value, which is used as the active VMID.
>>>> At that moment, the local TLB could be speculatively polluted, I think.
>>>> Likely, it makes sense to call vmidlen_detect() only once for each hart
>>>> during initial bringup.
>>> That may bring you more problems than it solves. You'd need to stash away
>>> the value originally read somewhere. And that somewhere isn't per-CPU data.
>>>
>>>>> In fact, with no guest running (yet) I'm having a hard time seeing why
>>>>> you shouldn't be able to simply write the register with just
>>>>> HGATP_VMID_MASK, i.e. without OR-ing in "old". It's even questionable
>>>>> whether "old" needs restoring; writing plain zero afterwards ought to
>>>>> suffice. You're in charcge of the register, after all.
>>>> It make sense (but I don't know if it is a possible case) to be sure that
>>>> HGATP.MODE remains the same, so there is no need to have TLB flush. If
>>>> HGATP.MODE is changed then it will be needed to do TLB flush as I mentioned
>>>> above.
>>>>
>>>> If we agreed to keep local_hfence_gvma_all() then I think it isn't really
>>>> any sense to restore 'old' or OR-ing it with HGATP_VMID_MASK.
>>>>
>>>> Generally if 'old' is guaranteed to be zero (and, probably, it makes sense
>>>> to check that in vmidlen_detect() and panic if it isn't zero) and if
>>>> vmidlen_detect() function will be called before any guest domain(s) will
>>>> be ran then I could agree that we don't need local_hfence_gvma_all() here.
>>>>
>>>> As an option we can do local_hfence_gvma_all() only if 'old' value wasn't
>>>> set to zero.
>>>>
>>>> Does it make sense?
>>> Well - I'd like the pre-conditions to be understood better. For example, can
>>> a hart really speculate into guest mode, when the hart is only in the
>>> process of being brought up?
>> I couldn't explicit words that a hart can't speculate into guest mode
>> either on bring up or during its work.
>>
>> But there are some moments in the spec which tells:
>>     Implementations with virtual memory are permitted to perform address
>>     translations speculatively and earlier than required by an explicit
>>     memory access, and are permitted to cache them in address translation
>>     cache structures—including possibly caching the identity mappings from
>>     effective address to physical address used in Bare translation modes and
>>     M-mode.
>> And here:
>>     Implementations may also execute the address-translation algorithm
>>     speculatively at any time, for any virtual address, as long as satp is
>>     active (as defined in Section 10.1.11). Such speculative executions have
>>     the effect of pre-populating the address-translation cache.
>> Where it is explicitly mentioned that speculation can happen in *any time*.
>> And at the same time:
>>     Speculative executions of the address-translation algorithm behave as
>>     non-speculative executions of the algorithm do, except that they must
>>     not set the dirty bit for a PTE, they must not trigger an exception,
>>     and they must not create address-translation cache entries if those
>>     entries would have been invalidated by any SFENCE.VMA instruction
>>     executed by the hart since the speculative execution of the algorithm began.
>> What I read as if TLB was empty before it will stay empty.
> I read that as "flushing the TLB invalidates entries created by speculative
> execution before the TLB flush".

But this part:
   they must not create address-translation cache entries if those entries
   would have been invalidated by any SFENCE.VMA instruction

Doesn't it mean that entries which was invalidated by SFENCE.VMA can't be
inserted into the TLB during speculative execution?

So, if the speculative page walk started before|SFENCE.VMA|,|SFENCE.VMA|
indicates: “All previous TLB entries might be invalid". Therefore, any
speculative TLB entry/that started before/ must*not* be inserted into the
TLB afterward.

So, hardware tracks if a|SFENCE.VMA| occurred/after/ speculation started.
If so, any speculative address translations must be*discarded* or
*not committed*.

>    That is the bare minimum needed for TLB
> flushing to work.  You have to do the TLB flush *after* changing the PTEs,
> not before.
>
> This is true on at least x86 but I expect it to hold in general.

Agree with that.

~ Oleksii