Currently, the kernel won't start a guest if the MTE feature is enabled
and the guest RAM is backed by memory which doesn't support access tags.
Update this such that the kernel uses the NoTagAccess memory attribute
while mapping pages from VMAs for which MTE is not allowed. The fault
from accessing the access tags with such pages is forwarded to the VMM
so that the VMM can decide to kill the guest or remap the pages so that
access tag storage is allowed.

NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
add a new EXIT type because this is an arm64-specific exit type.
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
arch/arm64/include/asm/kvm_emulate.h | 5 +++++
arch/arm64/include/asm/kvm_pgtable.h | 1 +
arch/arm64/kvm/hyp/pgtable.c | 16 +++++++++++++---
arch/arm64/kvm/mmu.c | 28 ++++++++++++++++++++++------
include/uapi/linux/kvm.h | 7 +++++++
5 files changed, 48 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index a601a9305b10..fa0149a0606a 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -373,6 +373,11 @@ static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
}
+static inline bool kvm_vcpu_trap_is_tagaccess(const struct kvm_vcpu *vcpu)
+{
+ return !!(ESR_ELx_ISS2(kvm_vcpu_get_esr(vcpu)) & ESR_ELx_TagAccess);
+}
+
static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
{
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 03f4c3d7839c..5657ac1998ad 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -252,6 +252,7 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_DEVICE = BIT(3),
KVM_PGTABLE_PROT_NORMAL_NC = BIT(4),
+ KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS = BIT(5),
KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b11bcebac908..bc0d9f08c49a 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -677,9 +677,11 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
{
kvm_pte_t attr;
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ unsigned long prot_mask = KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_NORMAL_NC |
+ KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS;
- switch (prot & (KVM_PGTABLE_PROT_DEVICE |
- KVM_PGTABLE_PROT_NORMAL_NC)) {
+ switch (prot & prot_mask) {
case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
return -EINVAL;
case KVM_PGTABLE_PROT_DEVICE:
@@ -692,6 +694,12 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
return -EINVAL;
attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
break;
+ case KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS:
+ if (system_supports_notagaccess())
+ attr = KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
+ else
+ return -EINVAL;
+ break;
default:
attr = KVM_S2_MEMATTR(pgt, NORMAL);
}
@@ -872,7 +880,9 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
- return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+ return kvm_pte_valid(pte) &&
+ ((memattr == KVM_S2_MEMATTR(pgt, NORMAL)) ||
+ (memattr == KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS)));
}
static bool stage2_pte_executable(kvm_pte_t pte)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b5824e93cee0..e56c6996332e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1647,12 +1647,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* not a permission fault implies a translation fault which
* means mapping the page for the first time
*/
- if (mte_allowed) {
+ if (mte_allowed)
sanitise_mte_tags(kvm, pfn, vma_pagesize);
- } else {
- ret = -EFAULT;
- goto out_unlock;
- }
+ else
+ prot |= KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS;
}
if (writable)
@@ -1721,6 +1719,15 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}
+static inline void kvm_prepare_notagaccess_exit(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t size)
+{
+ vcpu->run->exit_reason = KVM_EXIT_ARM_NOTAG_ACCESS;
+ vcpu->run->notag_access.flags = 0;
+ vcpu->run->notag_access.gpa = gpa;
+ vcpu->run->notag_access.size = size;
+}
+
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
@@ -1833,6 +1840,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
+
+ if (kvm_vcpu_trap_is_tagaccess(vcpu)) {
+ /* exit to host and handle the error */
+ kvm_prepare_notagaccess_exit(vcpu, gfn << PAGE_SHIFT, PAGE_SIZE);
+ ret = 0;
+ goto out;
+ }
+
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
@@ -2145,7 +2160,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (!vma)
break;
- if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
+ if (kvm_has_mte(kvm) && !system_supports_notagaccess() &&
+ !kvm_vma_mte_allowed(vma)) {
ret = -EINVAL;
break;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 637efc055145..a8268a164c4d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -178,6 +178,7 @@ struct kvm_xen_exit {
#define KVM_EXIT_NOTIFY 37
#define KVM_EXIT_LOONGARCH_IOCSR 38
#define KVM_EXIT_MEMORY_FAULT 39
+#define KVM_EXIT_ARM_NOTAG_ACCESS 40
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -446,6 +447,12 @@ struct kvm_run {
__u64 gpa;
__u64 size;
} memory_fault;
+ /* KVM_EXIT_ARM_NOTAG_ACCESS */
+ struct {
+ __u64 flags;
+ __u64 gpa;
+ __u64 size;
+ } notag_access;
/* Fix the size of the union. */
char padding[256];
};
--
2.43.0
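
The changelog above leaves the kill-or-remap decision to the VMM. As a
point of reference, a minimal, untested sketch of what the remap branch
could look like in a generic VMM vCPU loop, against the uapi this patch
adds: gpa_to_hva() is a hypothetical memslot lookup, PROT_MTE is the
existing arm64 mmap/mprotect flag for enabling allocation-tag storage on
a mapping, and a real VMM would additionally have to preserve the old
contents and keep the memslot covering the new mapping.

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20	/* from arch/arm64/include/uapi/asm/mman.h */
#endif

/* hypothetical: translate a guest physical address to a host VA */
extern void *gpa_to_hva(__u64 gpa);

static int vcpu_run_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		switch (run->exit_reason) {
		case KVM_EXIT_ARM_NOTAG_ACCESS: {	/* added by this patch */
			void *hva = gpa_to_hva(run->notag_access.gpa);

			/*
			 * Policy choice: replace the non-MTE-capable backing
			 * with anonymous memory that supports tags, or give
			 * up and let the caller kill the guest.
			 */
			if (mmap(hva, run->notag_access.size,
				 PROT_READ | PROT_WRITE | PROT_MTE,
				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
				 -1, 0) == MAP_FAILED)
				return -1;	/* kill the guest */
			break;			/* retry KVM_RUN */
		}
		default:
			return -1;
		}
	}
}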
On Mon, Oct 28, 2024 at 03:10:14PM +0530, Aneesh Kumar K.V (Arm) wrote:
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 637efc055145..a8268a164c4d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -178,6 +178,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_NOTIFY           37
>  #define KVM_EXIT_LOONGARCH_IOCSR  38
>  #define KVM_EXIT_MEMORY_FAULT     39
> +#define KVM_EXIT_ARM_NOTAG_ACCESS 40
>
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -446,6 +447,12 @@ struct kvm_run {
> 			__u64 gpa;
> 			__u64 size;
> 		} memory_fault;
> +		/* KVM_EXIT_ARM_NOTAG_ACCESS */
> +		struct {
> +			__u64 flags;
> +			__u64 gpa;
> +			__u64 size;
> +		} notag_access;

Can you please look into reusing the memory fault exit infrastructure?

The entire point of that is for KVM to tell the VMM it cannot make
forward progress because of ${SOMETHING} unexpected at the specified
GPA. You can add a new flag that describes tag access.

--
Thanks,
Oliver
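
On the kernel side, Oliver's suggestion could look roughly like the
sketch below: reuse the existing memory_fault exit and claim an
arm64-reserved flag bit. The flag name and bit value match the ones
Aneesh posts later in the thread, but neither is final ABI.

/* include/uapi/linux/kvm.h: a flag bit in the existing exit */
#define KVM_MEMORY_EXIT_FLAG_NOTAGACCESS	(1ULL << 4)

/* arch/arm64/kvm/mmu.c: no new exit_reason needed */
static void kvm_prepare_notagaccess_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
	vcpu->run->memory_fault.flags = KVM_MEMORY_EXIT_FLAG_NOTAGACCESS;
	vcpu->run->memory_fault.gpa = gpa;
	vcpu->run->memory_fault.size = PAGE_SIZE;
}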
Hi Oliver,

Thanks for reviewing the changes.

Oliver Upton <oliver.upton@linux.dev> writes:

> On Mon, Oct 28, 2024 at 03:10:14PM +0530, Aneesh Kumar K.V (Arm) wrote:
>
> NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
> add a new EXIT type because this is an arm64-specific exit type.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---

I have used KVM_EXIT_MEMORY_FAULT as part of the initial prototype.

>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 637efc055145..a8268a164c4d 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h

[...]

>> +	/* KVM_EXIT_ARM_NOTAG_ACCESS */
>> +	struct {
>> +		__u64 flags;
>> +		__u64 gpa;
>> +		__u64 size;
>> +	} notag_access;
>
> Can you please look into reusing the memory fault exit infrastructure?
>
> The entire point of that is for KVM to tell the VMM it cannot make
> forward progress because of ${SOMETHING} unexpected at the specified
> GPA. You can add a new flag that describes tag access.
>

The only reason I dropped the change was because the flag will be very much
arm64 specific.

Based on your feedback, I will switch to KVM_EXIT_MEMORY_FAULT in the next
update.

-aneesh
On Mon, Oct 28, 2024 at 08:22:02PM +0530, Aneesh Kumar K.V wrote:
>
> Hi Oliver,
>
> Thanks for reviewing the changes.
>
> Oliver Upton <oliver.upton@linux.dev> writes:
>
> > On Mon, Oct 28, 2024 at 03:10:14PM +0530, Aneesh Kumar K.V (Arm) wrote:
> >
> > NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
> > add a new EXIT type because this is an arm64-specific exit type.
> >
> > Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > ---
>
> I have used KVM_EXIT_MEMORY_FAULT as part of the initial prototype.

Ah, apologies, I clearly didn't read the changelog.

> >> +	/* KVM_EXIT_ARM_NOTAG_ACCESS */
> >> +	struct {
> >> +		__u64 flags;
> >> +		__u64 gpa;
> >> +		__u64 size;
> >> +	} notag_access;
> >
> > Can you please look into reusing the memory fault exit infrastructure?
> >
> > The entire point of that is for KVM to tell the VMM it cannot make
> > forward progress because of ${SOMETHING} unexpected at the specified
> > GPA. You can add a new flag that describes tag access.
>
> The only reason I dropped the change was because the flag will be very much
> arm64 specific.

Eh, making it arm64-specific is very much the right call. There's no
shortage of available bits in that structure, and we should make no
attempt to provide a generic description of what is otherwise a very
architecture-specific thing.

> Based on your feedback, I will switch to KVM_EXIT_MEMORY_FAULT in the next
> update.

Excellent, thanks!

--
Thanks,
Oliver
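
With the flag-based approach, the VMM side folds into the existing
memory-fault dispatch. A sketch of what kvm_cpu__handle_exit()'s switch
in kvmtool might then look like, reusing the handle_notag_access() and
handle_memoryfault() helpers Aneesh posts later in the thread:

	case KVM_EXIT_MEMORY_FAULT:
		if (vcpu->kvm_run->memory_fault.flags &
		    KVM_MEMORY_EXIT_FLAG_NOTAGACCESS)
			return handle_notag_access(vcpu);
		return handle_memoryfault(vcpu);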
On Mon, 28 Oct 2024 09:40:14 +0000,
"Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:
>
> Currently, the kernel won't start a guest if the MTE feature is enabled
> and the guest RAM is backed by memory which doesn't support access tags.
> Update this such that the kernel uses the NoTagAccess memory attribute
> while mapping pages from VMAs for which MTE is not allowed. The fault
> from accessing the access tags with such pages is forwarded to the VMM
> so that the VMM can decide to kill the guest or remap the pages so that
> access tag storage is allowed.

I only have questions here:

- what is the benefit of such approach? why shouldn't that be the
  kernel's job to fix it?

- where is the documentation for this new userspace ABI?

- are you expecting the VMM to create a new memslot for this?

- where is the example of a VMM using this?

> NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
> add a new EXIT type because this is an arm64-specific exit type.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---

[...]

> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 03f4c3d7839c..5657ac1998ad 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -252,6 +252,7 @@ enum kvm_pgtable_prot {
>
> 	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
> 	KVM_PGTABLE_PROT_NORMAL_NC		= BIT(4),
> +	KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS	= BIT(5),

This seems wrong. NOTAGACCESS is a *permission*, not a memory type.

[...]

> @@ -692,6 +694,12 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
> 			return -EINVAL;
> 		attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
> 		break;
> +	case KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS:
> +		if (system_supports_notagaccess())
> +			attr = KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
> +		else
> +			return -EINVAL;
> +		break;

How do you see this working when migrating a VM from one host to
another, one that supports FEAT_MTE_PERM and one that doesn't? The
current assumptions are that the VMM will replay the *exact same*
setup on the target host, and this obviously doesn't work.

[...]

> +static inline void kvm_prepare_notagaccess_exit(struct kvm_vcpu *vcpu,
> +						gpa_t gpa, gpa_t size)
> +{
> +	vcpu->run->exit_reason = KVM_EXIT_ARM_NOTAG_ACCESS;
> +	vcpu->run->notag_access.flags = 0;
> +	vcpu->run->notag_access.gpa = gpa;
> +	vcpu->run->notag_access.size = size;

Why does size matter here? It seems pretty pointless.

[...]

> +	/* KVM_EXIT_ARM_NOTAG_ACCESS */
> +	struct {
> +		__u64 flags;
> +		__u64 gpa;
> +		__u64 size;
> +	} notag_access;
> 	/* Fix the size of the union. */
> 	char padding[256];
> };

How do you plan to handle the same thing for NV?

Thanks,

	M.

--
Without deviation from the norm, progress is not possible.
Marc Zyngier <maz@kernel.org> writes:

> On Mon, 28 Oct 2024 09:40:14 +0000,
> "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:
>>
>> Currently, the kernel won't start a guest if the MTE feature is enabled
>> and the guest RAM is backed by memory which doesn't support access tags.

[...]

> I only have questions here:
>
> - what is the benefit of such approach? why shouldn't that be the
>   kernel's job to fix it?
>

IMHO leaving that policy decision to the VMM makes the kernel changes
simpler. In most cases, the VMM will kill the guest, because these
restrictions of MTE_ALLOWED are applied at the memslot/vma.

>
> - where is the documentation for this new userspace ABI?
>

I will add the details if we agree that this should be a separate EXIT
as outlined in this patch.

>
> - are you expecting the VMM to create a new memslot for this?
>

I guess there are examples of configs where some memory regions are
backed by page cache and we don't directly use those memory regions as
allocatable memory in the guest. This change allows us to enable MTE in
such configs.

> - where is the example of a VMM using this?
>

I do have changes to kvmtool which won't do any fixup on receiving that
VM exit. I expect other VMMs to do the same by default when they get a
VM exit with an unknown exit_reason. So unless the VMM wants to do any
special handling, we don't need any change in the VMM.

diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 3b95750ecec7..4760bad07476 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -239,6 +239,17 @@ static bool handle_memoryfault(struct kvm_cpu *vcpu)
 	return true;
 }

+static bool handle_notag_access(struct kvm_cpu *vcpu)
+{
+	u64 gpa = vcpu->kvm_run->memory_fault.gpa;
+	u64 size = vcpu->kvm_run->memory_fault.size;
+
+	/* For now VMM just panic */
+	pr_err("Tag Access to a wrong memory region 0x%lx size 0x%lx\n",
+	       (unsigned long)gpa, (unsigned long)size);
+	return false;
+}
+
 bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 {
 	switch (vcpu->kvm_run->exit_reason) {
@@ -246,6 +257,8 @@ bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 		return handle_hypercall(vcpu);
 	case KVM_EXIT_MEMORY_FAULT:
 		return handle_memoryfault(vcpu);
+	case KVM_EXIT_ARM_NOTAG_ACCESS:
+		return handle_notag_access(vcpu);
 	}

 	return false;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 32cff22f0e4d..deef6614f577 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -178,6 +178,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_NOTIFY           37
 #define KVM_EXIT_LOONGARCH_IOCSR  38
 #define KVM_EXIT_MEMORY_FAULT     39
+#define KVM_EXIT_ARM_NOTAG_ACCESS 40

 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -429,10 +430,17 @@ struct kvm_run {
 		/* KVM_EXIT_MEMORY_FAULT */
 		struct {
 #define KVM_MEMORY_EXIT_FLAG_PRIVATE	(1ULL << 3)
+#define KVM_MEMORY_EXIT_FLAG_NOTAGACCESS	(1ULL << 4)
 			__u64 flags;
 			__u64 gpa;
 			__u64 size;
 		} memory_fault;
+		/* KVM_EXIT_ARM_NOTAG_ACCESS */
+		struct {
+			__u64 flags;
+			__u64 gpa;
+			__u64 size;
+		} notag_access;
 		/* Fix the size of the union. */
 		char padding[256];
 	};

>>
>> NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
>> add a new EXIT type because this is an arm64-specific exit type.
>>
>> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
>> ---

[...]

>> 	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
>> 	KVM_PGTABLE_PROT_NORMAL_NC		= BIT(4),
>> +	KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS	= BIT(5),
>
> This seems wrong. NOTAGACCESS is a *permission*, not a memory type.
>

Are you suggesting the name is wrong? The memory attribute value I
wanted to use is

MemAttr[3:0] = 0b0100 which is Normal, NoTagAccess, writeback cacheable.

I am following the changes similar to KVM_PGTABLE_PROT_NORMAL_NC.

[...]

>> +	case KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS:
>> +		if (system_supports_notagaccess())
>> +			attr = KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
>> +		else
>> +			return -EINVAL;
>> +		break;
>
> How do you see this working when migrating a VM from one host to
> another, one that supports FEAT_MTE_PERM and one that doesn't? The
> current assumptions are that the VMM will replay the *exact same*
> setup on the target host, and this obviously doesn't work.
>

I missed looking at kvm migration. I guess I will have to expose this as
a capability and only allow migration if the target also supports the
same capability?

[...]

>> +static inline void kvm_prepare_notagaccess_exit(struct kvm_vcpu *vcpu,
>> +						gpa_t gpa, gpa_t size)
>> +{
>> +	vcpu->run->exit_reason = KVM_EXIT_ARM_NOTAG_ACCESS;
>> +	vcpu->run->notag_access.flags = 0;
>> +	vcpu->run->notag_access.gpa = gpa;
>> +	vcpu->run->notag_access.size = size;
>
> Why does size matter here? It seems pretty pointless.
>

I agree that since the exit is only generated on fault and size will
always be PAGE_SIZE, the size field is not required. I will remove that
in the next update.

[...]

> How do you plan to handle the same thing for NV?
>

I will have to admit that I have not looked at nested virtualization. I
will update that as part of the next update.

-aneesh
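
For the capability idea floated above, a hedged sketch of how it might
be wired up, assuming a new KVM_CAP number (illustrative, not
allocated) handled in arm64's existing kvm_vm_ioctl_check_extension():

/* include/uapi/linux/kvm.h -- number is illustrative only */
#define KVM_CAP_ARM_MTE_PERM 240

/* arch/arm64/kvm/arm.c, in kvm_vm_ioctl_check_extension() */
	case KVM_CAP_ARM_MTE_PERM:
		r = system_supports_notagaccess();
		break;

A migration orchestrator could then require KVM_CHECK_EXTENSION to
report the capability on the target host before allowing the move.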
On Mon, 28 Oct 2024 13:28:42 +0000,
Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
>
> Marc Zyngier <maz@kernel.org> writes:
>
> > I only have questions here:
> >
> > - what is the benefit of such approach? why shouldn't that be the
> >   kernel's job to fix it?
>
> IMHO leaving that policy decision to the VMM makes the kernel changes
> simpler. In most cases, the VMM will kill the guest, because these
> restrictions of MTE_ALLOWED are applied at the memslot/vma.

Where is that captured? The whole idea behind FEAT_MTE_PERM was that
it would be the hypervisor's task to lazily allocate MTE-capable
memory as tagged-access would occur.

> > - where is the documentation for this new userspace ABI?
>
> I will add the details if we agree that this should be a separate EXIT
> as outlined in this patch.

Woooot??? This isn't a *DETAIL*. This is the very first thing you
should write. Everything *else* is an implementation detail.

> > - are you expecting the VMM to create a new memslot for this?
>
> I guess there are examples of configs where some memory regions are
> backed by page cache and we don't directly use those memory regions as
> allocatable memory in the guest. This change allows us to enable MTE in
> such configs.

This doesn't answer my question. What is the expected sequence a VMM
should apply to provide tagged memory to the guest?

> > - where is the example of a VMM using this?
>
> I do have changes to kvmtool which won't do any fixup on receiving that
> VM exit. I expect other VMMs to do the same by default when they get a
> VM exit with an unknown exit_reason. So unless the VMM wants to do any
> special handling, we don't need any change in the VMM.

OK, so this has never been tested. I'm sorry, but for something of
this magnitude, with expected userspace interactions, and requiring
VMM handling, I want to see the full end-to-end thing.

[...]

> > This seems wrong. NOTAGACCESS is a *permission*, not a memory type.
>
> Are you suggesting the name is wrong? The memory attribute value I
> wanted to use is
>
> MemAttr[3:0] = 0b0100 which is Normal, NoTagAccess, writeback cacheable.
>
> I am following the changes similar to KVM_PGTABLE_PROT_NORMAL_NC.

But that's entirely different. This really isn't a memory type. Quite
the opposite, actually. This is a stage-2 permission setting, which
you are conflating with the actual memory type.

Also, I don't see why that should be incompatible with something other
than Normal memory.

[...]

> > How do you see this working when migrating a VM from one host to
> > another, one that supports FEAT_MTE_PERM and one that doesn't? The
> > current assumptions are that the VMM will replay the *exact same*
> > setup on the target host, and this obviously doesn't work.
>
> I missed looking at kvm migration. I guess I will have to expose this as
> a capability and only allow migration if the target also supports the
> same capability?

I don't think so. This doesn't affect the guest at all, as the guest
doesn't know anything about S2, unless you decide to expose
FEAT_MTE_PERM to NV.

It affects the VMM though, and that's because you are making a
difference in handling between having FEAT_MTE_PERM or not. Given that
this is invisible to userspace, that's not a great design.

Thanks,

	M.

--
Without deviation from the norm, progress is not possible.
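
One possible reading of Marc's objection, as an illustrative sketch
against the quoted stage2_set_prot_attr(): leave the memory-type switch
untouched and apply NoTagAccess afterwards, the way permission bits are
handled. The helper below is not from the patch; note that the
FEAT_MTE_PERM stage-2 encoding (MemAttr[3:0] == 0b0100) is defined
against Normal Write-Back memory, so other types are rejected here --
which is exactly the coupling being questioned.

static int stage2_apply_notagaccess(struct kvm_pgtable *pgt,
				    enum kvm_pgtable_prot prot,
				    kvm_pte_t *attr)
{
	if (!(prot & KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS))
		return 0;

	if (!system_supports_notagaccess())
		return -EINVAL;

	/* NoTagAccess is only encodable on top of Normal WB memory */
	if ((*attr & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR) !=
	    KVM_S2_MEMATTR(pgt, NORMAL))
		return -EINVAL;

	*attr &= ~KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	*attr |= KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
	return 0;
}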
Hi Marc,

Sorry for the delay in response, I was looking at nested virt and
migration changes so that I can get your feedback on those.

Marc Zyngier <maz@kernel.org> writes:

> On Mon, 28 Oct 2024 13:28:42 +0000,
> Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
>>
>> IMHO leaving that policy decision to the VMM makes the kernel changes
>> simpler. In most cases, the VMM will kill the guest, because these
>> restrictions of MTE_ALLOWED are applied at the memslot/vma.
>
> Where is that captured? The whole idea behind FEAT_MTE_PERM was that
> it would be the hypervisor's task to lazily allocate MTE-capable
> memory as tagged-access would occur.
>

Lazily allocating MTE-capable memory requires changes to different
kernel subsystems, and previous attempts got dropped [1] because it was
not clear whether the benefit of saving 3% memory overhead was worth
the complexity we add to the kernel. This patchset is not looking at
that feature. Instead, it can be used to enable MTE in configurations
that currently won't allow MTE. One such example is libkrun, which
includes the Linux kernel as firmware in a dynamically linked library
(libkrunfw). libkrun can insert the kernel region which got mmapped as
part of the library load directly into the guest memory map instead of
copying the kernel. Such a guest config can't enable MTE currently,
even though we will never use the newly inserted memory regions as
tag-access memory.

Similarly, virtiofs dax support can use a page cache region as a
virtio-shm region. We can use MTE_PERM to enable MTE in this config.

[1] https://lore.kernel.org/all/20240125164256.4147-1-alexandru.elisei@arm.com/

[...]

>> I am following the changes similar to KVM_PGTABLE_PROT_NORMAL_NC.
>
> But that's entirely different. This really isn't a memory type. Quite
> the opposite, actually. This is a stage-2 permission setting, which
> you are conflating with the actual memory type.
>
> Also, I don't see why that should be incompatible with something other
> than Normal memory.
>

I am sorry that I am not able to follow you here. I am confused whether
your feedback is related to the use of MemAttr[3:0] values or w.r.t.
the usage of KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS. If you can explain
more on how you would like to see the changes, I can incorporate them
in the next update.

[...]

> I don't think so. This doesn't affect the guest at all, as the guest
> doesn't know anything about S2, unless you decide to expose
> FEAT_MTE_PERM to NV.
>
> It affects the VMM though, and that's because you are making a
> difference in handling between having FEAT_MTE_PERM or not. Given that
> this is invisible to userspace, that's not a great design.
>

With migration, I guess I can prevent migration to a target that doesn't
support FEAT_MTE_PERM by updating set_one_reg to check for the feature
support. Something like this:

modified   arch/arm64/kvm/sys_regs.c
@@ -1514,7 +1514,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
 				       const struct sys_reg_desc *r)
 {
 	u32 id = reg_to_encoding(r);
-	u64 val;
+	u64 val, mask;

 	if (sysreg_visible_as_raz(vcpu, r))
 		return 0;
@@ -1529,8 +1529,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
 		val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
 		break;
 	case SYS_ID_AA64PFR2_EL1:
-		/* We only expose FPMR */
-		val &= ID_AA64PFR2_EL1_FPMR;
+		mask = ID_AA64PFR2_EL1_FPMR;
+		if (system_supports_notagaccess())
+			mask |= ID_AA64PFR2_EL1_MTEPERM;
+		val &= mask;
 		break;
 	case SYS_ID_AA64ISAR1_EL1:
 		if (!vcpu_has_ptrauth(vcpu))
@@ -2382,7 +2384,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 				   ID_AA64PFR0_EL1_AdvSIMD |
 				   ID_AA64PFR0_EL1_FP), },
 	ID_SANITISED(ID_AA64PFR1_EL1),
-	ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
+	ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR |
+				     ID_AA64PFR2_EL1_MTEPERM),
 	ID_UNALLOCATED(4,3),
 	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
 	ID_HIDDEN(ID_AA64SMFR0_EL1),

For nested virtualization, I guess it is useful to support this for a
nested guest. This implies the L2 stage-2 in L1 will have to support the
Normal,NoTagAccess attribute. i.e., in L0 KVM, if the guest abort is on
a nested stage-2 MMU, then we walk the L2 stage-2 in L1 and see if the
memory attribute is NoTagAccess. If yes, we inject an s2 fault to L1.
Do you agree this is a good design?

-aneesh
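
A rough, untested sketch of that flow in L0's abort path:
kvm_walk_nested_s2() and kvm_inject_s2_fault() are the existing NV
helpers, while s2_trans_is_notagaccess() is a hypothetical predicate
over the walk result, and the injected ESR would need proper encoding
rather than a straight copy.

	/* sketch, in kvm_handle_guest_abort(), before resolving the fault */
	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu) &&
	    kvm_vcpu_trap_is_tagaccess(vcpu)) {
		struct kvm_s2_trans nested_trans;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (!ret && s2_trans_is_notagaccess(&nested_trans))
			/* L1 mapped this NoTagAccess: reflect the fault to L1 */
			return kvm_inject_s2_fault(vcpu, kvm_vcpu_get_esr(vcpu));
		/* otherwise it is L0's mapping: exit to the VMM as before */
	}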
On Fri, 08 Nov 2024 07:59:31 +0000, Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote: > > > Hi Marc, > > Sorry for the delay in response, I was looking at nested virt and > migration changes so that I can get your feedback on those. > > Marc Zyngier <maz@kernel.org> writes: > > > On Mon, 28 Oct 2024 13:28:42 +0000, > > Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote: > >> > >> Marc Zyngier <maz@kernel.org> writes: > >> > >> > On Mon, 28 Oct 2024 09:40:14 +0000, > >> > "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote: > >> >> > >> >> Currently, the kernel won't start a guest if the MTE feature is enabled > >> >> and the guest RAM is backed by memory which doesn't support access tags. > >> >> Update this such that the kernel uses the NoTagAccess memory attribute > >> >> while mapping pages from VMAs for which MTE is not allowed. The fault > >> >> from accessing the access tags with such pages is forwarded to VMM so > >> >> that VMM can decide to kill the guest or remap the pages so that > >> >> access tag storage is allowed. > >> > > >> > I only have questions here: > >> > > >> > - what is the benefit of such approach? why shouldn't that be the > >> > kernel's job to fix it? > >> > >> > >> IMHO leaving that policy decision to VMM makes the kernel changes > >> simpler. In most cases, VMM will kill the guest, because these > >> restrictions of MTE_ALLOWED are applied at the memslot/vma. > > > > Where is that captured? The whole idea behind FEAT_MTE_PERM was that > > it would be the hypervisor's task to lazily allocate MTE-capable > > memory as tagged-access would occur. > > > > Lazily allocating MTE-capable memory requires changes to different > kernel subsystems and previous attempts got dropped [1] because it > was not clear whether the benefit of saving 3% memory overhead was worth > the complexity we add to the kernel. That's not the point. Tagged memory doesn't have to cover the whole of physical memory, and it can be statically allocated. The architecture doesn't mandate that all of the memory is MTE-capable. > This patchset is not looking at that feature. Instead, it can be used to > enable MTE in configurations that currently won't allow MTE. One such > example is libkrun which includes linux kernel as firmware in a > dynamically linked library (libkrunfw). libkrun can insert the kernel > region which got mmaped as part of the library load, directly into the > guest memory map instead of copying the kernel. Such a guest config > can't enable MTE currently even though we will never use the newly > inserted memory regions as tag access memory. > > Similarly, virtiofs dax support can use a page cache region as > virtio-shm region. We can use MTE_PERM to enable MTE in this config. And this use case doesn't contradict what I am stating above. But it definitely contradicts what you wrote: "In most cases, VMM will kill the guest". So which one is it? > > [1] https://lore.kernel.org/all/20240125164256.4147-1-alexandru.elisei@arm.com/ > > >> > >> > > >> > - where is the documentation for this new userspace ABI? > >> > > >> > >> I will add the details if we agree that this should be a separate EXIT > >> as outlined in this patch. > > > > Woooot??? This isn't a *DETAIL*. This is the very first thing you > > should write. Everything *else* is an implementation detail. > > > >> > >> > > >> > - are you expecting the VMM to create a new memslot for this? 
> >> > > >> > >> I guess there are examples of configs where some memory regions are > >> backed by page cache where we don't directly use those memory regions as > >> allocatable memory in the guest. This change allows us to enable MTE in such > >> configs. > > > > This doesn't answer my question. What is expected sequence a VMM > > should apply to provide tagged memory to the guest? > > > >> > >> > - where is the example of a VMM using this? > >> > > >> > >> I do have changes to kvmtool which won't do any fixup on receiving that > >> VM exit. I expect other VMM to do the same by default when they get a > >> VM exit with an unknown exit_reason. So unless VMM wants to do any > >> special handling, we don't need any change in the VMM. > > > > OK, so this has never been tested. I'm sorry, but for something of > > this magnitude, with expected userspace interactions, and requiring > > VMM handling, I want to see the full end-to-end thing. > > > >> > >> > >> diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c > >> index 3b95750ecec7..4760bad07476 100644 > >> --- a/arm/kvm-cpu.c > >> +++ b/arm/kvm-cpu.c > >> @@ -239,6 +239,17 @@ static bool handle_memoryfault(struct kvm_cpu *vcpu) > >> return true; > >> } > >> > >> +static bool handle_notag_access(struct kvm_cpu *vcpu) > >> +{ > >> + u64 gpa = vcpu->kvm_run->memory_fault.gpa; > >> + u64 size = vcpu->kvm_run->memory_fault.size; > >> + > >> + /* For now VMM just panic */ > >> + pr_err("Tag Access to a wrong memory region 0x%lx size 0x%lx\n", > >> + (unsigned long)gpa, (unsigned long)size); > >> + return false; > >> +} > >> + > >> bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) > >> { > >> switch (vcpu->kvm_run->exit_reason) { > >> @@ -246,6 +257,8 @@ bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) > >> return handle_hypercall(vcpu); > >> case KVM_EXIT_MEMORY_FAULT: > >> return handle_memoryfault(vcpu); > >> + case KVM_EXIT_ARM_NOTAG_ACCESS: > >> + return handle_notag_access(vcpu); > >> } > >> > >> return false; > >> diff --git a/include/linux/kvm.h b/include/linux/kvm.h > >> index 32cff22f0e4d..deef6614f577 100644 > >> --- a/include/linux/kvm.h > >> +++ b/include/linux/kvm.h > >> @@ -178,6 +178,7 @@ struct kvm_xen_exit { > >> #define KVM_EXIT_NOTIFY 37 > >> #define KVM_EXIT_LOONGARCH_IOCSR 38 > >> #define KVM_EXIT_MEMORY_FAULT 39 > >> +#define KVM_EXIT_ARM_NOTAG_ACCESS 40 > >> > >> /* For KVM_EXIT_INTERNAL_ERROR */ > >> /* Emulate instruction failed. */ > >> @@ -429,10 +430,17 @@ struct kvm_run { > >> /* KVM_EXIT_MEMORY_FAULT */ > >> struct { > >> #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) > >> +#define KVM_MEMORY_EXIT_FLAG_NOTAGACCESS (1ULL << 4) > >> __u64 flags; > >> __u64 gpa; > >> __u64 size; > >> } memory_fault; > >> + /* KVM_EXIT_ARM_NOTAG_ACCESS */ > >> + struct { > >> + __u64 flags; > >> + __u64 gpa; > >> + __u64 size; > >> + } notag_access; > >> /* Fix the size of the union. */ > >> char padding[256]; > >> }; > >> > >> >> > >> >> NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to > >> >> add a new EXIT type because this is arm64 specific exit type. 
> >> >> > >> >> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org> > >> >> --- > >> >> arch/arm64/include/asm/kvm_emulate.h | 5 +++++ > >> >> arch/arm64/include/asm/kvm_pgtable.h | 1 + > >> >> arch/arm64/kvm/hyp/pgtable.c | 16 +++++++++++++--- > >> >> arch/arm64/kvm/mmu.c | 28 ++++++++++++++++++++++------ > >> >> include/uapi/linux/kvm.h | 7 +++++++ > >> >> 5 files changed, 48 insertions(+), 9 deletions(-) > >> >> > >> >> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h > >> >> index a601a9305b10..fa0149a0606a 100644 > >> >> --- a/arch/arm64/include/asm/kvm_emulate.h > >> >> +++ b/arch/arm64/include/asm/kvm_emulate.h > >> >> @@ -373,6 +373,11 @@ static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu) > >> >> return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); > >> >> } > >> >> > >> >> +static inline bool kvm_vcpu_trap_is_tagaccess(const struct kvm_vcpu *vcpu) > >> >> +{ > >> >> + return !!(ESR_ELx_ISS2(kvm_vcpu_get_esr(vcpu)) & ESR_ELx_TagAccess); > >> >> +} > >> >> + > >> >> static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) > >> >> { > >> >> return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; > >> >> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h > >> >> index 03f4c3d7839c..5657ac1998ad 100644 > >> >> --- a/arch/arm64/include/asm/kvm_pgtable.h > >> >> +++ b/arch/arm64/include/asm/kvm_pgtable.h > >> >> @@ -252,6 +252,7 @@ enum kvm_pgtable_prot { > >> >> > >> >> KVM_PGTABLE_PROT_DEVICE = BIT(3), > >> >> KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), > >> >> + KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS = BIT(5), > >> > > >> > This seems wrong. NOTAGACCESS is a *permission*, not a memory type. > >> > > >> > >> Are you suggesting the name is wrong? The memory attribute value I > >> wanted to use is > >> > >> MemAttr[3:0] = 0b0100 which is Normal, NoTagAccess, writeback cacheable. > >> > >> I am following the changes similar to KVM_PGTABLE_PROT_NORMAL_NC. > > > > But that's entirely different. This really isn't a memory type. Quite > > the opposite, actually. This is a stage-2 permission setting, which > > you are conflating with the actual memory type. > > > > Also, I don't see why that should be incompatible with something other > > than Normal memory. > > > > I am sorry that i am not able to follow you here. I am confused whether > your feedback is related to the use of MemAttr[3:0] values or w.r.t > the usage of KVM_PGTABLE_PROT_NORMAL_NOTAG_ACCESS. If you can explain > more on how you would like to see the changes I can incorporate them in > the next update. There is a clear difference between permission and memory types. What if I want to have non-cacheable memory being tagged? 
>
> >>
> >> >
> >> >>
> >> >> [...]
> >> >> +	case KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS:
> >> >> +		if (system_supports_notagaccess())
> >> >> +			attr = KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
> >> >> +		else
> >> >> +			return -EINVAL;
> >> >> +		break;
> >> >
> >> > How do you see this working when migrating a VM from one host to
> >> > another, one that supports FEAT_MTE_PERM and one that doesn't? The
> >> > current assumptions are that the VMM will replay the *exact same*
> >> > setup on the target host, and this obviously doesn't work.
> >> >
> >>
> >> I missed looking at KVM migration. I guess I will have to expose this
> >> as a capability and only allow migration if the target also supports
> >> the same capability?
> >
> > I don't think so. This doesn't affect the guest at all, as the guest
> > doesn't know anything about S2, unless you decide to expose
> > FEAT_MTE_PERM to NV.
> >
> > It affects the VMM though, and that's because you are making a
> > difference in handling between having FEAT_MTE_PERM or not. Given that
> > this is invisible to userspace, that's not a great design.
> >
>
> With migration, I guess I can prevent migration to a target that
> doesn't support FEAT_MTE_PERM by updating set_one_reg to check for the
> feature support. Something like this:
>
> modified   arch/arm64/kvm/sys_regs.c
> @@ -1514,7 +1514,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
> 					  const struct sys_reg_desc *r)
>  {
>  	u32 id = reg_to_encoding(r);
> -	u64 val;
> +	u64 val, mask;
>
>  	if (sysreg_visible_as_raz(vcpu, r))
>  		return 0;
> @@ -1529,8 +1529,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
>  		val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
>  		break;
>  	case SYS_ID_AA64PFR2_EL1:
> -		/* We only expose FPMR */
> -		val &= ID_AA64PFR2_EL1_FPMR;
> +		mask = ID_AA64PFR2_EL1_FPMR;
> +		if (system_supports_notagaccess())
> +			mask |= ID_AA64PFR2_EL1_MTEPERM;

But this doesn't matter for an EL1 guest, only for an EL2 guest. For an
EL1 guest you need to validate the support at the API level.
> +		val &= mask;
>  		break;
>  	case SYS_ID_AA64ISAR1_EL1:
>  		if (!vcpu_has_ptrauth(vcpu))
> @@ -2382,7 +2384,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> 		   ID_AA64PFR0_EL1_AdvSIMD |
> 		   ID_AA64PFR0_EL1_FP), },
>  	ID_SANITISED(ID_AA64PFR1_EL1),
> -	ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
> +	ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR |
> +				     ID_AA64PFR2_EL1_MTEPERM),
>  	ID_UNALLOCATED(4,3),
>  	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
>  	ID_HIDDEN(ID_AA64SMFR0_EL1),
>
>
> For nested virtualization, I guess it is useful to support this for
> the nested guest. This implies the L2 stage-2 in L1 will have to
> support the Normal-NoTagAccess attribute. That is,
>
> in L0 KVM, if the guest abort is on a nested stage-2 MMU, then we walk
> the L2 stage-2 in L1 and see if the memory attribute is NoTagAccess.
> If yes, we inject an S2 fault into L1. Do you agree this is a good
> design?

Yes. But you need to add that support.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.
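[Editorial aside: the nested-virt design agreed above fits in a few lines. A minimal, self-contained sketch follows, with stand-in types and stubbed helpers; walk_l1_stage2() and inject_s2_fault_to_l1() are hypothetical names, not the real KVM nested-virt internals.]

```c
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MEMATTR_NORMAL_NOTAGACCESS 0x4	/* MemAttr[3:0] = 0b0100 */

struct s2_trans {
	uint64_t output;	/* L2 IPA translated to L1 IPA */
	uint8_t  memattr;	/* stage-2 MemAttr[3:0] chosen by L1 */
	bool	 valid;
};

/* Stub: a real implementation walks the L1-owned stage-2 tables. */
static bool walk_l1_stage2(uint64_t l2_ipa, struct s2_trans *out)
{
	out->output = l2_ipa;
	out->memattr = MEMATTR_NORMAL_NOTAGACCESS;
	out->valid = true;
	return true;
}

/* Stub: a real implementation makes a stage-2 abort pending for L1. */
static void inject_s2_fault_to_l1(uint64_t l2_ipa)
{
	printf("reflect tag-access fault at IPA 0x%" PRIx64 " to L1\n", l2_ipa);
}

/*
 * Called by L0 on a tag-access stage-2 fault taken while running L2.
 * Returns true if the fault was reflected to L1, i.e. L0 is done.
 */
static bool handle_nested_tagaccess_fault(uint64_t l2_ipa)
{
	struct s2_trans trans;

	if (!walk_l1_stage2(l2_ipa, &trans) || !trans.valid)
		return false;	/* normal nested fault path applies */

	if (trans.memattr == MEMATTR_NORMAL_NOTAGACCESS) {
		/* L1 mapped this range NoTagAccess: the fault is L1's. */
		inject_s2_fault_to_l1(l2_ipa);
		return true;
	}

	/* Otherwise L0 resolves it against its own stage-2 / the VMM. */
	return false;
}

int main(void)
{
	return handle_nested_tagaccess_fault(0x80000000ull) ? 0 : 1;
}
```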
(joining the thread as well, though not sure I'm bringing anything new)

On Tue, Nov 12, 2024 at 11:51:45AM +0000, Marc Zyngier wrote:
> On Fri, 08 Nov 2024 07:59:31 +0000, Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
> > Marc Zyngier <maz@kernel.org> writes:
> > > On Mon, 28 Oct 2024 13:28:42 +0000, Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
> > >> Marc Zyngier <maz@kernel.org> writes:
> > >> > On Mon, 28 Oct 2024 09:40:14 +0000, "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:
> > >> >> [...]
> > >> >
> > >> > I only have questions here:
> > >> >
> > >> > - what is the benefit of such approach? why shouldn't that be the
> > >> >   kernel's job to fix it?
> > >>
> > >> IMHO leaving that policy decision to the VMM makes the kernel changes
> > >> simpler. In most cases, the VMM will kill the guest, because these
> > >> restrictions of MTE_ALLOWED are applied at the memslot/VMA.
> > >
> > > Where is that captured? The whole idea behind FEAT_MTE_PERM was that
> > > it would be the hypervisor's task to lazily allocate MTE-capable
> > > memory as tagged-access would occur.
> >
> > Lazily allocating MTE-capable memory requires changes to different
> > kernel subsystems, and previous attempts got dropped [1] because it
> > was not clear whether the benefit of saving the 3% memory overhead
> > was worth the complexity we add to the kernel.

I'd say the most complex part in Alex's approach was the need to reuse
the tag storage for classic data and kick the pages around when some
other page needs to store tags in there. That approach is pretty much
dead.

In theory, having MTE and non-MTE memory (heterogeneous) without a
carveout reuse would be a bit more manageable - not that far from NUMA
migration, and at least you only migrate the page being accessed rather
than unrelated ones where the tags need to go. But this was not Alex's
goal for Android, since people were asking for the reuse of the 3%
carveout rather than a smaller carveout.

Other future deployments, CXL-attached memory etc. may benefit from a
new scheme, but I wouldn't rush into implementing anything in the
kernel for now. The VMM may be in a better position to manage such
heterogeneous memory for the guest if it knows the capabilities of the
slots (e.g. some DAX mmap() vs anonymous mmap()). This would require
the VMM replacing a page within a slot from one memory type to another
(while preserving the data). I don't think we have a concrete use-case
yet to be worth the hassle.

> That's not the point. Tagged memory doesn't have to cover the whole of
> physical memory, and it can be statically allocated. The architecture
> doesn't mandate that all of the memory is MTE-capable.

There's some vague wording that general purpose memory should be MTE
capable if FEAT_MTE2 or later is advertised. But that's not well
defined, and one can have other types of memory in the physical space
(e.g. CXL) that don't support tags. Last time I looked, we still hadn't
got a way to describe memory capabilities in firmware.
For the time being, I think a real use-case for FEAT_MTE_PERM is in the
context of cacheable MMIO (there are some patches around from Nvidia to
do this with VFIO). That memory, if exposed to the guest as WB and the
guest enables MTE, may trigger some SErrors. With FEAT_MTE_PERM, KVM
could trap and inject a fault back into the guest - maybe an SEA. Is it
easier to do this from KVM itself, or would we rather exit to the VMM
and let it handle it? The latter allows room for other, fancier things
in the future, but the former may be quicker, in the absence of other
strong use-cases.

> > This patchset is not looking at that feature. Instead, it can be
> > used to enable MTE in configurations that currently won't allow MTE.
> > One such example is libkrun, which includes a Linux kernel as
> > firmware in a dynamically linked library (libkrunfw). libkrun can
> > insert the kernel region, which got mmap'ed as part of the library
> > load, directly into the guest memory map instead of copying the
> > kernel. Such a guest config can't enable MTE currently, even though
> > we will never use the newly inserted memory regions as tag access
> > memory.

I've never played with libkrunfw. Does it handle inserting a Linux
kernel? Such an approach may not work well with MTE. The kernel frees
the init text/data sections back into the page allocator. With MTE
advertised as present, the guest will try to reuse that memory,
potentially as tagged. However, since the VMM mmap'ed the guest kernel
from a file, VM_MTE is not supported in the VMM address space. Even if
the mapping is MAP_PRIVATE and the page is copied on write, the vma
remains the original one associated with the file, so VM_MTE_ALLOWED is
not set.

We could revisit this and allow mprotect(PROT_MTE) to force the CoW on
private file mappings, but we'd need some strong requirement for this
(MTE+libkrunfw could be such a thing if people need this combination).

> > Similarly, virtiofs DAX support can use a page cache region as a
> > virtio-shm region. We can use MTE_PERM to enable MTE in this config.
>
> And this use case doesn't contradict what I am stating above. But it
> definitely contradicts what you wrote: "In most cases, VMM will kill
> the guest".

To simplify things, I think whatever is presented to the VM as standard
RAM (typically present at the VM boot) should either support MTE or MTE
will be disabled for the guest. For other types of memory, whether WB
MMIO or RAM presented as virtio-(pmem, shm etc.) backed by files in the
VMM, the VM should be aware it is not standard RAM and should not
attempt to enable MTE on it. If it does (either by mistake or malice),
FEAT_MTE_PERM should trap and inject a fault back into the guest (or
kill it altogether, but for debugging I'd rather inject a fault if
possible).

-- 
Catalin
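[Editorial aside: the libkrunfw/CoW limitation can be made concrete with a short userspace sketch. Assumptions: an arm64 host, PROT_MTE as defined by the arm64 mmap/mprotect ABI, and an illustrative fallback that libkrun does not actually implement. The in-place mprotect(PROT_MTE) on a private file-backed mapping is expected to fail on current kernels, which is exactly the limitation Catalin describes; the fallback copies the image into anonymous, MTE-capable memory.]

```c
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20	/* arm64-specific, from <asm/mman.h> */
#endif

/*
 * Make 'len' bytes of an already mmap'ed, file-backed image usable as
 * tagged guest memory. Returns an MTE-capable pointer, or NULL.
 */
static void *mte_capable_copy(void *file_map, size_t len)
{
	/* First attempt: tag the private file mapping in place. This
	 * fails today because VM_MTE_ALLOWED is not set on such vmas;
	 * it would only succeed with the kernel change discussed above. */
	if (mprotect(file_map, len, PROT_READ | PROT_WRITE | PROT_MTE) == 0)
		return file_map;

	/* Fallback: copy into anonymous memory, which does support MTE. */
	void *copy = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (copy == MAP_FAILED) {
		perror("mmap(PROT_MTE)");
		return NULL;
	}
	memcpy(copy, file_map, len);
	return copy;
}
```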