From: Ankit Agrawal <ankita@nvidia.com>
Fix a security bug caused by mismatched memory attributes between the
stage 1 (S1) and stage 2 (S2) mappings.
Currently, it is possible for a region to be cacheable in the userspace
VMA but mapped as non-cacheable at S2. This creates a potential issue: the
VMM may sanitize memory between VMs using cacheable stores to ensure it
is zeroed. However, if KVM subsequently maps this memory into a VM as
uncached, the zeroing stores may still reside only in the cache, so the
VM could end up accessing stale, non-zeroed data from a previous VM,
leading to unintended data exposure. This is a security risk.
Block such mismatched-attribute cases by returning -EINVAL when userspace
tries to map a PFNMAP region as cacheable. Only allow NORMAL_NC and DEVICE_*.
CC: Oliver Upton <oliver.upton@linux.dev>
CC: Sean Christopherson <seanjc@google.com>
CC: Catalin Marinas <catalin.marinas@arm.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
arch/arm64/kvm/mmu.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3d77a278fc4f..d6e0d5f46b45 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1470,6 +1470,22 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}
+/*
+ * Determine the memory region cacheability from VMA's pgprot.
+ *
+ * Extracts the memory attribute index (the PTE_ATTRINDX_MASK field) from
+ * vma->vm_page_prot. Returns false for MT_NORMAL_NC, MT_DEVICE_nGnRnE and
+ * MT_DEVICE_nGnRE; any other attribute index is reported as cacheable
+ * (returns true).
+ *
+ * The result is used when setting up the stage 2 PTEs, so that a region
+ * which is cacheable in the userspace mapping can be rejected instead of
+ * being mapped with non-cacheable attributes at stage 2.
+ */
+static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
+{
+ switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
+ case MT_NORMAL_NC:
+ case MT_DEVICE_nGnRnE:
+ case MT_DEVICE_nGnRE:
+ return false;
+ default:
+ return true;
+ }
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1477,7 +1493,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{
int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, mte_allowed;
+ bool exec_fault, mte_allowed, is_vma_cacheable = false;
bool disable_cmo = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
phys_addr_t ipa = fault_ipa;
@@ -1619,6 +1635,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+ is_vma_cacheable = kvm_vma_is_cacheable(vma);
+
/* Don't use the VMA after the unlock -- it may have vanished */
vma = NULL;
@@ -1643,6 +1661,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (!kvm_can_use_cmo_pfn(pfn)) {
+ if (is_vma_cacheable)
+ return -EINVAL;
+
/*
* If the page was identified as device early by looking at
* the VMA flags, vma_pagesize is already representing the
@@ -1726,6 +1747,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_X;
if (disable_cmo) {
+ if (is_vma_cacheable) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
if (vfio_allow_any_uc)
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
@@ -2221,6 +2247,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = -EINVAL;
break;
}
+
+ /* Cacheable PFNMAP is not allowed */
+ if (kvm_vma_is_cacheable(vma)) {
+ ret = -EINVAL;
+ break;
+ }
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
--
2.34.1
On Wed, Jun 18, 2025 at 06:55:38AM +0000, ankita@nvidia.com wrote:
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 3d77a278fc4f..d6e0d5f46b45 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1470,6 +1470,22 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> return vma->vm_flags & VM_MTE_ALLOWED;
> }
>
> +/*
> + * Determine the memory region cacheability from VMA's pgprot. This
> + * is used to set the stage 2 PTEs.
> + */
> +static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
> +{
> + switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
> + case MT_NORMAL_NC:
> + case MT_DEVICE_nGnRnE:
> + case MT_DEVICE_nGnRE:
> + return false;
> + default:
> + return true;
> + }
> +}
> +
> static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> struct kvm_s2_trans *nested,
> struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1477,7 +1493,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> {
> int ret = 0;
> bool write_fault, writable, force_pte = false;
> - bool exec_fault, mte_allowed;
> + bool exec_fault, mte_allowed, is_vma_cacheable = false;

Nit: do we need to initialise is_vma_cacheable here? It did not seem used
until the kvm_vma_is_cacheable() call. Anyway, it's harmless.

> bool disable_cmo = false, vfio_allow_any_uc = false;
> unsigned long mmu_seq;
> phys_addr_t ipa = fault_ipa;
> @@ -1619,6 +1635,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
> vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
>
> + is_vma_cacheable = kvm_vma_is_cacheable(vma);
> +
> /* Don't use the VMA after the unlock -- it may have vanished */
> vma = NULL;
>
> @@ -1643,6 +1661,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> return -EFAULT;
>
> if (!kvm_can_use_cmo_pfn(pfn)) {
> + if (is_vma_cacheable)
> + return -EINVAL;
> +
> /*
> * If the page was identified as device early by looking at
> * the VMA flags, vma_pagesize is already representing the

This block also sets 'disable_cmo' (originally 'device') to true.

> @@ -1726,6 +1747,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> prot |= KVM_PGTABLE_PROT_X;
>
> if (disable_cmo) {
> + if (is_vma_cacheable) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }

so, is there anything else changing 'disable_cmo' up to this point? If
not, I'd drop the second is_vma_cacheable check.

-- 
Catalin
© 2016 - 2025 Red Hat, Inc.