Instructions from kernel space are considered trusted. If the MMIO
instruction is from userspace it must be checked.
For userspace instructions, it is necessary to check that the instruction has not
changed between the time of the #VE and the re-execution of the instruction.
Once the userspace instruction is parsed, it is enforced that the address
points to mapped memory of the current process and that the address does
not point to private memory.
After parsing the userspace instruction, it is necessary to ensure that:
1. the operation direction (read/write) corresponds to #VE info;
2. the address still points to mapped memory of current process;
3. the address does not point to private memory.
Signed-off-by: Alexey Gladkov (Intel) <legion@kernel.org>
---
arch/x86/coco/tdx/tdx.c | 118 +++++++++++++++++++++++++++++++++++-----
1 file changed, 105 insertions(+), 13 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 41b047a08071..8c894ee9c245 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
+#include <linux/mm.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -405,6 +406,74 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
EPT_WRITE, addr, val);
}
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
+static int get_phys_addr(unsigned long addr, phys_addr_t *phys_addr, bool *writable)
+{
+ unsigned int level;
+ pgd_t *pgdp;
+ pte_t *ptep;
+
+ /*
+ * Address validation only makes sense for a user process. The lock must
+ * be obtained before validation can begin.
+ */
+ mmap_assert_locked(current->mm);
+
+ pgdp = pgd_offset(current->mm, addr);
+
+ if (!pgd_none(*pgdp)) {
+ ptep = lookup_address_in_pgd(pgdp, addr, &level);
+ if (ptep) {
+ unsigned long offset;
+
+ offset = addr & ~page_level_mask(level);
+ *phys_addr = PFN_PHYS(pte_pfn(*ptep));
+ *phys_addr |= offset;
+
+ *writable = pte_write(*ptep);
+
+ return 0;
+ }
+ }
+
+ return -EFAULT;
+}
+
+static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
+ unsigned long vaddr)
+{
+ phys_addr_t phys_addr;
+ bool writable = false;
+
+ /* It's not fatal. This can happen due to swap out or page migration. */
+ if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
+ return -EAGAIN;
+
+ /* Check whether #VE info matches the instruction that was decoded. */
+ switch (mmio) {
+ case INSN_MMIO_WRITE:
+ case INSN_MMIO_WRITE_IMM:
+ if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
+ return -EFAULT;
+ break;
+ case INSN_MMIO_READ:
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
+ return -EFAULT;
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int handle_mmio_write(struct insn *insn, enum insn_mmio_type mmio, int size,
struct pt_regs *regs, struct ve_info *ve)
{
@@ -488,7 +557,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
char buffer[MAX_INSN_SIZE];
enum insn_mmio_type mmio;
struct insn insn = {};
- int size;
+ int size, ret;
/* Only in-kernel MMIO is supported */
if (WARN_ON_ONCE(user_mode(regs)))
@@ -504,6 +573,17 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
return -EINVAL;
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ if (user_mode(regs)) {
+ if (mmap_read_lock_killable(current->mm))
+ return -EINTR;
+
+ ret = valid_vaddr(ve, mmio, size, vaddr);
+ if (ret)
+ goto fault;
+ }
+
/*
* Reject EPT violation #VEs that split pages.
*
@@ -513,30 +593,39 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
*
* load_unaligned_zeropad() will recover using exception fixups.
*/
- vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
- if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
- return -EFAULT;
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) {
+ ret = -EFAULT;
+ goto fault;
+ }
switch (mmio) {
case INSN_MMIO_WRITE:
case INSN_MMIO_WRITE_IMM:
case INSN_MMIO_MOVS:
- return handle_mmio_write(&insn, mmio, size, regs, ve);
+ ret = handle_mmio_write(&insn, mmio, size, regs, ve);
+ break;
case INSN_MMIO_READ:
case INSN_MMIO_READ_ZERO_EXTEND:
case INSN_MMIO_READ_SIGN_EXTEND:
- return handle_mmio_read(&insn, mmio, size, regs, ve);
+ ret = handle_mmio_read(&insn, mmio, size, regs, ve);
+ break;
case INSN_MMIO_DECODE_FAILED:
/*
* MMIO was accessed with an instruction that could not be
* decoded or handled properly. It was likely not using io.h
* helpers or accessed MMIO accidentally.
*/
- return -EINVAL;
+ ret = -EINVAL;
+ break;
default:
WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
- return -EINVAL;
+ ret = -EINVAL;
}
+fault:
+ if (user_mode(regs))
+ mmap_read_unlock(current->mm);
+
+ return ret;
}
static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -680,11 +769,6 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
}
}
-static inline bool is_private_gpa(u64 gpa)
-{
- return gpa == cc_mkenc(gpa);
-}
-
/*
* Handle the kernel #VE.
*
@@ -722,6 +806,14 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
insn_len = virt_exception_user(regs, ve);
else
insn_len = virt_exception_kernel(regs, ve);
+
+ /*
+ * A special case to return to userspace without increasing regs->ip
+ * to repeat the instruction once again.
+ */
+ if (insn_len == -EAGAIN)
+ return true;
+
if (insn_len < 0)
return false;
--
2.45.2
On Tue, Jul 30, 2024 at 07:35:57PM +0200, Alexey Gladkov (Intel) wrote:
> +static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
> + unsigned long vaddr)
> +{
> + phys_addr_t phys_addr;
> + bool writable = false;
> +
> + /* It's not fatal. This can happen due to swap out or page migration. */
> + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
> + return -EAGAIN;
I think we need big fat comment here why these checks are needed.
We have ve->gpa and it was valid at the time we got ve_info. But after we
get ve_info, we enable interrupts allowing tlb shootdown and therefore
munmap() in parallel thread of the process.
So by the time we've got here ve->gpa might be unmapped from the process,
the device it belongs to removed from system and something else could be
plugged in its place.
That's why we need to re-check if the GPA is still mapped and writable if
we are going to write to it.
> +
> + /* Check whether #VE info matches the instruction that was decoded. */
> + switch (mmio) {
> + case INSN_MMIO_WRITE:
> + case INSN_MMIO_WRITE_IMM:
> + if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
> + return -EFAULT;
> + break;
> + case INSN_MMIO_READ:
> + case INSN_MMIO_READ_ZERO_EXTEND:
> + case INSN_MMIO_READ_SIGN_EXTEND:
> + if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
> + return -EFAULT;
> + break;
> + default:
> + WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
--
Kiryl Shutsemau / Kirill A. Shutemov
On Fri, Aug 02, 2024 at 10:41:17AM +0300, Kirill A. Shutemov wrote:
> On Tue, Jul 30, 2024 at 07:35:57PM +0200, Alexey Gladkov (Intel) wrote:
> > +static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
> > + unsigned long vaddr)
> > +{
> > + phys_addr_t phys_addr;
> > + bool writable = false;
> > +
> > + /* It's not fatal. This can happen due to swap out or page migration. */
> > + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
> > + return -EAGAIN;
>
> I think we need big fat comment here why these checks are needed.
>
> We have ve->gpa and it was valid at the time we got ve_info. But after we
> get ve_info, we enable interrupts allowing tlb shootdown and therefore
> munmap() in parallel thread of the process.
>
> So by the time we've got here ve->gpa might be unmapped from the process,
> the device it belongs to removed from system and something else could be
> plugged in its place.
>
> That's why we need to re-check if the GPA is still mapped and writable if
> we are going to write to it.
Makes sense. I will add a bigger comment here.
> > +
> > + /* Check whether #VE info matches the instruction that was decoded. */
> > + switch (mmio) {
> > + case INSN_MMIO_WRITE:
> > + case INSN_MMIO_WRITE_IMM:
> > + if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
> > + return -EFAULT;
> > + break;
> > + case INSN_MMIO_READ:
> > + case INSN_MMIO_READ_ZERO_EXTEND:
> > + case INSN_MMIO_READ_SIGN_EXTEND:
> > + if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
> > + return -EFAULT;
> > + break;
> > + default:
> > + WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
> > + return -EINVAL;
> > + }
> > +
> > + return 0;
> > +}
> > +
>
> --
> Kiryl Shutsemau / Kirill A. Shutemov
>
--
Rgrds, legion
On Tue, Jul 30 2024 at 19:35, Alexey Gladkov wrote:
>
> + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> +
> + if (user_mode(regs)) {
> + if (mmap_read_lock_killable(current->mm))
> + return -EINTR;
> +
> + ret = valid_vaddr(ve, mmio, size, vaddr);
> + if (ret)
> + goto fault;
fault is really the wrong name for the label because it's the general
return point of the function. 'out' or 'unlock' perhaps?
Thanks,
tglx
© 2016 - 2025 Red Hat, Inc.