From: "Alexey Gladkov (Intel)" <legion@kernel.org>
Instructions from kernel space are considered trusted. If the MMIO
instruction is from userspace, it must be checked.

For userspace instructions, it is necessary to check that the INSN has
not changed between the time of the #VE and the execution of the
instruction.

Once the userspace instruction is parsed, it is enforced that the
address points to mapped memory of the current process and that the
address does not point to private memory.

After parsing the userspace instruction, it is necessary to ensure that:

1. the operation direction (read/write) corresponds to #VE info;
2. the address still points to mapped memory of current process;
3. the address does not point to private memory.
Signed-off-by: Alexey Gladkov (Intel) <legion@kernel.org>
---
arch/x86/coco/tdx/tdx.c | 128 ++++++++++++++++++++++++++++++++++++----
1 file changed, 115 insertions(+), 13 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index af0b6c1cacf7..86c22fec97fb 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
+#include <linux/mm.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -405,6 +406,84 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
EPT_WRITE, addr, val);
}
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
+static int get_phys_addr(unsigned long addr, phys_addr_t *phys_addr, bool *writable)
+{
+ unsigned int level;
+ pgd_t *pgdp;
+ pte_t *ptep;
+
+ /*
+ * Address validation only makes sense for a user process. The lock must
+ * be obtained before validation can begin.
+ */
+ mmap_assert_locked(current->mm);
+
+ pgdp = pgd_offset(current->mm, addr);
+
+ if (!pgd_none(*pgdp)) {
+ ptep = lookup_address_in_pgd(pgdp, addr, &level);
+ if (ptep) {
+ unsigned long offset;
+
+ offset = addr & ~page_level_mask(level);
+ *phys_addr = PFN_PHYS(pte_pfn(*ptep));
+ *phys_addr |= offset;
+
+ *writable = pte_write(*ptep);
+
+ return 0;
+ }
+ }
+
+ return -EFAULT;
+}
+
+static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
+ unsigned long vaddr)
+{
+ phys_addr_t phys_addr;
+ bool writable = false;
+
+ /* It's not fatal. This can happen due to swap out or page migration. */
+ if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
+ return -EAGAIN;
+
+ /*
+ * Re-check whether #VE info matches the instruction that was decoded.
+ *
+ * The ve->gpa was valid at the time ve_info was received. But this code
+ * is executed with interrupts enabled, allowing TLB shootdown and
+ * therefore munmap() to run in a parallel thread.
+ *
+ * By the time MMIO emulation is performed, ve->gpa may already be
+ * unmapped from the process, the device it belongs to removed from the
+ * system, and something else plugged in its place.
+ */
+ switch (mmio) {
+ case INSN_MMIO_WRITE:
+ case INSN_MMIO_WRITE_IMM:
+ if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
+ return -EFAULT;
+ break;
+ case INSN_MMIO_READ:
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
+ return -EFAULT;
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int handle_mmio_write(struct insn *insn, enum insn_mmio_type mmio, int size,
struct pt_regs *regs, struct ve_info *ve)
{
@@ -489,7 +568,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
enum insn_mmio_type mmio;
struct insn insn = {};
unsigned long vaddr;
- int size;
+ int size, ret;
/* Only in-kernel MMIO is supported */
if (WARN_ON_ONCE(user_mode(regs)))
@@ -505,6 +584,17 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
return -EINVAL;
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+ if (current->mm) {
+ if (mmap_read_lock_killable(current->mm))
+ return -EINTR;
+
+ ret = valid_vaddr(ve, mmio, size, vaddr);
+ if (ret)
+ goto unlock;
+ }
+
/*
* Reject EPT violation #VEs that split pages.
*
@@ -514,30 +604,39 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
*
* load_unaligned_zeropad() will recover using exception fixups.
*/
- vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
- if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
- return -EFAULT;
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) {
+ ret = -EFAULT;
+ goto unlock;
+ }
switch (mmio) {
case INSN_MMIO_WRITE:
case INSN_MMIO_WRITE_IMM:
case INSN_MMIO_MOVS:
- return handle_mmio_write(&insn, mmio, size, regs, ve);
+ ret = handle_mmio_write(&insn, mmio, size, regs, ve);
+ break;
case INSN_MMIO_READ:
case INSN_MMIO_READ_ZERO_EXTEND:
case INSN_MMIO_READ_SIGN_EXTEND:
- return handle_mmio_read(&insn, mmio, size, regs, ve);
+ ret = handle_mmio_read(&insn, mmio, size, regs, ve);
+ break;
case INSN_MMIO_DECODE_FAILED:
/*
* MMIO was accessed with an instruction that could not be
* decoded or handled properly. It was likely not using io.h
* helpers or accessed MMIO accidentally.
*/
- return -EINVAL;
+ ret = -EINVAL;
+ break;
default:
WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
- return -EINVAL;
+ ret = -EINVAL;
}
+unlock:
+ if (current->mm)
+ mmap_read_unlock(current->mm);
+
+ return ret;
}
static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -681,11 +780,6 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
}
}
-static inline bool is_private_gpa(u64 gpa)
-{
- return gpa == cc_mkenc(gpa);
-}
-
/*
* Handle the kernel #VE.
*
@@ -723,6 +817,14 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
insn_len = virt_exception_user(regs, ve);
else
insn_len = virt_exception_kernel(regs, ve);
+
+ /*
+ * A special case: return to userspace without advancing regs->ip, so
+ * that the instruction is repeated.
+ */
+ if (insn_len == -EAGAIN)
+ return true;
+
if (insn_len < 0)
return false;
--
2.45.2
On Fri, Aug 16, 2024 at 03:43:52PM +0200, Alexey Gladkov wrote:
> From: "Alexey Gladkov (Intel)" <legion@kernel.org>
>
> Instructions from kernel space are considered trusted. If the MMIO
> instruction is from userspace it must be checked.
>
> For userspace instructions, it is necessary to check that the INSN has
> not changed between the time of the #VE and the execution of the instruction.
Well, we cannot really check if the instruction changed under us. We can
only check if the parsed instruction does an MMIO operation that is
allowed for the process.
>
> Once the userspace instruction is parsed, it is enforced that the
> address points to mapped memory of the current process and that the
> address does not point to private memory.
>
> After parsing the userspace instruction, it is necessary to ensure that:
>
> 1. the operation direction (read/write) corresponds to #VE info;
> 2. the address still points to mapped memory of current process;
> 3. the address does not point to private memory.
I don't see where you check 3.
I guess you can add pte_decrypted(pte) check to get_phys_addr().
But I'm not sure it is strictly needed.
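Something along these lines, perhaps (untested sketch on top of the
lookup in get_phys_addr(), assuming the pte_decrypted() helper):

	ptep = lookup_address_in_pgd(pgdp, addr, &level);
	if (ptep) {
		/* Reject MMIO emulation of private (encrypted) memory */
		if (!pte_decrypted(*ptep))
			return -EFAULT;
		/* existing offset/phys_addr/writable logic continues here */
	}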
[...]
> + /* It's not fatal. This can happen due to swap out or page migration. */
> + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
Too long line?
[...]
> + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> +
> + if (current->mm) {
Hm. This path will be taken for any MMIO if it is done in the context of
a process, even in-kernel only. I don't think we want it. It is useless
overhead.

Use user_mode(regs) instead.
[...]
--
Kiryl Shutsemau / Kirill A. Shutemov
On Mon, Aug 19, 2024 at 01:39:17PM +0300, Kirill A. Shutemov wrote:
> On Fri, Aug 16, 2024 at 03:43:52PM +0200, Alexey Gladkov wrote:
> > From: "Alexey Gladkov (Intel)" <legion@kernel.org>
> >
> > Instructions from kernel space are considered trusted. If the MMIO
> > instruction is from userspace it must be checked.
> >
> > For userspace instructions, it is necessary to check that the INSN has
> > not changed between the time of the #VE and the execution of the instruction.
>
> Well, we cannot really check if the instruction changed under us. We can
> only check if the parsed instruction does an MMIO operation that is
> allowed for the process.
We also check that the memory access (read/write) type matches. Yes, we
can't check the instruction itself, but we check the arguments.
> >
> > Once the userspace instruction is parsed, it is enforced that the
> > address points to mapped memory of the current process and that the
> > address does not point to private memory.
> >
> > After parsing the userspace instruction, it is necessary to ensure that:
> >
> > 1. the operation direction (read/write) corresponds to #VE info;
> > 2. the address still points to mapped memory of current process;
> > 3. the address does not point to private memory.
>
> I don't see where you check 3.
>
> I guess you can add pte_decrypted(pte) check to get_phys_addr().
>
> But I'm not sure it is strictly needed.
(ve->gpa != cc_mkdec(phys_addr))
The ve->gpa was checked in the virt_exception_user/kernel().
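For reference, that check on the #VE side looks like this (quoting the
current virt_exception_kernel() in tdx.c from memory, so treat it as a
sketch):

	case EXIT_REASON_EPT_VIOLATION:
		if (is_private_gpa(ve->gpa))
			panic("Unexpected EPT violation\n");
		return handle_mmio(regs, ve);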
[...]
> > + /* It's not fatal. This can happen due to swap out or page migration. */
> > + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
>
> Too long line?
All patches pass checkpatch without warnings.
[...]
> > + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > +
> > + if (current->mm) {
>
> Hm. This path will be taken for any MMIO if it is done in the context of
> a process, even in-kernel only. I don't think we want it. It is useless
> overhead.
Kthreads do not have a current->mm. As an example:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/vfio/vfio_iommu_type1.c#n3053

Also, the documentation mentions this as the way to check for a user
context:

    (which makes more sense anyway - the test is basically one of "do
    we have a user context", and is generally done by the page fault
    handler and things like that).

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/mm/active_mm.rst#n80
> Use user_mode(regs) instead.
I can't use this. When a nested exception happens in handle_mmio_movs(),
the regs will not be in user mode.
I can make a flag that will be set either for user_mode or if we have a
nested exception.
[...]
--
Rgrds, legion
On Mon, Aug 19, 2024 at 01:48:16PM +0200, Alexey Gladkov wrote:
> On Mon, Aug 19, 2024 at 01:39:17PM +0300, Kirill A. Shutemov wrote:
> > On Fri, Aug 16, 2024 at 03:43:52PM +0200, Alexey Gladkov wrote:
> > > From: "Alexey Gladkov (Intel)" <legion@kernel.org>
> > >
> > > Instructions from kernel space are considered trusted. If the MMIO
> > > instruction is from userspace it must be checked.
> > >
> > > For userspace instructions, it is necessary to check that the INSN has
> > > not changed between the time of the #VE and the execution of the instruction.
> >
> > Well, we cannot really check if the instruction changed under us. We can
> > only check if the parsed instruction does an MMIO operation that is
> > allowed for the process.
>
> We also check that the memory access (read/write) type matches. Yes, we
> can't check the instruction itself, but we check the arguments.
>
> > >
> > > Once the userspace instruction is parsed, it is enforced that the
> > > address points to mapped memory of the current process and that the
> > > address does not point to private memory.
> > >
> > > After parsing the userspace instruction, it is necessary to ensure that:
> > >
> > > 1. the operation direction (read/write) corresponds to #VE info;
> > > 2. the address still points to mapped memory of current process;
> > > 3. the address does not point to private memory.
> >
> > I don't see where you check 3.
> >
> > I guess you can add pte_decrypted(pte) check to get_phys_addr().
> >
> > But I'm not sure it is strictly needed.
>
> (ve->gpa != cc_mkdec(phys_addr))
>
> The ve->gpa was checked in the virt_exception_user/kernel().
phys_addr doesn't have the shared bit. It is masked out by pte_pfn().
That's the reason you use cc_mkdec() to compare with ve->gpa. Otherwise
the comparison would fail.
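To illustrate (numbers made up, assuming the shared bit is bit 47 with
4-level paging):

	ve->gpa                  == 0x800000001000	/* shared bit set  */
	PFN_PHYS(pte_pfn(*ptep)) == 0x000000001000	/* shared bit gone */
	cc_mkdec(0x000000001000) == 0x800000001000	/* matches ve->gpa */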
[...]
> > > + /* It's not fatal. This can happen due to swap out or page migration. */
> > > + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
> >
> > Too long line?
>
> All patches pass checkpatch without warnings.
Checkpatch is not the ultimate authority. But I am neither. :P
[...]
> > >
> > > + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > > +
> > > + if (current->mm) {
> >
> > Hm. This path will be taken for any MMIO if it is done in the context of
> > a process, even in-kernel only. I don't think we want it. It is useless
> > overhead.
>
> Kthreads do not have a current->mm.
I am not talking about kthreads. I am talking about initiating MMIO from
the kernel, but within a process context. Like, you call an ioctl() on a
device fd and it triggers MMIO in the kernel. This scenario would have
current->mm, but it is not userspace MMIO.
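To spell out the three cases (sketch only, not from the patch):

	if (user_mode(regs)) {
		/* #VE hit a userspace instruction: validate vaddr */
	} else if (current->mm) {
		/* in-kernel MMIO in process context, e.g. via ioctl() */
	} else {
		/* kthread: no user context at all */
	}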
> > Use user_mode(regs) instead.
>
> > I can't use this. When a nested exception happens in handle_mmio_movs(),
> > the regs will not be in user mode.
>
> I can make a flag that will be set either for user_mode or if we have a
> nested exception.
Hm. Yeah. This is ugly. Let me think about it.
--
Kiryl Shutsemau / Kirill A. Shutemov
On Mon, Aug 19, 2024 at 03:07:50PM +0300, Kirill A. Shutemov wrote:
> On Mon, Aug 19, 2024 at 01:48:16PM +0200, Alexey Gladkov wrote:
> > On Mon, Aug 19, 2024 at 01:39:17PM +0300, Kirill A. Shutemov wrote:
> > > On Fri, Aug 16, 2024 at 03:43:52PM +0200, Alexey Gladkov wrote:
> > > > From: "Alexey Gladkov (Intel)" <legion@kernel.org>
> > > >
> > > > Instructions from kernel space are considered trusted. If the MMIO
> > > > instruction is from userspace it must be checked.
> > > >
> > > > For userspace instructions, it is necessary to check that the INSN has
> > > > not changed between the time of the #VE and the execution of the instruction.
> > >
> > > Well, we cannot really check if the instruction changed under us. We can
> > > only check if the parsed instruction does an MMIO operation that is
> > > allowed for the process.
> >
> > We also check that the memory access (read/write) type matches. Yes, we
> > can't check the instruction itself, but we check the arguments.
> >
> > > >
> > > > Once the userspace instruction is parsed, it is enforced that the
> > > > address points to mapped memory of the current process and that the
> > > > address does not point to private memory.
> > > >
> > > > After parsing the userspace instruction, it is necessary to ensure that:
> > > >
> > > > 1. the operation direction (read/write) corresponds to #VE info;
> > > > 2. the address still points to mapped memory of current process;
> > > > 3. the address does not point to private memory.
> > >
> > > I don't see where you check 3.
> > >
> > > I guess you can add pte_decrypted(pte) check to get_phys_addr().
> > >
> > > But I'm not sure it is strictly needed.
> >
> > (ve->gpa != cc_mkdec(phys_addr))
> >
> > The ve->gpa was checked in the virt_exception_user/kernel().
>
> phys_addr doesn't have the shared bit. It is masked out by pte_pfn().
> That's the reason you use cc_mkdec() to compare with ve->gpa. Otherwise
> the comparison would fail.
Ok. I think I've confused myself. I will add pte_decrypted().
[...]
> > > > + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > > > +
> > > > + if (current->mm) {
> > >
> > > Hm. This path will be taken for any MMIO if it is done in the context of
> > > a process, even in-kernel only. I don't think we want it. It is useless
> > > overhead.
> >
> > Kthreads do not have a current->mm.
>
> I am not talking about kthreads. I am talking about initiating MMIO from
> the kernel, but within a process context. Like, you call an ioctl() on a
> device fd and it triggers MMIO in the kernel. This scenario would have
> current->mm, but it is not userspace MMIO.
Ok. I will use user_mode() here, and in the MOVS patch I will add a
special flag to perform the checks in case of nested exceptions.
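Roughly like this (just a sketch; the flag name is made up and the final
shape is not settled):

	/*
	 * True for user_mode(regs), or for a nested #VE raised while
	 * emulating a userspace MOVS.
	 */
	bool user_insn = user_mode(regs) || nested_user_movs;

	if (user_insn) {
		if (mmap_read_lock_killable(current->mm))
			return -EINTR;
		/* validate vaddr as above */
	}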
> > > Use user_mode(regs) instead.
> >
> > I can't use this. When a nested exception happens in handle_mmio_movs(),
> > the regs will not be in user mode.
> >
> > I can make a flag that will be set either for user_mode or if we have a
> > nested exception.
>
> Hm. Yeah. This is ugly. Let me think about it.
Yes, it's not very good.
--
Rgrds, legion