[PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions

Posted by Alexey Gladkov (Intel) 1 year, 4 months ago
Instructions from kernel space are considered trusted. If the MMIO
instruction comes from userspace, it must be checked.

For userspace instructions, it is necessary to check that the instruction
has not changed between the time of the #VE and the time the instruction
is emulated.

After parsing the userspace instruction, it is necessary to ensure that:

1. the operation direction (read/write) corresponds to the #VE info;
2. the address still points to mapped memory of the current process;
3. the address does not point to private memory.

Signed-off-by: Alexey Gladkov (Intel) <legion@kernel.org>
---
 arch/x86/coco/tdx/tdx.c | 128 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 115 insertions(+), 13 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index af0b6c1cacf7..95f2ff49728c 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@
 #include <linux/export.h>
 #include <linux/io.h>
 #include <linux/kexec.h>
+#include <linux/mm.h>
 #include <asm/coco.h>
 #include <asm/tdx.h>
 #include <asm/vmx.h>
@@ -405,6 +406,84 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
 			       EPT_WRITE, addr, val);
 }
 
+static inline bool is_private_gpa(u64 gpa)
+{
+	return gpa == cc_mkenc(gpa);
+}
+
+static int get_phys_addr(unsigned long addr, phys_addr_t *phys_addr, bool *writable)
+{
+	unsigned int level;
+	pgd_t *pgdp;
+	pte_t *ptep;
+
+	/*
+	 * Address validation only makes sense for a user process. The mmap lock
+	 * must be held before validation can begin.
+	 */
+	mmap_assert_locked(current->mm);
+
+	pgdp = pgd_offset(current->mm, addr);
+
+	if (!pgd_none(*pgdp)) {
+		ptep = lookup_address_in_pgd(pgdp, addr, &level);
+		if (ptep) {
+			unsigned long offset;
+
+			offset = addr & ~page_level_mask(level);
+			*phys_addr = PFN_PHYS(pte_pfn(*ptep));
+			*phys_addr |= offset;
+
+			*writable = pte_write(*ptep);
+
+			return 0;
+		}
+	}
+
+	return -EFAULT;
+}
+
+static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
+		       unsigned long vaddr)
+{
+	phys_addr_t phys_addr;
+	bool writable = false;
+
+	/* It's not fatal. This can happen due to swap out or page migration. */
+	if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
+		return -EAGAIN;
+
+	/*
+	 * Re-check whether #VE info matches the instruction that was decoded.
+	 *
+	 * The ve->gpa was valid at the time ve_info was received. But this code
+	 * runs with interrupts enabled, allowing TLB shootdown and therefore
+	 * munmap() to be executed in a parallel thread.
+	 *
+	 * By the time MMIO emulation is performed, ve->gpa may already be
+	 * unmapped from the process, the device it belongs to removed from the
+	 * system and something else could be plugged in its place.
+	 */
+	switch (mmio) {
+	case INSN_MMIO_WRITE:
+	case INSN_MMIO_WRITE_IMM:
+		if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
+			return -EFAULT;
+		break;
+	case INSN_MMIO_READ:
+	case INSN_MMIO_READ_ZERO_EXTEND:
+	case INSN_MMIO_READ_SIGN_EXTEND:
+		if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
+			return -EFAULT;
+		break;
+	default:
+		WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int handle_mmio_write(struct insn *insn, enum insn_mmio_type mmio, int size,
 			     struct pt_regs *regs, struct ve_info *ve)
 {
@@ -489,7 +568,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 	enum insn_mmio_type mmio;
 	struct insn insn = {};
 	unsigned long vaddr;
-	int size;
+	int size, ret;
 
 	/* Only in-kernel MMIO is supported */
 	if (WARN_ON_ONCE(user_mode(regs)))
@@ -505,6 +584,17 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
 		return -EINVAL;
 
+	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+
+	if (user_mode(regs)) {
+		if (mmap_read_lock_killable(current->mm))
+			return -EINTR;
+
+		ret = valid_vaddr(ve, mmio, size, vaddr);
+		if (ret)
+			goto unlock;
+	}
+
 	/*
 	 * Reject EPT violation #VEs that split pages.
 	 *
@@ -514,30 +604,39 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 	 *
 	 * load_unaligned_zeropad() will recover using exception fixups.
 	 */
-	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
-	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
-		return -EFAULT;
+	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) {
+		ret = -EFAULT;
+		goto unlock;
+	}
 
 	switch (mmio) {
 	case INSN_MMIO_WRITE:
 	case INSN_MMIO_WRITE_IMM:
 	case INSN_MMIO_MOVS:
-		return handle_mmio_write(&insn, mmio, size, regs, ve);
+		ret = handle_mmio_write(&insn, mmio, size, regs, ve);
+		break;
 	case INSN_MMIO_READ:
 	case INSN_MMIO_READ_ZERO_EXTEND:
 	case INSN_MMIO_READ_SIGN_EXTEND:
-		return handle_mmio_read(&insn, mmio, size, regs, ve);
+		ret = handle_mmio_read(&insn, mmio, size, regs, ve);
+		break;
 	case INSN_MMIO_DECODE_FAILED:
 		/*
 		 * MMIO was accessed with an instruction that could not be
 		 * decoded or handled properly. It was likely not using io.h
 		 * helpers or accessed MMIO accidentally.
 		 */
-		return -EINVAL;
+		ret = -EINVAL;
+		break;
 	default:
 		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
-		return -EINVAL;
+		ret = -EINVAL;
 	}
+unlock:
+	if (user_mode(regs))
+		mmap_read_unlock(current->mm);
+
+	return ret;
 }
 
 static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -681,11 +780,6 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
 	}
 }
 
-static inline bool is_private_gpa(u64 gpa)
-{
-	return gpa == cc_mkenc(gpa);
-}
-
 /*
  * Handle the kernel #VE.
  *
@@ -723,6 +817,14 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
 		insn_len = virt_exception_user(regs, ve);
 	else
 		insn_len = virt_exception_kernel(regs, ve);
+
+	/*
+	 * A special case to return to userspace without advancing regs->ip,
+	 * so that the instruction will be repeated.
+	 */
+	if (insn_len == -EAGAIN)
+		return true;
+
 	if (insn_len < 0)
 		return false;
 
-- 
2.45.2
Re: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by Edgecombe, Rick P 1 year, 4 months ago
On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> +
> +       if (user_mode(regs)) {
> +               if (mmap_read_lock_killable(current->mm))
> +                       return -EINTR;
> +
> +               ret = valid_vaddr(ve, mmio, size, vaddr);
> +               if (ret)
> +                       goto unlock;
> +       }
> +

In the case of user MMIO, if the user instruction + MAX_INSN_SIZE straddles a
page, then the "fetch" in the kernel could trigger a #VE. In this case the  
kernel would handle this second #VE as a !user_mode() MMIO I guess.

Would something prevent the same munmap() checks needing to happen for that
second kernel #VE? If not, I wonder if the munmap() protection logic should also
trigger for any userspace range ve->gpa as well.
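To make the scenario concrete, here is a rough sketch of the fetch in
question (illustrative only, not code from this series; it uses the
existing insn_fetch_from_user() helper that the decode path is built on):

	unsigned char buf[MAX_INSN_SIZE];
	int nr_copied;

	/*
	 * insn_fetch_from_user() copies up to MAX_INSN_SIZE (15) bytes
	 * starting at regs->ip via copy_from_user(). If the faulting user
	 * instruction ends a couple of bytes before a page boundary, the copy
	 * runs into the next page. Should that page be a shared/MMIO mapping,
	 * the copy itself takes a second #VE - this time in kernel context,
	 * so it would be handled as !user_mode() MMIO.
	 */
	nr_copied = insn_fetch_from_user(regs, buf);
	if (nr_copied <= 0)
		return -EFAULT;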
Re: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by Alexey Gladkov 1 year, 4 months ago
On Mon, Aug 05, 2024 at 10:40:55PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> > +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > +
> > +       if (user_mode(regs)) {
> > +               if (mmap_read_lock_killable(current->mm))
> > +                       return -EINTR;
> > +
> > +               ret = valid_vaddr(ve, mmio, size, vaddr);
> > +               if (ret)
> > +                       goto unlock;
> > +       }
> > +
> 
> In the case of user MMIO, if the user instruction + MAX_INSN_SIZE straddles a
> page, then the "fetch" in the kernel could trigger a #VE. In this case the  
> kernel would handle this second #VE as a !user_mode() MMIO I guess.
> 
> Would something prevent the same munmap() checks needing to happen for that
> second kernel #VE? If not, I wonder if the munmap() protection logic should also
> trigger for any userspace range ve->gpa as well.

I've added two more patches that should fix the problem. We can try to
avoid crossing the page boundary by first reading the data only up to the
end of the page and trying to parse it, and only reading MAX_INSN_SIZE
bytes if that fails.

I fixed this locally for TDX because there the buffer has to be read and
parsed in the same place.

It's generally worth fixing elsewhere as well, but I propose doing that in
a separate patchset.

-- 
Rgrds, legion
Re: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by kirill.shutemov@linux.intel.com 1 year, 4 months ago
On Mon, Aug 05, 2024 at 10:40:55PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> > +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > +
> > +       if (user_mode(regs)) {
> > +               if (mmap_read_lock_killable(current->mm))
> > +                       return -EINTR;
> > +
> > +               ret = valid_vaddr(ve, mmio, size, vaddr);
> > +               if (ret)
> > +                       goto unlock;
> > +       }
> > +
> 
> In the case of user MMIO, if the user instruction + MAX_INSN_SIZE straddles a
> page, then the "fetch" in the kernel could trigger a #VE. In this case the  
> kernel would handle this second #VE as a !user_mode() MMIO I guess.
> 
> Would something prevent the same munmap() checks needing to happen for that
> second kernel #VE? If not, I wonder if the munmap() protection logic should also
> trigger for any userspace range ve->gpa as well.

That's an interesting scenario, but I think we are fine.

The fetch is copy_from_user(), which is "REP; MOVSB" on all TDX platforms.
The kernel rejects MOVS instruction emulation for !user_mode() with -EFAULT.
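Roughly, the idea looks like this (a paraphrased sketch based on the MOVS
emulation added elsewhere in this series; the handle_mmio_movs() name and
exact shape are assumptions, not a verbatim quote):

	case INSN_MMIO_MOVS:
		/*
		 * MOVS emulation is only wired up for userspace. A MOVS that
		 * faults while the CPU is in kernel mode (e.g. the REP MOVSB
		 * inside copy_from_user()) is rejected rather than emulated.
		 */
		if (user_mode(regs))
			ret = handle_mmio_movs(&insn, regs, size, ve);
		else
			ret = -EFAULT;
		break;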

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by Alexey Gladkov 1 year, 4 months ago
On Tue, Aug 06, 2024 at 10:18:20AM +0300, kirill.shutemov@linux.intel.com wrote:
> On Mon, Aug 05, 2024 at 10:40:55PM +0000, Edgecombe, Rick P wrote:
> > On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> > > +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > > +
> > > +       if (user_mode(regs)) {
> > > +               if (mmap_read_lock_killable(current->mm))
> > > +                       return -EINTR;
> > > +
> > > +               ret = valid_vaddr(ve, mmio, size, vaddr);
> > > +               if (ret)
> > > +                       goto unlock;
> > > +       }
> > > +
> > 
> > In the case of user MMIO, if the user instruction + MAX_INSN_SIZE straddles a
> > page, then the "fetch" in the kernel could trigger a #VE. In this case the  
> > kernel would handle this second #VE as a !user_mode() MMIO I guess.
> > 
> > Would something prevent the same munmap() checks needing to happen for that
> > second kernel #VE? If not, I wonder if the munmap() protection logic should also
> > trigger for any userspace range ve->gpa as well.
> 
> That's an interesting scenario, but I think we are fine.
> 
> The fetch is copy_from_user() which is "REP; MOVSB" on all TDX platforms.
> Kernel rejects MOVS instruction emulation for !user_mode() with -EFAULT.

But MOVS will be used only if the X86_FEATURE_FSRM feature is present.
Otherwise rep_movs_alternative will be used, which uses plain MOVB.

I know that X86_FEATURE_FSRM has been around since Ice Lake, but still.
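
For reference, the selection happens in copy_user_generic() in
arch/x86/include/asm/uaccess_64.h. Quoted from memory for recent kernels,
so minor details may differ:

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
	stac();
	/*
	 * With X86_FEATURE_FSRM the copy is a bare 'rep movsb'; without it,
	 * rep_movs_alternative is called, which uses plain MOV instructions
	 * for short copies such as the MAX_INSN_SIZE instruction fetch.
	 */
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep movsb",
			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
		: : "memory", "rax");
	clac();
	return len;
}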

-- 
Rgrds, legion
RE: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by Reshetova, Elena 1 year, 4 months ago
> On Tue, Aug 06, 2024 at 10:18:20AM +0300, kirill.shutemov@linux.intel.com
> wrote:
> > On Mon, Aug 05, 2024 at 10:40:55PM +0000, Edgecombe, Rick P wrote:
> > > On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> > > > +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > > > +
> > > > +       if (user_mode(regs)) {
> > > > +               if (mmap_read_lock_killable(current->mm))
> > > > +                       return -EINTR;
> > > > +
> > > > +               ret = valid_vaddr(ve, mmio, size, vaddr);
> > > > +               if (ret)
> > > > +                       goto unlock;
> > > > +       }
> > > > +
> > >
> > > In the case of user MMIO, if the user instruction + MAX_INSN_SIZE
> straddles a
> > > page, then the "fetch" in the kernel could trigger a #VE. In this case the
> > > kernel would handle this second #VE as a !user_mode() MMIO I guess.
> > >
> > > Would something prevent the same munmap() checks needing to happen
> for that
> > > second kernel #VE? If not, I wonder if the munmap() protection logic
> should also
> > > trigger for any userspace range ve->gpa as well.
> >
> > That's an interesting scenario, but I think we are fine.
> >
> > The fetch is copy_from_user() which is "REP; MOVSB" on all TDX platforms.
> > Kernel rejects MOVS instruction emulation for !user_mode() with -EFAULT.
> 
> But MOVS will be used only if X86_FEATURE_FSRM feature is present.
> Otherwise rep_movs_alternative will be used, which uses MOVB.
> 
> I know that X86_FEATURE_FSRM appeared since Ice Lake, but still.

This is how the X86_FEATURE_FSRM cpuid bit is treated under TDX:

{
          "MSB": "4",
          "LSB": "4",
          "Field Size": "1",
          "Field Name": "Fast Short REP MOV",
          "Configuration Details": "TD_PARAMS.CPUID_CONFIG",
          "Bit or Field Virtualization Type": "Configured & Native",
          "Virtualization Details": null
        },

Which means the VMM has a way to override the native platform value
and set it to "0", so we must account for both cases.
Re: [PATCH v2 2/5] x86/tdx: Add validation of userspace MMIO instructions
Posted by Alexey Gladkov 1 year, 4 months ago
On Tue, Aug 06, 2024 at 11:41:57AM +0000, Reshetova, Elena wrote:
> > On Tue, Aug 06, 2024 at 10:18:20AM +0300, kirill.shutemov@linux.intel.com
> > wrote:
> > > On Mon, Aug 05, 2024 at 10:40:55PM +0000, Edgecombe, Rick P wrote:
> > > > On Mon, 2024-08-05 at 15:29 +0200, Alexey Gladkov (Intel) wrote:
> > > > > +       vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > > > > +
> > > > > +       if (user_mode(regs)) {
> > > > > +               if (mmap_read_lock_killable(current->mm))
> > > > > +                       return -EINTR;
> > > > > +
> > > > > +               ret = valid_vaddr(ve, mmio, size, vaddr);
> > > > > +               if (ret)
> > > > > +                       goto unlock;
> > > > > +       }
> > > > > +
> > > >
> > > > In the case of user MMIO, if the user instruction + MAX_INSN_SIZE
> > straddles a
> > > > page, then the "fetch" in the kernel could trigger a #VE. In this case the
> > > > kernel would handle this second #VE as a !user_mode() MMIO I guess.
> > > >
> > > > Would something prevent the same munmap() checks needing to happen
> > for that
> > > > second kernel #VE? If not, I wonder if the munmap() protection logic
> > should also
> > > > trigger for any userspace range ve->gpa as well.
> > >
> > > That's an interesting scenario, but I think we are fine.
> > >
> > > The fetch is copy_from_user() which is "REP; MOVSB" on all TDX platforms.
> > > Kernel rejects MOVS instruction emulation for !user_mode() with -EFAULT.
> > 
> > But MOVS will be used only if X86_FEATURE_FSRM feature is present.
> > Otherwise rep_movs_alternative will be used, which uses MOVB.
> > 
> > I know that X86_FEATURE_FSRM appeared since Ice Lake, but still.
> 
> This is how the X86_FEATURE_FSRM cpuid bit is treated under TDX:
> 
> {
>           "MSB": "4",
>           "LSB": "4",
>           "Field Size": "1",
>           "Field Name": "Fast Short REP MOV",
>           "Configuration Details": "TD_PARAMS.CPUID_CONFIG",
>           "Bit or Field Virtualization Type": "Configured & Native",
>           "Virtualization Details": null
>         },
> 
> Which means VMM has the way to overwrite the native platform value
> and set it to "0", so we must account for both cases. 

I have added a patch that disallows access to userspace addresses from a
kernel-space context.

-- 
Rgrds, legion
[PATCH v3 6/7] x86/tdx: Add a restriction on access to MMIO address
Posted by Alexey Gladkov (Intel) 1 year, 4 months ago
In the case of userspace MMIO, if the user instruction + MAX_INSN_SIZE
straddles a page boundary, the "fetch" in the kernel could trigger a #VE.
The kernel would then handle this second #VE as a !user_mode() MMIO.
Rejecting such accesses means the additional address verification does not
have to be repeated for that second #VE.

The scenario of accessing userspace MMIO addresses from kernel space does
not seem appropriate under normal circumstances. Until there is a specific
use case for such a scenario, it can be disabled.

Signed-off-by: Alexey Gladkov (Intel) <legion@kernel.org>
---
 arch/x86/coco/tdx/tdx.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index dfadb085d2d3..5b3421a89998 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -411,6 +411,11 @@ static inline bool is_private_gpa(u64 gpa)
 	return gpa == cc_mkenc(gpa);
 }
 
+static inline bool is_kernel_addr(unsigned long addr)
+{
+	return (long)addr < 0;
+}
+
 static int get_phys_addr(unsigned long addr, phys_addr_t *phys_addr, bool *writable)
 {
 	unsigned int level;
@@ -641,6 +646,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 	unsigned long vaddr;
 	int size, ret;
 
+
 	ret = decode_insn_struct(&insn, regs);
 	if (ret)
 		return ret;
@@ -661,6 +667,9 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 		ret = valid_vaddr(ve, mmio, size, vaddr);
 		if (ret)
 			goto unlock;
+	} else if (!is_kernel_addr(ve->gla)) {
+		WARN_ONCE(1, "Access to userspace address is not supported");
+		return -EINVAL;
 	}
 
 	/*
-- 
2.45.2
[PATCH v3 7/7] x86/tdx: Avoid crossing the page boundary
Posted by Alexey Gladkov (Intel) 1 year, 4 months ago
If the instruction is close to a page boundary, reading MAX_INSN_SIZE
bytes may cross that boundary. The second page might belong to a different
VMA and reading it can have side effects.

The problem is that the actual size of the instruction is not known in
advance.

The solution is to first read the data only up to the end of the page and
try to parse it, in the hope that the instruction is smaller than the
maximum buffer size. Only if decoding fails is the full MAX_INSN_SIZE
read.

Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alexey Gladkov (Intel) <legion@kernel.org>
---
 arch/x86/coco/tdx/tdx.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 5b3421a89998..ea3df77feef0 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -494,16 +494,32 @@ static int decode_insn_struct(struct insn *insn, struct pt_regs *regs)
 	char buffer[MAX_INSN_SIZE];
 
 	if (user_mode(regs)) {
-		int nr_copied = insn_fetch_from_user(regs, buffer);
+		int nr_copied, size;
+		unsigned long ip;
 
-		if (nr_copied <= 0)
+		if (insn_get_effective_ip(regs, &ip))
 			return -EFAULT;
 
-		if (!insn_decode_from_regs(insn, regs, buffer, nr_copied))
-			return -EINVAL;
+		/*
+		 * On the first attempt, read up to MAX_INSN_SIZE, but do not cross a
+		 * page boundary. The second page might be from a different VMA and
+		 * reading can have side effects (e.g. reading from MMIO).
+		 */
+		size = min(MAX_INSN_SIZE, PAGE_SIZE - offset_in_page(ip));
+retry:
+		nr_copied = size - copy_from_user(buffer, (void __user *)ip, size);
+
+		if (nr_copied <= 0)
+			return -EFAULT;
 
-		if (!insn->immediate.got)
+		if (!insn_decode_from_regs(insn, regs, buffer, nr_copied)) {
+			/* If decoding failed, retry the copy across the page boundary */
+			if (size < MAX_INSN_SIZE) {
+				size = MAX_INSN_SIZE;
+				goto retry;
+			}
 			return -EINVAL;
+		}
 	} else {
 		if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
 			return -EFAULT;
@@ -511,6 +527,10 @@ static int decode_insn_struct(struct insn *insn, struct pt_regs *regs)
 		if (insn_decode(insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
 			return -EINVAL;
 	}
+
+	if (!insn->immediate.got)
+		return -EINVAL;
+
 	return 0;
 }
 
-- 
2.45.2