Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail. The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.
Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 4 ++--
include/uapi/linux/kvm.h | 13 ++++++++++++
virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++-------
5 files changed, 67 insertions(+), 11 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..ace984acc125 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
interface. No error will be returned, but the resulting offset will not be
applied.
+4.139 KVM_SET_USER_MEMORY_REGION2
+---------------------------------
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+::
+
+ struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+ };
+
+See KVM_SET_USER_MEMORY_REGION.
+
5. The kvm_run structure
========================
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..6409914428ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
}
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- struct kvm_userspace_memory_region m;
+ struct kvm_userspace_memory_region2 m;
m.slot = id | (i << 16);
m.flags = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
};
int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..bd1abe067f28 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
};
+/* for KVM_SET_USER_MEMORY_REGION2 */
+struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 userspace_addr;
+ __u64 pad[16];
+};
+
/*
* The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
* userspace, other bits are reserved for kvm internal use which are defined
@@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_COUNTER_OFFSET 227
#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
+#define KVM_CAP_USER_MEMORY2 230
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+ struct kvm_userspace_memory_region2)
/* enable ucontrol for s390 */
struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e708017064d..3f5b7c2c5327 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1578,7 +1578,7 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
}
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
@@ -1980,7 +1980,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
* Must be called holding kvm->slots_lock for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
@@ -2084,7 +2084,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
int r;
@@ -2096,7 +2096,7 @@ int kvm_set_memory_region(struct kvm *kvm,
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_userspace_memory_region2 *mem)
{
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL;
@@ -4566,6 +4566,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
switch (arg) {
case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_USER_MEMORY2:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
case KVM_CAP_INTERNAL_ERROR_DATA:
@@ -4821,6 +4822,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
return fd;
}
+#define SANITY_CHECK_MEM_REGION_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
+ offsetof(struct kvm_userspace_memory_region2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
+ sizeof_field(struct kvm_userspace_memory_region2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4843,15 +4852,28 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
break;
}
+ case KVM_SET_USER_MEMORY_REGION2:
case KVM_SET_USER_MEMORY_REGION: {
- struct kvm_userspace_memory_region kvm_userspace_mem;
+ struct kvm_userspace_memory_region2 mem;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_USER_MEMORY_REGION)
+ size = sizeof(struct kvm_userspace_memory_region);
+ else
+ size = sizeof(struct kvm_userspace_memory_region2);
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEM_REGION_FIELD(slot);
+ SANITY_CHECK_MEM_REGION_FIELD(flags);
+ SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+ SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+ SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
r = -EFAULT;
- if (copy_from_user(&kvm_userspace_mem, argp,
- sizeof(kvm_userspace_mem)))
+ if (copy_from_user(&mem, argp, size))
goto out;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+ r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
break;
}
case KVM_GET_DIRTY_LOG: {
--
2.42.0.820.g83a721a137-goog
On 10/28/2023 2:21 AM, Sean Christopherson wrote: > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional > information can be supplied without setting userspace up to fail. The > padding in the new kvm_userspace_memory_region2 structure will be used to > pass a file descriptor in addition to the userspace_addr, i.e. allow > userspace to point at a file descriptor and map memory into a guest that > is NOT mapped into host userspace. > > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" > without a new ioctl(), but as Paolo pointed out, adding a new ioctl() > makes detection of bad flags a bit more robust, e.g. if the new fd field > is guarded only by a flag and not a new ioctl(), then a userspace bug > (setting a "bad" flag) would generate out-of-bounds access instead of an > -EINVAL error. > > Cc: Jarkko Sakkinen <jarkko@kernel.org> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> > Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com> > Signed-off-by: Sean Christopherson <seanjc@google.com> > --- > Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++ > arch/x86/kvm/x86.c | 2 +- > include/linux/kvm_host.h | 4 ++-- > include/uapi/linux/kvm.h | 13 ++++++++++++ > virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++------- > 5 files changed, 67 insertions(+), 11 deletions(-) > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > index 21a7578142a1..ace984acc125 100644 > --- a/Documentation/virt/kvm/api.rst > +++ b/Documentation/virt/kvm/api.rst > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG > interface. No error will be returned, but the resulting offset will not be > applied. > > +4.139 KVM_SET_USER_MEMORY_REGION2 > +--------------------------------- > + > +:Capability: KVM_CAP_USER_MEMORY2 > +:Architectures: all > +:Type: vm ioctl > +:Parameters: struct kvm_userspace_memory_region2 (in) > +:Returns: 0 on success, -1 on error > + > +:: > + > + struct kvm_userspace_memory_region2 { > + __u32 slot; > + __u32 flags; > + __u64 guest_phys_addr; > + __u64 memory_size; /* bytes */ > + __u64 userspace_addr; /* start of the userspace allocated memory */ missing __u64 pad[16]; > + }; > + > +See KVM_SET_USER_MEMORY_REGION. > + > 5. The kvm_run structure > ======================== > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 41cce5031126..6409914428ca 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, > } > > for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { > - struct kvm_userspace_memory_region m; > + struct kvm_userspace_memory_region2 m; > > m.slot = id | (i << 16); > m.flags = 0; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 5faba69403ac..4e741ff27af3 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1146,9 +1146,9 @@ enum kvm_mr_change { > }; > > int kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem); > + const struct kvm_userspace_memory_region2 *mem); > int __kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem); > + const struct kvm_userspace_memory_region2 *mem); > void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); > void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); > int kvm_arch_prepare_memory_region(struct kvm *kvm, > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 13065dd96132..bd1abe067f28 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { > __u64 userspace_addr; /* start of the userspace allocated memory */ > }; > > +/* for KVM_SET_USER_MEMORY_REGION2 */ > +struct kvm_userspace_memory_region2 { > + __u32 slot; > + __u32 flags; > + __u64 guest_phys_addr; > + __u64 memory_size; > + __u64 userspace_addr; > + __u64 pad[16]; > +}; > + > /* > * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for > * userspace, other bits are reserved for kvm internal use which are defined > @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_COUNTER_OFFSET 227 > #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 > #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 > +#define KVM_CAP_USER_MEMORY2 230 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce { > struct kvm_userspace_memory_region) > #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) > #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) > +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ > + struct kvm_userspace_memory_region2) > > /* enable ucontrol for s390 */ > struct kvm_s390_ucas_mapping { > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 6e708017064d..3f5b7c2c5327 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -1578,7 +1578,7 @@ static void kvm_replace_memslot(struct kvm *kvm, > } > } > > -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) > +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) > { > u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; > > @@ -1980,7 +1980,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, > * Must be called holding kvm->slots_lock for write. > */ > int __kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem) > + const struct kvm_userspace_memory_region2 *mem) > { > struct kvm_memory_slot *old, *new; > struct kvm_memslots *slots; > @@ -2084,7 +2084,7 @@ int __kvm_set_memory_region(struct kvm *kvm, > EXPORT_SYMBOL_GPL(__kvm_set_memory_region); > > int kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem) > + const struct kvm_userspace_memory_region2 *mem) > { > int r; > > @@ -2096,7 +2096,7 @@ int kvm_set_memory_region(struct kvm *kvm, > EXPORT_SYMBOL_GPL(kvm_set_memory_region); > > static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, > - struct kvm_userspace_memory_region *mem) > + struct kvm_userspace_memory_region2 *mem) > { > if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) > return -EINVAL; > @@ -4566,6 +4566,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) > { > switch (arg) { > case KVM_CAP_USER_MEMORY: > + case KVM_CAP_USER_MEMORY2: > case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: > case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: > case KVM_CAP_INTERNAL_ERROR_DATA: > @@ -4821,6 +4822,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) > return fd; > } > > +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ > +do { \ > + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ > + offsetof(struct kvm_userspace_memory_region2, field)); \ > + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ > + sizeof_field(struct kvm_userspace_memory_region2, field)); \ > +} while (0) > + > static long kvm_vm_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -4843,15 +4852,28 @@ static long kvm_vm_ioctl(struct file *filp, > r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); > break; > } > + case KVM_SET_USER_MEMORY_REGION2: > case KVM_SET_USER_MEMORY_REGION: { > - struct kvm_userspace_memory_region kvm_userspace_mem; > + struct kvm_userspace_memory_region2 mem; > + unsigned long size; > + > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > + size = sizeof(struct kvm_userspace_memory_region); > + else > + size = sizeof(struct kvm_userspace_memory_region2); > + > + /* Ensure the common parts of the two structs are identical. */ > + SANITY_CHECK_MEM_REGION_FIELD(slot); > + SANITY_CHECK_MEM_REGION_FIELD(flags); > + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); > + SANITY_CHECK_MEM_REGION_FIELD(memory_size); > + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); > > r = -EFAULT; > - if (copy_from_user(&kvm_userspace_mem, argp, > - sizeof(kvm_userspace_mem))) > + if (copy_from_user(&mem, argp, size)) > goto out; > > - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); > + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); > break; > } > case KVM_GET_DIRTY_LOG: {
On Tue, Oct 31, 2023, Xiaoyao Li wrote: > On 10/28/2023 2:21 AM, Sean Christopherson wrote: > > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional > > information can be supplied without setting userspace up to fail. The > > padding in the new kvm_userspace_memory_region2 structure will be used to > > pass a file descriptor in addition to the userspace_addr, i.e. allow > > userspace to point at a file descriptor and map memory into a guest that > > is NOT mapped into host userspace. > > > > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" > > without a new ioctl(), but as Paolo pointed out, adding a new ioctl() > > makes detection of bad flags a bit more robust, e.g. if the new fd field > > is guarded only by a flag and not a new ioctl(), then a userspace bug > > (setting a "bad" flag) would generate out-of-bounds access instead of an > > -EINVAL error. > > > > Cc: Jarkko Sakkinen <jarkko@kernel.org> > > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> > > Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com> > > Signed-off-by: Sean Christopherson <seanjc@google.com> > > --- > > Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++ > > arch/x86/kvm/x86.c | 2 +- > > include/linux/kvm_host.h | 4 ++-- > > include/uapi/linux/kvm.h | 13 ++++++++++++ > > virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++------- > > 5 files changed, 67 insertions(+), 11 deletions(-) > > > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > > index 21a7578142a1..ace984acc125 100644 > > --- a/Documentation/virt/kvm/api.rst > > +++ b/Documentation/virt/kvm/api.rst > > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG > > interface. No error will be returned, but the resulting offset will not be > > applied. > > +4.139 KVM_SET_USER_MEMORY_REGION2 > > +--------------------------------- > > + > > +:Capability: KVM_CAP_USER_MEMORY2 > > +:Architectures: all > > +:Type: vm ioctl > > +:Parameters: struct kvm_userspace_memory_region2 (in) > > +:Returns: 0 on success, -1 on error > > + > > +:: > > + > > + struct kvm_userspace_memory_region2 { > > + __u32 slot; > > + __u32 flags; > > + __u64 guest_phys_addr; > > + __u64 memory_size; /* bytes */ > > + __u64 userspace_addr; /* start of the userspace allocated memory */ > > missing > > __u64 pad[16]; I can't even copy+paste correctly :-(
On 10/27/23 20:21, Sean Christopherson wrote: > > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > + size = sizeof(struct kvm_userspace_memory_region); This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds access of the commit message becomes a kernel stack read. Probably worth adding a check on valid flags here. Paolo > + else > + size = sizeof(struct kvm_userspace_memory_region2); > + > + /* Ensure the common parts of the two structs are identical. */ > + SANITY_CHECK_MEM_REGION_FIELD(slot); > + SANITY_CHECK_MEM_REGION_FIELD(flags); > + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); > + SANITY_CHECK_MEM_REGION_FIELD(memory_size); > + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); >
On Mon, Oct 30, 2023, Paolo Bonzini wrote: > On 10/27/23 20:21, Sean Christopherson wrote: > > > > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > > + size = sizeof(struct kvm_userspace_memory_region); > > This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds > access of the commit message becomes a kernel stack read. Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem)) though as '0' is a valid file descriptor and a valid file offset. > Probably worth adding a check on valid flags here. Definitely needed. There's a very real bug here. But rather than duplicate flags checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), what if we: 1. Acquire/release slots_lock in __kvm_set_memory_region() 2. Call kvm_set_memory_region() from x86 code for the internal memslots 3. Disallow *any* flags for internal memslots 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() 5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE only for KVM_SET_USER_MEMORY_REGION2 E.g. this over ~5 patches --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 +-- virt/kvm/kvm_main.c | 65 +++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3eb608b6692..dd3e2017366c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12478,7 +12478,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, &m); + r = kvm_set_memory_region(kvm, &m); if (r < 0) return ERR_PTR_USR(r); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 687589ce9f63..fbb98efe8200 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1170,7 +1170,7 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_ * -- just change its flags * * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): + * differentiation is the best we can do for __kvm_set_memory_region(). */ enum kvm_mr_change { KVM_MR_CREATE, @@ -1181,8 +1181,6 @@ enum kvm_mr_change { int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 23633984142f..39ceee2f67f2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1608,28 +1608,6 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem) -{ - u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; - - if (kvm_arch_has_private_mem(kvm)) - valid_flags |= KVM_MEM_PRIVATE; - - /* Dirty logging private memory is not currently supported. */ - if (mem->flags & KVM_MEM_PRIVATE) - valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; - -#ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; -#endif - - if (mem->flags & ~valid_flags) - return -EINVAL; - - return 0; -} - static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); @@ -2014,11 +1992,9 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * space. * * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding kvm->slots_lock for write. */ -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem) +static int __kvm_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2028,9 +2004,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; - r = check_memory_region_flags(kvm, mem); - if (r) - return r; + guard(mutex)(&kvm->slots_lock); as_id = mem->slot >> 16; id = (u16)mem->slot; @@ -2139,27 +2113,42 @@ int __kvm_set_memory_region(struct kvm *kvm, kfree(new); return r; } -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { - int r; + /* Flags aren't supported for KVM-internal memslots. */ + if (WARN_ON_ONCE(mem->flags)) + return -EINVAL; - mutex_lock(&kvm->slots_lock); - r = __kvm_set_memory_region(kvm, mem); - mutex_unlock(&kvm->slots_lock); - return r; + return __kvm_set_memory_region(kvm, mem); } EXPORT_SYMBOL_GPL(kvm_set_memory_region); -static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, unsigned int ioctl, struct kvm_userspace_memory_region2 *mem) { + u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + + if (ioctl == KVM_SET_USER_MEMORY_REGION2 && + kvm_arch_has_private_mem(kvm)) + valid_flags |= KVM_MEM_PRIVATE; + + /* Dirty logging private memory is not currently supported. */ + if (mem->flags & KVM_MEM_PRIVATE) + valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + +#ifdef __KVM_HAVE_READONLY_MEM + valid_flags |= KVM_MEM_READONLY; +#endif + + if (mem->flags & ~valid_flags) + return -EINVAL; + if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; - return kvm_set_memory_region(kvm, mem); + return __kvm_set_memory_region(kvm, mem); } #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -5145,7 +5134,7 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&mem, argp, size)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &mem); + r = kvm_vm_ioctl_set_memory_region(kvm, ioctl, &mem); break; } case KVM_GET_DIRTY_LOG: { base-commit: 881375a408c0f4ea451ff14545b59216d2923881 --
On 10/30/23 21:25, Sean Christopherson wrote: > On Mon, Oct 30, 2023, Paolo Bonzini wrote: >> On 10/27/23 20:21, Sean Christopherson wrote: >>> >>> + if (ioctl == KVM_SET_USER_MEMORY_REGION) >>> + size = sizeof(struct kvm_userspace_memory_region); >> >> This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds >> access of the commit message becomes a kernel stack read. > > Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem)) > though as '0' is a valid file descriptor and a valid file offset. Either is okay, because unless the flags check is screwed up it should not matter. The memset is actually unnecessary, though it may be a good idea anyway to keep it, aka belt-and-suspenders. >> Probably worth adding a check on valid flags here. > > Definitely needed. There's a very real bug here. But rather than duplicate flags > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we > have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), > what if we: > > 1. Acquire/release slots_lock in __kvm_set_memory_region() > 2. Call kvm_set_memory_region() from x86 code for the internal memslots > 3. Disallow *any* flags for internal memslots > 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() I dislike this step, there is a clear point where all paths meet (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region(). I think that's the place where flags should be checked. (I don't mind the restriction on internal memslots; it's just that to me it's not a particularly natural way to structure the checks). On the other hand, the place where to protect from out-of-bounds accesses, is the place where you stop caring about struct kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and your code gets it right, by dropping "ioctl" as soon as possible). diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 87f45aa91ced..fe5a2af14fff 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm) return true; } +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + static int check_memory_region_flags(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp, struct kvm_userspace_memory_region2 mem; unsigned long size; - if (ioctl == KVM_SET_USER_MEMORY_REGION) + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); size = sizeof(struct kvm_userspace_memory_region); - else + } else { size = sizeof(struct kvm_userspace_memory_region2); + } /* Ensure the common parts of the two structs are identical. */ SANITY_CHECK_MEM_REGION_FIELD(slot); @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&mem, argp, size)) goto out; + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS)) + goto out; + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } That's a kind of patch that you can't really get wrong (though I have the brown paper bag ready). Maintainance-wise it's fine, since flags are being added at a pace of roughly one every five years, and anyway it's also future proof: I placed the #define near check_memory_region_flags so that in five years we remember to keep it up to date. But worst case, the new flags will only be allowed by KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues waiting to bite us. In sum, this is exactly the only kind of fix that should be in the v13->v14 delta. Paolo
On Tue, Oct 31, 2023, Paolo Bonzini wrote: > On 10/30/23 21:25, Sean Christopherson wrote: > > > Probably worth adding a check on valid flags here. > > > > Definitely needed. There's a very real bug here. But rather than duplicate flags > > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we > > have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), > > what if we: > > > > 1. Acquire/release slots_lock in __kvm_set_memory_region() > > 2. Call kvm_set_memory_region() from x86 code for the internal memslots > > 3. Disallow *any* flags for internal memslots > > 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() > > I dislike this step, there is a clear point where all paths meet > (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region(). > I think that's the place where flags should be checked. (I don't mind > the restriction on internal memslots; it's just that to me it's not a > particularly natural way to structure the checks). Yeah, I just don't like the discrepancy it causes where some flags are explicitly checked and allowed, allowed and then later disallowed. > On the other hand, the place where to protect from out-of-bounds > accesses, is the place where you stop caring about struct > kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and > your code gets it right, by dropping "ioctl" as soon as possible). > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 87f45aa91ced..fe5a2af14fff 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm) > return true; > } > +/* > + * Flags that do not access any of the extra space of struct > + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS > + * only allows these. > + */ > +#define KVM_SET_USER_MEMORY_REGION_FLAGS \ Can we name this KVM_SET_USER_MEMORY_REGION_LEGACY_FLAGS, or something equally horrific? As is, this sounds way too much like a generic "allowed flags for any memory region". Or maybe invert the macro? I.e. something to make it more obvious that it's effectively a versioning check, not a generic "what's supported?" check. #define KVM_SET_USER_MEMORY_FLAGS_V2_ONLY \ (~(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) > + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) > + > static int check_memory_region_flags(struct kvm *kvm, > const struct kvm_userspace_memory_region2 *mem) > { > @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp, > struct kvm_userspace_memory_region2 mem; > unsigned long size; > - if (ioctl == KVM_SET_USER_MEMORY_REGION) > + if (ioctl == KVM_SET_USER_MEMORY_REGION) { > + /* > + * Fields beyond struct kvm_userspace_memory_region shouldn't be > + * accessed, but avoid leaking kernel memory in case of a bug. > + */ > + memset(&mem, 0, sizeof(mem)); > size = sizeof(struct kvm_userspace_memory_region); > - else > + } else { > size = sizeof(struct kvm_userspace_memory_region2); > + } > /* Ensure the common parts of the two structs are identical. */ > SANITY_CHECK_MEM_REGION_FIELD(slot); > @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp, > if (copy_from_user(&mem, argp, size)) > goto out; > + r = -EINVAL; > + if (ioctl == KVM_SET_USER_MEMORY_REGION && > + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS)) > + goto out; > + > r = kvm_vm_ioctl_set_memory_region(kvm, &mem); > break; > } > > > That's a kind of patch that you can't really get wrong (though I have > the brown paper bag ready). > > Maintainance-wise it's fine, since flags are being added at a pace of > roughly one every five years, Heh, true. > and anyway it's also future proof: I placed the #define near > check_memory_region_flags so that in five years we remember to keep it up to > date. But worst case, the new flags will only be allowed by > KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues > waiting to bite us. > > In sum, this is exactly the only kind of fix that should be in the v13->v14 > delta. Boiling the ocean can be fun too ;-)
On Mon, Oct 30, 2023, Sean Christopherson wrote: > On Mon, Oct 30, 2023, Paolo Bonzini wrote: > > On 10/27/23 20:21, Sean Christopherson wrote: > > > > > > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > > > + size = sizeof(struct kvm_userspace_memory_region); > > > > This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds > > access of the commit message becomes a kernel stack read. > > Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem)) > though as '0' is a valid file descriptor and a valid file offset. > > > Probably worth adding a check on valid flags here. > > Definitely needed. There's a very real bug here. But rather than duplicate flags > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we > have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), > what if we: > > 1. Acquire/release slots_lock in __kvm_set_memory_region() Gah, this won't work with x86's usage, which acquire slots_lock quite far away from __kvm_set_memory_region() in several places. The rest of the idea still works, kvm_vm_ioctl_set_memory_region() will just need to take slots_lock manually. > 2. Call kvm_set_memory_region() from x86 code for the internal memslots > 3. Disallow *any* flags for internal memslots > 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() > 5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE > only for KVM_SET_USER_MEMORY_REGION2
© 2016 - 2024 Red Hat, Inc.