RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

[Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Alexey Kardashevskiy 8 years, 2 months ago

In order to enable TCE operations support in KVM, we have to inform
the KVM about VFIO groups being attached to specific LIOBNs. The KVM
already knows about VFIO groups, the only bit missing is which
in-kernel TCE table (the one with user visible TCEs) should update
the attached broups. There is an KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE
attribute of the VFIO KVM device which receives a groupfd/tablefd couple.

This adds get_attr()/set_attr() to IOMMUMemoryRegionClass, like
iommu_ops::domain_get_attr/domain_set_attr in the Linux kernel.

This implements get_attr() for sPAPR IOMMU to return a TCE table fd
as an IOMMU_ATTR_KVM_FD attribute. This also reads now
the KVM_CAP_SPAPR_TCE_VFIO capability to prevent the TCE table from
reallocating to the userspace if the KVM can accelerate TCE operations.

This finally notifies the VFIO KVM device about new group being attached
to a LIOBN.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

Assuming it is accepted, does it make sense to split
include/exec/memory.h out and get merged separately?

---
 include/exec/memory.h | 10 ++++++++++
 target/ppc/kvm_ppc.h  |  6 ++++++
 hw/ppc/spapr_iommu.c  | 19 +++++++++++++++++++
 hw/vfio/common.c      | 24 ++++++++++++++++++++++++
 target/ppc/kvm.c      |  7 ++++++-
 hw/vfio/trace-events  |  1 +
 6 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 5ed4042..6395c6f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -190,6 +190,10 @@ struct MemoryRegionOps {
     const MemoryRegionMmio old_mmio;
 };
 
+enum IOMMUMemoryRegionAttr {
+    IOMMU_ATTR_KVM_FD
+};
+
 typedef struct IOMMUMemoryRegionClass {
     /* private */
     struct DeviceClass parent_class;
@@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass {
                                 IOMMUNotifierFlag new_flags);
     /* Set this up to provide customized IOMMU replay function */
     void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier);
+
+    /* Get/set IOMMU misc attributes */
+    int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
+                    void *data);
+    int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
+                    void *data);
 } IOMMUMemoryRegionClass;
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index d6be38e..2b985e1 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -48,6 +48,7 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
 int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
 int kvmppc_reset_htab(int shift_hint);
 uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
+bool kvmppc_has_cap_spapr_vfio(void);
 #endif /* !CONFIG_USER_ONLY */
 bool kvmppc_has_cap_epr(void);
 int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function);
@@ -231,6 +232,11 @@ static inline bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
     return true;
 }
 
+static inline bool kvmppc_has_cap_spapr_vfio(void)
+{
+    return false;
+}
+
 #endif /* !CONFIG_USER_ONLY */
 
 static inline bool kvmppc_has_cap_epr(void)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 5ccd785..ce8a769 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -17,6 +17,7 @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 #include "qemu/osdep.h"
+#include <sys/ioctl.h>
 #include "qemu/error-report.h"
 #include "hw/hw.h"
 #include "qemu/log.h"
@@ -160,6 +161,19 @@ static uint64_t spapr_tce_get_min_page_size(IOMMUMemoryRegion *iommu)
     return 1ULL << tcet->page_shift;
 }
 
+static int spapr_tce_get_attr(IOMMUMemoryRegion *iommu,
+                              enum IOMMUMemoryRegionAttr attr, void *data)
+{
+    sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
+
+    if (attr == IOMMU_ATTR_KVM_FD && kvmppc_has_cap_spapr_vfio()) {
+        *(int *) data = tcet->fd;
+        return 0;
+    }
+
+    return -EINVAL;
+}
+
 static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
                                           IOMMUNotifierFlag old,
                                           IOMMUNotifierFlag new)
@@ -284,6 +298,10 @@ void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio)
 
     tcet->need_vfio = need_vfio;
 
+    if (!need_vfio || (tcet->fd != -1 && kvmppc_has_cap_spapr_vfio())) {
+        return;
+    }
+
     oldtable = tcet->table;
 
     tcet->table = spapr_tce_alloc_table(tcet->liobn,
@@ -643,6 +661,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
     imrc->translate = spapr_tce_translate_iommu;
     imrc->get_min_page_size = spapr_tce_get_min_page_size;
     imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
+    imrc->get_attr = spapr_tce_get_attr;
 }
 
 static const TypeInfo spapr_iommu_memory_region_info = {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index cd81cc9..ed7717d 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -480,6 +480,30 @@ static void vfio_listener_region_add(MemoryListener *listener,
     if (memory_region_is_iommu(section->mr)) {
         VFIOGuestIOMMU *giommu;
         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+#ifdef CONFIG_KVM
+        struct kvm_vfio_spapr_tce param;
+        IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+        VFIOGroup *group;
+        struct kvm_device_attr attr = {
+            .group = KVM_DEV_VFIO_GROUP,
+            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+            .addr = (uint64_t)(unsigned long)&param,
+        };
+
+        if (kvm_enabled() && imrc->get_attr &&
+            !imrc->get_attr(iommu_mr, IOMMU_ATTR_KVM_FD, &param.tablefd)) {
+
+            QLIST_FOREACH(group, &container->group_list, container_next) {
+                param.groupfd = group->fd;
+                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+                    error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
+                                 param.tablefd, param.groupfd, strerror(errno));
+                    return;
+                }
+                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
+            }
+        }
+#endif
 
         trace_vfio_listener_region_add_iommu(iova, end);
         /*
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 9d57deb..da7590a 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -136,7 +136,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
     cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
     cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
-    cap_spapr_vfio = false;
+    cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO);
     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
@@ -2474,6 +2474,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
     return cap_mmu_hash_v3;
 }
 
+bool kvmppc_has_cap_spapr_vfio(void)
+{
+    return cap_spapr_vfio;
+}
+
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
 {
     uint32_t host_pvr = mfpvr();
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index fae096c..3d34fe8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0
 vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0x%"PRIx64" ret=%d"
 vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
 vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64
+vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
-- 
2.11.0

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Alex Williamson 8 years, 2 months ago

On Tue, 12 Dec 2017 16:18:53 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> In order to enable TCE operations support in KVM, we have to inform
> the KVM about VFIO groups being attached to specific LIOBNs. The KVM
> already knows about VFIO groups, the only bit missing is which
> in-kernel TCE table (the one with user visible TCEs) should update
> the attached broups. There is an KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE
> attribute of the VFIO KVM device which receives a groupfd/tablefd couple.
> 
> This adds get_attr()/set_attr() to IOMMUMemoryRegionClass, like
> iommu_ops::domain_get_attr/domain_set_attr in the Linux kernel.
> 
> This implements get_attr() for sPAPR IOMMU to return a TCE table fd
> as an IOMMU_ATTR_KVM_FD attribute. This also reads now
> the KVM_CAP_SPAPR_TCE_VFIO capability to prevent the TCE table from
> reallocating to the userspace if the KVM can accelerate TCE operations.
> 
> This finally notifies the VFIO KVM device about new group being attached
> to a LIOBN.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> Assuming it is accepted, does it make sense to split
> include/exec/memory.h out and get merged separately?
> 
> ---
>  include/exec/memory.h | 10 ++++++++++
>  target/ppc/kvm_ppc.h  |  6 ++++++
>  hw/ppc/spapr_iommu.c  | 19 +++++++++++++++++++
>  hw/vfio/common.c      | 24 ++++++++++++++++++++++++
>  target/ppc/kvm.c      |  7 ++++++-
>  hw/vfio/trace-events  |  1 +
>  6 files changed, 66 insertions(+), 1 deletion(-)
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 5ed4042..6395c6f 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -190,6 +190,10 @@ struct MemoryRegionOps {
>      const MemoryRegionMmio old_mmio;
>  };
>  
> +enum IOMMUMemoryRegionAttr {
> +    IOMMU_ATTR_KVM_FD

You're generalizing the wrong thing here, this is specifically a
SPAPR_TCE_FD, call it that.

> +};
> +
>  typedef struct IOMMUMemoryRegionClass {
>      /* private */
>      struct DeviceClass parent_class;
> @@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass {
>                                  IOMMUNotifierFlag new_flags);
>      /* Set this up to provide customized IOMMU replay function */
>      void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier);
> +
> +    /* Get/set IOMMU misc attributes */
> +    int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
> +                    void *data);
> +    int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
> +                    void *data);
>  } IOMMUMemoryRegionClass;
>  
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index d6be38e..2b985e1 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -48,6 +48,7 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
>  int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
>  int kvmppc_reset_htab(int shift_hint);
>  uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
> +bool kvmppc_has_cap_spapr_vfio(void);
>  #endif /* !CONFIG_USER_ONLY */
>  bool kvmppc_has_cap_epr(void);
>  int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function);
> @@ -231,6 +232,11 @@ static inline bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
>      return true;
>  }
>  
> +static inline bool kvmppc_has_cap_spapr_vfio(void)
> +{
> +    return false;
> +}
> +
>  #endif /* !CONFIG_USER_ONLY */
>  
>  static inline bool kvmppc_has_cap_epr(void)
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 5ccd785..ce8a769 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -17,6 +17,7 @@
>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>   */
>  #include "qemu/osdep.h"
> +#include <sys/ioctl.h>
>  #include "qemu/error-report.h"
>  #include "hw/hw.h"
>  #include "qemu/log.h"
> @@ -160,6 +161,19 @@ static uint64_t spapr_tce_get_min_page_size(IOMMUMemoryRegion *iommu)
>      return 1ULL << tcet->page_shift;
>  }
>  
> +static int spapr_tce_get_attr(IOMMUMemoryRegion *iommu,
> +                              enum IOMMUMemoryRegionAttr attr, void *data)
> +{
> +    sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
> +
> +    if (attr == IOMMU_ATTR_KVM_FD && kvmppc_has_cap_spapr_vfio()) {
> +        *(int *) data = tcet->fd;
> +        return 0;
> +    }
> +
> +    return -EINVAL;
> +}
> +
>  static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>                                            IOMMUNotifierFlag old,
>                                            IOMMUNotifierFlag new)
> @@ -284,6 +298,10 @@ void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio)
>  
>      tcet->need_vfio = need_vfio;
>  
> +    if (!need_vfio || (tcet->fd != -1 && kvmppc_has_cap_spapr_vfio())) {
> +        return;
> +    }
> +
>      oldtable = tcet->table;
>  
>      tcet->table = spapr_tce_alloc_table(tcet->liobn,
> @@ -643,6 +661,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>      imrc->translate = spapr_tce_translate_iommu;
>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
> +    imrc->get_attr = spapr_tce_get_attr;
>  }
>  
>  static const TypeInfo spapr_iommu_memory_region_info = {
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index cd81cc9..ed7717d 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -480,6 +480,30 @@ static void vfio_listener_region_add(MemoryListener *listener,
>      if (memory_region_is_iommu(section->mr)) {
>          VFIOGuestIOMMU *giommu;
>          IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
> +#ifdef CONFIG_KVM
> +        struct kvm_vfio_spapr_tce param;
> +        IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
> +        VFIOGroup *group;
> +        struct kvm_device_attr attr = {
> +            .group = KVM_DEV_VFIO_GROUP,
> +            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
> +            .addr = (uint64_t)(unsigned long)&param,
> +        };
> +
> +        if (kvm_enabled() && imrc->get_attr &&
> +            !imrc->get_attr(iommu_mr, IOMMU_ATTR_KVM_FD, &param.tablefd)) {

Imagine if another iommu implemented support for something as generic
sounding as KVM_FD and we go and call SPAPR_TCE specific setup for it...

If the get_attr() interface is acceptable in the memory API, it'd be
preferable to abstract it such that every caller doesn't need to test
for it.  Also there are ~3 patches here, implementing the memory API
change, implementing the spapr support for it, implementing the vfio
support.  Thanks,

Alex

> +
> +            QLIST_FOREACH(group, &container->group_list, container_next) {
> +                param.groupfd = group->fd;
> +                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> +                    error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
> +                                 param.tablefd, param.groupfd, strerror(errno));
> +                    return;
> +                }
> +                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
> +            }
> +        }
> +#endif
>  
>          trace_vfio_listener_region_add_iommu(iova, end);
>          /*
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 9d57deb..da7590a 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -136,7 +136,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
>      cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
>      cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
> -    cap_spapr_vfio = false;
> +    cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO);
>      cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>      cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>      cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
> @@ -2474,6 +2474,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>      return cap_mmu_hash_v3;
>  }
>  
> +bool kvmppc_has_cap_spapr_vfio(void)
> +{
> +    return cap_spapr_vfio;
> +}
> +
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
>  {
>      uint32_t host_pvr = mfpvr();
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index fae096c..3d34fe8 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0
>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0x%"PRIx64" ret=%d"
>  vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>  vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64
> +vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Paolo Bonzini 8 years, 1 month ago

On 12/12/2017 06:46, Alex Williamson wrote:
>> +enum IOMMUMemoryRegionAttr {
>> +    IOMMU_ATTR_KVM_FD
> 
> You're generalizing the wrong thing here, this is specifically a
> SPAPR_TCE_FD, call it that.

... and you're not even implementing set_attr, so let's drop it.

My suggestion is to add a function in hw/vfio:

    int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont,
                                            int tablefd);

and an IOMMUMemoryRegionClass member:

    int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
                                    VFIOContainer *cont)

Then your implementation for the latter is as simple as this:

    if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) {
        sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
        return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd);
    }

Thanks,

Paolo

>> +};
>> +
>>  typedef struct IOMMUMemoryRegionClass {
>>      /* private */
>>      struct DeviceClass parent_class;
>> @@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass {
>>                                  IOMMUNotifierFlag new_flags);
>>      /* Set this up to provide customized IOMMU replay function */
>>      void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier);
>> +
>> +    /* Get/set IOMMU misc attributes */
>> +    int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
>> +                    void *data);
>> +    int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
>> +                    void *data);

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Alex Williamson 8 years, 1 month ago

On Tue, 19 Dec 2017 12:12:35 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 12/12/2017 06:46, Alex Williamson wrote:
> >> +enum IOMMUMemoryRegionAttr {
> >> +    IOMMU_ATTR_KVM_FD  
> > 
> > You're generalizing the wrong thing here, this is specifically a
> > SPAPR_TCE_FD, call it that.  
> 
> ... and you're not even implementing set_attr, so let's drop it.
> 
> My suggestion is to add a function in hw/vfio:
> 
>     int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont,
>                                             int tablefd);
> 
> and an IOMMUMemoryRegionClass member:
> 
>     int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
>                                     VFIOContainer *cont)
> 
> Then your implementation for the latter is as simple as this:
> 
>     if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) {
>         sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
>         return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd);
>     }

Ugh, exactly the sort of interface I've been trying to avoid, vfio
specific callbacks on common data structures handing out vfio private
data pointers, requiring additional exported functions from vfio for
each new user of it.  Why is this better?  Thanks,

Alex

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Paolo Bonzini 8 years, 1 month ago

On 19/12/2017 15:09, Alex Williamson wrote:
> On Tue, 19 Dec 2017 12:12:35 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> On 12/12/2017 06:46, Alex Williamson wrote:
>>>> +enum IOMMUMemoryRegionAttr {
>>>> +    IOMMU_ATTR_KVM_FD  
>>>
>>> You're generalizing the wrong thing here, this is specifically a
>>> SPAPR_TCE_FD, call it that.  
>>
>> ... and you're not even implementing set_attr, so let's drop it.
>>
>> My suggestion is to add a function in hw/vfio:
>>
>>     int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont,
>>                                             int tablefd);
>>
>> and an IOMMUMemoryRegionClass member:
>>
>>     int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
>>                                     VFIOContainer *cont)
>>
>> Then your implementation for the latter is as simple as this:
>>
>>     if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) {
>>         sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
>>         return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd);
>>     }
> 
> Ugh, exactly the sort of interface I've been trying to avoid, vfio
> specific callbacks on common data structures handing out vfio private
> data pointers,

True, VFIOContainer* is private, but in those declarations it's also opaque.

The VFIO container is the representation of the IOMMU, so it makes sense
to me to have a function to set it up according to QEMU's IOMMU object.
I don't think we will be introducing another object soon that is similar
to the VFIO container.

> requiring additional exported functions from vfio for
> each new user of it.  Why is this better?

I understand that you don't like having many exported functions, but you
are just pushing the problem on the memory.h side where you'd get many
attribute enums.

I suppose it would pay if you have many attributes and many clients,
and/or if the attributes need to be public similar to sysfs.  I mention
public vs. private because, before inventing a new mechanism for
attributes, it would make sense to look at QOM properties.  However,
here we have one client (VFIO) and one attribute (the KVM IOMMU fd) that
is pretty much internal to QEMU.

So, depending on the direction you want to pull/push the information to,
I would do either:

- as above, a generic IOMMU callback

     int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
                                     VFIOContainer *cont);

- an object-type check in VFIO, followed by calling a new function

     int spapr_tce_iommu_get_kvm_fd(IOMMUMemoryRegion *iommu);

- if we foresee having more IOMMU devices in KVM, let's rename
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and
add a new function

     int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu);

that requires no object-type check in VFIO.

Paolo

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Alexey Kardashevskiy 8 years, 1 month ago

On 20/12/17 01:59, Paolo Bonzini wrote:
> On 19/12/2017 15:09, Alex Williamson wrote:
>> On Tue, 19 Dec 2017 12:12:35 +0100
>> Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>> On 12/12/2017 06:46, Alex Williamson wrote:
>>>>> +enum IOMMUMemoryRegionAttr {
>>>>> +    IOMMU_ATTR_KVM_FD  
>>>>
>>>> You're generalizing the wrong thing here, this is specifically a
>>>> SPAPR_TCE_FD, call it that.  
>>>
>>> ... and you're not even implementing set_attr, so let's drop it.
>>>
>>> My suggestion is to add a function in hw/vfio:
>>>
>>>     int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont,
>>>                                             int tablefd);
>>>
>>> and an IOMMUMemoryRegionClass member:
>>>
>>>     int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
>>>                                     VFIOContainer *cont)
>>>
>>> Then your implementation for the latter is as simple as this:
>>>
>>>     if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) {
>>>         sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
>>>         return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd);
>>>     }
>>
>> Ugh, exactly the sort of interface I've been trying to avoid, vfio
>> specific callbacks on common data structures handing out vfio private
>> data pointers,
> 
> True, VFIOContainer* is private, but in those declarations it's also opaque.
> 
> The VFIO container is the representation of the IOMMU, so it makes sense
> to me to have a function to set it up according to QEMU's IOMMU object.
> I don't think we will be introducing another object soon that is similar
> to the VFIO container.
> 
>> requiring additional exported functions from vfio for
>> each new user of it.  Why is this better?
> 
> I understand that you don't like having many exported functions, but you
> are just pushing the problem on the memory.h side where you'd get many
> attribute enums.
> 
> I suppose it would pay if you have many attributes and many clients,
> and/or if the attributes need to be public similar to sysfs.  I mention
> public vs. private because, before inventing a new mechanism for
> attributes, it would make sense to look at QOM properties.  However,
> here we have one client (VFIO) and one attribute (the KVM IOMMU fd) that
> is pretty much internal to QEMU.
> 
> So, depending on the direction you want to pull/push the information to,
> I would do either:
> 
> - as above, a generic IOMMU callback
> 
>      int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
>                                      VFIOContainer *cont);
> 
> - an object-type check in VFIO, followed by calling a new function
> 
>      int spapr_tce_iommu_get_kvm_fd(IOMMUMemoryRegion *iommu);
> 
> - if we foresee having more IOMMU devices in KVM, let's rename
> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and
> add a new function
> 
>      int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu);
> 
> that requires no object-type check in VFIO.

This is how it started and the comment was that this KVM fd needs to have a
well defined semantic which I struggle to provide.

I could just implement a "spapr-kvm-fd" QOM property on sPAPR's IOMMU MR if
this is acceptable.



-- 
Alexey

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Paolo Bonzini 8 years, 1 month ago

On 20/12/2017 02:47, Alexey Kardashevskiy wrote:
>> - if we foresee having more IOMMU devices in KVM, let's rename
>> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and
>> add a new function
>>
>>      int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu);
>>
>> that requires no object-type check in VFIO.
>
> This is how it started and the comment was that this KVM fd needs to have a
> well defined semantic which I struggle to provide.

I don't think the definition should be anything more than "it can be
passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU".  I'll discuss it with Alex
later today.

Paolo

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Alexey Kardashevskiy 8 years, 1 month ago

On 20/12/17 20:04, Paolo Bonzini wrote:
> On 20/12/2017 02:47, Alexey Kardashevskiy wrote:
>>> - if we foresee having more IOMMU devices in KVM, let's rename
>>> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and
>>> add a new function
>>>
>>>      int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu);
>>>
>>> that requires no object-type check in VFIO.
>>
>> This is how it started and the comment was that this KVM fd needs to have a
>> well defined semantic which I struggle to provide.
> 
> I don't think the definition should be anything more than "it can be
> passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU".  I'll discuss it with Alex
> later today.

That was more David's concern :)


-- 
Alexey

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by Paolo Bonzini 8 years, 1 month ago

On 20/12/2017 15:13, Alexey Kardashevskiy wrote:
> On 20/12/17 20:04, Paolo Bonzini wrote:
>> On 20/12/2017 02:47, Alexey Kardashevskiy wrote:
>>>> - if we foresee having more IOMMU devices in KVM, let's rename
>>>> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and
>>>> add a new function
>>>>
>>>>      int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu);
>>>>
>>>> that requires no object-type check in VFIO.
>>>
>>> This is how it started and the comment was that this KVM fd needs to have a
>>> well defined semantic which I struggle to provide.
>>
>> I don't think the definition should be anything more than "it can be
>> passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU".  I'll discuss it with Alex
>> later today.
> 
> That was more David's concern :)

The get_attr API is okay, just remove the unused set_attr and follow the
other review comments from Alex.

Paolo

Re: [Qemu-devel] [PATCH qemu] RFC: spapr/iommu: Enable in-kernel TCE acceleration via VFIO KVM device

Posted by David Gibson 8 years ago

On Tue, Dec 19, 2017 at 03:59:59PM +0100, Paolo Bonzini wrote:
> On 19/12/2017 15:09, Alex Williamson wrote:
> > On Tue, 19 Dec 2017 12:12:35 +0100
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> >> On 12/12/2017 06:46, Alex Williamson wrote:
> >>>> +enum IOMMUMemoryRegionAttr {
> >>>> +    IOMMU_ATTR_KVM_FD  
> >>>
> >>> You're generalizing the wrong thing here, this is specifically a
> >>> SPAPR_TCE_FD, call it that.  
> >>
> >> ... and you're not even implementing set_attr, so let's drop it.
> >>
> >> My suggestion is to add a function in hw/vfio:
> >>
> >>     int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont,
> >>                                             int tablefd);
> >>
> >> and an IOMMUMemoryRegionClass member:
> >>
> >>     int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu,
> >>                                     VFIOContainer *cont)
> >>
> >> Then your implementation for the latter is as simple as this:
> >>
> >>     if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) {
> >>         sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
> >>         return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd);
> >>     }
> > 
> > Ugh, exactly the sort of interface I've been trying to avoid, vfio
> > specific callbacks on common data structures handing out vfio private
> > data pointers,
> 
> True, VFIOContainer* is private, but in those declarations it's also opaque.
> 
> The VFIO container is the representation of the IOMMU, so it makes sense
> to me to have a function to set it up according to QEMU's IOMMU object.
> I don't think we will be introducing another object soon that is similar
> to the VFIO container.
> 
> > requiring additional exported functions from vfio for
> > each new user of it.  Why is this better?
> 
> I understand that you don't like having many exported functions, but you
> are just pushing the problem on the memory.h side where you'd get many
> attribute enums.

It's more than just enums, doing it the other way around is putting
fairly intimate knowledge of a specific guest IOMMU workings into the
VFIO code.

Fundamentally this *requires* linking vfio knowledge to guest iommu
(kvm) knowledge, so some cross linkage we'd usually want to avoid is
inevitable.  I don't see that there's a strong argument for whether we
put the bit of vfio knowledge into the spapr viommu or the bit of
spapr viommu knowledge into vfio.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson