include/exec/memory.h | 10 ++++++++++ target/ppc/kvm_ppc.h | 6 ++++++ hw/ppc/spapr_iommu.c | 19 +++++++++++++++++++ hw/vfio/common.c | 24 ++++++++++++++++++++++++ target/ppc/kvm.c | 7 ++++++- hw/vfio/trace-events | 1 + 6 files changed, 66 insertions(+), 1 deletion(-)
In order to enable TCE operations support in KVM, we have to inform
the KVM about VFIO groups being attached to specific LIOBNs. The KVM
already knows about VFIO groups, the only bit missing is which
in-kernel TCE table (the one with user visible TCEs) should update
the attached broups. There is an KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE
attribute of the VFIO KVM device which receives a groupfd/tablefd couple.
This adds get_attr()/set_attr() to IOMMUMemoryRegionClass, like
iommu_ops::domain_get_attr/domain_set_attr in the Linux kernel.
This implements get_attr() for sPAPR IOMMU to return a TCE table fd
as an IOMMU_ATTR_KVM_FD attribute. This also reads now
the KVM_CAP_SPAPR_TCE_VFIO capability to prevent the TCE table from
reallocating to the userspace if the KVM can accelerate TCE operations.
This finally notifies the VFIO KVM device about new group being attached
to a LIOBN.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Assuming it is accepted, does it make sense to split
include/exec/memory.h out and get merged separately?
---
include/exec/memory.h | 10 ++++++++++
target/ppc/kvm_ppc.h | 6 ++++++
hw/ppc/spapr_iommu.c | 19 +++++++++++++++++++
hw/vfio/common.c | 24 ++++++++++++++++++++++++
target/ppc/kvm.c | 7 ++++++-
hw/vfio/trace-events | 1 +
6 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 5ed4042..6395c6f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -190,6 +190,10 @@ struct MemoryRegionOps {
const MemoryRegionMmio old_mmio;
};
+enum IOMMUMemoryRegionAttr {
+ IOMMU_ATTR_KVM_FD
+};
+
typedef struct IOMMUMemoryRegionClass {
/* private */
struct DeviceClass parent_class;
@@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass {
IOMMUNotifierFlag new_flags);
/* Set this up to provide customized IOMMU replay function */
void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier);
+
+ /* Get/set IOMMU misc attributes */
+ int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
+ void *data);
+ int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr,
+ void *data);
} IOMMUMemoryRegionClass;
typedef struct CoalescedMemoryRange CoalescedMemoryRange;
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index d6be38e..2b985e1 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -48,6 +48,7 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
int kvmppc_reset_htab(int shift_hint);
uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
+bool kvmppc_has_cap_spapr_vfio(void);
#endif /* !CONFIG_USER_ONLY */
bool kvmppc_has_cap_epr(void);
int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function);
@@ -231,6 +232,11 @@ static inline bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
return true;
}
+static inline bool kvmppc_has_cap_spapr_vfio(void)
+{
+ return false;
+}
+
#endif /* !CONFIG_USER_ONLY */
static inline bool kvmppc_has_cap_epr(void)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 5ccd785..ce8a769 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -17,6 +17,7 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
+#include <sys/ioctl.h>
#include "qemu/error-report.h"
#include "hw/hw.h"
#include "qemu/log.h"
@@ -160,6 +161,19 @@ static uint64_t spapr_tce_get_min_page_size(IOMMUMemoryRegion *iommu)
return 1ULL << tcet->page_shift;
}
+static int spapr_tce_get_attr(IOMMUMemoryRegion *iommu,
+ enum IOMMUMemoryRegionAttr attr, void *data)
+{
+ sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu);
+
+ if (attr == IOMMU_ATTR_KVM_FD && kvmppc_has_cap_spapr_vfio()) {
+ *(int *) data = tcet->fd;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
IOMMUNotifierFlag old,
IOMMUNotifierFlag new)
@@ -284,6 +298,10 @@ void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio)
tcet->need_vfio = need_vfio;
+ if (!need_vfio || (tcet->fd != -1 && kvmppc_has_cap_spapr_vfio())) {
+ return;
+ }
+
oldtable = tcet->table;
tcet->table = spapr_tce_alloc_table(tcet->liobn,
@@ -643,6 +661,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
imrc->translate = spapr_tce_translate_iommu;
imrc->get_min_page_size = spapr_tce_get_min_page_size;
imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
+ imrc->get_attr = spapr_tce_get_attr;
}
static const TypeInfo spapr_iommu_memory_region_info = {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index cd81cc9..ed7717d 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -480,6 +480,30 @@ static void vfio_listener_region_add(MemoryListener *listener,
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+#ifdef CONFIG_KVM
+ struct kvm_vfio_spapr_tce param;
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+ VFIOGroup *group;
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_GROUP,
+ .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+ .addr = (uint64_t)(unsigned long)¶m,
+ };
+
+ if (kvm_enabled() && imrc->get_attr &&
+ !imrc->get_attr(iommu_mr, IOMMU_ATTR_KVM_FD, ¶m.tablefd)) {
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ param.groupfd = group->fd;
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
+ param.tablefd, param.groupfd, strerror(errno));
+ return;
+ }
+ trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
+ }
+ }
+#endif
trace_vfio_listener_region_add_iommu(iova, end);
/*
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 9d57deb..da7590a 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -136,7 +136,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
- cap_spapr_vfio = false;
+ cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO);
cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
@@ -2474,6 +2474,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
return cap_mmu_hash_v3;
}
+bool kvmppc_has_cap_spapr_vfio(void)
+{
+ return cap_spapr_vfio;
+}
+
PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
{
uint32_t host_pvr = mfpvr();
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index fae096c..3d34fe8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0
vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0x%"PRIx64" ret=%d"
vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64
+vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
--
2.11.0
On Tue, 12 Dec 2017 16:18:53 +1100 Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > In order to enable TCE operations support in KVM, we have to inform > the KVM about VFIO groups being attached to specific LIOBNs. The KVM > already knows about VFIO groups, the only bit missing is which > in-kernel TCE table (the one with user visible TCEs) should update > the attached broups. There is an KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE > attribute of the VFIO KVM device which receives a groupfd/tablefd couple. > > This adds get_attr()/set_attr() to IOMMUMemoryRegionClass, like > iommu_ops::domain_get_attr/domain_set_attr in the Linux kernel. > > This implements get_attr() for sPAPR IOMMU to return a TCE table fd > as an IOMMU_ATTR_KVM_FD attribute. This also reads now > the KVM_CAP_SPAPR_TCE_VFIO capability to prevent the TCE table from > reallocating to the userspace if the KVM can accelerate TCE operations. > > This finally notifies the VFIO KVM device about new group being attached > to a LIOBN. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > > Assuming it is accepted, does it make sense to split > include/exec/memory.h out and get merged separately? > > --- > include/exec/memory.h | 10 ++++++++++ > target/ppc/kvm_ppc.h | 6 ++++++ > hw/ppc/spapr_iommu.c | 19 +++++++++++++++++++ > hw/vfio/common.c | 24 ++++++++++++++++++++++++ > target/ppc/kvm.c | 7 ++++++- > hw/vfio/trace-events | 1 + > 6 files changed, 66 insertions(+), 1 deletion(-) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 5ed4042..6395c6f 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -190,6 +190,10 @@ struct MemoryRegionOps { > const MemoryRegionMmio old_mmio; > }; > > +enum IOMMUMemoryRegionAttr { > + IOMMU_ATTR_KVM_FD You're generalizing the wrong thing here, this is specifically a SPAPR_TCE_FD, call it that. > +}; > + > typedef struct IOMMUMemoryRegionClass { > /* private */ > struct DeviceClass parent_class; > @@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass { > IOMMUNotifierFlag new_flags); > /* Set this up to provide customized IOMMU replay function */ > void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier); > + > + /* Get/set IOMMU misc attributes */ > + int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr, > + void *data); > + int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr, > + void *data); > } IOMMUMemoryRegionClass; > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h > index d6be38e..2b985e1 100644 > --- a/target/ppc/kvm_ppc.h > +++ b/target/ppc/kvm_ppc.h > @@ -48,6 +48,7 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift, > int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); > int kvmppc_reset_htab(int shift_hint); > uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift); > +bool kvmppc_has_cap_spapr_vfio(void); > #endif /* !CONFIG_USER_ONLY */ > bool kvmppc_has_cap_epr(void); > int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function); > @@ -231,6 +232,11 @@ static inline bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path) > return true; > } > > +static inline bool kvmppc_has_cap_spapr_vfio(void) > +{ > + return false; > +} > + > #endif /* !CONFIG_USER_ONLY */ > > static inline bool kvmppc_has_cap_epr(void) > diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c > index 5ccd785..ce8a769 100644 > --- a/hw/ppc/spapr_iommu.c > +++ b/hw/ppc/spapr_iommu.c > @@ -17,6 +17,7 @@ > * License along with this library; if not, see <http://www.gnu.org/licenses/>. > */ > #include "qemu/osdep.h" > +#include <sys/ioctl.h> > #include "qemu/error-report.h" > #include "hw/hw.h" > #include "qemu/log.h" > @@ -160,6 +161,19 @@ static uint64_t spapr_tce_get_min_page_size(IOMMUMemoryRegion *iommu) > return 1ULL << tcet->page_shift; > } > > +static int spapr_tce_get_attr(IOMMUMemoryRegion *iommu, > + enum IOMMUMemoryRegionAttr attr, void *data) > +{ > + sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); > + > + if (attr == IOMMU_ATTR_KVM_FD && kvmppc_has_cap_spapr_vfio()) { > + *(int *) data = tcet->fd; > + return 0; > + } > + > + return -EINVAL; > +} > + > static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, > IOMMUNotifierFlag old, > IOMMUNotifierFlag new) > @@ -284,6 +298,10 @@ void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio) > > tcet->need_vfio = need_vfio; > > + if (!need_vfio || (tcet->fd != -1 && kvmppc_has_cap_spapr_vfio())) { > + return; > + } > + > oldtable = tcet->table; > > tcet->table = spapr_tce_alloc_table(tcet->liobn, > @@ -643,6 +661,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data) > imrc->translate = spapr_tce_translate_iommu; > imrc->get_min_page_size = spapr_tce_get_min_page_size; > imrc->notify_flag_changed = spapr_tce_notify_flag_changed; > + imrc->get_attr = spapr_tce_get_attr; > } > > static const TypeInfo spapr_iommu_memory_region_info = { > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index cd81cc9..ed7717d 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -480,6 +480,30 @@ static void vfio_listener_region_add(MemoryListener *listener, > if (memory_region_is_iommu(section->mr)) { > VFIOGuestIOMMU *giommu; > IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); > +#ifdef CONFIG_KVM > + struct kvm_vfio_spapr_tce param; > + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > + VFIOGroup *group; > + struct kvm_device_attr attr = { > + .group = KVM_DEV_VFIO_GROUP, > + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, > + .addr = (uint64_t)(unsigned long)¶m, > + }; > + > + if (kvm_enabled() && imrc->get_attr && > + !imrc->get_attr(iommu_mr, IOMMU_ATTR_KVM_FD, ¶m.tablefd)) { Imagine if another iommu implemented support for something as generic sounding as KVM_FD and we go and call SPAPR_TCE specific setup for it... If the get_attr() interface is acceptable in the memory API, it'd be preferable to abstract it such that every caller doesn't need to test for it. Also there are ~3 patches here, implementing the memory API change, implementing the spapr support for it, implementing the vfio support. Thanks, Alex > + > + QLIST_FOREACH(group, &container->group_list, container_next) { > + param.groupfd = group->fd; > + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { > + error_report("vfio: failed to setup fd %d for a group with fd %d: %s", > + param.tablefd, param.groupfd, strerror(errno)); > + return; > + } > + trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); > + } > + } > +#endif > > trace_vfio_listener_region_add_iommu(iova, end); > /* > diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > index 9d57deb..da7590a 100644 > --- a/target/ppc/kvm.c > +++ b/target/ppc/kvm.c > @@ -136,7 +136,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) > cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); > cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64); > cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE); > - cap_spapr_vfio = false; > + cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO); > cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG); > cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR); > cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR); > @@ -2474,6 +2474,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void) > return cap_mmu_hash_v3; > } > > +bool kvmppc_has_cap_spapr_vfio(void) > +{ > + return cap_spapr_vfio; > +} > + > PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void) > { > uint32_t host_pvr = mfpvr(); > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index fae096c..3d34fe8 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0 > vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" size=0x%"PRIx64" ret=%d" > vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 > vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64 > +vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
On 12/12/2017 06:46, Alex Williamson wrote: >> +enum IOMMUMemoryRegionAttr { >> + IOMMU_ATTR_KVM_FD > > You're generalizing the wrong thing here, this is specifically a > SPAPR_TCE_FD, call it that. ... and you're not even implementing set_attr, so let's drop it. My suggestion is to add a function in hw/vfio: int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont, int tablefd); and an IOMMUMemoryRegionClass member: int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, VFIOContainer *cont) Then your implementation for the latter is as simple as this: if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) { sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd); } Thanks, Paolo >> +}; >> + >> typedef struct IOMMUMemoryRegionClass { >> /* private */ >> struct DeviceClass parent_class; >> @@ -210,6 +214,12 @@ typedef struct IOMMUMemoryRegionClass { >> IOMMUNotifierFlag new_flags); >> /* Set this up to provide customized IOMMU replay function */ >> void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier); >> + >> + /* Get/set IOMMU misc attributes */ >> + int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr, >> + void *data); >> + int (*set_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr, >> + void *data);
On Tue, 19 Dec 2017 12:12:35 +0100 Paolo Bonzini <pbonzini@redhat.com> wrote: > On 12/12/2017 06:46, Alex Williamson wrote: > >> +enum IOMMUMemoryRegionAttr { > >> + IOMMU_ATTR_KVM_FD > > > > You're generalizing the wrong thing here, this is specifically a > > SPAPR_TCE_FD, call it that. > > ... and you're not even implementing set_attr, so let's drop it. > > My suggestion is to add a function in hw/vfio: > > int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont, > int tablefd); > > and an IOMMUMemoryRegionClass member: > > int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, > VFIOContainer *cont) > > Then your implementation for the latter is as simple as this: > > if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) { > sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); > return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd); > } Ugh, exactly the sort of interface I've been trying to avoid, vfio specific callbacks on common data structures handing out vfio private data pointers, requiring additional exported functions from vfio for each new user of it. Why is this better? Thanks, Alex
On 19/12/2017 15:09, Alex Williamson wrote: > On Tue, 19 Dec 2017 12:12:35 +0100 > Paolo Bonzini <pbonzini@redhat.com> wrote: > >> On 12/12/2017 06:46, Alex Williamson wrote: >>>> +enum IOMMUMemoryRegionAttr { >>>> + IOMMU_ATTR_KVM_FD >>> >>> You're generalizing the wrong thing here, this is specifically a >>> SPAPR_TCE_FD, call it that. >> >> ... and you're not even implementing set_attr, so let's drop it. >> >> My suggestion is to add a function in hw/vfio: >> >> int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont, >> int tablefd); >> >> and an IOMMUMemoryRegionClass member: >> >> int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, >> VFIOContainer *cont) >> >> Then your implementation for the latter is as simple as this: >> >> if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) { >> sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); >> return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd); >> } > > Ugh, exactly the sort of interface I've been trying to avoid, vfio > specific callbacks on common data structures handing out vfio private > data pointers, True, VFIOContainer* is private, but in those declarations it's also opaque. The VFIO container is the representation of the IOMMU, so it makes sense to me to have a function to set it up according to QEMU's IOMMU object. I don't think we will be introducing another object soon that is similar to the VFIO container. > requiring additional exported functions from vfio for > each new user of it. Why is this better? I understand that you don't like having many exported functions, but you are just pushing the problem on the memory.h side where you'd get many attribute enums. I suppose it would pay if you have many attributes and many clients, and/or if the attributes need to be public similar to sysfs. I mention public vs. private because, before inventing a new mechanism for attributes, it would make sense to look at QOM properties. However, here we have one client (VFIO) and one attribute (the KVM IOMMU fd) that is pretty much internal to QEMU. So, depending on the direction you want to pull/push the information to, I would do either: - as above, a generic IOMMU callback int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, VFIOContainer *cont); - an object-type check in VFIO, followed by calling a new function int spapr_tce_iommu_get_kvm_fd(IOMMUMemoryRegion *iommu); - if we foresee having more IOMMU devices in KVM, let's rename KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and add a new function int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu); that requires no object-type check in VFIO. Paolo
On 20/12/17 01:59, Paolo Bonzini wrote: > On 19/12/2017 15:09, Alex Williamson wrote: >> On Tue, 19 Dec 2017 12:12:35 +0100 >> Paolo Bonzini <pbonzini@redhat.com> wrote: >> >>> On 12/12/2017 06:46, Alex Williamson wrote: >>>>> +enum IOMMUMemoryRegionAttr { >>>>> + IOMMU_ATTR_KVM_FD >>>> >>>> You're generalizing the wrong thing here, this is specifically a >>>> SPAPR_TCE_FD, call it that. >>> >>> ... and you're not even implementing set_attr, so let's drop it. >>> >>> My suggestion is to add a function in hw/vfio: >>> >>> int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont, >>> int tablefd); >>> >>> and an IOMMUMemoryRegionClass member: >>> >>> int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, >>> VFIOContainer *cont) >>> >>> Then your implementation for the latter is as simple as this: >>> >>> if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) { >>> sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); >>> return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd); >>> } >> >> Ugh, exactly the sort of interface I've been trying to avoid, vfio >> specific callbacks on common data structures handing out vfio private >> data pointers, > > True, VFIOContainer* is private, but in those declarations it's also opaque. > > The VFIO container is the representation of the IOMMU, so it makes sense > to me to have a function to set it up according to QEMU's IOMMU object. > I don't think we will be introducing another object soon that is similar > to the VFIO container. > >> requiring additional exported functions from vfio for >> each new user of it. Why is this better? > > I understand that you don't like having many exported functions, but you > are just pushing the problem on the memory.h side where you'd get many > attribute enums. > > I suppose it would pay if you have many attributes and many clients, > and/or if the attributes need to be public similar to sysfs. I mention > public vs. private because, before inventing a new mechanism for > attributes, it would make sense to look at QOM properties. However, > here we have one client (VFIO) and one attribute (the KVM IOMMU fd) that > is pretty much internal to QEMU. > > So, depending on the direction you want to pull/push the information to, > I would do either: > > - as above, a generic IOMMU callback > > int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, > VFIOContainer *cont); > > - an object-type check in VFIO, followed by calling a new function > > int spapr_tce_iommu_get_kvm_fd(IOMMUMemoryRegion *iommu); > > - if we foresee having more IOMMU devices in KVM, let's rename > KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and > add a new function > > int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu); > > that requires no object-type check in VFIO. This is how it started and the comment was that this KVM fd needs to have a well defined semantic which I struggle to provide. I could just implement a "spapr-kvm-fd" QOM property on sPAPR's IOMMU MR if this is acceptable. -- Alexey
On 20/12/2017 02:47, Alexey Kardashevskiy wrote: >> - if we foresee having more IOMMU devices in KVM, let's rename >> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and >> add a new function >> >> int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu); >> >> that requires no object-type check in VFIO. > > This is how it started and the comment was that this KVM fd needs to have a > well defined semantic which I struggle to provide. I don't think the definition should be anything more than "it can be passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU". I'll discuss it with Alex later today. Paolo
On 20/12/17 20:04, Paolo Bonzini wrote: > On 20/12/2017 02:47, Alexey Kardashevskiy wrote: >>> - if we foresee having more IOMMU devices in KVM, let's rename >>> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and >>> add a new function >>> >>> int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu); >>> >>> that requires no object-type check in VFIO. >> >> This is how it started and the comment was that this KVM fd needs to have a >> well defined semantic which I struggle to provide. > > I don't think the definition should be anything more than "it can be > passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU". I'll discuss it with Alex > later today. That was more David's concern :) -- Alexey
On 20/12/2017 15:13, Alexey Kardashevskiy wrote: > On 20/12/17 20:04, Paolo Bonzini wrote: >> On 20/12/2017 02:47, Alexey Kardashevskiy wrote: >>>> - if we foresee having more IOMMU devices in KVM, let's rename >>>> KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU and >>>> add a new function >>>> >>>> int iommu_memory_region_get_kvm_fd(IOMMUMemoryRegion *iommu); >>>> >>>> that requires no object-type check in VFIO. >>> >>> This is how it started and the comment was that this KVM fd needs to have a >>> well defined semantic which I struggle to provide. >> >> I don't think the definition should be anything more than "it can be >> passed to KVM_DEV_VFIO_GROUP_ATTACH_IOMMU". I'll discuss it with Alex >> later today. > > That was more David's concern :) The get_attr API is okay, just remove the unused set_attr and follow the other review comments from Alex. Paolo
On Tue, Dec 19, 2017 at 03:59:59PM +0100, Paolo Bonzini wrote: > On 19/12/2017 15:09, Alex Williamson wrote: > > On Tue, 19 Dec 2017 12:12:35 +0100 > > Paolo Bonzini <pbonzini@redhat.com> wrote: > > > >> On 12/12/2017 06:46, Alex Williamson wrote: > >>>> +enum IOMMUMemoryRegionAttr { > >>>> + IOMMU_ATTR_KVM_FD > >>> > >>> You're generalizing the wrong thing here, this is specifically a > >>> SPAPR_TCE_FD, call it that. > >> > >> ... and you're not even implementing set_attr, so let's drop it. > >> > >> My suggestion is to add a function in hw/vfio: > >> > >> int vfio_container_attach_kvm_spapr_tce(VFIOContainer *cont, > >> int tablefd); > >> > >> and an IOMMUMemoryRegionClass member: > >> > >> int (*set_vfio_container_attrs)(IOMMUMemoryRegion *iommu, > >> VFIOContainer *cont) > >> > >> Then your implementation for the latter is as simple as this: > >> > >> if (!kvm_enabled() || !kvmppc_has_cap_spapr_vfio()) { > >> sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); > >> return vfio_container_attach_kvm_spapr_tce(cont, tcet->fd); > >> } > > > > Ugh, exactly the sort of interface I've been trying to avoid, vfio > > specific callbacks on common data structures handing out vfio private > > data pointers, > > True, VFIOContainer* is private, but in those declarations it's also opaque. > > The VFIO container is the representation of the IOMMU, so it makes sense > to me to have a function to set it up according to QEMU's IOMMU object. > I don't think we will be introducing another object soon that is similar > to the VFIO container. > > > requiring additional exported functions from vfio for > > each new user of it. Why is this better? > > I understand that you don't like having many exported functions, but you > are just pushing the problem on the memory.h side where you'd get many > attribute enums. It's more than just enums, doing it the other way around is putting fairly intimate knowledge of a specific guest IOMMU workings into the VFIO code. Fundamentally this *requires* linking vfio knowledge to guest iommu (kvm) knowledge, so some cross linkage we'd usually want to avoid is inevitable. I don't see that there's a strong argument for whether we put the bit of vfio knowledge into the spapr viommu or the bit of spapr viommu knowledge into vfio. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
© 2016 - 2024 Red Hat, Inc.