This implements a notification for a new IOMMU group attached to
sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
callback which returns KVM fd associated with LIOBN, the notifier uses it
to establish link between LIOBN and IOMMU group in the KVM.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
The practical reason for adding get_fd() as a callback is avoiding static
linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
compiles only when CONFIG_PSERIES=y.
---
include/hw/ppc/spapr.h | 15 +++++++++++++++
include/hw/vfio/vfio-common.h | 2 ++
hw/ppc/spapr_iommu.c | 10 ++++++++++
hw/vfio/common.c | 10 ++++++++++
hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++
hw/vfio/trace-events | 1 +
6 files changed, 77 insertions(+)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 2a303a705c..c1d37e6356 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
#define RTAS_EVENT_SCAN_RATE 1
typedef struct sPAPRTCETable sPAPRTCETable;
+typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
#define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
#define SPAPR_TCE_TABLE(obj) \
@@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
#define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
#define SPAPR_IOMMU_MEMORY_REGION(obj) \
OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
+#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
+ TYPE_SPAPR_IOMMU_MEMORY_REGION)
+#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
+ OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
+ TYPE_SPAPR_IOMMU_MEMORY_REGION)
struct sPAPRTCETable {
DeviceState parent;
@@ -618,6 +625,14 @@ struct sPAPRTCETable {
QLIST_ENTRY(sPAPRTCETable) list;
};
+struct sPAPRIOMMUMemoryRegionClass {
+ /* private */
+ IOMMUMemoryRegionClass parent_class;
+
+ /* public */
+ int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
+};
+
sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
struct sPAPREventLogEntry {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f3a2ac9fee..d245d3cecc 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
int vfio_spapr_create_window(VFIOContainer *container,
MemoryRegionSection *section,
hwaddr *pgsize);
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+ IOMMUMemoryRegion *iommumr);
int vfio_spapr_remove_window(VFIOContainer *container,
hwaddr offset_within_address_space);
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 307dc3021e..82fca61a75 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
}
}
+static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
+{
+ sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
+
+ return tcet->fd;
+}
+
static int spapr_tce_table_post_load(void *opaque, int version_id)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
{
IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+ sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
imrc->translate = spapr_tce_translate_iommu;
imrc->get_min_page_size = spapr_tce_get_min_page_size;
imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
+ simrc->get_fd = spapr_tce_get_fd;
}
static const TypeInfo spapr_iommu_memory_region_info = {
.parent = TYPE_IOMMU_MEMORY_REGION,
.name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
.class_init = spapr_iommu_memory_region_class_init,
+ .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
};
static void register_types(void)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7b2924c0ef..92f1f88ae8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
goto fail;
}
+#ifdef CONFIG_KVM
+ if (kvm_enabled()) {
+ VFIOGroup *group;
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
+ IOMMU_MEMORY_REGION(section->mr));
+ }
+ }
+#endif
vfio_host_win_add(container, section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1, pgsize);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 32fd6a9b54..2b9af75c03 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -15,8 +15,12 @@
#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
+#include "hw/ppc/spapr.h"
#include "qemu/error-report.h"
#include "trace.h"
+#ifdef CONFIG_KVM
+#include "linux/kvm.h"
+#endif
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
@@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
return 0;
}
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+ IOMMUMemoryRegion *iommu_mr)
+{
+#ifdef CONFIG_KVM
+ struct kvm_vfio_spapr_tce param = {
+ .groupfd = groupfd,
+ };
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_GROUP,
+ .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+ .addr = (uint64_t)(unsigned long)¶m,
+ };
+ IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
+ sPAPRIOMMUMemoryRegionClass *simrc =
+ SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
+
+ if (!simrc->get_fd) {
+ error_report("vfio: No get_fd defined for IOMMU MR");
+ return -EFAULT;
+ }
+
+ param.tablefd = simrc->get_fd(spapr_iommu_mr);
+
+ if (param.tablefd != -1) {
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
+ param.tablefd, param.groupfd, strerror(errno));
+ return -errno;
+ }
+ }
+ trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
+#endif
+ return 0;
+}
+
int vfio_spapr_remove_window(VFIOContainer *container,
hwaddr offset_within_address_space)
{
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 2561c6d31a..084a92f7c2 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
+vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
--
2.11.0
On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
> This implements a notification for a new IOMMU group attached to
> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>
> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
> callback which returns KVM fd associated with LIOBN, the notifier uses it
> to establish link between LIOBN and IOMMU group in the KVM.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>
> The practical reason for adding get_fd() as a callback is avoiding static
> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
> compiles only when CONFIG_PSERIES=y.
Ok. Nonetheless I don't think the get_fd() method is a good idea.
First, it's basically an abstraction violation, exposing the region's
internal fd. Second, it's a method which only plausibly has one
implementation which is rarely sensible.
What this comes down to is that the guest IOMMU mechanism needs
information about host vfio groups mapped - for an optimization in
this case.
So what would make sense to me is to put an "add_vfio_group" method
into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most
cases that will be NULL (== no-op). For the spapr IOMMU region, it
will (attempt to) connect the host group to the guest liobn.
> ---
> include/hw/ppc/spapr.h | 15 +++++++++++++++
> include/hw/vfio/vfio-common.h | 2 ++
> hw/ppc/spapr_iommu.c | 10 ++++++++++
> hw/vfio/common.c | 10 ++++++++++
> hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++
> hw/vfio/trace-events | 1 +
> 6 files changed, 77 insertions(+)
>
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 2a303a705c..c1d37e6356 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
> #define RTAS_EVENT_SCAN_RATE 1
>
> typedef struct sPAPRTCETable sPAPRTCETable;
> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>
> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
> #define SPAPR_TCE_TABLE(obj) \
> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
> #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
> #define SPAPR_IOMMU_MEMORY_REGION(obj) \
> OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
> + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
> + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
>
> struct sPAPRTCETable {
> DeviceState parent;
> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
> QLIST_ENTRY(sPAPRTCETable) list;
> };
>
> +struct sPAPRIOMMUMemoryRegionClass {
> + /* private */
> + IOMMUMemoryRegionClass parent_class;
> +
> + /* public */
> + int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
> +};
> +
> sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
To make sure I'm understanding correctly: the MR subclass here is
representing a guest-side property, yes? It means that on the guest
side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
interface.
> struct sPAPREventLogEntry {
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f3a2ac9fee..d245d3cecc 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
> int vfio_spapr_create_window(VFIOContainer *container,
> MemoryRegionSection *section,
> hwaddr *pgsize);
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> + IOMMUMemoryRegion *iommumr);
> int vfio_spapr_remove_window(VFIOContainer *container,
> hwaddr offset_within_address_space);
>
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 307dc3021e..82fca61a75 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
> }
> }
>
> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
> +{
> + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
> +
> + return tcet->fd;
Does this have a well defined value if there's no KVM?
> +}
> +
> static int spapr_tce_table_post_load(void *opaque, int version_id)
> {
> sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
> static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
> {
> IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>
> imrc->translate = spapr_tce_translate_iommu;
> imrc->get_min_page_size = spapr_tce_get_min_page_size;
> imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
> + simrc->get_fd = spapr_tce_get_fd;
> }
>
> static const TypeInfo spapr_iommu_memory_region_info = {
> .parent = TYPE_IOMMU_MEMORY_REGION,
> .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
> .class_init = spapr_iommu_memory_region_class_init,
> + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
> };
>
> static void register_types(void)
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 7b2924c0ef..92f1f88ae8 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
> goto fail;
> }
>
> +#ifdef CONFIG_KVM
> + if (kvm_enabled()) {
> + VFIOGroup *group;
> +
> + QLIST_FOREACH(group, &container->group_list, container_next) {
> + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
> + IOMMU_MEMORY_REGION(section->mr));
> + }
> + }
So, here you're informing the region of the groups when the region is
mapped in. But don't you similarly need to notify if a group is added
to an existing address space? And won't you also need notifications
of groups/regions being removed?
> +#endif
> vfio_host_win_add(container, section->offset_within_address_space,
> section->offset_within_address_space +
> int128_get64(section->size) - 1, pgsize);
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> index 32fd6a9b54..2b9af75c03 100644
> --- a/hw/vfio/spapr.c
> +++ b/hw/vfio/spapr.c
> @@ -15,8 +15,12 @@
>
> #include "hw/vfio/vfio-common.h"
> #include "hw/hw.h"
> +#include "hw/ppc/spapr.h"
> #include "qemu/error-report.h"
> #include "trace.h"
> +#ifdef CONFIG_KVM
> +#include "linux/kvm.h"
> +#endif
>
> static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
> {
> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
> return 0;
> }
>
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> + IOMMUMemoryRegion *iommu_mr)
> +{
> +#ifdef CONFIG_KVM
> + struct kvm_vfio_spapr_tce param = {
> + .groupfd = groupfd,
> + };
> + struct kvm_device_attr attr = {
> + .group = KVM_DEV_VFIO_GROUP,
> + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
> + .addr = (uint64_t)(unsigned long)¶m,
> + };
> + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
This will assert if you have a non-spapr guest IOMMU on a ppc host
(e.g. emulating an x86 with VT-d under TCG).
> + sPAPRIOMMUMemoryRegionClass *simrc =
> + SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
> +
> + if (!simrc->get_fd) {
> + error_report("vfio: No get_fd defined for IOMMU MR");
> + return -EFAULT;
> + }
> +
> + param.tablefd = simrc->get_fd(spapr_iommu_mr);
> +
> + if (param.tablefd != -1) {
> + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> + error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
> + param.tablefd, param.groupfd, strerror(errno));
> + return -errno;
> + }
> + }
> + trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
> +#endif
> + return 0;
> +}
> +
> int vfio_spapr_remove_window(VFIOContainer *container,
> hwaddr offset_within_address_space)
> {
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 2561c6d31a..084a92f7c2 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
> vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
> vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On 25/08/17 16:21, David Gibson wrote:
> On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
>> This implements a notification for a new IOMMU group attached to
>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>>
>> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
>> callback which returns KVM fd associated with LIOBN, the notifier uses it
>> to establish link between LIOBN and IOMMU group in the KVM.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> The practical reason for adding get_fd() as a callback is avoiding static
>> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
>> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
>> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
>> compiles only when CONFIG_PSERIES=y.
>
> Ok. Nonetheless I don't think the get_fd() method is a good idea.
> First, it's basically an abstraction violation, exposing the region's
> internal fd. Second, it's a method which only plausibly has one
> implementation which is rarely sensible.
>
> What this comes down to is that the guest IOMMU mechanism needs
> information about host vfio groups mapped - for an optimization in
> this case.
>
> So what would make sense to me is to put an "add_vfio_group" method
> into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most
> cases that will be NULL (== no-op). For the spapr IOMMU region, it
> will (attempt to) connect the host group to the guest liobn.
Like this?
IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd);
It is just cleaner to keep kvm_device_fd and its ioclts() in the same place
but ok.
>
>> ---
>> include/hw/ppc/spapr.h | 15 +++++++++++++++
>> include/hw/vfio/vfio-common.h | 2 ++
>> hw/ppc/spapr_iommu.c | 10 ++++++++++
>> hw/vfio/common.c | 10 ++++++++++
>> hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++
>> hw/vfio/trace-events | 1 +
>> 6 files changed, 77 insertions(+)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 2a303a705c..c1d37e6356 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
>> #define RTAS_EVENT_SCAN_RATE 1
>>
>> typedef struct sPAPRTCETable sPAPRTCETable;
>> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>>
>> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
>> #define SPAPR_TCE_TABLE(obj) \
>> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
>> #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
>> #define SPAPR_IOMMU_MEMORY_REGION(obj) \
>> OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
>> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
>> + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
>> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
>> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
>> + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
>> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>
>> struct sPAPRTCETable {
>> DeviceState parent;
>> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
>> QLIST_ENTRY(sPAPRTCETable) list;
>> };
>>
>> +struct sPAPRIOMMUMemoryRegionClass {
>> + /* private */
>> + IOMMUMemoryRegionClass parent_class;
>> +
>> + /* public */
>> + int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
>> +};
>> +
>> sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
>
> To make sure I'm understanding correctly: the MR subclass here is
> representing a guest-side property, yes? It means that on the guest
> side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
> interface.
I do not understand the question, sorry. MR is a QEMU thing, only QEMU
devices use these MRs as MRs.
>> struct sPAPREventLogEntry {
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f3a2ac9fee..d245d3cecc 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
>> int vfio_spapr_create_window(VFIOContainer *container,
>> MemoryRegionSection *section,
>> hwaddr *pgsize);
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> + IOMMUMemoryRegion *iommumr);
>> int vfio_spapr_remove_window(VFIOContainer *container,
>> hwaddr offset_within_address_space);
>>
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 307dc3021e..82fca61a75 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>> }
>> }
>>
>> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
>> +{
>> + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>> +
>> + return tcet->fd;
>
> Does this have a well defined value if there's no KVM?
It is -1 in TCG, any other value is undefined.
>
>> +}
>> +
>> static int spapr_tce_table_post_load(void *opaque, int version_id)
>> {
>> sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
>> static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>> {
>> IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>> + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>>
>> imrc->translate = spapr_tce_translate_iommu;
>> imrc->get_min_page_size = spapr_tce_get_min_page_size;
>> imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>> + simrc->get_fd = spapr_tce_get_fd;
>> }
>>
>> static const TypeInfo spapr_iommu_memory_region_info = {
>> .parent = TYPE_IOMMU_MEMORY_REGION,
>> .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
>> .class_init = spapr_iommu_memory_region_class_init,
>> + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
>> };
>>
>> static void register_types(void)
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 7b2924c0ef..92f1f88ae8 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>> goto fail;
>> }
>>
>> +#ifdef CONFIG_KVM
>> + if (kvm_enabled()) {
>> + VFIOGroup *group;
>> +
>> + QLIST_FOREACH(group, &container->group_list, container_next) {
>> + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
>> + IOMMU_MEMORY_REGION(section->mr));
>> + }
>> + }
>
> So, here you're informing the region of the groups when the region is
> mapped in. But don't you similarly need to notify if a group is added
> to an existing address space?
It is either in VFIO for ages (forever? in vfio_connect_container()), or I
did not understand the question...
> And won't you also need notifications
> of groups/regions being removed?
Same comment. vfio_disconnect_container()?
>
>> +#endif
>> vfio_host_win_add(container, section->offset_within_address_space,
>> section->offset_within_address_space +
>> int128_get64(section->size) - 1, pgsize);
>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>> index 32fd6a9b54..2b9af75c03 100644
>> --- a/hw/vfio/spapr.c
>> +++ b/hw/vfio/spapr.c
>> @@ -15,8 +15,12 @@
>>
>> #include "hw/vfio/vfio-common.h"
>> #include "hw/hw.h"
>> +#include "hw/ppc/spapr.h"
>> #include "qemu/error-report.h"
>> #include "trace.h"
>> +#ifdef CONFIG_KVM
>> +#include "linux/kvm.h"
>> +#endif
>>
>> static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>> {
>> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
>> return 0;
>> }
>>
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> + IOMMUMemoryRegion *iommu_mr)
>> +{
>> +#ifdef CONFIG_KVM
>> + struct kvm_vfio_spapr_tce param = {
>> + .groupfd = groupfd,
>> + };
>> + struct kvm_device_attr attr = {
>> + .group = KVM_DEV_VFIO_GROUP,
>> + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
>> + .addr = (uint64_t)(unsigned long)¶m,
>> + };
>> + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
>
> This will assert if you have a non-spapr guest IOMMU on a ppc host
> (e.g. emulating an x86 with VT-d under TCG).
Sure but this should not execute when TCG, hence "_kvm" in
"vfio_spapr_notify_kvm".
>
>> + sPAPRIOMMUMemoryRegionClass *simrc =
>> + SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
>> +
>> + if (!simrc->get_fd) {
>> + error_report("vfio: No get_fd defined for IOMMU MR");
>> + return -EFAULT;
>> + }
>> +
>> + param.tablefd = simrc->get_fd(spapr_iommu_mr);
>> +
>> + if (param.tablefd != -1) {
>> + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> + error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
>> + param.tablefd, param.groupfd, strerror(errno));
>> + return -errno;
>> + }
>> + }
>> + trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
>> +#endif
>> + return 0;
>> +}
>> +
>> int vfio_spapr_remove_window(VFIOContainer *container,
>> hwaddr offset_within_address_space)
>> {
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 2561c6d31a..084a92f7c2 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
>> vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>> vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>> vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
>
--
Alexey
On 28/08/17 14:20, Alexey Kardashevskiy wrote:
> On 25/08/17 16:21, David Gibson wrote:
>> On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
>>> This implements a notification for a new IOMMU group attached to
>>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>>>
>>> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
>>> callback which returns KVM fd associated with LIOBN, the notifier uses it
>>> to establish link between LIOBN and IOMMU group in the KVM.
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Ping?
>>> ---
>>>
>>> The practical reason for adding get_fd() as a callback is avoiding static
>>> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
>>> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
>>> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
>>> compiles only when CONFIG_PSERIES=y.
>>
>> Ok. Nonetheless I don't think the get_fd() method is a good idea.
>> First, it's basically an abstraction violation, exposing the region's
>> internal fd. Second, it's a method which only plausibly has one
>> implementation which is rarely sensible.
>>
>> What this comes down to is that the guest IOMMU mechanism needs
>> information about host vfio groups mapped - for an optimization in
>> this case.
>>
>> So what would make sense to me is to put an "add_vfio_group" method
>> into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most
>> cases that will be NULL (== no-op). For the spapr IOMMU region, it
>> will (attempt to) connect the host group to the guest liobn.
>
>
> Like this?
>
> IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd);
>
> It is just cleaner to keep kvm_device_fd and its ioclts() in the same place
> but ok.
>
>
>>
>>> ---
>>> include/hw/ppc/spapr.h | 15 +++++++++++++++
>>> include/hw/vfio/vfio-common.h | 2 ++
>>> hw/ppc/spapr_iommu.c | 10 ++++++++++
>>> hw/vfio/common.c | 10 ++++++++++
>>> hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++
>>> hw/vfio/trace-events | 1 +
>>> 6 files changed, 77 insertions(+)
>>>
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index 2a303a705c..c1d37e6356 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
>>> #define RTAS_EVENT_SCAN_RATE 1
>>>
>>> typedef struct sPAPRTCETable sPAPRTCETable;
>>> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>>>
>>> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
>>> #define SPAPR_TCE_TABLE(obj) \
>>> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
>>> #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
>>> #define SPAPR_IOMMU_MEMORY_REGION(obj) \
>>> OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
>>> + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
>>> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
>>> + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
>>> + TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>>
>>> struct sPAPRTCETable {
>>> DeviceState parent;
>>> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
>>> QLIST_ENTRY(sPAPRTCETable) list;
>>> };
>>>
>>> +struct sPAPRIOMMUMemoryRegionClass {
>>> + /* private */
>>> + IOMMUMemoryRegionClass parent_class;
>>> +
>>> + /* public */
>>> + int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
>>> +};
>>> +
>>> sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
>>
>> To make sure I'm understanding correctly: the MR subclass here is
>> representing a guest-side property, yes? It means that on the guest
>> side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
>> interface.
>
> I do not understand the question, sorry. MR is a QEMU thing, only QEMU
> devices use these MRs as MRs.
>
>
>>> struct sPAPREventLogEntry {
>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>> index f3a2ac9fee..d245d3cecc 100644
>>> --- a/include/hw/vfio/vfio-common.h
>>> +++ b/include/hw/vfio/vfio-common.h
>>> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
>>> int vfio_spapr_create_window(VFIOContainer *container,
>>> MemoryRegionSection *section,
>>> hwaddr *pgsize);
>>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>>> + IOMMUMemoryRegion *iommumr);
>>> int vfio_spapr_remove_window(VFIOContainer *container,
>>> hwaddr offset_within_address_space);
>>>
>>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>>> index 307dc3021e..82fca61a75 100644
>>> --- a/hw/ppc/spapr_iommu.c
>>> +++ b/hw/ppc/spapr_iommu.c
>>> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>>> }
>>> }
>>>
>>> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
>>> +{
>>> + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>>> +
>>> + return tcet->fd;
>>
>> Does this have a well defined value if there's no KVM?
>
>
> It is -1 in TCG, any other value is undefined.
>
>
>>
>>> +}
>>> +
>>> static int spapr_tce_table_post_load(void *opaque, int version_id)
>>> {
>>> sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>>> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
>>> static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>>> {
>>> IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>> + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>>>
>>> imrc->translate = spapr_tce_translate_iommu;
>>> imrc->get_min_page_size = spapr_tce_get_min_page_size;
>>> imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>>> + simrc->get_fd = spapr_tce_get_fd;
>>> }
>>>
>>> static const TypeInfo spapr_iommu_memory_region_info = {
>>> .parent = TYPE_IOMMU_MEMORY_REGION,
>>> .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
>>> .class_init = spapr_iommu_memory_region_class_init,
>>> + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
>>> };
>>>
>>> static void register_types(void)
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 7b2924c0ef..92f1f88ae8 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>> goto fail;
>>> }
>>>
>>> +#ifdef CONFIG_KVM
>>> + if (kvm_enabled()) {
>>> + VFIOGroup *group;
>>> +
>>> + QLIST_FOREACH(group, &container->group_list, container_next) {
>>> + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
>>> + IOMMU_MEMORY_REGION(section->mr));
>>> + }
>>> + }
>>
>> So, here you're informing the region of the groups when the region is
>> mapped in. But don't you similarly need to notify if a group is added
>> to an existing address space?
>
> It is either in VFIO for ages (forever? in vfio_connect_container()), or I
> did not understand the question...
>
>
>> And won't you also need notifications
>> of groups/regions being removed?
>
> Same comment. vfio_disconnect_container()?
>
>
>>
>>> +#endif
>>> vfio_host_win_add(container, section->offset_within_address_space,
>>> section->offset_within_address_space +
>>> int128_get64(section->size) - 1, pgsize);
>>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>>> index 32fd6a9b54..2b9af75c03 100644
>>> --- a/hw/vfio/spapr.c
>>> +++ b/hw/vfio/spapr.c
>>> @@ -15,8 +15,12 @@
>>>
>>> #include "hw/vfio/vfio-common.h"
>>> #include "hw/hw.h"
>>> +#include "hw/ppc/spapr.h"
>>> #include "qemu/error-report.h"
>>> #include "trace.h"
>>> +#ifdef CONFIG_KVM
>>> +#include "linux/kvm.h"
>>> +#endif
>>>
>>> static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>>> {
>>> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>> return 0;
>>> }
>>>
>>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>>> + IOMMUMemoryRegion *iommu_mr)
>>> +{
>>> +#ifdef CONFIG_KVM
>>> + struct kvm_vfio_spapr_tce param = {
>>> + .groupfd = groupfd,
>>> + };
>>> + struct kvm_device_attr attr = {
>>> + .group = KVM_DEV_VFIO_GROUP,
>>> + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
>>> + .addr = (uint64_t)(unsigned long)¶m,
>>> + };
>>> + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
>>
>> This will assert if you have a non-spapr guest IOMMU on a ppc host
>> (e.g. emulating an x86 with VT-d under TCG).
>
>
> Sure but this should not execute when TCG, hence "_kvm" in
> "vfio_spapr_notify_kvm".
>
--
Alexey
© 2016 - 2026 Red Hat, Inc.