The IOVA allocator currently ignores host reserved regions.
As a result some chosen IOVAs may collide with some of them,
resulting in VFIO MAP_DMA errors later on. This happens on ARM
where the MSI reserved window quickly is encountered:
[0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
IOVA regions. So let's enumerate them in the prospect to avoid
them, later on.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 73 insertions(+), 2 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb36f..8e91beba95 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
uint64_t iova;
} IOVAMapping;
+struct IOVARange {
+ uint64_t start;
+ uint64_t end;
+};
+
struct QEMUVFIOState {
QemuMutex lock;
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
int device;
RAMBlockNotifier ram_notifier;
struct vfio_region_info config_region_info, bar_region_info[6];
+ struct IOVARange *usable_iova_ranges;
+ uint8_t nb_iova_ranges;
/* These fields are protected by @lock */
/* VFIO's IO virtual address space is managed by splitting into a few
@@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
return ret == size ? 0 : -errno;
}
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+ struct vfio_info_cap_header *cap = first_cap;
+ void *offset = first_cap;
+ int i;
+
+ while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ if (cap->next) {
+ return;
+ }
+ offset += cap->next;
+ cap = (struct vfio_info_cap_header *)cap;
+ }
+
+ cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+ s->nb_iova_ranges = cap_iova_range->nr_iovas;
+ if (s->nb_iova_ranges > 1) {
+ s->usable_iova_ranges =
+ g_realloc(s->usable_iova_ranges,
+ s->nb_iova_ranges * sizeof(struct IOVARange));
+ }
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+ s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+ }
+}
+
static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
Error **errp)
{
@@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
int i;
uint16_t pci_cmd;
struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
- struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+ struct vfio_iommu_type1_info *iommu_info = NULL;
+ size_t iommu_info_size = sizeof(*iommu_info);
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char *group_file = NULL;
+ s->usable_iova_ranges = NULL;
+
/* Create a new container */
s->container = open("/dev/vfio/vfio", O_RDWR);
@@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
goto fail;
}
+ iommu_info = g_malloc0(iommu_info_size);
+ iommu_info->argsz = iommu_info_size;
+
/* Get additional IOMMU info */
- if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
error_setg_errno(errp, errno, "Failed to get IOMMU info");
ret = -errno;
goto fail;
}
+ /*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+ s->nb_iova_ranges = 1;
+ s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+ s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+ s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+ if (iommu_info->argsz > iommu_info_size) {
+ void *first_cap;
+
+ iommu_info_size = iommu_info->argsz;
+ iommu_info = g_realloc(iommu_info, iommu_info_size);
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ ret = -errno;
+ goto fail;
+ }
+ first_cap = (void *)iommu_info + iommu_info->cap_offset;
+ collect_usable_iova_ranges(s, first_cap);
+ }
+
s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
if (s->device < 0) {
@@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
if (ret) {
goto fail;
}
+ g_free(iommu_info);
return 0;
fail:
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
+ g_free(iommu_info);
close(s->group);
fail_container:
close(s->container);
@@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
}
ram_block_notifier_remove(&s->ram_notifier);
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
qemu_vfio_reset(s);
close(s->device);
close(s->group);
--
2.21.3
On 2020-09-25 15:48, Eric Auger wrote:
> The IOVA allocator currently ignores host reserved regions.
> As a result some chosen IOVAs may collide with some of them,
> resulting in VFIO MAP_DMA errors later on. This happens on ARM
> where the MSI reserved window quickly is encountered:
> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
> IOVA regions. So let's enumerate them in the prospect to avoid
> them, later on.
>
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> ---
> util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 73 insertions(+), 2 deletions(-)
>
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 583bdfb36f..8e91beba95 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -40,6 +40,11 @@ typedef struct {
> uint64_t iova;
> } IOVAMapping;
>
> +struct IOVARange {
> + uint64_t start;
> + uint64_t end;
> +};
> +
> struct QEMUVFIOState {
> QemuMutex lock;
>
> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
> int device;
> RAMBlockNotifier ram_notifier;
> struct vfio_region_info config_region_info, bar_region_info[6];
> + struct IOVARange *usable_iova_ranges;
> + uint8_t nb_iova_ranges;
>
> /* These fields are protected by @lock */
> /* VFIO's IO virtual address space is managed by splitting into a few
> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
> return ret == size ? 0 : -errno;
> }
>
> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
> +{
> + struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
> + struct vfio_info_cap_header *cap = first_cap;
> + void *offset = first_cap;
> + int i;
> +
> + while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
> + if (cap->next) {
This check looks reversed.
> + return;
> + }
> + offset += cap->next;
@offset is unused.
> + cap = (struct vfio_info_cap_header *)cap;
This assignment is nop.
> + }
> +
> + cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
> +
> + s->nb_iova_ranges = cap_iova_range->nr_iovas;
> + if (s->nb_iova_ranges > 1) {
> + s->usable_iova_ranges =
> + g_realloc(s->usable_iova_ranges,
> + s->nb_iova_ranges * sizeof(struct IOVARange));
> + }
> +
> + for (i = 0; i < s->nb_iova_ranges; i++) {
s/ / /
> + s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
> + s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
> + }
> +}
> +
> static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> Error **errp)
> {
> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> int i;
> uint16_t pci_cmd;
> struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
> - struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
> + struct vfio_iommu_type1_info *iommu_info = NULL;
> + size_t iommu_info_size = sizeof(*iommu_info);
> struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
> char *group_file = NULL;
>
> + s->usable_iova_ranges = NULL;
> +
> /* Create a new container */
> s->container = open("/dev/vfio/vfio", O_RDWR);
>
> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> goto fail;
> }
>
> + iommu_info = g_malloc0(iommu_info_size);
> + iommu_info->argsz = iommu_info_size;
> +
> /* Get additional IOMMU info */
> - if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
> + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
> error_setg_errno(errp, errno, "Failed to get IOMMU info");
> ret = -errno;
> goto fail;
> }
>
> + /*
> + * if the kernel does not report usable IOVA regions, choose
> + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
> + */
> + s->nb_iova_ranges = 1;
> + s->usable_iova_ranges = g_new0(struct IOVARange, 1);
> + s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
> + s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
> +
> + if (iommu_info->argsz > iommu_info_size) {
> + void *first_cap;
> +
> + iommu_info_size = iommu_info->argsz;
> + iommu_info = g_realloc(iommu_info, iommu_info_size);
> + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
> + ret = -errno;
> + goto fail;
> + }
> + first_cap = (void *)iommu_info + iommu_info->cap_offset;
> + collect_usable_iova_ranges(s, first_cap);
> + }
> +
> s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>
> if (s->device < 0) {
> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> if (ret) {
> goto fail;
> }
> + g_free(iommu_info);
> return 0;
> fail:
> + g_free(s->usable_iova_ranges);
Set s->usable_iova_ranges to NULL to avoid double free?
> + s->nb_iova_ranges = 0;
> + g_free(iommu_info);
> close(s->group);
> fail_container:
> close(s->container);
> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
> qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
> }
> ram_block_notifier_remove(&s->ram_notifier);
> + g_free(s->usable_iova_ranges);
> + s->nb_iova_ranges = 0;
> qemu_vfio_reset(s);
> close(s->device);
> close(s->group);
> --
> 2.21.3
>
>
Fam
Hi Fam,
On 9/25/20 4:43 PM, Fam Zheng wrote:
> On 2020-09-25 15:48, Eric Auger wrote:
>> The IOVA allocator currently ignores host reserved regions.
>> As a result some chosen IOVAs may collide with some of them,
>> resulting in VFIO MAP_DMA errors later on. This happens on ARM
>> where the MSI reserved window quickly is encountered:
>> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
>> IOVA regions. So let's enumerate them in the prospect to avoid
>> them, later on.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> ---
>> util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index 583bdfb36f..8e91beba95 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -40,6 +40,11 @@ typedef struct {
>> uint64_t iova;
>> } IOVAMapping;
>>
>> +struct IOVARange {
>> + uint64_t start;
>> + uint64_t end;
>> +};
>> +
>> struct QEMUVFIOState {
>> QemuMutex lock;
>>
>> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>> int device;
>> RAMBlockNotifier ram_notifier;
>> struct vfio_region_info config_region_info, bar_region_info[6];
>> + struct IOVARange *usable_iova_ranges;
>> + uint8_t nb_iova_ranges;
>>
>> /* These fields are protected by @lock */
>> /* VFIO's IO virtual address space is managed by splitting into a few
>> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>> return ret == size ? 0 : -errno;
>> }
>>
>> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
>> +{
>> + struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
>> + struct vfio_info_cap_header *cap = first_cap;
>> + void *offset = first_cap;
>> + int i;
>> +
>> + while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
>> + if (cap->next) {
>
> This check looks reversed.
>
>> + return;
>> + }
>> + offset += cap->next;
>
> @offset is unused.
>
>> + cap = (struct vfio_info_cap_header *)cap;
>
> This assignment is nop.
ugh indeed, that loop implementation is totally crap. I will test the
rewriting by adding an extra cap on kernel side.
>
>> + }
>> +
>> + cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
>> +
>> + s->nb_iova_ranges = cap_iova_range->nr_iovas;
>> + if (s->nb_iova_ranges > 1) {
>> + s->usable_iova_ranges =
>> + g_realloc(s->usable_iova_ranges,
>> + s->nb_iova_ranges * sizeof(struct IOVARange));
>> + }
>> +
>> + for (i = 0; i < s->nb_iova_ranges; i++) {
>
> s/ / /
ok
>
>> + s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
>> + s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
>> + }
>> +}
>> +
>> static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>> Error **errp)
>> {
>> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>> int i;
>> uint16_t pci_cmd;
>> struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
>> - struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
>> + struct vfio_iommu_type1_info *iommu_info = NULL;
>> + size_t iommu_info_size = sizeof(*iommu_info);
>> struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>> char *group_file = NULL;
>>
>> + s->usable_iova_ranges = NULL;
>> +
>> /* Create a new container */
>> s->container = open("/dev/vfio/vfio", O_RDWR);
>>
>> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>> goto fail;
>> }
>>
>> + iommu_info = g_malloc0(iommu_info_size);
>> + iommu_info->argsz = iommu_info_size;
>> +
>> /* Get additional IOMMU info */
>> - if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
>> + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>> error_setg_errno(errp, errno, "Failed to get IOMMU info");
>> ret = -errno;
>> goto fail;
>> }
>>
>> + /*
>> + * if the kernel does not report usable IOVA regions, choose
>> + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
>> + */
>> + s->nb_iova_ranges = 1;
>> + s->usable_iova_ranges = g_new0(struct IOVARange, 1);
>> + s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
>> + s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
>> +
>> + if (iommu_info->argsz > iommu_info_size) {
>> + void *first_cap;
>> +
>> + iommu_info_size = iommu_info->argsz;
>> + iommu_info = g_realloc(iommu_info, iommu_info_size);
>> + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>> + ret = -errno;
>> + goto fail;
>> + }
>> + first_cap = (void *)iommu_info + iommu_info->cap_offset;
>> + collect_usable_iova_ranges(s, first_cap);
>> + }
>> +
>> s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>>
>> if (s->device < 0) {
>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>> if (ret) {
>> goto fail;
>> }
>> + g_free(iommu_info);
>> return 0;
>> fail:
>> + g_free(s->usable_iova_ranges);
>
> Set s->usable_iova_ranges to NULL to avoid double free?
I think I did at the beginning of qemu_vfio_init_pci()
Thanks
Eric
>
>> + s->nb_iova_ranges = 0;
>> + g_free(iommu_info);
>> close(s->group);
>> fail_container:
>> close(s->container);
>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>> qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>> }
>> ram_block_notifier_remove(&s->ram_notifier);
>> + g_free(s->usable_iova_ranges);
>> + s->nb_iova_ranges = 0;
>> qemu_vfio_reset(s);
>> close(s->device);
>> close(s->group);
>> --
>> 2.21.3
>>
>>
>
> Fam
>
On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
> > > @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
> > > *s, const char *device,
> > > if (ret) {
> > > goto fail;
> > > }
> > > + g_free(iommu_info);
> > > return 0;
> > > fail:
> > > + g_free(s->usable_iova_ranges);
> >
> > Set s->usable_iova_ranges to NULL to avoid double free?
>
> I think I did at the beginning of qemu_vfio_init_pci()
Yes, but I mean clearing the pointer will make calling
qemu_vfio_close() safe, there is also a g_free() on this one.
Fam
>
> Thanks
>
> Eric
> >
> > > + s->nb_iova_ranges = 0;
> > > + g_free(iommu_info);
> > > close(s->group);
> > > fail_container:
> > > close(s->container);
> > > @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
> > > qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
> > > }
> > > ram_block_notifier_remove(&s->ram_notifier);
> > > + g_free(s->usable_iova_ranges);
> > > + s->nb_iova_ranges = 0;
> > > qemu_vfio_reset(s);
> > > close(s->device);
> > > close(s->group);
> > > --
> > > 2.21.3
> > >
> > >
> >
> > Fam
> >
>
>
Hi Fam,
On 9/25/20 5:44 PM, Fam Zheng wrote:
> On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
>>>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
>>>> *s, const char *device,
>>>> if (ret) {
>>>> goto fail;
>>>> }
>>>> + g_free(iommu_info);
>>>> return 0;
>>>> fail:
>>>> + g_free(s->usable_iova_ranges);
>>>
>>> Set s->usable_iova_ranges to NULL to avoid double free?
>>
>> I think I did at the beginning of qemu_vfio_init_pci()
>
> Yes, but I mean clearing the pointer will make calling
> qemu_vfio_close() safe, there is also a g_free() on this one.
Oh yes, got it.
Thank you for the review.
Best Regards
Eric
>
> Fam
>
>>
>> Thanks
>>
>> Eric
>>>
>>>> + s->nb_iova_ranges = 0;
>>>> + g_free(iommu_info);
>>>> close(s->group);
>>>> fail_container:
>>>> close(s->container);
>>>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>>>> qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>>>> }
>>>> ram_block_notifier_remove(&s->ram_notifier);
>>>> + g_free(s->usable_iova_ranges);
>>>> + s->nb_iova_ranges = 0;
>>>> qemu_vfio_reset(s);
>>>> close(s->device);
>>>> close(s->group);
>>>> --
>>>> 2.21.3
>>>>
>>>>
>>>
>>> Fam
>>>
>>
>>
>
© 2016 - 2026 Red Hat, Inc.