Hi Philippe,
On 10/26/20 11:55 AM, Philippe Mathieu-Daudé wrote:
> This driver uses the host page size to align its memory regions,
> but this size is not always compatible with the IOMMU. Add a
> check if the size matches, and bails out providing a hint what
> is the minimum page size the driver should use.
>
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
> ---
> util/vfio-helpers.c | 28 ++++++++++++++++++++++++++--
> util/trace-events | 1 +
> 2 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 5e288dfa113..874d76c2a2a 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -11,6 +11,7 @@
> */
>
> #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> #include <sys/ioctl.h>
> #include <linux/vfio.h>
> #include "qapi/error.h"
> @@ -288,7 +289,7 @@ static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
> }
>
> static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> - Error **errp)
> + size_t *requested_page_size, Error **errp)
> {
> int ret;
> int i;
> @@ -299,6 +300,8 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
> char *group_file = NULL;
>
> + assert(requested_page_size && is_power_of_2(*requested_page_size));
> +
> s->usable_iova_ranges = NULL;
>
> /* Create a new container */
> @@ -373,6 +376,27 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> ret = -errno;
> goto fail;
> }
> + if (!(iommu_info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
> + error_setg(errp, "Failed to get IOMMU page size info");
> + ret = -EINVAL;
> + goto fail;
> + }
> + trace_qemu_vfio_iommu_iova_pgsizes(iommu_info->iova_pgsizes);
> + if (!(iommu_info->iova_pgsizes & *requested_page_size)) {
> + g_autofree char *req_page_size_str = size_to_str(*requested_page_size);
> + g_autofree char *min_page_size_str = NULL;
> + uint64_t pgsizes_masked;
> +
> + pgsizes_masked = MAKE_64BIT_MASK(0, ctz64(*requested_page_size));
> + *requested_page_size = 1U << ctz64(iommu_info->iova_pgsizes
> + & ~pgsizes_masked);
> + min_page_size_str = size_to_str(*requested_page_size);
> + error_setg(errp, "Unsupported IOMMU page size: %s", req_page_size_str);
> + error_append_hint(errp, "Minimum IOMMU page size: %s\n",
> + min_page_size_str);
This blocks the tentative 64kB page support. Before this patch, I was able
to run the use case on a host with 64kB pages while the MPS used by the
device is 4kB. Of course, I have no evidence yet that my work is correct -
other than that it works in my case for a specific device - but at least we
should make sure we do not introduce a new blocker here.
Also, as we discussed together, commit
f68453237b block/nvme: Map doorbells pages write-only
causes trouble with 64kB pages, as there you attempt to map 2
consecutive 4kB pages with different attributes. The 2nd mmap fails.
Thanks
Eric
> + ret = -EINVAL;
> + goto fail;
> + }
>
> /*
> * if the kernel does not report usable IOVA regions, choose
> @@ -520,7 +544,7 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, size_t *min_page_size,
> int r;
> QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
>
> - r = qemu_vfio_init_pci(s, device, errp);
> + r = qemu_vfio_init_pci(s, device, min_page_size, errp);
> if (r) {
> g_free(s);
> return NULL;
> diff --git a/util/trace-events b/util/trace-events
> index 7faad2a718c..3c36def9f30 100644
> --- a/util/trace-events
> +++ b/util/trace-events
> @@ -87,6 +87,7 @@ qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host
> qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
> qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
> qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
> +qemu_vfio_iommu_iova_pgsizes(uint64_t iova_pgsizes) "iommu page size bitmask: 0x%08"PRIx64
> qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size %d (region ofs 0x%"PRIx64" size %"PRId64")"
> qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size %d (region ofs 0x%"PRIx64" size %"PRId64")"
> qemu_vfio_region_info(const char *desc, uint64_t offset, uint64_t size, uint32_t cap_offset) "region '%s' ofs 0x%"PRIx64" size %"PRId64" cap_ofs %"PRId32
>