From nobody Tue Oct 28 04:18:25 2025
Delivered-To: importer@patchew.org
Received-SPF: pass (zoho.com: domain of gnu.org designates 208.118.235.17 as
 permitted sender) client-ip=208.118.235.17;
 envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org;
 helo=lists.gnu.org;
Authentication-Results: mx.zohomail.com;
	spf=pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted
 sender)  smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org
Return-Path: <qemu-devel-bounces+importer=patchew.org@nongnu.org>
Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) by
 mx.zohomail.com
	with SMTPS id 1515576232075740.9500203514507;
 Wed, 10 Jan 2018 01:23:52 -0800 (PST)
Received: from localhost ([::1]:56029 helo=lists.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <qemu-devel-bounces+importer=patchew.org@nongnu.org>)
	id 1eZCbv-0000dl-48
	for importer@patchew.org; Wed, 10 Jan 2018 04:23:51 -0500
Received: from eggs.gnu.org ([2001:4830:134:3::10]:48888)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <famz@redhat.com>) id 1eZCXa-0005Uh-Cs
	for qemu-devel@nongnu.org; Wed, 10 Jan 2018 04:19:25 -0500
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <famz@redhat.com>) id 1eZCXX-0008OL-Go
	for qemu-devel@nongnu.org; Wed, 10 Jan 2018 04:19:22 -0500
Received: from mx1.redhat.com ([209.132.183.28]:50670)
	by eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32)
	(Exim 4.71) (envelope-from <famz@redhat.com>)
	id 1eZCXN-0008FI-9R; Wed, 10 Jan 2018 04:19:09 -0500
Received: from smtp.corp.redhat.com (int-mx04.intmail.prod.int.phx2.redhat.com
	[10.5.11.14])
	(using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits))
	(No client certificate requested)
	by mx1.redhat.com (Postfix) with ESMTPS id 5D14825B89;
	Wed, 10 Jan 2018 09:19:08 +0000 (UTC)
Received: from lemon.usersys.redhat.com (ovpn-12-52.pek2.redhat.com
	[10.72.12.52])
	by smtp.corp.redhat.com (Postfix) with ESMTP id EE5D37B8D6;
	Wed, 10 Jan 2018 09:19:02 +0000 (UTC)
From: Fam Zheng <famz@redhat.com>
To: qemu-devel@nongnu.org
Date: Wed, 10 Jan 2018 17:18:39 +0800
Message-Id: <20180110091846.10699-3-famz@redhat.com>
In-Reply-To: <20180110091846.10699-1-famz@redhat.com>
References: <20180110091846.10699-1-famz@redhat.com>
X-Scanned-By: MIMEDefang 2.79 on 10.5.11.14
X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16
	(mx1.redhat.com [10.5.110.39]);
	Wed, 10 Jan 2018 09:19:08 +0000 (UTC)
X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.2.x-3.x [generic]
	[fuzzy]
X-Received-From: 209.132.183.28
Subject: [Qemu-devel] [PATCH v4 2/9] util: Introduce vfio helpers
X-BeenThere: qemu-devel@nongnu.org
X-Mailman-Version: 2.1.21
Precedence: list
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel/>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <famz@redhat.com>,
	qemu-block@nongnu.org, Markus Armbruster <armbru@redhat.com>,
	Max Reitz <mreitz@redhat.com>, Keith Busch <keith.busch@intel.com>,
	Stefan Hajnoczi <stefanha@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>,
	Karl Rister <krister@redhat.com>
Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org
Sender: "Qemu-devel" <qemu-devel-bounces+importer=patchew.org@nongnu.org>
X-ZohoMail: RSF_0  Z_629925259 SPT_0
Content-Transfer-Encoding: quoted-printable
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"

This is a library to manage the host vfio interface, which could be used
to implement userspace device driver code in QEMU such as NVMe or net
controllers.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/vfio-helpers.h |  30 ++
 util/Makefile.objs          |   1 +
 util/trace-events           |  11 +
 util/vfio-helpers.c         | 723 ++++++++++++++++++++++++++++++++++++++++=
++++
 4 files changed, 765 insertions(+)
 create mode 100644 include/qemu/vfio-helpers.h
 create mode 100644 util/vfio-helpers.c

diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h
new file mode 100644
index 0000000000..6bdba3b66e
--- /dev/null
+++ b/include/qemu/vfio-helpers.h
@@ -0,0 +1,30 @@
+/*
+ * QEMU VFIO helpers
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *   Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or late=
r.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_VFIO_HELPERS_H
+#define QEMU_VFIO_HELPERS_H
+#include "qemu/typedefs.h"
+
+typedef struct QEMUVFIOState QEMUVFIOState;
+
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp);
+void qemu_vfio_close(QEMUVFIOState *s);
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+                      bool temporary, uint64_t *iova_list);
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s);
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host);
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, Error **errp);
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar);
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+                           int irq_type, Error **errp);
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 2973b0a323..3fb611631f 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -46,3 +46,4 @@ util-obj-y +=3D qht.o
 util-obj-y +=3D range.o
 util-obj-y +=3D stats64.o
 util-obj-y +=3D systemd.o
+util-obj-$(CONFIG_LINUX) +=3D vfio-helpers.o
diff --git a/util/trace-events b/util/trace-events
index 025499f83f..2f57bf2337 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -59,3 +59,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waki=
ng up one waiter"
 # util/qemu-thread-posix.c
 qemu_mutex_locked(void *lock) "locked mutex %p"
 qemu_mutex_unlocked(void *lock) "unlocked mutex %p"
+
+# util/vfio-helpers.c
+qemu_vfio_dma_reset_temporary(void *s) "s %p"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p siz=
e 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p s=
ize 0x%zx"
+qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_=
t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64
+qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %=
p host %p size %zu iova 0x%"PRIx64
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64=
_t *iova) "s %p host %p size %zu temporary %d iova %p"
+qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size=
, void *host, size_t size) "s %p mapping %p %zu requested %p %zu"
+qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
new file mode 100644
index 0000000000..b5df67a737
--- /dev/null
+++ b/util/vfio-helpers.c
@@ -0,0 +1,723 @@
+/*
+ * VFIO utility
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *   Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or late=
r.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/cpu-common.h"
+#include "trace.h"
+#include "qemu/queue.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/pci_regs.h"
+#include "qemu/event_notifier.h"
+#include "qemu/vfio-helpers.h"
+#include "trace.h"
+
+#define QEMU_VFIO_DEBUG 0
+
+#define QEMU_VFIO_IOVA_MIN 0x10000ULL
+/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability inter=
face,
+ * we can use a runtime limit; alternatively it's also possible to do plat=
form
+ * specific detection by reading sysfs entries. Until then, 39 is a safe b=
et.
+ **/
+#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
+
+typedef struct {
+    /* Page aligned addr. */
+    void *host;
+    size_t size;
+    uint64_t iova;
+} IOVAMapping;
+
+struct QEMUVFIOState {
+    QemuMutex lock;
+
+    /* These fields are protected by BQL */
+    int container;
+    int group;
+    int device;
+    RAMBlockNotifier ram_notifier;
+    struct vfio_region_info config_region_info, bar_region_info[6];
+
+    /* These fields are protected by @lock */
+    /* VFIO's IO virtual address space is managed by splitting into a few
+     * sections:
+     *
+     * ---------------       <=3D 0
+     * |xxxxxxxxxxxxx|
+     * |-------------|       <=3D QEMU_VFIO_IOVA_MIN
+     * |             |
+     * |    Fixed    |
+     * |             |
+     * |-------------|       <=3D low_water_mark
+     * |             |
+     * |    Free     |
+     * |             |
+     * |-------------|       <=3D high_water_mark
+     * |             |
+     * |    Temp     |
+     * |             |
+     * |-------------|       <=3D QEMU_VFIO_IOVA_MAX
+     * |xxxxxxxxxxxxx|
+     * |xxxxxxxxxxxxx|
+     * ---------------
+     *
+     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+     *
+     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
+     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will n=
ot be
+     *   reclaimed - low_water_mark never shrinks;
+     *
+     * - IOVAs in range [low_water_mark, high_water_mark) are free;
+     *
+     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole=
 area
+     *   is recycled. The caller should make sure I/O's depending on these
+     *   mappings are completed before calling.
+     **/
+    uint64_t low_water_mark;
+    uint64_t high_water_mark;
+    IOVAMapping *mappings;
+    int nr_mappings;
+};
+
+/**
+ * Return the VFIO group file path of PCI device @device; the caller must
+ * g_free() it. On failure return NULL and set @errp.
+ */
+static char *sysfs_find_group_file(const char *device, Error **errp)
+{
+    char *sysfs_link;
+    char *sysfs_group;
+    char *p;
+    char *path =3D NULL;
+
+    sysfs_link =3D g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", =
device);
+    sysfs_group =3D g_malloc0(PATH_MAX); /* readlink() adds no NUL */
+    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) =3D=3D -1) {
+        error_setg_errno(errp, errno, "Failed to find iommu group sysfs pa=
th");
+        goto out;
+    }
+    p =3D strrchr(sysfs_group, '/');
+    if (!p) {
+        error_setg(errp, "Failed to find iommu group number");
+        goto out;
+    }
+
+    path =3D g_strdup_printf("/dev/vfio/%s", p + 1);
+out:
+    g_free(sysfs_link);
+    g_free(sysfs_group);
+    return path;
+}
+
+static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+{
+    assert(index >=3D 0 && index < ARRAY_SIZE(s->bar_region_info));
+}
+
+static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **err=
p)
+{
+    assert_bar_index_valid(s, index);
+    s->bar_region_info[index] =3D (struct vfio_region_info) {
+        .index =3D VFIO_PCI_BAR0_REGION_INDEX + index,
+        .argsz =3D sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[=
index])) {
+        error_setg_errno(errp, errno, "Failed to get BAR region info");
+        return -errno;
+    }
+
+    return 0;
+}
+
+/**
+ * Map a PCI bar area.
+ */
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, Error **errp)
+{
+    void *p;
+    assert_bar_index_valid(s, index);
+    p =3D mmap(NULL, MIN(8192, s->bar_region_info[index].size),
+             PROT_READ | PROT_WRITE, MAP_SHARED,
+             s->device, s->bar_region_info[index].offset);
+    if (p =3D=3D MAP_FAILED) {
+        error_setg_errno(errp, errno, "Failed to map BAR region");
+        p =3D NULL;
+    }
+    return p;
+}
+
+/**
+ * Unmap a PCI bar area.
+ */
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar)
+{
+    if (bar) {
+        munmap(bar, MIN(8192, s->bar_region_info[index].size));
+    }
+}
+
+/**
+ * Initialize device IRQ with @irq_type and register an event notifier.
+ */
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+                           int irq_type, Error **errp)
+{
+    int r;
+    struct vfio_irq_set *irq_set;
+    size_t irq_set_size;
+    struct vfio_irq_info irq_info =3D { .argsz =3D sizeof(irq_info) };
+
+    irq_info.index =3D irq_type;
+    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
+        error_setg_errno(errp, errno, "Failed to get device interrupt info=
");
+        return -errno;
+    }
+    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+        error_setg(errp, "Device interrupt doesn't support eventfd");
+        return -EINVAL;
+    }
+
+    irq_set_size =3D sizeof(*irq_set) + sizeof(int);
+    irq_set =3D g_malloc0(irq_set_size);
+
+    /* Get to a known IRQ state */
+    *irq_set =3D (struct vfio_irq_set) {
+        .argsz =3D irq_set_size,
+        .flags =3D VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index =3D irq_info.index,
+        .start =3D 0,
+        .count =3D 1,
+    };
+
+    *(int *)&irq_set->data =3D event_notifier_get_fd(e);
+    r =3D ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (r) {
+        error_setg_errno(errp, errno, "Failed to setup device interrupt");
+        return -errno;
+    }
+    return 0;
+}
+
+static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+                                     int size, int ofs)
+{
+    int ret;
+
+    do {
+        ret =3D pread(s->device, buf, size, s->config_region_info.offset +=
 ofs);
+    } while (ret =3D=3D -1 && errno =3D=3D EINTR);
+    return ret =3D=3D size ? 0 : (ret < 0 ? -errno : -EIO);
+}
+
+static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int siz=
e, int ofs)
+{
+    int ret;
+
+    do {
+        ret =3D pwrite(s->device, buf, size, s->config_region_info.offset =
+ ofs);
+    } while (ret =3D=3D -1 && errno =3D=3D EINTR);
+    return ret =3D=3D size ? 0 : (ret < 0 ? -errno : -EIO);
+}
+
+static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+                              Error **errp)
+{
+    int ret;
+    int i;
+    uint16_t pci_cmd;
+    struct vfio_group_status group_status =3D { .argsz =3D sizeof(group_st=
atus) };
+    struct vfio_iommu_type1_info iommu_info =3D { .argsz =3D sizeof(iommu_=
info) };
+    struct vfio_device_info device_info =3D { .argsz =3D sizeof(device_inf=
o) };
+    char *group_file =3D NULL;
+
+    /* Create a new container */
+    s->container =3D open("/dev/vfio/vfio", O_RDWR);
+
+    if (s->container =3D=3D -1) {
+        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
+        return -errno;
+    }
+    if (ioctl(s->container, VFIO_GET_API_VERSION) !=3D VFIO_API_VERSION) {
+        error_setg(errp, "Invalid VFIO version");
+        ret =3D -EINVAL;
+        goto fail_container;
+    }
+
+    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+        ret =3D -EINVAL;
+        goto fail_container;
+    }
+
+    /* Open the group */
+    group_file =3D sysfs_find_group_file(device, errp);
+    if (!group_file) {
+        ret =3D -EINVAL;
+        goto fail_container;
+    }
+
+    s->group =3D open(group_file, O_RDWR);
+    if (s->group =3D=3D -1) {
+        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
+                         group_file);
+        g_free(group_file);
+        ret =3D -errno;
+        goto fail_container;
+    }
+    g_free(group_file);
+
+    /* Test the group is viable and available */
+    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
+        error_setg_errno(errp, errno, "Failed to get VFIO group status");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_setg(errp, "VFIO group is not viable");
+        ret =3D -EINVAL;
+        goto fail;
+    }
+
+    /* Add the group to the container */
+    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
+        error_setg_errno(errp, errno, "Failed to add group to VFIO contain=
er");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    /* Enable the IOMMU model we want */
+    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    /* Get additional IOMMU info */
+    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+        error_setg_errno(errp, errno, "Failed to get IOMMU info");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    s->device =3D ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+    if (s->device < 0) {
+        error_setg_errno(errp, errno, "Failed to get device fd");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    /* Test and setup the device */
+    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+        error_setg_errno(errp, errno, "Failed to get device info");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
+        error_setg(errp, "Invalid device regions");
+        ret =3D -EINVAL;
+        goto fail;
+    }
+
+    s->config_region_info =3D (struct vfio_region_info) {
+        .index =3D VFIO_PCI_CONFIG_REGION_INDEX,
+        .argsz =3D sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_in=
fo)) {
+        error_setg_errno(errp, errno, "Failed to get config region info");
+        ret =3D -errno;
+        goto fail;
+    }
+
+    for (i =3D 0; i < 6; i++) {
+        ret =3D qemu_vfio_pci_init_bar(s, i, errp);
+        if (ret) {
+            goto fail;
+        }
+    }
+
+    /* Enable bus master */
+    ret =3D qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_CO=
MMAND);
+    if (ret) {
+        goto fail;
+    }
+    pci_cmd |=3D PCI_COMMAND_MASTER;
+    ret =3D qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_C=
OMMAND);
+    if (ret) {
+        goto fail;
+    }
+    return 0;
+fail:
+    close(s->group);
+fail_container:
+    close(s->container);
+    return ret;
+}
+
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
+                                      void *host, size_t size)
+{
+    QEMUVFIOState *s =3D container_of(n, QEMUVFIOState, ram_notifier);
+    trace_qemu_vfio_ram_block_added(s, host, size);
+    qemu_vfio_dma_map(s, host, size, false, NULL);
+}
+
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
+                                        void *host, size_t size)
+{
+    QEMUVFIOState *s =3D container_of(n, QEMUVFIOState, ram_notifier);
+    if (host) {
+        trace_qemu_vfio_ram_block_removed(s, host, size);
+        qemu_vfio_dma_unmap(s, host);
+    }
+}
+
+static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    int ret;
+    QEMUVFIOState *s =3D opaque;
+
+    if (!host_addr) {
+        return 0;
+    }
+    ret =3D qemu_vfio_dma_map(s, host_addr, length, false, NULL);
+    if (ret) {
+        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %ld\n",
+                host_addr, length);
+    }
+    return 0;
+}
+
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+    qemu_mutex_init(&s->lock); /* qemu_vfio_dma_map() locks it */
+    s->ram_notifier.ram_block_added =3D qemu_vfio_ram_block_added;
+    s->ram_notifier.ram_block_removed =3D qemu_vfio_ram_block_removed;
+    ram_block_notifier_add(&s->ram_notifier);
+    s->low_water_mark =3D QEMU_VFIO_IOVA_MIN;
+    s->high_water_mark =3D QEMU_VFIO_IOVA_MAX;
+    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+    int r;
+    QEMUVFIOState *s =3D g_new0(QEMUVFIOState, 1);
+
+    r =3D qemu_vfio_init_pci(s, device, errp);
+    if (r) {
+        g_free(s);
+        return NULL;
+    }
+    qemu_vfio_open_common(s);
+    return s;
+}
+
+static void qemu_vfio_dump_mapping(IOVAMapping *m)
+{
+    if (QEMU_VFIO_DEBUG) {
+        printf("  vfio mapping %p %lx to %lx\n", m->host, m->size, m->iova=
);
+    }
+}
+
+static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+{
+    int i;
+
+    if (QEMU_VFIO_DEBUG) {
+        printf("vfio mappings\n");
+        for (i =3D 0; i < s->nr_mappings; ++i) {
+            qemu_vfio_dump_mapping(&s->mappings[i]);
+        }
+    }
+}
+
+/**
+ * Find the mapping entry that contains [host, host + size) and set @index=
 to
+ * the position. If no entry contains it, @index is the position _after_ w=
hich
+ * to insert the new mapping. IOW, it is the index of the largest element =
that
+ * is smaller than @host, or -1 if no entry is.
+ */
+static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
+                                           int *index)
+{
+    IOVAMapping *p =3D s->mappings;
+    IOVAMapping *q =3D p ? p + s->nr_mappings - 1 : NULL;
+    IOVAMapping *mid;
+    trace_qemu_vfio_find_mapping(s, host);
+    if (!p) {
+        *index =3D -1;
+        return NULL;
+    }
+    while (true) {
+        mid =3D p + (q - p) / 2;
+        if (mid =3D=3D p) {
+            break;
+        }
+        if (mid->host > host) {
+            q =3D mid;
+        } else if (mid->host < host) {
+            p =3D mid;
+        } else {
+            break;
+        }
+    }
+    if (mid->host > host) {
+        mid--;
+    } else if (mid < &s->mappings[s->nr_mappings - 1]
+               && (mid + 1)->host <=3D host) {
+        mid++;
+    }
+    *index =3D mid - &s->mappings[0];
+    if (mid >=3D &s->mappings[0] &&
+        mid->host <=3D host && mid->host + mid->size > host) {
+        assert(mid < &s->mappings[s->nr_mappings]);
+        return mid;
+    }
+    /* At this point *index + 1 is the right position to insert the new
+     * mapping.*/
+    return NULL;
+}
+
+/**
+ * Create a new mapping record for @iova and insert it at @index in @s.
+ */
+static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
+                                          void *host, size_t size,
+                                          int index, uint64_t iova)
+{
+    int shift;
+    IOVAMapping m =3D {.host =3D host, .size =3D size, .iova =3D iova};
+    IOVAMapping *insert;
+
+    assert(QEMU_IS_ALIGNED(size, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
+    trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+
+    assert(index >=3D 0);
+    s->nr_mappings++;
+    s->mappings =3D g_realloc_n(s->mappings, s->nr_mappings,
+                              sizeof(s->mappings[0]));
+    insert =3D &s->mappings[index];
+    shift =3D s->nr_mappings - index - 1;
+    if (shift) {
+        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
+    }
+    *insert =3D m;
+    return insert;
+}
+
+/* Do the DMA mapping with VFIO. */
+static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+                                uint64_t iova)
+{
+    struct vfio_iommu_type1_dma_map dma_map =3D {
+        .argsz =3D sizeof(dma_map),
+        .flags =3D VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+        .iova =3D iova,
+        .vaddr =3D (uintptr_t)host,
+        .size =3D size,
+    };
+    trace_qemu_vfio_do_mapping(s, host, size, iova);
+
+    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+        error_report("VFIO_MAP_DMA: %d", -errno);
+        return -errno;
+    }
+    return 0;
+}
+
+/**
+ * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+                                   Error **errp)
+{
+    int index;
+    struct vfio_iommu_type1_dma_unmap unmap =3D {
+        .argsz =3D sizeof(unmap),
+        .flags =3D 0,
+        .iova =3D mapping->iova,
+        .size =3D mapping->size,
+    };
+
+    index =3D mapping - s->mappings;
+    assert(mapping->size > 0);
+    assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
+    assert(index >=3D 0 && index < s->nr_mappings);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
+    }
+    memmove(mapping, &s->mappings[index + 1],
+            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+    s->nr_mappings--;
+    s->mappings =3D g_realloc_n(s->mappings, s->nr_mappings,
+                              sizeof(s->mappings[0]));
+}
+
+/* Check if the mapping list is (ascending) ordered. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+    int i;
+    if (QEMU_VFIO_DEBUG) {
+        for (i =3D 0; i < s->nr_mappings - 1; ++i) {
+            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
+                fprintf(stderr, "item %d not sorted!\n", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+            if (!(s->mappings[i].host + s->mappings[i].size <=3D
+                  s->mappings[i + 1].host)) {
+                fprintf(stderr, "item %d overlap with next!\n", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+/* Map [host, host + size) area into a contiguous IOVA address space, and =
store
+ * the result in @iova if not NULL. The caller needs to make sure the =
area is
+ * aligned to page size, and mustn't overlap with existing mapping areas (=
split
+ * mapping status within this area is not allowed).
+ */
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+                      bool temporary, uint64_t *iova)
+{
+    int ret =3D 0;
+    int index;
+    IOVAMapping *mapping;
+    uint64_t iova0;
+
+    assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
+    assert(QEMU_IS_ALIGNED(size, getpagesize()));
+    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
+    qemu_mutex_lock(&s->lock);
+    mapping =3D qemu_vfio_find_mapping(s, host, &index);
+    if (mapping) {
+        iova0 =3D mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->h=
ost);
+    } else {
+        if (s->high_water_mark - s->low_water_mark + 1 < size) {
+            ret =3D -ENOMEM;
+            goto out;
+        }
+        if (!temporary) {
+            iova0 =3D s->low_water_mark;
+            mapping =3D qemu_vfio_add_mapping(s, host, size, index + 1, io=
va0);
+            if (!mapping) {
+                ret =3D -ENOMEM;
+                goto out;
+            }
+            assert(qemu_vfio_verify_mappings(s));
+            ret =3D qemu_vfio_do_mapping(s, host, size, iova0);
+            if (ret) {
+                qemu_vfio_undo_mapping(s, mapping, NULL);
+                goto out;
+            }
+            s->low_water_mark +=3D size;
+            qemu_vfio_dump_mappings(s);
+        } else {
+            iova0 =3D s->high_water_mark - size;
+            ret =3D qemu_vfio_do_mapping(s, host, size, iova0);
+            if (ret) {
+                goto out;
+            }
+            s->high_water_mark -=3D size;
+        }
+    }
+    if (iova) {
+        *iova =3D iova0;
+    }
+out:
+    qemu_mutex_unlock(&s->lock); /* error paths land here too */
+    return ret;
+}
+
+/* Reset the high watermark and free all "temporary" mappings. */
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
+{
+    struct vfio_iommu_type1_dma_unmap unmap =3D {
+        .argsz =3D sizeof(unmap),
+        .flags =3D 0,
+        .iova =3D s->high_water_mark,
+        .size =3D QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+    };
+    trace_qemu_vfio_dma_reset_temporary(s);
+    qemu_mutex_lock(&s->lock);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_report("VFIO_UNMAP_DMA: %d", -errno);
+        qemu_mutex_unlock(&s->lock);
+        return -errno;
+    }
+    s->high_water_mark =3D QEMU_VFIO_IOVA_MAX;
+    qemu_mutex_unlock(&s->lock);
+    return 0;
+}
+
+/* Unmapping the whole area that was previously mapped with
+ * qemu_vfio_dma_map(). */
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
+{
+    int index =3D 0;
+    IOVAMapping *m;
+
+    if (!host) {
+        return;
+    }
+
+    trace_qemu_vfio_dma_unmap(s, host);
+    qemu_mutex_lock(&s->lock);
+    m =3D qemu_vfio_find_mapping(s, host, &index);
+    if (!m) {
+        goto out;
+    }
+    qemu_vfio_undo_mapping(s, m, NULL);
+out:
+    qemu_mutex_unlock(&s->lock);
+}
+
+static void qemu_vfio_reset(QEMUVFIOState *s)
+{
+    ioctl(s->device, VFIO_DEVICE_RESET);
+}
+
+/* Close and free the VFIO resources. */
+void qemu_vfio_close(QEMUVFIOState *s)
+{
+    int i;
+
+    if (!s) {
+        return;
+    }
+    for (i =3D s->nr_mappings - 1; i >=3D 0; --i) { /* undo removes [i] */
+        qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
+    }
+    ram_block_notifier_remove(&s->ram_notifier);
+    qemu_vfio_reset(s);
+    close(s->device);
+    close(s->group);
+    close(s->container);
+}
--=20
2.14.3