[RFC PATCH v4 18/20] vhost: Add VhostIOVATree

Posted by Eugenio Pérez 4 years, 4 months ago
This tree is able to look up a translated address from an IOVA address.

At first glance it is similar to util/iova-tree. However, SVQ working on
devices with limited IOVA space needs more capabilities, like allocating
IOVA chunks or performing reverse translations (qemu addresses to iova).

The allocation capability, as in "assign a free IOVA address to this
chunk of memory in qemu's address space", allows the shadow virtqueue to
create a new address space that is not restricted by the guest's
addressable one, so we can allocate the shadow vqs vrings outside of its
reach, and outside of qemu's as well. At the moment the allocation only
grows; deletion is not supported.

A different name could be used, but "ordered searchable array" is a
little bit long.

It duplicates the array so it can search efficiently in both directions,
and it will signal an overlap if the iova or the translated address is
already present in its respective array.

Use of the array will be changed to util/iova-tree in a future series.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-iova-tree.h |  40 +++++++
 hw/virtio/vhost-iova-tree.c | 230 ++++++++++++++++++++++++++++++++++++
 hw/virtio/meson.build       |   2 +-
 3 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-iova-tree.c

diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
new file mode 100644
index 0000000000..d163a88905
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.h
@@ -0,0 +1,40 @@
+/*
+ * vhost software live migration ring
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
+
+#include "exec/memory.h"
+
+typedef struct VhostDMAMap {
+    void *translated_addr;
+    hwaddr iova;
+    hwaddr size;                /* Inclusive */
+    IOMMUAccessFlags perm;
+} VhostDMAMap;
+
+typedef enum VhostDMAMapNewRC {
+    VHOST_DMA_MAP_NO_SPACE = -3,
+    VHOST_DMA_MAP_OVERLAP = -2,
+    VHOST_DMA_MAP_INVALID = -1,
+    VHOST_DMA_MAP_OK = 0,
+} VhostDMAMapNewRC;
+
+typedef struct VhostIOVATree VhostIOVATree;
+
+VhostIOVATree *vhost_iova_tree_new(void);
+void vhost_iova_tree_unref(VhostIOVATree *iova_rm);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_unref);
+
+const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_rm,
+                                             const VhostDMAMap *map);
+VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *iova_rm,
+                                       VhostDMAMap *map);
+
+#endif
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
new file mode 100644
index 0000000000..c284e27607
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.c
@@ -0,0 +1,230 @@
+/*
+ * vhost software live migration ring
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "vhost-iova-tree.h"
+
+#define G_ARRAY_NOT_ZERO_TERMINATED false
+#define G_ARRAY_NOT_CLEAR_ON_ALLOC false
+
+#define iova_min qemu_real_host_page_size
+
+/**
+ * VhostIOVATree, able to:
+ * - Translate iova address
+ * - Reverse translate iova address (from translated to iova)
+ * - Allocate IOVA regions for translated range (potentially slow operation)
+ *
+ * Note that it cannot remove nodes.
+ */
+struct VhostIOVATree {
+    /* Ordered array of reverse translations, IOVA address to qemu memory. */
+    GArray *iova_taddr_map;
+
+    /*
+     * Ordered array of translations from qemu virtual memory address to iova
+     */
+    GArray *taddr_iova_map;
+};
+
+/**
+ * Inserts an element after an existing one in garray.
+ *
+ * @array      The array
+ * @prev_elem  The previous element of the array, or NULL if prepending
+ * @map        The DMA map
+ *
+ * It provides the additional advantage of being type safe over
+ * g_array_insert_val, which accepts a reference pointer instead of a value
+ * without complaint.
+ */
+static void vhost_iova_tree_insert_after(GArray *array,
+                                         const VhostDMAMap *prev_elem,
+                                         const VhostDMAMap *map)
+{
+    size_t pos;
+
+    if (!prev_elem) {
+        pos = 0;
+    } else {
+        pos = prev_elem - &g_array_index(array, typeof(*prev_elem), 0) + 1;
+    }
+
+    g_array_insert_val(array, pos, *map);
+}
+
+static gint vhost_iova_tree_cmp_taddr(gconstpointer a, gconstpointer b)
+{
+    const VhostDMAMap *m1 = a, *m2 = b;
+
+    if (m1->translated_addr > m2->translated_addr + m2->size) {
+        return 1;
+    }
+
+    if (m1->translated_addr + m1->size < m2->translated_addr) {
+        return -1;
+    }
+
+    /* Overlapped */
+    return 0;
+}
+
+/**
+ * Find the previous node to a given iova
+ *
+ * @array  The ascending ordered-by-translated-addr array of VhostDMAMap
+ * @map    The map to insert
+ * @prev   Returned location of the previous map
+ *
+ * Return VHOST_DMA_MAP_OK if everything went well, or VHOST_DMA_MAP_OVERLAP if
+ * it already exists. It is ok to use this function to check if a given range
+ * exists, but it will use a linear search.
+ *
+ * TODO: We can use bsearch to locate the entry if we save the state in the
+ * needle, knowing that the needle is always the first argument to
+ * compare_func.
+ */
+static VhostDMAMapNewRC vhost_iova_tree_find_prev(const GArray *array,
+                                                  GCompareFunc compare_func,
+                                                  const VhostDMAMap *map,
+                                                  const VhostDMAMap **prev)
+{
+    size_t i;
+    int r;
+
+    *prev = NULL;
+    for (i = 0; i < array->len; ++i) {
+        r = compare_func(map, &g_array_index(array, typeof(*map), i));
+        if (r == 0) {
+            return VHOST_DMA_MAP_OVERLAP;
+        }
+        if (r < 0) {
+            return VHOST_DMA_MAP_OK;
+        }
+
+        *prev = &g_array_index(array, typeof(**prev), i);
+    }
+
+    return VHOST_DMA_MAP_OK;
+}
+
+/**
+ * Create a new IOVA tree
+ *
+ * Returns the new IOVA tree
+ */
+VhostIOVATree *vhost_iova_tree_new(void)
+{
+    VhostIOVATree *tree = g_new(VhostIOVATree, 1);
+    tree->iova_taddr_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
+                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
+                                       sizeof(VhostDMAMap));
+    tree->taddr_iova_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
+                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
+                                       sizeof(VhostDMAMap));
+    return tree;
+}
+
+/**
+ * Destroy an IOVA tree
+ *
+ * @tree  The iova tree
+ */
+void vhost_iova_tree_unref(VhostIOVATree *tree)
+{
+    g_array_unref(g_steal_pointer(&tree->iova_taddr_map));
+    g_array_unref(g_steal_pointer(&tree->taddr_iova_map));
+}
+
+/**
+ * Find the IOVA address stored from a memory address
+ *
+ * @tree     The iova tree
+ * @map      The map with the memory address
+ *
+ * Return the stored mapping, or NULL if not found.
+ */
+const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
+                                             const VhostDMAMap *map)
+{
+    /*
+     * This can be replaced with g_array_binary_search (Since glib 2.62) when
+     * that version become common enough.
+     */
+    return bsearch(map, tree->taddr_iova_map->data, tree->taddr_iova_map->len,
+                   sizeof(*map), vhost_iova_tree_cmp_taddr);
+}
+
+static bool vhost_iova_tree_find_iova_hole(const GArray *iova_map,
+                                           const VhostDMAMap *map,
+                                           const VhostDMAMap **prev_elem)
+{
+    size_t i;
+    hwaddr iova = iova_min;
+
+    *prev_elem = NULL;
+    for (i = 0; i < iova_map->len; i++) {
+        const VhostDMAMap *next = &g_array_index(iova_map, typeof(*next), i);
+        hwaddr hole_end = next->iova;
+        if (map->size < hole_end - iova) {
+            return true;
+        }
+
+        iova = next->iova + next->size + 1;
+        *prev_elem = next;
+    }
+
+    return ((hwaddr)-1 - iova) >= map->size;
+}
+
+/**
+ * Allocate a new mapping
+ *
+ * @tree  The iova tree
+ * @map   The iova map
+ *
+ * Returns:
+ * - VHOST_DMA_MAP_OK if the map fits in the container
+ * - VHOST_DMA_MAP_INVALID if the map does not make sense (like size overflow)
+ * - VHOST_DMA_MAP_OVERLAP if the tree already contains that map
+ * - VHOST_DMA_MAP_NO_SPACE if iova_rm cannot allocate more space.
+ *
+ * It returns the assigned iova in map->iova if return value is VHOST_DMA_MAP_OK.
+ */
+VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *tree,
+                                       VhostDMAMap *map)
+{
+    const VhostDMAMap *qemu_prev, *iova_prev;
+    int find_prev_rc;
+    bool fit;
+
+    if (map->translated_addr + map->size < map->translated_addr ||
+        map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
+        return VHOST_DMA_MAP_INVALID;
+    }
+
+    /* Search for a hole in iova space big enough */
+    fit = vhost_iova_tree_find_iova_hole(tree->iova_taddr_map, map,
+                                         &iova_prev);
+    if (!fit) {
+        return VHOST_DMA_MAP_NO_SPACE;
+    }
+
+    map->iova = iova_prev ? (iova_prev->iova + iova_prev->size) + 1 : iova_min;
+    find_prev_rc = vhost_iova_tree_find_prev(tree->taddr_iova_map,
+                                             vhost_iova_tree_cmp_taddr, map,
+                                             &qemu_prev);
+    if (find_prev_rc == VHOST_DMA_MAP_OVERLAP) {
+        return VHOST_DMA_MAP_OVERLAP;
+    }
+
+    vhost_iova_tree_insert_after(tree->iova_taddr_map, iova_prev, map);
+    vhost_iova_tree_insert_after(tree->taddr_iova_map, qemu_prev, map);
+    return VHOST_DMA_MAP_OK;
+}
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 8b5a0225fe..cb306b83c6 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
 
 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
-- 
2.27.0
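
For context, a minimal usage sketch of the allocation API added by this
patch (the caller below is hypothetical and not part of the series):

    /* Hypothetical caller, for illustration only: register a qemu buffer
     * and get a free IOVA assigned to it. */
    static int example_map_region(VhostIOVATree *tree, void *qemu_addr,
                                  hwaddr len)
    {
        VhostDMAMap map = {
            .translated_addr = qemu_addr,
            .size = len - 1,                /* size is inclusive */
            .perm = IOMMU_RW,
        };

        if (vhost_iova_tree_alloc(tree, &map) != VHOST_DMA_MAP_OK) {
            return -1;                      /* invalid, overlap or no space */
        }
        /* map.iova now holds the IOVA assigned to [qemu_addr, qemu_addr + len) */
        return 0;
    }

The reverse direction, vhost_iova_tree_find_iova(), is exercised later in
the series and is what the replies below refer to as reverse translation.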


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> This tree is able to look for a translated address from an IOVA address.
>
> At first glance is similar to util/iova-tree. However, SVQ working on
> devices with limited IOVA space need more capabilities, like allocating
> IOVA chunks or perform reverse translations (qemu addresses to iova).


I don't see any reverse translation used in the shadow code. Or is
there anything I missed?


>
> The allocation capability, as "assign a free IOVA address to this chunk
> of memory in qemu's address space" allows shadow virtqueue to create a
> new address space that is not restricted by guest's addressable one, so
> we can allocate shadow vqs vrings outside of its reachability, nor
> qemu's one. At the moment, the allocation is just done growing, not
> allowing deletion.
>
> A different name could be used, but ordered searchable array is a
> little bit long though.
>
> It duplicates the array so it can search efficiently both directions,
> and it will signal overlap if iova or the translated address is
> present in it's each array.
>
> Use of array will be changed to util-iova-tree in future series.


Adding Peter.

It looks to me the only thing missed is the iova allocator. And it
looks to me it's better to decouple the allocator from the iova tree.

Then we had:

1) initialize iova range
2) iova = iova_alloc(size)
3) built the iova tree map
4) buffer forwarding
5) iova_free(size)
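
For illustration, a decoupled allocator along those lines might look as
follows (all names and signatures below are assumptions, not code from
this series):

    /* Hypothetical stand-alone IOVA allocator, decoupled from the tree. */
    typedef struct VhostIOVAAllocator VhostIOVAAllocator;

    /* 1) initialize with the usable iova range */
    VhostIOVAAllocator *vhost_iova_allocator_new(hwaddr iova_first,
                                                 hwaddr iova_last);

    /* 2) iova = iova_alloc(size); returns (hwaddr)-1 when exhausted */
    hwaddr vhost_iova_alloc(VhostIOVAAllocator *a, hwaddr size);

    /* 5) give the range back so it can be reused */
    void vhost_iova_free(VhostIOVAAllocator *a, hwaddr iova, hwaddr size);

    /*
     * 3) and 4): the iova tree itself would then only store the
     * iova <-> qemu vaddr translations used for buffer forwarding,
     * without implementing any allocation policy.
     */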


>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-iova-tree.h |  40 +++++++
>   hw/virtio/vhost-iova-tree.c | 230 ++++++++++++++++++++++++++++++++++++
>   hw/virtio/meson.build       |   2 +-
>   3 files changed, 271 insertions(+), 1 deletion(-)
>   create mode 100644 hw/virtio/vhost-iova-tree.h
>   create mode 100644 hw/virtio/vhost-iova-tree.c
>
> diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
> new file mode 100644
> index 0000000000..d163a88905
> --- /dev/null
> +++ b/hw/virtio/vhost-iova-tree.h
> @@ -0,0 +1,40 @@
> +/*
> + * vhost software live migration ring
> + *
> + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
> + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
> +#define HW_VIRTIO_VHOST_IOVA_TREE_H
> +
> +#include "exec/memory.h"
> +
> +typedef struct VhostDMAMap {
> +    void *translated_addr;
> +    hwaddr iova;
> +    hwaddr size;                /* Inclusive */
> +    IOMMUAccessFlags perm;
> +} VhostDMAMap;
> +
> +typedef enum VhostDMAMapNewRC {
> +    VHOST_DMA_MAP_NO_SPACE = -3,
> +    VHOST_DMA_MAP_OVERLAP = -2,
> +    VHOST_DMA_MAP_INVALID = -1,
> +    VHOST_DMA_MAP_OK = 0,
> +} VhostDMAMapNewRC;
> +
> +typedef struct VhostIOVATree VhostIOVATree;
> +
> +VhostIOVATree *vhost_iova_tree_new(void);
> +void vhost_iova_tree_unref(VhostIOVATree *iova_rm);
> +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_unref);
> +
> +const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_rm,
> +                                             const VhostDMAMap *map);
> +VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *iova_rm,
> +                                       VhostDMAMap *map);
> +
> +#endif
> diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
> new file mode 100644
> index 0000000000..c284e27607
> --- /dev/null
> +++ b/hw/virtio/vhost-iova-tree.c
> @@ -0,0 +1,230 @@
> +/*
> + * vhost software live migration ring
> + *
> + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
> + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "vhost-iova-tree.h"
> +
> +#define G_ARRAY_NOT_ZERO_TERMINATED false
> +#define G_ARRAY_NOT_CLEAR_ON_ALLOC false
> +
> +#define iova_min qemu_real_host_page_size
> +
> +/**
> + * VhostIOVATree, able to:
> + * - Translate iova address
> + * - Reverse translate iova address (from translated to iova)
> + * - Allocate IOVA regions for translated range (potentially slow operation)
> + *
> + * Note that it cannot remove nodes.
> + */
> +struct VhostIOVATree {
> +    /* Ordered array of reverse translations, IOVA address to qemu memory. */
> +    GArray *iova_taddr_map;
> +
> +    /*
> +     * Ordered array of translations from qemu virtual memory address to iova
> +     */
> +    GArray *taddr_iova_map;
> +};


Any reason for using GArray? Is it faster?


> +
> +/**
> + * Inserts an element after an existing one in garray.
> + *
> + * @array      The array
> + * @prev_elem  The previous element of array of NULL if prepending
> + * @map        The DMA map
> + *
> + * It provides the aditional advantage of being type safe over
> + * g_array_insert_val, which accepts a reference pointer instead of a value
> + * with no complains.
> + */
> +static void vhost_iova_tree_insert_after(GArray *array,
> +                                         const VhostDMAMap *prev_elem,
> +                                         const VhostDMAMap *map)
> +{
> +    size_t pos;
> +
> +    if (!prev_elem) {
> +        pos = 0;
> +    } else {
> +        pos = prev_elem - &g_array_index(array, typeof(*prev_elem), 0) + 1;
> +    }
> +
> +    g_array_insert_val(array, pos, *map);
> +}
> +
> +static gint vhost_iova_tree_cmp_taddr(gconstpointer a, gconstpointer b)
> +{
> +    const VhostDMAMap *m1 = a, *m2 = b;
> +
> +    if (m1->translated_addr > m2->translated_addr + m2->size) {
> +        return 1;
> +    }
> +
> +    if (m1->translated_addr + m1->size < m2->translated_addr) {
> +        return -1;
> +    }
> +
> +    /* Overlapped */
> +    return 0;
> +}
> +
> +/**
> + * Find the previous node to a given iova
> + *
> + * @array  The ascending ordered-by-translated-addr array of VhostDMAMap
> + * @map    The map to insert
> + * @prev   Returned location of the previous map
> + *
> + * Return VHOST_DMA_MAP_OK if everything went well, or VHOST_DMA_MAP_OVERLAP if
> + * it already exists. It is ok to use this function to check if a given range
> + * exists, but it will use a linear search.
> + *
> + * TODO: We can use bsearch to locate the entry if we save the state in the
> + * needle, knowing that the needle is always the first argument to
> + * compare_func.
> + */
> +static VhostDMAMapNewRC vhost_iova_tree_find_prev(const GArray *array,
> +                                                  GCompareFunc compare_func,
> +                                                  const VhostDMAMap *map,
> +                                                  const VhostDMAMap **prev)
> +{
> +    size_t i;
> +    int r;
> +
> +    *prev = NULL;
> +    for (i = 0; i < array->len; ++i) {
> +        r = compare_func(map, &g_array_index(array, typeof(*map), i));
> +        if (r == 0) {
> +            return VHOST_DMA_MAP_OVERLAP;
> +        }
> +        if (r < 0) {
> +            return VHOST_DMA_MAP_OK;
> +        }
> +
> +        *prev = &g_array_index(array, typeof(**prev), i);
> +    }
> +
> +    return VHOST_DMA_MAP_OK;
> +}
> +
> +/**
> + * Create a new IOVA tree
> + *
> + * Returns the new IOVA tree
> + */
> +VhostIOVATree *vhost_iova_tree_new(void)
> +{


So I think it needs to be initialized with the range we get from 
get_iova_range().
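
For illustration, that could be as simple as the following signature
change (an assumption, not code from this series):

    /* Hypothetical: take the usable range reported by the device, e.g. as
     * obtained through VHOST_VDPA_GET_IOVA_RANGE, at creation time. */
    VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last);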

Thanks


> +    VhostIOVATree *tree = g_new(VhostIOVATree, 1);
> +    tree->iova_taddr_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
> +                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
> +                                       sizeof(VhostDMAMap));
> +    tree->taddr_iova_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
> +                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
> +                                       sizeof(VhostDMAMap));
> +    return tree;
> +}
> +
> +/**
> + * Destroy an IOVA tree
> + *
> + * @tree  The iova tree
> + */
> +void vhost_iova_tree_unref(VhostIOVATree *tree)
> +{
> +    g_array_unref(g_steal_pointer(&tree->iova_taddr_map));
> +    g_array_unref(g_steal_pointer(&tree->taddr_iova_map));
> +}
> +
> +/**
> + * Find the IOVA address stored from a memory address
> + *
> + * @tree     The iova tree
> + * @map      The map with the memory address
> + *
> + * Return the stored mapping, or NULL if not found.
> + */
> +const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
> +                                             const VhostDMAMap *map)
> +{
> +    /*
> +     * This can be replaced with g_array_binary_search (Since glib 2.62) when
> +     * that version become common enough.
> +     */
> +    return bsearch(map, tree->taddr_iova_map->data, tree->taddr_iova_map->len,
> +                   sizeof(*map), vhost_iova_tree_cmp_taddr);
> +}
> +
> +static bool vhost_iova_tree_find_iova_hole(const GArray *iova_map,
> +                                           const VhostDMAMap *map,
> +                                           const VhostDMAMap **prev_elem)
> +{
> +    size_t i;
> +    hwaddr iova = iova_min;
> +
> +    *prev_elem = NULL;
> +    for (i = 0; i < iova_map->len; i++) {
> +        const VhostDMAMap *next = &g_array_index(iova_map, typeof(*next), i);
> +        hwaddr hole_end = next->iova;
> +        if (map->size < hole_end - iova) {
> +            return true;
> +        }
> +
> +        iova = next->iova + next->size + 1;
> +        *prev_elem = next;
> +    }
> +
> +    return ((hwaddr)-1 - iova) > iova_map->len;
> +}
> +
> +/**
> + * Allocate a new mapping
> + *
> + * @tree  The iova tree
> + * @map   The iova map
> + *
> + * Returns:
> + * - VHOST_DMA_MAP_OK if the map fits in the container
> + * - VHOST_DMA_MAP_INVALID if the map does not make sense (like size overflow)
> + * - VHOST_DMA_MAP_OVERLAP if the tree already contains that map
> + * - VHOST_DMA_MAP_NO_SPACE if iova_rm cannot allocate more space.
> + *
> + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
> + */
> +VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *tree,
> +                                       VhostDMAMap *map)
> +{
> +    const VhostDMAMap *qemu_prev, *iova_prev;
> +    int find_prev_rc;
> +    bool fit;
> +
> +    if (map->translated_addr + map->size < map->translated_addr ||
> +        map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
> +        return VHOST_DMA_MAP_INVALID;
> +    }
> +
> +    /* Search for a hole in iova space big enough */
> +    fit = vhost_iova_tree_find_iova_hole(tree->iova_taddr_map, map,
> +                                         &iova_prev);
> +    if (!fit) {
> +        return VHOST_DMA_MAP_NO_SPACE;
> +    }
> +
> +    map->iova = iova_prev ? (iova_prev->iova + iova_prev->size) + 1 : iova_min;
> +    find_prev_rc = vhost_iova_tree_find_prev(tree->taddr_iova_map,
> +                                             vhost_iova_tree_cmp_taddr, map,
> +                                             &qemu_prev);
> +    if (find_prev_rc == VHOST_DMA_MAP_OVERLAP) {
> +        return VHOST_DMA_MAP_OVERLAP;
> +    }
> +
> +    vhost_iova_tree_insert_after(tree->iova_taddr_map, iova_prev, map);
> +    vhost_iova_tree_insert_after(tree->taddr_iova_map, qemu_prev, map);
> +    return VHOST_DMA_MAP_OK;
> +}
> diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
> index 8b5a0225fe..cb306b83c6 100644
> --- a/hw/virtio/meson.build
> +++ b/hw/virtio/meson.build
> @@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
>   
>   virtio_ss = ss.source_set()
>   virtio_ss.add(files('virtio.c'))
> -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
> +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
>   virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
>   virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
>   virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > This tree is able to look for a translated address from an IOVA address.
> >
> > At first glance is similar to util/iova-tree. However, SVQ working on
> > devices with limited IOVA space need more capabilities, like allocating
> > IOVA chunks or perform reverse translations (qemu addresses to iova).
>
>
> I don't see any reverse translation is used in the shadow code. Or
> anything I missed?

Ok, it looks to me that it is used in the iova allocator. But I think
it's better to decouple it into an independent allocator instead of
the vhost iova tree.

Thanks


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > This tree is able to look for a translated address from an IOVA address.
> > >
> > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > devices with limited IOVA space need more capabilities, like allocating
> > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> >
> >
> > I don't see any reverse translation is used in the shadow code. Or
> > anything I missed?
>
> Ok, it looks to me that it is used in the iova allocator. But I think
> it's better to decouple it to an independent allocator instead of
> vhost iova tree.
>

Reverse translation is used every time a buffer is made available,
since buffer contents are not copied, only the descriptors, to the SVQ
vring.

At this point all the limits are copied to the vhost iova tree in the
next revision I will send, defined at its creation in
vhost_iova_tree_new(). They live outside of util/iova-tree and are only
passed to the latter at allocation time.

Since vhost_iova_tree has its own vhost_iova_tree_alloc(), which wraps
iova_tree_alloc() [1], the limits could be kept in vhost-vdpa and
passed as an argument to vhost_iova_tree_alloc. But I'm not sure if
that is what you are proposing or if I'm missing something.

Either way, I think it is harder to talk about this specific case
without code, since this one still does not address the limits. Would
you prefer me to send another RFC in WIP quality, with *not* all
comments addressed? I would say that there is not a lot of pending
work to send the next one, but it might be easier for all of us.

Thanks!

[1] This util/iova-tree method will be proposed in the next series,
and vhost_iova_tree wraps it since it needs to keep in sync both
trees: iova->qemu vaddr for iova allocation and the reverse one to
translate available buffers.

> Thanks
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > >
> > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > This tree is able to look for a translated address from an IOVA address.
> > > >
> > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > devices with limited IOVA space need more capabilities, like allocating
> > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > >
> > >
> > > I don't see any reverse translation is used in the shadow code. Or
> > > anything I missed?
> >
> > Ok, it looks to me that it is used in the iova allocator. But I think
> > it's better to decouple it to an independent allocator instead of
> > vhost iova tree.
> >
>
> Reverse translation is used every time a buffer is made available,
> since buffers content are not copied, only the descriptors to SVQ
> vring.

I may miss something but I didn't see the code? Qemu knows the VA of
virtqueue, and the VA of the VQ is stored in the VirtQueueElem?

>
> At this point all the limits are copied to vhost iova tree in the next
> revision I will send, defined at its creation at
> vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> to the latter at allocation time.
>
> Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> what you are proposing or I'm missing something.

If the reverse translation is only used in iova allocation, I meant to
split the logic of IOVA allocation itself.

>
> Either way, I think it is harder to talk about this specific case
> without code, since this one still does not address the limits. Would
> you prefer me to send another RFC in WIP quality, with *not* all
> comments addressed? I would say that there is not a lot of pending
> work to send the next one, but it might be easier for all of us.

I'd prefer to try to address them all, otherwise it's not easy to see
what is missing.

Thanks

>
> Thanks!
>
> [1] This util/iova-tree method will be proposed in the next series,
> and vhost_iova_tree wraps it since it needs to keep in sync both
> trees: iova->qemu vaddr for iova allocation and the reverse one to
> translate available buffers.
>
> > Thanks
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > >
> > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > This tree is able to look for a translated address from an IOVA address.
> > > > >
> > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > >
> > > >
> > > > I don't see any reverse translation is used in the shadow code. Or
> > > > anything I missed?
> > >
> > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > it's better to decouple it to an independent allocator instead of
> > > vhost iova tree.
> > >
> >
> > Reverse translation is used every time a buffer is made available,
> > since buffers content are not copied, only the descriptors to SVQ
> > vring.
>
> I may miss something but I didn't see the code? Qemu knows the VA of
> virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
>

It's used in patch 20/20, could that be the misunderstanding? The
function calling it is vhost_svq_translate_addr.

Qemu knows the VA of the buffer, but it must offer a valid SVQ
iova to the device. That is the translation I mean.
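
Roughly, the translation being discussed here looks like this (a sketch
based on the description in this thread, not the actual patch 20/20
code; the helper name is made up):

    /* Sketch: turn a qemu VA from virtqueue_pop() into the SVQ iova the
     * device can use, via the reverse (translated addr -> iova) array. */
    static hwaddr svq_translate_addr_sketch(const VhostIOVATree *tree,
                                            void *vaddr, size_t len)
    {
        const VhostDMAMap needle = {
            .translated_addr = vaddr,
            .size = len ? len - 1 : 0,   /* VhostDMAMap sizes are inclusive */
        };
        const VhostDMAMap *map = vhost_iova_tree_find_iova(tree, &needle);

        if (!map) {
            return (hwaddr)-1;           /* address not exposed to the device */
        }
        return map->iova + ((char *)vaddr - (char *)map->translated_addr);
    }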

> >
> > At this point all the limits are copied to vhost iova tree in the next
> > revision I will send, defined at its creation at
> > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > to the latter at allocation time.
> >
> > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > what you are proposing or I'm missing something.
>
> If the reverse translation is only used in iova allocation, I meant to
> split the logic of IOVA allocation itself.
>

Still don't understand it, sorry :). In the SVQ setup we allocate an
iova address for every GPA address the guest's driver can use. After
that there should be no allocation unless memory is hotplugged.

So the limits are only needed precisely at allocation time. Not sure
if that is what you mean here, but first allocating and then checking
whether the result is within the range could lead to false negatives,
since there could be a valid free range *inside* the limits but the
iova allocator returned us another range that fell outside of them.
How could we know the cause if the allocator is not using the range
itself?

> >
> > Either way, I think it is harder to talk about this specific case
> > without code, since this one still does not address the limits. Would
> > you prefer me to send another RFC in WIP quality, with *not* all
> > comments addressed? I would say that there is not a lot of pending
> > work to send the next one, but it might be easier for all of us.
>
> I'd prefer to try to address them all, otherwise it's not easy to see
> what is missing.
>

Got it, I will do it that way then!

Thanks!

> Thanks
>
> >
> > Thanks!
> >
> > [1] This util/iova-tree method will be proposed in the next series,
> > and vhost_iova_tree wraps it since it needs to keep in sync both
> > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > translate available buffers.
> >
> > > Thanks
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 8:07 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > >
> > > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > > This tree is able to look for a translated address from an IOVA address.
> > > > > >
> > > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > > >
> > > > >
> > > > > I don't see any reverse translation is used in the shadow code. Or
> > > > > anything I missed?
> > > >
> > > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > > it's better to decouple it to an independent allocator instead of
> > > > vhost iova tree.
> > > >
> > >
> > > Reverse translation is used every time a buffer is made available,
> > > since buffers content are not copied, only the descriptors to SVQ
> > > vring.
> >
> > I may miss something but I didn't see the code? Qemu knows the VA of
> > virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
> >
>
> It's used in the patch 20/20, could that be the misunderstanding? The
> function calling it is vhost_svq_translate_addr.
>
> Qemu knows the VA address of the buffer, but it must offer a valid SVQ
> iova to the device. That is the translation I mean.

Ok, I get you. So if I understand correctly, what you did is:

1) allocate IOVA during region_add
2) perform VA->IOVA reverse lookup in handle_kick

This should be fine, but here're some suggestions:

1) remove the assert(map) in vhost_svq_translate_addr() since the guest
can add e.g. a BAR address
2) we probably need a better name vhost_iova_tree_alloc(), maybe
"vhost_iova_tree_map_alloc()"

There's actually another method.

1) don't do IOVA/map allocation in region_add()
2) do the allocation in handle_kick(), then we know the IOVA so no
reverse lookup

The advantage is that this can work for the case of vIOMMU. And they
should perform the same:

1) your method avoids the iova allocation per sg
2) my method avoids the reverse lookup per sg
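
A sketch of that alternative, for illustration only (the helper below is
hypothetical and just reuses the VhostIOVATree API from this patch):

    /* Sketch of method 2: allocate the IOVA per descriptor at handle_kick
     * time instead of at region_add(), so the IOVA is known without any
     * VA -> IOVA reverse lookup. */
    static hwaddr svq_map_desc_sketch(VhostIOVATree *tree, void *vaddr,
                                      size_t len, bool is_write)
    {
        VhostDMAMap map = {
            .translated_addr = vaddr,      /* VA from virtqueue_pop() */
            .size = len ? len - 1 : 0,     /* inclusive */
            .perm = is_write ? IOMMU_RW : IOMMU_RO,
        };

        if (vhost_iova_tree_alloc(tree, &map) != VHOST_DMA_MAP_OK) {
            return (hwaddr)-1;
        }
        /* The new iova -> vaddr map still has to be pushed to the device
         * and torn down once the descriptor is used. */
        return map.iova;
    }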

>
> > >
> > > At this point all the limits are copied to vhost iova tree in the next
> > > revision I will send, defined at its creation at
> > > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > > to the latter at allocation time.
> > >
> > > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > > what you are proposing or I'm missing something.
> >
> > If the reverse translation is only used in iova allocation, I meant to
> > split the logic of IOVA allocation itself.
> >
>
> Still don't understand it, sorry :). In SVQ setup we allocate an iova
> address for every guest's GPA address its driver can use. After that
> there should be no allocation unless memory is hotplugged.
>
> So the limits are only needed precisely at allocation time. Not sure
> if that is what you mean here, but to first allocate and then check if
> it is within the range could lead to false negatives, since there
> could be a valid range *in* the address but the iova allocator
> returned us another range that fell outside the range. How could we
> know the cause if it is not using the range itself?

See my above reply. And we can teach the iova allocator to return the
IOVA in the range that vhost-vDPA supports.

Thanks

>
> > >
> > > Either way, I think it is harder to talk about this specific case
> > > without code, since this one still does not address the limits. Would
> > > you prefer me to send another RFC in WIP quality, with *not* all
> > > comments addressed? I would say that there is not a lot of pending
> > > work to send the next one, but it might be easier for all of us.
> >
> > I'd prefer to try to address them all, otherwise it's not easy to see
> > what is missing.
> >
>
> Got it, I will do it that way then!
>
> Thanks!
>
> > Thanks
> >
> > >
> > > Thanks!
> > >
> > > [1] This util/iova-tree method will be proposed in the next series,
> > > and vhost_iova_tree wraps it since it needs to keep in sync both
> > > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > > translate available buffers.
> > >
> > > > Thanks
> > > >
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Thu, Oct 21, 2021 at 4:34 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 8:07 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > >
> > > > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > > > This tree is able to look for a translated address from an IOVA address.
> > > > > > >
> > > > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > > > >
> > > > > >
> > > > > > I don't see any reverse translation is used in the shadow code. Or
> > > > > > anything I missed?
> > > > >
> > > > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > > > it's better to decouple it to an independent allocator instead of
> > > > > vhost iova tree.
> > > > >
> > > >
> > > > Reverse translation is used every time a buffer is made available,
> > > > since buffers content are not copied, only the descriptors to SVQ
> > > > vring.
> > >
> > > I may miss something but I didn't see the code? Qemu knows the VA of
> > > virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
> > >
> >
> > It's used in the patch 20/20, could that be the misunderstanding? The
> > function calling it is vhost_svq_translate_addr.
> >
> > Qemu knows the VA address of the buffer, but it must offer a valid SVQ
> > iova to the device. That is the translation I mean.
>
> Ok, I get you. So if I understand correctly, what you did is:
>
> 1) allocate IOVA during region_add
> 2) preform VA->IOVA reverse lookup in handle_kick
>
> This should be fine, but here're some suggestions:
>
> 1) remove the assert(map) in vhost_svq_translate_addr() since guest
> can add e.g BAR address

Wouldn't VirtQueue block them in the virtqueue_pop / address_space_read_*
functions? I'm fine with removing it, but I would say it adds value
against coding errors.

> 2) we probably need a better name vhost_iova_tree_alloc(), maybe
> "vhost_iova_tree_map_alloc()"
>

Ok I will change for the next version.

> There's actually another method.
>
> 1) don't do IOVA/map allocation in region_add()
> 2) do the allocation in handle_kick(), then we know the IOVA so no
> reverse lookup
>
> The advantage is that this can work for the case of vIOMMU. And they
> should perform the same:
>
> 1) you method avoid the iova allocation per sg
> 2) my method avoid the reverse lookup per sg
>

It's somehow doable, but we are replacing a tree search with a linear
insertion at this moment.

I would say that guest's IOVA -> qemu vaddr part works with no change
for vIOMMU, since VirtQueue's virtqueue_pop already gives us the vaddr
even in the case of vIOMMU. The only change I would add for that case
is the SVQ -> device map/unmapping part, so the device cannot access
random addresses but only the exposed ones. I'm assuming that part is
O(1).

This way, we already have a tree with all the possible guest
addresses, and we only need to look up its SVQ iova -> vaddr
translation. This is an O(log(N)) operation, and read only, so it is
easily parallelizable when we make each SVQ run in its own thread (if
needed). The only thing left is to expose that with an iommu miss
(O(1)) and to unmap it when processing used buffers (also O(1)). The
dominant operation is still VirtQueue's own lookup of guest IOVA ->
GPA, which I'm assuming is already well optimized and will benefit
from future optimizations, since qemu's memory system is frequently
used.

To optimize your use case we would need to add a custom (and smarter
than the currently used) allocator to SVQ. I've been looking for ways
to reuse glibc or similar in our own arenas but with no luck. It will
be code that SVQ needs to maintain by and for itself anyway.

In either case it should not be hard to switch to your method, just a
few call changes in the future, if we achieve a faster allocator.

Would that make sense?

> >
> > > >
> > > > At this point all the limits are copied to vhost iova tree in the next
> > > > revision I will send, defined at its creation at
> > > > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > > > to the latter at allocation time.
> > > >
> > > > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > > > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > > > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > > > what you are proposing or I'm missing something.
> > >
> > > If the reverse translation is only used in iova allocation, I meant to
> > > split the logic of IOVA allocation itself.
> > >
> >
> > Still don't understand it, sorry :). In SVQ setup we allocate an iova
> > address for every guest's GPA address its driver can use. After that
> > there should be no allocation unless memory is hotplugged.
> >
> > So the limits are only needed precisely at allocation time. Not sure
> > if that is what you mean here, but to first allocate and then check if
> > it is within the range could lead to false negatives, since there
> > could be a valid range *in* the address but the iova allocator
> > returned us another range that fell outside the range. How could we
> > know the cause if it is not using the range itself?
>
> See my above reply. And we can teach the iova allocator to return the
> IOVA in the range that vhost-vDPA supports.
>

Ok,

For the next series it will be that way. I'm pretty sure we are
aligned in this part, but the lack of code in this series makes it
very hard to discuss it :).

Thanks!

> Thanks
>
> >
> > > >
> > > > Either way, I think it is harder to talk about this specific case
> > > > without code, since this one still does not address the limits. Would
> > > > you prefer me to send another RFC in WIP quality, with *not* all
> > > > comments addressed? I would say that there is not a lot of pending
> > > > work to send the next one, but it might be easier for all of us.
> > >
> > > I'd prefer to try to address them all, otherwise it's not easy to see
> > > what is missing.
> > >
> >
> > Got it, I will do it that way then!
> >
> > Thanks!
> >
> > > Thanks
> > >
> > > >
> > > > Thanks!
> > > >
> > > > [1] This util/iova-tree method will be proposed in the next series,
> > > > and vhost_iova_tree wraps it since it needs to keep in sync both
> > > > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > > > translate available buffers.
> > > >
> > > > > Thanks
> > > > >
> > > >
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
On Thu, Oct 21, 2021 at 3:03 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Thu, Oct 21, 2021 at 4:34 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 8:07 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > >
> > > > > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > > > > This tree is able to look for a translated address from an IOVA address.
> > > > > > > >
> > > > > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > > > > >
> > > > > > >
> > > > > > > I don't see any reverse translation is used in the shadow code. Or
> > > > > > > anything I missed?
> > > > > >
> > > > > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > > > > it's better to decouple it to an independent allocator instead of
> > > > > > vhost iova tree.
> > > > > >
> > > > >
> > > > > Reverse translation is used every time a buffer is made available,
> > > > > since buffers content are not copied, only the descriptors to SVQ
> > > > > vring.
> > > >
> > > > I may miss something but I didn't see the code? Qemu knows the VA of
> > > > virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
> > > >
> > >
> > > It's used in the patch 20/20, could that be the misunderstanding? The
> > > function calling it is vhost_svq_translate_addr.
> > >
> > > Qemu knows the VA address of the buffer, but it must offer a valid SVQ
> > > iova to the device. That is the translation I mean.
> >
> > Ok, I get you. So if I understand correctly, what you did is:
> >
> > 1) allocate IOVA during region_add
> > 2) preform VA->IOVA reverse lookup in handle_kick
> >
> > This should be fine, but here're some suggestions:
> >
> > 1) remove the assert(map) in vhost_svq_translate_addr() since guest
> > can add e.g BAR address
>
> Wouldn't VirtQueue block them in virtqueue_pop / address_space_read_*
> functions? I'm fine to remove it but I would say it adds value against
> coding error.

I think not. Though these addresses were excluded in
vhost_vdpa_listener_skipped_section(), for the Qemu memory core they
are valid addresses. Qemu emulates how the hardware works (e.g. PCI
p2p), so DMA to a BAR is allowed.

>
> > 2) we probably need a better name vhost_iova_tree_alloc(), maybe
> > "vhost_iova_tree_map_alloc()"
> >
>
> Ok I will change for the next version.
>
> > There's actually another method.
> >
> > 1) don't do IOVA/map allocation in region_add()
> > 2) do the allocation in handle_kick(), then we know the IOVA so no
> > reverse lookup
> >
> > The advantage is that this can work for the case of vIOMMU. And they
> > should perform the same:
> >
> > 1) you method avoid the iova allocation per sg
> > 2) my method avoid the reverse lookup per sg
> >
>
> It's somehow doable, but we are replacing a tree search with a linear
> insertion at this moment.
>
> I would say that guest's IOVA -> qemu vaddr part works with no change
> for vIOMMU, since VirtQueue's virtqueue_pop already gives us the vaddr
> even in the case of vIOMMU.

So in this case:

1) the listener gives us GPA -> host IOVA (the host IOVA is allocated per GPA)
2) virtqueue_pop gives us guest IOVA -> VA

We still need extra logic to look up the vIOMMU to get the guest IOVA
-> GPA, and then we can know the host IOVA.

If we allocate after virtqueue_pop(), we can follow the same logic as
without vIOMMU. Just allocate a host IOVA and all is done.

> The only change I would add for that case
> is the SVQ -> device map/unmapping part, so the device cannot access
> random addresses but only the exposed ones. I'm assuming that part is
> O(1).
>
> This way, we already have a tree with all the possible guest's
> addresses, and we only need to look for it's SVQ iova -> vaddr
> translation. This is a O(log(N)) operation,

Yes, but it requires traversing the vIOMMU page table, which should be
slower than our own iova tree?

> and read only, so it's
> easily parallelizable when we make each SVQ in it's own thread (if
> needed).

Yes, this is because the host IOVA was allocated before by the memory listener.

> The only thing left is to expose that with an iommu miss
> (O(1)) and unmap it on used buffers processing (also O(1)). The
> domination operation keeps being VirtQueue's own code lookup for
> guest's IOVA -> GPA, which I'm assuming is already well optimized and
> will benefit from future optimizations since qemu's memory system is
> frequently used.
>
> To optimize your use case we would need to add a custom (and smarter
> than the currently used) allocator to SVQ. I've been looking for ways
> to reuse glibc or similar in our own arenas but with no luck. It will
> be code that SVQ needs to maintain by and for itself anyway.

The benefit is to have the iova allocation separate from the tree.

>
> In either case it should not be hard to switch to your method, just a
> few call changes in the future, if we achieve a faster allocator.
>
> Would that make sense?

Yes, feel free to choose any method you wish or feel simpler in the next series.

>
> > >
> > > > >
> > > > > At this point all the limits are copied to vhost iova tree in the next
> > > > > revision I will send, defined at its creation at
> > > > > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > > > > to the latter at allocation time.
> > > > >
> > > > > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > > > > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > > > > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > > > > what you are proposing or I'm missing something.
> > > >
> > > > If the reverse translation is only used in iova allocation, I meant to
> > > > split the logic of IOVA allocation itself.
> > > >
> > >
> > > Still don't understand it, sorry :). In SVQ setup we allocate an iova
> > > address for every guest's GPA address its driver can use. After that
> > > there should be no allocation unless memory is hotplugged.
> > >
> > > So the limits are only needed precisely at allocation time. Not sure
> > > if that is what you mean here, but to first allocate and then check if
> > > it is within the range could lead to false negatives, since there
> > > could be a valid range *in* the address but the iova allocator
> > > returned us another range that fell outside the range. How could we
> > > know the cause if it is not using the range itself?
> >
> > See my above reply. And we can teach the iova allocator to return the
> > IOVA in the range that vhost-vDPA supports.
> >
>
> Ok,
>
> For the next series it will be that way. I'm pretty sure we are
> aligned in this part, but the lack of code in this series makes it
> very hard to discuss it :).

Fine. Let's see.

Thanks

>
> Thanks!
>
> > Thanks
> >
> > >
> > > > >
> > > > > Either way, I think it is harder to talk about this specific case
> > > > > without code, since this one still does not address the limits. Would
> > > > > you prefer me to send another RFC in WIP quality, with *not* all
> > > > > comments addressed? I would say that there is not a lot of pending
> > > > > work to send the next one, but it might be easier for all of us.
> > > >
> > > > I'd prefer to try to address them all, otherwise it's not easy to see
> > > > what is missing.
> > > >
> > >
> > > Got it, I will do it that way then!
> > >
> > > Thanks!
> > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks!
> > > > >
> > > > > [1] This util/iova-tree method will be proposed in the next series,
> > > > > and vhost_iova_tree wraps it since it needs to keep in sync both
> > > > > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > > > > translate available buffers.
> > > > >
> > > > > > Thanks
> > > > > >
> > > > >
> > > >
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Thu, Oct 21, 2021 at 10:12 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Oct 21, 2021 at 3:03 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Thu, Oct 21, 2021 at 4:34 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 8:07 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > >
> > > > > > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > > > > > This tree is able to look for a translated address from an IOVA address.
> > > > > > > > >
> > > > > > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > > > > > >
> > > > > > > >
> > > > > > > > I don't see any reverse translation is used in the shadow code. Or
> > > > > > > > anything I missed?
> > > > > > >
> > > > > > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > > > > > it's better to decouple it to an independent allocator instead of
> > > > > > > vhost iova tree.
> > > > > > >
> > > > > >
> > > > > > Reverse translation is used every time a buffer is made available,
> > > > > > since buffers content are not copied, only the descriptors to SVQ
> > > > > > vring.
> > > > >
> > > > > I may miss something but I didn't see the code? Qemu knows the VA of
> > > > > virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
> > > > >
> > > >
> > > > It's used in the patch 20/20, could that be the misunderstanding? The
> > > > function calling it is vhost_svq_translate_addr.
> > > >
> > > > Qemu knows the VA address of the buffer, but it must offer a valid SVQ
> > > > iova to the device. That is the translation I mean.
> > >
> > > Ok, I get you. So if I understand correctly, what you did is:
> > >
> > > 1) allocate IOVA during region_add
> > > 2) preform VA->IOVA reverse lookup in handle_kick
> > >
> > > This should be fine, but here're some suggestions:
> > >
> > > 1) remove the assert(map) in vhost_svq_translate_addr() since guest
> > > can add e.g BAR address
> >
> > Wouldn't VirtQueue block them in virtqueue_pop / address_space_read_*
> > functions? I'm fine to remove it but I would say it adds value against
> > coding error.
>
> I think not. Though these addresses were excluded in
> vhost_vdpa_listener_skipped_section(). For Qemu memory core, they are
> valid addresses. Qemu emulate how hardware work (e.g pci p2p), so dma
> to bar is allowed.
>

Ok I will treat them as errors.

> >
> > > 2) we probably need a better name vhost_iova_tree_alloc(), maybe
> > > "vhost_iova_tree_map_alloc()"
> > >
> >
> > Ok I will change for the next version.
> >
> > > There's actually another method.
> > >
> > > 1) don't do IOVA/map allocation in region_add()
> > > 2) do the allocation in handle_kick(), then we know the IOVA so no
> > > reverse lookup
> > >
> > > The advantage is that this can work for the case of vIOMMU. And they
> > > should perform the same:
> > >
> > > 1) you method avoid the iova allocation per sg
> > > 2) my method avoid the reverse lookup per sg
> > >
> >
> > It's somehow doable, but we are replacing a tree search with a linear
> > insertion at this moment.
> >
> > I would say that guest's IOVA -> qemu vaddr part works with no change
> > for vIOMMU, since VirtQueue's virtqueue_pop already gives us the vaddr
> > even in the case of vIOMMU.
>
> So in this case:
>
> 1) listener gives us GPA->host IOVA (host IOVA is allocated per GPA)

Right, that was a miss from my side, I think I get your point way better now.

So now vhost-iova-tree translates GPA -> host IOVA in the vIOMMU case,
and it is updated at the same frequency as guest physical memory
hotplug / unplug (rarely during migration, I guess). There are special
entries for the SVQ vrings, which the tree does not map to a GPA for
obvious reasons, so they cannot be located when looking up by GPA.

Let's also assume that only the SVQ vrings have been sent to the device
as IOMMU / IOTLB maps, with the relation host IOVA -> qemu's VA.
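
To make that concrete, a minimal sketch (not code from the series) of how
a listener could populate such a tree, assuming the guest-memory entries
are keyed by GPA in the vIOMMU case while the SVQ vring entries only carry
a qemu VA; it reuses VhostDMAMap and vhost_iova_tree_alloc() from this
patch, and the GPA-keyed variant is an assumption of mine:

#include "qemu/osdep.h"
#include "hw/virtio/vhost-iova-tree.h"

/* Hedged sketch: listener-side allocation of host IOVA ranges */
static int example_add_guest_region(VhostIOVATree *tree, hwaddr gpa,
                                    hwaddr size)
{
    VhostDMAMap map = {
        .translated_addr = (void *)(uintptr_t)gpa, /* assumed GPA key */
        .size = size - 1,                          /* inclusive size */
        .perm = IOMMU_RW,
    };

    /* On success, map.iova holds the freshly allocated host IOVA */
    return vhost_iova_tree_alloc(tree, &map);
}

static int example_add_svq_vring(VhostIOVATree *tree, void *vring_va,
                                 hwaddr size)
{
    VhostDMAMap map = {
        .translated_addr = vring_va,    /* qemu VA, no GPA backing */
        .size = size - 1,
        .perm = IOMMU_RW,
    };

    return vhost_iova_tree_alloc(tree, &map);
}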

> 2) virtqueue_pop gives us guest IOVA -> VA
>
> We still need extra logic to lookup the vIOMMU to get the guest IOVA
> GPA then we can know the host IOVA.
>

That's roughly right, but I think this does not need to be *another*
search, insertion, etc. Please see below.

> If we allocate after virtqueue_pop(), we can follow the same logic as
> without vIOMMU. Just allocate an host IOVA then all is done.
>
> > The only change I would add for that case
> > is the SVQ -> device map/unmapping part, so the device cannot access
> > random addresses but only the exposed ones. I'm assuming that part is
> > O(1).
> >
> > This way, we already have a tree with all the possible guest's
> > addresses, and we only need to look for it's SVQ iova -> vaddr
> > translation. This is a O(log(N)) operation,
>
> Yes, but it's requires traverse the vIOMMU page table which should be
> slower than our own iova tree?
>

The lookup over the vIOMMU does not need to be performed twice, since
virtqueue_pop already does it. We already have that data here, we just
need to extract it. Not saying that is complicated, just that I didn't
dedicate a lot of time to figure out how. The call trace is:

#0  address_space_translate_iommu
    (iommu_mr, xlat, plen_out, page_mask_out, is_write, is_mmio,
target_as, attrs) at ../softmmu/physmem.c:418
#1  flatview_do_translate
    (fv, addr, xlat, plen_out, page_mask_out, is_write, is_mmio,
target_as, attrs) at ../softmmu/physmem.c:505
#2  flatview_translate
    (fv, addr, xlat, plen, is_write, attrs) at ../softmmu/physmem.c:565
#3  address_space_map (as, addr, plen, is_write, attrs)
    at ../softmmu/physmem.c:3183
#4  dma_memory_map (as, addr, len, dir)
    at /home/qemu/svq/include/sysemu/dma.h:202
#5  virtqueue_map_desc
    (vdev, p_num_sg, addr, iov, max_num_sg, is_write, pa, sz) at
../hw/virtio/virtio.c:1314
#6  virtqueue_split_pop (vq, sz) at ../hw/virtio/virtio.c:1488

So with that GPA we can locate its corresponding entry in the
vhost-iova-tree with a read-only, O(log(N)) operation. And the element's
address in qemu's VA space is not going to change until we mark it as used.

This process (the whole call trace) needs to be serialized somehow in
qemu's memory system internals; I'm just assuming that it will be
faster than anything we can do in SVQ with little effort, and it will
help to reduce duplication. If that is not the case, I think it is even
more beneficial to improve it than to reinvent it in SVQ.

After that, an IOMMU map needs to be sent to the device, as (the host
IOVA obtained from the tree, qemu's VA, length, ...). We may even
batch them. Another option is to wait for the miss(), but I think that
would be a waste of resources.

The reverse is also true with the unmapping: when we see a used
descriptor, IOTLB unmap(s) will be sent before forwarding the
descriptor to the guest as used.
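
A rough sketch of that ordering follows; example_device_map() and
example_device_unmap() are purely hypothetical placeholders for whatever
vhost-vdpa backend call ends up pushing the IOTLB updates to the device:

/* Hypothetical backend hooks, declared only for the sketch */
int example_device_map(hwaddr iova, void *vaddr, hwaddr len);
void example_device_unmap(hwaddr iova, hwaddr len);

static int example_make_available(hwaddr host_iova, void *qemu_va, hwaddr len)
{
    /* The map must reach the device before the descriptor is available */
    int r = example_device_map(host_iova, qemu_va, len);
    if (r != 0) {
        return r;
    }

    /* ... write the SVQ descriptor with host_iova and kick the device ... */
    return 0;
}

static void example_mark_used(hwaddr host_iova, hwaddr len)
{
    /* Unmap before the descriptor is flagged as used to the guest */
    example_device_unmap(host_iova, len);

    /* ... push the used descriptor to the guest's vring ... */
}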

> > and read only, so it's
> > easily parallelizable when we make each SVQ in it's own thread (if
> > needed).
>
> Yes, this is because the host IOVA was allocated before by the memory listener.
>

Right.

> > The only thing left is to expose that with an iommu miss
> > (O(1)) and unmap it on used buffers processing (also O(1)). The
> > domination operation keeps being VirtQueue's own code lookup for
> > guest's IOVA -> GPA, which I'm assuming is already well optimized and
> > will benefit from future optimizations since qemu's memory system is
> > frequently used.
> >
> > To optimize your use case we would need to add a custom (and smarter
> > than the currently used) allocator to SVQ. I've been looking for ways
> > to reuse glibc or similar in our own arenas but with no luck. It will
> > be code that SVQ needs to maintain by and for itself anyway.
>
> The benefit is to have separate iova allocation from the tree.
>
> >
> > In either case it should not be hard to switch to your method, just a
> > few call changes in the future, if we achieve a faster allocator.
> >
> > Would that make sense?
>
> Yes, feel free to choose any method you wish or feel simpler in the next series.
>
> >
> > > >
> > > > > >
> > > > > > At this point all the limits are copied to vhost iova tree in the next
> > > > > > revision I will send, defined at its creation at
> > > > > > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > > > > > to the latter at allocation time.
> > > > > >
> > > > > > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > > > > > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > > > > > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > > > > > what you are proposing or I'm missing something.
> > > > >
> > > > > If the reverse translation is only used in iova allocation, I meant to
> > > > > split the logic of IOVA allocation itself.
> > > > >
> > > >
> > > > Still don't understand it, sorry :). In SVQ setup we allocate an iova
> > > > address for every guest's GPA address its driver can use. After that
> > > > there should be no allocation unless memory is hotplugged.
> > > >
> > > > So the limits are only needed precisely at allocation time. Not sure
> > > > if that is what you mean here, but to first allocate and then check if
> > > > it is within the range could lead to false negatives, since there
> > > > could be a valid range *in* the address but the iova allocator
> > > > returned us another range that fell outside the range. How could we
> > > > know the cause if it is not using the range itself?
> > >
> > > See my above reply. And we can teach the iova allocator to return the
> > > IOVA in the range that vhost-vDPA supports.
> > >
> >
> > Ok,
> >
> > For the next series it will be that way. I'm pretty sure we are
> > aligned in this part, but the lack of code in this series makes it
> > very hard to discuss it :).
>
> Fine. Let's see.
>
> Thanks
>
> >
> > Thanks!
> >
> > > Thanks
> > >
> > > >
> > > > > >
> > > > > > Either way, I think it is harder to talk about this specific case
> > > > > > without code, since this one still does not address the limits. Would
> > > > > > you prefer me to send another RFC in WIP quality, with *not* all
> > > > > > comments addressed? I would say that there is not a lot of pending
> > > > > > work to send the next one, but it might be easier for all of us.
> > > > >
> > > > > I'd prefer to try to address them all, otherwise it's not easy to see
> > > > > what is missing.
> > > > >
> > > >
> > > > Got it, I will do it that way then!
> > > >
> > > > Thanks!
> > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks!
> > > > > >
> > > > > > [1] This util/iova-tree method will be proposed in the next series,
> > > > > > and vhost_iova_tree wraps it since it needs to keep in sync both
> > > > > > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > > > > > translate available buffers.
> > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Jason Wang 4 years, 3 months ago
On Thu, Oct 21, 2021 at 10:34 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Thu, Oct 21, 2021 at 10:12 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Oct 21, 2021 at 3:03 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Thu, Oct 21, 2021 at 4:34 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Oct 20, 2021 at 8:07 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Wed, Oct 20, 2021 at 11:01 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Oct 20, 2021 at 3:54 PM Eugenio Perez Martin
> > > > > > <eperezma@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Oct 19, 2021 at 11:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Oct 19, 2021 at 4:32 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > > > > > > > > > This tree is able to look for a translated address from an IOVA address.
> > > > > > > > > >
> > > > > > > > > > At first glance is similar to util/iova-tree. However, SVQ working on
> > > > > > > > > > devices with limited IOVA space need more capabilities, like allocating
> > > > > > > > > > IOVA chunks or perform reverse translations (qemu addresses to iova).
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > I don't see any reverse translation is used in the shadow code. Or
> > > > > > > > > anything I missed?
> > > > > > > >
> > > > > > > > Ok, it looks to me that it is used in the iova allocator. But I think
> > > > > > > > it's better to decouple it to an independent allocator instead of
> > > > > > > > vhost iova tree.
> > > > > > > >
> > > > > > >
> > > > > > > Reverse translation is used every time a buffer is made available,
> > > > > > > since buffers content are not copied, only the descriptors to SVQ
> > > > > > > vring.
> > > > > >
> > > > > > I may miss something but I didn't see the code? Qemu knows the VA of
> > > > > > virtqueue, and the VA of the VQ is stored in the VirtQueueElem?
> > > > > >
> > > > >
> > > > > It's used in the patch 20/20, could that be the misunderstanding? The
> > > > > function calling it is vhost_svq_translate_addr.
> > > > >
> > > > > Qemu knows the VA address of the buffer, but it must offer a valid SVQ
> > > > > iova to the device. That is the translation I mean.
> > > >
> > > > Ok, I get you. So if I understand correctly, what you did is:
> > > >
> > > > 1) allocate IOVA during region_add
> > > > 2) preform VA->IOVA reverse lookup in handle_kick
> > > >
> > > > This should be fine, but here're some suggestions:
> > > >
> > > > 1) remove the assert(map) in vhost_svq_translate_addr() since guest
> > > > can add e.g BAR address
> > >
> > > Wouldn't VirtQueue block them in virtqueue_pop / address_space_read_*
> > > functions? I'm fine to remove it but I would say it adds value against
> > > coding error.
> >
> > I think not. Though these addresses were excluded in
> > vhost_vdpa_listener_skipped_section(). For Qemu memory core, they are
> > valid addresses. Qemu emulate how hardware work (e.g pci p2p), so dma
> > to bar is allowed.
> >
>
> Ok I will treat them as errors.
>
> > >
> > > > 2) we probably need a better name vhost_iova_tree_alloc(), maybe
> > > > "vhost_iova_tree_map_alloc()"
> > > >
> > >
> > > Ok I will change for the next version.
> > >
> > > > There's actually another method.
> > > >
> > > > 1) don't do IOVA/map allocation in region_add()
> > > > 2) do the allocation in handle_kick(), then we know the IOVA so no
> > > > reverse lookup
> > > >
> > > > The advantage is that this can work for the case of vIOMMU. And they
> > > > should perform the same:
> > > >
> > > > 1) you method avoid the iova allocation per sg
> > > > 2) my method avoid the reverse lookup per sg
> > > >
> > >
> > > It's somehow doable, but we are replacing a tree search with a linear
> > > insertion at this moment.
> > >
> > > I would say that guest's IOVA -> qemu vaddr part works with no change
> > > for vIOMMU, since VirtQueue's virtqueue_pop already gives us the vaddr
> > > even in the case of vIOMMU.
> >
> > So in this case:
> >
> > 1) listener gives us GPA->host IOVA (host IOVA is allocated per GPA)
>
> Right, that was a miss from my side, I think I get your point way better now.
>
> So now vhost-iova-tree translates GPA -> host IOVA in the vIOMMU case,
> and it is updated at the same frequency as guest physical memory
> hotplug / unplug (rarely during migration, I guess). There are special
> entries for the SVQ vrings, which the tree does not map to a GPA for
> obvious reasons, so they cannot be located when looking up by GPA.

Yes.

>
> Let's also assume that only the SVQ vrings have been sent to the device
> as IOMMU / IOTLB maps, with the relation host IOVA -> qemu's VA.
>
> > 2) virtqueue_pop gives us guest IOVA -> VA
> >
> > We still need extra logic to lookup the vIOMMU to get the guest IOVA
> > GPA then we can know the host IOVA.
> >
>
> That's roughly right, but I think this does not need to be *another*
> search, insertion, etc. Please see below.
>
> > If we allocate after virtqueue_pop(), we can follow the same logic as
> > without vIOMMU. Just allocate an host IOVA then all is done.
> >
> > > The only change I would add for that case
> > > is the SVQ -> device map/unmapping part, so the device cannot access
> > > random addresses but only the exposed ones. I'm assuming that part is
> > > O(1).
> > >
> > > This way, we already have a tree with all the possible guest's
> > > addresses, and we only need to look for it's SVQ iova -> vaddr
> > > translation. This is a O(log(N)) operation,
> >
> > Yes, but it's requires traverse the vIOMMU page table which should be
> > slower than our own iova tree?
> >
>
> The lookup over the vIOMMU does not need to be performed twice, since
> virtqueue_pop already does it. We already have that data here, we just
> need to extract it.

By 'extract', do you mean fetching it from the vIOMMU's IOTLB via
address_space_get_iotlb_entry()? Yes, that would be faster and probably
O(1).
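
Something along these lines, just as a sketch of that idea (error handling
kept minimal; the offset handling mirrors how other callers use the
returned entry):

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Hedged sketch: recover the GPA behind a guest IOVA from the vIOMMU IOTLB */
static bool example_guest_iova_to_gpa(AddressSpace *as, hwaddr guest_iova,
                                      bool is_write, hwaddr *gpa)
{
    IOMMUTLBEntry iotlb = address_space_get_iotlb_entry(as, guest_iova,
                                                        is_write,
                                                        MEMTXATTRS_UNSPECIFIED);

    if (iotlb.perm == IOMMU_NONE) {
        return false;    /* no valid translation for this guest IOVA */
    }

    /* translated_addr is page aligned; add back the in-page offset */
    *gpa = iotlb.translated_addr + (guest_iova & iotlb.addr_mask);
    return true;
}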

> Not saying that is complicated, just that I didn't dedicate a lot of
> time to figure out how. The call trace is:
>
> #0  address_space_translate_iommu
>     (iommu_mr, xlat, plen_out, page_mask_out, is_write, is_mmio,
> target_as, attrs) at ../softmmu/physmem.c:418
> #1  flatview_do_translate
>     (fv, addr, xlat, plen_out, page_mask_out, is_write, is_mmio,
> target_as, attrs) at ../softmmu/physmem.c:505
> #2  flatview_translate
>     (fv, addr, xlat, plen, is_write, attrs) at ../softmmu/physmem.c:565
> #3  address_space_map (as, addr, plen, is_write, attrs)
>     at ../softmmu/physmem.c:3183
> #4  dma_memory_map (as, addr, len, dir)
>     at /home/qemu/svq/include/sysemu/dma.h:202
> #5  virtqueue_map_desc
>     (vdev, p_num_sg, addr, iov, max_num_sg, is_write, pa, sz) at
> ../hw/virtio/virtio.c:1314
> #6  virtqueue_split_pop (vq, sz) at ../hw/virtio/virtio.c:1488
>
> So with that GPA we can locate its corresponding entry in the
> vhost-iova-tree with a read-only, O(log(N)) operation. And the element's
> address in qemu's VA space is not going to change until we mark it as used.
>
> This process (the whole call trace) needs to be serialized somehow in
> qemu's memory system internals; I'm just assuming that it will be
> faster than anything we can do in SVQ with little effort, and it will
> help to reduce duplication. If that is not the case, I think it is even
> more beneficial to improve it than to reinvent it in SVQ.

I think so.

Thanks

>
> After that, an IOMMU map needs to be sent to the device, as (the host
> IOVA obtained from the tree, qemu's VA, length, ...). We may even
> batch them. Another option is to wait for the miss(), but I think that
> would be a waste of resources.
>
> The reverse is also true with the unmapping: when we see a used
> descriptor, IOTLB unmap(s) will be sent before forwarding the
> descriptor to the guest as used.
>
> > > and read only, so it's
> > > easily parallelizable when we make each SVQ in it's own thread (if
> > > needed).
> >
> > Yes, this is because the host IOVA was allocated before by the memory listener.
> >
>
> Right.
>
> > > The only thing left is to expose that with an iommu miss
> > > (O(1)) and unmap it on used buffers processing (also O(1)). The
> > > domination operation keeps being VirtQueue's own code lookup for
> > > guest's IOVA -> GPA, which I'm assuming is already well optimized and
> > > will benefit from future optimizations since qemu's memory system is
> > > frequently used.
> > >
> > > To optimize your use case we would need to add a custom (and smarter
> > > than the currently used) allocator to SVQ. I've been looking for ways
> > > to reuse glibc or similar in our own arenas but with no luck. It will
> > > be code that SVQ needs to maintain by and for itself anyway.
> >
> > The benefit is to have separate iova allocation from the tree.
> >
> > >
> > > In either case it should not be hard to switch to your method, just a
> > > few call changes in the future, if we achieve a faster allocator.
> > >
> > > Would that make sense?
> >
> > Yes, feel free to choose any method you wish or feel simpler in the next series.
> >
> > >
> > > > >
> > > > > > >
> > > > > > > At this point all the limits are copied to vhost iova tree in the next
> > > > > > > revision I will send, defined at its creation at
> > > > > > > vhost_iova_tree_new(). They are outside of util/iova-tree, only sent
> > > > > > > to the latter at allocation time.
> > > > > > >
> > > > > > > Since vhost_iova_tree has its own vhost_iova_tree_alloc(), that wraps
> > > > > > > the iova_tree_alloc() [1], limits could be kept in vhost-vdpa and make
> > > > > > > them an argument of vhost_iova_tree_alloc. But I'm not sure if it's
> > > > > > > what you are proposing or I'm missing something.
> > > > > >
> > > > > > If the reverse translation is only used in iova allocation, I meant to
> > > > > > split the logic of IOVA allocation itself.
> > > > > >
> > > > >
> > > > > Still don't understand it, sorry :). In SVQ setup we allocate an iova
> > > > > address for every guest's GPA address its driver can use. After that
> > > > > there should be no allocation unless memory is hotplugged.
> > > > >
> > > > > So the limits are only needed precisely at allocation time. Not sure
> > > > > if that is what you mean here, but to first allocate and then check if
> > > > > it is within the range could lead to false negatives, since there
> > > > > could be a valid range *in* the address but the iova allocator
> > > > > returned us another range that fell outside the range. How could we
> > > > > know the cause if it is not using the range itself?
> > > >
> > > > See my above reply. And we can teach the iova allocator to return the
> > > > IOVA in the range that vhost-vDPA supports.
> > > >
> > >
> > > Ok,
> > >
> > > For the next series it will be that way. I'm pretty sure we are
> > > aligned in this part, but the lack of code in this series makes it
> > > very hard to discuss it :).
> >
> > Fine. Let's see.
> >
> > Thanks
> >
> > >
> > > Thanks!
> > >
> > > > Thanks
> > > >
> > > > >
> > > > > > >
> > > > > > > Either way, I think it is harder to talk about this specific case
> > > > > > > without code, since this one still does not address the limits. Would
> > > > > > > you prefer me to send another RFC in WIP quality, with *not* all
> > > > > > > comments addressed? I would say that there is not a lot of pending
> > > > > > > work to send the next one, but it might be easier for all of us.
> > > > > >
> > > > > > I'd prefer to try to address them all, otherwise it's not easy to see
> > > > > > what is missing.
> > > > > >
> > > > >
> > > > > Got it, I will do it that way then!
> > > > >
> > > > > Thanks!
> > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Thanks!
> > > > > > >
> > > > > > > [1] This util/iova-tree method will be proposed in the next series,
> > > > > > > and vhost_iova_tree wraps it since it needs to keep in sync both
> > > > > > > trees: iova->qemu vaddr for iova allocation and the reverse one to
> > > > > > > translate available buffers.
> > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>


Re: [RFC PATCH v4 18/20] vhost: Add VhostIOVATree
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Tue, Oct 19, 2021 at 10:32 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/10/1 下午3:06, Eugenio Pérez 写道:
> > This tree is able to look for a translated address from an IOVA address.
> >
> > At first glance is similar to util/iova-tree. However, SVQ working on
> > devices with limited IOVA space need more capabilities, like allocating
> > IOVA chunks or perform reverse translations (qemu addresses to iova).
>
>
> I don't see any reverse translation is used in the shadow code. Or
> anything I missed?
>
>
> >
> > The allocation capability, as "assign a free IOVA address to this chunk
> > of memory in qemu's address space" allows shadow virtqueue to create a
> > new address space that is not restricted by guest's addressable one, so
> > we can allocate shadow vqs vrings outside of its reachability, nor
> > qemu's one. At the moment, the allocation is just done growing, not
> > allowing deletion.
> >
> > A different name could be used, but ordered searchable array is a
> > little bit long though.
> >
> > It duplicates the array so it can search efficiently both directions,
> > and it will signal overlap if iova or the translated address is
> > present in it's each array.
> >
> > Use of array will be changed to util-iova-tree in future series.
>
>
> Adding Peter.
>

Thanks, I missed CCing him!

> It looks to me the only thing missed is the iova allocator. And it looks
> to me it's better to decouple the allocator from the iova tree.
>
> Then we had:
>
> 1) initialize iova range
> 2) iova = iova_alloc(size)
> 3) built the iova tree map
> 4) buffer forwarding
> 5) iova_free(size)
>

The next series, which I will send once I have addressed all the
comments, is done that way, but the allocation is done in the iova
tree, not outside of it. Reasons below.
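
For reference, the decoupled flow above could be as simple as the
following sketch; iova_alloc()/iova_free() are just the hypothetical names
from the list, not an existing QEMU API, and this grow-only version
deliberately ignores step 5):

#include "qemu/osdep.h"

/*
 * Hedged sketch of a standalone, grow-only IOVA allocator
 * (assumes the range does not cover the whole hwaddr space).
 */
typedef struct ExampleIOVAAlloc {
    hwaddr next;    /* first free IOVA */
    hwaddr last;    /* last usable IOVA, inclusive */
} ExampleIOVAAlloc;

static void example_iova_init(ExampleIOVAAlloc *a, hwaddr first, hwaddr last)
{
    a->next = first;
    a->last = last;
}

static bool example_iova_alloc(ExampleIOVAAlloc *a, hwaddr size, hwaddr *iova)
{
    if (size == 0 || size > a->last - a->next + 1) {
        return false;    /* range exhausted or bad size */
    }

    *iova = a->next;
    a->next += size;
    return true;
}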

>
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-iova-tree.h |  40 +++++++
> >   hw/virtio/vhost-iova-tree.c | 230 ++++++++++++++++++++++++++++++++++++
> >   hw/virtio/meson.build       |   2 +-
> >   3 files changed, 271 insertions(+), 1 deletion(-)
> >   create mode 100644 hw/virtio/vhost-iova-tree.h
> >   create mode 100644 hw/virtio/vhost-iova-tree.c
> >
> > diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
> > new file mode 100644
> > index 0000000000..d163a88905
> > --- /dev/null
> > +++ b/hw/virtio/vhost-iova-tree.h
> > @@ -0,0 +1,40 @@
> > +/*
> > + * vhost software live migration ring
> > + *
> > + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
> > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
> > + *
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + */
> > +
> > +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
> > +#define HW_VIRTIO_VHOST_IOVA_TREE_H
> > +
> > +#include "exec/memory.h"
> > +
> > +typedef struct VhostDMAMap {
> > +    void *translated_addr;
> > +    hwaddr iova;
> > +    hwaddr size;                /* Inclusive */
> > +    IOMMUAccessFlags perm;
> > +} VhostDMAMap;
> > +
> > +typedef enum VhostDMAMapNewRC {
> > +    VHOST_DMA_MAP_NO_SPACE = -3,
> > +    VHOST_DMA_MAP_OVERLAP = -2,
> > +    VHOST_DMA_MAP_INVALID = -1,
> > +    VHOST_DMA_MAP_OK = 0,
> > +} VhostDMAMapNewRC;
> > +
> > +typedef struct VhostIOVATree VhostIOVATree;
> > +
> > +VhostIOVATree *vhost_iova_tree_new(void);
> > +void vhost_iova_tree_unref(VhostIOVATree *iova_rm);
> > +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_unref);
> > +
> > +const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_rm,
> > +                                             const VhostDMAMap *map);
> > +VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *iova_rm,
> > +                                       VhostDMAMap *map);
> > +
> > +#endif
> > diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
> > new file mode 100644
> > index 0000000000..c284e27607
> > --- /dev/null
> > +++ b/hw/virtio/vhost-iova-tree.c
> > @@ -0,0 +1,230 @@
> > +/*
> > + * vhost software live migration ring
> > + *
> > + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
> > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
> > + *
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "vhost-iova-tree.h"
> > +
> > +#define G_ARRAY_NOT_ZERO_TERMINATED false
> > +#define G_ARRAY_NOT_CLEAR_ON_ALLOC false
> > +
> > +#define iova_min qemu_real_host_page_size
> > +
> > +/**
> > + * VhostIOVATree, able to:
> > + * - Translate iova address
> > + * - Reverse translate iova address (from translated to iova)
> > + * - Allocate IOVA regions for translated range (potentially slow operation)
> > + *
> > + * Note that it cannot remove nodes.
> > + */
> > +struct VhostIOVATree {
> > +    /* Ordered array of reverse translations, IOVA address to qemu memory. */
> > +    GArray *iova_taddr_map;
> > +
> > +    /*
> > +     * Ordered array of translations from qemu virtual memory address to iova
> > +     */
> > +    GArray *taddr_iova_map;
> > +};
>
>
> Any reason for using GArray? Is it faster?
>

To be honest, I used a GArray mainly for prototyping reasons, because
it allowed me to "insert the next element" once I had located either a
hole (in the iova case) or an address. Doing it in the iova tree would
have required either adding code to iterate elements with arguments or
allocating them. Another possibility is to add yet another structure
with "free regions".

I would say that at this moment a GArray will be faster than a GTree,
due to the GArray's ability to bisect and its better data locality,
since we will not be adding and deleting regions frequently during
migration. But that could change if we support vIOMMU platforms, and I
didn't measure it.

For the next revision I've added allocation capabilities to the iova
tree, which I think fit pretty well there, and a separate qemu vaddr ->
iova translation GTree. The allocation still traverses the util/iova-tree
linearly, but we can easily add a more performant allocator in the
future if needed. It should not matter anyway unless we hotplug memory
or something similar.
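
A sketch of that layout could look like the following; names and details
are illustrative only, with util/iova-tree keeping the iova -> vaddr
direction (and the allocation) and a GTree, ordered with the same
overlap-aware comparison this patch already uses, providing the reverse
lookup:

#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/* Hedged sketch, not the real next-revision code */
typedef struct ExampleVhostIOVATree {
    IOVATree *iova_taddr_map;    /* iova -> qemu vaddr, also backs allocation */
    GTree *taddr_iova_map;       /* qemu vaddr -> iova, for reverse lookup */
} ExampleVhostIOVATree;

static gint example_cmp_taddr(gconstpointer a, gconstpointer b, gpointer data)
{
    const DMAMap *m1 = a, *m2 = b;

    if (m1->translated_addr > m2->translated_addr + m2->size) {
        return 1;
    }
    if (m1->translated_addr + m1->size < m2->translated_addr) {
        return -1;
    }
    return 0;    /* overlapping ranges compare as equal */
}

static ExampleVhostIOVATree *example_tree_new(void)
{
    ExampleVhostIOVATree *t = g_new(ExampleVhostIOVATree, 1);

    t->iova_taddr_map = iova_tree_new();
    t->taddr_iova_map = g_tree_new_full(example_cmp_taddr, NULL, g_free, NULL);
    return t;
}

static void example_tree_insert(ExampleVhostIOVATree *t, const DMAMap *map)
{
    DMAMap *copy = g_new(DMAMap, 1);

    *copy = *map;
    iova_tree_insert(t->iova_taddr_map, copy);    /* iova -> vaddr */
    g_tree_insert(t->taddr_iova_map, copy, copy); /* vaddr -> iova */
}

static const DMAMap *example_find_iova(ExampleVhostIOVATree *t,
                                       const void *vaddr, hwaddr len)
{
    const DMAMap needle = {
        .translated_addr = (hwaddr)(uintptr_t)vaddr,
        .size = len - 1,
    };

    return g_tree_lookup(t->taddr_iova_map, &needle);
}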

>
> > +
> > +/**
> > + * Inserts an element after an existing one in garray.
> > + *
> > + * @array      The array
> > + * @prev_elem  The previous element of array of NULL if prepending
> > + * @map        The DMA map
> > + *
> > + * It provides the aditional advantage of being type safe over
> > + * g_array_insert_val, which accepts a reference pointer instead of a value
> > + * with no complains.
> > + */
> > +static void vhost_iova_tree_insert_after(GArray *array,
> > +                                         const VhostDMAMap *prev_elem,
> > +                                         const VhostDMAMap *map)
> > +{
> > +    size_t pos;
> > +
> > +    if (!prev_elem) {
> > +        pos = 0;
> > +    } else {
> > +        pos = prev_elem - &g_array_index(array, typeof(*prev_elem), 0) + 1;
> > +    }
> > +
> > +    g_array_insert_val(array, pos, *map);
> > +}
> > +
> > +static gint vhost_iova_tree_cmp_taddr(gconstpointer a, gconstpointer b)
> > +{
> > +    const VhostDMAMap *m1 = a, *m2 = b;
> > +
> > +    if (m1->translated_addr > m2->translated_addr + m2->size) {
> > +        return 1;
> > +    }
> > +
> > +    if (m1->translated_addr + m1->size < m2->translated_addr) {
> > +        return -1;
> > +    }
> > +
> > +    /* Overlapped */
> > +    return 0;
> > +}
> > +
> > +/**
> > + * Find the previous node to a given iova
> > + *
> > + * @array  The ascending ordered-by-translated-addr array of VhostDMAMap
> > + * @map    The map to insert
> > + * @prev   Returned location of the previous map
> > + *
> > + * Return VHOST_DMA_MAP_OK if everything went well, or VHOST_DMA_MAP_OVERLAP if
> > + * it already exists. It is ok to use this function to check if a given range
> > + * exists, but it will use a linear search.
> > + *
> > + * TODO: We can use bsearch to locate the entry if we save the state in the
> > + * needle, knowing that the needle is always the first argument to
> > + * compare_func.
> > + */
> > +static VhostDMAMapNewRC vhost_iova_tree_find_prev(const GArray *array,
> > +                                                  GCompareFunc compare_func,
> > +                                                  const VhostDMAMap *map,
> > +                                                  const VhostDMAMap **prev)
> > +{
> > +    size_t i;
> > +    int r;
> > +
> > +    *prev = NULL;
> > +    for (i = 0; i < array->len; ++i) {
> > +        r = compare_func(map, &g_array_index(array, typeof(*map), i));
> > +        if (r == 0) {
> > +            return VHOST_DMA_MAP_OVERLAP;
> > +        }
> > +        if (r < 0) {
> > +            return VHOST_DMA_MAP_OK;
> > +        }
> > +
> > +        *prev = &g_array_index(array, typeof(**prev), i);
> > +    }
> > +
> > +    return VHOST_DMA_MAP_OK;
> > +}
> > +
> > +/**
> > + * Create a new IOVA tree
> > + *
> > + * Returns the new IOVA tree
> > + */
> > +VhostIOVATree *vhost_iova_tree_new(void)
> > +{
>
>
> So I think it needs to be initialized with the range we get from
> get_iova_range().
>

Right, it is done that way for the next revision.
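
Just to illustrate what I have in mind, a sketch of a range-aware
constructor; the actual signature in the next revision may differ:

#include "qemu/osdep.h"
#include "hw/virtio/vhost-iova-tree.h"

/* Hedged sketch only: an illustrative, range-aware variant of the constructor */
typedef struct ExampleVhostIOVATreeRng {
    hwaddr iova_first;    /* first usable IOVA, from get_iova_range() */
    hwaddr iova_last;     /* last usable IOVA, inclusive */
    GArray *iova_taddr_map;
    GArray *taddr_iova_map;
} ExampleVhostIOVATreeRng;

static ExampleVhostIOVATreeRng *example_range_tree_new(hwaddr iova_first,
                                                       hwaddr iova_last)
{
    ExampleVhostIOVATreeRng *tree = g_new(ExampleVhostIOVATreeRng, 1);

    /* Never hand out IOVA 0 or anything below the host page size */
    tree->iova_first = MAX(iova_first, qemu_real_host_page_size);
    tree->iova_last = iova_last;
    tree->iova_taddr_map = g_array_new(false, false, sizeof(VhostDMAMap));
    tree->taddr_iova_map = g_array_new(false, false, sizeof(VhostDMAMap));
    return tree;
}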

> Thanks
>
>
> > +    VhostIOVATree *tree = g_new(VhostIOVATree, 1);
> > +    tree->iova_taddr_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
> > +                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
> > +                                       sizeof(VhostDMAMap));
> > +    tree->taddr_iova_map = g_array_new(G_ARRAY_NOT_ZERO_TERMINATED,
> > +                                       G_ARRAY_NOT_CLEAR_ON_ALLOC,
> > +                                       sizeof(VhostDMAMap));
> > +    return tree;
> > +}
> > +
> > +/**
> > + * Destroy an IOVA tree
> > + *
> > + * @tree  The iova tree
> > + */
> > +void vhost_iova_tree_unref(VhostIOVATree *tree)
> > +{
> > +    g_array_unref(g_steal_pointer(&tree->iova_taddr_map));
> > +    g_array_unref(g_steal_pointer(&tree->taddr_iova_map));
> > +}
> > +
> > +/**
> > + * Find the IOVA address stored from a memory address
> > + *
> > + * @tree     The iova tree
> > + * @map      The map with the memory address
> > + *
> > + * Return the stored mapping, or NULL if not found.
> > + */
> > +const VhostDMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
> > +                                             const VhostDMAMap *map)
> > +{
> > +    /*
> > +     * This can be replaced with g_array_binary_search (Since glib 2.62) when
> > +     * that version become common enough.
> > +     */
> > +    return bsearch(map, tree->taddr_iova_map->data, tree->taddr_iova_map->len,
> > +                   sizeof(*map), vhost_iova_tree_cmp_taddr);
> > +}
> > +
> > +static bool vhost_iova_tree_find_iova_hole(const GArray *iova_map,
> > +                                           const VhostDMAMap *map,
> > +                                           const VhostDMAMap **prev_elem)
> > +{
> > +    size_t i;
> > +    hwaddr iova = iova_min;
> > +
> > +    *prev_elem = NULL;
> > +    for (i = 0; i < iova_map->len; i++) {
> > +        const VhostDMAMap *next = &g_array_index(iova_map, typeof(*next), i);
> > +        hwaddr hole_end = next->iova;
> > +        if (map->size < hole_end - iova) {
> > +            return true;
> > +        }
> > +
> > +        iova = next->iova + next->size + 1;
> > +        *prev_elem = next;
> > +    }
> > +
> > +    return ((hwaddr)-1 - iova) > iova_map->len;
> > +}
> > +
> > +/**
> > + * Allocate a new mapping
> > + *
> > + * @tree  The iova tree
> > + * @map   The iova map
> > + *
> > + * Returns:
> > + * - VHOST_DMA_MAP_OK if the map fits in the container
> > + * - VHOST_DMA_MAP_INVALID if the map does not make sense (like size overflow)
> > + * - VHOST_DMA_MAP_OVERLAP if the tree already contains that map
> > + * - VHOST_DMA_MAP_NO_SPACE if iova_rm cannot allocate more space.
> > + *
> > + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
> > + */
> > +VhostDMAMapNewRC vhost_iova_tree_alloc(VhostIOVATree *tree,
> > +                                       VhostDMAMap *map)
> > +{
> > +    const VhostDMAMap *qemu_prev, *iova_prev;
> > +    int find_prev_rc;
> > +    bool fit;
> > +
> > +    if (map->translated_addr + map->size < map->translated_addr ||
> > +        map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
> > +        return VHOST_DMA_MAP_INVALID;
> > +    }
> > +
> > +    /* Search for a hole in iova space big enough */
> > +    fit = vhost_iova_tree_find_iova_hole(tree->iova_taddr_map, map,
> > +                                         &iova_prev);
> > +    if (!fit) {
> > +        return VHOST_DMA_MAP_NO_SPACE;
> > +    }
> > +
> > +    map->iova = iova_prev ? (iova_prev->iova + iova_prev->size) + 1 : iova_min;
> > +    find_prev_rc = vhost_iova_tree_find_prev(tree->taddr_iova_map,
> > +                                             vhost_iova_tree_cmp_taddr, map,
> > +                                             &qemu_prev);
> > +    if (find_prev_rc == VHOST_DMA_MAP_OVERLAP) {
> > +        return VHOST_DMA_MAP_OVERLAP;
> > +    }
> > +
> > +    vhost_iova_tree_insert_after(tree->iova_taddr_map, iova_prev, map);
> > +    vhost_iova_tree_insert_after(tree->taddr_iova_map, qemu_prev, map);
> > +    return VHOST_DMA_MAP_OK;
> > +}
> > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
> > index 8b5a0225fe..cb306b83c6 100644
> > --- a/hw/virtio/meson.build
> > +++ b/hw/virtio/meson.build
> > @@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
> >
> >   virtio_ss = ss.source_set()
> >   virtio_ss.add(files('virtio.c'))
> > -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
> > +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
> >   virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
> >   virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
> >   virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
>