Add support for assigning Address Space Identifiers (ASIDs) to each VQ
group. This enables mapping each group into a distinct memory space.
Now that the driver can change the ASID in the middle of operation, the
domain that each vq addresses is also protected by domain_lock.
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
v9:
* Replace mutex with RCU, as the vdpa map_ops can run from atomic
context.
v8:
* Revert the mutex to rwlock change, it needs proper profiling to
justify it.
v7:
* Take write lock in the error path (Jason).
v6:
* Make vdpa_dev_add use gotos for error handling (MST).
* s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
(MST).
* Fix struct name not matching in the doc.
v5:
* Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
ioctl (Jason).
* Properly set domain bounce size to divide equally between nas (Jason).
* Exclude "padding" member from the only >V1 members in
vduse_dev_request.
v4:
* Set each domain bounce size to the device bounce size divided by the
  number of address spaces (Jason).
* Revert unneeded addr = NULL assignment (Jason).
* Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
  return; } (Jason).
* Fix a bad multiline comment that used the @ character instead of *
  (Jason).
* Consider config->nas == 0 a failure (Jason).
v3:
* Get the vduse domain through the vduse_as in the map functions
(Jason).
* Squash with the patch creating the vduse_as struct (Jason).
* Create VDUSE_DEV_MAX_AS instead of comparing against a magic number
  (Jason).
v2:
* Convert the use of mutex to rwlock.
RFC v3:
* Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
value to reduce memory consumption, but vqs are already limited to
that value and userspace VDUSE is able to allocate that many vqs.
* Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
VDUSE_IOTLB_GET_INFO.
* Use of array_index_nospec in VDUSE device ioctls.
* Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
* Move the umem mutex to asid struct so there is no contention between
ASIDs.
RFC v2:
* Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
part of the struct is the same.
---
drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
include/uapi/linux/vduse.h | 53 ++++-
2 files changed, 314 insertions(+), 109 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 97be04f73fbf..ff95ed56f22d 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -11,6 +11,7 @@
#include "linux/virtio_net.h"
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/rcupdate.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/eventfd.h>
@@ -41,6 +42,7 @@
#define VDUSE_DEV_MAX (1U << MINORBITS)
#define VDUSE_DEV_MAX_GROUPS 0xffff
+#define VDUSE_DEV_MAX_AS 0xffff
#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
@@ -86,7 +88,14 @@ struct vduse_umem {
struct mm_struct *mm;
};
+struct vduse_as {
+ struct vduse_iova_domain *domain;
+ struct vduse_umem *umem;
+ struct mutex mem_lock;
+};
+
struct vduse_vq_group {
+ struct vduse_as *as __rcu;
struct vduse_dev *dev;
};
@@ -94,7 +103,7 @@ struct vduse_dev {
struct vduse_vdpa *vdev;
struct device *dev;
struct vduse_virtqueue **vqs;
- struct vduse_iova_domain *domain;
+ struct vduse_as *as;
char *name;
struct mutex lock;
spinlock_t msg_lock;
@@ -122,9 +131,8 @@ struct vduse_dev {
u32 vq_num;
u32 vq_align;
u32 ngroups;
- struct vduse_umem *umem;
+ u32 nas;
struct vduse_vq_group *groups;
- struct mutex mem_lock;
unsigned int bounce_size;
struct mutex domain_lock;
};
@@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
return vduse_dev_msg_sync(dev, &msg);
}
-static int vduse_dev_update_iotlb(struct vduse_dev *dev,
+static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
u64 start, u64 last)
{
struct vduse_dev_msg msg = { 0 };
@@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
return -EINVAL;
msg.req.type = VDUSE_UPDATE_IOTLB;
- msg.req.iova.start = start;
- msg.req.iova.last = last;
+ if (dev->api_version < VDUSE_API_VERSION_1) {
+ msg.req.iova.start = start;
+ msg.req.iova.last = last;
+ } else {
+ msg.req.iova_v2.start = start;
+ msg.req.iova_v2.last = last;
+ msg.req.iova_v2.asid = asid;
+ }
return vduse_dev_msg_sync(dev, &msg);
}
@@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
return mask;
}
+/* Force set the asid to a vq group without a message to the VDUSE device */
+static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
+ unsigned int group, unsigned int asid)
+{
+ /*
+ * Two concurrent updates to this pointer are valid as they cannot
+ * point to an invalid region. It is ok for them to race as long as
+ * the readers see a consistent state through RCU.
+ */
+ rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
+}
+
static void vduse_dev_reset(struct vduse_dev *dev)
{
int i;
- struct vduse_iova_domain *domain = dev->domain;
/* The coherent mappings are handled in vduse_dev_free_coherent() */
- if (domain && domain->bounce_map)
- vduse_domain_reset_bounce_map(domain);
+ for (i = 0; i < dev->nas; i++) {
+ struct vduse_iova_domain *domain = dev->as[i].domain;
+
+ if (domain && domain->bounce_map)
+ vduse_domain_reset_bounce_map(domain);
+ }
+
+ for (i = 0; i < dev->ngroups; i++)
+ vduse_set_group_asid_nomsg(dev, i, 0);
down_write(&dev->rwsem);
@@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
return ret;
}
+static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
+ unsigned int asid)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+ struct vduse_dev_msg msg = { 0 };
+ int r;
+
+ if (dev->api_version < VDUSE_API_VERSION_1 ||
+ group >= dev->ngroups || asid >= dev->nas)
+ return -EINVAL;
+
+ msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
+ msg.req.vq_group_asid.group = group;
+ msg.req.vq_group_asid.asid = asid;
+
+ r = vduse_dev_msg_sync(dev, &msg);
+ if (r < 0)
+ return r;
+
+ vduse_set_group_asid_nomsg(dev, group, asid);
+ return 0;
+}
+
static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
struct vdpa_vq_state *state)
{
@@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
int ret;
- ret = vduse_domain_set_map(dev->domain, iotlb);
+ ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
if (ret)
return ret;
- ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
+ ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
if (ret) {
- vduse_domain_clear_map(dev->domain, iotlb);
+ vduse_domain_clear_map(dev->as[asid].domain, iotlb);
return ret;
}
@@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.get_vq_affinity = vduse_vdpa_get_vq_affinity,
.reset = vduse_vdpa_reset,
.set_map = vduse_vdpa_set_map,
+ .set_group_asid = vduse_set_group_asid,
.get_vq_map = vduse_get_vq_map,
.free = vduse_vdpa_free,
};
@@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
enum dma_data_direction dir)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
if (!token.group)
return;
vdev = token.group->dev;
- domain = vdev->domain;
-
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
}
@@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
enum dma_data_direction dir)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
if (!token.group)
return;
vdev = token.group->dev;
- domain = vdev->domain;
-
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
}
@@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
unsigned long attrs)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
+ dma_addr_t r;
if (!token.group)
return DMA_MAPPING_ERROR;
vdev = token.group->dev;
- domain = vdev->domain;
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
+ r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
- return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
+ return r;
}
static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
@@ -901,21 +969,25 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
unsigned long attrs)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
if (!token.group)
return;
vdev = token.group->dev;
- domain = vdev->domain;
-
- return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
+ vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
}
static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
unsigned long iova;
void *addr;
@@ -925,13 +997,14 @@ static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
return NULL;
vdev = token.group->dev;
- domain = vdev->domain;
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
addr = vduse_domain_alloc_coherent(domain, size,
(dma_addr_t *)&iova, flag);
- if (!addr)
- return NULL;
-
- *dma_addr = (dma_addr_t)iova;
+ if (addr)
+ *dma_addr = (dma_addr_t)iova;
return addr;
}
@@ -941,29 +1014,39 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size,
unsigned long attrs)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
if (!token.group)
return;
vdev = token.group->dev;
- domain = vdev->domain;
-
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
+ mutex_unlock(&vdev->domain_lock);
}
static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
+ size_t bounce_size;
if (!token.group)
return false;
vdev = token.group->dev;
- domain = vdev->domain;
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
+ bounce_size = domain->bounce_size;
- return dma_addr < domain->bounce_size;
+ return dma_addr < bounce_size;
}
static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
@@ -976,15 +1059,22 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
static size_t vduse_dev_max_mapping_size(union virtio_map token)
{
struct vduse_dev *vdev;
+ struct vduse_as *as;
struct vduse_iova_domain *domain;
+ size_t bounce_size;
if (!token.group)
return 0;
vdev = token.group->dev;
- domain = vdev->domain;
+ rcu_read_lock();
+ as = rcu_dereference(token.group->as);
+ domain = as->domain;
+ rcu_read_unlock();
+ domain = token.group->as->domain;
+ bounce_size = domain->bounce_size;
- return domain->bounce_size;
+ return bounce_size;
}
static const struct virtio_map_ops vduse_map_ops = {
@@ -1124,39 +1214,40 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
return ret;
}
-static int vduse_dev_dereg_umem(struct vduse_dev *dev,
+static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid,
u64 iova, u64 size)
{
int ret;
- mutex_lock(&dev->mem_lock);
+ mutex_lock(&dev->as[asid].mem_lock);
ret = -ENOENT;
- if (!dev->umem)
+ if (!dev->as[asid].umem)
goto unlock;
ret = -EINVAL;
- if (!dev->domain)
+ if (!dev->as[asid].domain)
goto unlock;
- if (dev->umem->iova != iova || size != dev->domain->bounce_size)
+ if (dev->as[asid].umem->iova != iova ||
+ size != dev->as[asid].domain->bounce_size)
goto unlock;
- vduse_domain_remove_user_bounce_pages(dev->domain);
- unpin_user_pages_dirty_lock(dev->umem->pages,
- dev->umem->npages, true);
- atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
- mmdrop(dev->umem->mm);
- vfree(dev->umem->pages);
- kfree(dev->umem);
- dev->umem = NULL;
+ vduse_domain_remove_user_bounce_pages(dev->as[asid].domain);
+ unpin_user_pages_dirty_lock(dev->as[asid].umem->pages,
+ dev->as[asid].umem->npages, true);
+ atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm);
+ mmdrop(dev->as[asid].umem->mm);
+ vfree(dev->as[asid].umem->pages);
+ kfree(dev->as[asid].umem);
+ dev->as[asid].umem = NULL;
ret = 0;
unlock:
- mutex_unlock(&dev->mem_lock);
+ mutex_unlock(&dev->as[asid].mem_lock);
return ret;
}
static int vduse_dev_reg_umem(struct vduse_dev *dev,
- u64 iova, u64 uaddr, u64 size)
+ u32 asid, u64 iova, u64 uaddr, u64 size)
{
struct page **page_list = NULL;
struct vduse_umem *umem = NULL;
@@ -1164,14 +1255,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
unsigned long npages, lock_limit;
int ret;
- if (!dev->domain || !dev->domain->bounce_map ||
- size != dev->domain->bounce_size ||
+ if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map ||
+ size != dev->as[asid].domain->bounce_size ||
iova != 0 || uaddr & ~PAGE_MASK)
return -EINVAL;
- mutex_lock(&dev->mem_lock);
+ mutex_lock(&dev->as[asid].mem_lock);
ret = -EEXIST;
- if (dev->umem)
+ if (dev->as[asid].umem)
goto unlock;
ret = -ENOMEM;
@@ -1195,7 +1286,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
goto out;
}
- ret = vduse_domain_add_user_bounce_pages(dev->domain,
+ ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain,
page_list, pinned);
if (ret)
goto out;
@@ -1208,7 +1299,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
umem->mm = current->mm;
mmgrab(current->mm);
- dev->umem = umem;
+ dev->as[asid].umem = umem;
out:
if (ret && pinned > 0)
unpin_user_pages(page_list, pinned);
@@ -1219,7 +1310,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
vfree(page_list);
kfree(umem);
}
- mutex_unlock(&dev->mem_lock);
+ mutex_unlock(&dev->as[asid].mem_lock);
return ret;
}
@@ -1251,47 +1342,66 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case VDUSE_IOTLB_GET_FD: {
- struct vduse_iotlb_entry entry;
+ struct vduse_iotlb_entry_v2 entry;
struct vhost_iotlb_map *map;
struct vdpa_map_file *map_file;
struct file *f = NULL;
+ u32 asid;
ret = -EFAULT;
- if (copy_from_user(&entry, argp, sizeof(entry)))
- break;
+ if (dev->api_version >= VDUSE_API_VERSION_1) {
+ if (copy_from_user(&entry, argp, sizeof(entry)))
+ break;
+ } else {
+ entry.asid = 0;
+ if (copy_from_user(&entry.v1, argp,
+ sizeof(entry.v1)))
+ break;
+ }
ret = -EINVAL;
- if (entry.start > entry.last)
+ if (entry.v1.start > entry.v1.last)
+ break;
+
+ if (entry.asid >= dev->nas)
break;
mutex_lock(&dev->domain_lock);
- if (!dev->domain) {
+ asid = array_index_nospec(entry.asid, dev->nas);
+ if (!dev->as[asid].domain) {
mutex_unlock(&dev->domain_lock);
break;
}
- spin_lock(&dev->domain->iotlb_lock);
- map = vhost_iotlb_itree_first(dev->domain->iotlb,
- entry.start, entry.last);
+ spin_lock(&dev->as[asid].domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
+ entry.v1.start, entry.v1.last);
if (map) {
map_file = (struct vdpa_map_file *)map->opaque;
f = get_file(map_file->file);
- entry.offset = map_file->offset;
- entry.start = map->start;
- entry.last = map->last;
- entry.perm = map->perm;
+ entry.v1.offset = map_file->offset;
+ entry.v1.start = map->start;
+ entry.v1.last = map->last;
+ entry.v1.perm = map->perm;
}
- spin_unlock(&dev->domain->iotlb_lock);
+ spin_unlock(&dev->as[asid].domain->iotlb_lock);
mutex_unlock(&dev->domain_lock);
ret = -EINVAL;
if (!f)
break;
- ret = -EFAULT;
- if (copy_to_user(argp, &entry, sizeof(entry))) {
+ if (dev->api_version >= VDUSE_API_VERSION_1)
+ ret = copy_to_user(argp, &entry,
+ sizeof(entry));
+ else
+ ret = copy_to_user(argp, &entry.v1,
+ sizeof(entry.v1));
+
+ if (ret) {
+ ret = -EFAULT;
fput(f);
break;
}
- ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
+ ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm));
fput(f);
break;
}
@@ -1436,6 +1546,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
}
case VDUSE_IOTLB_REG_UMEM: {
struct vduse_iova_umem umem;
+ u32 asid;
ret = -EFAULT;
if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1443,17 +1554,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
ret = -EINVAL;
if (!is_mem_zero((const char *)umem.reserved,
- sizeof(umem.reserved)))
+ sizeof(umem.reserved)) ||
+ (dev->api_version < VDUSE_API_VERSION_1 &&
+ umem.asid != 0) || umem.asid >= dev->nas)
break;
mutex_lock(&dev->domain_lock);
- ret = vduse_dev_reg_umem(dev, umem.iova,
+ asid = array_index_nospec(umem.asid, dev->nas);
+ ret = vduse_dev_reg_umem(dev, asid, umem.iova,
umem.uaddr, umem.size);
mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_DEREG_UMEM: {
struct vduse_iova_umem umem;
+ u32 asid;
ret = -EFAULT;
if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1461,10 +1576,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
ret = -EINVAL;
if (!is_mem_zero((const char *)umem.reserved,
- sizeof(umem.reserved)))
+ sizeof(umem.reserved)) ||
+ (dev->api_version < VDUSE_API_VERSION_1 &&
+ umem.asid != 0) ||
+ umem.asid >= dev->nas)
break;
+
mutex_lock(&dev->domain_lock);
- ret = vduse_dev_dereg_umem(dev, umem.iova,
+ asid = array_index_nospec(umem.asid, dev->nas);
+ ret = vduse_dev_dereg_umem(dev, asid, umem.iova,
umem.size);
mutex_unlock(&dev->domain_lock);
break;
@@ -1472,6 +1592,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
case VDUSE_IOTLB_GET_INFO: {
struct vduse_iova_info info;
struct vhost_iotlb_map *map;
+ u32 asid;
ret = -EFAULT;
if (copy_from_user(&info, argp, sizeof(info)))
@@ -1485,23 +1606,31 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
sizeof(info.reserved)))
break;
+ if (dev->api_version < VDUSE_API_VERSION_1) {
+ if (info.asid)
+ break;
+ } else if (info.asid >= dev->nas)
+ break;
+
mutex_lock(&dev->domain_lock);
- if (!dev->domain) {
+ asid = array_index_nospec(info.asid, dev->nas);
+ if (!dev->as[asid].domain) {
mutex_unlock(&dev->domain_lock);
break;
}
- spin_lock(&dev->domain->iotlb_lock);
- map = vhost_iotlb_itree_first(dev->domain->iotlb,
+ spin_lock(&dev->as[asid].domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
info.start, info.last);
if (map) {
info.start = map->start;
info.last = map->last;
info.capability = 0;
- if (dev->domain->bounce_map && map->start == 0 &&
- map->last == dev->domain->bounce_size - 1)
+ if (dev->as[asid].domain->bounce_map &&
+ map->start == 0 &&
+ map->last == dev->as[asid].domain->bounce_size - 1)
info.capability |= VDUSE_IOVA_CAP_UMEM;
}
- spin_unlock(&dev->domain->iotlb_lock);
+ spin_unlock(&dev->as[asid].domain->iotlb_lock);
mutex_unlock(&dev->domain_lock);
if (!map)
break;
@@ -1526,8 +1655,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
struct vduse_dev *dev = file->private_data;
mutex_lock(&dev->domain_lock);
- if (dev->domain)
- vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ for (int i = 0; i < dev->nas; i++)
+ if (dev->as[i].domain)
+ vduse_dev_dereg_umem(dev, i, 0,
+ dev->as[i].domain->bounce_size);
mutex_unlock(&dev->domain_lock);
spin_lock(&dev->msg_lock);
/* Make sure the inflight messages can processed after reconncection */
@@ -1746,7 +1877,6 @@ static struct vduse_dev *vduse_dev_create(void)
return NULL;
mutex_init(&dev->lock);
- mutex_init(&dev->mem_lock);
mutex_init(&dev->domain_lock);
spin_lock_init(&dev->msg_lock);
INIT_LIST_HEAD(&dev->send_list);
@@ -1797,8 +1927,11 @@ static int vduse_destroy_dev(char *name)
idr_remove(&vduse_idr, dev->minor);
kvfree(dev->config);
vduse_dev_deinit_vqs(dev);
- if (dev->domain)
- vduse_domain_destroy(dev->domain);
+ for (int i = 0; i < dev->nas; i++) {
+ if (dev->as[i].domain)
+ vduse_domain_destroy(dev->as[i].domain);
+ }
+ kfree(dev->as);
kfree(dev->name);
kfree(dev->groups);
vduse_dev_destroy(dev);
@@ -1845,12 +1978,17 @@ static bool vduse_validate_config(struct vduse_dev_config *config,
sizeof(config->reserved)))
return false;
- if (api_version < VDUSE_API_VERSION_1 && config->ngroups)
+ if (api_version < VDUSE_API_VERSION_1 &&
+ (config->ngroups || config->nas))
return false;
- if (api_version >= VDUSE_API_VERSION_1 &&
- (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS))
- return false;
+ if (api_version >= VDUSE_API_VERSION_1) {
+ if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)
+ return false;
+
+ if (!config->nas || config->nas > VDUSE_DEV_MAX_AS)
+ return false;
+ }
if (config->vq_align > PAGE_SIZE)
return false;
@@ -1915,7 +2053,8 @@ static ssize_t bounce_size_store(struct device *device,
ret = -EPERM;
mutex_lock(&dev->domain_lock);
- if (dev->domain)
+ /* Assuming that if the first domain is allocated, all are allocated */
+ if (dev->as[0].domain)
goto unlock;
ret = kstrtouint(buf, 10, &bounce_size);
@@ -1977,6 +2116,13 @@ static int vduse_create_dev(struct vduse_dev_config *config,
for (u32 i = 0; i < dev->ngroups; ++i)
dev->groups[i].dev = dev;
+ dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas;
+ dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL);
+ if (!dev->as)
+ goto err_as;
+ for (int i = 0; i < dev->nas; i++)
+ mutex_init(&dev->as[i].mem_lock);
+
dev->name = kstrdup(config->name, GFP_KERNEL);
if (!dev->name)
goto err_str;
@@ -2013,6 +2159,8 @@ static int vduse_create_dev(struct vduse_dev_config *config,
err_idr:
kfree(dev->name);
err_str:
+ kfree(dev->as);
+err_as:
kfree(dev->groups);
err_vq_groups:
vduse_dev_destroy(dev);
@@ -2138,7 +2286,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
&vduse_vdpa_config_ops, &vduse_map_ops,
- dev->ngroups, 1, name, true);
+ dev->ngroups, dev->nas, name, true);
if (IS_ERR(vdev))
return PTR_ERR(vdev);
@@ -2153,7 +2301,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
const struct vdpa_dev_set_config *config)
{
struct vduse_dev *dev;
- int ret;
+ size_t domain_bounce_size;
+ int ret, i;
mutex_lock(&vduse_lock);
dev = vduse_find_dev(name);
@@ -2167,29 +2316,38 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
return ret;
mutex_lock(&dev->domain_lock);
- if (!dev->domain)
- dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
- dev->bounce_size);
- mutex_unlock(&dev->domain_lock);
- if (!dev->domain) {
- ret = -ENOMEM;
- goto domain_err;
+ ret = 0;
+
+ domain_bounce_size = dev->bounce_size / dev->nas;
+ for (i = 0; i < dev->nas; ++i) {
+ dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+ domain_bounce_size);
+ if (!dev->as[i].domain) {
+ ret = -ENOMEM;
+ goto err;
+ }
}
+ mutex_unlock(&dev->domain_lock);
+
ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
- if (ret) {
- goto register_err;
- }
+ if (ret)
+ goto err_register;
return 0;
-register_err:
+err_register:
mutex_lock(&dev->domain_lock);
- vduse_domain_destroy(dev->domain);
- dev->domain = NULL;
+
+err:
+ for (int j = 0; j < i; j++) {
+ if (dev->as[j].domain) {
+ vduse_domain_destroy(dev->as[j].domain);
+ dev->as[j].domain = NULL;
+ }
+ }
mutex_unlock(&dev->domain_lock);
-domain_err:
put_device(&dev->vdev->vdpa.dev);
return ret;
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index a3d51cf6df3a..da2c5e47990e 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -47,7 +47,8 @@ struct vduse_dev_config {
__u32 vq_num;
__u32 vq_align;
__u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
- __u32 reserved[12];
+ __u32 nas; /* if VDUSE_API_VERSION >= 1 */
+ __u32 reserved[11];
__u32 config_size;
__u8 config[];
};
@@ -82,6 +83,18 @@ struct vduse_iotlb_entry {
__u8 perm;
};
+/**
+ * struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region in an ASID
+ * @v1: the original vduse_iotlb_entry
+ * @asid: address space ID of the IOVA region
+ *
+ * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region.
+ */
+struct vduse_iotlb_entry_v2 {
+ struct vduse_iotlb_entry v1;
+ __u32 asid;
+};
+
/*
* Find the first IOVA region that overlaps with the range [start, last]
* and return the corresponding file descriptor. Return -EINVAL means the
@@ -166,6 +179,16 @@ struct vduse_vq_state_packed {
__u16 last_used_idx;
};
+/**
+ * struct vduse_vq_group_asid - virtqueue group ASID
+ * @group: Index of the virtqueue group
+ * @asid: Address space ID of the group
+ */
+struct vduse_vq_group_asid {
+ __u32 group;
+ __u32 asid;
+};
+
/**
* struct vduse_vq_info - information of a virtqueue
* @index: virtqueue index
@@ -225,6 +248,7 @@ struct vduse_vq_eventfd {
* @uaddr: start address of userspace memory, it must be aligned to page size
* @iova: start of the IOVA region
* @size: size of the IOVA region
+ * @asid: Address space ID of the IOVA region
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
@@ -234,7 +258,8 @@ struct vduse_iova_umem {
__u64 uaddr;
__u64 iova;
__u64 size;
- __u64 reserved[3];
+ __u32 asid;
+ __u32 reserved[5];
};
/* Register userspace memory for IOVA regions */
@@ -248,6 +273,7 @@ struct vduse_iova_umem {
* @start: start of the IOVA region
* @last: last of the IOVA region
* @capability: capability of the IOVA region
+ * @asid: Address space ID of the IOVA region, only if device API version >= 1
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
@@ -258,7 +284,8 @@ struct vduse_iova_info {
__u64 last;
#define VDUSE_IOVA_CAP_UMEM (1 << 0)
__u64 capability;
- __u64 reserved[3];
+ __u32 asid; /* Only if device API version >= 1 */
+ __u32 reserved[5];
};
/*
@@ -280,6 +307,7 @@ enum vduse_req_type {
VDUSE_GET_VQ_STATE,
VDUSE_SET_STATUS,
VDUSE_UPDATE_IOTLB,
+ VDUSE_SET_VQ_GROUP_ASID,
};
/**
@@ -314,6 +342,18 @@ struct vduse_iova_range {
__u64 last;
};
+/**
+ * struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1
+ * @start: start of the IOVA range
+ * @last: last of the IOVA range
+ * @asid: address space ID of the IOVA range
+ */
+struct vduse_iova_range_v2 {
+ __u64 start;
+ __u64 last;
+ __u32 asid;
+};
+
/**
* struct vduse_dev_request - control request
* @type: request type
@@ -322,6 +362,8 @@ struct vduse_iova_range {
* @vq_state: virtqueue state, only index field is available
* @s: device status
* @iova: IOVA range for updating
+ * @iova_v2: IOVA range for updating if API_VERSION >= 1
+ * @vq_group_asid: ASID of a virtqueue group
* @padding: padding
*
* Structure used by read(2) on /dev/vduse/$NAME.
@@ -334,6 +376,11 @@ struct vduse_dev_request {
struct vduse_vq_state vq_state;
struct vduse_dev_status s;
struct vduse_iova_range iova;
+ /* The following members, except padding, exist only if the
+ * VDUSE API version >= 1
+ */
+ struct vduse_iova_range_v2 iova_v2;
+ struct vduse_vq_group_asid vq_group_asid;
__u32 padding[32];
};
};
--
2.51.1
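
For illustration only, here is a minimal userspace sketch of how a VDUSE device
might consume the new VDUSE_SET_VQ_GROUP_ASID request added above. It assumes
the patched include/uapi/linux/vduse.h, a device already created with
api_version >= 1 and nas > 1, and omits error handling; the helper name and
the group_asid array are hypothetical, not part of the patch.

#include <linux/vduse.h>
#include <unistd.h>

/* Hypothetical helper: serve one control request read from an open
 * /dev/vduse/$NAME fd and record a group -> ASID change.
 */
static void serve_one_request(int dev_fd, __u32 *group_asid)
{
        struct vduse_dev_request req;
        struct vduse_dev_response resp = { 0 };

        if (read(dev_fd, &req, sizeof(req)) != sizeof(req))
                return;

        resp.request_id = req.request_id;
        resp.result = VDUSE_REQ_RESULT_OK;

        if (req.type == VDUSE_SET_VQ_GROUP_ASID)
                /* remember which address space now backs this vq group */
                group_asid[req.vq_group_asid.group] = req.vq_group_asid.asid;

        write(dev_fd, &resp, sizeof(resp));
}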
> @@ -976,15 +1059,22 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
> static size_t vduse_dev_max_mapping_size(union virtio_map token)
> {
> struct vduse_dev *vdev;
> + struct vduse_as *as;
> struct vduse_iova_domain *domain;
> + size_t bounce_size;
>
> if (!token.group)
> return 0;
>
> vdev = token.group->dev;
> - domain = vdev->domain;
> + rcu_read_lock();
> + as = rcu_dereference(token.group->as);
> + domain = as->domain;
> + rcu_read_unlock();
> + domain = token.group->as->domain;
This line should be removed?
Thanks,
Yongji
On Mon, Nov 17, 2025 at 4:10 AM 谢永吉 <xieyongji@bytedance.com> wrote:
>
> > @@ -976,15 +1059,22 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
> > static size_t vduse_dev_max_mapping_size(union virtio_map token)
> > {
> > struct vduse_dev *vdev;
> > + struct vduse_as *as;
> > struct vduse_iova_domain *domain;
> > + size_t bounce_size;
> >
> > if (!token.group)
> > return 0;
> >
> > vdev = token.group->dev;
> > - domain = vdev->domain;
> > + rcu_read_lock();
> > + as = rcu_dereference(token.group->as);
> > + domain = as->domain;
> > + rcu_read_unlock();
> > + domain = token.group->as->domain;
>
> This line should be removed?
>
Totally, thank you very much for the catch!
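
For reference, a sketch of how the helper would read with the duplicated
assignment dropped (illustration only, not a replacement hunk):

static size_t vduse_dev_max_mapping_size(union virtio_map token)
{
        struct vduse_dev *vdev;
        struct vduse_as *as;
        struct vduse_iova_domain *domain;
        size_t bounce_size;

        if (!token.group)
                return 0;

        vdev = token.group->dev;
        rcu_read_lock();
        as = rcu_dereference(token.group->as);
        domain = as->domain;
        rcu_read_unlock();
        bounce_size = domain->bounce_size;

        return bounce_size;
}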
On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
>
> Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> group. This enables mapping each group into a distinct memory space.
>
> Now that the driver can change ASID in the middle of operation, the
> domain that each vq address point is also protected by domain_lock.
Maybe it's better to document what is protected by RCU and how.
More below.
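
As an illustration of the kind of annotation being asked for, one possible
comment on the RCU-protected field follows; the wording is only a sketch, not
part of the patch:

struct vduse_vq_group {
        /*
         * Address space backing this group. Updated by
         * vduse_set_group_asid_nomsg() (from the set_group_asid op and
         * from device reset) and read under rcu_read_lock() by the
         * virtio_map ops, which may run in atomic context. It always
         * points into dev->as[], so readers never dereference a freed
         * object.
         */
        struct vduse_as *as __rcu;
        struct vduse_dev *dev;
};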
> @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> return mask;
> }
>
> +/* Force set the asid to a vq group without a message to the VDUSE device */
> +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> + unsigned int group, unsigned int asid)
> +{
> + /*
> + * Two concurrent updates to this pointer are valid as they cannot
> + * point to an invalid region. It is ok for them to race as long as
> + * the readers see a consistent state through RCU.
> + */
> + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
I'd expect at least a synchronize_rcu() here to wait for the readers to be done?
> +}
> +
> static void vduse_dev_reset(struct vduse_dev *dev)
> {
> int i;
> - struct vduse_iova_domain *domain = dev->domain;
>
> /* The coherent mappings are handled in vduse_dev_free_coherent() */
> - if (domain && domain->bounce_map)
> - vduse_domain_reset_bounce_map(domain);
> + for (i = 0; i < dev->nas; i++) {
> + struct vduse_iova_domain *domain = dev->as[i].domain;
> +
> + if (domain && domain->bounce_map)
> + vduse_domain_reset_bounce_map(domain);
> + }
> +
> + for (i = 0; i < dev->ngroups; i++)
> + vduse_set_group_asid_nomsg(dev, i, 0);
>
> down_write(&dev->rwsem);
>
> @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> return ret;
> }
>
> +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> + unsigned int asid)
> +{
> + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> + struct vduse_dev_msg msg = { 0 };
> + int r;
> +
> + if (dev->api_version < VDUSE_API_VERSION_1 ||
> + group >= dev->ngroups || asid >= dev->nas)
> + return -EINVAL;
> +
> + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> + msg.req.vq_group_asid.group = group;
> + msg.req.vq_group_asid.asid = asid;
> +
> + r = vduse_dev_msg_sync(dev, &msg);
> + if (r < 0)
> + return r;
> +
> + vduse_set_group_asid_nomsg(dev, group, asid);
> + return 0;
> +}
> +
> static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> struct vdpa_vq_state *state)
> {
> @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> int ret;
>
> - ret = vduse_domain_set_map(dev->domain, iotlb);
> + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> if (ret)
> return ret;
>
> - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> if (ret) {
> - vduse_domain_clear_map(dev->domain, iotlb);
> + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> return ret;
> }
>
> @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> .reset = vduse_vdpa_reset,
> .set_map = vduse_vdpa_set_map,
> + .set_group_asid = vduse_set_group_asid,
> .get_vq_map = vduse_get_vq_map,
> .free = vduse_vdpa_free,
> };
> @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> enum dma_data_direction dir)
> {
> struct vduse_dev *vdev;
> + struct vduse_as *as;
> struct vduse_iova_domain *domain;
>
> if (!token.group)
> return;
>
> vdev = token.group->dev;
> - domain = vdev->domain;
> -
> + rcu_read_lock();
> + as = rcu_dereference(token.group->as);
> + domain = as->domain;
> + rcu_read_unlock();
> vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
This is suspicious, at least we should do rcu_read_unlock() after
vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
works.
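For illustration only, a minimal sketch of the reader shape being suggested
here, keeping the RCU read-side critical section open across the bounce call
(names and signatures are taken from the quoted patch; that the sync path may
stay inside the read-side section is an assumption, not something verified
here):

static void vduse_dev_sync_single_for_device(union virtio_map token,
					     dma_addr_t dma_addr, size_t size,
					     enum dma_data_direction dir)
{
	struct vduse_as *as;

	if (!token.group)
		return;

	rcu_read_lock();
	as = rcu_dereference(token.group->as);
	/* The as/domain pointers stay stable for the whole bounce. */
	vduse_domain_sync_single_for_device(as->domain, dma_addr, size, dir);
	rcu_read_unlock();
}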
> }
>
> @@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
> enum dma_data_direction dir)
> {
> struct vduse_dev *vdev;
> + struct vduse_as *as;
> struct vduse_iova_domain *domain;
>
> if (!token.group)
> return;
>
> vdev = token.group->dev;
> - domain = vdev->domain;
> -
> + rcu_read_lock();
> + as = rcu_dereference(token.group->as);
> + domain = as->domain;
> + rcu_read_unlock();
> vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
> }
>
> @@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> unsigned long attrs)
> {
> struct vduse_dev *vdev;
> + struct vduse_as *as;
> struct vduse_iova_domain *domain;
> + dma_addr_t r;
>
> if (!token.group)
> return DMA_MAPPING_ERROR;
>
> vdev = token.group->dev;
> - domain = vdev->domain;
> + rcu_read_lock();
> + as = rcu_dereference(token.group->as);
> + domain = as->domain;
> + rcu_read_unlock();
> + r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
Same here.
Thanks
On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> >
> > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > group. This enables mapping each group into a distinct memory space.
> >
> > Now that the driver can change ASID in the middle of operation, the
> > domain that each vq address point is also protected by domain_lock.
>
> Maybe it's better to document what is protected by RCU and how.
>
I added the __rcu annotation, but I can expand it for sure. I can also
modify the commit message.
> More below.
>
> >
> > Acked-by: Jason Wang <jasowang@redhat.com>
I forgot to remove this, my bad!
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> > v9:
> > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > context.
> >
> > v8:
> > * Revert the mutex to rwlock change, it needs proper profiling to
> > justify it.
> >
> > v7:
> > * Take write lock in the error path (Jason).
> >
> > v6:
> > * Make vdpa_dev_add use gotos for error handling (MST).
> > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > (MST).
> > * Fix struct name not matching in the doc.
> >
> > v5:
> > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > ioctl (Jason).
> > * Properly set domain bounce size to divide equally between nas (Jason).
> > * Exclude "padding" member from the only >V1 members in
> > vduse_dev_request.
> >
> > v4:
> > * Divide each domain bounce size between the device bounce size (Jason).
> > * revert unneeded addr = NULL assignment (Jason)
> > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > return; } (Jason)
> > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > * Consider config->nas == 0 as a fail (Jason).
> >
> > v3:
> > * Get the vduse domain through the vduse_as in the map functions
> > (Jason).
> > * Squash with the patch creating the vduse_as struct (Jason).
> > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > (Jason)
> >
> > v2:
> > * Convert the use of mutex to rwlock.
> >
> > RFC v3:
> > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > value to reduce memory consumption, but vqs are already limited to
> > that value and userspace VDUSE is able to allocate that many vqs.
> > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > VDUSE_IOTLB_GET_INFO.
> > * Use of array_index_nospec in VDUSE device ioctls.
> > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > * Move the umem mutex to asid struct so there is no contention between
> > ASIDs.
> >
> > RFC v2:
> > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > part of the struct is the same.
> > ---
> > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > include/uapi/linux/vduse.h | 53 ++++-
> > 2 files changed, 314 insertions(+), 109 deletions(-)
> >
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index 97be04f73fbf..ff95ed56f22d 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -11,6 +11,7 @@
> > #include "linux/virtio_net.h"
> > #include <linux/init.h>
> > #include <linux/module.h>
> > +#include <linux/rcupdate.h>
> > #include <linux/cdev.h>
> > #include <linux/device.h>
> > #include <linux/eventfd.h>
> > @@ -41,6 +42,7 @@
> >
> > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > +#define VDUSE_DEV_MAX_AS 0xffff
> > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > @@ -86,7 +88,14 @@ struct vduse_umem {
> > struct mm_struct *mm;
> > };
> >
> > +struct vduse_as {
> > + struct vduse_iova_domain *domain;
> > + struct vduse_umem *umem;
> > + struct mutex mem_lock;
> > +};
> > +
> > struct vduse_vq_group {
> > + struct vduse_as *as __rcu;
> > struct vduse_dev *dev;
> > };
> >
> > @@ -94,7 +103,7 @@ struct vduse_dev {
> > struct vduse_vdpa *vdev;
> > struct device *dev;
> > struct vduse_virtqueue **vqs;
> > - struct vduse_iova_domain *domain;
> > + struct vduse_as *as;
> > char *name;
> > struct mutex lock;
> > spinlock_t msg_lock;
> > @@ -122,9 +131,8 @@ struct vduse_dev {
> > u32 vq_num;
> > u32 vq_align;
> > u32 ngroups;
> > - struct vduse_umem *umem;
> > + u32 nas;
> > struct vduse_vq_group *groups;
> > - struct mutex mem_lock;
> > unsigned int bounce_size;
> > struct mutex domain_lock;
> > };
> > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > return vduse_dev_msg_sync(dev, &msg);
> > }
> >
> > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > u64 start, u64 last)
> > {
> > struct vduse_dev_msg msg = { 0 };
> > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > return -EINVAL;
> >
> > msg.req.type = VDUSE_UPDATE_IOTLB;
> > - msg.req.iova.start = start;
> > - msg.req.iova.last = last;
> > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > + msg.req.iova.start = start;
> > + msg.req.iova.last = last;
> > + } else {
> > + msg.req.iova_v2.start = start;
> > + msg.req.iova_v2.last = last;
> > + msg.req.iova_v2.asid = asid;
> > + }
> >
> > return vduse_dev_msg_sync(dev, &msg);
> > }
> > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > return mask;
> > }
> >
> > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > + unsigned int group, unsigned int asid)
> > +{
> > + /*
> > + * Two concurrent updates to this pointer are valid as they cannot
> > + * point to an invalid region. It is ok for them to race as long as
> > + * the readers see a consistent state through RCU.
> > + */
> > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
>
> I'd expect at least a synchronize_rcu() here to wait for the read is done?
>
What's the use? The only thing left here is to return from
vduse_set_group_asid_nomsg, and we don't need to wait for readers
here, do we?
> > +}
> > +
> > static void vduse_dev_reset(struct vduse_dev *dev)
> > {
> > int i;
> > - struct vduse_iova_domain *domain = dev->domain;
> >
> > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > - if (domain && domain->bounce_map)
> > - vduse_domain_reset_bounce_map(domain);
> > + for (i = 0; i < dev->nas; i++) {
> > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > +
> > + if (domain && domain->bounce_map)
> > + vduse_domain_reset_bounce_map(domain);
> > + }
> > +
> > + for (i = 0; i < dev->ngroups; i++)
> > + vduse_set_group_asid_nomsg(dev, i, 0);
> >
> > down_write(&dev->rwsem);
> >
> > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > return ret;
> > }
> >
> > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > + unsigned int asid)
> > +{
> > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > + struct vduse_dev_msg msg = { 0 };
> > + int r;
> > +
> > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > + group >= dev->ngroups || asid >= dev->nas)
> > + return -EINVAL;
> > +
> > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > + msg.req.vq_group_asid.group = group;
> > + msg.req.vq_group_asid.asid = asid;
> > +
> > + r = vduse_dev_msg_sync(dev, &msg);
> > + if (r < 0)
> > + return r;
> > +
> > + vduse_set_group_asid_nomsg(dev, group, asid);
> > + return 0;
> > +}
> > +
> > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > struct vdpa_vq_state *state)
> > {
> > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > int ret;
> >
> > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > if (ret)
> > return ret;
> >
> > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > if (ret) {
> > - vduse_domain_clear_map(dev->domain, iotlb);
> > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > return ret;
> > }
> >
> > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > .reset = vduse_vdpa_reset,
> > .set_map = vduse_vdpa_set_map,
> > + .set_group_asid = vduse_set_group_asid,
> > .get_vq_map = vduse_get_vq_map,
> > .free = vduse_vdpa_free,
> > };
> > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > enum dma_data_direction dir)
> > {
> > struct vduse_dev *vdev;
> > + struct vduse_as *as;
> > struct vduse_iova_domain *domain;
> >
> > if (!token.group)
> > return;
> >
> > vdev = token.group->dev;
> > - domain = vdev->domain;
> > -
> > + rcu_read_lock();
> > + as = rcu_dereference(token.group->as);
> > + domain = as->domain;
> > + rcu_read_unlock();
> > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
>
> This is suspicious, at least we should do rcu_read_unlock() after
> vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> works.
>
RCU is protecting that the address space pointer of the vq group is
not modified concurrently with the access. Ideally, this should be a
full lock, but just making sure that all accesses from the reader are
coherent is enough. Userspace should expect nothing if it uses the map
and modifies the vq group ASID at the same time anyway, but the kernel
needs to be sure that it does not see intermediate states. TBH, we
could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
The function vduse_domain uses its own lock to protect concurrent
access to the maps of the ASID itself, as they were needed before
implementing ASID already.
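To make the READ_ONCE / WRITE_ONCE idea above concrete, a rough sketch of what
that alternative could look like (hypothetical, not what this patch does, and
it assumes the __rcu annotation is dropped; it only works because every
dev->as[] entry lives as long as the device itself):

	/* Reader side: snapshot the group's address space exactly once. */
	struct vduse_as *as = READ_ONCE(token.group->as);

	vduse_domain_sync_single_for_device(as->domain, dma_addr, size, dir);

	/* Writer side: publish the new address space for the group. */
	WRITE_ONCE(dev->groups[group].as, &dev->as[asid]);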
> > }
> >
> > @@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
> > enum dma_data_direction dir)
> > {
> > struct vduse_dev *vdev;
> > + struct vduse_as *as;
> > struct vduse_iova_domain *domain;
> >
> > if (!token.group)
> > return;
> >
> > vdev = token.group->dev;
> > - domain = vdev->domain;
> > -
> > + rcu_read_lock();
> > + as = rcu_dereference(token.group->as);
> > + domain = as->domain;
> > + rcu_read_unlock();
> > vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
> > }
> >
> > @@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> > unsigned long attrs)
> > {
> > struct vduse_dev *vdev;
> > + struct vduse_as *as;
> > struct vduse_iova_domain *domain;
> > + dma_addr_t r;
> >
> > if (!token.group)
> > return DMA_MAPPING_ERROR;
> >
> > vdev = token.group->dev;
> > - domain = vdev->domain;
> > + rcu_read_lock();
> > + as = rcu_dereference(token.group->as);
> > + domain = as->domain;
> > + rcu_read_unlock();
> > + r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
>
> Same here.
>
> Thanks
>
On Fri, Nov 14, 2025 at 12:25:03PM +0100, Eugenio Perez Martin wrote:
> RCU is protecting that the address space pointer of the vq group is
> not modified concurrently with the access. Ideally, this should be a
> full lock, but just making sure that all accesses from the reader are
> coherent is enough. Userspace should expect nothing if it uses the map
> and modifies the vq group ASID at the same time anyway, but the kernel
> needs to be sure that it does not see intermediate states. TBH, we
> could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
generally rcu itself does not need ONCE macros.
these are for funky lockless things, and rcu can be
seen as a kind of lock, after all.
--
MST
On Mon, Nov 17, 2025 at 11:50 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Nov 14, 2025 at 12:25:03PM +0100, Eugenio Perez Martin wrote:
> > RCU is protecting that the address space pointer of the vq group is
> > not modified concurrently with the access. Ideally, this should be a
> > full lock, but just making sure that all accesses from the reader are
> > coherent is enough. Userspace should expect nothing if it uses the map
> > and modifies the vq group ASID at the same time anyway, but the kernel
> > needs to be sure that it does not see intermediate states. TBH, we
> > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
>
> generally rcu itself does not need ONCE macros.
> these are for funky lockless things, and rcu can be
> seen as a kind of lock, after all.
>
Right, I meant to replace RCU by READ_ONCE / WRITE_ONCE. But I also
prefer RCU.
On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > >
> > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > group. This enables mapping each group into a distinct memory space.
> > >
> > > Now that the driver can change ASID in the middle of operation, the
> > > domain that each vq address point is also protected by domain_lock.
> >
> > Maybe it's better to document what is protected by RCU and how.
> >
>
> I added the _rcu annotation but I can expand it for sure. I can also
> modify the commit message.
>
> > More below.
> >
> > >
> > > Acked-by: Jason Wang <jasowang@redhat.com>
>
> I forgot to remove this, my bad!
>
> > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > ---
> > > v9:
> > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > context.
> > >
> > > v8:
> > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > justify it.
> > >
> > > v7:
> > > * Take write lock in the error path (Jason).
> > >
> > > v6:
> > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > (MST).
> > > * Fix struct name not matching in the doc.
> > >
> > > v5:
> > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > ioctl (Jason).
> > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > * Exclude "padding" member from the only >V1 members in
> > > vduse_dev_request.
> > >
> > > v4:
> > > * Divide each domain bounce size between the device bounce size (Jason).
> > > * revert unneeded addr = NULL assignment (Jason)
> > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > return; } (Jason)
> > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > * Consider config->nas == 0 as a fail (Jason).
> > >
> > > v3:
> > > * Get the vduse domain through the vduse_as in the map functions
> > > (Jason).
> > > * Squash with the patch creating the vduse_as struct (Jason).
> > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > (Jason)
> > >
> > > v2:
> > > * Convert the use of mutex to rwlock.
> > >
> > > RFC v3:
> > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > value to reduce memory consumption, but vqs are already limited to
> > > that value and userspace VDUSE is able to allocate that many vqs.
> > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > VDUSE_IOTLB_GET_INFO.
> > > * Use of array_index_nospec in VDUSE device ioctls.
> > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > * Move the umem mutex to asid struct so there is no contention between
> > > ASIDs.
> > >
> > > RFC v2:
> > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > part of the struct is the same.
> > > ---
> > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > include/uapi/linux/vduse.h | 53 ++++-
> > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index 97be04f73fbf..ff95ed56f22d 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -11,6 +11,7 @@
> > > #include "linux/virtio_net.h"
> > > #include <linux/init.h>
> > > #include <linux/module.h>
> > > +#include <linux/rcupdate.h>
> > > #include <linux/cdev.h>
> > > #include <linux/device.h>
> > > #include <linux/eventfd.h>
> > > @@ -41,6 +42,7 @@
> > >
> > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > struct mm_struct *mm;
> > > };
> > >
> > > +struct vduse_as {
> > > + struct vduse_iova_domain *domain;
> > > + struct vduse_umem *umem;
> > > + struct mutex mem_lock;
> > > +};
> > > +
> > > struct vduse_vq_group {
> > > + struct vduse_as *as __rcu;
> > > struct vduse_dev *dev;
> > > };
> > >
> > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > struct vduse_vdpa *vdev;
> > > struct device *dev;
> > > struct vduse_virtqueue **vqs;
> > > - struct vduse_iova_domain *domain;
> > > + struct vduse_as *as;
> > > char *name;
> > > struct mutex lock;
> > > spinlock_t msg_lock;
> > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > u32 vq_num;
> > > u32 vq_align;
> > > u32 ngroups;
> > > - struct vduse_umem *umem;
> > > + u32 nas;
> > > struct vduse_vq_group *groups;
> > > - struct mutex mem_lock;
> > > unsigned int bounce_size;
> > > struct mutex domain_lock;
> > > };
> > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > return vduse_dev_msg_sync(dev, &msg);
> > > }
> > >
> > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > u64 start, u64 last)
> > > {
> > > struct vduse_dev_msg msg = { 0 };
> > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > return -EINVAL;
> > >
> > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > - msg.req.iova.start = start;
> > > - msg.req.iova.last = last;
> > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > + msg.req.iova.start = start;
> > > + msg.req.iova.last = last;
> > > + } else {
> > > + msg.req.iova_v2.start = start;
> > > + msg.req.iova_v2.last = last;
> > > + msg.req.iova_v2.asid = asid;
> > > + }
> > >
> > > return vduse_dev_msg_sync(dev, &msg);
> > > }
> > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > return mask;
> > > }
> > >
> > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > + unsigned int group, unsigned int asid)
> > > +{
> > > + /*
> > > + * Two concurrent updates to this pointer are valid as they cannot
> > > + * point to an invalid region. It is ok for them to race as long as
> > > + * the readers see a consistent state through RCU.
> > > + */
> > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> >
> > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> >
>
> What's the use? The only thing left here is to return from
> vduse_set_group_asid_nomsg, and we don't need to wait for readers
> here, do we?
See below.
>
> > > +}
> > > +
> > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > {
> > > int i;
> > > - struct vduse_iova_domain *domain = dev->domain;
> > >
> > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > - if (domain && domain->bounce_map)
> > > - vduse_domain_reset_bounce_map(domain);
> > > + for (i = 0; i < dev->nas; i++) {
> > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > +
> > > + if (domain && domain->bounce_map)
> > > + vduse_domain_reset_bounce_map(domain);
> > > + }
> > > +
> > > + for (i = 0; i < dev->ngroups; i++)
> > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > >
> > > down_write(&dev->rwsem);
> > >
> > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > return ret;
> > > }
> > >
> > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > + unsigned int asid)
> > > +{
> > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > + struct vduse_dev_msg msg = { 0 };
> > > + int r;
> > > +
> > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > + group >= dev->ngroups || asid >= dev->nas)
> > > + return -EINVAL;
> > > +
> > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > + msg.req.vq_group_asid.group = group;
> > > + msg.req.vq_group_asid.asid = asid;
> > > +
> > > + r = vduse_dev_msg_sync(dev, &msg);
> > > + if (r < 0)
> > > + return r;
> > > +
> > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > + return 0;
> > > +}
> > > +
> > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > struct vdpa_vq_state *state)
> > > {
> > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > int ret;
> > >
> > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > if (ret)
> > > return ret;
> > >
> > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > if (ret) {
> > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > return ret;
> > > }
> > >
> > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > .reset = vduse_vdpa_reset,
> > > .set_map = vduse_vdpa_set_map,
> > > + .set_group_asid = vduse_set_group_asid,
> > > .get_vq_map = vduse_get_vq_map,
> > > .free = vduse_vdpa_free,
> > > };
> > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > enum dma_data_direction dir)
> > > {
> > > struct vduse_dev *vdev;
> > > + struct vduse_as *as;
> > > struct vduse_iova_domain *domain;
> > >
> > > if (!token.group)
> > > return;
> > >
> > > vdev = token.group->dev;
> > > - domain = vdev->domain;
> > > -
> > > + rcu_read_lock();
> > > + as = rcu_dereference(token.group->as);
> > > + domain = as->domain;
> > > + rcu_read_unlock();
> > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> >
> > This is suspicious, at least we should do rcu_read_unlock() after
> > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > works.
> >
>
> RCU is protecting that the address space pointer of the vq group is
> not modified concurrently with the access. Ideally, this should be a
> full lock, but just making sure that all accesses from the reader are
> coherent is enough. Userspace should expect nothing if it uses the map
> and modifies the vq group ASID at the same time anyway, but the kernel
> needs to be sure that it does not see intermediate states. TBH, we
> could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
handled correctly.
But I meant what happens if
[cpu0]rcu_read_lock()
[cpu0]as = rcu_dereference(token.group->as)
[cpu0]...
[cpu0]rcu_read_unlock()
[cpu1]rcu_assign_pointer(token.group->as)
[cpu0]vduse_domain_sync_single_for_device()
If this is not an issue, RCU is not a must, but please explain why.
If this is an issue, we need to fix it.
It's basically a question that
1) should we need to wait for the DMA to be completed before assigning
to the new as
2) should we track the set_group_asid() for the group that has pending
DMA to avoid potential issue
>
> The function vduse_domain uses its own lock to protect concurrent
> access to the maps of the ASID itself, as they were needed before
> implementing ASID already.
Thanks
>
> > > }
> > >
> > > @@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
> > > enum dma_data_direction dir)
> > > {
> > > struct vduse_dev *vdev;
> > > + struct vduse_as *as;
> > > struct vduse_iova_domain *domain;
> > >
> > > if (!token.group)
> > > return;
> > >
> > > vdev = token.group->dev;
> > > - domain = vdev->domain;
> > > -
> > > + rcu_read_lock();
> > > + as = rcu_dereference(token.group->as);
> > > + domain = as->domain;
> > > + rcu_read_unlock();
> > > vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
> > > }
> > >
> > > @@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> > > unsigned long attrs)
> > > {
> > > struct vduse_dev *vdev;
> > > + struct vduse_as *as;
> > > struct vduse_iova_domain *domain;
> > > + dma_addr_t r;
> > >
> > > if (!token.group)
> > > return DMA_MAPPING_ERROR;
> > >
> > > vdev = token.group->dev;
> > > - domain = vdev->domain;
> > > + rcu_read_lock();
> > > + as = rcu_dereference(token.group->as);
> > > + domain = as->domain;
> > > + rcu_read_unlock();
> > > + r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
> >
> > Same here.
> >
> > Thanks
> >
>
On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > >
> > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > group. This enables mapping each group into a distinct memory space.
> > > >
> > > > Now that the driver can change ASID in the middle of operation, the
> > > > domain that each vq address point is also protected by domain_lock.
> > >
> > > Maybe it's better to document what is protected by RCU and how.
> > >
> >
> > I added the _rcu annotation but I can expand it for sure. I can also
> > modify the commit message.
> >
> > > More below.
> > >
> > > >
> > > > Acked-by: Jason Wang <jasowang@redhat.com>
> >
> > I forgot to remove this, my bad!
> >
> > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > ---
> > > > v9:
> > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > context.
> > > >
> > > > v8:
> > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > justify it.
> > > >
> > > > v7:
> > > > * Take write lock in the error path (Jason).
> > > >
> > > > v6:
> > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > (MST).
> > > > * Fix struct name not matching in the doc.
> > > >
> > > > v5:
> > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > ioctl (Jason).
> > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > * Exclude "padding" member from the only >V1 members in
> > > > vduse_dev_request.
> > > >
> > > > v4:
> > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > * revert unneeded addr = NULL assignment (Jason)
> > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > return; } (Jason)
> > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > * Consider config->nas == 0 as a fail (Jason).
> > > >
> > > > v3:
> > > > * Get the vduse domain through the vduse_as in the map functions
> > > > (Jason).
> > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > (Jason)
> > > >
> > > > v2:
> > > > * Convert the use of mutex to rwlock.
> > > >
> > > > RFC v3:
> > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > value to reduce memory consumption, but vqs are already limited to
> > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > VDUSE_IOTLB_GET_INFO.
> > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > * Move the umem mutex to asid struct so there is no contention between
> > > > ASIDs.
> > > >
> > > > RFC v2:
> > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > part of the struct is the same.
> > > > ---
> > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > >
> > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > @@ -11,6 +11,7 @@
> > > > #include "linux/virtio_net.h"
> > > > #include <linux/init.h>
> > > > #include <linux/module.h>
> > > > +#include <linux/rcupdate.h>
> > > > #include <linux/cdev.h>
> > > > #include <linux/device.h>
> > > > #include <linux/eventfd.h>
> > > > @@ -41,6 +42,7 @@
> > > >
> > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > struct mm_struct *mm;
> > > > };
> > > >
> > > > +struct vduse_as {
> > > > + struct vduse_iova_domain *domain;
> > > > + struct vduse_umem *umem;
> > > > + struct mutex mem_lock;
> > > > +};
> > > > +
> > > > struct vduse_vq_group {
> > > > + struct vduse_as *as __rcu;
> > > > struct vduse_dev *dev;
> > > > };
> > > >
> > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > struct vduse_vdpa *vdev;
> > > > struct device *dev;
> > > > struct vduse_virtqueue **vqs;
> > > > - struct vduse_iova_domain *domain;
> > > > + struct vduse_as *as;
> > > > char *name;
> > > > struct mutex lock;
> > > > spinlock_t msg_lock;
> > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > u32 vq_num;
> > > > u32 vq_align;
> > > > u32 ngroups;
> > > > - struct vduse_umem *umem;
> > > > + u32 nas;
> > > > struct vduse_vq_group *groups;
> > > > - struct mutex mem_lock;
> > > > unsigned int bounce_size;
> > > > struct mutex domain_lock;
> > > > };
> > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > return vduse_dev_msg_sync(dev, &msg);
> > > > }
> > > >
> > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > u64 start, u64 last)
> > > > {
> > > > struct vduse_dev_msg msg = { 0 };
> > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > return -EINVAL;
> > > >
> > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > - msg.req.iova.start = start;
> > > > - msg.req.iova.last = last;
> > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > + msg.req.iova.start = start;
> > > > + msg.req.iova.last = last;
> > > > + } else {
> > > > + msg.req.iova_v2.start = start;
> > > > + msg.req.iova_v2.last = last;
> > > > + msg.req.iova_v2.asid = asid;
> > > > + }
> > > >
> > > > return vduse_dev_msg_sync(dev, &msg);
> > > > }
> > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > return mask;
> > > > }
> > > >
> > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > + unsigned int group, unsigned int asid)
> > > > +{
> > > > + /*
> > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > + * point to an invalid region. It is ok for them to race as long as
> > > > + * the readers see a consistent state through RCU.
> > > > + */
> > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > >
> > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > >
> >
> > What's the use? The only thing left here is to return from
> > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > here, do we?
>
> See below.
>
> >
> > > > +}
> > > > +
> > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > {
> > > > int i;
> > > > - struct vduse_iova_domain *domain = dev->domain;
> > > >
> > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > - if (domain && domain->bounce_map)
> > > > - vduse_domain_reset_bounce_map(domain);
> > > > + for (i = 0; i < dev->nas; i++) {
> > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > +
> > > > + if (domain && domain->bounce_map)
> > > > + vduse_domain_reset_bounce_map(domain);
> > > > + }
> > > > +
> > > > + for (i = 0; i < dev->ngroups; i++)
> > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > >
> > > > down_write(&dev->rwsem);
> > > >
> > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > return ret;
> > > > }
> > > >
> > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > + unsigned int asid)
> > > > +{
> > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > + struct vduse_dev_msg msg = { 0 };
> > > > + int r;
> > > > +
> > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > + return -EINVAL;
> > > > +
> > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > + msg.req.vq_group_asid.group = group;
> > > > + msg.req.vq_group_asid.asid = asid;
> > > > +
> > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > + if (r < 0)
> > > > + return r;
> > > > +
> > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > + return 0;
> > > > +}
> > > > +
> > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > struct vdpa_vq_state *state)
> > > > {
> > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > int ret;
> > > >
> > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > if (ret)
> > > > return ret;
> > > >
> > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > if (ret) {
> > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > return ret;
> > > > }
> > > >
> > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > .reset = vduse_vdpa_reset,
> > > > .set_map = vduse_vdpa_set_map,
> > > > + .set_group_asid = vduse_set_group_asid,
> > > > .get_vq_map = vduse_get_vq_map,
> > > > .free = vduse_vdpa_free,
> > > > };
> > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > enum dma_data_direction dir)
> > > > {
> > > > struct vduse_dev *vdev;
> > > > + struct vduse_as *as;
> > > > struct vduse_iova_domain *domain;
> > > >
> > > > if (!token.group)
> > > > return;
> > > >
> > > > vdev = token.group->dev;
> > > > - domain = vdev->domain;
> > > > -
> > > > + rcu_read_lock();
> > > > + as = rcu_dereference(token.group->as);
> > > > + domain = as->domain;
> > > > + rcu_read_unlock();
> > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > >
> > > This is suspicious, at least we should do rcu_read_unlock() after
> > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > works.
> > >
> >
> > RCU is protecting that the address space pointer of the vq group is
> > not modified concurrently with the access. Ideally, this should be a
> > full lock, but just making sure that all accesses from the reader are
> > coherent is enough. Userspace should expect nothing if it uses the map
> > and modifies the vq group ASID at the same time anyway, but the kernel
> > needs to be sure that it does not see intermediate states. TBH, we
> > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
>
> Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> handled correctly.
>
> But I meant what happens if
>
> [cpu0]rcu_read_lock()
> [cpu0]as = rcu_dereference(token.group->as)
> [cpu0]...
> [cpu0]rcu_read_unlock()
> [cpu1]rcu_assign_pointer(token.group->as)
> [cpu0]vduse_domain_sync_single_for_device()
>
That should go ok. What I'm trying to protect here is the iterations
in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
I'm going to embed that function here in
vduse_dev_sync_single_for_device and omit RCU and some details to make
the point easier:
vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
iova, size_t size, ...) {
read_lock(&token.group->as->domain);
while (size) {
map = token.group->as->domain->bounce_maps[iova];
sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
...
page = token.group->as->domain->bounce_maps
addr = kmap_local_page(page);
do_bounce(map->orig_phys, addr, sz, dir);
kunmap_local(addr);
size -= sz;
iova += sz;
}
read_unlock(&token.group->as->domain);
}
Now, depending on the point where another execution thread changes
token.group->as and how the compiler has chosen to generate the
machine code, the outcome could be:
1) The domain read lock of one ASID is taken but the domain lock of
another ASID is the one that gets unlocked.
2) We iterate until iova is valid for the ASID we're handling, but not
for the other one, so we access an invalid offset in
bounce_maps[iova].
And I guess there are other possible outcomes too.
So I need to make sure that all the pointer accesses in
vduse_domain_bounce are coherent. I'm ok if it takes the value before
the concurrent call to vduse_set_group_asid_nomsg or the one after it,
as the lifetime of every domain is bound to the device. But it cannot
change in the middle of the operation:
vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
iova, size_t size, ...) {
as = token.group->as;
// Tell the compiler to never replace "as" by "token.group->as" after this.
read_lock(&as->domain);
while (size) {
map = as->domain->bounce_maps[iova];
sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
...
page = as->domain->bounce_maps
addr = kmap_local_page(page);
do_bounce(map->orig_phys, addr, sz, dir);
kunmap_local(addr);
size -= sz;
iova += sz;
}
read_unlock(&as->domain);
}
That can be done in many ways. Probably the read_lock is already
enough but it is not explicit that it is protecting token.group->as,
and future changes could remove it. To me, RCU is the most clear way
to do it, but even a volatile read (READ_ONCE?) would do.
> If this is not an issue, RCU is not a must, but please explain why.
> If this is an issue, we need to fix it.
>
> It's basically a question that
>
> 1) should we need to wait for the DMA to be completed before assigning
> to the new as
I don't think so; it is valid to assign a new ASID and let the ongoing
operation continue. It is racy and the operation could fail, but the
kernel just returns an error and doesn't access invalid memory or
anything similar.
> 2) should we track the set_group_asid() for the group that has pending
> DMA to avoid potential issue
>
No, the group will outlive the operation as it is bound to the device.
> >
> > The function vduse_domain uses its own lock to protect concurrent
> > access to the maps of the ASID itself, as they were needed before
> > implementing ASID already.
>
> Thanks
>
> >
> > > > }
> > > >
> > > > @@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
> > > > enum dma_data_direction dir)
> > > > {
> > > > struct vduse_dev *vdev;
> > > > + struct vduse_as *as;
> > > > struct vduse_iova_domain *domain;
> > > >
> > > > if (!token.group)
> > > > return;
> > > >
> > > > vdev = token.group->dev;
> > > > - domain = vdev->domain;
> > > > -
> > > > + rcu_read_lock();
> > > > + as = rcu_dereference(token.group->as);
> > > > + domain = as->domain;
> > > > + rcu_read_unlock();
> > > > vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
> > > > }
> > > >
> > > > @@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> > > > unsigned long attrs)
> > > > {
> > > > struct vduse_dev *vdev;
> > > > + struct vduse_as *as;
> > > > struct vduse_iova_domain *domain;
> > > > + dma_addr_t r;
> > > >
> > > > if (!token.group)
> > > > return DMA_MAPPING_ERROR;
> > > >
> > > > vdev = token.group->dev;
> > > > - domain = vdev->domain;
> > > > + rcu_read_lock();
> > > > + as = rcu_dereference(token.group->as);
> > > > + domain = as->domain;
> > > > + rcu_read_unlock();
> > > > + r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
> > >
> > > Same here.
> > >
> > > Thanks
> > >
> >
>
On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > >
> > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > group. This enables mapping each group into a distinct memory space.
> > > > >
> > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > domain that each vq address point is also protected by domain_lock.
> > > >
> > > > Maybe it's better to document what is protected by RCU and how.
> > > >
> > >
> > > I added the _rcu annotation but I can expand it for sure. I can also
> > > modify the commit message.
> > >
> > > > More below.
> > > >
> > > > >
> > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > >
> > > I forgot to remove this, my bad!
> > >
> > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > ---
> > > > > v9:
> > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > context.
> > > > >
> > > > > v8:
> > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > justify it.
> > > > >
> > > > > v7:
> > > > > * Take write lock in the error path (Jason).
> > > > >
> > > > > v6:
> > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > (MST).
> > > > > * Fix struct name not matching in the doc.
> > > > >
> > > > > v5:
> > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > ioctl (Jason).
> > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > * Exclude "padding" member from the only >V1 members in
> > > > > vduse_dev_request.
> > > > >
> > > > > v4:
> > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > return; } (Jason)
> > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > >
> > > > > v3:
> > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > (Jason).
> > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > (Jason)
> > > > >
> > > > > v2:
> > > > > * Convert the use of mutex to rwlock.
> > > > >
> > > > > RFC v3:
> > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > VDUSE_IOTLB_GET_INFO.
> > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > ASIDs.
> > > > >
> > > > > RFC v2:
> > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > part of the struct is the same.
> > > > > ---
> > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > >
> > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > @@ -11,6 +11,7 @@
> > > > > #include "linux/virtio_net.h"
> > > > > #include <linux/init.h>
> > > > > #include <linux/module.h>
> > > > > +#include <linux/rcupdate.h>
> > > > > #include <linux/cdev.h>
> > > > > #include <linux/device.h>
> > > > > #include <linux/eventfd.h>
> > > > > @@ -41,6 +42,7 @@
> > > > >
> > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > struct mm_struct *mm;
> > > > > };
> > > > >
> > > > > +struct vduse_as {
> > > > > + struct vduse_iova_domain *domain;
> > > > > + struct vduse_umem *umem;
> > > > > + struct mutex mem_lock;
> > > > > +};
> > > > > +
> > > > > struct vduse_vq_group {
> > > > > + struct vduse_as *as __rcu;
> > > > > struct vduse_dev *dev;
> > > > > };
> > > > >
> > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > struct vduse_vdpa *vdev;
> > > > > struct device *dev;
> > > > > struct vduse_virtqueue **vqs;
> > > > > - struct vduse_iova_domain *domain;
> > > > > + struct vduse_as *as;
> > > > > char *name;
> > > > > struct mutex lock;
> > > > > spinlock_t msg_lock;
> > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > u32 vq_num;
> > > > > u32 vq_align;
> > > > > u32 ngroups;
> > > > > - struct vduse_umem *umem;
> > > > > + u32 nas;
> > > > > struct vduse_vq_group *groups;
> > > > > - struct mutex mem_lock;
> > > > > unsigned int bounce_size;
> > > > > struct mutex domain_lock;
> > > > > };
> > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > }
> > > > >
> > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > u64 start, u64 last)
> > > > > {
> > > > > struct vduse_dev_msg msg = { 0 };
> > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > return -EINVAL;
> > > > >
> > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > - msg.req.iova.start = start;
> > > > > - msg.req.iova.last = last;
> > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > + msg.req.iova.start = start;
> > > > > + msg.req.iova.last = last;
> > > > > + } else {
> > > > > + msg.req.iova_v2.start = start;
> > > > > + msg.req.iova_v2.last = last;
> > > > > + msg.req.iova_v2.asid = asid;
> > > > > + }
> > > > >
> > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > }
> > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > return mask;
> > > > > }
> > > > >
> > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > + unsigned int group, unsigned int asid)
> > > > > +{
> > > > > + /*
> > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > + * the readers see a consistent state through RCU.
> > > > > + */
> > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > >
> > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > >
> > >
> > > What's the use? The only thing left here is to return from
> > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > here, do we?
> >
> > See below.
> >
> > >
> > > > > +}
> > > > > +
> > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > {
> > > > > int i;
> > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > >
> > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > - if (domain && domain->bounce_map)
> > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > +
> > > > > + if (domain && domain->bounce_map)
> > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > + }
> > > > > +
> > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > >
> > > > > down_write(&dev->rwsem);
> > > > >
> > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > return ret;
> > > > > }
> > > > >
> > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > + unsigned int asid)
> > > > > +{
> > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > + int r;
> > > > > +
> > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > + return -EINVAL;
> > > > > +
> > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > + msg.req.vq_group_asid.group = group;
> > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > +
> > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > + if (r < 0)
> > > > > + return r;
> > > > > +
> > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > struct vdpa_vq_state *state)
> > > > > {
> > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > int ret;
> > > > >
> > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > if (ret)
> > > > > return ret;
> > > > >
> > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > if (ret) {
> > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > return ret;
> > > > > }
> > > > >
> > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > .reset = vduse_vdpa_reset,
> > > > > .set_map = vduse_vdpa_set_map,
> > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > .get_vq_map = vduse_get_vq_map,
> > > > > .free = vduse_vdpa_free,
> > > > > };
> > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > enum dma_data_direction dir)
> > > > > {
> > > > > struct vduse_dev *vdev;
> > > > > + struct vduse_as *as;
> > > > > struct vduse_iova_domain *domain;
> > > > >
> > > > > if (!token.group)
> > > > > return;
> > > > >
> > > > > vdev = token.group->dev;
> > > > > - domain = vdev->domain;
> > > > > -
> > > > > + rcu_read_lock();
> > > > > + as = rcu_dereference(token.group->as);
> > > > > + domain = as->domain;
> > > > > + rcu_read_unlock();
> > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > >
> > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > works.
> > > >
> > >
> > > RCU is protecting that the address space pointer of the vq group is
> > > not modified concurrently with the access. Ideally, this should be a
> > > full lock, but just making sure that all accesses from the reader are
> > > coherent is enough. Userspace should expect nothing if it uses the map
> > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > needs to be sure that it does not see intermediate states. TBH, we
> > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> >
> > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > handled correctly.
> >
> > But I meant what happens if
> >
> > [cpu0]rcu_read_lock()
> > [cpu0]as = rcu_dereference(token.group->as)
> > [cpu0]...
> > [cpu0]rcu_read_unlock()
> > [cpu1]rcu_assign_pointer(token.group->as)
> > [cpu0]vduse_domain_sync_single_for_device()
> >
>
> That should go ok. What I'm trying to protect here is the iterations
> in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
>
> I'm going to embed that function here in
> vduse_dev_sync_single_for_device and omit RCU and some details to make
> the point easier:
>
> vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> iova, size_t size, ...) {
> read_lock(&token.group->as->domain);
> while (size)
> map = token.group->as->domain->bounce_maps[iova];
> sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
>
> ...
> page = token_group->as->domain->bounce_maps
> addr = kmap_local_page(page);
> do_bounce(map->orig_phys, addr, sz, dir);
> kunmap_local(addr);
> size -= sz;
> iova += sz;
> }
> read_unlock(&token.group->as->domain);
> }
Right, so I meant rwlock-like semantics (let's forget about the sleeping here).
vduse_set_group_asid_nomsg() should take the "write lock", so it must wait
for the "read lock" side to be done. But that is not the logic
implemented in this patch, as there's no synchronize_rcu() in
vduse_set_group_asid_nomsg(). We need to explain why set_group_asid()
doesn't need to wait, and if that is true we probably don't need RCU
at all, just a guarantee that the load/store is atomic.
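Something like this, just to illustrate the atomic load/store idea
(untested sketch; it would also mean dropping the __rcu annotation):

    /* writer side, vduse_set_group_asid_nomsg() */
    WRITE_ONCE(dev->groups[group].as, &dev->as[asid]);

    /* reader side, e.g. the map/sync ops */
    struct vduse_as *as = READ_ONCE(token.group->as);
    struct vduse_iova_domain *domain = as->domain;

    /* from here on only "as"/"domain" are used,
     * token.group->as is never re-loaded
     */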
>
> Now, depending on the point where another execution thread changes
> token_group->as and how the compiler has chosen to generate the
> machine code, the outcome could be:
> 1) The domain read lock of one ASID is taken but the domain lock of
> another as is unlocked.
> 2) We iterate until iova is ok for the ASID we're handling, but not
> for the other one. So we access an invalid offset in
> bounce_maps[iova].
>
> And I guess there are other possible outcomes too.
>
> So I need to make sure that the pointer accesses in all
> vduse_domain_bounce is coherent.
I'm not sure I follow here, but it looks like it accepts a domain
parameter and is protected by the bounce lock, so we are probably fine
here?
> I'm ok if it takes the one before the
> concurrent call to vduse_set_group_asid_nomsg or the one after that,
> as the lifetime of all domains are bound to the device. But it cannot
> change in the middle of the operation:
>
> vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> iova, size_t size, ...) {
> as = token.group->as;
> // Tell the compiler to never replace "as" by "token.group->as" after this.
> read_lock(&as->domain);
> while (size)
> map = as->domain->bounce_maps[iova];
> sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
>
> ...
> page = as->domain->bounce_maps
> addr = kmap_local_page(page);
> do_bounce(map->orig_phys, addr, sz, dir);
> kunmap_local(addr);
> size -= sz;
> iova += sz;
> }
> read_unlock(&as->domain);
> }
>
> That can be done in many ways. Probably the read_lock is already
> enough but it is not explicit that it is protecting token.group->as,
> and future changes could remove it. To me, RCU is the most clear way
> to do it, but even a volatile read (READ_ONCE?) would do.
I wonder if another per-group rwlock is sufficient here:

for set_group_asid():

write_lock(&dev->groups[group].lock);
dev->groups[group].as = &dev->as[asid];
write_unlock(&dev->groups[group].lock);

for the case where we need to dereference the as:

read_lock(&dev->groups[group].lock);
as = dev->groups[group].as;
// using as
read_unlock(&dev->groups[group].lock);

If this works, we don't need to bother with thinking about whether the
wait/synchronize_rcu() is really needed or not?
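That would mean something like this in the structures (untested
sketch; an rwlock_t doesn't sleep, unlike the mutex, so it could be
taken from the atomic context of the map ops):

    struct vduse_vq_group {
            struct vduse_as *as;
            rwlock_t lock; /* protects @as */
            struct vduse_dev *dev;
    };

    /* when the groups are allocated */
    for (i = 0; i < dev->ngroups; i++)
            rwlock_init(&dev->groups[i].lock);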
>
> > If this is not an issue, RCU is not a must, but please explain why.
> > If this is an issue, we need to fix it.
> >
> > It's basically a question that
> >
> > 1) should we need to wait for the DMA to be completed before assigning
> > to the new as
>
> I don't think so, it is valid to assign a new as and let the ongoing
> operation to continue. It is racy and the operation could fail, but
> the kernel just returns an error and doesn't access invalid memory or
> similar.
See below.
>
> > 2) should we track the set_group_asid() for the group that has pending
> > DMA to avoid potential issue
> >
>
> No, the group will outlive the operation as it is bound to the device.
I meant that, e.g., the DMA could be triggered by the device. For example, the
device may try to trigger an interrupt while the kernel is trying to
assign a new asid. So I wonder if the guest can use this to poke QEMU's
memory, etc. But if you mean we depend on the IOTLB to guard against
this, I'm fine, but let's document why we don't need it and how the
IOTLB layer helps to eliminate such a risk.
Anyhow, tracking and failing seems more robust. For example, the
kernel swiotlb has a layer to track whether the address being
unmapped was mapped before. In our case we should make sure that, when
the race happens, isolation via ASID isn't broken.
Thanks
>
> > >
> > > The function vduse_domain uses its own lock to protect concurrent
> > > access to the maps of the ASID itself, as they were needed before
> > > implementing ASID already.
> >
> > Thanks
> >
> > >
> > > > > }
> > > > >
> > > > > @@ -868,14 +927,17 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
> > > > > enum dma_data_direction dir)
> > > > > {
> > > > > struct vduse_dev *vdev;
> > > > > + struct vduse_as *as;
> > > > > struct vduse_iova_domain *domain;
> > > > >
> > > > > if (!token.group)
> > > > > return;
> > > > >
> > > > > vdev = token.group->dev;
> > > > > - domain = vdev->domain;
> > > > > -
> > > > > + rcu_read_lock();
> > > > > + as = rcu_dereference(token.group->as);
> > > > > + domain = as->domain;
> > > > > + rcu_read_unlock();
> > > > > vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
> > > > > }
> > > > >
> > > > > @@ -885,15 +947,21 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> > > > > unsigned long attrs)
> > > > > {
> > > > > struct vduse_dev *vdev;
> > > > > + struct vduse_as *as;
> > > > > struct vduse_iova_domain *domain;
> > > > > + dma_addr_t r;
> > > > >
> > > > > if (!token.group)
> > > > > return DMA_MAPPING_ERROR;
> > > > >
> > > > > vdev = token.group->dev;
> > > > > - domain = vdev->domain;
> > > > > + rcu_read_lock();
> > > > > + as = rcu_dereference(token.group->as);
> > > > > + domain = as->domain;
> > > > > + rcu_read_unlock();
> > > > > + r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
> > > >
> > > > Same here.
> > > >
> > > > Thanks
> > > >
> > >
> >
>
On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > >
> > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > domain that each vq address point is also protected by domain_lock.
> > > > >
> > > > > Maybe it's better to document what is protected by RCU and how.
> > > > >
> > > >
> > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > modify the commit message.
> > > >
> > > > > More below.
> > > > >
> > > > > >
> > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > > I forgot to remove this, my bad!
> > > >
> > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > [...]
> > > > > >
> > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > @@ -11,6 +11,7 @@
> > > > > > #include "linux/virtio_net.h"
> > > > > > #include <linux/init.h>
> > > > > > #include <linux/module.h>
> > > > > > +#include <linux/rcupdate.h>
> > > > > > #include <linux/cdev.h>
> > > > > > #include <linux/device.h>
> > > > > > #include <linux/eventfd.h>
> > > > > > @@ -41,6 +42,7 @@
> > > > > >
> > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > struct mm_struct *mm;
> > > > > > };
> > > > > >
> > > > > > +struct vduse_as {
> > > > > > + struct vduse_iova_domain *domain;
> > > > > > + struct vduse_umem *umem;
> > > > > > + struct mutex mem_lock;
> > > > > > +};
> > > > > > +
> > > > > > struct vduse_vq_group {
> > > > > > + struct vduse_as *as __rcu;
> > > > > > struct vduse_dev *dev;
> > > > > > };
> > > > > >
> > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > struct vduse_vdpa *vdev;
> > > > > > struct device *dev;
> > > > > > struct vduse_virtqueue **vqs;
> > > > > > - struct vduse_iova_domain *domain;
> > > > > > + struct vduse_as *as;
> > > > > > char *name;
> > > > > > struct mutex lock;
> > > > > > spinlock_t msg_lock;
> > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > u32 vq_num;
> > > > > > u32 vq_align;
> > > > > > u32 ngroups;
> > > > > > - struct vduse_umem *umem;
> > > > > > + u32 nas;
> > > > > > struct vduse_vq_group *groups;
> > > > > > - struct mutex mem_lock;
> > > > > > unsigned int bounce_size;
> > > > > > struct mutex domain_lock;
> > > > > > };
> > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > }
> > > > > >
> > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > u64 start, u64 last)
> > > > > > {
> > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > return -EINVAL;
> > > > > >
> > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > - msg.req.iova.start = start;
> > > > > > - msg.req.iova.last = last;
> > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > + msg.req.iova.start = start;
> > > > > > + msg.req.iova.last = last;
> > > > > > + } else {
> > > > > > + msg.req.iova_v2.start = start;
> > > > > > + msg.req.iova_v2.last = last;
> > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > + }
> > > > > >
> > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > }
> > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > return mask;
> > > > > > }
> > > > > >
> > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > + unsigned int group, unsigned int asid)
> > > > > > +{
> > > > > > + /*
> > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > + * the readers see a consistent state through RCU.
> > > > > > + */
> > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > >
> > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > >
> > > >
> > > > What's the use? The only thing left here is to return from
> > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > here, do we?
> > >
> > > See below.
> > >
> > > >
> > > > > > +}
> > > > > > +
> > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > {
> > > > > > int i;
> > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > >
> > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > - if (domain && domain->bounce_map)
> > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > +
> > > > > > + if (domain && domain->bounce_map)
> > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > + }
> > > > > > +
> > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > >
> > > > > > down_write(&dev->rwsem);
> > > > > >
> > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > return ret;
> > > > > > }
> > > > > >
> > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > + unsigned int asid)
> > > > > > +{
> > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > + int r;
> > > > > > +
> > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > + return -EINVAL;
> > > > > > +
> > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > +
> > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > + if (r < 0)
> > > > > > + return r;
> > > > > > +
> > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > + return 0;
> > > > > > +}
> > > > > > +
> > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > struct vdpa_vq_state *state)
> > > > > > {
> > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > int ret;
> > > > > >
> > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > if (ret)
> > > > > > return ret;
> > > > > >
> > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > if (ret) {
> > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > return ret;
> > > > > > }
> > > > > >
> > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > .reset = vduse_vdpa_reset,
> > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > .free = vduse_vdpa_free,
> > > > > > };
> > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > enum dma_data_direction dir)
> > > > > > {
> > > > > > struct vduse_dev *vdev;
> > > > > > + struct vduse_as *as;
> > > > > > struct vduse_iova_domain *domain;
> > > > > >
> > > > > > if (!token.group)
> > > > > > return;
> > > > > >
> > > > > > vdev = token.group->dev;
> > > > > > - domain = vdev->domain;
> > > > > > -
> > > > > > + rcu_read_lock();
> > > > > > + as = rcu_dereference(token.group->as);
> > > > > > + domain = as->domain;
> > > > > > + rcu_read_unlock();
> > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > >
> > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > works.
> > > > >
> > > >
> > > > RCU is protecting that the address space pointer of the vq group is
> > > > not modified concurrently with the access. Ideally, this should be a
> > > > full lock, but just making sure that all accesses from the reader are
> > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > >
> > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > handled correctly.
> > >
> > > But I meant what happens if
> > >
> > > [cpu0]rcu_read_lock()
> > > [cpu0]as = rcu_dereference(token.group->as)
> > > [cpu0]...
> > > [cpu0]rcu_read_unlock()
> > > [cpu1]rcu_assign_pointer(token.group->as)
> > > [cpu0]vduse_domain_sync_single_for_device()
> > >
> >
> > That should go ok. What I'm trying to protect here is the iterations
> > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> >
> > I'm going to embed that function here in
> > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > the point easier:
> >
> > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > iova, size_t size, ...) {
> > read_lock(&token.group->as->domain);
> > while (size)
> > map = token.group->as->domain->bounce_maps[iova];
> > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> >
> > ...
> > page = token_group->as->domain->bounce_maps
> > addr = kmap_local_page(page);
> > do_bounce(map->orig_phys, addr, sz, dir);
> > kunmap_local(addr);
> > size -= sz;
> > iova += sz;
> > }
> > read_unlock(&token.group->as->domain);
> > }
>
> Right, so I meant for rwlock like semantic (let's forget the sleeping here).
>
> vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> for the "read lock" to be done.
No, it doesn't need to wait as long as the reader part uses its own copy.
> But this is not the logic that is
> implemented in this patch as there's no synchronize_rcu() in the
> vduse_set_group_asid_nomsg().
We only set the pointer on the writer's side; we do nothing like
freeing resources. Should we set the pointer before or after
synchronize_rcu()? What would we need to do on the other side of the
synchronize_rcu()?
> We need to explain why set_group_asid()
> doesn't need to wait and if this is true, we probably don't need RCU
> but to make sure the load/store is atomic.
>
What about:

* It does not matter if another thread modifies group->as as long as the
reader uses the same as for all of its operations. It performs a local
copy for that reason (see the snippet below).
* It does not matter if multiple threads modify group->as as long as
the update is atomic.

?
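In code that is what the patch already does in the readers, e.g.
(simplified from vduse_dev_sync_single_for_device() in this patch):

    rcu_read_lock();
    as = rcu_dereference(token.group->as); /* single load, local copy */
    domain = as->domain;
    rcu_read_unlock();
    /* only "domain" is used below, token.group->as is never re-read */
    vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);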
> >
> > Now, depending on the point where another execution thread changes
> > token_group->as and how the compiler has chosen to generate the
> > machine code, the outcome could be:
> > 1) The domain read lock of one ASID is taken but the domain lock of
> > another as is unlocked.
> > 2) We iterate until iova is ok for the ASID we're handling, but not
> > for the other one. So we access an invalid offset in
> > bounce_maps[iova].
> >
> > And I guess there are other possible outcomes too.
> >
> > So I need to make sure that the pointer accesses in all
> > vduse_domain_bounce is coherent.
>
> I'm not sure I got here, but it looks like it accepts a domain
> parameter and is protected by the bounce lock so we are probably fine
> here?
>
The bounce lock only protects the iotlb tree, not the pointer to that
iotlb tree.
> > I'm ok if it takes the one before the
> > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > as the lifetime of all domains are bound to the device. But it cannot
> > change in the middle of the operation:
> >
> > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > iova, size_t size, ...) {
> > as = token.group->as;
> > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > read_lock(&as->domain);
> > while (size)
> > map = as->domain->bounce_maps[iova];
> > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> >
> > ...
> > page = as->domain->bounce_maps
> > addr = kmap_local_page(page);
> > do_bounce(map->orig_phys, addr, sz, dir);
> > kunmap_local(addr);
> > size -= sz;
> > iova += sz;
> > }
> > read_unlock(&as->domain);
> > }
> >
> > That can be done in many ways. Probably the read_lock is already
> > enough but it is not explicit that it is protecting token.group->as,
> > and future changes could remove it. To me, RCU is the most clear way
> > to do it, but even a volatile read (READ_ONCE?) would do.
>
> I wonder if another group rwlock is sufficient here:
>
> for set_group_as_id()
>
> write_lock(&dev->groups[group].lock);
> dev->groups[group].as = &dev->as[asid];
> write_unlock(&dev->groups[group].lock);
>
> for the case where we need defer as
>
> read_lock(&dev->groups[group].lock);
> as = dev->groups[group].as;
> //using as
> read_unlock(&dev->groups[group].lock);
>
> If this works, we don't need to bother with thinking if the
> wait/synchronizre_rcu() is really needed or not?
>
A rwlock is sufficient, but we need to modify the allocation code
somehow. Also, I thought we wanted to avoid the overhead of taking the
read lock in the DMA ops too.
Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
group ASID change needs to wait for in-flight DMA operations to finish
instead of just applying to the next DMA ops. Not that a vq group ASID
change would be in the hot path anyway, just pointing it out.
> >
> > > If this is not an issue, RCU is not a must, but please explain why.
> > > If this is an issue, we need to fix it.
> > >
> > > It's basically a question that
> > >
> > > 1) should we need to wait for the DMA to be completed before assigning
> > > to the new as
> >
> > I don't think so, it is valid to assign a new as and let the ongoing
> > operation to continue. It is racy and the operation could fail, but
> > the kernel just returns an error and doesn't access invalid memory or
> > similar.
>
> See below.
>
> >
> > > 2) should we track the set_group_asid() for the group that has pending
> > > DMA to avoid potential issue
> > >
> >
> > No, the group will outlive the operation as it is bound to the device.
>
> I meant e.g the DMA could be triggered by the device. For example, the
> device may try to trigger an interrupt when the kernel is trying to
> assign a new asid. So I wonder if guest can use this to poke Qemu's
> memory etc.
I'm not sure I get this point. If QEMU changes the ASID of a vq
group exposed to the guest, the race does not matter anymore: it is
explicitly opening the possibility for the guest to poke QEMU's
memory, unless the guest is totally paused.
> But if you mean we depend on the IOTLB to guard against
> this, I'm fine, but let's document why we don't need it and how the
> IOTLB layer can help to eliminate such risk.
>
No, I'm not happy about letting the iotlb lock protect this too, as
they're at different levels actually: one protects iotlb tree
modifications while they're being read, and the other protects the
ASID assignment of the different vq groups. Reusing them means a
modification in any tree blocks the change of a vq group's ASID, for
example.
> Anyhow, tracking and failing seems to be more robust.
I'm not sure I get this. If a DMA read starts in one ASID and then
QEMU changes the ASID of the vq group, do you prefer it to fail rather
than continue reading from the original ASID? It seems hard to
communicate that the ASID has changed to the DMA operation callback.
> For example,
> kernel swiotlb has a layer to track whether the address that is being
> unmapped is mapped before. In our case we should make sure when race
> happens, isolation via ASID won't be broken.
>
Right, but it cannot protect us if the iotlb entry is valid in the second ASID
but just points to a different physical / virtual page.
On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > >
> > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > >
> > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > >
> > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > >
> > > > >
> > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > modify the commit message.
> > > > >
> > > > > > More below.
> > > > > >
> > > > > > >
> > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > > I forgot to remove this, my bad!
> > > > >
> > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > [...]
> > > > > > >
> > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > @@ -11,6 +11,7 @@
> > > > > > > #include "linux/virtio_net.h"
> > > > > > > #include <linux/init.h>
> > > > > > > #include <linux/module.h>
> > > > > > > +#include <linux/rcupdate.h>
> > > > > > > #include <linux/cdev.h>
> > > > > > > #include <linux/device.h>
> > > > > > > #include <linux/eventfd.h>
> > > > > > > @@ -41,6 +42,7 @@
> > > > > > >
> > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > struct mm_struct *mm;
> > > > > > > };
> > > > > > >
> > > > > > > +struct vduse_as {
> > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > + struct vduse_umem *umem;
> > > > > > > + struct mutex mem_lock;
> > > > > > > +};
> > > > > > > +
> > > > > > > struct vduse_vq_group {
> > > > > > > + struct vduse_as *as __rcu;
> > > > > > > struct vduse_dev *dev;
> > > > > > > };
> > > > > > >
> > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > struct vduse_vdpa *vdev;
> > > > > > > struct device *dev;
> > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > + struct vduse_as *as;
> > > > > > > char *name;
> > > > > > > struct mutex lock;
> > > > > > > spinlock_t msg_lock;
> > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > u32 vq_num;
> > > > > > > u32 vq_align;
> > > > > > > u32 ngroups;
> > > > > > > - struct vduse_umem *umem;
> > > > > > > + u32 nas;
> > > > > > > struct vduse_vq_group *groups;
> > > > > > > - struct mutex mem_lock;
> > > > > > > unsigned int bounce_size;
> > > > > > > struct mutex domain_lock;
> > > > > > > };
> > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > }
> > > > > > >
> > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > u64 start, u64 last)
> > > > > > > {
> > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > return -EINVAL;
> > > > > > >
> > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > - msg.req.iova.start = start;
> > > > > > > - msg.req.iova.last = last;
> > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > + msg.req.iova.start = start;
> > > > > > > + msg.req.iova.last = last;
> > > > > > > + } else {
> > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > + }
> > > > > > >
> > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > }
> > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > return mask;
> > > > > > > }
> > > > > > >
> > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > +{
> > > > > > > + /*
> > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > + */
> > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > >
> > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > >
> > > > >
> > > > > What's the use? The only thing left here is to return from
> > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > here, do we?
> > > >
> > > > See below.
> > > >
> > > > >
> > > > > > > +}
> > > > > > > +
> > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > {
> > > > > > > int i;
> > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > >
> > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > - if (domain && domain->bounce_map)
> > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > +
> > > > > > > + if (domain && domain->bounce_map)
> > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > + }
> > > > > > > +
> > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > >
> > > > > > > down_write(&dev->rwsem);
> > > > > > >
> > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > return ret;
> > > > > > > }
> > > > > > >
> > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > + unsigned int asid)
> > > > > > > +{
> > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > + int r;
> > > > > > > +
> > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > + return -EINVAL;
> > > > > > > +
> > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > +
> > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > + if (r < 0)
> > > > > > > + return r;
> > > > > > > +
> > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > struct vdpa_vq_state *state)
> > > > > > > {
> > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > int ret;
> > > > > > >
> > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > if (ret)
> > > > > > > return ret;
> > > > > > >
> > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > if (ret) {
> > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > return ret;
> > > > > > > }
> > > > > > >
> > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > .free = vduse_vdpa_free,
> > > > > > > };
> > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > enum dma_data_direction dir)
> > > > > > > {
> > > > > > > struct vduse_dev *vdev;
> > > > > > > + struct vduse_as *as;
> > > > > > > struct vduse_iova_domain *domain;
> > > > > > >
> > > > > > > if (!token.group)
> > > > > > > return;
> > > > > > >
> > > > > > > vdev = token.group->dev;
> > > > > > > - domain = vdev->domain;
> > > > > > > -
> > > > > > > + rcu_read_lock();
> > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > + domain = as->domain;
> > > > > > > + rcu_read_unlock();
> > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > >
> > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > works.
> > > > > >
> > > > >
> > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > full lock, but just making sure that all accesses from the reader are
> > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > >
> > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > handled correctly.
> > > >
> > > > But I meant what happens if
> > > >
> > > > [cpu0]rcu_read_lock()
> > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > [cpu0]...
> > > > [cpu0]rcu_read_unlock()
> > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > [cpu0]vduse_domain_sync_single_for_device()
> > > >
> > >
> > > That should go ok. What I'm trying to protect here is the iterations
> > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > >
> > > I'm going to embed that function here in
> > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > the point easier:
> > >
> > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > iova, size_t size, ...) {
> > > read_lock(&token.group->as->domain);
> > > while (size)
> > > map = token.group->as->domain->bounce_maps[iova];
> > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > >
> > > ...
> > > page = token_group->as->domain->bounce_maps
> > > addr = kmap_local_page(page);
> > > do_bounce(map->orig_phys, addr, sz, dir);
> > > kunmap_local(addr);
> > > size -= sz;
> > > iova += sz;
> > > }
> > > read_unlock(&token.group->as->domain);
> > > }
> >
> > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> >
> > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > for the "read lock" to be done.
>
> No, it doesn't need to wait as long as the reader part uses its own copy.
It probably won't crash, but I meant whether we have logic issues. For
example, once set_group_asid() returns, there could still be a pending
DMA that is using the old as.
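If we want that guarantee, it would be something like (sketch):

    rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
    /* wait for map ops that already dereferenced the old as */
    synchronize_rcu();

together with keeping rcu_read_unlock() after the last use of the
domain on the reader side.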
>
> > But this is not the logic that is
> > implemented in this patch as there's no synchronize_rcu() in the
> > vduse_set_group_asid_nomsg().
>
> We only set the pointer on the writer's side, we do nothing like
> freeing resources. Should we set the pointer before or after
> syncrhonize_rcu()? What do we need to do on the other side of
> syncrhonize_rcu()?
Usually we don't need special care on the read side. But as discussed,
synchronize_rcu() is not a must here, yet we need to explain why it is
safe, and I'm not sure Michael is fine with that.
If we just want to make sure of the ordering of the publish and the
read, we can switch to smp_store_release() and smp_load_acquire().
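I.e. something like (sketch):

    /* writer */
    smp_store_release(&dev->groups[group].as, &dev->as[asid]);

    /* reader */
    as = smp_load_acquire(&token.group->as);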
>
> > We need to explain why set_group_asid()
> > doesn't need to wait and if this is true, we probably don't need RCU
> > but to make sure the load/store is atomic.
> >
>
> What about:
>
> * It does not matter if other thread modify group->as as long as the
> reader uses the same as for all its operation. It performs a local
> copy for that reason.
> * It does not matter if multiple threads modify group->as as long as
> the update is atomic.
See above reply.
>
> ?
>
> > >
> > > Now, depending on the point where another execution thread changes
> > > token_group->as and how the compiler has chosen to generate the
> > > machine code, the outcome could be:
> > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > another as is unlocked.
> > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > for the other one. So we access an invalid offset in
> > > bounce_maps[iova].
> > >
> > > And I guess there are other possible outcomes too.
> > >
> > > So I need to make sure that the pointer accesses in all
> > > vduse_domain_bounce is coherent.
> >
> > I'm not sure I got here, but it looks like it accepts a domain
> > parameter and is protected by the bounce lock so we are probably fine
> > here?
> >
>
> The bounce lock only protects the iotlb tree, not the pointer to that
> iotlb tree.
>
> > > I'm ok if it takes the one before the
> > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > as the lifetime of all domains are bound to the device. But it cannot
> > > change in the middle of the operation:
> > >
> > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > iova, size_t size, ...) {
> > > as = token.group->as;
> > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > read_lock(&as->domain);
> > > while (size)
> > > map = as->domain->bounce_maps[iova];
> > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > >
> > > ...
> > > page = as->domain->bounce_maps
> > > addr = kmap_local_page(page);
> > > do_bounce(map->orig_phys, addr, sz, dir);
> > > kunmap_local(addr);
> > > size -= sz;
> > > iova += sz;
> > > }
> > > read_unlock(&as->domain);
> > > }
> > >
> > > That can be done in many ways. Probably the read_lock is already
> > > enough but it is not explicit that it is protecting token.group->as,
> > > and future changes could remove it. To me, RCU is the most clear way
> > > to do it, but even a volatile read (READ_ONCE?) would do.
> >
> > I wonder if another group rwlock is sufficient here:
> >
> > for set_group_as_id()
> >
> > write_lock(&dev->groups[group].lock);
> > dev->groups[group].as = &dev->as[asid];
> > write_unlock(&dev->groups[group].lock);
> >
> > for the case where we need defer as
> >
> > read_lock(&dev->groups[group].lock);
> > as = dev->groups[group].as;
> > //using as
> > read_unlock(&dev->groups[group].lock);
> >
> > If this works, we don't need to bother with thinking if the
> > wait/synchronizre_rcu() is really needed or not?
> >
>
> A rwlock is sufficient but we need to modify the allocation code
> somehow. Also, I thought we wanted to avoid the overhead of taking the
> read lock in the DMA ops too.
Right, but it would always be a balance. We can make sure it works
correctly first, then do optimizations on top.
>
> Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> group ASID change needs to wait for the DMA operation to finish
> instead of just applying for the next DMA ops. Not like vq group ASID
> change would be in the hot path anyway, just pointing it out.
>
> > >
> > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > If this is an issue, we need to fix it.
> > > >
> > > > It's basically a question that
> > > >
> > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > to the new as
> > >
> > > I don't think so, it is valid to assign a new as and let the ongoing
> > > operation to continue. It is racy and the operation could fail, but
> > > the kernel just returns an error and doesn't access invalid memory or
> > > similar.
> >
> > See below.
> >
> > >
> > > > 2) should we track the set_group_asid() for the group that has pending
> > > > DMA to avoid potential issue
> > > >
> > >
> > > No, the group will outlive the operation as it is bound to the device.
> >
> > I meant e.g the DMA could be triggered by the device. For example, the
> > device may try to trigger an interrupt when the kernel is trying to
> > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > memory etc.
>
> I'm not sure I get this point. If QEMU changes the ASID of the vq
> group sent to the guest the race does not matter anymore: it is
> explicitly opening the possibility from the guest to poke QEMU's
> memory unless the guest is totally paused.
Basically what I meant, assuming group0.as = as0
cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
cpu1] set_group_asid(group0.as, as1)
cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
cpu0 may read as1 while it wants as0 actually?
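I.e. the unmap on cpu0 would do roughly (sketch, assuming an unmap op
symmetric to vduse_dev_map_page()):

    as = rcu_dereference(token.group->as);    /* now as1, not as0 */
    vduse_domain_unmap_page(as->domain, ...); /* wrong domain for this mapping */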
>
> > But if you mean we depend on the IOTLB to guard against
> > this, I'm fine, but let's document why we don't need it and how the
> > IOTLB layer can help to eliminate such risk.
> >
>
> No, I'm not happy about letting iotlb lock to protect this too as
> they're at different levels actually: One is protecting iotlb trees
> modifications while they're being read and the other is protecting the
> ASID assignment to different vq groups. To reuse them means a
> modification in any tree blocks the change of vq group ASID, for
> example.
>
> > Anyhow, tracking and failing seems to be more robust.
>
> I'm not sure I get this. If a DMA read starts in one ASID and then
> QEMU changes the ASID of the vq group, do you prefer it to fail rather
> than continue reading from the original ASID?
If possible, it would be better.
> It seems hard to
> communicate that the ASID has changed to the DMA operation callback.
Maybe we can encode this into iova.
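Just to illustrate the idea (made-up numbers, not a concrete layout
proposal): reserve some high bits of the IOVA for the ASID it was
mapped in,

    #define VDUSE_IOVA_ASID_SHIFT 48

    /* map side */
    iova |= (u64)asid << VDUSE_IOVA_ASID_SHIFT;

    /* unmap side */
    asid = iova >> VDUSE_IOVA_ASID_SHIFT;

so the unmap path can notice that the group's ASID changed since the
map.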
>
> > For example,
> > kernel swiotlb has a layer to track whether the address that is being
> > unmapped is mapped before. In our case we should make sure when race
> > happens, isolation via ASID won't be broken.
> >
>
> Right but it cannot protect if the iotlb is valid in the second ASID
> but just points to a different physical / virtual page.
>
Thanks
On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > >
> > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > >
> > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > >
> > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > >
> > > > > >
> > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > modify the commit message.
> > > > > >
> > > > > > > More below.
> > > > > > >
> > > > > > > >
> > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > > I forgot to remove this, my bad!
> > > > > >
> > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > [...]
> > > > > > > >
> > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > #include <linux/init.h>
> > > > > > > > #include <linux/module.h>
> > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > #include <linux/cdev.h>
> > > > > > > > #include <linux/device.h>
> > > > > > > > #include <linux/eventfd.h>
> > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > >
> > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > struct mm_struct *mm;
> > > > > > > > };
> > > > > > > >
> > > > > > > > +struct vduse_as {
> > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > + struct vduse_umem *umem;
> > > > > > > > + struct mutex mem_lock;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > struct vduse_vq_group {
> > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > struct vduse_dev *dev;
> > > > > > > > };
> > > > > > > >
> > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > struct device *dev;
> > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > + struct vduse_as *as;
> > > > > > > > char *name;
> > > > > > > > struct mutex lock;
> > > > > > > > spinlock_t msg_lock;
> > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > u32 vq_num;
> > > > > > > > u32 vq_align;
> > > > > > > > u32 ngroups;
> > > > > > > > - struct vduse_umem *umem;
> > > > > > > > + u32 nas;
> > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > - struct mutex mem_lock;
> > > > > > > > unsigned int bounce_size;
> > > > > > > > struct mutex domain_lock;
> > > > > > > > };
> > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > }
> > > > > > > >
> > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > u64 start, u64 last)
> > > > > > > > {
> > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > return -EINVAL;
> > > > > > > >
> > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > - msg.req.iova.start = start;
> > > > > > > > - msg.req.iova.last = last;
> > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > + msg.req.iova.start = start;
> > > > > > > > + msg.req.iova.last = last;
> > > > > > > > + } else {
> > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > + }
> > > > > > > >
> > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > }
> > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > return mask;
> > > > > > > > }
> > > > > > > >
> > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > +{
> > > > > > > > + /*
> > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > + */
> > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > >
> > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > >
> > > > > >
> > > > > > What's the use? The only thing left here is to return from
> > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > here, do we?
> > > > >
> > > > > See below.
> > > > >
> > > > > >
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > {
> > > > > > > > int i;
> > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > >
> > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > +
> > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > >
> > > > > > > > down_write(&dev->rwsem);
> > > > > > > >
> > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > return ret;
> > > > > > > > }
> > > > > > > >
> > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > + unsigned int asid)
> > > > > > > > +{
> > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > + int r;
> > > > > > > > +
> > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > + return -EINVAL;
> > > > > > > > +
> > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > +
> > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > + if (r < 0)
> > > > > > > > + return r;
> > > > > > > > +
> > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > + return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > {
> > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > int ret;
> > > > > > > >
> > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > if (ret)
> > > > > > > > return ret;
> > > > > > > >
> > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > if (ret) {
> > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > return ret;
> > > > > > > > }
> > > > > > > >
> > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > };
> > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > enum dma_data_direction dir)
> > > > > > > > {
> > > > > > > > struct vduse_dev *vdev;
> > > > > > > > + struct vduse_as *as;
> > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > >
> > > > > > > > if (!token.group)
> > > > > > > > return;
> > > > > > > >
> > > > > > > > vdev = token.group->dev;
> > > > > > > > - domain = vdev->domain;
> > > > > > > > -
> > > > > > > > + rcu_read_lock();
> > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > + domain = as->domain;
> > > > > > > > + rcu_read_unlock();
> > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > >
> > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > works.
> > > > > > >
> > > > > >
> > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > >
> > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > handled correctly.
> > > > >
> > > > > But I meant what happens if
> > > > >
> > > > > [cpu0]rcu_read_lock()
> > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > [cpu0]...
> > > > > [cpu0]rcu_read_unlock()
> > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > >
> > > >
> > > > That should go ok. What I'm trying to protect here is the iterations
> > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > >
> > > > I'm going to embed that function here in
> > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > the point easier:
> > > >
> > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > iova, size_t size, ...) {
> > > > read_lock(&token.group->as->domain);
> > > > while (size)
> > > > map = token.group->as->domain->bounce_maps[iova];
> > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > >
> > > > ...
> > > > page = token_group->as->domain->bounce_maps
> > > > addr = kmap_local_page(page);
> > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > kunmap_local(addr);
> > > > size -= sz;
> > > > iova += sz;
> > > > }
> > > > read_unlock(&token.group->as->domain);
> > > > }
> > >
> > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > >
> > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > for the "read lock" to be done.
> >
> > No, it doesn't need to wait as long as the reader part uses its own copy.
>
> It probably won't crash but I meant if we have logic issues. For
> example, once set_group_asid() return, there should still be a pending
> DMA that is using the old as.
>
> >
> > > But this is not the logic that is
> > > implemented in this patch as there's no synchronize_rcu() in the
> > > vduse_set_group_asid_nomsg().
> >
> > We only set the pointer on the writer's side, we do nothing like
> > freeing resources. Should we set the pointer before or after
> > syncrhonize_rcu()? What do we need to do on the other side of
> > syncrhonize_rcu()?
>
> Usually we don't need special care on the read side. But as discussed,
> synchronize_rcu() is not a must but we need to explain why it is safe
> and I'm not sure Michael is fine with that.
> If we just want to make sure the order of publish and read, we can
> switch to use smp_store_release() and smp_load_acqurie().
>
> >
> > > We need to explain why set_group_asid()
> > > doesn't need to wait and if this is true, we probably don't need RCU
> > > but to make sure the load/store is atomic.
> > >
> >
> > What about:
> >
> > * It does not matter if other thread modify group->as as long as the
> > reader uses the same as for all its operation. It performs a local
> > copy for that reason.
> > * It does not matter if multiple threads modify group->as as long as
> > the update is atomic.
>
> See above reply.
>
> >
> > ?
> >
> > > >
> > > > Now, depending on the point where another execution thread changes
> > > > token_group->as and how the compiler has chosen to generate the
> > > > machine code, the outcome could be:
> > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > another as is unlocked.
> > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > for the other one. So we access an invalid offset in
> > > > bounce_maps[iova].
> > > >
> > > > And I guess there are other possible outcomes too.
> > > >
> > > > So I need to make sure that the pointer accesses in all
> > > > vduse_domain_bounce is coherent.
> > >
> > > I'm not sure I got here, but it looks like it accepts a domain
> > > parameter and is protected by the bounce lock so we are probably fine
> > > here?
> > >
> >
> > The bounce lock only protects the iotlb tree, not the pointer to that
> > iotlb tree.
> >
> > > > I'm ok if it takes the one before the
> > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > change in the middle of the operation:
> > > >
> > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > iova, size_t size, ...) {
> > > > as = token.group->as;
> > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > read_lock(&as->domain);
> > > > while (size)
> > > > map = as->domain->bounce_maps[iova];
> > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > >
> > > > ...
> > > > page = as->domain->bounce_maps
> > > > addr = kmap_local_page(page);
> > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > kunmap_local(addr);
> > > > size -= sz;
> > > > iova += sz;
> > > > }
> > > > read_unlock(&as->domain);
> > > > }
> > > >
> > > > That can be done in many ways. Probably the read_lock is already
> > > > enough but it is not explicit that it is protecting token.group->as,
> > > > and future changes could remove it. To me, RCU is the most clear way
> > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > >
> > > I wonder if another group rwlock is sufficient here:
> > >
> > > for set_group_as_id()
> > >
> > > write_lock(&dev->groups[group].lock);
> > > dev->groups[group].as = &dev->as[asid];
> > > write_unlock(&dev->groups[group].lock);
> > >
> > > for the case where we need defer as
> > >
> > > read_lock(&dev->groups[group].lock);
> > > as = dev->groups[group].as;
> > > //using as
> > > read_unlock(&dev->groups[group].lock);
> > >
> > > If this works, we don't need to bother with thinking if the
> > > wait/synchronizre_rcu() is really needed or not?
> > >
> >
> > A rwlock is sufficient but we need to modify the allocation code
> > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > read lock in the DMA ops too.
>
> Right, but it would always be a balance. We can make sure it works
> correctly first then do optimization on top.
>
> >
> > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > group ASID change needs to wait for the DMA operation to finish
> > instead of just applying for the next DMA ops. Not like vq group ASID
> > change would be in the hot path anyway, just pointing it out.
> >
> > > >
> > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > If this is an issue, we need to fix it.
> > > > >
> > > > > It's basically a question that
> > > > >
> > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > to the new as
> > > >
> > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > operation to continue. It is racy and the operation could fail, but
> > > > the kernel just returns an error and doesn't access invalid memory or
> > > > similar.
> > >
> > > See below.
> > >
> > > >
> > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > DMA to avoid potential issue
> > > > >
> > > >
> > > > No, the group will outlive the operation as it is bound to the device.
> > >
> > > I meant e.g the DMA could be triggered by the device. For example, the
> > > device may try to trigger an interrupt when the kernel is trying to
> > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > memory etc.
> >
> > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > group sent to the guest the race does not matter anymore: it is
> > explicitly opening the possibility from the guest to poke QEMU's
> > memory unless the guest is totally paused.
>
> Basically what I meant, assuming group0.as = as0
>
> cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> cpu1] set_group_asid(group0.as, as1)
> cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
>
> cpu0 may read as1 while it wants as0 actually?
>
Yes, kind of. That's my point: adding synchronization at the vduse level
does not fix it.

There is no way to do that call from vhost/vdpa or userland, as there
is no way to get the AS of a vq group, only to set it. The closest
thing is to add a cache at that level, but that implies adding
multithreading sync in that upper layer, either vhost/vdpa or userland,
not in VDUSE.

From the vhost/vdpa level, all mapping calls (.set_map, .dma_map,
.dma_unmap) take the ASID directly, not the vq group. So the
call to set_group_asid does not need to access the vq group.

Now let's say that we add a vdpa_ops callback (and ioctls) that
maps and unmaps based on a vq_group, and all of the operations
(dma_map, set_group_asid, and dma_unmap) are serialized by taking the
same mutex. cpu0 may still dma_unmap over as0 if set_group_asid is not
properly serialized at the vhost/vdpa or userland level:
void *thread0_func(void *arg) {
    struct {
        int vq_group;
        int iova, size, perm; /* ... */
    } s = { .vq_group = 0 };
    int fd = (int)(intptr_t)arg;

    ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);
    // TODO: Signal thread1 that it can proceed with SET_GROUP_ASID
    // TODO: Wait until thread1 completes SET_GROUP_ASID

    ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &s);

    return NULL;
}

void *thread1_func(void *arg) {
    struct vhost_vring_state s = {
        .index = 0,
        .num = 1,
    };
    int fd = (int)(intptr_t)arg;

    // TODO: Wait until thread0 calls dma_map
    ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s);
    // TODO: Signal thread0 that it can proceed with dma_unmap

    return NULL;
}

int main() {
    pthread_t thread0, thread1;
    int fd = open("/dev/vhost-vdpa-0", ...);

    pthread_create(&thread0, NULL, thread0_func, (void *)(intptr_t)fd);
    pthread_create(&thread1, NULL, thread1_func, (void *)(intptr_t)fd);

    pthread_join(thread0, NULL);
    pthread_join(thread1, NULL);

    return EXIT_SUCCESS;
}
---
We need something to synchronize at the userland level here, filling the TODOs.

We can replace the hypothetical VHOST_VDPA_VQ_GROUP_DMA_MAP and _UNMAP
with accesses to vqs, and the result is the same: the userland
application is the one that needs to serialize the accesses from thread0
if it wants a predictable outcome against the accesses from thread1.
There is no way to do it at the vduse level.

We need some synchronization to avoid a malicious or buggy userland
messing things up, that's for sure, so DMA_MAP and DMA_UNMAP do not
half-update the iotlb tree. And I'll send the next version with the rwlock,
protecting as much as possible. But I want to make clear that it will
not avoid the race you describe here.
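
Just to make the direction concrete, here is a minimal sketch of what
I mean by the rwlock version, following the per-group lock you
suggested above. The "lock" field of struct vduse_vq_group is an
assumption, not the final patch:

/*
 * Sketch only, not the final patch: a per-group rwlock ("lock" is an
 * assumed new field of struct vduse_vq_group) guards the group->as
 * pointer, following the scheme discussed above.
 */
static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
                                       unsigned int group, unsigned int asid)
{
        write_lock(&dev->groups[group].lock);
        dev->groups[group].as = &dev->as[asid];
        write_unlock(&dev->groups[group].lock);
}

static void vduse_dev_sync_single_for_device(union virtio_map token,
                                             dma_addr_t dma_addr, size_t size,
                                             enum dma_data_direction dir)
{
        struct vduse_vq_group *group = token.group;
        struct vduse_iova_domain *domain;

        if (!group)
                return;

        read_lock(&group->lock);
        /* Local copy: the whole bounce uses one consistent domain */
        domain = group->as->domain;
        vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
        read_unlock(&group->lock);
}

Since rwlock_t does not sleep, this should still be usable from the
atomic contexts that motivated the RCU conversion in v9.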
> >
> > > But if you mean we depend on the IOTLB to guard against
> > > this, I'm fine, but let's document why we don't need it and how the
> > > IOTLB layer can help to eliminate such risk.
> > >
> >
> > No, I'm not happy about letting iotlb lock to protect this too as
> > they're at different levels actually: One is protecting iotlb trees
> > modifications while they're being read and the other is protecting the
> > ASID assignment to different vq groups. To reuse them means a
> > modification in any tree blocks the change of vq group ASID, for
> > example.
> >
> > > Anyhow, tracking and failing seems to be more robust.
> >
> > I'm not sure I get this. If a DMA read starts in one ASID and then
> > QEMU changes the ASID of the vq group, do you prefer it to fail rather
> > than continue reading from the original ASID?
>
> If possible, it would be better.
>
It's hard for me to think of a way to do that which does not add a lot of
overhead or complexity. But it is good to know this is
acceptable.
> > It seems hard to
> > communicate that the ASID has changed to the DMA operation callback.
>
> Maybe we can encode this into iova.
>
I'm adding the full rwlock, but can you expand on this idea?
It would be great to have it documented in case we need future
optimizations.
On Wed, Dec 3, 2025 at 3:58 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
>
> On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > > <eperezma@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > > >
> > > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > > >
> > > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > > >
> > > > > > >
> > > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > > modify the commit message.
> > > > > > >
> > > > > > > > More below.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > > I forgot to remove this, my bad!
> > > > > > >
> > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > > ---
> > > > > > > > > v9:
> > > > > > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > > > > > context.
> > > > > > > > >
> > > > > > > > > v8:
> > > > > > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > > > > > justify it.
> > > > > > > > >
> > > > > > > > > v7:
> > > > > > > > > * Take write lock in the error path (Jason).
> > > > > > > > >
> > > > > > > > > v6:
> > > > > > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > > > > > (MST).
> > > > > > > > > * Fix struct name not matching in the doc.
> > > > > > > > >
> > > > > > > > > v5:
> > > > > > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > > > > > ioctl (Jason).
> > > > > > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > > > > > * Exclude "padding" member from the only >V1 members in
> > > > > > > > > vduse_dev_request.
> > > > > > > > >
> > > > > > > > > v4:
> > > > > > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > > > > > return; } (Jason)
> > > > > > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > > > > > >
> > > > > > > > > v3:
> > > > > > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > > > > > (Jason).
> > > > > > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > > > > > (Jason)
> > > > > > > > >
> > > > > > > > > v2:
> > > > > > > > > * Convert the use of mutex to rwlock.
> > > > > > > > >
> > > > > > > > > RFC v3:
> > > > > > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > > > > > VDUSE_IOTLB_GET_INFO.
> > > > > > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > > > > > ASIDs.
> > > > > > > > >
> > > > > > > > > RFC v2:
> > > > > > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > > > > > part of the struct is the same.
> > > > > > > > > ---
> > > > > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > > #include <linux/init.h>
> > > > > > > > > #include <linux/module.h>
> > > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > > #include <linux/cdev.h>
> > > > > > > > > #include <linux/device.h>
> > > > > > > > > #include <linux/eventfd.h>
> > > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > > >
> > > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > > struct mm_struct *mm;
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > +struct vduse_as {
> > > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > > + struct vduse_umem *umem;
> > > > > > > > > + struct mutex mem_lock;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > struct vduse_vq_group {
> > > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > > struct vduse_dev *dev;
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > > struct device *dev;
> > > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > > + struct vduse_as *as;
> > > > > > > > > char *name;
> > > > > > > > > struct mutex lock;
> > > > > > > > > spinlock_t msg_lock;
> > > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > > u32 vq_num;
> > > > > > > > > u32 vq_align;
> > > > > > > > > u32 ngroups;
> > > > > > > > > - struct vduse_umem *umem;
> > > > > > > > > + u32 nas;
> > > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > > - struct mutex mem_lock;
> > > > > > > > > unsigned int bounce_size;
> > > > > > > > > struct mutex domain_lock;
> > > > > > > > > };
> > > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > > u64 start, u64 last)
> > > > > > > > > {
> > > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > return -EINVAL;
> > > > > > > > >
> > > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > > - msg.req.iova.start = start;
> > > > > > > > > - msg.req.iova.last = last;
> > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > > + msg.req.iova.start = start;
> > > > > > > > > + msg.req.iova.last = last;
> > > > > > > > > + } else {
> > > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > > + }
> > > > > > > > >
> > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > }
> > > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > > return mask;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > > +{
> > > > > > > > > + /*
> > > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > > + */
> > > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > > >
> > > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > > >
> > > > > > >
> > > > > > > What's the use? The only thing left here is to return from
> > > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > > here, do we?
> > > > > >
> > > > > > See below.
> > > > > >
> > > > > > >
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > > {
> > > > > > > > > int i;
> > > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > > >
> > > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > > +
> > > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > > >
> > > > > > > > > down_write(&dev->rwsem);
> > > > > > > > >
> > > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > > return ret;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > > + unsigned int asid)
> > > > > > > > > +{
> > > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > > + int r;
> > > > > > > > > +
> > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > > + return -EINVAL;
> > > > > > > > > +
> > > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > > +
> > > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > + if (r < 0)
> > > > > > > > > + return r;
> > > > > > > > > +
> > > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > > + return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > > {
> > > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > int ret;
> > > > > > > > >
> > > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > > if (ret)
> > > > > > > > > return ret;
> > > > > > > > >
> > > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > > if (ret) {
> > > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > > return ret;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > > };
> > > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > > enum dma_data_direction dir)
> > > > > > > > > {
> > > > > > > > > struct vduse_dev *vdev;
> > > > > > > > > + struct vduse_as *as;
> > > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > > >
> > > > > > > > > if (!token.group)
> > > > > > > > > return;
> > > > > > > > >
> > > > > > > > > vdev = token.group->dev;
> > > > > > > > > - domain = vdev->domain;
> > > > > > > > > -
> > > > > > > > > + rcu_read_lock();
> > > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > > + domain = as->domain;
> > > > > > > > > + rcu_read_unlock();
> > > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > > >
> > > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > > works.
> > > > > > > >
> > > > > > >
> > > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > > >
> > > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > > handled correctly.
> > > > > >
> > > > > > But I meant what happens if
> > > > > >
> > > > > > [cpu0]rcu_read_lock()
> > > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > > [cpu0]...
> > > > > > [cpu0]rcu_read_unlock()
> > > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > > >
> > > > >
> > > > > That should go ok. What I'm trying to protect here is the iterations
> > > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > > >
> > > > > I'm going to embed that function here in
> > > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > > the point easier:
> > > > >
> > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > iova, size_t size, ...) {
> > > > > read_lock(&token.group->as->domain);
> > > > > while (size)
> > > > > map = token.group->as->domain->bounce_maps[iova];
> > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > >
> > > > > ...
> > > > > page = token_group->as->domain->bounce_maps
> > > > > addr = kmap_local_page(page);
> > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > kunmap_local(addr);
> > > > > size -= sz;
> > > > > iova += sz;
> > > > > }
> > > > > read_unlock(&token.group->as->domain);
> > > > > }
> > > >
> > > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > > >
> > > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > > for the "read lock" to be done.
> > >
> > > No, it doesn't need to wait as long as the reader part uses its own copy.
> >
> > It probably won't crash but I meant if we have logic issues. For
> > example, once set_group_asid() return, there should still be a pending
> > DMA that is using the old as.
> >
> > >
> > > > But this is not the logic that is
> > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > vduse_set_group_asid_nomsg().
> > >
> > > We only set the pointer on the writer's side, we do nothing like
> > > freeing resources. Should we set the pointer before or after
> > > syncrhonize_rcu()? What do we need to do on the other side of
> > > syncrhonize_rcu()?
> >
> > Usually we don't need special care on the read side. But as discussed,
> > synchronize_rcu() is not a must but we need to explain why it is safe
> > and I'm not sure Michael is fine with that.
> > If we just want to make sure the order of publish and read, we can
> > switch to use smp_store_release() and smp_load_acqurie().
> >
> > >
> > > > We need to explain why set_group_asid()
> > > > doesn't need to wait and if this is true, we probably don't need RCU
> > > > but to make sure the load/store is atomic.
> > > >
> > >
> > > What about:
> > >
> > > * It does not matter if other thread modify group->as as long as the
> > > reader uses the same as for all its operation. It performs a local
> > > copy for that reason.
> > > * It does not matter if multiple threads modify group->as as long as
> > > the update is atomic.
> >
> > See above reply.
> >
> > >
> > > ?
> > >
> > > > >
> > > > > Now, depending on the point where another execution thread changes
> > > > > token_group->as and how the compiler has chosen to generate the
> > > > > machine code, the outcome could be:
> > > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > > another as is unlocked.
> > > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > > for the other one. So we access an invalid offset in
> > > > > bounce_maps[iova].
> > > > >
> > > > > And I guess there are other possible outcomes too.
> > > > >
> > > > > So I need to make sure that the pointer accesses in all
> > > > > vduse_domain_bounce is coherent.
> > > >
> > > > I'm not sure I got here, but it looks like it accepts a domain
> > > > parameter and is protected by the bounce lock so we are probably fine
> > > > here?
> > > >
> > >
> > > The bounce lock only protects the iotlb tree, not the pointer to that
> > > iotlb tree.
> > >
> > > > > I'm ok if it takes the one before the
> > > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > > change in the middle of the operation:
> > > > >
> > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > iova, size_t size, ...) {
> > > > > as = token.group->as;
> > > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > > read_lock(&as->domain);
> > > > > while (size)
> > > > > map = as->domain->bounce_maps[iova];
> > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > >
> > > > > ...
> > > > > page = as->domain->bounce_maps
> > > > > addr = kmap_local_page(page);
> > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > kunmap_local(addr);
> > > > > size -= sz;
> > > > > iova += sz;
> > > > > }
> > > > > read_unlock(&as->domain);
> > > > > }
> > > > >
> > > > > That can be done in many ways. Probably the read_lock is already
> > > > > enough but it is not explicit that it is protecting token.group->as,
> > > > > and future changes could remove it. To me, RCU is the most clear way
> > > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > > >
> > > > I wonder if another group rwlock is sufficient here:
> > > >
> > > > for set_group_as_id()
> > > >
> > > > write_lock(&dev->groups[group].lock);
> > > > dev->groups[group].as = &dev->as[asid];
> > > > write_unlock(&dev->groups[group].lock);
> > > >
> > > > for the case where we need defer as
> > > >
> > > > read_lock(&dev->groups[group].lock);
> > > > as = dev->groups[group].as;
> > > > //using as
> > > > read_unlock(&dev->groups[group].lock);
> > > >
> > > > If this works, we don't need to bother with thinking if the
> > > > wait/synchronizre_rcu() is really needed or not?
> > > >
> > >
> > > A rwlock is sufficient but we need to modify the allocation code
> > > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > > read lock in the DMA ops too.
> >
> > Right, but it would always be a balance. We can make sure it works
> > correctly first then do optimization on top.
> >
> > >
> > > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > > group ASID change needs to wait for the DMA operation to finish
> > > instead of just applying for the next DMA ops. Not like vq group ASID
> > > change would be in the hot path anyway, just pointing it out.
> > >
> > > > >
> > > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > > If this is an issue, we need to fix it.
> > > > > >
> > > > > > It's basically a question that
> > > > > >
> > > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > > to the new as
> > > > >
> > > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > > operation to continue. It is racy and the operation could fail, but
> > > > > the kernel just returns an error and doesn't access invalid memory or
> > > > > similar.
> > > >
> > > > See below.
> > > >
> > > > >
> > > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > > DMA to avoid potential issue
> > > > > >
> > > > >
> > > > > No, the group will outlive the operation as it is bound to the device.
> > > >
> > > > I meant e.g the DMA could be triggered by the device. For example, the
> > > > device may try to trigger an interrupt when the kernel is trying to
> > > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > > memory etc.
> > >
> > > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > > group sent to the guest the race does not matter anymore: it is
> > > explicitly opening the possibility from the guest to poke QEMU's
> > > memory unless the guest is totally paused.
> >
> > Basically what I meant, assuming group0.as = as0
> >
> > cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> > cpu1] set_group_asid(group0.as, as1)
> > cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
> >
> > cpu0 may read as1 while it wants as0 actually?
> >
>
> Yes, kind of. That's my point: adding synchronization at vduse level
> does not fix it.
>
> There is no way to do that call from vhost/vdpa or userland, as there
> is no way to get the AS of a vq group, only to set it. The closest
> thing is to add a cache at that level, but that implies to add
> mutithreading sync on that upper layer, either vhost/vdpa or userland,
> not in VDUSE.
Probably.
>
> From vhost/vdpa level, all mapping calls (.set_map, .dma_map,
> .dma_unmap) calls take the ASID directly, not the vq group. So the
> call to set_group_asid does not need to access the vq group.
>
> Now let's say that we add that vdpa_ops callback (and ioctls) that
> maps and unmap based on a vq_group. And all of the operations
> (dma_map, set_group_asid, and dma_unmap) are serialized by taking the
> same mutex. cpu0 still may dma_unmap over as0 if set_group_asid is not
> properly serialized at vhost/vdpa or userland level:
>
> void* thread0_func(void* arg) {
> struct {
> int vq_group = 0,
> int iova, size, perm, ...
> } s;
> int fd = (intptr_t)arg;
>
> ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);
> // TODO: Signal thread0 that it can proceed with SET_GROUP_ASID
> // TODO: Wait until thread0 complete SET_GROUP_ASID
>
> ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &data);
>
> return NULL;
> }
>
> void* thread1_func(void* arg) {
> struct vhost_vring_state s = {
> .index = 0,
> .num = 1,
> };
> int fd = (int)(intptr_t)arg;
>
> // TODO: Wait until thread2 calls dma_map
Isn't this exactly something an rwlock or synchronize_rcu() can do? Or is
it the responsibility of the vDPA parent to do that?
If it's the responsibility of the parent, it would be much harder for
VDUSE as the datapath is implemented in userspace via mmap() or
umem_reg.
Looking at the existing implementations:
- for mlx5e, it looks like it assumes set_group_asid() works only
without DRIVER_OK.
- for the simulator, it looks like it can synchronize with the
datapath with the spinlock, as the datapath is emulated.
> ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s)
> // TODO: Signal thread2 that can proceed with dma_unmap
But the issue I mention is that, from the view of the vDPA bus:
1) it offers set_group_asid()
2) it doesn't know if virtio-vdpa or vhost-vdpa is used
So theoretically, set_group_asid() could happen between
dma_addr = dma_map();
and
dma_unmap(dma_addr);
But those two dma_addr would refer to different address spaces. Instead
of trying to do synchronization, maybe we can simply fail
set_group_asid() if DRIVER_OK is set.
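
Something like this minimal sketch, assuming dev->status caches the
last status written by the driver (untested, just to show the check):

/*
 * Sketch only: reject ASID changes after DRIVER_OK, as suggested above.
 * Assumes dev->status mirrors the last status written by the driver.
 */
static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
                                unsigned int asid)
{
        struct vduse_dev *dev = vdpa_to_vduse(vdpa);

        if (dev->api_version < VDUSE_API_VERSION_1 ||
            group >= dev->ngroups || asid >= dev->nas)
                return -EINVAL;

        if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
                return -EBUSY;

        /* ... send VDUSE_SET_VQ_GROUP_ASID and update the group as before ... */
        return 0;
}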
>
> return NULL;
> }
>
> int main() {
> pthread_t thread0, thread1;
> int fd = open("/dev/vhost-vdpa-0", ...)
>
> pthread_create(&thread0, NULL, thread0_func, (void *)(intptr_t)fd);
> pthread_create(&thread1, NULL, thread1_func, (void *)(intptr_t)fd);
>
> pthread_join(thread1, NULL);
> pthread_join(thread2, NULL);
>
> return EXIT_SUCCESS;
> }
> ---
>
> We need something to synchronize at userland level here, filling the TODOs.
>
> We can replace the hypothetical VHOST_VDPA_VQ_GROUP_DMA_MAP and _UNMAP
> with access to vqs, and the result is the same: The userland
> application is the one that needs to serialize the access from thread0
> if it wants predictive outcome against the accesses from thread1.
> There is no way to do it at vduse level.
>
> We need some syncrhonization to avoid malicious or buggy userland to
> mess things, that's for sure. So DMA_MAP and DMA_UNMAP does not half
> update the iotlb tree. And I'll send the next version with rwlock,
> protecting as much as possible. But I want to make clear that it will
> not avoid the race you describe here.
See above.
>
> > >
> > > > But if you mean we depend on the IOTLB to guard against
> > > > this, I'm fine, but let's document why we don't need it and how the
> > > > IOTLB layer can help to eliminate such risk.
> > > >
> > >
> > > No, I'm not happy about letting iotlb lock to protect this too as
> > > they're at different levels actually: One is protecting iotlb trees
> > > modifications while they're being read and the other is protecting the
> > > ASID assignment to different vq groups. To reuse them means a
> > > modification in any tree blocks the change of vq group ASID, for
> > > example.
> > >
> > > > Anyhow, tracking and failing seems to be more robust.
> > >
> > > I'm not sure I get this. If a DMA read starts in one ASID and then
> > > QEMU changes the ASID of the vq group, do you prefer it to fail rather
> > > than continue reading from the original ASID?
> >
> > If possible, it would be better.
> >
>
> It's hard for me to think in some way that does not add a lot of
> overhead or it is very complex. But it is good to know this is
> acceptable.
>
> > > It seems hard to
> > > communicate that the ASID has changed to the DMA operation callback.
> >
> > Maybe we can encode this into iova.
> >
>
> I'm adding the full rwlock, but can you expand on your idea on this?
Encode the asid into the upper bits of the IOVA, so when doing dma_unmap()
we can compare the group/as with the one that is encoded in dma_addr.
If it differs, warn or bug.
If we can make sure there's no set_group_asid() when DRIVER_OK is set,
it's not a must then (or it could be treated as a kind of hardening).
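
A rough sketch of the idea; the 48/16 bit split and the helper names
are only assumptions for illustration:

/*
 * Sketch only: reserve the top bits of the IOVA for the ASID so that
 * unmap can verify it still targets the same address space.
 */
#define VDUSE_IOVA_ASID_SHIFT   48
#define VDUSE_IOVA_ADDR_MASK    ((1ULL << VDUSE_IOVA_ASID_SHIFT) - 1)

static inline dma_addr_t vduse_iova_encode(u32 asid, dma_addr_t iova)
{
        return ((dma_addr_t)asid << VDUSE_IOVA_ASID_SHIFT) |
               (iova & VDUSE_IOVA_ADDR_MASK);
}

static inline u32 vduse_iova_asid(dma_addr_t iova)
{
        return iova >> VDUSE_IOVA_ASID_SHIFT;
}

static void vduse_check_unmap_asid(u32 cur_asid, dma_addr_t dma_addr)
{
        /* Warn if the ASID encoded at map time no longer matches */
        WARN_ON_ONCE(vduse_iova_asid(dma_addr) != cur_asid);
}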
> It would be great to have it documented in case we need future
> optimizations.
>
Thanks
On Thu, Dec 4, 2025 at 3:15 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Dec 3, 2025 at 3:58 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> >
> > On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > > > >
> > > > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > > > >
> > > > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > > > modify the commit message.
> > > > > > > >
> > > > > > > > > More below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >
> > > > > > > > I forgot to remove this, my bad!
> > > > > > > >
> > > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > > v9:
> > > > > > > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > > > > > > context.
> > > > > > > > > >
> > > > > > > > > > v8:
> > > > > > > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > > > > > > justify it.
> > > > > > > > > >
> > > > > > > > > > v7:
> > > > > > > > > > * Take write lock in the error path (Jason).
> > > > > > > > > >
> > > > > > > > > > v6:
> > > > > > > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > > > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > > > > > > (MST).
> > > > > > > > > > * Fix struct name not matching in the doc.
> > > > > > > > > >
> > > > > > > > > > v5:
> > > > > > > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > > > > > > ioctl (Jason).
> > > > > > > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > > > > > > * Exclude "padding" member from the only >V1 members in
> > > > > > > > > > vduse_dev_request.
> > > > > > > > > >
> > > > > > > > > > v4:
> > > > > > > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > > > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > > > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > > > > > > return; } (Jason)
> > > > > > > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > > > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > > > > > > >
> > > > > > > > > > v3:
> > > > > > > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > > > > > > (Jason).
> > > > > > > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > > > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > > > > > > (Jason)
> > > > > > > > > >
> > > > > > > > > > v2:
> > > > > > > > > > * Convert the use of mutex to rwlock.
> > > > > > > > > >
> > > > > > > > > > RFC v3:
> > > > > > > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > > > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > > > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > > > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > > > > > > VDUSE_IOTLB_GET_INFO.
> > > > > > > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > > > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > > > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > > > > > > ASIDs.
> > > > > > > > > >
> > > > > > > > > > RFC v2:
> > > > > > > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > > > > > > part of the struct is the same.
> > > > > > > > > > ---
> > > > > > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > > > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > > > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > > > #include <linux/init.h>
> > > > > > > > > > #include <linux/module.h>
> > > > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > > > #include <linux/cdev.h>
> > > > > > > > > > #include <linux/device.h>
> > > > > > > > > > #include <linux/eventfd.h>
> > > > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > > > >
> > > > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > > > struct mm_struct *mm;
> > > > > > > > > > };
> > > > > > > > > >
> > > > > > > > > > +struct vduse_as {
> > > > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > > > + struct vduse_umem *umem;
> > > > > > > > > > + struct mutex mem_lock;
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > > struct vduse_vq_group {
> > > > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > > > struct vduse_dev *dev;
> > > > > > > > > > };
> > > > > > > > > >
> > > > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > > > struct device *dev;
> > > > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > char *name;
> > > > > > > > > > struct mutex lock;
> > > > > > > > > > spinlock_t msg_lock;
> > > > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > > > u32 vq_num;
> > > > > > > > > > u32 vq_align;
> > > > > > > > > > u32 ngroups;
> > > > > > > > > > - struct vduse_umem *umem;
> > > > > > > > > > + u32 nas;
> > > > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > > > - struct mutex mem_lock;
> > > > > > > > > > unsigned int bounce_size;
> > > > > > > > > > struct mutex domain_lock;
> > > > > > > > > > };
> > > > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > > > u64 start, u64 last)
> > > > > > > > > > {
> > > > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > return -EINVAL;
> > > > > > > > > >
> > > > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > > > - msg.req.iova.start = start;
> > > > > > > > > > - msg.req.iova.last = last;
> > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > > > + msg.req.iova.start = start;
> > > > > > > > > > + msg.req.iova.last = last;
> > > > > > > > > > + } else {
> > > > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > > > + }
> > > > > > > > > >
> > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > }
> > > > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > > > return mask;
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > > > +{
> > > > > > > > > > + /*
> > > > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > > > + */
> > > > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > > > >
> > > > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > > > >
> > > > > > > >
> > > > > > > > What's the use? The only thing left here is to return from
> > > > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > > > here, do we?
> > > > > > >
> > > > > > > See below.
> > > > > > >
> > > > > > > >
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > > > {
> > > > > > > > > > int i;
> > > > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > > > >
> > > > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > > > +
> > > > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > > > >
> > > > > > > > > > down_write(&dev->rwsem);
> > > > > > > > > >
> > > > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > > > return ret;
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > > > + unsigned int asid)
> > > > > > > > > > +{
> > > > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > + int r;
> > > > > > > > > > +
> > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > > > + return -EINVAL;
> > > > > > > > > > +
> > > > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > > > +
> > > > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > + if (r < 0)
> > > > > > > > > > + return r;
> > > > > > > > > > +
> > > > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > > > + return 0;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > > > {
> > > > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > int ret;
> > > > > > > > > >
> > > > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > if (ret)
> > > > > > > > > > return ret;
> > > > > > > > > >
> > > > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > > > if (ret) {
> > > > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > return ret;
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > > > };
> > > > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > > > enum dma_data_direction dir)
> > > > > > > > > > {
> > > > > > > > > > struct vduse_dev *vdev;
> > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > > > >
> > > > > > > > > > if (!token.group)
> > > > > > > > > > return;
> > > > > > > > > >
> > > > > > > > > > vdev = token.group->dev;
> > > > > > > > > > - domain = vdev->domain;
> > > > > > > > > > -
> > > > > > > > > > + rcu_read_lock();
> > > > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > > > + domain = as->domain;
> > > > > > > > > > + rcu_read_unlock();
> > > > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > > > >
> > > > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > > > works.
> > > > > > > > >
> > > > > > > >
> > > > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > > > >
> > > > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > > > handled correctly.
> > > > > > >
> > > > > > > But I meant what happens if
> > > > > > >
> > > > > > > [cpu0]rcu_read_lock()
> > > > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > > > [cpu0]...
> > > > > > > [cpu0]rcu_read_unlock()
> > > > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > > > >
> > > > > >
> > > > > > That should go ok. What I'm trying to protect here is the iterations
> > > > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > > > >
> > > > > > I'm going to embed that function here in
> > > > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > > > the point easier:
> > > > > >
> > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > iova, size_t size, ...) {
> > > > > > read_lock(&token.group->as->domain);
> > > > > > while (size)
> > > > > > map = token.group->as->domain->bounce_maps[iova];
> > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > >
> > > > > > ...
> > > > > > page = token_group->as->domain->bounce_maps
> > > > > > addr = kmap_local_page(page);
> > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > kunmap_local(addr);
> > > > > > size -= sz;
> > > > > > iova += sz;
> > > > > > }
> > > > > > read_unlock(&token.group->as->domain);
> > > > > > }
> > > > >
> > > > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > > > >
> > > > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > > > for the "read lock" to be done.
> > > >
> > > > No, it doesn't need to wait as long as the reader part uses its own copy.
> > >
> > > It probably won't crash but I meant if we have logic issues. For
> > > example, once set_group_asid() return, there should still be a pending
> > > DMA that is using the old as.
> > >
> > > >
> > > > > But this is not the logic that is
> > > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > > vduse_set_group_asid_nomsg().
> > > >
> > > > We only set the pointer on the writer's side, we do nothing like
> > > > freeing resources. Should we set the pointer before or after
> > > > synchronize_rcu()? What do we need to do on the other side of
> > > > synchronize_rcu()?
> > >
> > > Usually we don't need special care on the read side. But as discussed,
> > > synchronize_rcu() is not a must but we need to explain why it is safe
> > > and I'm not sure Michael is fine with that.
> > > If we just want to make sure the order of publish and read, we can
> > > switch to use smp_store_release() and smp_load_acquire().
> > >
> > > >
> > > > > We need to explain why set_group_asid()
> > > > > doesn't need to wait and if this is true, we probably don't need RCU
> > > > > but to make sure the load/store is atomic.
> > > > >
> > > >
> > > > What about:
> > > >
> > > > * It does not matter if other thread modify group->as as long as the
> > > > reader uses the same as for all its operation. It performs a local
> > > > copy for that reason.
> > > > * It does not matter if multiple threads modify group->as as long as
> > > > the update is atomic.
> > >
> > > See above reply.
> > >
> > > >
> > > > ?
> > > >
> > > > > >
> > > > > > Now, depending on the point where another execution thread changes
> > > > > > token_group->as and how the compiler has chosen to generate the
> > > > > > machine code, the outcome could be:
> > > > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > > > another as is unlocked.
> > > > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > > > for the other one. So we access an invalid offset in
> > > > > > bounce_maps[iova].
> > > > > >
> > > > > > And I guess there are other possible outcomes too.
> > > > > >
> > > > > > So I need to make sure that the pointer accesses in all
> > > > > > vduse_domain_bounce is coherent.
> > > > >
> > > > > I'm not sure I got here, but it looks like it accepts a domain
> > > > > parameter and is protected by the bounce lock so we are probably fine
> > > > > here?
> > > > >
> > > >
> > > > The bounce lock only protects the iotlb tree, not the pointer to that
> > > > iotlb tree.
> > > >
> > > > > > I'm ok if it takes the one before the
> > > > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > > > change in the middle of the operation:
> > > > > >
> > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > iova, size_t size, ...) {
> > > > > > as = token.group->as;
> > > > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > > > read_lock(&as->domain);
> > > > > > while (size)
> > > > > > map = as->domain->bounce_maps[iova];
> > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > >
> > > > > > ...
> > > > > > page = as->domain->bounce_maps
> > > > > > addr = kmap_local_page(page);
> > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > kunmap_local(addr);
> > > > > > size -= sz;
> > > > > > iova += sz;
> > > > > > }
> > > > > > read_unlock(&as->domain);
> > > > > > }
> > > > > >
> > > > > > That can be done in many ways. Probably the read_lock is already
> > > > > > enough but it is not explicit that it is protecting token.group->as,
> > > > > > and future changes could remove it. To me, RCU is the most clear way
> > > > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > > > >
> > > > > I wonder if another group rwlock is sufficient here:
> > > > >
> > > > > for set_group_as_id()
> > > > >
> > > > > write_lock(&dev->groups[group].lock);
> > > > > dev->groups[group].as = &dev->as[asid];
> > > > > write_unlock(&dev->groups[group].lock);
> > > > >
> > > > > for the case where we need defer as
> > > > >
> > > > > read_lock(&dev->groups[group].lock);
> > > > > as = dev->groups[group].as;
> > > > > //using as
> > > > > read_unlock(&dev->groups[group].lock);
> > > > >
> > > > > If this works, we don't need to bother with thinking if the
> > > > > wait/synchronize_rcu() is really needed or not?
> > > > >
> > > >
> > > > A rwlock is sufficient but we need to modify the allocation code
> > > > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > > > read lock in the DMA ops too.
> > >
> > > Right, but it would always be a balance. We can make sure it works
> > > correctly first then do optimization on top.
> > >
> > > >
> > > > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > > > group ASID change needs to wait for the DMA operation to finish
> > > > instead of just applying for the next DMA ops. Not like vq group ASID
> > > > change would be in the hot path anyway, just pointing it out.
> > > >
> > > > > >
> > > > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > > > If this is an issue, we need to fix it.
> > > > > > >
> > > > > > > It's basically a question that
> > > > > > >
> > > > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > > > to the new as
> > > > > >
> > > > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > > > operation to continue. It is racy and the operation could fail, but
> > > > > > the kernel just returns an error and doesn't access invalid memory or
> > > > > > similar.
> > > > >
> > > > > See below.
> > > > >
> > > > > >
> > > > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > > > DMA to avoid potential issue
> > > > > > >
> > > > > >
> > > > > > No, the group will outlive the operation as it is bound to the device.
> > > > >
> > > > > I meant e.g the DMA could be triggered by the device. For example, the
> > > > > device may try to trigger an interrupt when the kernel is trying to
> > > > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > > > memory etc.
> > > >
> > > > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > > > group sent to the guest the race does not matter anymore: it is
> > > > explicitly opening the possibility from the guest to poke QEMU's
> > > > memory unless the guest is totally paused.
> > >
> > > Basically what I meant, assuming group0.as = as0
> > >
> > > cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> > > cpu1] set_group_asid(group0.as, as1)
> > > cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
> > >
> > > cpu0 may read as1 while it wants as0 actually?
> > >
> >
> > Yes, kind of. That's my point: adding synchronization at vduse level
> > does not fix it.
> >
> > There is no way to do that call from vhost/vdpa or userland, as there
> > is no way to get the AS of a vq group, only to set it. The closest
> > thing is to add a cache at that level, but that implies to add
> > multithreading sync on that upper layer, either vhost/vdpa or userland,
> > not in VDUSE.
>
> Probably.
>
> >
> > From vhost/vdpa level, all mapping calls (.set_map, .dma_map,
> > .dma_unmap) calls take the ASID directly, not the vq group. So the
> > call to set_group_asid does not need to access the vq group.
> >
> > Now let's say that we add that vdpa_ops callback (and ioctls) that
> > maps and unmap based on a vq_group. And all of the operations
> > (dma_map, set_group_asid, and dma_unmap) are serialized by taking the
> > same mutex. cpu0 still may dma_unmap over as0 if set_group_asid is not
> > properly serialized at vhost/vdpa or userland level:
> >
> > void* thread0_func(void* arg) {
> > struct {
> > int vq_group = 0,
> > int iova, size, perm, ...
> > } s;
> > int fd = (intptr_t)arg;
> >
> > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);
> > // TODO: Signal thread0 that it can proceed with SET_GROUP_ASID
> > // TODO: Wait until thread0 complete SET_GROUP_ASID
> >
> > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &data);
> >
> > return NULL;
> > }
> >
> > void* thread1_func(void* arg) {
> > struct vhost_vring_state s = {
> > .index = 0,
> > .num = 1,
> > };
> > int fd = (int)(intptr_t)arg;
> >
> > // TODO: Wait until thread2 calls dma_map
>
> This is something exactly rwlock or synchronize_rcu() can do? Or is
> this the charge of the vDPA parent to do that?
>

No, that's a userland synchronization problem that cannot be solved
at the kernel level. If we need to do something similar at the vdpa core
level (unlikely), we will need to synchronize at that level too.

If we add an rwlock but don't implement the synchronization in the
userland program marked as TODO, this sequence is also possible:

cpu0] DMA map ioctl call in userland
-> Take read_lock in the vduse module
-> Update the IOTLB tree of ASID 0
-> unlock read_lock

Now we have two possibilities: either cpu0] DMA_UNMAP is called or
cpu1] set_group_asid is called. The VDUSE module rwlock ensures that
they will not run at the same time, but we need to implement something
at the userland level if we want a predictable outcome, marked as the
TODO in the comments. If, by chance, the DMA unmap is the one that
comes next, the AS that gets updated is ASID 0:

cpu0] DMA unmap ioctl call in userland
-> Take read_lock in the vduse module
-> Update the IOTLB tree of ASID 0
-> unlock read_lock
cpu1] set_group_asid ioctl call in userland
-> Take write_lock in the vduse module
-> Update ASID of the VQ GROUP 0 to 1
-> unlock write_lock

If set_group_asid runs first by chance, ASID 1 is the one that is updated:

cpu1] set_group_asid ioctl call in userland
-> Take write_lock in the vduse module
-> Update ASID of the VQ GROUP 0 to 1
-> unlock write_lock
cpu0] DMA unmap ioctl call in userland
-> Take read_lock in the vduse module
-> Update the IOTLB tree of ASID 0
-> unlock read_lock

On the other hand, we have this version of the series, which allows these
actions to run at the same time. It just makes sure that the update of
the IOTLB tree is coherent, by copying the vq group ASID value at one
point in time and making sure it sticks to that ASID until the end of
the set_map call. I'm not adding the synchronize_rcu call because we
stated it should not be called from userland, but the outcome is
similar.
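
For reference, this is the reader-side pattern the sync/map callbacks in
this patch use (simplified from the diff above; error handling omitted):

        struct vduse_as *as;
        struct vduse_iova_domain *domain;

        /* Sample the group's AS pointer exactly once ... */
        rcu_read_lock();
        as = rcu_dereference(token.group->as);
        domain = as->domain;
        rcu_read_unlock();

        /* ... and only use the local copy from here on */
        vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);

A concurrent vduse_set_group_asid_nomsg() can still flip group->as at any
point, but the callback keeps working on whatever domain it sampled, so it
never mixes two address spaces within one operation.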

Let me put another example: to me it's like calling dup2() and write() from
different threads over the same set of fds without userland
synchronization. The kernel protects against things like writing part
of the content to one file and another part to the other file, but the
content of the files at the end of the write()s is just not
predictable. And the kernel just cannot make it predictable.

#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static int fd_a;
static int fd_b;

/* Writes through fd_b, which may or may not have been redirected already */
void *write_thread(void *arg) {
        const char *msg = "Writing to fd_b\n";

        write(fd_b, msg, strlen(msg));
        return NULL;
}

/* Redirects fd_b to the file behind fd_a */
void *dup_thread(void *arg) {
        dup2(fd_a, fd_b);
        return NULL;
}

int main() {
        pthread_t writer, duper;

        fd_a = open("/tmp/a", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        fd_b = open("/tmp/b", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        pthread_create(&writer, NULL, write_thread, NULL);
        pthread_create(&duper, NULL, dup_thread, NULL);

        pthread_join(writer, NULL);
        pthread_join(duper, NULL);

        close(fd_a);
        close(fd_b);
        return 0;
}
--
> If it's the responsibility of the parent, it would be much harder for
> VDUSE as the datapath is implemented in userspace via mmap() or
> umem_reg.
>
> Looking at existing implementation:
>
> - for mlx5e, it looks like it assumes the set_group_asid() works only
> without DRIVER_OK.
> - for the simulator, it looks like it can synchronize with the
> datapath with the spinlock as datapath is emulated
>

Yes, the next version will use the spinlock-based rwlock. I just need to
move the allocation of the domain out of it for that to be valid.
> > ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s)
> > // TODO: Signal thread2 that can proceed with dma_unmap
>
> But the issue I mention is that the, from the view of the vDPA bus:
>
> 1) it offers set_group_as_id()
> 2) it doesn't know if virtio-vdpa or vhost-vdpa is used
>
> So theoretically, set_group_as_id() could happen between
>
> dma_addr = dma_map();
>
> and
>
> dma_unmap(dma_adrr);
>
> But those two dma_addr refers to the different address space.

I don't get this, these calls take the ASID as the parameter, not the
vq group. I thought this was by design, as tracking which vq groups to
update seems way more difficult to me. Can you give an example of a
userland application that hits the race you describe with the existing
ioctls?
> Instead
> of trying to do synchronization, maybe we can simply fail
> set_group_asid if DRIVER_OK is set.
>

That's a good possibility, especially since mlx5 already does it.
There is ongoing work to enable the SVQ dataplane dynamically without
needing to reset the whole device, but we will need a new feature flag
to know whether the parent driver supports it.
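
If we go that route, a minimal sketch could look like this (assuming
dev->status caches the last status written by the driver; purely
illustrative, not part of this series):

        static int vduse_set_group_asid(struct vdpa_device *vdpa,
                                        unsigned int group, unsigned int asid)
        {
                struct vduse_dev *dev = vdpa_to_vduse(vdpa);

                if (dev->api_version < VDUSE_API_VERSION_1 ||
                    group >= dev->ngroups || asid >= dev->nas)
                        return -EINVAL;

                /* Hypothetical hardening: no ASID changes once DRIVER_OK is set */
                if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
                        return -EBUSY;

                /* ... message the VDUSE device and update group->as as above ... */
                return 0;
        }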
> >
> > return NULL;
> > }
> >
> > int main() {
> > pthread_t thread0, thread1;
> > int fd = open("/dev/vhost-vdpa-0", ...)
> >
> > pthread_create(&thread0, NULL, thread0_func, (void *)(intptr_t)fd);
> > pthread_create(&thread1, NULL, thread1_func, (void *)(intptr_t)fd);
> >
> > pthread_join(thread1, NULL);
> > pthread_join(thread2, NULL);
> >
> > return EXIT_SUCCESS;
> > }
> > ---
> >
> > We need something to synchronize at userland level here, filling the TODOs.
> >
> > We can replace the hypothetical VHOST_VDPA_VQ_GROUP_DMA_MAP and _UNMAP
> > with access to vqs, and the result is the same: The userland
> > application is the one that needs to serialize the access from thread0
> > if it wants a predictable outcome against the accesses from thread1.
> > There is no way to do it at vduse level.
> >
> > We need some synchronization to avoid a malicious or buggy userland
> > messing things up, that's for sure. So DMA_MAP and DMA_UNMAP do not half
> > update the iotlb tree. And I'll send the next version with rwlock,
> > protecting as much as possible. But I want to make clear that it will
> > not avoid the race you describe here.
>
> See above.
>
> >
> > > >
> > > > > But if you mean we depend on the IOTLB to guard against
> > > > > this, I'm fine, but let's document why we don't need it and how the
> > > > > IOTLB layer can help to eliminate such risk.
> > > > >
> > > >
> > > > No, I'm not happy about letting iotlb lock to protect this too as
> > > > they're at different levels actually: One is protecting iotlb trees
> > > > modifications while they're being read and the other is protecting the
> > > > ASID assignment to different vq groups. To reuse them means a
> > > > modification in any tree blocks the change of vq group ASID, for
> > > > example.
> > > >
> > > > > Anyhow, tracking and failing seems to be more robust.
> > > >
> > > > I'm not sure I get this. If a DMA read starts in one ASID and then
> > > > QEMU changes the ASID of the vq group, do you prefer it to fail rather
> > > > than continue reading from the original ASID?
> > >
> > > If possible, it would be better.
> > >
> >
> > It's hard for me to think in some way that does not add a lot of
> > overhead or it is very complex. But it is good to know this is
> > acceptable.
> >
> > > > It seems hard to
> > > > communicate that the ASID has changed to the DMA operation callback.
> > >
> > > Maybe we can encode this into iova.
> > >
> >
> > I'm adding the full rwlock, but can you expand on your idea on this?
>
> Encode the asid to the upper bits of IOVA, so when doing dma_unmap()
> we can compare the group/as with the one that is encoded in dma_addr.
> If it differs, warn or bug.
>

But the ASID is already a parameter of dma_unmap. I thought you
meant encoding it in the device's memory reads.
> If we can make sure there's no set_group_asid() when DRIVER_OK is set,
> it's not a must then (or could be treated as a kind of hardening).
>
> > It would be great to have it documented in case we need future
> > optimizations.
> >
>
> Thanks
>
On Thu, Dec 4, 2025 at 4:33 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
>
> On Thu, Dec 4, 2025 at 3:15 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Dec 3, 2025 at 3:58 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> > >
> > > On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > > > > <eperezma@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > > > > >
> > > > > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > > > > >
> > > > > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > > > > modify the commit message.
> > > > > > > > >
> > > > > > > > > > More below.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > >
> > > > > > > > > I forgot to remove this, my bad!
> > > > > > > > >
> > > > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > > v9:
> > > > > > > > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > > > > > > > context.
> > > > > > > > > > >
> > > > > > > > > > > v8:
> > > > > > > > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > > > > > > > justify it.
> > > > > > > > > > >
> > > > > > > > > > > v7:
> > > > > > > > > > > * Take write lock in the error path (Jason).
> > > > > > > > > > >
> > > > > > > > > > > v6:
> > > > > > > > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > > > > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > > > > > > > (MST).
> > > > > > > > > > > * Fix struct name not matching in the doc.
> > > > > > > > > > >
> > > > > > > > > > > v5:
> > > > > > > > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > > > > > > > ioctl (Jason).
> > > > > > > > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > > > > > > > * Exclude "padding" member from the only >V1 members in
> > > > > > > > > > > vduse_dev_request.
> > > > > > > > > > >
> > > > > > > > > > > v4:
> > > > > > > > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > > > > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > > > > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > > > > > > > return; } (Jason)
> > > > > > > > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > > > > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > > > > > > > >
> > > > > > > > > > > v3:
> > > > > > > > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > > > > > > > (Jason).
> > > > > > > > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > > > > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > > > > > > > (Jason)
> > > > > > > > > > >
> > > > > > > > > > > v2:
> > > > > > > > > > > * Convert the use of mutex to rwlock.
> > > > > > > > > > >
> > > > > > > > > > > RFC v3:
> > > > > > > > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > > > > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > > > > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > > > > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > > > > > > > VDUSE_IOTLB_GET_INFO.
> > > > > > > > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > > > > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > > > > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > > > > > > > ASIDs.
> > > > > > > > > > >
> > > > > > > > > > > RFC v2:
> > > > > > > > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > > > > > > > part of the struct is the same.
> > > > > > > > > > > ---
> > > > > > > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > > > > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > > > > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > > > > #include <linux/init.h>
> > > > > > > > > > > #include <linux/module.h>
> > > > > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > > > > #include <linux/cdev.h>
> > > > > > > > > > > #include <linux/device.h>
> > > > > > > > > > > #include <linux/eventfd.h>
> > > > > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > > > > >
> > > > > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > > > > struct mm_struct *mm;
> > > > > > > > > > > };
> > > > > > > > > > >
> > > > > > > > > > > +struct vduse_as {
> > > > > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > > > > + struct vduse_umem *umem;
> > > > > > > > > > > + struct mutex mem_lock;
> > > > > > > > > > > +};
> > > > > > > > > > > +
> > > > > > > > > > > struct vduse_vq_group {
> > > > > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > > > > struct vduse_dev *dev;
> > > > > > > > > > > };
> > > > > > > > > > >
> > > > > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > > > > struct device *dev;
> > > > > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > char *name;
> > > > > > > > > > > struct mutex lock;
> > > > > > > > > > > spinlock_t msg_lock;
> > > > > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > > > > u32 vq_num;
> > > > > > > > > > > u32 vq_align;
> > > > > > > > > > > u32 ngroups;
> > > > > > > > > > > - struct vduse_umem *umem;
> > > > > > > > > > > + u32 nas;
> > > > > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > > > > - struct mutex mem_lock;
> > > > > > > > > > > unsigned int bounce_size;
> > > > > > > > > > > struct mutex domain_lock;
> > > > > > > > > > > };
> > > > > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > > > > u64 start, u64 last)
> > > > > > > > > > > {
> > > > > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > return -EINVAL;
> > > > > > > > > > >
> > > > > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > > > > - msg.req.iova.start = start;
> > > > > > > > > > > - msg.req.iova.last = last;
> > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > > > > + msg.req.iova.start = start;
> > > > > > > > > > > + msg.req.iova.last = last;
> > > > > > > > > > > + } else {
> > > > > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > > > > + }
> > > > > > > > > > >
> > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > }
> > > > > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > > > > return mask;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > > > > +{
> > > > > > > > > > > + /*
> > > > > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > > > > + */
> > > > > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > > > > >
> > > > > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > What's the use? The only thing left here is to return from
> > > > > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > > > > here, do we?
> > > > > > > >
> > > > > > > > See below.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > > > > {
> > > > > > > > > > > int i;
> > > > > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > > > > >
> > > > > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > > > > +
> > > > > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > + }
> > > > > > > > > > > +
> > > > > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > > > > >
> > > > > > > > > > > down_write(&dev->rwsem);
> > > > > > > > > > >
> > > > > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > > > > return ret;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > > > > + unsigned int asid)
> > > > > > > > > > > +{
> > > > > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > + int r;
> > > > > > > > > > > +
> > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > > > > + return -EINVAL;
> > > > > > > > > > > +
> > > > > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > > > > +
> > > > > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > + if (r < 0)
> > > > > > > > > > > + return r;
> > > > > > > > > > > +
> > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > > > > + return 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > > > > {
> > > > > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > int ret;
> > > > > > > > > > >
> > > > > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > if (ret)
> > > > > > > > > > > return ret;
> > > > > > > > > > >
> > > > > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > > > > if (ret) {
> > > > > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > return ret;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > > > > };
> > > > > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > > > > enum dma_data_direction dir)
> > > > > > > > > > > {
> > > > > > > > > > > struct vduse_dev *vdev;
> > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > > > > >
> > > > > > > > > > > if (!token.group)
> > > > > > > > > > > return;
> > > > > > > > > > >
> > > > > > > > > > > vdev = token.group->dev;
> > > > > > > > > > > - domain = vdev->domain;
> > > > > > > > > > > -
> > > > > > > > > > > + rcu_read_lock();
> > > > > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > > > > + domain = as->domain;
> > > > > > > > > > > + rcu_read_unlock();
> > > > > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > > > > >
> > > > > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > > > > works.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > > > > >
> > > > > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > > > > handled correctly.
> > > > > > > >
> > > > > > > > But I meant what happens if
> > > > > > > >
> > > > > > > > [cpu0]rcu_read_lock()
> > > > > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > > > > [cpu0]...
> > > > > > > > [cpu0]rcu_read_unlock()
> > > > > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > > > > >
> > > > > > >
> > > > > > > That should go ok. What I'm trying to protect here is the iterations
> > > > > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > > > > >
> > > > > > > I'm going to embed that function here in
> > > > > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > > > > the point easier:
> > > > > > >
> > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > iova, size_t size, ...) {
> > > > > > > read_lock(&token.group->as->domain);
> > > > > > > while (size)
> > > > > > > map = token.group->as->domain->bounce_maps[iova];
> > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > >
> > > > > > > ...
> > > > > > > page = token_group->as->domain->bounce_maps
> > > > > > > addr = kmap_local_page(page);
> > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > kunmap_local(addr);
> > > > > > > size -= sz;
> > > > > > > iova += sz;
> > > > > > > }
> > > > > > > read_unlock(&token.group->as->domain);
> > > > > > > }
> > > > > >
> > > > > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > > > > >
> > > > > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > > > > for the "read lock" to be done.
> > > > >
> > > > > No, it doesn't need to wait as long as the reader part uses its own copy.
> > > >
> > > > It probably won't crash but I meant if we have logic issues. For
> > > > example, once set_group_asid() return, there should still be a pending
> > > > DMA that is using the old as.
> > > >
> > > > >
> > > > > > But this is not the logic that is
> > > > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > > > vduse_set_group_asid_nomsg().
> > > > >
> > > > > We only set the pointer on the writer's side, we do nothing like
> > > > > freeing resources. Should we set the pointer before or after
> > > > > synchronize_rcu()? What do we need to do on the other side of
> > > > > synchronize_rcu()?
> > > >
> > > > Usually we don't need special care on the read side. But as discussed,
> > > > synchronize_rcu() is not a must but we need to explain why it is safe
> > > > and I'm not sure Michael is fine with that.
> > > > If we just want to make sure the order of publish and read, we can
> > > > switch to use smp_store_release() and smp_load_acquire().
> > > >
> > > > >
> > > > > > We need to explain why set_group_asid()
> > > > > > doesn't need to wait and if this is true, we probably don't need RCU
> > > > > > but to make sure the load/store is atomic.
> > > > > >
> > > > >
> > > > > What about:
> > > > >
> > > > > * It does not matter if other thread modify group->as as long as the
> > > > > reader uses the same as for all its operation. It performs a local
> > > > > copy for that reason.
> > > > > * It does not matter if multiple threads modify group->as as long as
> > > > > the update is atomic.
> > > >
> > > > See above reply.
> > > >
> > > > >
> > > > > ?
> > > > >
> > > > > > >
> > > > > > > Now, depending on the point where another execution thread changes
> > > > > > > token_group->as and how the compiler has chosen to generate the
> > > > > > > machine code, the outcome could be:
> > > > > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > > > > another as is unlocked.
> > > > > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > > > > for the other one. So we access an invalid offset in
> > > > > > > bounce_maps[iova].
> > > > > > >
> > > > > > > And I guess there are other possible outcomes too.
> > > > > > >
> > > > > > > So I need to make sure that the pointer accesses in all
> > > > > > > vduse_domain_bounce is coherent.
> > > > > >
> > > > > > I'm not sure I got here, but it looks like it accepts a domain
> > > > > > parameter and is protected by the bounce lock so we are probably fine
> > > > > > here?
> > > > > >
> > > > >
> > > > > The bounce lock only protects the iotlb tree, not the pointer to that
> > > > > iotlb tree.
> > > > >
> > > > > > > I'm ok if it takes the one before the
> > > > > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > > > > change in the middle of the operation:
> > > > > > >
> > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > iova, size_t size, ...) {
> > > > > > > as = token.group->as;
> > > > > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > > > > read_lock(&as->domain);
> > > > > > > while (size)
> > > > > > > map = as->domain->bounce_maps[iova];
> > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > >
> > > > > > > ...
> > > > > > > page = as->domain->bounce_maps
> > > > > > > addr = kmap_local_page(page);
> > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > kunmap_local(addr);
> > > > > > > size -= sz;
> > > > > > > iova += sz;
> > > > > > > }
> > > > > > > read_unlock(&as->domain);
> > > > > > > }
> > > > > > >
> > > > > > > That can be done in many ways. Probably the read_lock is already
> > > > > > > enough but it is not explicit that it is protecting token.group->as,
> > > > > > > and future changes could remove it. To me, RCU is the most clear way
> > > > > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > > > > >
> > > > > > I wonder if another group rwlock is sufficient here:
> > > > > >
> > > > > > for set_group_as_id()
> > > > > >
> > > > > > write_lock(&dev->groups[group].lock);
> > > > > > dev->groups[group].as = &dev->as[asid];
> > > > > > write_unlock(&dev->groups[group].lock);
> > > > > >
> > > > > > for the case where we need defer as
> > > > > >
> > > > > > read_lock(&dev->groups[group].lock);
> > > > > > as = dev->groups[group].as;
> > > > > > //using as
> > > > > > read_unlock(&dev->groups[group].lock);
> > > > > >
> > > > > > If this works, we don't need to bother with thinking if the
> > > > > > wait/synchronize_rcu() is really needed or not?
> > > > > >
> > > > >
> > > > > A rwlock is sufficient but we need to modify the allocation code
> > > > > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > > > > read lock in the DMA ops too.
> > > >
> > > > Right, but it would always be a balance. We can make sure it works
> > > > correctly first then do optimization on top.
> > > >
> > > > >
> > > > > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > > > > group ASID change needs to wait for the DMA operation to finish
> > > > > instead of just applying for the next DMA ops. Not like vq group ASID
> > > > > change would be in the hot path anyway, just pointing it out.
> > > > >
> > > > > > >
> > > > > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > > > > If this is an issue, we need to fix it.
> > > > > > > >
> > > > > > > > It's basically a question that
> > > > > > > >
> > > > > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > > > > to the new as
> > > > > > >
> > > > > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > > > > operation to continue. It is racy and the operation could fail, but
> > > > > > > the kernel just returns an error and doesn't access invalid memory or
> > > > > > > similar.
> > > > > >
> > > > > > See below.
> > > > > >
> > > > > > >
> > > > > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > > > > DMA to avoid potential issue
> > > > > > > >
> > > > > > >
> > > > > > > No, the group will outlive the operation as it is bound to the device.
> > > > > >
> > > > > > I meant e.g the DMA could be triggered by the device. For example, the
> > > > > > device may try to trigger an interrupt when the kernel is trying to
> > > > > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > > > > memory etc.
> > > > >
> > > > > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > > > > group sent to the guest the race does not matter anymore: it is
> > > > > explicitly opening the possibility from the guest to poke QEMU's
> > > > > memory unless the guest is totally paused.
> > > >
> > > > Basically what I meant, assuming group0.as = as0
> > > >
> > > > cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> > > > cpu1] set_group_asid(group0.as, as1)
> > > > cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
> > > >
> > > > cpu0 may read as1 while it wants as0 actually?
> > > >
> > >
> > > Yes, kind of. That's my point: adding synchronization at vduse level
> > > does not fix it.
> > >
> > > There is no way to do that call from vhost/vdpa or userland, as there
> > > is no way to get the AS of a vq group, only to set it. The closest
> > > thing is to add a cache at that level, but that implies to add
> > > multithreading sync on that upper layer, either vhost/vdpa or userland,
> > > not in VDUSE.
> >
> > Probably.
> >
> > >
> > > From vhost/vdpa level, all mapping calls (.set_map, .dma_map,
> > > .dma_unmap) calls take the ASID directly, not the vq group. So the
> > > call to set_group_asid does not need to access the vq group.
> > >
> > > Now let's say that we add that vdpa_ops callback (and ioctls) that
> > > maps and unmap based on a vq_group. And all of the operations
> > > (dma_map, set_group_asid, and dma_unmap) are serialized by taking the
> > > same mutex. cpu0 still may dma_unmap over as0 if set_group_asid is not
> > > properly serialized at vhost/vdpa or userland level:
> > >
> > > void* thread0_func(void* arg) {
> > > struct {
> > > int vq_group = 0,
> > > int iova, size, perm, ...
> > > } s;
> > > int fd = (intptr_t)arg;
> > >
> > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);

I don't get the semantics of VHOST_VDPA_VQ_GROUP_DMA_MAP. And it's
probably wrong to allow userspace to map based on the group.

We have VHOST_IOTLB_UPDATE, which maps pages based on the AS (asid);
this is correct.

More below.
> > > // TODO: Signal thread0 that it can proceed with SET_GROUP_ASID
> > > // TODO: Wait until thread0 complete SET_GROUP_ASID
> > >
> > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &data);
> > >
> > > return NULL;
> > > }
> > >
> > > void* thread1_func(void* arg) {
> > > struct vhost_vring_state s = {
> > > .index = 0,
> > > .num = 1,
> > > };
> > > int fd = (int)(intptr_t)arg;
> > >
> > > // TODO: Wait until thread2 calls dma_map
> >
> > This is something exactly rwlock or synchronize_rcu() can do? Or is
> > this the charge of the vDPA parent to do that?
> >
>
> No, that's an userland synchronization problem that cannot be solved
> at kernel level. If we need to do something similar at vdpa core level
> (unlikely), we will need to synchronize at that level too.

Ok, basically I think we are talking about different things. That's fine.

If I understand you correctly, you mean we need to synchronize between
the IOTLB updates (IOTLB_UPDATE/IOTLB_INVALIDATE) and set_group_asid()?
This sounds unnecessary since:

1) IOTLB_UPDATE/IOTLB_INVALIDATE updates the internal mappings of an
   address space
2) set_group_asid() assigns an AS to a group
>
> If we add an rwlock but don't implement the synchronization in the
> userland program marked as TODO, this sequence is also possible:
>
> cpu0] DMA map ioctl call in userland
> -> Take read_lock in the vduse module
> -> Update the IOTLB tree of ASID 0
> -> unlock read_lock
>
> Now we have two possibilities: either cpu0] DMA_UNMAP is called or
> cpu1] set_group_asid is called. The VDUSE module rwlock protects that
> they will not run at the same time, but we need to implement something
> at the userland level if we want a predictable outcome, marked as the
> TODO in the comments. If, by chance, the DMA unmap is the one that
> comes next, the AS that gets updated is ASID 0:
>
> cpu0] DMA unmap ioctl call in userland
> -> Take read_lock in the vduse module
> -> Update the IOTLB tree of ASID 0
> -> unlock read_lock
> cpu1] set_group_asid ioctl call in userland
> -> Take write_lock in the vduse module
> -> Update ASID of the VQ GROUP 0 to 1
> -> unlock write_lock
>
> If set_group_asid run first by chance, ASID 1 is the one that is updated:
> cpu1] set_group_asid ioctl call in userland
> -> Take write_lock in the vduse module
> -> Update ASID of the VQ GROUP 0 to 1
> -> unlock write_lock
> cpu0] DMA unmap ioctl call in userland
> -> Take read_lock in the vduse module
> -> Update the IOTLB tree of ASID 0
> -> unlock read_lock
>
> On the other hand we have this version of the series that allows these
> actions to run at the same time. It just makes sure that the update of
> the IOTLB tree is coherent, by copying the vq group ASID value at one
> point in time and making sure it sticks to that ASID until the end of
> the set_map call. I'm not adding the synchronize_rcu call because we
> stated it should not be called from userland, but the outcome is
> similar.
I think we should figure out if VHOST_VDPA_VQ_GROUP_DMA_MAP is useful or not.
>
> Let me put another example: It's like calling dup2() and write() from
> different threads over the same set of fds without userland
> synchronization to me. The kernel protects things like not write part
> of the content in one file and another part on the other file, but the
> content of the files at the end of the writes() is just not
> predictable. And the kernel just cannot make it predictable.
>
> static int fd_a;
> static int fd_b;
>
> void *write_thread(void *arg) {
> const char *msg = "Writing to fd_b\n";
> write(fd_b, msg, strlen(msg));
> return NULL;
> }
>
> void *dup_thread(void *arg) {
> dup2(fd_a, fd_b);
> return NULL;
> }
>
> int main() {
> pthread_t writer, duper;
>
> fd_a = open("/tmp/a", O_WRONLY | O_CREAT | O_TRUNC, 0644);
> fd_b = open("/tmp/b", O_WRONLY | O_CREAT | O_TRUNC, 0644);
>
> pthread_create(&writer, NULL, write_thread, NULL);
> pthread_create(&duper, NULL, dup_thread, NULL);
>
> pthread_join(writer, NULL);
> pthread_join(duper, NULL);
>
> close(fd_a);
> close(fd_b);
>
> return 0;
> }
> --
>
> > If it's the responsibility of the parent, it would be much harder for
> > VDUSE as the datapath is implemented in userspace via mmap() or
> > umem_reg.
> >
> > Looking at existing implementation:
> >
> > - for mlx5e, it looks like it assumes the set_group_asid() works only
> > without DRIVER_OK.
> > - for the simulator, it looks like it can synchronize with the
> > datapath with the spinlock as datapath is emulated
> >
>
> Yes, the next version will use the rwlock spinlock. I just need to
> take out the allocation of the domain for it to be valid.
>
> > > ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s)
> > > // TODO: Signal thread2 that can proceed with dma_unmap
> >
> > But the issue I mention is that the, from the view of the vDPA bus:
> >
> > 1) it offers set_group_as_id()
> > 2) it doesn't know if virtio-vdpa or vhost-vdpa is used
> >
> > So theoretically, set_group_as_id() could happen between
> >
> > dma_addr = dma_map();
> >
> > and
> >
> > dma_unmap(dma_adrr);
> >
> > But those two dma_addr refers to the different address space.
>
> I don't get this, these calls take the ASID as the parameter, not the
> vq group. I thought this was by design, as telling what vq groups
> update seems way more difficult to me. Can you put an example of an
> userland application that has the race you describe with the existing
> ioctls?

Just to clarify, by dma_map()/dma_unmap() I don't mean the UAPI part
(and we don't have that). I basically mean the DMA API which is used
by virtio-vDPA.
>
> > Instead
> > of trying to do synchronization, maybe we can simply fail
> > set_group_asid if DRIVER_OK is set.
> >
>
> That's a good possibility, especially since mlx5 already does it.
> There is ongoing work to enable dataplane SVQ dynamically without
> needing to reset the whole device, but we will need a new feature flag
> to know if the parent driver supports it.
Did you mean the SVQ may update the group asid while DRIVER_OK is set?
If yes, we need to fix that.
>
> > >
> > > return NULL;
> > > }
> > >
> > > int main() {
> > > pthread_t thread0, thread1;
> > > int fd = open("/dev/vhost-vdpa-0", ...)
> > >
> > > pthread_create(&thread0, NULL, thread0_func, (void *)(intptr_t)fd);
> > > pthread_create(&thread1, NULL, thread1_func, (void *)(intptr_t)fd);
> > >
> > > pthread_join(thread1, NULL);
> > > pthread_join(thread2, NULL);
> > >
> > > return EXIT_SUCCESS;
> > > }
> > > ---
> > >
> > > We need something to synchronize at userland level here, filling the TODOs.
> > >
> > > We can replace the hypothetical VHOST_VDPA_VQ_GROUP_DMA_MAP and _UNMAP
> > > with access to vqs, and the result is the same: The userland
> > > application is the one that needs to serialize the access from thread0
> > > if it wants predictive outcome against the accesses from thread1.
> > > There is no way to do it at vduse level.
> > >
> > > We need some syncrhonization to avoid malicious or buggy userland to
> > > mess things, that's for sure. So DMA_MAP and DMA_UNMAP does not half
> > > update the iotlb tree. And I'll send the next version with rwlock,
> > > protecting as much as possible. But I want to make clear that it will
> > > not avoid the race you describe here.
> >
> > See above.
> >
> > >
> > > > >
> > > > > > But if you mean we depend on the IOTLB to guard against
> > > > > > this, I'm fine, but let's document why we don't need it and how the
> > > > > > IOTLB layer can help to eliminate such risk.
> > > > > >
> > > > >
> > > > > No, I'm not happy about letting iotlb lock to protect this too as
> > > > > they're at different levels actually: One is protecting iotlb trees
> > > > > modifications while they're being read and the other is protecting the
> > > > > ASID assignment to different vq groups. To reuse them means a
> > > > > modification in any tree blocks the change of vq group ASID, for
> > > > > example.
> > > > >
> > > > > > Anyhow, tracking and failing seems to be more robust.
> > > > >
> > > > > I'm not sure I get this. If a DMA read starts in one ASID and then
> > > > > QEMU changes the ASID of the vq group, do you prefer it to fail rather
> > > > > than continue reading from the original ASID?
> > > >
> > > > If possible, it would be better.
> > > >
> > >
> > > It's hard for me to think in some way that does not add a lot of
> > > overhead or it is very complex. But it is good to know this is
> > > acceptable.
> > >
> > > > > It seems hard to
> > > > > communicate that the ASID has changed to the DMA operation callback.
> > > >
> > > > Maybe we can encode this into iova.
> > > >
> > >
> > > I'm adding the full rwlock, but can you expand on your idea on this?
> >
> > Encode the asid to the upper bits of IOVA, so when doing dma_unmap()
> > we can compare the group/as with the one that is encoded in dma_addr.
> > If it differs, warn or bug.
> >
>
> But the ASID is already a parameter in the dma_unmap. I thought you
> meant to encode in the device's memory read.
To avoid unmapping an address that does not belong to this ASID.
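
Something like this minimal sketch of the idea; the bit layout, the macro
names and the WARN are only illustrative, not a concrete proposal:

/* Illustrative only: reserve the top bits of the bounce IOVA for the ASID. */
#define VDUSE_IOVA_ASID_SHIFT 48
#define VDUSE_IOVA_ASID(iova) ((u32)((iova) >> VDUSE_IOVA_ASID_SHIFT))

static dma_addr_t vduse_encode_iova_asid(u32 asid, dma_addr_t iova)
{
        return ((dma_addr_t)asid << VDUSE_IOVA_ASID_SHIFT) | iova;
}

/* At unmap time, compare the ASID encoded in dma_addr with the current one. */
static void vduse_check_unmap_asid(u32 cur_asid, dma_addr_t dma_addr)
{
        WARN_ONCE(VDUSE_IOVA_ASID(dma_addr) != cur_asid,
                  "vduse: unmap of an IOVA that belongs to another ASID\n");
}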
>
> > If we can make sure there's no set_group_asid() when DRIVER_OK is set,
> > it's not a must then (or could be treated as a kind of hardening).
> >
> > > It would be great to have it documented in case we need future
> > > optimizations.
> > >
> >
> > Thanks
> >
Thanks
On Fri, Dec 5, 2025 at 2:52 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Dec 4, 2025 at 4:33 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> >
> > On Thu, Dec 4, 2025 at 3:15 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Dec 3, 2025 at 3:58 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> > > >
> > > > On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > > > > > >
> > > > > > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > > > > > >
> > > > > > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > > > > > modify the commit message.
> > > > > > > > > >
> > > > > > > > > > > More below.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > >
> > > > > > > > > > I forgot to remove this, my bad!
> > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > > v9:
> > > > > > > > > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > > > > > > > > context.
> > > > > > > > > > > >
> > > > > > > > > > > > v8:
> > > > > > > > > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > > > > > > > > justify it.
> > > > > > > > > > > >
> > > > > > > > > > > > v7:
> > > > > > > > > > > > * Take write lock in the error path (Jason).
> > > > > > > > > > > >
> > > > > > > > > > > > v6:
> > > > > > > > > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > > > > > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > > > > > > > > (MST).
> > > > > > > > > > > > * Fix struct name not matching in the doc.
> > > > > > > > > > > >
> > > > > > > > > > > > v5:
> > > > > > > > > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > > > > > > > > ioctl (Jason).
> > > > > > > > > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > > > > > > > > * Exclude "padding" member from the only >V1 members in
> > > > > > > > > > > > vduse_dev_request.
> > > > > > > > > > > >
> > > > > > > > > > > > v4:
> > > > > > > > > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > > > > > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > > > > > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > > > > > > > > return; } (Jason)
> > > > > > > > > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > > > > > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > > > > > > > > >
> > > > > > > > > > > > v3:
> > > > > > > > > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > > > > > > > > (Jason).
> > > > > > > > > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > > > > > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > > > > > > > > (Jason)
> > > > > > > > > > > >
> > > > > > > > > > > > v2:
> > > > > > > > > > > > * Convert the use of mutex to rwlock.
> > > > > > > > > > > >
> > > > > > > > > > > > RFC v3:
> > > > > > > > > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > > > > > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > > > > > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > > > > > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > > > > > > > > VDUSE_IOTLB_GET_INFO.
> > > > > > > > > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > > > > > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > > > > > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > > > > > > > > ASIDs.
> > > > > > > > > > > >
> > > > > > > > > > > > RFC v2:
> > > > > > > > > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > > > > > > > > part of the struct is the same.
> > > > > > > > > > > > ---
> > > > > > > > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > > > > > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > > > > > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > > > > > #include <linux/init.h>
> > > > > > > > > > > > #include <linux/module.h>
> > > > > > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > > > > > #include <linux/cdev.h>
> > > > > > > > > > > > #include <linux/device.h>
> > > > > > > > > > > > #include <linux/eventfd.h>
> > > > > > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > > > > > >
> > > > > > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > > > > > struct mm_struct *mm;
> > > > > > > > > > > > };
> > > > > > > > > > > >
> > > > > > > > > > > > +struct vduse_as {
> > > > > > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > > > > > + struct vduse_umem *umem;
> > > > > > > > > > > > + struct mutex mem_lock;
> > > > > > > > > > > > +};
> > > > > > > > > > > > +
> > > > > > > > > > > > struct vduse_vq_group {
> > > > > > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > > > > > struct vduse_dev *dev;
> > > > > > > > > > > > };
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > > > > > struct device *dev;
> > > > > > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > > char *name;
> > > > > > > > > > > > struct mutex lock;
> > > > > > > > > > > > spinlock_t msg_lock;
> > > > > > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > > > > > u32 vq_num;
> > > > > > > > > > > > u32 vq_align;
> > > > > > > > > > > > u32 ngroups;
> > > > > > > > > > > > - struct vduse_umem *umem;
> > > > > > > > > > > > + u32 nas;
> > > > > > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > > > > > - struct mutex mem_lock;
> > > > > > > > > > > > unsigned int bounce_size;
> > > > > > > > > > > > struct mutex domain_lock;
> > > > > > > > > > > > };
> > > > > > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > }
> > > > > > > > > > > >
> > > > > > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > > > > > u64 start, u64 last)
> > > > > > > > > > > > {
> > > > > > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > > return -EINVAL;
> > > > > > > > > > > >
> > > > > > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > > > > > - msg.req.iova.start = start;
> > > > > > > > > > > > - msg.req.iova.last = last;
> > > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > > > > > + msg.req.iova.start = start;
> > > > > > > > > > > > + msg.req.iova.last = last;
> > > > > > > > > > > > + } else {
> > > > > > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > > > > > + }
> > > > > > > > > > > >
> > > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > }
> > > > > > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > > > > > return mask;
> > > > > > > > > > > > }
> > > > > > > > > > > >
> > > > > > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > > > > > +{
> > > > > > > > > > > > + /*
> > > > > > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > > > > > + */
> > > > > > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > > > > > >
> > > > > > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > What's the use? The only thing left here is to return from
> > > > > > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > > > > > here, do we?
> > > > > > > > >
> > > > > > > > > See below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > > > > > {
> > > > > > > > > > > > int i;
> > > > > > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > > > > > >
> > > > > > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > > > > > +
> > > > > > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > > + }
> > > > > > > > > > > > +
> > > > > > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > > > > > >
> > > > > > > > > > > > down_write(&dev->rwsem);
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > > > > > return ret;
> > > > > > > > > > > > }
> > > > > > > > > > > >
> > > > > > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > > > > > + unsigned int asid)
> > > > > > > > > > > > +{
> > > > > > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > > + int r;
> > > > > > > > > > > > +
> > > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > > > > > + return -EINVAL;
> > > > > > > > > > > > +
> > > > > > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > > > > > +
> > > > > > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > + if (r < 0)
> > > > > > > > > > > > + return r;
> > > > > > > > > > > > +
> > > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > > > > > + return 0;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > > > > > {
> > > > > > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > > int ret;
> > > > > > > > > > > >
> > > > > > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > > if (ret)
> > > > > > > > > > > > return ret;
> > > > > > > > > > > >
> > > > > > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > > > > > if (ret) {
> > > > > > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > > return ret;
> > > > > > > > > > > > }
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > > > > > };
> > > > > > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > > > > > enum dma_data_direction dir)
> > > > > > > > > > > > {
> > > > > > > > > > > > struct vduse_dev *vdev;
> > > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > > > > > >
> > > > > > > > > > > > if (!token.group)
> > > > > > > > > > > > return;
> > > > > > > > > > > >
> > > > > > > > > > > > vdev = token.group->dev;
> > > > > > > > > > > > - domain = vdev->domain;
> > > > > > > > > > > > -
> > > > > > > > > > > > + rcu_read_lock();
> > > > > > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > > > > > + domain = as->domain;
> > > > > > > > > > > > + rcu_read_unlock();
> > > > > > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > > > > > >
> > > > > > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > > > > > works.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > > > > > >
> > > > > > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > > > > > handled correctly.
> > > > > > > > >
> > > > > > > > > But I meant what happens if
> > > > > > > > >
> > > > > > > > > [cpu0]rcu_read_lock()
> > > > > > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > > > > > [cpu0]...
> > > > > > > > > [cpu0]rcu_read_unlock()
> > > > > > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > > > > > >
> > > > > > > >
> > > > > > > > That should go ok. What I'm trying to protect here is the iterations
> > > > > > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > > > > > >
> > > > > > > > I'm going to embed that function here in
> > > > > > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > > > > > the point easier:
> > > > > > > >
> > > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > > iova, size_t size, ...) {
> > > > > > > > read_lock(&token.group->as->domain);
> > > > > > > > while (size)
> > > > > > > > map = token.group->as->domain->bounce_maps[iova];
> > > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > > >
> > > > > > > > ...
> > > > > > > > page = token_group->as->domain->bounce_maps
> > > > > > > > addr = kmap_local_page(page);
> > > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > > kunmap_local(addr);
> > > > > > > > size -= sz;
> > > > > > > > iova += sz;
> > > > > > > > }
> > > > > > > > read_unlock(&token.group->as->domain);
> > > > > > > > }
> > > > > > >
> > > > > > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > > > > > >
> > > > > > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > > > > > for the "read lock" to be done.
> > > > > >
> > > > > > No, it doesn't need to wait as long as the reader part uses its own copy.
> > > > >
> > > > > It probably won't crash but I meant if we have logic issues. For
> > > > > example, once set_group_asid() return, there should still be a pending
> > > > > DMA that is using the old as.
> > > > >
> > > > > >
> > > > > > > But this is not the logic that is
> > > > > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > > > > vduse_set_group_asid_nomsg().
> > > > > >
> > > > > > We only set the pointer on the writer's side, we do nothing like
> > > > > > freeing resources. Should we set the pointer before or after
> > > > > > syncrhonize_rcu()? What do we need to do on the other side of
> > > > > > syncrhonize_rcu()?
> > > > >
> > > > > Usually we don't need special care on the read side. But as discussed,
> > > > > synchronize_rcu() is not a must but we need to explain why it is safe
> > > > > and I'm not sure Michael is fine with that.
> > > > > If we just want to make sure the order of publish and read, we can
> > > > > switch to use smp_store_release() and smp_load_acqurie().
> > > > >
> > > > > >
> > > > > > > We need to explain why set_group_asid()
> > > > > > > doesn't need to wait and if this is true, we probably don't need RCU
> > > > > > > but to make sure the load/store is atomic.
> > > > > > >
> > > > > >
> > > > > > What about:
> > > > > >
> > > > > > * It does not matter if other thread modify group->as as long as the
> > > > > > reader uses the same as for all its operation. It performs a local
> > > > > > copy for that reason.
> > > > > > * It does not matter if multiple threads modify group->as as long as
> > > > > > the update is atomic.
> > > > >
> > > > > See above reply.
> > > > >
> > > > > >
> > > > > > ?
> > > > > >
> > > > > > > >
> > > > > > > > Now, depending on the point where another execution thread changes
> > > > > > > > token_group->as and how the compiler has chosen to generate the
> > > > > > > > machine code, the outcome could be:
> > > > > > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > > > > > another as is unlocked.
> > > > > > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > > > > > for the other one. So we access an invalid offset in
> > > > > > > > bounce_maps[iova].
> > > > > > > >
> > > > > > > > And I guess there are other possible outcomes too.
> > > > > > > >
> > > > > > > > So I need to make sure that the pointer accesses in all
> > > > > > > > vduse_domain_bounce is coherent.
> > > > > > >
> > > > > > > I'm not sure I got here, but it looks like it accepts a domain
> > > > > > > parameter and is protected by the bounce lock so we are probably fine
> > > > > > > here?
> > > > > > >
> > > > > >
> > > > > > The bounce lock only protects the iotlb tree, not the pointer to that
> > > > > > iotlb tree.
> > > > > >
> > > > > > > > I'm ok if it takes the one before the
> > > > > > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > > > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > > > > > change in the middle of the operation:
> > > > > > > >
> > > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > > iova, size_t size, ...) {
> > > > > > > > as = token.group->as;
> > > > > > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > > > > > read_lock(&as->domain);
> > > > > > > > while (size)
> > > > > > > > map = as->domain->bounce_maps[iova];
> > > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > > >
> > > > > > > > ...
> > > > > > > > page = as->domain->bounce_maps
> > > > > > > > addr = kmap_local_page(page);
> > > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > > kunmap_local(addr);
> > > > > > > > size -= sz;
> > > > > > > > iova += sz;
> > > > > > > > }
> > > > > > > > read_unlock(&as->domain);
> > > > > > > > }
> > > > > > > >
> > > > > > > > That can be done in many ways. Probably the read_lock is already
> > > > > > > > enough but it is not explicit that it is protecting token.group->as,
> > > > > > > > and future changes could remove it. To me, RCU is the most clear way
> > > > > > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > > > > > >
> > > > > > > I wonder if another group rwlock is sufficient here:
> > > > > > >
> > > > > > > for set_group_as_id()
> > > > > > >
> > > > > > > write_lock(&dev->groups[group].lock);
> > > > > > > dev->groups[group].as = &dev->as[asid];
> > > > > > > write_unlock(&dev->groups[group].lock);
> > > > > > >
> > > > > > > for the case where we need defer as
> > > > > > >
> > > > > > > read_lock(&dev->groups[group].lock);
> > > > > > > as = dev->groups[group].as;
> > > > > > > //using as
> > > > > > > read_unlock(&dev->groups[group].lock);
> > > > > > >
> > > > > > > If this works, we don't need to bother with thinking if the
> > > > > > > wait/synchronizre_rcu() is really needed or not?
> > > > > > >
> > > > > >
> > > > > > A rwlock is sufficient but we need to modify the allocation code
> > > > > > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > > > > > read lock in the DMA ops too.
> > > > >
> > > > > Right, but it would always be a balance. We can make sure it works
> > > > > correctly first then do optimization on top.
> > > > >
> > > > > >
> > > > > > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > > > > > group ASID change needs to wait for the DMA operation to finish
> > > > > > instead of just applying for the next DMA ops. Not like vq group ASID
> > > > > > change would be in the hot path anyway, just pointing it out.
> > > > > >
> > > > > > > >
> > > > > > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > > > > > If this is an issue, we need to fix it.
> > > > > > > > >
> > > > > > > > > It's basically a question that
> > > > > > > > >
> > > > > > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > > > > > to the new as
> > > > > > > >
> > > > > > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > > > > > operation to continue. It is racy and the operation could fail, but
> > > > > > > > the kernel just returns an error and doesn't access invalid memory or
> > > > > > > > similar.
> > > > > > >
> > > > > > > See below.
> > > > > > >
> > > > > > > >
> > > > > > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > > > > > DMA to avoid potential issue
> > > > > > > > >
> > > > > > > >
> > > > > > > > No, the group will outlive the operation as it is bound to the device.
> > > > > > >
> > > > > > > I meant e.g the DMA could be triggered by the device. For example, the
> > > > > > > device may try to trigger an interrupt when the kernel is trying to
> > > > > > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > > > > > memory etc.
> > > > > >
> > > > > > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > > > > > group sent to the guest the race does not matter anymore: it is
> > > > > > explicitly opening the possibility from the guest to poke QEMU's
> > > > > > memory unless the guest is totally paused.
> > > > >
> > > > > Basically what I meant, assuming group0.as = as0
> > > > >
> > > > > cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> > > > > cpu1] set_group_asid(group0.as, as1)
> > > > > cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
[1]
> > > > >
> > > > > cpu0 may read as1 while it wants as0 actually?
> > > > >
> > > >
> > > > Yes, kind of. That's my point: adding synchronization at vduse level
> > > > does not fix it.
> > > >
> > > > There is no way to do that call from vhost/vdpa or userland, as there
> > > > is no way to get the AS of a vq group, only to set it. The closest
> > > > thing is to add a cache at that level, but that implies to add
> > > > mutithreading sync on that upper layer, either vhost/vdpa or userland,
> > > > not in VDUSE.
[2]
> > >
> > > Probably.
> > >
> > > >
> > > > From vhost/vdpa level, all mapping calls (.set_map, .dma_map,
> > > > .dma_unmap) calls take the ASID directly, not the vq group. So the
> > > > call to set_group_asid does not need to access the vq group.
> > > >
[3]
> > > > Now let's say that we add that vdpa_ops callback (and ioctls) that
> > > > maps and unmap based on a vq_group. And all of the operations
> > > > (dma_map, set_group_asid, and dma_unmap) are serialized by taking the
> > > > same mutex. cpu0 still may dma_unmap over as0 if set_group_asid is not
> > > > properly serialized at vhost/vdpa or userland level:
> > > >
> > > > void* thread0_func(void* arg) {
> > > > struct {
> > > > int vq_group = 0,
> > > > int iova, size, perm, ...
> > > > } s;
> > > > int fd = (intptr_t)arg;
> > > >
> > > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);
>
> I don't get the semantic of VHOST_VDPA_VQ_GROUP_DMA_MAP. And it's
> probably wrong to allow userspace to map based on group.
>
> We have VHOST_IOTLB_UPDATE, which maps pages based on the AS (asid); this is correct.
>
Yes, I said that at [2], in reply to your scenario at [1] :). Did I
misunderstand?
> More below
>
> > > > // TODO: Signal thread0 that it can proceed with SET_GROUP_ASID
> > > > // TODO: Wait until thread0 complete SET_GROUP_ASID
> > > >
> > > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &data);
> > > >
> > > > return NULL;
> > > > }
> > > >
> > > > void* thread1_func(void* arg) {
> > > > struct vhost_vring_state s = {
> > > > .index = 0,
> > > > .num = 1,
> > > > };
> > > > int fd = (int)(intptr_t)arg;
> > > >
> > > > // TODO: Wait until thread2 calls dma_map
> > >
> > > This is something exactly rwlock or synchronize_rcu() can do? Or is
> > > this the charge of the vDPA parent to do that?
> > >
> >
> > No, that's an userland synchronization problem that cannot be solved
> > at kernel level. If we need to do something similar at vdpa core level
> > (unlikely), we will need to synchronize at that level too.
>
> Ok, basically I think we are talking about different things. That's fine.
>
> If I understand you correctly, you mean we need to synchronize between
> IOTLB updating (IOTLB_UPDATE/IOTLB_INVALIDATE) and set_group_asid()?
>
> This sounds unnecessary since:
>
> 1) IOTLB_UPDATE/IOTLB_INVALIDATE is updating the address space internal mappings
> 2) set_group_asid() is to assign an AS to a group
>
Right, we're on the same page here [3].
> >
> > If we add an rwlock but don't implement the synchronization in the
> > userland program marked as TODO, this sequence is also possible:
> >
> > cpu0] DMA map ioctl call in userland
> > -> Take read_lock in the vduse module
> > -> Update the IOTLB tree of ASID 0
> > -> unlock read_lock
> >
> > Now we have two possibilities: either cpu0] DMA_UNMAP is called or
> > cpu1] set_group_asid is called. The VDUSE module rwlock protects that
> > they will not run at the same time, but we need to implement something
> > at the userland level if we want a predictive outcome, marked as the
> > TODO in the comments. If, by chance, the DMA unmap is the one that
> > comes next, the AS updated is the 0:
> >
> > cpu0] DMA unmap ioctl call in userland
> > -> Take read_lock in the vduse module
> > -> Update the IOTLB tree of ASID 0
> > -> unlock read_lock
> > cpu1] set_group_asid ioctl call in userland
> > -> Take write_lock in the vduse module
> > -> Update ASID of the VQ GROUP 0 to 1
> > -> unlock write_lock
> >
> > If set_group_asid run first by chance, ASID 1 is the one that is updated:
> > cpu1] set_group_asid ioctl call in userland
> > -> Take write_lock in the vduse module
> > -> Update ASID of the VQ GROUP 0 to 1
> > -> unlock write_lock
> > cpu0] DMA unmap ioctl call in userland
> > -> Take read_lock in the vduse module
> > -> Update the IOTLB tree of ASID 0
> > -> unlock read_lock
> >
> > On the other hand we have this version of the series that allows these
> > actions to run at the same time. It just makes sure that the update of
> > the IOTLB tree is coherent, by copying the vq group ASID value at one
> > point in time and making sure it sticks to that ASID until the end of
> > the set_map call. I'm not adding the synchronize_rcu call because we
> > stated it should not be called from userland, but the outcome is
> > similar.
>
> I think we should figure out if VHOST_VDPA_VQ_GROUP_DMA_MAP is useful or not.
>
It's not. I just wanted to expand how I understood your scenario, but
let me know if I missed something!
> >
> > Let me put another example: It's like calling fdup() and write() from
> > different threads over the same set of fds without userland
> > synchronization to me. The kernel protects things like not write part
> > of the content in one file and another part on the other file, but the
> > content of the files at the end of the writes() is just not
> > predictable. And the kernel just cannot make it predictable.
> >
> > static int fd_a;
> > static int fd_b;
> >
> > void *write_thread(void *arg) {
> > const char *msg = "Writing to fd_b\n";
> > write(fd_b, msg, strlen(msg));
> > return NULL;
> > }
> >
> > void *dup_thread(void *arg) {
> > dup2(fd_a, fd_b);
> > return NULL;
> > }
> >
> > int main() {
> > pthread_t writer, duper;
> >
> > fd_a = open("/tmp/a", O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > fd_b = open("/tmp/b", O_WRONLY | O_CREAT | O_TRUNC, 0644);
> >
> > pthread_create(&writer, NULL, write_thread, NULL);
> > pthread_create(&duper, NULL, dup_thread, NULL);
> >
> > pthread_join(writer, NULL);
> > pthread_join(duper, NULL);
> >
> > close(fd_a);
> > close(fd_b);
> >
> > return 0;
> > }
> > --
> >
> > > If it's the responsibility of the parent, it would be much harder for
> > > VDUSE as the datapath is implemented in usersapce via mmap() or
> > > umem_reg.
> > >
> > > Looking at existing implementation:
> > >
> > > - for mlx5e, it looks like it assumes the set_group_asid() works only
> > > without DRIVER_OK.
> > > - for the simulator, it looks like it can synchronize with the
> > > datapath with the spinlock as datapath is emulated
> > >
> >
> > Yes, the next version will use the rwlock spinlock. I just need to
> > take out the allocation of the domain for it to be valid.
> >
> > > > ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s)
> > > > // TODO: Signal thread2 that can proceed with dma_unmap
> > >
> > > But the issue I mention is that the, from the view of the vDPA bus:
> > >
> > > 1) it offers set_group_as_id()
> > > 2) it doesn't know if virtio-vdpa or vhost-vdpa is used
> > >
> > > So theoretically, set_group_as_id() could happen between
> > >
> > > dma_addr = dma_map();
> > >
> > > and
> > >
> > > dma_unmap(dma_adrr);
> > >
> > > But those two dma_addr refers to the different address space.
> >
> > I don't get this, these calls take the ASID as the parameter, not the
> > vq group. I thought this was by design, as telling what vq groups
> > update seems way more difficult to me. Can you put an example of an
> > userland application that has the race you describe with the existing
> > ioctls?
>
> Just to clarify, for dma_map()/dma_unmap() I don't mean the UAPI part
> (and we don't have that). I basically mean the DMA API which is used
> by virtio-vDPA.
>
So let's say that the virtio_vdpa driver is able to call
.set_group_asid, just to explain how much is protected in
->vduse_dev_map_page and ->vduse_dev_unmap_page.

In this series, rcu_dereference makes sure that only the stack copy of
the domain pointer is used. In the next version the code will take a
read_lock, so set_group_asid actions on token.group->as cannot run at
the same time as the vduse_domain_map_page(...) or
vduse_domain_unmap_page(...) call running on cpu0:
static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
                                     unsigned long offset, size_t size,
                                     enum dma_data_direction dir,
                                     unsigned long attrs)
{
        struct vduse_iova_domain *domain;
        dma_addr_t r;

        if (!token.group)
                return DMA_MAPPING_ERROR;

        read_lock(&token.group->as_lock);
        domain = token.group->as->domain;
        r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
        read_unlock(&token.group->as_lock);

        return r;
}

static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
                                 size_t size, enum dma_data_direction dir,
                                 unsigned long attrs)
{
        struct vduse_iova_domain *domain;

        if (!token.group)
                return;

        read_lock(&token.group->as_lock);
        domain = token.group->as->domain;
        vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
        read_unlock(&token.group->as_lock);
}
---
-> vduse_set_group_asid will take the corresponding write_lock, so it
cannot modify any vq group AS pointer while pages are being mapped or
unmapped from a domain:

static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
                                       unsigned int group, unsigned int asid)
{
        write_lock(&dev->groups[group].as_lock);
        dev->groups[group].as = &dev->as[asid];
        write_unlock(&dev->groups[group].as_lock);
}
---
Now your question:

assuming group0.as = as0

cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
cpu1] set_group_asid(group0.as, as1)
cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)

cpu0 may read as1 while it wants as0 actually?
--

Yes, it can happen even with the rwlock or any other synchronization
in VDUSE, as it does not order the vdpa core calls.

This is the code I have in mind to prove this; let me know if I'm
missing something from your scenario:
static int thread1_func(void *data)
{
        vduse_dev_map_page(token, ...);
        printk(KERN_INFO "map_page called\n");

        // TODO: Need a way to know if the group ASID has changed to
        // know if this unmap is valid or not.
        vduse_dev_unmap_page(token, ...);
        printk(KERN_INFO "unmap_page called\n");
        return 0;
}

static int thread2_func(void *data)
{
        u32 vq_group = 0;
        u32 asid = 1;

        vdpa->ops->set_group_asid(global_vdpa, vq_group, asid);
        printk(KERN_INFO "set_group_asid called\n");
        return 0;
}

static int __init kernel_threads_init(void)
{
        struct task_struct *thread1, *thread2;

        thread1 = kthread_run(thread1_func, NULL, "vdpa_map_unmap");
        thread2 = kthread_run(thread2_func, NULL, "vdpa_set_asid");
        return 0;
}
---
I keep thinking that this is a race condition in the hypothetical vdpa
core or the module that runs this code, not something solvable in
VDUSE. We need to add barriers at that level, not in VDUSE.

But I think it is a good idea to add some code in VDUSE to mitigate
it. Maybe just adding the expected vq index to vdev->map->map_page and
vdev->map->unmap_page? This way we can issue an error if the race
happens.
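
Something like this minimal sketch is what I have in mind; the extra
vq index parameter and the vduse_iova_in_domain() helper are
hypothetical here, only to show where the error would be raised:

/*
 * Hypothetical unmap variant that also receives the vq index, so VDUSE
 * can detect that the group ASID changed between map and unmap.
 */
static void vduse_dev_unmap_page_checked(union virtio_map token, u16 vq_idx,
                                         dma_addr_t dma_addr, size_t size,
                                         enum dma_data_direction dir,
                                         unsigned long attrs)
{
        struct vduse_vq_group *group = token.group;
        struct vduse_iova_domain *domain;

        if (!group)
                return;

        read_lock(&group->as_lock);
        domain = group->as->domain;
        /*
         * vduse_iova_in_domain() is a made-up helper: it would return
         * false if dma_addr was not allocated from this domain, i.e. the
         * group ASID changed since the corresponding map call.
         */
        if (!vduse_iova_in_domain(domain, dma_addr))
                pr_warn("vduse: unmap on vq %u hit a different ASID\n", vq_idx);
        else
                vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
        read_unlock(&group->as_lock);
}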
> >
> > > Instead
> > > of trying to do synchronization, maybe we can simply fail
> > > set_group_asid if DRIVER_OK is set.
> > >
> >
> > That's a good possibility, especially since mlx5 already does it.
> > There is ongoing work to enable dataplane SVQ dynamically without
> > needing to reset the whole device, but we will need a new feature flag
> > to know if the parent driver supports it.
>
> Did you mean the SVQ may update the group asid while DRIVER_OK is set?
> If yes, we need to fix that.
>
Not at this moment.
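
If we end up failing set_group_asid() when DRIVER_OK is set, the check
itself should be simple. A minimal sketch, assuming dev->status can be
read safely at this point:

static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
                                unsigned int asid)
{
        struct vduse_dev *dev = vdpa_to_vduse(vdpa);

        if (dev->api_version < VDUSE_API_VERSION_1 ||
            group >= dev->ngroups || asid >= dev->nas)
                return -EINVAL;

        /* Sketch: reject ASID changes once the driver is running. */
        if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
                return -EBUSY;

        /* ... send VDUSE_SET_VQ_GROUP_ASID and update groups[group].as ... */
        return 0;
}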
> >
> > > >
> > > > return NULL;
> > > > }
> > > >
> > > > int main() {
> > > > pthread_t thread0, thread1;
> > > > int fd = open("/dev/vhost-vdpa-0", ...)
> > > >
> > > > pthread_create(&thread0, NULL, thread0_func, (void *)(intptr_t)fd);
> > > > pthread_create(&thread1, NULL, thread1_func, (void *)(intptr_t)fd);
> > > >
> > > > pthread_join(thread1, NULL);
> > > > pthread_join(thread2, NULL);
> > > >
> > > > return EXIT_SUCCESS;
> > > > }
> > > > ---
> > > >
> > > > We need something to synchronize at userland level here, filling the TODOs.
> > > >
> > > > We can replace the hypothetical VHOST_VDPA_VQ_GROUP_DMA_MAP and _UNMAP
> > > > with access to vqs, and the result is the same: The userland
> > > > application is the one that needs to serialize the access from thread0
> > > > if it wants predictive outcome against the accesses from thread1.
> > > > There is no way to do it at vduse level.
> > > >
> > > > We need some syncrhonization to avoid malicious or buggy userland to
> > > > mess things, that's for sure. So DMA_MAP and DMA_UNMAP does not half
> > > > update the iotlb tree. And I'll send the next version with rwlock,
> > > > protecting as much as possible. But I want to make clear that it will
> > > > not avoid the race you describe here.
> > >
> > > See above.
> > >
> > > >
> > > > > >
> > > > > > > But if you mean we depend on the IOTLB to guard against
> > > > > > > this, I'm fine, but let's document why we don't need it and how the
> > > > > > > IOTLB layer can help to eliminate such risk.
> > > > > > >
> > > > > >
> > > > > > No, I'm not happy about letting iotlb lock to protect this too as
> > > > > > they're at different levels actually: One is protecting iotlb trees
> > > > > > modifications while they're being read and the other is protecting the
> > > > > > ASID assignment to different vq groups. To reuse them means a
> > > > > > modification in any tree blocks the change of vq group ASID, for
> > > > > > example.
> > > > > >
> > > > > > > Anyhow, tracking and failing seems to be more robust.
> > > > > >
> > > > > > I'm not sure I get this. If a DMA read starts in one ASID and then
> > > > > > QEMU changes the ASID of the vq group, do you prefer it to fail rather
> > > > > > than continue reading from the original ASID?
> > > > >
> > > > > If possible, it would be better.
> > > > >
> > > >
> > > > It's hard for me to think in some way that does not add a lot of
> > > > overhead or it is very complex. But it is good to know this is
> > > > acceptable.
> > > >
> > > > > > It seems hard to
> > > > > > communicate that the ASID has changed to the DMA operation callback.
> > > > >
> > > > > Maybe we can encode this into iova.
> > > > >
> > > >
> > > > I'm adding the full rwlock, but can you expand on your idea on this?
> > >
> > > Encode the asid to the upper bits of IOVA, so when doing dma_unmap()
> > > we can compare the group/as with the one that is encoded in dma_addr.
> > > If it differs, warn or bug.
> > >
> >
> > But the ASID is already a parameter in the dma_unmap. I thought you
> > meant to encode in the device's memory read.
>
> To avoid unmapping an address that does not belong to this ASID.
>
> >
> > > If we can make sure there's no set_group_asid() when DRIVER_OK is set,
> > > it's not a must then (or could be treated as a kind of hardening).
> > >
> > > > It would be great to have it documented in case we need future
> > > > optimizations.
> > > >
> > >
> > > Thanks
> > >
>
> Thanks
>
On Fri, Dec 5, 2025 at 11:40 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Fri, Dec 5, 2025 at 2:52 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Dec 4, 2025 at 4:33 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> > >
> > > On Thu, Dec 4, 2025 at 3:15 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Dec 3, 2025 at 3:58 PM Eugenio Perez Martin <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Thu, Nov 20, 2025 at 2:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Nov 19, 2025 at 5:27 PM Eugenio Perez Martin
> > > > > > <eperezma@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Nov 19, 2025 at 3:39 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Nov 17, 2025 at 8:16 PM Eugenio Perez Martin
> > > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Nov 17, 2025 at 5:23 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, Nov 14, 2025 at 7:25 PM Eugenio Perez Martin
> > > > > > > > > > <eperezma@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Fri, Nov 14, 2025 at 1:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Thu, Nov 13, 2025 at 7:56 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > Add support for assigning Address Space Identifiers (ASIDs) to each VQ
> > > > > > > > > > > > > group. This enables mapping each group into a distinct memory space.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Now that the driver can change ASID in the middle of operation, the
> > > > > > > > > > > > > domain that each vq address point is also protected by domain_lock.
> > > > > > > > > > > >
> > > > > > > > > > > > Maybe it's better to document what is protected by RCU and how.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I added the _rcu annotation but I can expand it for sure. I can also
> > > > > > > > > > > modify the commit message.
> > > > > > > > > > >
> > > > > > > > > > > > More below.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Acked-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > >
> > > > > > > > > > > I forgot to remove this, my bad!
> > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > v9:
> > > > > > > > > > > > > * Replace mutex with RCU, as the vdpa map_ops can run from atomic
> > > > > > > > > > > > > context.
> > > > > > > > > > > > >
> > > > > > > > > > > > > v8:
> > > > > > > > > > > > > * Revert the mutex to rwlock change, it needs proper profiling to
> > > > > > > > > > > > > justify it.
> > > > > > > > > > > > >
> > > > > > > > > > > > > v7:
> > > > > > > > > > > > > * Take write lock in the error path (Jason).
> > > > > > > > > > > > >
> > > > > > > > > > > > > v6:
> > > > > > > > > > > > > * Make vdpa_dev_add use gotos for error handling (MST).
> > > > > > > > > > > > > * s/(dev->api_version < 1) ?/(dev->api_version < VDUSE_API_VERSION_1) ?/
> > > > > > > > > > > > > (MST).
> > > > > > > > > > > > > * Fix struct name not matching in the doc.
> > > > > > > > > > > > >
> > > > > > > > > > > > > v5:
> > > > > > > > > > > > > * Properly return errno if copy_to_user returns >0 in VDUSE_IOTLB_GET_FD
> > > > > > > > > > > > > ioctl (Jason).
> > > > > > > > > > > > > * Properly set domain bounce size to divide equally between nas (Jason).
> > > > > > > > > > > > > * Exclude "padding" member from the only >V1 members in
> > > > > > > > > > > > > vduse_dev_request.
> > > > > > > > > > > > >
> > > > > > > > > > > > > v4:
> > > > > > > > > > > > > * Divide each domain bounce size between the device bounce size (Jason).
> > > > > > > > > > > > > * revert unneeded addr = NULL assignment (Jason)
> > > > > > > > > > > > > * Change if (x && (y || z)) return to if (x) { if (y) return; if (z)
> > > > > > > > > > > > > return; } (Jason)
> > > > > > > > > > > > > * Change a bad multiline comment, using @ caracter instead of * (Jason).
> > > > > > > > > > > > > * Consider config->nas == 0 as a fail (Jason).
> > > > > > > > > > > > >
> > > > > > > > > > > > > v3:
> > > > > > > > > > > > > * Get the vduse domain through the vduse_as in the map functions
> > > > > > > > > > > > > (Jason).
> > > > > > > > > > > > > * Squash with the patch creating the vduse_as struct (Jason).
> > > > > > > > > > > > > * Create VDUSE_DEV_MAX_AS instead of comparing agains a magic number
> > > > > > > > > > > > > (Jason)
> > > > > > > > > > > > >
> > > > > > > > > > > > > v2:
> > > > > > > > > > > > > * Convert the use of mutex to rwlock.
> > > > > > > > > > > > >
> > > > > > > > > > > > > RFC v3:
> > > > > > > > > > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
> > > > > > > > > > > > > value to reduce memory consumption, but vqs are already limited to
> > > > > > > > > > > > > that value and userspace VDUSE is able to allocate that many vqs.
> > > > > > > > > > > > > * Remove TODO about merging VDUSE_IOTLB_GET_FD ioctl with
> > > > > > > > > > > > > VDUSE_IOTLB_GET_INFO.
> > > > > > > > > > > > > * Use of array_index_nospec in VDUSE device ioctls.
> > > > > > > > > > > > > * Embed vduse_iotlb_entry into vduse_iotlb_entry_v2.
> > > > > > > > > > > > > * Move the umem mutex to asid struct so there is no contention between
> > > > > > > > > > > > > ASIDs.
> > > > > > > > > > > > >
> > > > > > > > > > > > > RFC v2:
> > > > > > > > > > > > > * Make iotlb entry the last one of vduse_iotlb_entry_v2 so the first
> > > > > > > > > > > > > part of the struct is the same.
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 370 ++++++++++++++++++++---------
> > > > > > > > > > > > > include/uapi/linux/vduse.h | 53 ++++-
> > > > > > > > > > > > > 2 files changed, 314 insertions(+), 109 deletions(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > > index 97be04f73fbf..ff95ed56f22d 100644
> > > > > > > > > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > > > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > > > > > > #include "linux/virtio_net.h"
> > > > > > > > > > > > > #include <linux/init.h>
> > > > > > > > > > > > > #include <linux/module.h>
> > > > > > > > > > > > > +#include <linux/rcupdate.h>
> > > > > > > > > > > > > #include <linux/cdev.h>
> > > > > > > > > > > > > #include <linux/device.h>
> > > > > > > > > > > > > #include <linux/eventfd.h>
> > > > > > > > > > > > > @@ -41,6 +42,7 @@
> > > > > > > > > > > > >
> > > > > > > > > > > > > #define VDUSE_DEV_MAX (1U << MINORBITS)
> > > > > > > > > > > > > #define VDUSE_DEV_MAX_GROUPS 0xffff
> > > > > > > > > > > > > +#define VDUSE_DEV_MAX_AS 0xffff
> > > > > > > > > > > > > #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> > > > > > > > > > > > > #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> > > > > > > > > > > > > #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> > > > > > > > > > > > > @@ -86,7 +88,14 @@ struct vduse_umem {
> > > > > > > > > > > > > struct mm_struct *mm;
> > > > > > > > > > > > > };
> > > > > > > > > > > > >
> > > > > > > > > > > > > +struct vduse_as {
> > > > > > > > > > > > > + struct vduse_iova_domain *domain;
> > > > > > > > > > > > > + struct vduse_umem *umem;
> > > > > > > > > > > > > + struct mutex mem_lock;
> > > > > > > > > > > > > +};
> > > > > > > > > > > > > +
> > > > > > > > > > > > > struct vduse_vq_group {
> > > > > > > > > > > > > + struct vduse_as *as __rcu;
> > > > > > > > > > > > > struct vduse_dev *dev;
> > > > > > > > > > > > > };
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -94,7 +103,7 @@ struct vduse_dev {
> > > > > > > > > > > > > struct vduse_vdpa *vdev;
> > > > > > > > > > > > > struct device *dev;
> > > > > > > > > > > > > struct vduse_virtqueue **vqs;
> > > > > > > > > > > > > - struct vduse_iova_domain *domain;
> > > > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > > > char *name;
> > > > > > > > > > > > > struct mutex lock;
> > > > > > > > > > > > > spinlock_t msg_lock;
> > > > > > > > > > > > > @@ -122,9 +131,8 @@ struct vduse_dev {
> > > > > > > > > > > > > u32 vq_num;
> > > > > > > > > > > > > u32 vq_align;
> > > > > > > > > > > > > u32 ngroups;
> > > > > > > > > > > > > - struct vduse_umem *umem;
> > > > > > > > > > > > > + u32 nas;
> > > > > > > > > > > > > struct vduse_vq_group *groups;
> > > > > > > > > > > > > - struct mutex mem_lock;
> > > > > > > > > > > > > unsigned int bounce_size;
> > > > > > > > > > > > > struct mutex domain_lock;
> > > > > > > > > > > > > };
> > > > > > > > > > > > > @@ -314,7 +322,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
> > > > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > > }
> > > > > > > > > > > > >
> > > > > > > > > > > > > -static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > > > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
> > > > > > > > > > > > > u64 start, u64 last)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > > > @@ -323,8 +331,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
> > > > > > > > > > > > > return -EINVAL;
> > > > > > > > > > > > >
> > > > > > > > > > > > > msg.req.type = VDUSE_UPDATE_IOTLB;
> > > > > > > > > > > > > - msg.req.iova.start = start;
> > > > > > > > > > > > > - msg.req.iova.last = last;
> > > > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) {
> > > > > > > > > > > > > + msg.req.iova.start = start;
> > > > > > > > > > > > > + msg.req.iova.last = last;
> > > > > > > > > > > > > + } else {
> > > > > > > > > > > > > + msg.req.iova_v2.start = start;
> > > > > > > > > > > > > + msg.req.iova_v2.last = last;
> > > > > > > > > > > > > + msg.req.iova_v2.asid = asid;
> > > > > > > > > > > > > + }
> > > > > > > > > > > > >
> > > > > > > > > > > > > return vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > > }
> > > > > > > > > > > > > @@ -436,14 +450,32 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
> > > > > > > > > > > > > return mask;
> > > > > > > > > > > > > }
> > > > > > > > > > > > >
> > > > > > > > > > > > > +/* Force set the asid to a vq group without a message to the VDUSE device */
> > > > > > > > > > > > > +static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> > > > > > > > > > > > > + unsigned int group, unsigned int asid)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > + * Two concurrent updates to this pointer are valid as they cannot
> > > > > > > > > > > > > + * point to an invalid region. It is ok for them to race as long as
> > > > > > > > > > > > > + * the readers see a consistent state through RCU.
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > + rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
> > > > > > > > > > > >
> > > > > > > > > > > > I'd expect at least a synchronize_rcu() here to wait for the read is done?
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > What's the use? The only thing left here is to return from
> > > > > > > > > > > vduse_set_group_asid_nomsg, and we don't need to wait for readers
> > > > > > > > > > > here, do we?
> > > > > > > > > >
> > > > > > > > > > See below.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > static void vduse_dev_reset(struct vduse_dev *dev)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > int i;
> > > > > > > > > > > > > - struct vduse_iova_domain *domain = dev->domain;
> > > > > > > > > > > > >
> > > > > > > > > > > > > /* The coherent mappings are handled in vduse_dev_free_coherent() */
> > > > > > > > > > > > > - if (domain && domain->bounce_map)
> > > > > > > > > > > > > - vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > > > + for (i = 0; i < dev->nas; i++) {
> > > > > > > > > > > > > + struct vduse_iova_domain *domain = dev->as[i].domain;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + if (domain && domain->bounce_map)
> > > > > > > > > > > > > + vduse_domain_reset_bounce_map(domain);
> > > > > > > > > > > > > + }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + for (i = 0; i < dev->ngroups; i++)
> > > > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, i, 0);
> > > > > > > > > > > > >
> > > > > > > > > > > > > down_write(&dev->rwsem);
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -623,6 +655,29 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
> > > > > > > > > > > > > return ret;
> > > > > > > > > > > > > }
> > > > > > > > > > > > >
> > > > > > > > > > > > > +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
> > > > > > > > > > > > > + unsigned int asid)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > > > + struct vduse_dev_msg msg = { 0 };
> > > > > > > > > > > > > + int r;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + if (dev->api_version < VDUSE_API_VERSION_1 ||
> > > > > > > > > > > > > + group >= dev->ngroups || asid >= dev->nas)
> > > > > > > > > > > > > + return -EINVAL;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
> > > > > > > > > > > > > + msg.req.vq_group_asid.group = group;
> > > > > > > > > > > > > + msg.req.vq_group_asid.asid = asid;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + r = vduse_dev_msg_sync(dev, &msg);
> > > > > > > > > > > > > + if (r < 0)
> > > > > > > > > > > > > + return r;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + vduse_set_group_asid_nomsg(dev, group, asid);
> > > > > > > > > > > > > + return 0;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> > > > > > > > > > > > > struct vdpa_vq_state *state)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > @@ -794,13 +849,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> > > > > > > > > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > > > > > > > > > > > int ret;
> > > > > > > > > > > > >
> > > > > > > > > > > > > - ret = vduse_domain_set_map(dev->domain, iotlb);
> > > > > > > > > > > > > + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > > > if (ret)
> > > > > > > > > > > > > return ret;
> > > > > > > > > > > > >
> > > > > > > > > > > > > - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
> > > > > > > > > > > > > + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
> > > > > > > > > > > > > if (ret) {
> > > > > > > > > > > > > - vduse_domain_clear_map(dev->domain, iotlb);
> > > > > > > > > > > > > + vduse_domain_clear_map(dev->as[asid].domain, iotlb);
> > > > > > > > > > > > > return ret;
> > > > > > > > > > > > > }
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -843,6 +898,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> > > > > > > > > > > > > .get_vq_affinity = vduse_vdpa_get_vq_affinity,
> > > > > > > > > > > > > .reset = vduse_vdpa_reset,
> > > > > > > > > > > > > .set_map = vduse_vdpa_set_map,
> > > > > > > > > > > > > + .set_group_asid = vduse_set_group_asid,
> > > > > > > > > > > > > .get_vq_map = vduse_get_vq_map,
> > > > > > > > > > > > > .free = vduse_vdpa_free,
> > > > > > > > > > > > > };
> > > > > > > > > > > > > @@ -852,14 +908,17 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
> > > > > > > > > > > > > enum dma_data_direction dir)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > struct vduse_dev *vdev;
> > > > > > > > > > > > > + struct vduse_as *as;
> > > > > > > > > > > > > struct vduse_iova_domain *domain;
> > > > > > > > > > > > >
> > > > > > > > > > > > > if (!token.group)
> > > > > > > > > > > > > return;
> > > > > > > > > > > > >
> > > > > > > > > > > > > vdev = token.group->dev;
> > > > > > > > > > > > > - domain = vdev->domain;
> > > > > > > > > > > > > -
> > > > > > > > > > > > > + rcu_read_lock();
> > > > > > > > > > > > > + as = rcu_dereference(token.group->as);
> > > > > > > > > > > > > + domain = as->domain;
> > > > > > > > > > > > > + rcu_read_unlock();
> > > > > > > > > > > > > vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
> > > > > > > > > > > >
> > > > > > > > > > > > This is suspicious, at least we should do rcu_read_unlock() after
> > > > > > > > > > > > vduse_domain_sync_single_for_device(), otherwise I don't see how RCU
> > > > > > > > > > > > works.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > RCU is protecting that the address space pointer of the vq group is
> > > > > > > > > > > not modified concurrently with the access. Ideally, this should be a
> > > > > > > > > > > full lock, but just making sure that all accesses from the reader are
> > > > > > > > > > > coherent is enough. Userspace should expect nothing if it uses the map
> > > > > > > > > > > and modifies the vq group ASID at the same time anyway, but the kernel
> > > > > > > > > > > needs to be sure that it does not see intermediate states. TBH, we
> > > > > > > > > > > could move to a READ_ONCE / WRITE_ONCE, would that be more clear?
> > > > > > > > > >
> > > > > > > > > > Using READ_ONCE/WRITE_ONCE() needs to make sure the ordering is
> > > > > > > > > > handled correctly.
> > > > > > > > > >
> > > > > > > > > > But I meant what happens if
> > > > > > > > > >
> > > > > > > > > > [cpu0]rcu_read_lock()
> > > > > > > > > > [cpu0]as = rcu_dereference(token.group->as)
> > > > > > > > > > [cpu0]...
> > > > > > > > > > [cpu0]rcu_read_unlock()
> > > > > > > > > > [cpu1]rcu_assign_pointer(token.group->as)
> > > > > > > > > > [cpu0]vduse_domain_sync_single_for_device()
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > That should go ok. What I'm trying to protect here is the iterations
> > > > > > > > > in vduse_domain_sync_single_for_device -> vduse_domain_bounce.
> > > > > > > > >
> > > > > > > > > I'm going to embed that function here in
> > > > > > > > > vduse_dev_sync_single_for_device and omit RCU and some details to make
> > > > > > > > > the point easier:
> > > > > > > > >
> > > > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > > > iova, size_t size, ...) {
> > > > > > > > > read_lock(&token.group->as->domain);
> > > > > > > > > while (size)
> > > > > > > > > map = token.group->as->domain->bounce_maps[iova];
> > > > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > > > >
> > > > > > > > > ...
> > > > > > > > > page = token.group->as->domain->bounce_maps
> > > > > > > > > addr = kmap_local_page(page);
> > > > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > > > kunmap_local(addr);
> > > > > > > > > size -= sz;
> > > > > > > > > iova += sz;
> > > > > > > > > }
> > > > > > > > > read_unlock(&token.group->as->domain);
> > > > > > > > > }
> > > > > > > >
> > > > > > > > Right, so I meant for rwlock like semantic (let's forget the sleeping here).
> > > > > > > >
> > > > > > > > vduse_set_group_asid_nomsg() should use "write lock" so it must wait
> > > > > > > > for the "read lock" to be done.
> > > > > > >
> > > > > > > No, it doesn't need to wait as long as the reader part uses its own copy.
> > > > > >
> > > > > > It probably won't crash but I meant if we have logic issues. For
> > > > > > example, once set_group_asid() returns, there could still be a pending
> > > > > > DMA that is using the old as.
> > > > > >
> > > > > > >
> > > > > > > > But this is not the logic that is
> > > > > > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > > > > > vduse_set_group_asid_nomsg().
> > > > > > >
> > > > > > > We only set the pointer on the writer's side, we do nothing like
> > > > > > > freeing resources. Should we set the pointer before or after
> > > > > > > synchronize_rcu()? What do we need to do on the other side of
> > > > > > > synchronize_rcu()?
> > > > > >
> > > > > > Usually we don't need special care on the read side. But as discussed,
> > > > > > synchronize_rcu() is not a must but we need to explain why it is safe
> > > > > > and I'm not sure Michael is fine with that.
> > > > > > If we just want to make sure the order of publish and read, we can
> > > > > > switch to use smp_store_release() and smp_load_acquire().
> > > > > >
> > > > > > >
> > > > > > > > We need to explain why set_group_asid()
> > > > > > > > doesn't need to wait and if this is true, we probably don't need RCU
> > > > > > > > but to make sure the load/store is atomic.
> > > > > > > >
> > > > > > >
> > > > > > > What about:
> > > > > > >
> > > > > > > * It does not matter if another thread modifies group->as as long as the
> > > > > > > reader uses the same as for all its operations. It performs a local
> > > > > > > copy for that reason.
> > > > > > > * It does not matter if multiple threads modify group->as as long as
> > > > > > > the update is atomic.
> > > > > >
> > > > > > See above reply.
> > > > > >
> > > > > > >
> > > > > > > ?
> > > > > > >
> > > > > > > > >
> > > > > > > > > Now, depending on the point where another execution thread changes
> > > > > > > > > token_group->as and how the compiler has chosen to generate the
> > > > > > > > > machine code, the outcome could be:
> > > > > > > > > 1) The domain read lock of one ASID is taken but the domain lock of
> > > > > > > > > another as is unlocked.
> > > > > > > > > 2) We iterate until iova is ok for the ASID we're handling, but not
> > > > > > > > > for the other one. So we access an invalid offset in
> > > > > > > > > bounce_maps[iova].
> > > > > > > > >
> > > > > > > > > And I guess there are other possible outcomes too.
> > > > > > > > >
> > > > > > > > > So I need to make sure that the pointer accesses in all
> > > > > > > > > vduse_domain_bounce is coherent.
> > > > > > > >
> > > > > > > > I'm not sure I got here, but it looks like it accepts a domain
> > > > > > > > parameter and is protected by the bounce lock so we are probably fine
> > > > > > > > here?
> > > > > > > >
> > > > > > >
> > > > > > > The bounce lock only protects the iotlb tree, not the pointer to that
> > > > > > > iotlb tree.
> > > > > > >
> > > > > > > > > I'm ok if it takes the one before the
> > > > > > > > > concurrent call to vduse_set_group_asid_nomsg or the one after that,
> > > > > > > > > as the lifetime of all domains are bound to the device. But it cannot
> > > > > > > > > change in the middle of the operation:
> > > > > > > > >
> > > > > > > > > vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t
> > > > > > > > > iova, size_t size, ...) {
> > > > > > > > > as = token.group->as;
> > > > > > > > > // Tell the compiler to never replace "as" by "token.group->as" after this.
> > > > > > > > > read_lock(&as->domain);
> > > > > > > > > while (size)
> > > > > > > > > map = as->domain->bounce_maps[iova];
> > > > > > > > > sz = min_t(size_t, BOUNCE_MAP_SIZE, size);
> > > > > > > > >
> > > > > > > > > ...
> > > > > > > > > page = as->domain->bounce_maps
> > > > > > > > > addr = kmap_local_page(page);
> > > > > > > > > do_bounce(map->orig_phys, addr, sz, dir);
> > > > > > > > > kunmap_local(addr);
> > > > > > > > > size -= sz;
> > > > > > > > > iova += sz;
> > > > > > > > > }
> > > > > > > > > read_unlock(&as->domain);
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > That can be done in many ways. Probably the read_lock is already
> > > > > > > > > enough but it is not explicit that it is protecting token.group->as,
> > > > > > > > > and future changes could remove it. To me, RCU is the most clear way
> > > > > > > > > to do it, but even a volatile read (READ_ONCE?) would do.
> > > > > > > >
> > > > > > > > I wonder if another group rwlock is sufficient here:
> > > > > > > >
> > > > > > > > for set_group_asid()
> > > > > > > >
> > > > > > > > write_lock(&dev->groups[group].lock);
> > > > > > > > dev->groups[group].as = &dev->as[asid];
> > > > > > > > write_unlock(&dev->groups[group].lock);
> > > > > > > >
> > > > > > > > for the case where we need defer as
> > > > > > > >
> > > > > > > > read_lock(&dev->groups[group].lock);
> > > > > > > > as = dev->groups[group].as;
> > > > > > > > //using as
> > > > > > > > read_unlock(&dev->groups[group].lock);
> > > > > > > >
> > > > > > > > If this works, we don't need to bother with thinking if the
> > > > > > > > wait/synchronize_rcu() is really needed or not?
> > > > > > > >
> > > > > > >
> > > > > > > A rwlock is sufficient but we need to modify the allocation code
> > > > > > > somehow. Also, I thought we wanted to avoid the overhead of taking the
> > > > > > > read lock in the DMA ops too.
> > > > > >
> > > > > > Right, but it would always be a balance. We can make sure it works
> > > > > > correctly first then do optimization on top.
> > > > > >
> > > > > > >
> > > > > > > Another disadvantage of the lock vs RCU or READ_ONCE is that the vq
> > > > > > > group ASID change needs to wait for the DMA operation to finish
> > > > > > > instead of just applying for the next DMA ops. Not like vq group ASID
> > > > > > > change would be in the hot path anyway, just pointing it out.
> > > > > > >
> > > > > > > > >
> > > > > > > > > > If this is not an issue, RCU is not a must, but please explain why.
> > > > > > > > > > If this is an issue, we need to fix it.
> > > > > > > > > >
> > > > > > > > > > It's basically a question that
> > > > > > > > > >
> > > > > > > > > > 1) should we need to wait for the DMA to be completed before assigning
> > > > > > > > > > to the new as
> > > > > > > > >
> > > > > > > > > I don't think so, it is valid to assign a new as and let the ongoing
> > > > > > > > > operation to continue. It is racy and the operation could fail, but
> > > > > > > > > the kernel just returns an error and doesn't access invalid memory or
> > > > > > > > > similar.
> > > > > > > >
> > > > > > > > See below.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > 2) should we track the set_group_asid() for the group that has pending
> > > > > > > > > > DMA to avoid potential issue
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > No, the group will outlive the operation as it is bound to the device.
> > > > > > > >
> > > > > > > > I meant e.g the DMA could be triggered by the device. For example, the
> > > > > > > > device may try to trigger an interrupt when the kernel is trying to
> > > > > > > > assign a new asid. So I wonder if guest can use this to poke Qemu's
> > > > > > > > memory etc.
> > > > > > >
> > > > > > > I'm not sure I get this point. If QEMU changes the ASID of the vq
> > > > > > > group sent to the guest the race does not matter anymore: it is
> > > > > > > explicitly opening the possibility from the guest to poke QEMU's
> > > > > > > memory unless the guest is totally paused.
> > > > > >
> > > > > > Basically what I meant, assuming group0.as = as0
> > > > > >
> > > > > > cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> > > > > > cpu1] set_group_asid(group0.as, as1)
> > > > > > cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
>
> [1]
>
> > > > > >
> > > > > > cpu0 may read as1 while it wants as0 actually?
> > > > > >
> > > > >
> > > > > Yes, kind of. That's my point: adding synchronization at vduse level
> > > > > does not fix it.
> > > > >
> > > > > There is no way to do that call from vhost/vdpa or userland, as there
> > > > > is no way to get the AS of a vq group, only to set it. The closest
> > > > > thing is to add a cache at that level, but that implies adding
> > > > > multithreading sync on that upper layer, either vhost/vdpa or userland,
> > > > > not in VDUSE.
>
> [2]
>
> > > >
> > > > Probably.
> > > >
> > > > >
> > > > > From vhost/vdpa level, all mapping calls (.set_map, .dma_map,
> > > > > .dma_unmap) take the ASID directly, not the vq group. So the
> > > > > call to set_group_asid does not need to access the vq group.
> > > > >
>
> [3]
>
> > > > > Now let's say that we add that vdpa_ops callback (and ioctls) that
> > > > > maps and unmaps based on a vq_group. And all of the operations
> > > > > (dma_map, set_group_asid, and dma_unmap) are serialized by taking the
> > > > > same mutex. cpu0 still may dma_unmap over as0 if set_group_asid is not
> > > > > properly serialized at vhost/vdpa or userland level:
> > > > >
> > > > > void* thread0_func(void* arg) {
> > > > > struct {
> > > > > int vq_group = 0,
> > > > > int iova, size, perm, ...
> > > > > } s;
> > > > > int fd = (intptr_t)arg;
> > > > >
> > > > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_MAP, &s);
> >
> > I don't get the semantic of VHOST_VDPA_VQ_GROUP_DMA_MAP. And it's
> > probably wrong to allow userspace to map based on group.
> >
> > We have VHOST_IOTLB_UPDATE which map pages based on as (asid) this is correct.
> >
>
> Yes, I said that at [2], in reply to your scenario at [1] :). Did I
> misunderstand?
Nope, it looks like it's me that misread your reply.
>
> > More below
> >
> > > > > // TODO: Signal thread0 that it can proceed with SET_GROUP_ASID
> > > > > // TODO: Wait until thread0 complete SET_GROUP_ASID
> > > > >
> > > > > ioctl(fd, VHOST_VDPA_VQ_GROUP_DMA_UNMAP, &data);
> > > > >
> > > > > return NULL;
> > > > > }
> > > > >
> > > > > void* thread1_func(void* arg) {
> > > > > struct vhost_vring_state s = {
> > > > > .index = 0,
> > > > > .num = 1,
> > > > > };
> > > > > int fd = (int)(intptr_t)arg;
> > > > >
> > > > > // TODO: Wait until thread2 calls dma_map
> > > >
> > > > Isn't this exactly something an rwlock or synchronize_rcu() can do? Or is
> > > > it the responsibility of the vDPA parent to do that?
> > > >
> > >
> > > No, that's a userland synchronization problem that cannot be solved
> > > at kernel level. If we need to do something similar at vdpa core level
> > > (unlikely), we will need to synchronize at that level too.
> >
> > Ok, basically I think we are talking about different things. That's fine.
> >
> > If I understand you correctly, you mean we need to synchronize between
> > IOTLB updating (IOTLB_UPDATE/IOTLB_INVALIDATE) and set_group_asid()?
> >
> > This sounds unnecessary since:
> >
> > 1) IOTLB_UPDATE/IOTLB_INVALIDATE is updating the address space internal mappings
> > 2) set_group_asid() is to assign an AS to a group
> >
>
> Right, we're on the same page here [3].
>
> > >
> > > If we add an rwlock but don't implement the synchronization in the
> > > userland program marked as TODO, this sequence is also possible:
> > >
> > > cpu0] DMA map ioctl call in userland
> > > -> Take read_lock in the vduse module
> > > -> Update the IOTLB tree of ASID 0
> > > -> unlock read_lock
> > >
> > > Now we have two possibilities: either cpu0] DMA_UNMAP is called or
> > > cpu1] set_group_asid is called. The VDUSE module rwlock protects that
> > > they will not run at the same time, but we need to implement something
> > > at the userland level if we want a predictable outcome, marked as the
> > > TODO in the comments. If, by chance, the DMA unmap is the one that
> > > comes next, the AS updated is ASID 0:
> > >
> > > cpu0] DMA unmap ioctl call in userland
> > > -> Take read_lock in the vduse module
> > > -> Update the IOTLB tree of ASID 0
> > > -> unlock read_lock
> > > cpu1] set_group_asid ioctl call in userland
> > > -> Take write_lock in the vduse module
> > > -> Update ASID of the VQ GROUP 0 to 1
> > > -> unlock write_lock
> > >
> > > If set_group_asid runs first by chance, ASID 1 is the one that is updated:
> > > cpu1] set_group_asid ioctl call in userland
> > > -> Take write_lock in the vduse module
> > > -> Update ASID of the VQ GROUP 0 to 1
> > > -> unlock write_lock
> > > cpu0] DMA unmap ioctl call in userland
> > > -> Take read_lock in the vduse module
> > > -> Update the IOTLB tree of ASID 1
> > > -> unlock read_lock
> > >
> > > On the other hand we have this version of the series that allows these
> > > actions to run at the same time. It just makes sure that the update of
> > > the IOTLB tree is coherent, by copying the vq group ASID value at one
> > > point in time and making sure it sticks to that ASID until the end of
> > > the set_map call. I'm not adding the synchronize_rcu call because we
> > > stated it should not be called from userland, but the outcome is
> > > similar.
> >
> > I think we should figure out if VHOST_VDPA_VQ_GROUP_DMA_MAP is useful or not.
> >
>
> It's not. I just wanted to expand how I understood your scenario, but
> let me know if I missed something!
>
> > >
> > > Let me put another example: to me it's like calling dup2() and write()
> > > from different threads over the same set of fds without userland
> > > synchronization. The kernel protects against things like writing part
> > > of the content to one file and another part to the other file, but the
> > > content of the files at the end of the writes() is just not
> > > predictable. And the kernel just cannot make it predictable.
> > >
> > > #include <fcntl.h>
> > > #include <pthread.h>
> > > #include <string.h>
> > > #include <unistd.h>
> > >
> > > static int fd_a;
> > > static int fd_b;
> > >
> > > void *write_thread(void *arg) {
> > > const char *msg = "Writing to fd_b\n";
> > > write(fd_b, msg, strlen(msg));
> > > return NULL;
> > > }
> > >
> > > void *dup_thread(void *arg) {
> > > dup2(fd_a, fd_b);
> > > return NULL;
> > > }
> > >
> > > int main() {
> > > pthread_t writer, duper;
> > >
> > > fd_a = open("/tmp/a", O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > fd_b = open("/tmp/b", O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > >
> > > pthread_create(&writer, NULL, write_thread, NULL);
> > > pthread_create(&duper, NULL, dup_thread, NULL);
> > >
> > > pthread_join(writer, NULL);
> > > pthread_join(duper, NULL);
> > >
> > > close(fd_a);
> > > close(fd_b);
> > >
> > > return 0;
> > > }
> > > --
> > >
> > > > If it's the responsibility of the parent, it would be much harder for
> > > > VDUSE as the datapath is implemented in userspace via mmap() or
> > > > umem_reg.
> > > >
> > > > Looking at existing implementation:
> > > >
> > > > - for mlx5e, it looks like it assumes the set_group_asid() works only
> > > > without DRIVER_OK.
> > > > - for the simulator, it looks like it can synchronize with the
> > > > datapath with the spinlock as datapath is emulated
> > > >
> > >
> > > Yes, the next version will use a spinning rwlock. I just need to
> > > take out the allocation of the domain for it to be valid.
> > >
> > > > > ioctl(fd, VHOST_VDPA_SET_GROUP_ASID, &s)
> > > > > // TODO: Signal thread2 that can proceed with dma_unmap
> > > >
> > > > But the issue I mention is that the, from the view of the vDPA bus:
> > > >
> > > > 1) it offers set_group_as_id()
> > > > 2) it doesn't know if virtio-vdpa or vhost-vdpa is used
> > > >
> > > > So theoretically, set_group_as_id() could happen between
> > > >
> > > > dma_addr = dma_map();
> > > >
> > > > and
> > > >
> > > > dma_unmap(dma_addr);
> > > >
> > > > But those two dma_addr refers to the different address space.
> > >
> > > I don't get this, these calls take the ASID as the parameter, not the
> > > vq group. I thought this was by design, as telling what vq groups
> > > update seems way more difficult to me. Can you put an example of an
> > > userland application that has the race you describe with the existing
> > > ioctls?
> >
> > Just to clarify, for dma_map()/dma_unmap() this is not the UAPI part
> > (and we don't have that). I basically mean the DMA API which is used
> > by virtio-vDPA.
> >
>
> So let's say that the virtio_vdpa driver is able to call
> .set_group_asid just to explain how much it is protected:
>
> ->vduse_dev_map_page and ->vduse_dev_unmap_page
> In this series, rcu_dereference makes sure that only the stack copy of
> the domain pointer is used. In the next version, the code will take a
> read_lock so that set_group_asid actions on token.group->as cannot run
> at the same time as the vduse_domain_map_page(...) or
> vduse_domain_unmap_page(...) calls running on cpu0:
>
> static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
> unsigned long offset, size_t size,
> enum dma_data_direction dir,
> unsigned long attrs)
> {
> struct vduse_iova_domain *domain;
> dma_addr_t r;
>
> if (!token.group)
> return DMA_MAPPING_ERROR;
>
> read_lock(&token.group->as_lock);
> domain = token.group->as->domain;
> r = vduse_domain_map_page(domain, page, offset, size, dir, attrs);
> read_unlock(&token.group->as_lock);
>
> return r;
> }
>
> static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
> size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> struct vduse_iova_domain *domain;
>
> if (!token.group)
> return;
>
> read_lock(&token.group->as_lock);
> domain = token.group->as->domain;
> vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
> read_unlock(&token.group->as_lock);
> }
> ---
>
> -> vduse_set_group_asid will have the corresponding write_lock, so it
> cannot modify any vq group AS pointer while they're mapping or
> unmapping pages from a domain:
> static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
> unsigned int group, unsigned int asid)
> {
> write_lock(&dev->groups[group].as_lock);
> dev->groups[group].as = &dev->as[asid];
> write_unlock(&dev->groups[group].as_lock);
> }
> ---
>
> Now your question:
>
> assuming group0.as = as0
>
> cpu0] dma_map(group0.as, addr, DMA_FROM_DEVICE)
> cpu1] set_group_asid(group0.as, as1)
> cpu0] dma_unmap(group0.as, addr, DMA_FROM_DEVICE)
>
> cpu0 may read as1 while it wants as0 actually?
> --
>
> Yes, it can happen even with the rwlock or any other synchronization
> in VDUSE, as it does not order the vdpa core calls.
>
> This is the code I have in mind to prove this, let me know if I'm
> missing something from your scenario:
>
> static int thread1_func(void *data)
> {
> vduse_dev_map_page(token, ...);
> printk(KERN_INFO "map_page called\n");
>
> // TODO: Need a way to know if the group ASID has changed to
> // know if this unmap is valid or not.
> vduse_dev_unmap_page(token, ...);
> printk(KERN_INFO "unmap_page called\n");
>
> return 0;
> }
>
> static int thread2_func(void *data)
> {
> u32 vq_group = 0;
> u32 asid = 1;
>
> global_vdpa->config->set_group_asid(global_vdpa, vq_group, asid);
> printk(KERN_INFO "set_group_asid called\n");
>
> return 0;
> }
>
> static int __init kernel_threads_init(void)
> {
> struct task_struct *thread1, *thread2;
>
> thread1 = kthread_run(thread1_func, NULL, "vdpa_map_unmap");
> thread2 = kthread_run(thread2_func, NULL, "vdpa_set_asid");
>
> return 0;
> }
>
> ---
>
> I keep thinking that this is a race condition in the hypothetical vdpa
> core or the module that runs this code, not solvable in VDUSE. We need
> to add barriers at this level, not VDUSE.
>
> But I think it is a good idea to add some code in VDUSE to mitigate
> it. Maybe just adding the expected vq queue index to
> vdev->map->map_page and vdev->map->unmap_page?
This seems to be more complicated than encoding the asid into the iova.
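For reference, a minimal sketch of what that iova encoding could look
like, assuming a 64-bit dma_addr_t and that the bounce IOVA space never
needs the top bits; the helper names and the 48-bit split are made up
for illustration:

#define VDUSE_IOVA_ASID_SHIFT	48

/*
 * Stash the ASID in the top bits of the IOVA returned by map, so that
 * unmap/sync can recover which address space the mapping was created
 * in, even if the group was moved to another ASID in between.
 */
static inline dma_addr_t vduse_iova_encode(u32 asid, dma_addr_t iova)
{
	return ((dma_addr_t)asid << VDUSE_IOVA_ASID_SHIFT) | iova;
}

static inline u32 vduse_iova_asid(dma_addr_t iova)
{
	return iova >> VDUSE_IOVA_ASID_SHIFT;
}

static inline dma_addr_t vduse_iova_addr(dma_addr_t iova)
{
	return iova & (((dma_addr_t)1 << VDUSE_IOVA_ASID_SHIFT) - 1);
}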
> This way we can issue
> an error if the race happens.
As discussed, it could be avoided by disallowing the call to
set_group_asid() when DRIVER_OK is set.
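A minimal sketch of that check, assuming the device status is tracked
in dev->status as elsewhere in the driver and that status changes are
serialized with this callback; whether -EBUSY is the right errno is a
detail:

static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
				unsigned int asid)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_dev_msg msg = { 0 };
	int r;

	if (dev->api_version < VDUSE_API_VERSION_1 ||
	    group >= dev->ngroups || asid >= dev->nas)
		return -EINVAL;

	/* Refuse to move a group to another AS once the driver is live,
	 * so no in-flight DMA can observe the switch.
	 */
	if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
		return -EBUSY;

	msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
	msg.req.vq_group_asid.group = group;
	msg.req.vq_group_asid.asid = asid;

	r = vduse_dev_msg_sync(dev, &msg);
	if (r < 0)
		return r;

	vduse_set_group_asid_nomsg(dev, group, asid);
	return 0;
}

A parent that later learns to switch the ASID with DRIVER_OK set could
advertise that with a new feature flag, as the quoted discussion below
mentions.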
>
> > >
> > > > Instead
> > > > of trying to do synchronization, maybe we can simply fail
> > > > set_group_asid if DRIVER_OK is set.
> > > >
> > >
> > > That's a good possibility, especially since mlx5 already does it.
> > > There is ongoing work to enable dataplane SVQ dynamically without
> > > needing to reset the whole device, but we will need a new feature flag
> > > to know if the parent driver supports it.
> >
> > Did you mean the SVQ may update the group asid while DRIVER_OK is set?
> > If yes, we need to fix that.
> >
>
> Not at this moment.
Great.
Thanks
On Wed, Nov 19, 2025 at 10:26:43AM +0100, Eugenio Perez Martin wrote:
> > But this is not the logic that is
> > implemented in this patch as there's no synchronize_rcu() in the
> > vduse_set_group_asid_nomsg().
>
> We only set the pointer on the writer's side, we do nothing like
> freeing resources. Should we set the pointer before or after
> synchronize_rcu()?

synchronize_rcu() is called after the writer makes its changes.

> What do we need to do on the other side of
> synchronize_rcu()?

Presumably, return so the caller knows the as has been updated.

However, user-triggerable synchronize_rcu() is almost always a bug.

If that's what is going on, you want srcu.

--
MST
On Wed, Nov 19, 2025 at 10:32 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Nov 19, 2025 at 10:26:43AM +0100, Eugenio Perez Martin wrote:
> > > But this is not the logic that is
> > > implemented in this patch as there's no synchronize_rcu() in the
> > > vduse_set_group_asid_nomsg().
> >
> > We only set the pointer on the writer's side, we do nothing like
> > freeing resources. Should we set the pointer before or after
> > synchronize_rcu()?
>
> synchronize_rcu() is called after the writer makes its changes.
>
> > What do we need to do on the other side of
> > synchronize_rcu()?
>
> Presumably, return so the caller knows the as has been updated.
>

I'm happy to add the synchronize_rcu() just in case, but the caller of
vduse_set_group_asid_nomsg does not need to know that the readers have
seen the update.

The first caller is vduse_dev_reset, which has its own way to avoid
being called while vqs are being processed. In particular, it resets
their addresses, which is way more dangerous. I could call it with
dev->rwsem down though.

The second one is the set_group_asid vdpa callback, which is called
from the ioctl itself.

Moreover, rcu_assign_pointer is a WRITE_ONCE by itself, so we know all
the readers will get the new value after it. So what's the value of
explicitly waiting for all the readers to finalize their DMA operation?
I'd understand if we need to do modifications or memory management in
the now unused ASID, but that's not the case here.

> However, user-triggerable synchronize_rcu() is almost always a bug.
>
> If that's what is going on, you want srcu.
>

I did not know about it, thanks! But I think all the code that can
sleep is out of the RCU critical sections now, isn't it?
On Wed, Nov 19, 2025 at 11:38:32AM +0100, Eugenio Perez Martin wrote:
> On Wed, Nov 19, 2025 at 10:32 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Nov 19, 2025 at 10:26:43AM +0100, Eugenio Perez Martin wrote:
> > > > But this is not the logic that is
> > > > implemented in this patch as there's no synchronize_rcu() in the
> > > > vduse_set_group_asid_nomsg().
> > >
> > > We only set the pointer on the writer's side, we do nothing like
> > > freeing resources. Should we set the pointer before or after
> > > synchronize_rcu()?
> >
> > synchronize_rcu() is called after the writer makes its changes.
> >
> > > What do we need to do on the other side of
> > > synchronize_rcu()?
> >
> > Presumably, return so the caller knows the as has been updated.
> >
>
> I'm happy to add the synchronize_rcu() just in case, but the caller of
> vduse_set_group_asid_nomsg does not need to know that the readers have
> seen the update.
>
> The first caller is vduse_dev_reset, which has its own way to avoid
> being called while vqs are being processed. In particular, it resets
> their addresses, which is way more dangerous. I could call it with
> dev->rwsem down though.
>
> The second one is the set_group_asid vdpa callback, which is called
> from the ioctl itself.

This one for sure needs to know that, after the ioctl completes, the new
AS is in use.

> Moreover, rcu_assign_pointer is a WRITE_ONCE by itself, so we know all
> the readers will get the new value after it.

No we don't. The words "after it" do not have a meaning on an SMP
system. The only way to know that another CPU sees a value is to
synchronize with it.

> So what's the value of
> explicitly waiting for all the readers to finalize their DMA operation?

Read Documentation/memory-barriers.txt

If still unclear, read it again ))

> I'd understand if we need to do modifications or memory management in
> the now unused ASID, but that's not the case here.
>
> > However, user-triggerable synchronize_rcu() is almost always a bug.
> >
> > If that's what is going on, you want srcu.
> >
>
> I did not know about it, thanks! But I think all the code that can
> sleep is out of the RCU critical sections now, isn't it?

What I tried to say is that if triggering synchronize_rcu() from
userspace is a problem, srcu is one solution.
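For completeness, a minimal sketch of what the SRCU variant could look
like, assuming a per-device srcu_struct added to struct vduse_dev and
set up with init_srcu_struct()/cleanup_srcu_struct() over the device
lifetime; the read side stays usable from atomic context while the
writer is allowed to sleep in synchronize_srcu():

/* struct vduse_dev gains:
 *	struct srcu_struct as_srcu;
 */

static void vduse_set_group_asid_nomsg(struct vduse_dev *dev,
				       unsigned int group, unsigned int asid)
{
	rcu_assign_pointer(dev->groups[group].as, &dev->as[asid]);
	/* Wait for readers that may still be using the old as->domain. */
	synchronize_srcu(&dev->as_srcu);
}

static void vduse_dev_sync_single_for_device(union virtio_map token,
					     dma_addr_t dma_addr, size_t size,
					     enum dma_data_direction dir)
{
	struct vduse_dev *vdev;
	struct vduse_iova_domain *domain;
	int idx;

	if (!token.group)
		return;

	vdev = token.group->dev;
	idx = srcu_read_lock(&vdev->as_srcu);
	domain = srcu_dereference(token.group->as, &vdev->as_srcu)->domain;
	vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
	srcu_read_unlock(&vdev->as_srcu, idx);
}

Whether the writer should wait at all, rather than only publish the new
pointer, is exactly the open question in the thread; the
synchronize_srcu() call is shown only to illustrate where it would go.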