This allows sepparate the different virtqueues in groups that shares the
same address space. Asking the VDUSE device for the groups of the vq at
the beginning as they're needed for the DMA API.
Allocating 3 vq groups as net is the device that need the most groups:
* Dataplane (guest passthrough)
* CVQ
* Shadowed vrings.
Future versions of the series can include dynamic allocation of the
groups array so VDUSE can declare more groups.
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
v2:
* Now the vq group is in vduse_vq_config struct instead of issuing one
VDUSE message per vq.
v1:
* Fix: Remove BIT_ULL(VIRTIO_S_*), as _S_ is already the bit (Maxime)
RFC v3:
* Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower
value to reduce memory consumption, but vqs are already limited to
that value and userspace VDUSE is able to allocate that many vqs.
* Remove the descs vq group capability as it will not be used and we can
add it on top.
* Do not ask for vq groups in number of vq groups < 2.
* Move the valid vq groups range check to vduse_validate_config.
RFC v2:
* Cache group information in kernel, as we need to provide the vq map
tokens properly.
* Add descs vq group to optimize SVQ forwarding and support indirect
descriptors out of the box.
---
drivers/vdpa/vdpa_user/vduse_dev.c | 37 ++++++++++++++++++++++++++----
include/uapi/linux/vduse.h | 12 +++++++---
2 files changed, 41 insertions(+), 8 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 2b6a8958ffe0..42f8807911d4 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -59,6 +59,7 @@ struct vduse_virtqueue {
struct vdpa_vq_state state;
bool ready;
bool kicked;
+ u32 vq_group;
spinlock_t kick_lock;
spinlock_t irq_lock;
struct eventfd_ctx *kickfd;
@@ -115,6 +116,7 @@ struct vduse_dev {
u8 status;
u32 vq_num;
u32 vq_align;
+ u32 ngroups;
struct vduse_umem *umem;
struct mutex mem_lock;
unsigned int bounce_size;
@@ -593,6 +595,13 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
return 0;
}
+static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ return dev->vqs[idx]->vq_group;
+}
+
static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
struct vdpa_vq_state *state)
{
@@ -790,6 +799,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.set_vq_cb = vduse_vdpa_set_vq_cb,
.set_vq_num = vduse_vdpa_set_vq_num,
.get_vq_size = vduse_vdpa_get_vq_size,
+ .get_vq_group = vduse_get_vq_group,
.set_vq_ready = vduse_vdpa_set_vq_ready,
.get_vq_ready = vduse_vdpa_get_vq_ready,
.set_vq_state = vduse_vdpa_set_vq_state,
@@ -1253,12 +1263,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (config.index >= dev->vq_num)
break;
- if (!is_mem_zero((const char *)config.reserved,
- sizeof(config.reserved)))
+ if (dev->api_version < VDUSE_API_VERSION_1 && config.group)
+ break;
+
+ if (dev->api_version >= VDUSE_API_VERSION_1 &&
+ config.group > dev->ngroups)
+ break;
+
+ if (config.reserved1 ||
+ !is_mem_zero((const char *)config.reserved2,
+ sizeof(config.reserved2)))
break;
index = array_index_nospec(config.index, dev->vq_num);
dev->vqs[index]->num_max = config.max_size;
+ dev->vqs[index]->vq_group = config.group;
ret = 0;
break;
}
@@ -1738,12 +1757,19 @@ static bool features_is_valid(struct vduse_dev_config *config)
return true;
}
-static bool vduse_validate_config(struct vduse_dev_config *config)
+static bool vduse_validate_config(struct vduse_dev_config *config,
+ u64 api_version)
{
if (!is_mem_zero((const char *)config->reserved,
sizeof(config->reserved)))
return false;
+ if (api_version < VDUSE_API_VERSION_1 && config->ngroups)
+ return false;
+
+ if (api_version >= VDUSE_API_VERSION_1 && config->ngroups > 0xffff)
+ return false;
+
if (config->vq_align > PAGE_SIZE)
return false;
@@ -1859,6 +1885,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->device_features = config->features;
dev->device_id = config->device_id;
dev->vendor_id = config->vendor_id;
+ dev->ngroups = (dev->api_version < 1) ? 1 : (config->ngroups ?: 1);
dev->name = kstrdup(config->name, GFP_KERNEL);
if (!dev->name)
goto err_str;
@@ -1937,7 +1964,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd,
break;
ret = -EINVAL;
- if (vduse_validate_config(&config) == false)
+ if (!vduse_validate_config(&config, control->api_version))
break;
buf = vmemdup_user(argp + size, config.config_size);
@@ -2018,7 +2045,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
&vduse_vdpa_config_ops, &vduse_map_ops,
- 1, 1, name, true);
+ dev->ngroups, 1, name, true);
if (IS_ERR(vdev))
return PTR_ERR(vdev);
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index ccb92a1efce0..a3d51cf6df3a 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -31,6 +31,7 @@
* @features: virtio features
* @vq_num: the number of virtqueues
* @vq_align: the allocation alignment of virtqueue's metadata
+ * @ngroups: number of vq groups that VDUSE device declares
* @reserved: for future use, needs to be initialized to zero
* @config_size: the size of the configuration space
* @config: the buffer of the configuration space
@@ -45,7 +46,8 @@ struct vduse_dev_config {
__u64 features;
__u32 vq_num;
__u32 vq_align;
- __u32 reserved[13];
+ __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
+ __u32 reserved[12];
__u32 config_size;
__u8 config[];
};
@@ -122,14 +124,18 @@ struct vduse_config_data {
* struct vduse_vq_config - basic configuration of a virtqueue
* @index: virtqueue index
* @max_size: the max size of virtqueue
- * @reserved: for future use, needs to be initialized to zero
+ * @reserved1: for future use, needs to be initialized to zero
+ * @group: virtqueue group
+ * @reserved2: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
*/
struct vduse_vq_config {
__u32 index;
__u16 max_size;
- __u16 reserved[13];
+ __u16 reserved1;
+ __u32 group;
+ __u16 reserved2[10];
};
/*
--
2.51.0
On Tue, Sep 16, 2025 at 9:08 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > This allows sepparate the different virtqueues in groups that shares the > same address space. Asking the VDUSE device for the groups of the vq at > the beginning as they're needed for the DMA API. > > Allocating 3 vq groups as net is the device that need the most groups: > * Dataplane (guest passthrough) > * CVQ > * Shadowed vrings. > > Future versions of the series can include dynamic allocation of the > groups array so VDUSE can declare more groups. > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > --- > v2: > * Now the vq group is in vduse_vq_config struct instead of issuing one > VDUSE message per vq. > > v1: > * Fix: Remove BIT_ULL(VIRTIO_S_*), as _S_ is already the bit (Maxime) > > RFC v3: > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower > value to reduce memory consumption, but vqs are already limited to > that value and userspace VDUSE is able to allocate that many vqs. > * Remove the descs vq group capability as it will not be used and we can > add it on top. > * Do not ask for vq groups in number of vq groups < 2. > * Move the valid vq groups range check to vduse_validate_config. > > RFC v2: > * Cache group information in kernel, as we need to provide the vq map > tokens properly. > * Add descs vq group to optimize SVQ forwarding and support indirect > descriptors out of the box. > --- > drivers/vdpa/vdpa_user/vduse_dev.c | 37 ++++++++++++++++++++++++++---- > include/uapi/linux/vduse.h | 12 +++++++--- > 2 files changed, 41 insertions(+), 8 deletions(-) > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > index 2b6a8958ffe0..42f8807911d4 100644 > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > @@ -59,6 +59,7 @@ struct vduse_virtqueue { > struct vdpa_vq_state state; > bool ready; > bool kicked; > + u32 vq_group; > spinlock_t kick_lock; > spinlock_t irq_lock; > struct eventfd_ctx *kickfd; > @@ -115,6 +116,7 @@ struct vduse_dev { > u8 status; > u32 vq_num; > u32 vq_align; > + u32 ngroups; > struct vduse_umem *umem; > struct mutex mem_lock; > unsigned int bounce_size; > @@ -593,6 +595,13 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > return 0; > } > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > +{ > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > + > + return dev->vqs[idx]->vq_group; I wonder if we should fail if VDUSE_VQ_SETUP is not set by userspace? > +} > + > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > struct vdpa_vq_state *state) > { > @@ -790,6 +799,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { > .set_vq_cb = vduse_vdpa_set_vq_cb, > .set_vq_num = vduse_vdpa_set_vq_num, > .get_vq_size = vduse_vdpa_get_vq_size, > + .get_vq_group = vduse_get_vq_group, > .set_vq_ready = vduse_vdpa_set_vq_ready, > .get_vq_ready = vduse_vdpa_get_vq_ready, > .set_vq_state = vduse_vdpa_set_vq_state, > @@ -1253,12 +1263,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > if (config.index >= dev->vq_num) > break; > > - if (!is_mem_zero((const char *)config.reserved, > - sizeof(config.reserved))) > + if (dev->api_version < VDUSE_API_VERSION_1 && config.group) > + break; > + > + if (dev->api_version >= VDUSE_API_VERSION_1 && > + config.group > dev->ngroups) > + break; > + > + if (config.reserved1 || > + !is_mem_zero((const char *)config.reserved2, > + sizeof(config.reserved2))) > break; What's the reason for having this check? I mean if we don't check this since the day0, it might be too late to do that. > > index = array_index_nospec(config.index, dev->vq_num); > dev->vqs[index]->num_max = config.max_size; > + dev->vqs[index]->vq_group = config.group; > ret = 0; > break; > } > @@ -1738,12 +1757,19 @@ static bool features_is_valid(struct vduse_dev_config *config) > return true; > } > > -static bool vduse_validate_config(struct vduse_dev_config *config) > +static bool vduse_validate_config(struct vduse_dev_config *config, > + u64 api_version) > { > if (!is_mem_zero((const char *)config->reserved, > sizeof(config->reserved))) > return false; > > + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) > + return false; Better with ngroups > 1? > + > + if (api_version >= VDUSE_API_VERSION_1 && config->ngroups > 0xffff) > + return false; > + > if (config->vq_align > PAGE_SIZE) > return false; > > @@ -1859,6 +1885,7 @@ static int vduse_create_dev(struct vduse_dev_config *config, > dev->device_features = config->features; > dev->device_id = config->device_id; > dev->vendor_id = config->vendor_id; > + dev->ngroups = (dev->api_version < 1) ? 1 : (config->ngroups ?: 1); > dev->name = kstrdup(config->name, GFP_KERNEL); > if (!dev->name) > goto err_str; > @@ -1937,7 +1964,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, > break; > > ret = -EINVAL; > - if (vduse_validate_config(&config) == false) > + if (!vduse_validate_config(&config, control->api_version)) > break; > > buf = vmemdup_user(argp + size, config.config_size); > @@ -2018,7 +2045,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) > > vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, > &vduse_vdpa_config_ops, &vduse_map_ops, > - 1, 1, name, true); > + dev->ngroups, 1, name, true); > if (IS_ERR(vdev)) > return PTR_ERR(vdev); > > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h > index ccb92a1efce0..a3d51cf6df3a 100644 > --- a/include/uapi/linux/vduse.h > +++ b/include/uapi/linux/vduse.h > @@ -31,6 +31,7 @@ > * @features: virtio features > * @vq_num: the number of virtqueues > * @vq_align: the allocation alignment of virtqueue's metadata > + * @ngroups: number of vq groups that VDUSE device declares > * @reserved: for future use, needs to be initialized to zero > * @config_size: the size of the configuration space > * @config: the buffer of the configuration space > @@ -45,7 +46,8 @@ struct vduse_dev_config { > __u64 features; > __u32 vq_num; > __u32 vq_align; > - __u32 reserved[13]; > + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ > + __u32 reserved[12]; > __u32 config_size; > __u8 config[]; > }; > @@ -122,14 +124,18 @@ struct vduse_config_data { > * struct vduse_vq_config - basic configuration of a virtqueue > * @index: virtqueue index > * @max_size: the max size of virtqueue > - * @reserved: for future use, needs to be initialized to zero > + * @reserved1: for future use, needs to be initialized to zero > + * @group: virtqueue group > + * @reserved2: for future use, needs to be initialized to zero > * > * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. > */ > struct vduse_vq_config { > __u32 index; > __u16 max_size; > - __u16 reserved[13]; > + __u16 reserved1; > + __u32 group; This makes me think if u16 is sufficient as I see: u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); The index is u16 but the group is u32 ... > + __u16 reserved2[10]; > }; > > /* > -- > 2.51.0 > Thanks
On Wed, Sep 17, 2025 at 10:37 AM Jason Wang <jasowang@redhat.com> wrote: > > On Tue, Sep 16, 2025 at 9:08 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > This allows sepparate the different virtqueues in groups that shares the > > same address space. Asking the VDUSE device for the groups of the vq at > > the beginning as they're needed for the DMA API. > > > > Allocating 3 vq groups as net is the device that need the most groups: > > * Dataplane (guest passthrough) > > * CVQ > > * Shadowed vrings. > > > > Future versions of the series can include dynamic allocation of the > > groups array so VDUSE can declare more groups. > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > --- > > v2: > > * Now the vq group is in vduse_vq_config struct instead of issuing one > > VDUSE message per vq. > > > > v1: > > * Fix: Remove BIT_ULL(VIRTIO_S_*), as _S_ is already the bit (Maxime) > > > > RFC v3: > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower > > value to reduce memory consumption, but vqs are already limited to > > that value and userspace VDUSE is able to allocate that many vqs. > > * Remove the descs vq group capability as it will not be used and we can > > add it on top. > > * Do not ask for vq groups in number of vq groups < 2. > > * Move the valid vq groups range check to vduse_validate_config. > > > > RFC v2: > > * Cache group information in kernel, as we need to provide the vq map > > tokens properly. > > * Add descs vq group to optimize SVQ forwarding and support indirect > > descriptors out of the box. > > --- > > drivers/vdpa/vdpa_user/vduse_dev.c | 37 ++++++++++++++++++++++++++---- > > include/uapi/linux/vduse.h | 12 +++++++--- > > 2 files changed, 41 insertions(+), 8 deletions(-) > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > index 2b6a8958ffe0..42f8807911d4 100644 > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > @@ -59,6 +59,7 @@ struct vduse_virtqueue { > > struct vdpa_vq_state state; > > bool ready; > > bool kicked; > > + u32 vq_group; > > spinlock_t kick_lock; > > spinlock_t irq_lock; > > struct eventfd_ctx *kickfd; > > @@ -115,6 +116,7 @@ struct vduse_dev { > > u8 status; > > u32 vq_num; > > u32 vq_align; > > + u32 ngroups; > > struct vduse_umem *umem; > > struct mutex mem_lock; > > unsigned int bounce_size; > > @@ -593,6 +595,13 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > return 0; > > } > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > +{ > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > + > > + return dev->vqs[idx]->vq_group; > > I wonder if we should fail if VDUSE_VQ_SETUP is not set by userspace? > I'm kind of ok with implementing it, but I see it as redundant as if the VDUSE device does not call the vq max size will be 0 anyway. > > +} > > + > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > struct vdpa_vq_state *state) > > { > > @@ -790,6 +799,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { > > .set_vq_cb = vduse_vdpa_set_vq_cb, > > .set_vq_num = vduse_vdpa_set_vq_num, > > .get_vq_size = vduse_vdpa_get_vq_size, > > + .get_vq_group = vduse_get_vq_group, > > .set_vq_ready = vduse_vdpa_set_vq_ready, > > .get_vq_ready = vduse_vdpa_get_vq_ready, > > .set_vq_state = vduse_vdpa_set_vq_state, > > @@ -1253,12 +1263,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > if (config.index >= dev->vq_num) > > break; > > > > - if (!is_mem_zero((const char *)config.reserved, > > - sizeof(config.reserved))) > > + if (dev->api_version < VDUSE_API_VERSION_1 && config.group) > > + break; > > + > > + if (dev->api_version >= VDUSE_API_VERSION_1 && > > + config.group > dev->ngroups) > > + break; > > + > > + if (config.reserved1 || > > + !is_mem_zero((const char *)config.reserved2, > > + sizeof(config.reserved2))) > > break; > > What's the reason for having this check? I mean if we don't check this > since the day0, it might be too late to do that. > It's been checked from day0, both reserved1 and reserved2 were included in the config.reserved before the patch, and the replaced lines already checked with is_mem_zero [1]. > > > > index = array_index_nospec(config.index, dev->vq_num); > > dev->vqs[index]->num_max = config.max_size; > > + dev->vqs[index]->vq_group = config.group; > > ret = 0; > > break; > > } > > @@ -1738,12 +1757,19 @@ static bool features_is_valid(struct vduse_dev_config *config) > > return true; > > } > > > > -static bool vduse_validate_config(struct vduse_dev_config *config) > > +static bool vduse_validate_config(struct vduse_dev_config *config, > > + u64 api_version) > > { > > if (!is_mem_zero((const char *)config->reserved, > > sizeof(config->reserved))) > > return false; > > > > + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) > > + return false; > > Better with ngroups > 1? > Same here. The kernel in the master branch already returns false if that position of "reserved" is 1, and we are changing the behavior if we don't include the 1 too. > > + > > + if (api_version >= VDUSE_API_VERSION_1 && config->ngroups > 0xffff) > > + return false; > > + > > if (config->vq_align > PAGE_SIZE) > > return false; > > > > @@ -1859,6 +1885,7 @@ static int vduse_create_dev(struct vduse_dev_config *config, > > dev->device_features = config->features; > > dev->device_id = config->device_id; > > dev->vendor_id = config->vendor_id; > > + dev->ngroups = (dev->api_version < 1) ? 1 : (config->ngroups ?: 1); > > dev->name = kstrdup(config->name, GFP_KERNEL); > > if (!dev->name) > > goto err_str; > > @@ -1937,7 +1964,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, > > break; > > > > ret = -EINVAL; > > - if (vduse_validate_config(&config) == false) > > + if (!vduse_validate_config(&config, control->api_version)) > > break; > > > > buf = vmemdup_user(argp + size, config.config_size); > > @@ -2018,7 +2045,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) > > > > vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, > > &vduse_vdpa_config_ops, &vduse_map_ops, > > - 1, 1, name, true); > > + dev->ngroups, 1, name, true); > > if (IS_ERR(vdev)) > > return PTR_ERR(vdev); > > > > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h > > index ccb92a1efce0..a3d51cf6df3a 100644 > > --- a/include/uapi/linux/vduse.h > > +++ b/include/uapi/linux/vduse.h > > @@ -31,6 +31,7 @@ > > * @features: virtio features > > * @vq_num: the number of virtqueues > > * @vq_align: the allocation alignment of virtqueue's metadata > > + * @ngroups: number of vq groups that VDUSE device declares > > * @reserved: for future use, needs to be initialized to zero > > * @config_size: the size of the configuration space > > * @config: the buffer of the configuration space > > @@ -45,7 +46,8 @@ struct vduse_dev_config { > > __u64 features; > > __u32 vq_num; > > __u32 vq_align; > > - __u32 reserved[13]; > > + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ > > + __u32 reserved[12]; > > __u32 config_size; > > __u8 config[]; > > }; > > @@ -122,14 +124,18 @@ struct vduse_config_data { > > * struct vduse_vq_config - basic configuration of a virtqueue > > * @index: virtqueue index > > * @max_size: the max size of virtqueue > > - * @reserved: for future use, needs to be initialized to zero > > + * @reserved1: for future use, needs to be initialized to zero > > + * @group: virtqueue group > > + * @reserved2: for future use, needs to be initialized to zero > > * > > * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. > > */ > > struct vduse_vq_config { > > __u32 index; > > __u16 max_size; > > - __u16 reserved[13]; > > + __u16 reserved1; > > + __u32 group; > > This makes me think if u16 is sufficient as I see: > > u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); > > The index is u16 but the group is u32 ... > That's a very good point, but I got the reverse logic actually :). All the vhost userland ioctls use an unsigned int (==u32) already, so VDUSE should keep going with that. If any, get_vq_group (and similar) kernel internal API should move to u32 to keep up with userland API. [1] https://patchew.org/linux/20250606115012.1331551-1-eperezma@redhat.com/20250606115012.1331551-4-eperezma@redhat.com/
On Wed, Sep 17, 2025 at 11:42 PM Eugenio Perez Martin <eperezma@redhat.com> wrote: > > On Wed, Sep 17, 2025 at 10:37 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Tue, Sep 16, 2025 at 9:08 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > > > This allows sepparate the different virtqueues in groups that shares the > > > same address space. Asking the VDUSE device for the groups of the vq at > > > the beginning as they're needed for the DMA API. > > > > > > Allocating 3 vq groups as net is the device that need the most groups: > > > * Dataplane (guest passthrough) > > > * CVQ > > > * Shadowed vrings. > > > > > > Future versions of the series can include dynamic allocation of the > > > groups array so VDUSE can declare more groups. > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > > --- > > > v2: > > > * Now the vq group is in vduse_vq_config struct instead of issuing one > > > VDUSE message per vq. > > > > > > v1: > > > * Fix: Remove BIT_ULL(VIRTIO_S_*), as _S_ is already the bit (Maxime) > > > > > > RFC v3: > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower > > > value to reduce memory consumption, but vqs are already limited to > > > that value and userspace VDUSE is able to allocate that many vqs. > > > * Remove the descs vq group capability as it will not be used and we can > > > add it on top. > > > * Do not ask for vq groups in number of vq groups < 2. > > > * Move the valid vq groups range check to vduse_validate_config. > > > > > > RFC v2: > > > * Cache group information in kernel, as we need to provide the vq map > > > tokens properly. > > > * Add descs vq group to optimize SVQ forwarding and support indirect > > > descriptors out of the box. > > > --- > > > drivers/vdpa/vdpa_user/vduse_dev.c | 37 ++++++++++++++++++++++++++---- > > > include/uapi/linux/vduse.h | 12 +++++++--- > > > 2 files changed, 41 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > > index 2b6a8958ffe0..42f8807911d4 100644 > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > @@ -59,6 +59,7 @@ struct vduse_virtqueue { > > > struct vdpa_vq_state state; > > > bool ready; > > > bool kicked; > > > + u32 vq_group; > > > spinlock_t kick_lock; > > > spinlock_t irq_lock; > > > struct eventfd_ctx *kickfd; > > > @@ -115,6 +116,7 @@ struct vduse_dev { > > > u8 status; > > > u32 vq_num; > > > u32 vq_align; > > > + u32 ngroups; > > > struct vduse_umem *umem; > > > struct mutex mem_lock; > > > unsigned int bounce_size; > > > @@ -593,6 +595,13 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > > return 0; > > > } > > > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > > +{ > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > + > > > + return dev->vqs[idx]->vq_group; > > > > I wonder if we should fail if VDUSE_VQ_SETUP is not set by userspace? > > > > I'm kind of ok with implementing it, but I see it as redundant as if > the VDUSE device does not call the vq max size will be 0 anyway. Is this better to fail (I meant return int instead of a u32, probably require changes in the vdpa core)? > > > > +} > > > + > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > > struct vdpa_vq_state *state) > > > { > > > @@ -790,6 +799,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { > > > .set_vq_cb = vduse_vdpa_set_vq_cb, > > > .set_vq_num = vduse_vdpa_set_vq_num, > > > .get_vq_size = vduse_vdpa_get_vq_size, > > > + .get_vq_group = vduse_get_vq_group, > > > .set_vq_ready = vduse_vdpa_set_vq_ready, > > > .get_vq_ready = vduse_vdpa_get_vq_ready, > > > .set_vq_state = vduse_vdpa_set_vq_state, > > > @@ -1253,12 +1263,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > if (config.index >= dev->vq_num) > > > break; > > > > > > - if (!is_mem_zero((const char *)config.reserved, > > > - sizeof(config.reserved))) > > > + if (dev->api_version < VDUSE_API_VERSION_1 && config.group) > > > + break; > > > + > > > + if (dev->api_version >= VDUSE_API_VERSION_1 && > > > + config.group > dev->ngroups) > > > + break; > > > + > > > + if (config.reserved1 || > > > + !is_mem_zero((const char *)config.reserved2, > > > + sizeof(config.reserved2))) > > > break; > > > > What's the reason for having this check? I mean if we don't check this > > since the day0, it might be too late to do that. > > > > It's been checked from day0, both reserved1 and reserved2 were > included in the config.reserved before the patch, and the replaced > lines already checked with is_mem_zero [1]. Ok. I see. > > > > > > > index = array_index_nospec(config.index, dev->vq_num); > > > dev->vqs[index]->num_max = config.max_size; > > > + dev->vqs[index]->vq_group = config.group; > > > ret = 0; > > > break; > > > } > > > @@ -1738,12 +1757,19 @@ static bool features_is_valid(struct vduse_dev_config *config) > > > return true; > > > } > > > > > > -static bool vduse_validate_config(struct vduse_dev_config *config) > > > +static bool vduse_validate_config(struct vduse_dev_config *config, > > > + u64 api_version) > > > { > > > if (!is_mem_zero((const char *)config->reserved, > > > sizeof(config->reserved))) > > > return false; > > > > > > + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) > > > + return false; > > > > Better with ngroups > 1? > > > > Same here. The kernel in the master branch already returns false if > that position of "reserved" is 1, and we are changing the behavior if > we don't include the 1 too. > > > > + > > > + if (api_version >= VDUSE_API_VERSION_1 && config->ngroups > 0xffff) > > > + return false; > > > + > > > if (config->vq_align > PAGE_SIZE) > > > return false; > > > > > > @@ -1859,6 +1885,7 @@ static int vduse_create_dev(struct vduse_dev_config *config, > > > dev->device_features = config->features; > > > dev->device_id = config->device_id; > > > dev->vendor_id = config->vendor_id; > > > + dev->ngroups = (dev->api_version < 1) ? 1 : (config->ngroups ?: 1); > > > dev->name = kstrdup(config->name, GFP_KERNEL); > > > if (!dev->name) > > > goto err_str; > > > @@ -1937,7 +1964,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, > > > break; > > > > > > ret = -EINVAL; > > > - if (vduse_validate_config(&config) == false) > > > + if (!vduse_validate_config(&config, control->api_version)) > > > break; > > > > > > buf = vmemdup_user(argp + size, config.config_size); > > > @@ -2018,7 +2045,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) > > > > > > vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, > > > &vduse_vdpa_config_ops, &vduse_map_ops, > > > - 1, 1, name, true); > > > + dev->ngroups, 1, name, true); > > > if (IS_ERR(vdev)) > > > return PTR_ERR(vdev); > > > > > > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h > > > index ccb92a1efce0..a3d51cf6df3a 100644 > > > --- a/include/uapi/linux/vduse.h > > > +++ b/include/uapi/linux/vduse.h > > > @@ -31,6 +31,7 @@ > > > * @features: virtio features > > > * @vq_num: the number of virtqueues > > > * @vq_align: the allocation alignment of virtqueue's metadata > > > + * @ngroups: number of vq groups that VDUSE device declares > > > * @reserved: for future use, needs to be initialized to zero > > > * @config_size: the size of the configuration space > > > * @config: the buffer of the configuration space > > > @@ -45,7 +46,8 @@ struct vduse_dev_config { > > > __u64 features; > > > __u32 vq_num; > > > __u32 vq_align; > > > - __u32 reserved[13]; > > > + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ > > > + __u32 reserved[12]; > > > __u32 config_size; > > > __u8 config[]; > > > }; > > > @@ -122,14 +124,18 @@ struct vduse_config_data { > > > * struct vduse_vq_config - basic configuration of a virtqueue > > > * @index: virtqueue index > > > * @max_size: the max size of virtqueue > > > - * @reserved: for future use, needs to be initialized to zero > > > + * @reserved1: for future use, needs to be initialized to zero > > > + * @group: virtqueue group > > > + * @reserved2: for future use, needs to be initialized to zero > > > * > > > * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. > > > */ > > > struct vduse_vq_config { > > > __u32 index; > > > __u16 max_size; > > > - __u16 reserved[13]; > > > + __u16 reserved1; > > > + __u32 group; > > > > This makes me think if u16 is sufficient as I see: > > > > u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); > > > > The index is u16 but the group is u32 ... > > > > That's a very good point, but I got the reverse logic actually :). All > the vhost userland ioctls use an unsigned int (==u32) already, so > VDUSE should keep going with that. If any, get_vq_group (and similar) > kernel internal API should move to u32 to keep up with userland API. Ok, I see this: struct vhost_vring_state { unsigned int index; unsigned int num; }; It is just a little bit weird that we leave a hole. Thanks > > [1] https://patchew.org/linux/20250606115012.1331551-1-eperezma@redhat.com/20250606115012.1331551-4-eperezma@redhat.com/ >
On Thu, Sep 18, 2025 at 8:00 AM Jason Wang <jasowang@redhat.com> wrote: > > On Wed, Sep 17, 2025 at 11:42 PM Eugenio Perez Martin > <eperezma@redhat.com> wrote: > > > > On Wed, Sep 17, 2025 at 10:37 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Tue, Sep 16, 2025 at 9:08 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > > > > > This allows sepparate the different virtqueues in groups that shares the > > > > same address space. Asking the VDUSE device for the groups of the vq at > > > > the beginning as they're needed for the DMA API. > > > > > > > > Allocating 3 vq groups as net is the device that need the most groups: > > > > * Dataplane (guest passthrough) > > > > * CVQ > > > > * Shadowed vrings. > > > > > > > > Future versions of the series can include dynamic allocation of the > > > > groups array so VDUSE can declare more groups. > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > > > --- > > > > v2: > > > > * Now the vq group is in vduse_vq_config struct instead of issuing one > > > > VDUSE message per vq. > > > > > > > > v1: > > > > * Fix: Remove BIT_ULL(VIRTIO_S_*), as _S_ is already the bit (Maxime) > > > > > > > > RFC v3: > > > > * Increase VDUSE_MAX_VQ_GROUPS to 0xffff (Jason). It was set to a lower > > > > value to reduce memory consumption, but vqs are already limited to > > > > that value and userspace VDUSE is able to allocate that many vqs. > > > > * Remove the descs vq group capability as it will not be used and we can > > > > add it on top. > > > > * Do not ask for vq groups in number of vq groups < 2. > > > > * Move the valid vq groups range check to vduse_validate_config. > > > > > > > > RFC v2: > > > > * Cache group information in kernel, as we need to provide the vq map > > > > tokens properly. > > > > * Add descs vq group to optimize SVQ forwarding and support indirect > > > > descriptors out of the box. > > > > --- > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 37 ++++++++++++++++++++++++++---- > > > > include/uapi/linux/vduse.h | 12 +++++++--- > > > > 2 files changed, 41 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > index 2b6a8958ffe0..42f8807911d4 100644 > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > @@ -59,6 +59,7 @@ struct vduse_virtqueue { > > > > struct vdpa_vq_state state; > > > > bool ready; > > > > bool kicked; > > > > + u32 vq_group; > > > > spinlock_t kick_lock; > > > > spinlock_t irq_lock; > > > > struct eventfd_ctx *kickfd; > > > > @@ -115,6 +116,7 @@ struct vduse_dev { > > > > u8 status; > > > > u32 vq_num; > > > > u32 vq_align; > > > > + u32 ngroups; > > > > struct vduse_umem *umem; > > > > struct mutex mem_lock; > > > > unsigned int bounce_size; > > > > @@ -593,6 +595,13 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > return 0; > > > > } > > > > > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > > > +{ > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > + > > > > + return dev->vqs[idx]->vq_group; > > > > > > I wonder if we should fail if VDUSE_VQ_SETUP is not set by userspace? > > > > > > > I'm kind of ok with implementing it, but I see it as redundant as if > > the VDUSE device does not call the vq max size will be 0 anyway. > > Is this better to fail (I meant return int instead of a u32, probably > require changes in the vdpa core)? > Sending a new series with this. > > > > > > +} > > > > + > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > struct vdpa_vq_state *state) > > > > { > > > > @@ -790,6 +799,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { > > > > .set_vq_cb = vduse_vdpa_set_vq_cb, > > > > .set_vq_num = vduse_vdpa_set_vq_num, > > > > .get_vq_size = vduse_vdpa_get_vq_size, > > > > + .get_vq_group = vduse_get_vq_group, > > > > .set_vq_ready = vduse_vdpa_set_vq_ready, > > > > .get_vq_ready = vduse_vdpa_get_vq_ready, > > > > .set_vq_state = vduse_vdpa_set_vq_state, > > > > @@ -1253,12 +1263,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > > if (config.index >= dev->vq_num) > > > > break; > > > > > > > > - if (!is_mem_zero((const char *)config.reserved, > > > > - sizeof(config.reserved))) > > > > + if (dev->api_version < VDUSE_API_VERSION_1 && config.group) > > > > + break; > > > > + > > > > + if (dev->api_version >= VDUSE_API_VERSION_1 && > > > > + config.group > dev->ngroups) > > > > + break; > > > > + > > > > + if (config.reserved1 || > > > > + !is_mem_zero((const char *)config.reserved2, > > > > + sizeof(config.reserved2))) > > > > break; > > > > > > What's the reason for having this check? I mean if we don't check this > > > since the day0, it might be too late to do that. > > > > > > > It's been checked from day0, both reserved1 and reserved2 were > > included in the config.reserved before the patch, and the replaced > > lines already checked with is_mem_zero [1]. > > Ok. I see. > > > > > > > > > > > index = array_index_nospec(config.index, dev->vq_num); > > > > dev->vqs[index]->num_max = config.max_size; > > > > + dev->vqs[index]->vq_group = config.group; > > > > ret = 0; > > > > break; > > > > } > > > > @@ -1738,12 +1757,19 @@ static bool features_is_valid(struct vduse_dev_config *config) > > > > return true; > > > > } > > > > > > > > -static bool vduse_validate_config(struct vduse_dev_config *config) > > > > +static bool vduse_validate_config(struct vduse_dev_config *config, > > > > + u64 api_version) > > > > { > > > > if (!is_mem_zero((const char *)config->reserved, > > > > sizeof(config->reserved))) > > > > return false; > > > > > > > > + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) > > > > + return false; > > > > > > Better with ngroups > 1? > > > > > > > Same here. The kernel in the master branch already returns false if > > that position of "reserved" is 1, and we are changing the behavior if > > we don't include the 1 too. > > > > > > + > > > > + if (api_version >= VDUSE_API_VERSION_1 && config->ngroups > 0xffff) > > > > + return false; > > > > + > > > > if (config->vq_align > PAGE_SIZE) > > > > return false; > > > > > > > > @@ -1859,6 +1885,7 @@ static int vduse_create_dev(struct vduse_dev_config *config, > > > > dev->device_features = config->features; > > > > dev->device_id = config->device_id; > > > > dev->vendor_id = config->vendor_id; > > > > + dev->ngroups = (dev->api_version < 1) ? 1 : (config->ngroups ?: 1); > > > > dev->name = kstrdup(config->name, GFP_KERNEL); > > > > if (!dev->name) > > > > goto err_str; > > > > @@ -1937,7 +1964,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, > > > > break; > > > > > > > > ret = -EINVAL; > > > > - if (vduse_validate_config(&config) == false) > > > > + if (!vduse_validate_config(&config, control->api_version)) > > > > break; > > > > > > > > buf = vmemdup_user(argp + size, config.config_size); > > > > @@ -2018,7 +2045,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) > > > > > > > > vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, > > > > &vduse_vdpa_config_ops, &vduse_map_ops, > > > > - 1, 1, name, true); > > > > + dev->ngroups, 1, name, true); > > > > if (IS_ERR(vdev)) > > > > return PTR_ERR(vdev); > > > > > > > > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h > > > > index ccb92a1efce0..a3d51cf6df3a 100644 > > > > --- a/include/uapi/linux/vduse.h > > > > +++ b/include/uapi/linux/vduse.h > > > > @@ -31,6 +31,7 @@ > > > > * @features: virtio features > > > > * @vq_num: the number of virtqueues > > > > * @vq_align: the allocation alignment of virtqueue's metadata > > > > + * @ngroups: number of vq groups that VDUSE device declares > > > > * @reserved: for future use, needs to be initialized to zero > > > > * @config_size: the size of the configuration space > > > > * @config: the buffer of the configuration space > > > > @@ -45,7 +46,8 @@ struct vduse_dev_config { > > > > __u64 features; > > > > __u32 vq_num; > > > > __u32 vq_align; > > > > - __u32 reserved[13]; > > > > + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ > > > > + __u32 reserved[12]; > > > > __u32 config_size; > > > > __u8 config[]; > > > > }; > > > > @@ -122,14 +124,18 @@ struct vduse_config_data { > > > > * struct vduse_vq_config - basic configuration of a virtqueue > > > > * @index: virtqueue index > > > > * @max_size: the max size of virtqueue > > > > - * @reserved: for future use, needs to be initialized to zero > > > > + * @reserved1: for future use, needs to be initialized to zero > > > > + * @group: virtqueue group > > > > + * @reserved2: for future use, needs to be initialized to zero > > > > * > > > > * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. > > > > */ > > > > struct vduse_vq_config { > > > > __u32 index; > > > > __u16 max_size; > > > > - __u16 reserved[13]; > > > > + __u16 reserved1; > > > > + __u32 group; > > > > > > This makes me think if u16 is sufficient as I see: > > > > > > u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); > > > > > > The index is u16 but the group is u32 ... > > > > > > > That's a very good point, but I got the reverse logic actually :). All > > the vhost userland ioctls use an unsigned int (==u32) already, so > > VDUSE should keep going with that. If any, get_vq_group (and similar) > > kernel internal API should move to u32 to keep up with userland API. > > Ok, I see this: > > struct vhost_vring_state { > unsigned int index; > unsigned int num; > }; > > It is just a little bit weird that we leave a hole. > I don't like the hole in the struct either, but I find it cosmetic compared to the drawbacks of the alternatives. I find it even more cosmetic seeing that 6.17 is approaching :).
© 2016 - 2025 Red Hat, Inc.