This allows sepparate the different virtqueues in groups that shares the
same address space. Asking the VDUSE device for the groups of the vq at
the beginning as they're needed for the DMA API.
Allocating 3 vq groups as net is the device that need the most groups:
* Dataplane (guest passthrough)
* CVQ
* Shadowed vrings.
Future versions of the series can include dynamic allocation of the
groups array so VDUSE can declare more groups.
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
v2:
* Cache group information in kernel, as we need to provide the vq map
tokens properly.
* Add descs vq group to optimize SVQ forwarding and support indirect
descriptors out of the box.
---
drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++-
include/uapi/linux/vduse.h | 19 +++++++-
2 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index d858c4389cc1..d1f6d00a9c71 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -46,6 +46,11 @@
#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30
+/*
+ * Let's make it 3 for simplicity.
+ */
+#define VDUSE_MAX_VQ_GROUPS 3
+
#define IRQ_UNBOUND -1
struct vduse_virtqueue {
@@ -58,6 +63,8 @@ struct vduse_virtqueue {
struct vdpa_vq_state state;
bool ready;
bool kicked;
+ u32 vq_group;
+ u32 vq_desc_group;
spinlock_t kick_lock;
spinlock_t irq_lock;
struct eventfd_ctx *kickfd;
@@ -114,6 +121,7 @@ struct vduse_dev {
u8 status;
u32 vq_num;
u32 vq_align;
+ u32 ngroups;
struct vduse_umem *umem;
struct mutex mem_lock;
unsigned int bounce_size;
@@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
return 0;
}
+static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ return dev->vqs[idx]->vq_group;
+}
+
+static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ return dev->vqs[idx]->vq_desc_group;
+}
+
static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
struct vdpa_vq_state *state)
{
@@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
return dev->status;
}
+static int vduse_fill_vq_groups(struct vduse_dev *dev)
+{
+ if (dev->api_version < VDUSE_API_VERSION_1)
+ return 0;
+
+ for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) {
+ struct vduse_dev_msg msg = { 0 };
+ int ret;
+
+ msg.req.type = VDUSE_GET_VQ_GROUP;
+ msg.req.vq_group.index = i;
+ ret = vduse_dev_msg_sync(dev, &msg);
+ if (ret)
+ return ret;
+
+ dev->vqs[i]->vq_group = msg.resp.vq_group.num;
+
+ msg.req.type = VDUSE_GET_VRING_DESC_GROUP;
+ ret = vduse_dev_msg_sync(dev, &msg);
+ if (ret)
+ return ret;
+
+ dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num;
+ }
+
+ return 0;
+}
+
static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+ u8 previous_status = dev->status;
if (vduse_dev_set_status(dev, status))
return;
+ if ((dev->status ^ previous_status) &
+ BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) &&
+ status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK))
+ if (vduse_fill_vq_groups(dev))
+ return;
+
dev->status = status;
}
@@ -789,6 +846,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.set_vq_cb = vduse_vdpa_set_vq_cb,
.set_vq_num = vduse_vdpa_set_vq_num,
.get_vq_size = vduse_vdpa_get_vq_size,
+ .get_vq_group = vduse_get_vq_group,
+ .get_vq_desc_group = vduse_get_vq_desc_group,
.set_vq_ready = vduse_vdpa_set_vq_ready,
.get_vq_ready = vduse_vdpa_get_vq_ready,
.set_vq_state = vduse_vdpa_set_vq_state,
@@ -1856,6 +1915,16 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->device_features = config->features;
dev->device_id = config->device_id;
dev->vendor_id = config->vendor_id;
+ if (dev->api_version >= 1) {
+ if (config->ngroups > VDUSE_MAX_VQ_GROUPS) {
+ pr_err("Not creating a VDUSE device with %u vq groups. Max: %u",
+ config->ngroups, VDUSE_MAX_VQ_GROUPS);
+ goto err;
+ }
+ dev->ngroups = config->ngroups ?: 1;
+ } else {
+ dev->ngroups = 1;
+ }
dev->name = kstrdup(config->name, GFP_KERNEL);
if (!dev->name)
goto err_str;
@@ -2016,7 +2085,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
&vduse_vdpa_config_ops, &vduse_map_ops,
- 1, 1, name, true);
+ dev->ngroups, 1, name, true);
if (IS_ERR(vdev))
return PTR_ERR(vdev);
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 9a56d0416bfe..b4b139dc76bb 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -31,6 +31,7 @@
* @features: virtio features
* @vq_num: the number of virtqueues
* @vq_align: the allocation alignment of virtqueue's metadata
+ * @ngroups: number of vq groups that VDUSE device declares
* @reserved: for future use, needs to be initialized to zero
* @config_size: the size of the configuration space
* @config: the buffer of the configuration space
@@ -45,7 +46,8 @@ struct vduse_dev_config {
__u64 features;
__u32 vq_num;
__u32 vq_align;
- __u32 reserved[13];
+ __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
+ __u32 reserved[12];
__u32 config_size;
__u8 config[];
};
@@ -160,6 +162,16 @@ struct vduse_vq_state_packed {
__u16 last_used_idx;
};
+/**
+ * struct vduse_vq_group - virtqueue group
+ * @num: Index of the virtqueue group
+ * @num: Group
+ */
+struct vduse_vq_group {
+ __u32 index;
+ __u32 num;
+};
+
/**
* struct vduse_vq_info - information of a virtqueue
* @index: virtqueue index
@@ -182,6 +194,7 @@ struct vduse_vq_info {
union {
struct vduse_vq_state_split split;
struct vduse_vq_state_packed packed;
+ struct vduse_vq_group group;
};
__u8 ready;
};
@@ -274,6 +287,8 @@ enum vduse_req_type {
VDUSE_GET_VQ_STATE,
VDUSE_SET_STATUS,
VDUSE_UPDATE_IOTLB,
+ VDUSE_GET_VQ_GROUP,
+ VDUSE_GET_VRING_DESC_GROUP,
};
/**
@@ -328,6 +343,7 @@ struct vduse_dev_request {
struct vduse_vq_state vq_state;
struct vduse_dev_status s;
struct vduse_iova_range iova;
+ struct vduse_vq_group vq_group; /* Only if vduse api version >= 1 */
__u32 padding[32];
};
};
@@ -350,6 +366,7 @@ struct vduse_dev_response {
__u32 reserved[4];
union {
struct vduse_vq_state vq_state;
+ struct vduse_vq_group vq_group; /* Only if vduse api version >= 1 */
__u32 padding[32];
};
};
--
2.50.1
On Thu, Aug 7, 2025 at 7:58 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > This allows sepparate the different virtqueues in groups that shares the > same address space. Asking the VDUSE device for the groups of the vq at > the beginning as they're needed for the DMA API. > > Allocating 3 vq groups as net is the device that need the most groups: > * Dataplane (guest passthrough) > * CVQ > * Shadowed vrings. > > Future versions of the series can include dynamic allocation of the > groups array so VDUSE can declare more groups. > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > --- > v2: > * Cache group information in kernel, as we need to provide the vq map > tokens properly. > * Add descs vq group to optimize SVQ forwarding and support indirect > descriptors out of the box. > --- > drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++- > include/uapi/linux/vduse.h | 19 +++++++- > 2 files changed, 88 insertions(+), 2 deletions(-) > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > index d858c4389cc1..d1f6d00a9c71 100644 > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > @@ -46,6 +46,11 @@ > #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) > #define VDUSE_MSG_DEFAULT_TIMEOUT 30 > > +/* > + * Let's make it 3 for simplicity. > + */ > +#define VDUSE_MAX_VQ_GROUPS 3 I think we can release this to something like 64. Otherwise we might bump the version again just to increase the limitation? Or having a sysfs entry like bounce_size? > + > #define IRQ_UNBOUND -1 > > struct vduse_virtqueue { > @@ -58,6 +63,8 @@ struct vduse_virtqueue { > struct vdpa_vq_state state; > bool ready; > bool kicked; > + u32 vq_group; > + u32 vq_desc_group; > spinlock_t kick_lock; > spinlock_t irq_lock; > struct eventfd_ctx *kickfd; > @@ -114,6 +121,7 @@ struct vduse_dev { > u8 status; > u32 vq_num; > u32 vq_align; > + u32 ngroups; > struct vduse_umem *umem; > struct mutex mem_lock; > unsigned int bounce_size; > @@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > return 0; > } > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > +{ > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > + > + return dev->vqs[idx]->vq_group; > +} > + > +static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx) > +{ > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > + > + return dev->vqs[idx]->vq_desc_group; > +} > + > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > struct vdpa_vq_state *state) > { > @@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) > return dev->status; > } > > +static int vduse_fill_vq_groups(struct vduse_dev *dev) > +{ > + if (dev->api_version < VDUSE_API_VERSION_1) > + return 0; > + > + for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) { > + struct vduse_dev_msg msg = { 0 }; > + int ret; > + > + msg.req.type = VDUSE_GET_VQ_GROUP; > + msg.req.vq_group.index = i; > + ret = vduse_dev_msg_sync(dev, &msg); I fail to understand why the default group mapping is not done during device creation. > + if (ret) > + return ret; > + > + dev->vqs[i]->vq_group = msg.resp.vq_group.num; > + > + msg.req.type = VDUSE_GET_VRING_DESC_GROUP; > + ret = vduse_dev_msg_sync(dev, &msg); > + if (ret) > + return ret; > + > + dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num; > + } > + > + return 0; > +} > + > static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) > { > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > + u8 previous_status = dev->status; > > if (vduse_dev_set_status(dev, status)) > return; > > + if ((dev->status ^ previous_status) & > + BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) && > + status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK)) > + if (vduse_fill_vq_groups(dev)) Can we merge the two messages into a single one? Or can we use a shared memory for storing such mapping? For example, if we have 256 queues it would be very slow. Thanks
On Mon, Aug 11, 2025 at 5:10 AM Jason Wang <jasowang@redhat.com> wrote: > > On Thu, Aug 7, 2025 at 7:58 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > This allows sepparate the different virtqueues in groups that shares the > > same address space. Asking the VDUSE device for the groups of the vq at > > the beginning as they're needed for the DMA API. > > > > Allocating 3 vq groups as net is the device that need the most groups: > > * Dataplane (guest passthrough) > > * CVQ > > * Shadowed vrings. > > > > Future versions of the series can include dynamic allocation of the > > groups array so VDUSE can declare more groups. > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > --- > > v2: > > * Cache group information in kernel, as we need to provide the vq map > > tokens properly. > > * Add descs vq group to optimize SVQ forwarding and support indirect > > descriptors out of the box. > > --- > > drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++- > > include/uapi/linux/vduse.h | 19 +++++++- > > 2 files changed, 88 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > index d858c4389cc1..d1f6d00a9c71 100644 > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > @@ -46,6 +46,11 @@ > > #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) > > #define VDUSE_MSG_DEFAULT_TIMEOUT 30 > > > > +/* > > + * Let's make it 3 for simplicity. > > + */ > > +#define VDUSE_MAX_VQ_GROUPS 3 > > I think we can release this to something like 64. Otherwise we might > bump the version again just to increase the limitation? Or having a > sysfs entry like bounce_size? > I think it should not be linked to the version, but it is true there is no way for VDUSE devices to check the maximum VQ groups / ASID that the kernel supports. To handle as bounce_size seems the best option, good point. I'll send a new version with that! > > + > > #define IRQ_UNBOUND -1 > > > > struct vduse_virtqueue { > > @@ -58,6 +63,8 @@ struct vduse_virtqueue { > > struct vdpa_vq_state state; > > bool ready; > > bool kicked; > > + u32 vq_group; > > + u32 vq_desc_group; > > spinlock_t kick_lock; > > spinlock_t irq_lock; > > struct eventfd_ctx *kickfd; > > @@ -114,6 +121,7 @@ struct vduse_dev { > > u8 status; > > u32 vq_num; > > u32 vq_align; > > + u32 ngroups; > > struct vduse_umem *umem; > > struct mutex mem_lock; > > unsigned int bounce_size; > > @@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > return 0; > > } > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > +{ > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > + > > + return dev->vqs[idx]->vq_group; > > +} > > + > > +static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx) > > +{ > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > + > > + return dev->vqs[idx]->vq_desc_group; > > +} > > + > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > struct vdpa_vq_state *state) > > { > > @@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) > > return dev->status; > > } > > > > +static int vduse_fill_vq_groups(struct vduse_dev *dev) > > +{ > > + if (dev->api_version < VDUSE_API_VERSION_1) > > + return 0; > > + > > + for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) { > > + struct vduse_dev_msg msg = { 0 }; > > + int ret; > > + > > + msg.req.type = VDUSE_GET_VQ_GROUP; > > + msg.req.vq_group.index = i; > > + ret = vduse_dev_msg_sync(dev, &msg); > > I fail to understand why the default group mapping is not done during > device creation. > Because it changes depending on the features. If a new device has 5 virtqueues and the device wants to isolate the CVQ, the CVQ position depends on the features that the guest's acks: * If MQ is acked the isolated vq is #5 * If MQ is not acked the isolated vq is #3. > > + if (ret) > > + return ret; > > + > > + dev->vqs[i]->vq_group = msg.resp.vq_group.num; > > + > > + msg.req.type = VDUSE_GET_VRING_DESC_GROUP; > > + ret = vduse_dev_msg_sync(dev, &msg); > > + if (ret) > > + return ret; > > + > > + dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num; > > + } > > + > > + return 0; > > +} > > + > > static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) > > { > > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > + u8 previous_status = dev->status; > > > > if (vduse_dev_set_status(dev, status)) > > return; > > > > + if ((dev->status ^ previous_status) & > > + BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) && > > + status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK)) > > + if (vduse_fill_vq_groups(dev)) > > Can we merge the two messages into a single one? Or can we use a > shared memory for storing such mapping? > > For example, if we have 256 queues it would be very slow. > To merge it in the same message is good to me, sure. To make it a table in shm seems more complicated, unless we accept a void * in the reply and VDUSE uses copy_from_user. If that is ok here, then sure.
On Mon, Aug 11, 2025 at 5:52 PM Eugenio Perez Martin <eperezma@redhat.com> wrote: > > On Mon, Aug 11, 2025 at 5:10 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Thu, Aug 7, 2025 at 7:58 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > > > This allows sepparate the different virtqueues in groups that shares the > > > same address space. Asking the VDUSE device for the groups of the vq at > > > the beginning as they're needed for the DMA API. > > > > > > Allocating 3 vq groups as net is the device that need the most groups: > > > * Dataplane (guest passthrough) > > > * CVQ > > > * Shadowed vrings. > > > > > > Future versions of the series can include dynamic allocation of the > > > groups array so VDUSE can declare more groups. > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > > --- > > > v2: > > > * Cache group information in kernel, as we need to provide the vq map > > > tokens properly. > > > * Add descs vq group to optimize SVQ forwarding and support indirect > > > descriptors out of the box. > > > --- > > > drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++- > > > include/uapi/linux/vduse.h | 19 +++++++- > > > 2 files changed, 88 insertions(+), 2 deletions(-) > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > > index d858c4389cc1..d1f6d00a9c71 100644 > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > @@ -46,6 +46,11 @@ > > > #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) > > > #define VDUSE_MSG_DEFAULT_TIMEOUT 30 > > > > > > +/* > > > + * Let's make it 3 for simplicity. > > > + */ > > > +#define VDUSE_MAX_VQ_GROUPS 3 > > > > I think we can release this to something like 64. Otherwise we might > > bump the version again just to increase the limitation? Or having a > > sysfs entry like bounce_size? > > > > I think it should not be linked to the version, but it is true there > is no way for VDUSE devices to check the maximum VQ groups / ASID that > the kernel supports. > > To handle as bounce_size seems the best option, good point. I'll send > a new version with that! > > > > + > > > #define IRQ_UNBOUND -1 > > > > > > struct vduse_virtqueue { > > > @@ -58,6 +63,8 @@ struct vduse_virtqueue { > > > struct vdpa_vq_state state; > > > bool ready; > > > bool kicked; > > > + u32 vq_group; > > > + u32 vq_desc_group; > > > spinlock_t kick_lock; > > > spinlock_t irq_lock; > > > struct eventfd_ctx *kickfd; > > > @@ -114,6 +121,7 @@ struct vduse_dev { > > > u8 status; > > > u32 vq_num; > > > u32 vq_align; > > > + u32 ngroups; > > > struct vduse_umem *umem; > > > struct mutex mem_lock; > > > unsigned int bounce_size; > > > @@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > > return 0; > > > } > > > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > > +{ > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > + > > > + return dev->vqs[idx]->vq_group; > > > +} > > > + > > > +static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx) > > > +{ > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > + > > > + return dev->vqs[idx]->vq_desc_group; > > > +} > > > + > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > > struct vdpa_vq_state *state) > > > { > > > @@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) > > > return dev->status; > > > } > > > > > > +static int vduse_fill_vq_groups(struct vduse_dev *dev) > > > +{ > > > + if (dev->api_version < VDUSE_API_VERSION_1) > > > + return 0; > > > + > > > + for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) { > > > + struct vduse_dev_msg msg = { 0 }; > > > + int ret; > > > + > > > + msg.req.type = VDUSE_GET_VQ_GROUP; > > > + msg.req.vq_group.index = i; > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > > I fail to understand why the default group mapping is not done during > > device creation. > > > > Because it changes depending on the features. > > If a new device has 5 virtqueues and the device wants to isolate the > CVQ, the CVQ position depends on the features that the guest's acks: > * If MQ is acked the isolated vq is #5 > * If MQ is not acked the isolated vq is #3. I see we are still damaged by the design of the cvq index. But this is just a static branch not a dynamic one. If we can find ways to make it static it would be better. > > > > + if (ret) > > > + return ret; > > > + > > > + dev->vqs[i]->vq_group = msg.resp.vq_group.num; > > > + > > > + msg.req.type = VDUSE_GET_VRING_DESC_GROUP; > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > + if (ret) > > > + return ret; > > > + > > > + dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) > > > { > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > + u8 previous_status = dev->status; > > > > > > if (vduse_dev_set_status(dev, status)) > > > return; > > > > > > + if ((dev->status ^ previous_status) & > > > + BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) && > > > + status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK)) > > > + if (vduse_fill_vq_groups(dev)) > > > > Can we merge the two messages into a single one? Or can we use a > > shared memory for storing such mapping? > > > > For example, if we have 256 queues it would be very slow. > > > > To merge it in the same message is good to me, sure. We can start from this if we can't find a way to provision vq to group mapping during device creation. > To make it a > table in shm seems more complicated, unless we accept a void * in the > reply and VDUSE uses copy_from_user. If that is ok here, then sure. > This looks tricky indeed. Thanks
On Tue, Aug 12, 2025 at 5:01 AM Jason Wang <jasowang@redhat.com> wrote: > > On Mon, Aug 11, 2025 at 5:52 PM Eugenio Perez Martin > <eperezma@redhat.com> wrote: > > > > On Mon, Aug 11, 2025 at 5:10 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Thu, Aug 7, 2025 at 7:58 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > > > > > This allows sepparate the different virtqueues in groups that shares the > > > > same address space. Asking the VDUSE device for the groups of the vq at > > > > the beginning as they're needed for the DMA API. > > > > > > > > Allocating 3 vq groups as net is the device that need the most groups: > > > > * Dataplane (guest passthrough) > > > > * CVQ > > > > * Shadowed vrings. > > > > > > > > Future versions of the series can include dynamic allocation of the > > > > groups array so VDUSE can declare more groups. > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > > > --- > > > > v2: > > > > * Cache group information in kernel, as we need to provide the vq map > > > > tokens properly. > > > > * Add descs vq group to optimize SVQ forwarding and support indirect > > > > descriptors out of the box. > > > > --- > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++- > > > > include/uapi/linux/vduse.h | 19 +++++++- > > > > 2 files changed, 88 insertions(+), 2 deletions(-) > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > index d858c4389cc1..d1f6d00a9c71 100644 > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > @@ -46,6 +46,11 @@ > > > > #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) > > > > #define VDUSE_MSG_DEFAULT_TIMEOUT 30 > > > > > > > > +/* > > > > + * Let's make it 3 for simplicity. > > > > + */ > > > > +#define VDUSE_MAX_VQ_GROUPS 3 > > > > > > I think we can release this to something like 64. Otherwise we might > > > bump the version again just to increase the limitation? Or having a > > > sysfs entry like bounce_size? > > > > > > > I think it should not be linked to the version, but it is true there > > is no way for VDUSE devices to check the maximum VQ groups / ASID that > > the kernel supports. > > > > To handle as bounce_size seems the best option, good point. I'll send > > a new version with that! > > > > > > + > > > > #define IRQ_UNBOUND -1 > > > > > > > > struct vduse_virtqueue { > > > > @@ -58,6 +63,8 @@ struct vduse_virtqueue { > > > > struct vdpa_vq_state state; > > > > bool ready; > > > > bool kicked; > > > > + u32 vq_group; > > > > + u32 vq_desc_group; > > > > spinlock_t kick_lock; > > > > spinlock_t irq_lock; > > > > struct eventfd_ctx *kickfd; > > > > @@ -114,6 +121,7 @@ struct vduse_dev { > > > > u8 status; > > > > u32 vq_num; > > > > u32 vq_align; > > > > + u32 ngroups; > > > > struct vduse_umem *umem; > > > > struct mutex mem_lock; > > > > unsigned int bounce_size; > > > > @@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > return 0; > > > > } > > > > > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > > > +{ > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > + > > > > + return dev->vqs[idx]->vq_group; > > > > +} > > > > + > > > > +static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx) > > > > +{ > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > + > > > > + return dev->vqs[idx]->vq_desc_group; > > > > +} > > > > + > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > struct vdpa_vq_state *state) > > > > { > > > > @@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) > > > > return dev->status; > > > > } > > > > > > > > +static int vduse_fill_vq_groups(struct vduse_dev *dev) > > > > +{ > > > > + if (dev->api_version < VDUSE_API_VERSION_1) > > > > + return 0; > > > > + > > > > + for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) { > > > > + struct vduse_dev_msg msg = { 0 }; > > > > + int ret; > > > > + > > > > + msg.req.type = VDUSE_GET_VQ_GROUP; > > > > + msg.req.vq_group.index = i; > > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > > > > I fail to understand why the default group mapping is not done during > > > device creation. > > > > > > > Because it changes depending on the features. > > > > If a new device has 5 virtqueues and the device wants to isolate the > > CVQ, the CVQ position depends on the features that the guest's acks: > > * If MQ is acked the isolated vq is #5 > > * If MQ is not acked the isolated vq is #3. > > I see we are still damaged by the design of the cvq index. But this is > just a static branch not a dynamic one. If we can find ways to make it > static it would be better. > > > > > > > + if (ret) > > > > + return ret; > > > > + > > > > + dev->vqs[i]->vq_group = msg.resp.vq_group.num; > > > > + > > > > + msg.req.type = VDUSE_GET_VRING_DESC_GROUP; > > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) > > > > { > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > + u8 previous_status = dev->status; > > > > > > > > if (vduse_dev_set_status(dev, status)) > > > > return; > > > > > > > > + if ((dev->status ^ previous_status) & > > > > + BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) && > > > > + status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK)) > > > > + if (vduse_fill_vq_groups(dev)) > > > > > > Can we merge the two messages into a single one? Or can we use a > > > shared memory for storing such mapping? > > > > > > For example, if we have 256 queues it would be very slow. > > > > > > > To merge it in the same message is good to me, sure. > > We can start from this if we can't find a way to provision vq to group > mapping during device creation. > Note that I don't think it is worth implementing these in this series, but to add them on top in another one. Because I don't think we will find devices with a lot of virtqueues for now. But here are some ideas to mitigate the startup time cost: 1) Support more than one virtqueue in the same vduse request / reply Something like: vduse_dev_response{ u32 req_id; u32 result; union { ... struct vduse_vq_group { u32 num_vqs_requested; struct { u32 vq_idx; u32 vq_group; } vqs[15]; } u32 padding[32]; } } Choosing 15 to fill the current size of vduse_dev_response struct. 2) Pointer chasing in the struct written Same as previous, but the vqs struct is actually a pointer in userspace. This way it can be arbitrarily big. vduse_dev_response{ u32 req_id; u32 result; union { ... struct vduse_vq_group { u32 num_vqs_requested; struct { u32 vq_idx; u32 vq_group; } *vqs; } u32 padding[32]; } } I cannot locate any use of this in write() data, but it is more or less common in ioctl. 3) Allow VQ_GROUP_BATCH_BEGIN and _END, similar to how memory map works in vhost_vdpa. As many vq_group response as needed in between. +) Assume that any vq not mentioned in the reply is vq group 0. > > To make it a > > table in shm seems more complicated, unless we accept a void * in the > > reply and VDUSE uses copy_from_user. If that is ok here, then sure. > > > > This looks tricky indeed. > > Thanks >
On Wed, Aug 13, 2025 at 6:12 PM Eugenio Perez Martin <eperezma@redhat.com> wrote: > > On Tue, Aug 12, 2025 at 5:01 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Mon, Aug 11, 2025 at 5:52 PM Eugenio Perez Martin > > <eperezma@redhat.com> wrote: > > > > > > On Mon, Aug 11, 2025 at 5:10 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > On Thu, Aug 7, 2025 at 7:58 PM Eugenio Pérez <eperezma@redhat.com> wrote: > > > > > > > > > > This allows sepparate the different virtqueues in groups that shares the > > > > > same address space. Asking the VDUSE device for the groups of the vq at > > > > > the beginning as they're needed for the DMA API. > > > > > > > > > > Allocating 3 vq groups as net is the device that need the most groups: > > > > > * Dataplane (guest passthrough) > > > > > * CVQ > > > > > * Shadowed vrings. > > > > > > > > > > Future versions of the series can include dynamic allocation of the > > > > > groups array so VDUSE can declare more groups. > > > > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > > > > --- > > > > > v2: > > > > > * Cache group information in kernel, as we need to provide the vq map > > > > > tokens properly. > > > > > * Add descs vq group to optimize SVQ forwarding and support indirect > > > > > descriptors out of the box. > > > > > --- > > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 71 +++++++++++++++++++++++++++++- > > > > > include/uapi/linux/vduse.h | 19 +++++++- > > > > > 2 files changed, 88 insertions(+), 2 deletions(-) > > > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > > index d858c4389cc1..d1f6d00a9c71 100644 > > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > > @@ -46,6 +46,11 @@ > > > > > #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) > > > > > #define VDUSE_MSG_DEFAULT_TIMEOUT 30 > > > > > > > > > > +/* > > > > > + * Let's make it 3 for simplicity. > > > > > + */ > > > > > +#define VDUSE_MAX_VQ_GROUPS 3 > > > > > > > > I think we can release this to something like 64. Otherwise we might > > > > bump the version again just to increase the limitation? Or having a > > > > sysfs entry like bounce_size? > > > > > > > > > > I think it should not be linked to the version, but it is true there > > > is no way for VDUSE devices to check the maximum VQ groups / ASID that > > > the kernel supports. > > > > > > To handle as bounce_size seems the best option, good point. I'll send > > > a new version with that! > > > > > > > > + > > > > > #define IRQ_UNBOUND -1 > > > > > > > > > > struct vduse_virtqueue { > > > > > @@ -58,6 +63,8 @@ struct vduse_virtqueue { > > > > > struct vdpa_vq_state state; > > > > > bool ready; > > > > > bool kicked; > > > > > + u32 vq_group; > > > > > + u32 vq_desc_group; > > > > > spinlock_t kick_lock; > > > > > spinlock_t irq_lock; > > > > > struct eventfd_ctx *kickfd; > > > > > @@ -114,6 +121,7 @@ struct vduse_dev { > > > > > u8 status; > > > > > u32 vq_num; > > > > > u32 vq_align; > > > > > + u32 ngroups; > > > > > struct vduse_umem *umem; > > > > > struct mutex mem_lock; > > > > > unsigned int bounce_size; > > > > > @@ -592,6 +600,20 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > > return 0; > > > > > } > > > > > > > > > > +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) > > > > > +{ > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > > + > > > > > + return dev->vqs[idx]->vq_group; > > > > > +} > > > > > + > > > > > +static u32 vduse_get_vq_desc_group(struct vdpa_device *vdpa, u16 idx) > > > > > +{ > > > > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > > + > > > > > + return dev->vqs[idx]->vq_desc_group; > > > > > +} > > > > > + > > > > > static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, > > > > > struct vdpa_vq_state *state) > > > > > { > > > > > @@ -678,13 +700,48 @@ static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) > > > > > return dev->status; > > > > > } > > > > > > > > > > +static int vduse_fill_vq_groups(struct vduse_dev *dev) > > > > > +{ > > > > > + if (dev->api_version < VDUSE_API_VERSION_1) > > > > > + return 0; > > > > > + > > > > > + for (int i = 0; i < dev->vdev->vdpa.nvqs; ++i) { > > > > > + struct vduse_dev_msg msg = { 0 }; > > > > > + int ret; > > > > > + > > > > > + msg.req.type = VDUSE_GET_VQ_GROUP; > > > > > + msg.req.vq_group.index = i; > > > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > > > > > > I fail to understand why the default group mapping is not done during > > > > device creation. > > > > > > > > > > Because it changes depending on the features. > > > > > > If a new device has 5 virtqueues and the device wants to isolate the > > > CVQ, the CVQ position depends on the features that the guest's acks: > > > * If MQ is acked the isolated vq is #5 > > > * If MQ is not acked the isolated vq is #3. > > > > I see we are still damaged by the design of the cvq index. But this is > > just a static branch not a dynamic one. If we can find ways to make it > > static it would be better. > > > > > > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + dev->vqs[i]->vq_group = msg.resp.vq_group.num; > > > > > + > > > > > + msg.req.type = VDUSE_GET_VRING_DESC_GROUP; > > > > > + ret = vduse_dev_msg_sync(dev, &msg); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + dev->vqs[i]->vq_desc_group = msg.resp.vq_group.num; > > > > > + } > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) > > > > > { > > > > > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > > > > + u8 previous_status = dev->status; > > > > > > > > > > if (vduse_dev_set_status(dev, status)) > > > > > return; > > > > > > > > > > + if ((dev->status ^ previous_status) & > > > > > + BIT_ULL(VIRTIO_CONFIG_S_FEATURES_OK) && > > > > > + status & (1ULL << VIRTIO_CONFIG_S_FEATURES_OK)) > > > > > + if (vduse_fill_vq_groups(dev)) > > > > > > > > Can we merge the two messages into a single one? Or can we use a > > > > shared memory for storing such mapping? > > > > > > > > For example, if we have 256 queues it would be very slow. > > > > > > > > > > To merge it in the same message is good to me, sure. > > > > We can start from this if we can't find a way to provision vq to group > > mapping during device creation. > > > > Note that I don't think it is worth implementing these in this series, > but to add them on top in another one. Because I don't think we will > find devices with a lot of virtqueues for now. But here are some ideas > to mitigate the startup time cost: > > 1) Support more than one virtqueue in the same vduse request / reply > Something like: > vduse_dev_response{ > u32 req_id; > u32 result; > union { > ... > struct vduse_vq_group { > u32 num_vqs_requested; > struct { > u32 vq_idx; > u32 vq_group; > } vqs[15]; > } > u32 padding[32]; > } > } > > Choosing 15 to fill the current size of vduse_dev_response struct. > > 2) Pointer chasing in the struct written > > Same as previous, but the vqs struct is actually a pointer in > userspace. This way it can be arbitrarily big. > > vduse_dev_response{ > u32 req_id; > u32 result; > union { > ... > struct vduse_vq_group { > u32 num_vqs_requested; > struct { > u32 vq_idx; > u32 vq_group; > } *vqs; > } > u32 padding[32]; > } > } > > I cannot locate any use of this in write() data, but it is more or > less common in ioctl. > > 3) Allow VQ_GROUP_BATCH_BEGIN and _END, similar to how memory map > works in vhost_vdpa. As many vq_group response as needed in between. > > +) Assume that any vq not mentioned in the reply is vq group 0. Or make VDUSE io_uring compatible. (Anyhow it looks like another topic). I'm fine if Michael and others are fine with starting with merging the message. It's worth trying to reduce the userspace/kernel interaction as much as possible. Thanks > > > > > To make it a > > > table in shm seems more complicated, unless we accept a void * in the > > > reply and VDUSE uses copy_from_user. If that is ok here, then sure. > > > > > > > This looks tricky indeed. > > > > Thanks > > >
On 8/7/25 1:57 PM, Eugenio Pérez wrote: > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h > index 9a56d0416bfe..b4b139dc76bb 100644 > --- a/include/uapi/linux/vduse.h > +++ b/include/uapi/linux/vduse.h > @@ -31,6 +31,7 @@ > * @features: virtio features > * @vq_num: the number of virtqueues > * @vq_align: the allocation alignment of virtqueue's metadata > + * @ngroups: number of vq groups that VDUSE device declares > * @reserved: for future use, needs to be initialized to zero > * @config_size: the size of the configuration space > * @config: the buffer of the configuration space > @@ -45,7 +46,8 @@ struct vduse_dev_config { > __u64 features; > __u32 vq_num; > __u32 vq_align; > - __u32 reserved[13]; > + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ > + __u32 reserved[12]; > __u32 config_size; > __u8 config[]; > }; > @@ -160,6 +162,16 @@ struct vduse_vq_state_packed { > __u16 last_used_idx; > }; > > +/** > + * struct vduse_vq_group - virtqueue group > + * @num: Index of the virtqueue group > + * @num: Group > + */ > +struct vduse_vq_group { > + __u32 index; > + __u32 num; > +}; > + > /** > * struct vduse_vq_info - information of a virtqueue > * @index: virtqueue index > @@ -182,6 +194,7 @@ struct vduse_vq_info { > union { > struct vduse_vq_state_split split; > struct vduse_vq_state_packed packed; > + struct vduse_vq_group group; > }; > __u8 ready; > }; > @@ -274,6 +287,8 @@ enum vduse_req_type { > VDUSE_GET_VQ_STATE, > VDUSE_SET_STATUS, > VDUSE_UPDATE_IOTLB, > + VDUSE_GET_VQ_GROUP, > + VDUSE_GET_VRING_DESC_GROUP, > }; > > /** > @@ -328,6 +343,7 @@ struct vduse_dev_request { > struct vduse_vq_state vq_state; > struct vduse_dev_status s; > struct vduse_iova_range iova; > + struct vduse_vq_group vq_group; /* Only if vduse api version >= 1 */ > __u32 padding[32]; > }; > }; > @@ -350,6 +366,7 @@ struct vduse_dev_response { > __u32 reserved[4]; > union { > struct vduse_vq_state vq_state; > + struct vduse_vq_group vq_group; /* Only if vduse api version >= 1 */ > __u32 padding[32]; > }; > }; Same comment as for patch 5, padding should be updated. Regards, Maxime
© 2016 - 2025 Red Hat, Inc.