Since we already support irq callback affinity
management, let's implement the set/get_vq_affinity
callbacks so that the virtio device driver can change
or query the affinity. This makes it possible for the
virtio-blk driver to build its blk-mq queues based on
the irq callback affinity.
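
For illustration only (not part of this patch), a minimal sketch of how a
virtio driver could reach these new callbacks through the generic virtio
API. The driver, its virtqueue array and the CPU-spreading policy below
are assumptions made up for the example; on top of virtio-vdpa + VDUSE,
virtqueue_set_affinity() would end up in vduse_vdpa_set_vq_affinity().

#include <linux/cpumask.h>
#include <linux/virtio_config.h>

/* Spread the irq callbacks of an assumed driver's virtqueues across the
 * online CPUs, one CPU per virtqueue, wrapping around at the end.
 */
static void example_spread_vq_irq_affinity(struct virtqueue **vqs, int nvqs)
{
	int i, cpu = cpumask_first(cpu_online_mask);

	for (i = 0; i < nvqs; i++) {
		virtqueue_set_affinity(vqs[i], cpumask_of(cpu));

		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}
}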
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 90c2896039d9..6507a78abc9d 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -739,6 +739,28 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
spin_unlock(&vq->irq_affinity_lock);
}
+static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
+ const struct cpumask *cpu_mask)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
+ vduse_vq_update_effective_cpu(dev->vqs[idx]);
+
+ return 0;
+}
+
+static const struct cpumask *
+vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ if (dev->vqs[idx]->irq_effective_cpu == -1)
+ return NULL;
+
+ return &dev->vqs[idx]->irq_affinity;
+}
+
static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
struct irq_affinity *desc)
{
@@ -807,6 +829,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.get_config = vduse_vdpa_get_config,
.set_config = vduse_vdpa_set_config,
.get_generation = vduse_vdpa_get_generation,
+ .set_vq_affinity = vduse_vdpa_set_vq_affinity,
+ .get_vq_affinity = vduse_vdpa_get_vq_affinity,
.set_irq_affinity = vduse_vdpa_set_irq_affinity,
.reset = vduse_vdpa_reset,
.set_map = vduse_vdpa_set_map,
--
2.20.1
Add a sysfs interface for each vduse virtqueue to
show the affinity and effective affinity of its irq
callback.

This interface can also be used to change the
effective affinity, which must be a subset of the
virtqueue's irq callback affinity mask. This might
be useful for performance tuning when the irq
callback affinity mask contains more than one CPU.
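
As a usage sketch (the /sys/class/vduse/<device>/vqN/ layout and the
device name "vduse0" are assumptions, not defined by this patch), a
userspace program could pin the effective affinity of vq0 to CPU 2 like
this. The value uses the hex cpumask format accepted by cpumask_parse(),
the same format as /proc/irq/<n>/smp_affinity; irq_cb_affinity can be
read the same way.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed sysfs layout: /sys/class/vduse/<device>/vqN/<attribute> */
	const char *attr =
		"/sys/class/vduse/vduse0/vq0/irq_cb_effective_affinity";
	const char *mask = "4";		/* hex cpumask 0x4 -> CPU 2 */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0)
		perror("write");
	close(fd);
	return 0;
}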
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
1 file changed, 137 insertions(+), 11 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 6507a78abc9d..c65f84100e30 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -61,6 +61,7 @@ struct vduse_virtqueue {
int irq_effective_cpu;
struct cpumask irq_affinity;
spinlock_t irq_affinity_lock;
+ struct kobject kobj;
};
struct vduse_dev;
@@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
};
+static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
+{
+ return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
+}
+
+static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
+ char *buf)
+{
+ struct cpumask all_mask = CPU_MASK_ALL;
+ const struct cpumask *mask = &all_mask;
+
+ if (vq->irq_effective_cpu != -1)
+ mask = get_cpu_mask(vq->irq_effective_cpu);
+
+ return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
+}
+
+static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
+ const char *buf, size_t count)
+{
+ cpumask_var_t new_value;
+ int ret;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, new_value);
+ if (ret)
+ goto free_mask;
+
+ ret = -EINVAL;
+ if (!cpumask_intersects(new_value, &vq->irq_affinity))
+ goto free_mask;
+
+ spin_lock(&vq->irq_affinity_lock);
+
+ if (vq->irq_effective_cpu != -1)
+ per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
+
+ vq->irq_effective_cpu = cpumask_first(new_value);
+ per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) += 1;
+
+ spin_unlock(&vq->irq_affinity_lock);
+ ret = count;
+
+free_mask:
+ free_cpumask_var(new_value);
+ return ret;
+}
+
+struct vq_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
+ ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
+ size_t count);
+};
+
+static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
+static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
+ __ATTR_RW(irq_cb_effective_affinity);
+
+static struct attribute *vq_attrs[] = {
+ &irq_cb_affinity_attr.attr,
+ &irq_cb_effective_affinity_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vq);
+
+static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ struct vq_sysfs_entry *entry = container_of(attr,
+ struct vq_sysfs_entry, attr);
+
+ if (!entry->show)
+ return -EIO;
+
+ return entry->show(vq, buf);
+}
+
+static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ struct vq_sysfs_entry *entry = container_of(attr,
+ struct vq_sysfs_entry, attr);
+
+ if (!entry->store)
+ return -EIO;
+
+ return entry->store(vq, buf, count);
+}
+
+static const struct sysfs_ops vq_sysfs_ops = {
+ .show = vq_attr_show,
+ .store = vq_attr_store,
+};
+
+static void vq_release(struct kobject *kobj)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ kfree(vq);
+}
+
+static struct kobj_type vq_type = {
+ .release = vq_release,
+ .sysfs_ops = &vq_sysfs_ops,
+ .default_groups = vq_groups,
+};
+
static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
{
int i;
@@ -1427,13 +1542,13 @@ static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
return;
for (i = 0; i < dev->vq_num; i++)
- kfree(dev->vqs[i]);
+ kobject_put(&dev->vqs[i]->kobj);
kfree(dev->vqs);
}
static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
{
- int i;
+ int ret, i;
dev->vq_align = vq_align;
dev->vq_num = vq_num;
@@ -1443,8 +1558,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
for (i = 0; i < vq_num; i++) {
dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
- if (!dev->vqs[i])
+ if (!dev->vqs[i]) {
+ ret = -ENOMEM;
goto err;
+ }
dev->vqs[i]->index = i;
dev->vqs[i]->irq_effective_cpu = -1;
@@ -1454,15 +1571,23 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
spin_lock_init(&dev->vqs[i]->irq_lock);
spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
cpumask_setall(&dev->vqs[i]->irq_affinity);
+
+ kobject_init(&dev->vqs[i]->kobj, &vq_type);
+ ret = kobject_add(&dev->vqs[i]->kobj,
+ &dev->dev->kobj, "vq%d", i);
+ if (ret) {
+ kfree(dev->vqs[i]);
+ goto err;
+ }
}
return 0;
err:
while (i--)
- kfree(dev->vqs[i]);
+ kobject_put(&dev->vqs[i]->kobj);
kfree(dev->vqs);
dev->vqs = NULL;
- return -ENOMEM;
+ return ret;
}
static struct vduse_dev *vduse_dev_create(void)
@@ -1637,10 +1762,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->config = config_buf;
dev->config_size = config->config_size;
- ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
- if (ret)
- goto err_vqs;
-
ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
if (ret < 0)
goto err_idr;
@@ -1654,14 +1775,19 @@ static int vduse_create_dev(struct vduse_dev_config *config,
ret = PTR_ERR(dev->dev);
goto err_dev;
}
+
+ ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
+ if (ret)
+ goto err_vqs;
+
__module_get(THIS_MODULE);
return 0;
+err_vqs:
+ device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
err_dev:
idr_remove(&vduse_idr, dev->minor);
err_idr:
- vduse_dev_deinit_vqs(dev);
-err_vqs:
vduse_domain_destroy(dev->domain);
err_domain:
kfree(dev->name);
--
2.20.1
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Add a sysfs interface for each vduse virtqueue to
> show the affinity and effective affinity of its irq
> callback.
>
[...]
>
> +static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
> +					       char *buf)
> +{
> +	struct cpumask all_mask = CPU_MASK_ALL;
> +	const struct cpumask *mask = &all_mask;
> +
> +	if (vq->irq_effective_cpu != -1)
> +		mask = get_cpu_mask(vq->irq_effective_cpu);

Shouldn't this be vq->irq_affinity?

[...]
>
> +	vq->irq_effective_cpu = cpumask_first(new_value);

Does this mean except for the first cpu, the rest of the mask is unused?

Thanks

On Fri, Dec 16, 2022 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
>
> > +	if (vq->irq_effective_cpu != -1)
> > +		mask = get_cpu_mask(vq->irq_effective_cpu);
>
> Shouldn't this be vq->irq_affinity?

This sysfs interface is provided for effective irq affinity rather
than irq affinity. We created another read-only sysfs interface for
irq affinity.

> > +	vq->irq_effective_cpu = cpumask_first(new_value);
>
> Does this mean except for the first cpu, the rest of the mask is unused?

Yes, but the user should always specify a mask that only contains one
CPU to make it work as expected. This sysfs interface is used to
specify the effective irq affinity rather than irq affinity.

Thanks,
Yongji

On Mon, Dec 19, 2022 at 1:16 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> Yes, but the user should always specify a mask that only contains one
> CPU to make it work as expected.

This doesn't seem to be the way that the IRQ affinity{hint} exported
via /sys work. Any reason for doing this?

(E.g we may have the require to limit the IRQ/callback to a NUMA node
instead of a specific cpu)

Thanks

On Tue, Dec 20, 2022 at 2:29 PM Jason Wang <jasowang@redhat.com> wrote:
>
> This doesn't seem to be the way that the IRQ affinity{hint} exported
> via /sys work. Any reason for doing this?
>
> (E.g we may have the require to limit the IRQ/callback to a NUMA node
> instead of a specific cpu)

Yes, I think we need to make the sysfs interface for irq affinity
writable. The effective irq affinity can be removed now since we
choose using round-robin to spread IRQs between CPUs.

Thanks,
Yongji
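
For reference, a rough sketch of the round-robin spreading mentioned in
the last reply (the helper and its bookkeeping are assumptions for
illustration, not the actual v3 code; it builds on the vduse_virtqueue
fields added by this series):

#include <linux/cpumask.h>

/* Pick the next CPU from the vq's irq affinity mask each time the
 * effective CPU is refreshed, wrapping around at the end of the mask.
 */
static void example_vq_round_robin_cpu(struct vduse_virtqueue *vq, int *last)
{
	int cpu = cpumask_next(*last, &vq->irq_affinity);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(&vq->irq_affinity);

	*last = cpu;
	vq->irq_effective_cpu = cpu;
}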
Add an enable_irq_wq sysfs interface to control whether
a workqueue is used to inject the irq or not. The
vhost-vdpa case can benefit from it.
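
As a usage sketch (the /sys/class/vduse/<device>/vqN/ path and the device
name "vduse0" are assumptions), a userspace helper could switch vq0 to
direct, non-workqueue irq injection like this; the store parses the value
with kstrtobool(), so "0"/"1" work.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Toggle workqueue-based irq injection for one vq; returns 0 on success. */
static int set_enable_irq_wq(const char *vq_dir, int enable)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/enable_irq_wq", vq_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, enable ? "1" : "0", 1) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* Assumed sysfs layout: /sys/class/vduse/<device>/vqN */
	return set_enable_irq_wq("/sys/class/vduse/vduse0/vq0", 0);
}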
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 50 +++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index c65f84100e30..ed06c7afd484 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -62,6 +62,7 @@ struct vduse_virtqueue {
struct cpumask irq_affinity;
spinlock_t irq_affinity_lock;
struct kobject kobj;
+ bool enable_irq_wq;
};
struct vduse_dev;
@@ -1013,6 +1014,26 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
return ret;
}
+static int vduse_dev_inject_vq_irq(struct vduse_dev *dev,
+ struct vduse_virtqueue *vq)
+{
+ int ret = -EINVAL;
+
+ down_read(&dev->rwsem);
+ if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+ goto unlock;
+
+ ret = 0;
+ spin_lock_irq(&vq->irq_lock);
+ if (vq->ready && vq->cb.callback)
+ vq->cb.callback(vq->cb.private);
+ spin_unlock_irq(&vq->irq_lock);
+unlock:
+ up_read(&dev->rwsem);
+
+ return ret;
+}
+
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
u64 iova, u64 size)
{
@@ -1278,8 +1299,12 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(index, dev->vq_num);
- ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject,
+ if (dev->vqs[index]->enable_irq_wq)
+ ret = vduse_dev_queue_irq_work(dev,
+ &dev->vqs[index]->inject,
dev->vqs[index]->irq_effective_cpu);
+ else
+ ret = vduse_dev_inject_vq_irq(dev, dev->vqs[index]);
break;
}
case VDUSE_IOTLB_REG_UMEM: {
@@ -1420,6 +1445,26 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
};
+static ssize_t enable_irq_wq_show(struct vduse_virtqueue *vq, char *buf)
+{
+ return sprintf(buf, "%d\n", vq->enable_irq_wq);
+}
+
+static ssize_t enable_irq_wq_store(struct vduse_virtqueue *vq,
+ const char *buf, size_t count)
+{
+ bool enabled;
+ int ret;
+
+ ret = kstrtobool(buf, &enabled);
+ if (ret)
+ return ret;
+
+ vq->enable_irq_wq = enabled;
+
+ return count;
+}
+
static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
{
return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
@@ -1480,10 +1525,12 @@ struct vq_sysfs_entry {
static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
__ATTR_RW(irq_cb_effective_affinity);
+static struct vq_sysfs_entry enable_irq_wq_attr = __ATTR_RW(enable_irq_wq);
static struct attribute *vq_attrs[] = {
&irq_cb_affinity_attr.attr,
&irq_cb_effective_affinity_attr.attr,
+ &enable_irq_wq_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(vq);
@@ -1565,6 +1612,7 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
dev->vqs[i]->index = i;
dev->vqs[i]->irq_effective_cpu = -1;
+ dev->vqs[i]->enable_irq_wq = true;
INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
spin_lock_init(&dev->vqs[i]->kick_lock);
--
2.20.1
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Add an enable_irq_wq sysfs interface to control whether
> a workqueue is used to inject the irq or not. The
> vhost-vdpa case can benefit from it.

Do we have a benchmark result for this?

Or I wonder if we can extend set_vq_cb() by associating an eventfd
then VDUSE can signal that eventfd directly?

Thanks

On Fri, Dec 16, 2022 at 1:43 PM Jason Wang <jasowang@redhat.com> wrote:
>
> Do we have a benchmark result for this?

It can reduce 2~3 us latency in our sync I/O test.

> Or I wonder if we can extend set_vq_cb() by associating an eventfd
> then VDUSE can signal that eventfd directly?

It looks good to me. I can try this way in v3.

Thanks,
Yongji
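
A rough sketch of the direction discussed above (not code from this
series): if the callback passed via set_vq_cb() carried an eventfd
context, the irq inject ioctl path could kick it directly instead of
bouncing through a workqueue. The trigger field below is a hypothetical
extension of the callback structure.

#include <linux/eventfd.h>

static void example_inject_vq_irq_eventfd(struct vduse_virtqueue *vq)
{
	struct eventfd_ctx *trigger = vq->cb.trigger;	/* hypothetical field */

	if (trigger)
		eventfd_signal(trigger, 1);
}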
Delay creating iova domain until the vduse device is binded
to vdpa bus.
This is a preparation for adding a sysfs interface to
support specifying the bounce buffer size for the iova
domain.
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 75 +++++++++++++++++++++---------
1 file changed, 53 insertions(+), 22 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index ed06c7afd484..bd1ba6c33e09 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -112,6 +112,8 @@ struct vduse_dev {
u32 vq_align;
struct vduse_umem *umem;
struct mutex mem_lock;
+ unsigned int bounce_size;
+ struct mutex domain_lock;
};
struct vduse_dev_msg {
@@ -427,7 +429,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
struct vduse_iova_domain *domain = dev->domain;
/* The coherent mappings are handled in vduse_dev_free_coherent() */
- if (domain->bounce_map)
+ if (domain && domain->bounce_map)
vduse_domain_reset_bounce_map(domain);
down_write(&dev->rwsem);
@@ -1045,6 +1047,9 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev,
goto unlock;
ret = -EINVAL;
+ if (!dev->domain)
+ goto unlock;
+
if (dev->umem->iova != iova || size != dev->domain->bounce_size)
goto unlock;
@@ -1071,7 +1076,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
unsigned long npages, lock_limit;
int ret;
- if (!dev->domain->bounce_map ||
+ if (!dev->domain || !dev->domain->bounce_map ||
size != dev->domain->bounce_size ||
iova != 0 || uaddr & ~PAGE_MASK)
return -EINVAL;
@@ -1145,7 +1150,6 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
struct vduse_iotlb_entry entry;
struct vhost_iotlb_map *map;
struct vdpa_map_file *map_file;
- struct vduse_iova_domain *domain = dev->domain;
struct file *f = NULL;
ret = -EFAULT;
@@ -1156,8 +1160,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (entry.start > entry.last)
break;
- spin_lock(&domain->iotlb_lock);
- map = vhost_iotlb_itree_first(domain->iotlb,
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
entry.start, entry.last);
if (map) {
map_file = (struct vdpa_map_file *)map->opaque;
@@ -1167,7 +1176,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
entry.last = map->last;
entry.perm = map->perm;
}
- spin_unlock(&domain->iotlb_lock);
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
ret = -EINVAL;
if (!f)
break;
@@ -1319,8 +1329,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
sizeof(umem.reserved)))
break;
+ mutex_lock(&dev->domain_lock);
ret = vduse_dev_reg_umem(dev, umem.iova,
umem.uaddr, umem.size);
+ mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_DEREG_UMEM: {
@@ -1334,15 +1346,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (!is_mem_zero((const char *)umem.reserved,
sizeof(umem.reserved)))
break;
-
+ mutex_lock(&dev->domain_lock);
ret = vduse_dev_dereg_umem(dev, umem.iova,
umem.size);
+ mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_GET_INFO: {
struct vduse_iova_info info;
struct vhost_iotlb_map *map;
- struct vduse_iova_domain *domain = dev->domain;
ret = -EFAULT;
if (copy_from_user(&info, argp, sizeof(info)))
@@ -1356,18 +1368,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
sizeof(info.reserved)))
break;
- spin_lock(&domain->iotlb_lock);
- map = vhost_iotlb_itree_first(domain->iotlb,
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
info.start, info.last);
if (map) {
info.start = map->start;
info.last = map->last;
info.capability = 0;
- if (domain->bounce_map && map->start == 0 &&
- map->last == domain->bounce_size - 1)
+ if (dev->domain->bounce_map && map->start == 0 &&
+ map->last == dev->domain->bounce_size - 1)
info.capability |= VDUSE_IOVA_CAP_UMEM;
}
- spin_unlock(&domain->iotlb_lock);
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
if (!map)
break;
@@ -1390,7 +1408,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
{
struct vduse_dev *dev = file->private_data;
- vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ mutex_lock(&dev->domain_lock);
+ if (dev->domain)
+ vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ mutex_unlock(&dev->domain_lock);
spin_lock(&dev->msg_lock);
/* Make sure the inflight messages can processed after reconncection */
list_splice_init(&dev->recv_list, &dev->send_list);
@@ -1647,6 +1668,7 @@ static struct vduse_dev *vduse_dev_create(void)
mutex_init(&dev->lock);
mutex_init(&dev->mem_lock);
+ mutex_init(&dev->domain_lock);
spin_lock_init(&dev->msg_lock);
INIT_LIST_HEAD(&dev->send_list);
INIT_LIST_HEAD(&dev->recv_list);
@@ -1696,7 +1718,8 @@ static int vduse_destroy_dev(char *name)
idr_remove(&vduse_idr, dev->minor);
kvfree(dev->config);
vduse_dev_deinit_vqs(dev);
- vduse_domain_destroy(dev->domain);
+ if (dev->domain)
+ vduse_domain_destroy(dev->domain);
kfree(dev->name);
vduse_dev_destroy(dev);
module_put(THIS_MODULE);
@@ -1802,11 +1825,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
if (!dev->name)
goto err_str;
- dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
- VDUSE_BOUNCE_SIZE);
- if (!dev->domain)
- goto err_domain;
-
+ dev->bounce_size = VDUSE_BOUNCE_SIZE;
dev->config = config_buf;
dev->config_size = config->config_size;
@@ -1836,8 +1855,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
err_dev:
idr_remove(&vduse_idr, dev->minor);
err_idr:
- vduse_domain_destroy(dev->domain);
-err_domain:
kfree(dev->name);
err_str:
vduse_dev_destroy(dev);
@@ -2004,9 +2021,23 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
if (ret)
return ret;
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain)
+ dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+ dev->bounce_size);
+ mutex_unlock(&dev->domain_lock);
+ if (!dev->domain) {
+ put_device(&dev->vdev->vdpa.dev);
+ return -ENOMEM;
+ }
+
ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
if (ret) {
put_device(&dev->vdev->vdpa.dev);
+ mutex_lock(&dev->domain_lock);
+ vduse_domain_destroy(dev->domain);
+ dev->domain = NULL;
+ mutex_unlock(&dev->domain_lock);
return ret;
}
--
2.20.1
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Delay creating iova domain until the vduse device is binded
> to vdpa bus.

s/binded/registered/

Other than this.

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks
As discussed in [1], this adds a sysfs interface to
support specifying the bounce buffer size in the
virtio-vdpa case. It would be a performance tuning
parameter for high-throughput workloads.
[1] https://www.spinics.net/lists/netdev/msg754288.html
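
A usage sketch (the /sys/class/vduse/<device>/bounce_size path and the
device name "vduse0" are assumptions): the size has to be written before
the device is bound to the vdpa bus, since the store returns -EPERM once
the iova domain exists, and the value is parsed as base-10 bytes within
[VDUSE_MIN_BOUNCE_SIZE, VDUSE_MAX_BOUNCE_SIZE].

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path: /sys/class/vduse/<device>/bounce_size */
	const char *attr = "/sys/class/vduse/vduse0/bounce_size";
	const char *val = "268435456";	/* 256 MiB */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}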
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 45 +++++++++++++++++++++++++++++-
1 file changed, 44 insertions(+), 1 deletion(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index bd1ba6c33e09..0458d61cee1f 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -38,8 +38,11 @@
#define DRV_LICENSE "GPL v2"
#define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
+#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
-#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
+/* 128 MB reserved for virtqueue creation */
+#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30
struct vduse_virtqueue {
@@ -1795,8 +1798,48 @@ static ssize_t msg_timeout_store(struct device *device,
static DEVICE_ATTR_RW(msg_timeout);
+static ssize_t bounce_size_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct vduse_dev *dev = dev_get_drvdata(device);
+
+ return sysfs_emit(buf, "%u\n", dev->bounce_size);
+}
+
+static ssize_t bounce_size_store(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct vduse_dev *dev = dev_get_drvdata(device);
+ unsigned int bounce_size;
+ int ret;
+
+ ret = -EPERM;
+ mutex_lock(&dev->domain_lock);
+ if (dev->domain)
+ goto unlock;
+
+ ret = kstrtouint(buf, 10, &bounce_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = -EINVAL;
+ if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
+ bounce_size < VDUSE_MIN_BOUNCE_SIZE)
+ goto unlock;
+
+ dev->bounce_size = bounce_size & PAGE_MASK;
+ ret = count;
+unlock:
+ mutex_unlock(&dev->domain_lock);
+ return ret;
+}
+
+static DEVICE_ATTR_RW(bounce_size);
+
static struct attribute *vduse_dev_attrs[] = {
&dev_attr_msg_timeout.attr,
+ &dev_attr_bounce_size.attr,
NULL
};
--
2.20.1
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> As discussed in [1], this adds a sysfs interface to
> support specifying the bounce buffer size in the
> virtio-vdpa case. It would be a performance tuning
> parameter for high-throughput workloads.
>
> [1] https://www.spinics.net/lists/netdev/msg754288.html
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks