[v2] Use Intel DSA accelerator to offload zero page checking in multifd live migration.

[PATCH v2 08/20] util/dsa: Implement DSA task enqueue and dequeue.

Posted by Hao Xiang 1 year ago

* Use a safe thread queue for DSA task enqueue/dequeue.
* Implement DSA task submission.
* Implement DSA batch task submission.

Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
---
 include/qemu/dsa.h |  35 ++++++++
 util/dsa.c         | 196 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index 30246b507e..23f55185be 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -12,6 +12,41 @@
 #include <linux/idxd.h>
 #include "x86intrin.h"
 
+enum dsa_task_type {
+    DSA_TASK = 0,
+    DSA_BATCH_TASK
+};
+
+enum dsa_task_status {
+    DSA_TASK_READY = 0,
+    DSA_TASK_PROCESSING,
+    DSA_TASK_COMPLETION
+};
+
+typedef void (*buffer_zero_dsa_completion_fn)(void *);
+
+typedef struct buffer_zero_batch_task {
+    struct dsa_hw_desc batch_descriptor;
+    struct dsa_hw_desc *descriptors;
+    struct dsa_completion_record batch_completion __attribute__((aligned(32)));
+    struct dsa_completion_record *completions;
+    struct dsa_device_group *group;
+    struct dsa_device *device;
+    buffer_zero_dsa_completion_fn completion_callback;
+    QemuSemaphore sem_task_complete;
+    enum dsa_task_type task_type;
+    enum dsa_task_status status;
+    bool *results;
+    int batch_size;
+    QSIMPLEQ_ENTRY(buffer_zero_batch_task) entry;
+} buffer_zero_batch_task;
+
+#else
+
+struct buffer_zero_batch_task {
+    bool *results;
+};
+
 #endif
 
 /**
diff --git a/util/dsa.c b/util/dsa.c
index 8edaa892ec..f82282ce99 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -245,6 +245,200 @@ dsa_device_group_get_next_device(struct dsa_device_group *group)
     return &group->dsa_devices[current];
 }
 
+/**
+ * @brief Empties out the DSA task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ */
+static void
+dsa_empty_task_queue(struct dsa_device_group *group)
+{
+    qemu_mutex_lock(&group->task_queue_lock);
+    dsa_task_queue *task_queue = &group->task_queue;
+    while (!QSIMPLEQ_EMPTY(task_queue)) {
+        QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
+    }
+    qemu_mutex_unlock(&group->task_queue_lock);
+}
+
+/**
+ * @brief Adds a task to the DSA task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ * @param context A pointer to the DSA task to enqueue.
+ *
+ * @return int Zero if successful, otherwise a proper error code.
+ */
+static int
+dsa_task_enqueue(struct dsa_device_group *group,
+                 struct buffer_zero_batch_task *task)
+{
+    dsa_task_queue *task_queue = &group->task_queue;
+    QemuMutex *task_queue_lock = &group->task_queue_lock;
+    QemuCond *task_queue_cond = &group->task_queue_cond;
+
+    bool notify = false;
+
+    qemu_mutex_lock(task_queue_lock);
+
+    if (!group->running) {
+        fprintf(stderr, "DSA: Tried to queue task to stopped device queue\n");
+        qemu_mutex_unlock(task_queue_lock);
+        return -1;
+    }
+
+    // The queue is empty. This enqueue operation is a 0->1 transition.
+    if (QSIMPLEQ_EMPTY(task_queue))
+        notify = true;
+
+    QSIMPLEQ_INSERT_TAIL(task_queue, task, entry);
+
+    // We need to notify the waiter for 0->1 transitions.
+    if (notify)
+        qemu_cond_signal(task_queue_cond);
+
+    qemu_mutex_unlock(task_queue_lock);
+
+    return 0;
+}
+
+/**
+ * @brief Takes a DSA task out of the task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ * @return buffer_zero_batch_task* The DSA task being dequeued.
+ */
+__attribute__((unused))
+static struct buffer_zero_batch_task *
+dsa_task_dequeue(struct dsa_device_group *group)
+{
+    struct buffer_zero_batch_task *task = NULL;
+    dsa_task_queue *task_queue = &group->task_queue;
+    QemuMutex *task_queue_lock = &group->task_queue_lock;
+    QemuCond *task_queue_cond = &group->task_queue_cond;
+
+    qemu_mutex_lock(task_queue_lock);
+
+    while (true) {
+        if (!group->running)
+            goto exit;
+        task = QSIMPLEQ_FIRST(task_queue);
+        if (task != NULL) {
+            break;
+        }
+        qemu_cond_wait(task_queue_cond, task_queue_lock);
+    }
+
+    QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
+
+exit:
+    qemu_mutex_unlock(task_queue_lock);
+    return task;
+}
+
+/**
+ * @brief Submits a DSA work item to the device work queue.
+ *
+ * @param wq A pointer to the DSA work queue's device memory.
+ * @param descriptor A pointer to the DSA work item descriptor.
+ *
+ * @return Zero if successful, non-zero otherwise.
+ */
+static int
+submit_wi_int(void *wq, struct dsa_hw_desc *descriptor)
+{
+    uint64_t retry = 0;
+
+    _mm_sfence();
+
+    while (true) {
+        if (_enqcmd(wq, descriptor) == 0) {
+            break;
+        }
+        retry++;
+        if (retry > max_retry_count) {
+            fprintf(stderr, "Submit work retry %lu times.\n", retry);
+            exit(1);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Synchronously submits a DSA work item to the
+ *        device work queue.
+ *
+ * @param wq A pointer to the DSA worjk queue's device memory.
+ * @param descriptor A pointer to the DSA work item descriptor.
+ *
+ * @return int Zero if successful, non-zero otherwise.
+ */
+__attribute__((unused))
+static int
+submit_wi(void *wq, struct dsa_hw_desc *descriptor)
+{
+    return submit_wi_int(wq, descriptor);
+}
+
+/**
+ * @brief Asynchronously submits a DSA work item to the
+ *        device work queue.
+ *
+ * @param task A pointer to the buffer zero task.
+ *
+ * @return int Zero if successful, non-zero otherwise.
+ */
+__attribute__((unused))
+static int
+submit_wi_async(struct buffer_zero_batch_task *task)
+{
+    struct dsa_device_group *device_group = task->group;
+    struct dsa_device *device_instance = task->device;
+    int ret;
+
+    assert(task->task_type == DSA_TASK);
+
+    task->status = DSA_TASK_PROCESSING;
+
+    ret = submit_wi_int(device_instance->work_queue,
+                        &task->descriptors[0]);
+    if (ret != 0)
+        return ret;
+
+    return dsa_task_enqueue(device_group, task);
+}
+
+/**
+ * @brief Asynchronously submits a DSA batch work item to the
+ *        device work queue.
+ *
+ * @param batch_task A pointer to the batch buffer zero task.
+ *
+ * @return int Zero if successful, non-zero otherwise.
+ */
+__attribute__((unused))
+static int
+submit_batch_wi_async(struct buffer_zero_batch_task *batch_task)
+{
+    struct dsa_device_group *device_group = batch_task->group;
+    struct dsa_device *device_instance = batch_task->device;
+    int ret;
+
+    assert(batch_task->task_type == DSA_BATCH_TASK);
+    assert(batch_task->batch_descriptor.desc_count <= batch_task->batch_size);
+    assert(batch_task->status == DSA_TASK_READY);
+
+    batch_task->status = DSA_TASK_PROCESSING;
+
+    ret = submit_wi_int(device_instance->work_queue,
+                        &batch_task->batch_descriptor);
+    if (ret != 0)
+        return ret;
+
+    return dsa_task_enqueue(device_group, batch_task);
+}
+
 /**
  * @brief Check if DSA is running.
  *
@@ -301,6 +495,8 @@ void dsa_stop(void)
     if (!group->running) {
         return;
     }
+
+    dsa_empty_task_queue(group);
 }
 
 /**
-- 
2.30.2

Re: [PATCH v2 08/20] util/dsa: Implement DSA task enqueue and dequeue.

Posted by Fabiano Rosas 11 months, 2 weeks ago

Hao Xiang <hao.xiang@bytedance.com> writes:

> * Use a safe thread queue for DSA task enqueue/dequeue.
> * Implement DSA task submission.
> * Implement DSA batch task submission.
>
> Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> ---
>  include/qemu/dsa.h |  35 ++++++++
>  util/dsa.c         | 196 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 231 insertions(+)
>
> diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
> index 30246b507e..23f55185be 100644
> --- a/include/qemu/dsa.h
> +++ b/include/qemu/dsa.h
> @@ -12,6 +12,41 @@
>  #include <linux/idxd.h>
>  #include "x86intrin.h"
>  
> +enum dsa_task_type {

Our coding style requires CamelCase for enums and typedef'ed structures.

> +    DSA_TASK = 0,
> +    DSA_BATCH_TASK
> +};
> +
> +enum dsa_task_status {
> +    DSA_TASK_READY = 0,
> +    DSA_TASK_PROCESSING,
> +    DSA_TASK_COMPLETION
> +};
> +
> +typedef void (*buffer_zero_dsa_completion_fn)(void *);

We don't really need the "buffer_zero" mention in any of this
code. Simply dsa_batch_task or batch_task would suffice.

> +
> +typedef struct buffer_zero_batch_task {
> +    struct dsa_hw_desc batch_descriptor;
> +    struct dsa_hw_desc *descriptors;
> +    struct dsa_completion_record batch_completion __attribute__((aligned(32)));
> +    struct dsa_completion_record *completions;
> +    struct dsa_device_group *group;
> +    struct dsa_device *device;
> +    buffer_zero_dsa_completion_fn completion_callback;
> +    QemuSemaphore sem_task_complete;
> +    enum dsa_task_type task_type;
> +    enum dsa_task_status status;
> +    bool *results;
> +    int batch_size;
> +    QSIMPLEQ_ENTRY(buffer_zero_batch_task) entry;
> +} buffer_zero_batch_task;

I see data specific to this implementation and data coming from the
library, maybe these would be better organized in two separate
structures with the qemu-specific having a pointer to the generic
one. Looking ahead in the series, there seems to be migration data
coming into this as well.

> +
> +#else
> +
> +struct buffer_zero_batch_task {
> +    bool *results;
> +};
> +
>  #endif
>  
>  /**
> diff --git a/util/dsa.c b/util/dsa.c
> index 8edaa892ec..f82282ce99 100644
> --- a/util/dsa.c
> +++ b/util/dsa.c
> @@ -245,6 +245,200 @@ dsa_device_group_get_next_device(struct dsa_device_group *group)
>      return &group->dsa_devices[current];
>  }
>  
> +/**
> + * @brief Empties out the DSA task queue.
> + *
> + * @param group A pointer to the DSA device group.
> + */
> +static void
> +dsa_empty_task_queue(struct dsa_device_group *group)
> +{
> +    qemu_mutex_lock(&group->task_queue_lock);
> +    dsa_task_queue *task_queue = &group->task_queue;
> +    while (!QSIMPLEQ_EMPTY(task_queue)) {
> +        QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
> +    }
> +    qemu_mutex_unlock(&group->task_queue_lock);
> +}
> +
> +/**
> + * @brief Adds a task to the DSA task queue.
> + *
> + * @param group A pointer to the DSA device group.
> + * @param context A pointer to the DSA task to enqueue.
> + *
> + * @return int Zero if successful, otherwise a proper error code.
> + */
> +static int
> +dsa_task_enqueue(struct dsa_device_group *group,
> +                 struct buffer_zero_batch_task *task)
> +{
> +    dsa_task_queue *task_queue = &group->task_queue;
> +    QemuMutex *task_queue_lock = &group->task_queue_lock;
> +    QemuCond *task_queue_cond = &group->task_queue_cond;
> +
> +    bool notify = false;
> +
> +    qemu_mutex_lock(task_queue_lock);
> +
> +    if (!group->running) {
> +        fprintf(stderr, "DSA: Tried to queue task to stopped device queue\n");
> +        qemu_mutex_unlock(task_queue_lock);
> +        return -1;
> +    }
> +
> +    // The queue is empty. This enqueue operation is a 0->1 transition.
> +    if (QSIMPLEQ_EMPTY(task_queue))
> +        notify = true;
> +
> +    QSIMPLEQ_INSERT_TAIL(task_queue, task, entry);
> +
> +    // We need to notify the waiter for 0->1 transitions.
> +    if (notify)
> +        qemu_cond_signal(task_queue_cond);
> +
> +    qemu_mutex_unlock(task_queue_lock);
> +
> +    return 0;
> +}
> +
> +/**
> + * @brief Takes a DSA task out of the task queue.
> + *
> + * @param group A pointer to the DSA device group.
> + * @return buffer_zero_batch_task* The DSA task being dequeued.
> + */
> +__attribute__((unused))
> +static struct buffer_zero_batch_task *
> +dsa_task_dequeue(struct dsa_device_group *group)
> +{
> +    struct buffer_zero_batch_task *task = NULL;
> +    dsa_task_queue *task_queue = &group->task_queue;
> +    QemuMutex *task_queue_lock = &group->task_queue_lock;
> +    QemuCond *task_queue_cond = &group->task_queue_cond;
> +
> +    qemu_mutex_lock(task_queue_lock);
> +
> +    while (true) {
> +        if (!group->running)
> +            goto exit;
> +        task = QSIMPLEQ_FIRST(task_queue);
> +        if (task != NULL) {
> +            break;
> +        }
> +        qemu_cond_wait(task_queue_cond, task_queue_lock);
> +    }
> +
> +    QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
> +
> +exit:
> +    qemu_mutex_unlock(task_queue_lock);
> +    return task;
> +}
> +
> +/**
> + * @brief Submits a DSA work item to the device work queue.
> + *
> + * @param wq A pointer to the DSA work queue's device memory.
> + * @param descriptor A pointer to the DSA work item descriptor.
> + *
> + * @return Zero if successful, non-zero otherwise.
> + */
> +static int
> +submit_wi_int(void *wq, struct dsa_hw_desc *descriptor)
> +{
> +    uint64_t retry = 0;
> +
> +    _mm_sfence();
> +
> +    while (true) {
> +        if (_enqcmd(wq, descriptor) == 0) {
> +            break;
> +        }
> +        retry++;
> +        if (retry > max_retry_count) {

'max_retry_count' is UINT64_MAX so 'retry' will wrap around.

> +            fprintf(stderr, "Submit work retry %lu times.\n", retry);
> +            exit(1);

Is this not the case where we'd fallback to the CPU?

You should not exit() here, but return non-zero as the documentation
mentions and the callers expect.

> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * @brief Synchronously submits a DSA work item to the
> + *        device work queue.
> + *
> + * @param wq A pointer to the DSA worjk queue's device memory.
> + * @param descriptor A pointer to the DSA work item descriptor.
> + *
> + * @return int Zero if successful, non-zero otherwise.
> + */
> +__attribute__((unused))
> +static int
> +submit_wi(void *wq, struct dsa_hw_desc *descriptor)
> +{
> +    return submit_wi_int(wq, descriptor);
> +}
> +
> +/**
> + * @brief Asynchronously submits a DSA work item to the
> + *        device work queue.
> + *
> + * @param task A pointer to the buffer zero task.
> + *
> + * @return int Zero if successful, non-zero otherwise.
> + */
> +__attribute__((unused))
> +static int
> +submit_wi_async(struct buffer_zero_batch_task *task)
> +{
> +    struct dsa_device_group *device_group = task->group;
> +    struct dsa_device *device_instance = task->device;
> +    int ret;
> +
> +    assert(task->task_type == DSA_TASK);
> +
> +    task->status = DSA_TASK_PROCESSING;
> +
> +    ret = submit_wi_int(device_instance->work_queue,
> +                        &task->descriptors[0]);
> +    if (ret != 0)
> +        return ret;
> +
> +    return dsa_task_enqueue(device_group, task);
> +}
> +
> +/**
> + * @brief Asynchronously submits a DSA batch work item to the
> + *        device work queue.
> + *
> + * @param batch_task A pointer to the batch buffer zero task.
> + *
> + * @return int Zero if successful, non-zero otherwise.
> + */
> +__attribute__((unused))
> +static int
> +submit_batch_wi_async(struct buffer_zero_batch_task *batch_task)
> +{
> +    struct dsa_device_group *device_group = batch_task->group;
> +    struct dsa_device *device_instance = batch_task->device;
> +    int ret;
> +
> +    assert(batch_task->task_type == DSA_BATCH_TASK);
> +    assert(batch_task->batch_descriptor.desc_count <= batch_task->batch_size);
> +    assert(batch_task->status == DSA_TASK_READY);
> +
> +    batch_task->status = DSA_TASK_PROCESSING;
> +
> +    ret = submit_wi_int(device_instance->work_queue,
> +                        &batch_task->batch_descriptor);
> +    if (ret != 0)
> +        return ret;
> +
> +    return dsa_task_enqueue(device_group, batch_task);
> +}

At this point in the series submit_wi_async() and
submit_batch_wi_async() look the same to me without the asserts. Can't
we consolidate them?

There's also the fact that both functions receive a _batch_ task but one
is supposed to work in batches and the other is not. That could be
solved by renaming the structure I guess.

> +
>  /**
>   * @brief Check if DSA is running.
>   *
> @@ -301,6 +495,8 @@ void dsa_stop(void)
>      if (!group->running) {
>          return;
>      }
> +
> +    dsa_empty_task_queue(group);
>  }
>  
>  /**

Re: [External] Re: [PATCH v2 08/20] util/dsa: Implement DSA task enqueue and dequeue.

Posted by Hao Xiang 11 months ago

On Tue, Dec 12, 2023 at 8:10 AM Fabiano Rosas <farosas@suse.de> wrote:
>
> Hao Xiang <hao.xiang@bytedance.com> writes:
>
> > * Use a safe thread queue for DSA task enqueue/dequeue.
> > * Implement DSA task submission.
> > * Implement DSA batch task submission.
> >
> > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> > ---
> >  include/qemu/dsa.h |  35 ++++++++
> >  util/dsa.c         | 196 +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 231 insertions(+)
> >
> > diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
> > index 30246b507e..23f55185be 100644
> > --- a/include/qemu/dsa.h
> > +++ b/include/qemu/dsa.h
> > @@ -12,6 +12,41 @@
> >  #include <linux/idxd.h>
> >  #include "x86intrin.h"
> >
> > +enum dsa_task_type {
>
> Our coding style requires CamelCase for enums and typedef'ed structures.

When I wrote this, I found numerous instances where snake case and no
typedef enum are used. But I do see the camel case and typedef'ed
instances now. Converted to that.

>
> > +    DSA_TASK = 0,
> > +    DSA_BATCH_TASK
> > +};
> > +
> > +enum dsa_task_status {
> > +    DSA_TASK_READY = 0,
> > +    DSA_TASK_PROCESSING,
> > +    DSA_TASK_COMPLETION
> > +};
> > +
> > +typedef void (*buffer_zero_dsa_completion_fn)(void *);
>
> We don't really need the "buffer_zero" mention in any of this
> code. Simply dsa_batch_task or batch_task would suffice.

I removed "buffer_zero" prefix in some of the places.

>
> > +
> > +typedef struct buffer_zero_batch_task {
> > +    struct dsa_hw_desc batch_descriptor;
> > +    struct dsa_hw_desc *descriptors;
> > +    struct dsa_completion_record batch_completion __attribute__((aligned(32)));
> > +    struct dsa_completion_record *completions;
> > +    struct dsa_device_group *group;
> > +    struct dsa_device *device;
> > +    buffer_zero_dsa_completion_fn completion_callback;
> > +    QemuSemaphore sem_task_complete;
> > +    enum dsa_task_type task_type;
> > +    enum dsa_task_status status;
> > +    bool *results;
> > +    int batch_size;
> > +    QSIMPLEQ_ENTRY(buffer_zero_batch_task) entry;
> > +} buffer_zero_batch_task;
>
> I see data specific to this implementation and data coming from the
> library, maybe these would be better organized in two separate
> structures with the qemu-specific having a pointer to the generic
> one. Looking ahead in the series, there seems to be migration data
> coming into this as well.

I refactored to create a generic structure batch_task and a DSA
specific version dsa_batch_task. batch_task has a pointer to
dsa_batch_task if DSA compilation option is enabled.

>
> > +
> > +#else
> > +
> > +struct buffer_zero_batch_task {
> > +    bool *results;
> > +};
> > +
> >  #endif
> >
> >  /**
> > diff --git a/util/dsa.c b/util/dsa.c
> > index 8edaa892ec..f82282ce99 100644
> > --- a/util/dsa.c
> > +++ b/util/dsa.c
> > @@ -245,6 +245,200 @@ dsa_device_group_get_next_device(struct dsa_device_group *group)
> >      return &group->dsa_devices[current];
> >  }
> >
> > +/**
> > + * @brief Empties out the DSA task queue.
> > + *
> > + * @param group A pointer to the DSA device group.
> > + */
> > +static void
> > +dsa_empty_task_queue(struct dsa_device_group *group)
> > +{
> > +    qemu_mutex_lock(&group->task_queue_lock);
> > +    dsa_task_queue *task_queue = &group->task_queue;
> > +    while (!QSIMPLEQ_EMPTY(task_queue)) {
> > +        QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
> > +    }
> > +    qemu_mutex_unlock(&group->task_queue_lock);
> > +}
> > +
> > +/**
> > + * @brief Adds a task to the DSA task queue.
> > + *
> > + * @param group A pointer to the DSA device group.
> > + * @param context A pointer to the DSA task to enqueue.
> > + *
> > + * @return int Zero if successful, otherwise a proper error code.
> > + */
> > +static int
> > +dsa_task_enqueue(struct dsa_device_group *group,
> > +                 struct buffer_zero_batch_task *task)
> > +{
> > +    dsa_task_queue *task_queue = &group->task_queue;
> > +    QemuMutex *task_queue_lock = &group->task_queue_lock;
> > +    QemuCond *task_queue_cond = &group->task_queue_cond;
> > +
> > +    bool notify = false;
> > +
> > +    qemu_mutex_lock(task_queue_lock);
> > +
> > +    if (!group->running) {
> > +        fprintf(stderr, "DSA: Tried to queue task to stopped device queue\n");
> > +        qemu_mutex_unlock(task_queue_lock);
> > +        return -1;
> > +    }
> > +
> > +    // The queue is empty. This enqueue operation is a 0->1 transition.
> > +    if (QSIMPLEQ_EMPTY(task_queue))
> > +        notify = true;
> > +
> > +    QSIMPLEQ_INSERT_TAIL(task_queue, task, entry);
> > +
> > +    // We need to notify the waiter for 0->1 transitions.
> > +    if (notify)
> > +        qemu_cond_signal(task_queue_cond);
> > +
> > +    qemu_mutex_unlock(task_queue_lock);
> > +
> > +    return 0;
> > +}
> > +
> > +/**
> > + * @brief Takes a DSA task out of the task queue.
> > + *
> > + * @param group A pointer to the DSA device group.
> > + * @return buffer_zero_batch_task* The DSA task being dequeued.
> > + */
> > +__attribute__((unused))
> > +static struct buffer_zero_batch_task *
> > +dsa_task_dequeue(struct dsa_device_group *group)
> > +{
> > +    struct buffer_zero_batch_task *task = NULL;
> > +    dsa_task_queue *task_queue = &group->task_queue;
> > +    QemuMutex *task_queue_lock = &group->task_queue_lock;
> > +    QemuCond *task_queue_cond = &group->task_queue_cond;
> > +
> > +    qemu_mutex_lock(task_queue_lock);
> > +
> > +    while (true) {
> > +        if (!group->running)
> > +            goto exit;
> > +        task = QSIMPLEQ_FIRST(task_queue);
> > +        if (task != NULL) {
> > +            break;
> > +        }
> > +        qemu_cond_wait(task_queue_cond, task_queue_lock);
> > +    }
> > +
> > +    QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
> > +
> > +exit:
> > +    qemu_mutex_unlock(task_queue_lock);
> > +    return task;
> > +}
> > +
> > +/**
> > + * @brief Submits a DSA work item to the device work queue.
> > + *
> > + * @param wq A pointer to the DSA work queue's device memory.
> > + * @param descriptor A pointer to the DSA work item descriptor.
> > + *
> > + * @return Zero if successful, non-zero otherwise.
> > + */
> > +static int
> > +submit_wi_int(void *wq, struct dsa_hw_desc *descriptor)
> > +{
> > +    uint64_t retry = 0;
> > +
> > +    _mm_sfence();
> > +
> > +    while (true) {
> > +        if (_enqcmd(wq, descriptor) == 0) {
> > +            break;
> > +        }
> > +        retry++;
> > +        if (retry > max_retry_count) {
>
> 'max_retry_count' is UINT64_MAX so 'retry' will wrap around.
>
> > +            fprintf(stderr, "Submit work retry %lu times.\n", retry);
> > +            exit(1);
>
> Is this not the case where we'd fallback to the CPU?

"retry" here means _enqcmd returned a failure because the shared DSA
queue is full. When we run out of retry counts, we definitely have a
bug that prevents any DSA task from completing. So this situation is
really not expected and we don't want to fallback to use CPU.

>
> You should not exit() here, but return non-zero as the documentation
> mentions and the callers expect.

 I will propagate this error all the way up to multifd_send_thread.

>
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +/**
> > + * @brief Synchronously submits a DSA work item to the
> > + *        device work queue.
> > + *
> > + * @param wq A pointer to the DSA worjk queue's device memory.
> > + * @param descriptor A pointer to the DSA work item descriptor.
> > + *
> > + * @return int Zero if successful, non-zero otherwise.
> > + */
> > +__attribute__((unused))
> > +static int
> > +submit_wi(void *wq, struct dsa_hw_desc *descriptor)
> > +{
> > +    return submit_wi_int(wq, descriptor);
> > +}
> > +
> > +/**
> > + * @brief Asynchronously submits a DSA work item to the
> > + *        device work queue.
> > + *
> > + * @param task A pointer to the buffer zero task.
> > + *
> > + * @return int Zero if successful, non-zero otherwise.
> > + */
> > +__attribute__((unused))
> > +static int
> > +submit_wi_async(struct buffer_zero_batch_task *task)
> > +{
> > +    struct dsa_device_group *device_group = task->group;
> > +    struct dsa_device *device_instance = task->device;
> > +    int ret;
> > +
> > +    assert(task->task_type == DSA_TASK);
> > +
> > +    task->status = DSA_TASK_PROCESSING;
> > +
> > +    ret = submit_wi_int(device_instance->work_queue,
> > +                        &task->descriptors[0]);
> > +    if (ret != 0)
> > +        return ret;
> > +
> > +    return dsa_task_enqueue(device_group, task);
> > +}
> > +
> > +/**
> > + * @brief Asynchronously submits a DSA batch work item to the
> > + *        device work queue.
> > + *
> > + * @param batch_task A pointer to the batch buffer zero task.
> > + *
> > + * @return int Zero if successful, non-zero otherwise.
> > + */
> > +__attribute__((unused))
> > +static int
> > +submit_batch_wi_async(struct buffer_zero_batch_task *batch_task)
> > +{
> > +    struct dsa_device_group *device_group = batch_task->group;
> > +    struct dsa_device *device_instance = batch_task->device;
> > +    int ret;
> > +
> > +    assert(batch_task->task_type == DSA_BATCH_TASK);
> > +    assert(batch_task->batch_descriptor.desc_count <= batch_task->batch_size);
> > +    assert(batch_task->status == DSA_TASK_READY);
> > +
> > +    batch_task->status = DSA_TASK_PROCESSING;
> > +
> > +    ret = submit_wi_int(device_instance->work_queue,
> > +                        &batch_task->batch_descriptor);
> > +    if (ret != 0)
> > +        return ret;
> > +
> > +    return dsa_task_enqueue(device_group, batch_task);
> > +}
>
> At this point in the series submit_wi_async() and
> submit_batch_wi_async() look the same to me without the asserts. Can't
> we consolidate them?
>
> There's also the fact that both functions receive a _batch_ task but one
> is supposed to work in batches and the other is not. That could be
> solved by renaming the structure I guess.

So we do need to have two functions to handle a single task and a
batch task respectively. This is due to how DSA is designed at the
lower level. When we submit a task to DSA hardware, the task
description can be an individual task or a batch task containing a
pointer to an array of individual tasks. The workflow tries to
aggregate a lot of individual tasks and put them into a batch task.
However, there are times when only 1 task is available but DSA doesn't
accept a batch task description with only 1 individual task in it so
we always need a path to submit an individual task. I used to have two
data structures representing an individual task and a batch task but I
converged them into the batch task right now. The two functions are
just using different fields of the same structure to process
individual task vs batch task. submit_wi_async and
submit_batch_wi_async are different on the actual descriptor passed
into the submit_wi_int call. Yes, the two functions look similar but
they are not completely the same and because its implementation is so
simple it doesn't worth adding a unified helper layer to have them
both calling that helper layer. I went back and forth between the
current implementation and the solution you suggested but ended up
using the current implementation. But let me know if you still prefer
a converged helper function.

>
> > +
> >  /**
> >   * @brief Check if DSA is running.
> >   *
> > @@ -301,6 +495,8 @@ void dsa_stop(void)
> >      if (!group->running) {
> >          return;
> >      }
> > +
> > +    dsa_empty_task_queue(group);
> >  }
> >
> >  /**