Add interfaces for user application to submit command and wait for its
completion.
Co-developed-by: Min Ma <min.ma@amd.com>
Signed-off-by: Min Ma <min.ma@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
drivers/accel/amdxdna/aie2_ctx.c | 624 +++++++++++++++++-
drivers/accel/amdxdna/aie2_message.c | 343 ++++++++++
drivers/accel/amdxdna/aie2_pci.c | 6 +
drivers/accel/amdxdna/aie2_pci.h | 35 +
drivers/accel/amdxdna/aie2_psp.c | 2 +
drivers/accel/amdxdna/aie2_smu.c | 2 +
drivers/accel/amdxdna/amdxdna_ctx.c | 375 ++++++++++-
drivers/accel/amdxdna/amdxdna_ctx.h | 110 +++
drivers/accel/amdxdna/amdxdna_gem.c | 1 +
.../accel/amdxdna/amdxdna_mailbox_helper.c | 5 +
drivers/accel/amdxdna/amdxdna_pci_drv.c | 6 +
drivers/accel/amdxdna/amdxdna_pci_drv.h | 5 +
drivers/accel/amdxdna/amdxdna_sysfs.c | 5 +
drivers/accel/amdxdna/npu1_regs.c | 1 +
drivers/accel/amdxdna/npu2_regs.c | 1 +
drivers/accel/amdxdna/npu4_regs.c | 1 +
drivers/accel/amdxdna/npu5_regs.c | 1 +
include/trace/events/amdxdna.h | 41 ++
include/uapi/drm/amdxdna_accel.h | 59 ++
19 files changed, 1614 insertions(+), 9 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 617fc05077d9..f9010a902c99 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -8,8 +8,11 @@
#include <drm/drm_gem.h>
#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_print.h>
+#include <linux/hmm.h>
#include <linux/types.h>
+#include <trace/events/amdxdna.h>
+#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "aie2_solver.h"
#include "amdxdna_ctx.h"
@@ -17,6 +20,361 @@
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"
+bool force_cmdlist;
+module_param(force_cmdlist, bool, 0600);
+MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
+
+#define HWCTX_MAX_TIMEOUT 60000 /* miliseconds */
+
+static int
+aie2_hwctx_add_job(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job)
+{
+ struct amdxdna_sched_job *other;
+ int idx;
+
+ idx = get_job_idx(hwctx->priv->seq);
+ /* When pending list full, hwctx->seq points to oldest fence */
+ other = hwctx->priv->pending[idx];
+ if (other && other->fence)
+ return -EAGAIN;
+
+ if (other) {
+ dma_fence_put(other->out_fence);
+ amdxdna_job_put(other);
+ }
+
+ hwctx->priv->pending[idx] = job;
+ job->seq = hwctx->priv->seq++;
+ kref_get(&job->refcnt);
+
+ return 0;
+}
+
+static struct amdxdna_sched_job *
+aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
+{
+ int idx;
+
+ /* Special sequence number for oldest fence if exist */
+ if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
+ idx = get_job_idx(hwctx->priv->seq);
+ goto out;
+ }
+
+ if (seq >= hwctx->priv->seq)
+ return ERR_PTR(-EINVAL);
+
+ if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
+ return NULL;
+
+ idx = get_job_idx(seq);
+
+out:
+ return hwctx->priv->pending[idx];
+}
+
+/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
+static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
+ struct drm_sched_job *bad_job)
+{
+ drm_sched_stop(&hwctx->priv->sched, bad_job);
+ aie2_destroy_context(xdna->dev_handle, hwctx);
+}
+
+static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
+{
+ struct amdxdna_gem_obj *heap = hwctx->priv->heap;
+ int ret;
+
+ ret = aie2_create_context(xdna->dev_handle, hwctx);
+ if (ret) {
+ XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
+ goto out;
+ }
+
+ ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+ heap->mem.userptr, heap->mem.size);
+ if (ret) {
+ XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
+ goto out;
+ }
+
+ if (hwctx->status != HWCTX_STAT_READY) {
+ XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
+ goto out;
+ }
+
+ ret = aie2_config_cu(hwctx);
+ if (ret) {
+ XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
+ goto out;
+ }
+
+out:
+ drm_sched_start(&hwctx->priv->sched);
+ XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
+ return ret;
+}
+
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
+{
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_hwctx *hwctx;
+ int next = 0;
+
+ drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+ mutex_lock(&client->hwctx_lock);
+ idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+ /* check if the HW context uses the error column */
+ if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
+ continue;
+
+ aie2_hwctx_stop(xdna, hwctx, NULL);
+ hwctx->old_status = hwctx->status;
+ hwctx->status = HWCTX_STAT_STOP;
+ XDNA_DBG(xdna, "Stop %s", hwctx->name);
+ }
+ mutex_unlock(&client->hwctx_lock);
+}
+
+void aie2_restart_ctx(struct amdxdna_client *client)
+{
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_hwctx *hwctx;
+ int next = 0;
+
+ drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+ mutex_lock(&client->hwctx_lock);
+ idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+ if (hwctx->status != HWCTX_STAT_STOP)
+ continue;
+
+ hwctx->status = hwctx->old_status;
+ XDNA_DBG(xdna, "Resetting %s", hwctx->name);
+ aie2_hwctx_restart(xdna, hwctx);
+ }
+ mutex_unlock(&client->hwctx_lock);
+}
+
+static int aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx)
+{
+ struct amdxdna_sched_job *job;
+
+ mutex_lock(&hwctx->priv->io_lock);
+ if (!hwctx->priv->seq) {
+ mutex_unlock(&hwctx->priv->io_lock);
+ return 0;
+ }
+
+ job = aie2_hwctx_get_job(hwctx, hwctx->priv->seq - 1);
+ if (IS_ERR_OR_NULL(job)) {
+ mutex_unlock(&hwctx->priv->io_lock);
+ XDNA_WARN(hwctx->client->xdna, "Corrupted pending list");
+ return 0;
+ }
+ mutex_unlock(&hwctx->priv->io_lock);
+
+ wait_event(hwctx->priv->job_free_wq, !job->fence);
+
+ return 0;
+}
+
+static void
+aie2_sched_notify(struct amdxdna_sched_job *job)
+{
+ struct dma_fence *fence = job->fence;
+
+ job->hwctx->priv->completed++;
+ dma_fence_signal(fence);
+ trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
+ dma_fence_put(fence);
+ mmput(job->mm);
+ amdxdna_job_put(job);
+}
+
+static int
+aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
+{
+ struct amdxdna_sched_job *job = handle;
+ struct amdxdna_gem_obj *cmd_abo;
+ u32 ret = 0;
+ u32 status;
+
+ cmd_abo = job->cmd_bo;
+
+ if (unlikely(!data))
+ goto out;
+
+ if (unlikely(size != sizeof(u32))) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ status = *data;
+ XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+ if (status == AIE2_STATUS_SUCCESS)
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
+ else
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
+
+out:
+ aie2_sched_notify(job);
+ return ret;
+}
+
+static int
+aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
+{
+ struct amdxdna_sched_job *job = handle;
+ u32 ret = 0;
+ u32 status;
+
+ if (unlikely(!data))
+ goto out;
+
+ if (unlikely(size != sizeof(u32))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ status = *data;
+ XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+
+out:
+ aie2_sched_notify(job);
+ return ret;
+}
+
+static int
+aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size)
+{
+ struct amdxdna_sched_job *job = handle;
+ struct amdxdna_gem_obj *cmd_abo;
+ struct cmd_chain_resp *resp;
+ struct amdxdna_dev *xdna;
+ u32 fail_cmd_status;
+ u32 fail_cmd_idx;
+ u32 ret = 0;
+
+ cmd_abo = job->cmd_bo;
+ if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ resp = (struct cmd_chain_resp *)data;
+ xdna = job->hwctx->client->xdna;
+ XDNA_DBG(xdna, "Status 0x%x", resp->status);
+ if (resp->status == AIE2_STATUS_SUCCESS) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
+ goto out;
+ }
+
+ /* Slow path to handle error, read from ringbuf on BAR */
+ fail_cmd_idx = resp->fail_cmd_idx;
+ fail_cmd_status = resp->fail_cmd_status;
+ XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x",
+ fail_cmd_idx, fail_cmd_status);
+
+ if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ ret = -EINVAL;
+ goto out;
+ }
+ amdxdna_cmd_set_state(cmd_abo, fail_cmd_status);
+
+ if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
+ struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
+
+ cc->error_index = fail_cmd_idx;
+ if (cc->error_index >= cc->command_count)
+ cc->error_index = 0;
+ }
+out:
+ aie2_sched_notify(job);
+ return ret;
+}
+
+static struct dma_fence *
+aie2_sched_job_run(struct drm_sched_job *sched_job)
+{
+ struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ struct amdxdna_hwctx *hwctx = job->hwctx;
+ struct dma_fence *fence;
+ int ret;
+
+ if (!mmget_not_zero(job->mm))
+ return ERR_PTR(-ESRCH);
+
+ kref_get(&job->refcnt);
+ fence = dma_fence_get(job->fence);
+
+ if (unlikely(!cmd_abo)) {
+ ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler);
+ goto out;
+ }
+
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW);
+
+ if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN)
+ ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
+ else if (force_cmdlist)
+ ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
+ else
+ ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler);
+
+out:
+ if (ret) {
+ dma_fence_put(job->fence);
+ amdxdna_job_put(job);
+ mmput(job->mm);
+ fence = ERR_PTR(ret);
+ }
+ trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
+
+ return fence;
+}
+
+static void aie2_sched_job_free(struct drm_sched_job *sched_job)
+{
+ struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+ struct amdxdna_hwctx *hwctx = job->hwctx;
+
+ trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
+ drm_sched_job_cleanup(sched_job);
+ job->fence = NULL;
+ amdxdna_job_put(job);
+
+ wake_up(&hwctx->priv->job_free_wq);
+}
+
+static enum drm_gpu_sched_stat
+aie2_sched_job_timedout(struct drm_sched_job *sched_job)
+{
+ struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+ struct amdxdna_hwctx *hwctx = job->hwctx;
+ struct amdxdna_dev *xdna;
+
+ xdna = hwctx->client->xdna;
+ trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
+ mutex_lock(&xdna->dev_lock);
+ aie2_hwctx_stop(xdna, hwctx, sched_job);
+
+ aie2_hwctx_restart(xdna, hwctx);
+ mutex_unlock(&xdna->dev_lock);
+
+ return DRM_GPU_SCHED_STAT_NOMINAL;
+}
+
+const struct drm_sched_backend_ops sched_ops = {
+ .run_job = aie2_sched_job_run,
+ .free_job = aie2_sched_job_free,
+ .timedout_job = aie2_sched_job_timedout,
+};
+
static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx)
{
struct amdxdna_dev *xdna = hwctx->client->xdna;
@@ -130,9 +488,10 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
{
struct amdxdna_client *client = hwctx->client;
struct amdxdna_dev *xdna = client->xdna;
+ struct drm_gpu_scheduler *sched;
struct amdxdna_hwctx_priv *priv;
struct amdxdna_gem_obj *heap;
- int ret;
+ int i, ret;
priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
if (!priv)
@@ -157,10 +516,48 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
goto put_heap;
}
+ for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
+ struct amdxdna_gem_obj *abo;
+ struct amdxdna_drm_create_bo args = {
+ .flags = 0,
+ .type = AMDXDNA_BO_DEV,
+ .vaddr = 0,
+ .size = MAX_CHAIN_CMDBUF_SIZE,
+ };
+
+ abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, client->filp, true);
+ if (IS_ERR(abo)) {
+ ret = PTR_ERR(abo);
+ goto free_cmd_bufs;
+ }
+
+ XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx",
+ i, abo->mem.dev_addr, abo->mem.size);
+ priv->cmd_buf[i] = abo;
+ }
+
+ sched = &priv->sched;
+ mutex_init(&priv->io_lock);
+ ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
+ HWCTX_MAX_CMDS, 0, msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
+ NULL, NULL, hwctx->name, xdna->ddev.dev);
+ if (ret) {
+ XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
+ goto free_cmd_bufs;
+ }
+
+ ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
+ &sched, 1, NULL);
+ if (ret) {
+ XDNA_ERR(xdna, "Failed to initial sched entiry. ret %d", ret);
+ goto free_sched;
+ }
+ init_waitqueue_head(&priv->job_free_wq);
+
ret = aie2_hwctx_col_list(hwctx);
if (ret) {
XDNA_ERR(xdna, "Create col list failed, ret %d", ret);
- goto unpin;
+ goto free_entity;
}
ret = aie2_alloc_resource(hwctx);
@@ -185,7 +582,16 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
aie2_release_resource(hwctx);
free_col_list:
kfree(hwctx->col_list);
-unpin:
+free_entity:
+ drm_sched_entity_destroy(&priv->entity);
+free_sched:
+ drm_sched_fini(&priv->sched);
+free_cmd_bufs:
+ for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
+ if (!priv->cmd_buf[i])
+ continue;
+ drm_gem_object_put(to_gobj(priv->cmd_buf[i]));
+ }
amdxdna_gem_unpin(heap);
put_heap:
drm_gem_object_put(to_gobj(heap));
@@ -196,11 +602,43 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
{
+ struct amdxdna_sched_job *job;
+ struct amdxdna_dev *xdna;
+ int idx;
+
+ xdna = hwctx->client->xdna;
+ drm_sched_wqueue_stop(&hwctx->priv->sched);
+
+ /* Now, scheduler will not send command to device. */
aie2_release_resource(hwctx);
+ /*
+ * All submitted commands are aborted.
+ * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run()
+ * will return NODEV if it is called.
+ */
+ drm_sched_wqueue_start(&hwctx->priv->sched);
+
+ aie2_hwctx_wait_for_idle(hwctx);
+ drm_sched_entity_destroy(&hwctx->priv->entity);
+ drm_sched_fini(&hwctx->priv->sched);
+
+ for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
+ job = hwctx->priv->pending[idx];
+ if (!job)
+ continue;
+
+ dma_fence_put(job->out_fence);
+ amdxdna_job_put(job);
+ }
+ XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq);
+
+ for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++)
+ drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx]));
amdxdna_gem_unpin(hwctx->priv->heap);
drm_gem_object_put(to_gobj(hwctx->priv->heap));
+ mutex_destroy(&hwctx->priv->io_lock);
kfree(hwctx->col_list);
kfree(hwctx->priv);
kfree(hwctx->cus);
@@ -267,3 +705,183 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu
return -EOPNOTSUPP;
}
}
+
+static int aie2_populate_range(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
+ struct mm_struct *mm = abo->mem.notifier.mm;
+ struct hmm_range range = { 0 };
+ unsigned long timeout;
+ int ret;
+
+ XDNA_INFO_ONCE(xdna, "populate memory range %llx size %lx",
+ abo->mem.userptr, abo->mem.size);
+ range.notifier = &abo->mem.notifier;
+ range.start = abo->mem.userptr;
+ range.end = abo->mem.userptr + abo->mem.size;
+ range.hmm_pfns = abo->mem.pfns;
+ range.default_flags = HMM_PFN_REQ_FAULT;
+
+ if (!mmget_not_zero(mm))
+ return -EFAULT;
+
+ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+again:
+ range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier);
+ mmap_read_lock(mm);
+ ret = hmm_range_fault(&range);
+ mmap_read_unlock(mm);
+ if (ret) {
+ if (time_after(jiffies, timeout)) {
+ ret = -ETIME;
+ goto put_mm;
+ }
+
+ if (ret == -EBUSY)
+ goto again;
+
+ goto put_mm;
+ }
+
+ dma_resv_lock(to_gobj(abo)->resv, NULL);
+ if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) {
+ dma_resv_unlock(to_gobj(abo)->resv);
+ goto again;
+ }
+ abo->mem.map_invalid = false;
+ dma_resv_unlock(to_gobj(abo)->resv);
+
+put_mm:
+ mmput(mm);
+ return ret;
+}
+
+int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
+{
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct ww_acquire_ctx acquire_ctx;
+ struct amdxdna_gem_obj *abo;
+ unsigned long timeout = 0;
+ int ret, i;
+
+ ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx);
+ if (ret) {
+ XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
+ return ret;
+ }
+
+ drm_sched_job_arm(&job->base);
+ job->out_fence = dma_fence_get(&job->base.s_fence->finished);
+
+retry:
+ ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+ if (ret) {
+ XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret);
+ goto put_fence;
+ }
+
+ for (i = 0; i < job->bo_cnt; i++) {
+ abo = to_xdna_obj(job->bos[i]);
+ if (abo->mem.map_invalid) {
+ drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+ if (!timeout) {
+ timeout = jiffies +
+ msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ } else if (time_after(jiffies, timeout)) {
+ ret = -ETIME;
+ goto put_fence;
+ }
+
+ ret = aie2_populate_range(abo);
+ if (ret)
+ goto put_fence;
+ goto retry;
+ }
+
+ ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
+ if (ret) {
+ XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
+ drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+ goto put_fence;
+ }
+ }
+
+ for (i = 0; i < job->bo_cnt; i++)
+ dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
+ drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+
+ mutex_lock(&hwctx->priv->io_lock);
+ ret = aie2_hwctx_add_job(hwctx, job);
+ if (ret) {
+ mutex_unlock(&hwctx->priv->io_lock);
+ goto signal_fence;
+ }
+
+ *seq = job->seq;
+ drm_sched_entity_push_job(&job->base);
+ mutex_unlock(&hwctx->priv->io_lock);
+
+ return 0;
+
+signal_fence:
+ dma_fence_signal(job->out_fence);
+put_fence:
+ dma_fence_put(job->out_fence);
+ drm_sched_job_cleanup(&job->base);
+ return ret;
+}
+
+int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
+{
+ signed long remaining = MAX_SCHEDULE_TIMEOUT;
+ struct amdxdna_sched_job *job;
+ struct dma_fence *out_fence;
+ long ret;
+
+ mutex_lock(&hwctx->priv->io_lock);
+ job = aie2_hwctx_get_job(hwctx, seq);
+ if (IS_ERR(job)) {
+ mutex_unlock(&hwctx->priv->io_lock);
+ ret = PTR_ERR(job);
+ goto out;
+ }
+
+ if (unlikely(!job)) {
+ mutex_unlock(&hwctx->priv->io_lock);
+ ret = 0;
+ goto out;
+ }
+ out_fence = dma_fence_get(job->out_fence);
+ mutex_unlock(&hwctx->priv->io_lock);
+
+ if (timeout)
+ remaining = msecs_to_jiffies(timeout);
+
+ ret = dma_fence_wait_timeout(out_fence, true, remaining);
+ if (!ret)
+ ret = -ETIME;
+ else if (ret > 0)
+ ret = 0;
+
+ dma_fence_put(out_fence);
+out:
+ return ret;
+}
+
+void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
+ unsigned long cur_seq)
+{
+ struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
+ struct drm_gem_object *gobj = to_gobj(abo);
+ long ret;
+
+ dma_resv_lock(gobj->resv, NULL);
+ abo->mem.map_invalid = true;
+ mmu_interval_set_seq(&abo->mem.notifier, cur_seq);
+ ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP,
+ true, MAX_SCHEDULE_TIMEOUT);
+ dma_resv_unlock(gobj->resv);
+
+ if (!ret || ret == -ERESTARTSYS)
+ XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret);
+}
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 28bd0560db61..3dc4a9a8571e 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -4,10 +4,12 @@
*/
#include <drm/amdxdna_accel.h>
+#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/types.h>
@@ -361,3 +363,344 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx)
msg.opcode, resp.status, ret);
return ret;
}
+
+int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t))
+{
+ struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ union {
+ struct execute_buffer_req ebuf;
+ struct exec_dpu_req dpu;
+ } req;
+ struct xdna_mailbox_msg msg;
+ u32 payload_len;
+ void *payload;
+ int cu_idx;
+ int ret;
+ u32 op;
+
+ if (!chann)
+ return -ENODEV;
+
+ payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
+ if (!payload) {
+ XDNA_ERR(xdna, "Invalid command, cannot get payload");
+ return -EINVAL;
+ }
+
+ cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo);
+ if (cu_idx < 0) {
+ XDNA_DBG(xdna, "Invalid cu idx");
+ return -EINVAL;
+ }
+
+ op = amdxdna_cmd_get_op(cmd_abo);
+ switch (op) {
+ case ERT_START_CU:
+ if (unlikely(payload_len > sizeof(req.ebuf.payload)))
+ XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len);
+ req.ebuf.cu_idx = cu_idx;
+ memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload));
+ msg.send_size = sizeof(req.ebuf);
+ msg.opcode = MSG_OP_EXECUTE_BUFFER_CF;
+ break;
+ case ERT_START_NPU: {
+ struct amdxdna_cmd_start_npu *sn = payload;
+
+ if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload)))
+ XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len);
+ req.dpu.inst_buf_addr = sn->buffer;
+ req.dpu.inst_size = sn->buffer_size;
+ req.dpu.inst_prop_cnt = sn->prop_count;
+ req.dpu.cu_idx = cu_idx;
+ memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload));
+ msg.send_size = sizeof(req.dpu);
+ msg.opcode = MSG_OP_EXEC_DPU;
+ break;
+ }
+ default:
+ XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op);
+ return -EINVAL;
+ }
+ msg.handle = job;
+ msg.notify_cb = notify_cb;
+ msg.send_data = (u8 *)&req;
+ print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req,
+ 0x40, false);
+
+ ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+ if (ret) {
+ XDNA_ERR(xdna, "Send message failed");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int
+aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset,
+ struct amdxdna_gem_obj *abo, u32 *size)
+{
+ struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset;
+ int cu_idx = amdxdna_cmd_get_cu_idx(abo);
+ u32 payload_len;
+ void *payload;
+
+ if (cu_idx < 0)
+ return -EINVAL;
+
+ payload = amdxdna_cmd_get_payload(abo, &payload_len);
+ if (!payload)
+ return -EINVAL;
+
+ if (!slot_cf_has_space(offset, payload_len))
+ return -ENOSPC;
+
+ buf->cu_idx = cu_idx;
+ buf->arg_cnt = payload_len / sizeof(u32);
+ memcpy(buf->args, payload, payload_len);
+ /* Accurate buf size to hint firmware to do necessary copy */
+ *size = sizeof(*buf) + payload_len;
+ return 0;
+}
+
+static int
+aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset,
+ struct amdxdna_gem_obj *abo, u32 *size)
+{
+ struct cmd_chain_slot_dpu *buf = cmd_buf + offset;
+ int cu_idx = amdxdna_cmd_get_cu_idx(abo);
+ struct amdxdna_cmd_start_npu *sn;
+ u32 payload_len;
+ void *payload;
+ u32 arg_sz;
+
+ if (cu_idx < 0)
+ return -EINVAL;
+
+ payload = amdxdna_cmd_get_payload(abo, &payload_len);
+ if (!payload)
+ return -EINVAL;
+ sn = payload;
+ arg_sz = payload_len - sizeof(*sn);
+ if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE)
+ return -EINVAL;
+
+ if (!slot_dpu_has_space(offset, arg_sz))
+ return -ENOSPC;
+
+ buf->inst_buf_addr = sn->buffer;
+ buf->inst_size = sn->buffer_size;
+ buf->inst_prop_cnt = sn->prop_count;
+ buf->cu_idx = cu_idx;
+ buf->arg_cnt = arg_sz / sizeof(u32);
+ memcpy(buf->args, sn->prop_args, arg_sz);
+
+ /* Accurate buf size to hint firmware to do necessary copy */
+ *size += sizeof(*buf) + arg_sz;
+ return 0;
+}
+
+static int
+aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset,
+ struct amdxdna_gem_obj *abo, u32 *size)
+{
+ u32 this_op = amdxdna_cmd_get_op(abo);
+ void *cmd_buf = cmdbuf_abo->mem.kva;
+ int ret;
+
+ if (this_op != op) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ switch (op) {
+ case ERT_START_CU:
+ ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size);
+ break;
+ case ERT_START_NPU:
+ ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ }
+
+done:
+ if (ret) {
+ XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d",
+ op, ret);
+ }
+ return ret;
+}
+
+static inline struct amdxdna_gem_obj *
+aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
+{
+ int idx = get_job_idx(job->seq);
+
+ return job->hwctx->priv->cmd_buf[idx];
+}
+
+static void
+aie2_cmdlist_prepare_request(struct cmd_chain_req *req,
+ struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt)
+{
+ req->buf_addr = cmdbuf_abo->mem.dev_addr;
+ req->buf_size = size;
+ req->count = cnt;
+ drm_clflush_virt_range(cmdbuf_abo->mem.kva, size);
+ XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d",
+ req->buf_addr, size, cnt);
+}
+
+static inline u32
+aie2_cmd_op_to_msg_op(u32 op)
+{
+ switch (op) {
+ case ERT_START_CU:
+ return MSG_OP_CHAIN_EXEC_BUFFER_CF;
+ case ERT_START_NPU:
+ return MSG_OP_CHAIN_EXEC_DPU;
+ default:
+ return MSG_OP_MAX_OPCODE;
+ }
+}
+
+int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t))
+{
+ struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
+ struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+ struct amdxdna_client *client = hwctx->client;
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ struct amdxdna_cmd_chain *payload;
+ struct xdna_mailbox_msg msg;
+ struct cmd_chain_req req;
+ u32 payload_len;
+ u32 offset = 0;
+ u32 size;
+ int ret;
+ u32 op;
+ u32 i;
+
+ op = amdxdna_cmd_get_op(cmd_abo);
+ payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
+ if (op != ERT_CMD_CHAIN || !payload ||
+ payload_len < struct_size(payload, data, payload->command_count))
+ return -EINVAL;
+
+ for (i = 0; i < payload->command_count; i++) {
+ u32 boh = (u32)(payload->data[i]);
+ struct amdxdna_gem_obj *abo;
+
+ abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD);
+ if (!abo) {
+ XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh);
+ return -ENOENT;
+ }
+
+ /* All sub-cmd should have same op, use the first one. */
+ if (i == 0)
+ op = amdxdna_cmd_get_op(abo);
+
+ ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size);
+ amdxdna_gem_put_obj(abo);
+ if (ret)
+ return -EINVAL;
+
+ offset += size;
+ }
+
+ /* The offset is the accumulated total size of the cmd buffer */
+ aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count);
+
+ msg.opcode = aie2_cmd_op_to_msg_op(op);
+ if (msg.opcode == MSG_OP_MAX_OPCODE)
+ return -EOPNOTSUPP;
+ msg.handle = job;
+ msg.notify_cb = notify_cb;
+ msg.send_data = (u8 *)&req;
+ msg.send_size = sizeof(req);
+ ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+ if (ret) {
+ XDNA_ERR(hwctx->client->xdna, "Send message failed");
+ return ret;
+ }
+
+ return 0;
+}
+
+int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t))
+{
+ struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
+ struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ struct xdna_mailbox_msg msg;
+ struct cmd_chain_req req;
+ u32 size;
+ int ret;
+ u32 op;
+
+ op = amdxdna_cmd_get_op(cmd_abo);
+ ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size);
+ if (ret)
+ return ret;
+
+ aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1);
+
+ msg.opcode = aie2_cmd_op_to_msg_op(op);
+ if (msg.opcode == MSG_OP_MAX_OPCODE)
+ return -EOPNOTSUPP;
+ msg.handle = job;
+ msg.notify_cb = notify_cb;
+ msg.send_data = (u8 *)&req;
+ msg.send_size = sizeof(req);
+ ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+ if (ret) {
+ XDNA_ERR(hwctx->client->xdna, "Send message failed");
+ return ret;
+ }
+
+ return 0;
+}
+
+int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t))
+{
+ struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+ struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]);
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct xdna_mailbox_msg msg;
+ struct sync_bo_req req;
+ int ret = 0;
+
+ req.src_addr = 0;
+ req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr;
+ req.size = abo->mem.size;
+
+ /* Device to Host */
+ req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) |
+ FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM);
+
+ XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed",
+ req.size, req.src_addr, req.dst_addr);
+
+ msg.handle = job;
+ msg.notify_cb = notify_cb;
+ msg.send_data = (u8 *)&req;
+ msg.send_size = sizeof(req);
+ msg.opcode = MSG_OP_SYNC_BO;
+
+ ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+ if (ret) {
+ XDNA_ERR(xdna, "Send message failed");
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ee9f114bc229..6017826a7104 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -5,8 +5,10 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_managed.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
#include <linux/errno.h>
#include <linux/firmware.h>
#include <linux/iommu.h>
@@ -17,6 +19,7 @@
#include "aie2_pci.h"
#include "aie2_solver.h"
#include "amdxdna_ctx.h"
+#include "amdxdna_gem.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"
@@ -499,4 +502,7 @@ const struct amdxdna_dev_ops aie2_ops = {
.hwctx_init = aie2_hwctx_init,
.hwctx_fini = aie2_hwctx_fini,
.hwctx_config = aie2_hwctx_config,
+ .cmd_submit = aie2_cmd_submit,
+ .cmd_wait = aie2_cmd_wait,
+ .hmm_invalidate = aie2_hmm_invalidate,
};
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 3ac936e2c9d1..81877d9c0542 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -76,6 +76,7 @@ enum psp_reg_idx {
PSP_MAX_REGS /* Keep this at the end */
};
+struct amdxdna_client;
struct amdxdna_fw_ver;
struct amdxdna_hwctx;
@@ -118,9 +119,28 @@ struct rt_config {
u32 value;
};
+/*
+ * Define the maximum number of pending commands in a hardware context.
+ * Must be power of 2!
+ */
+#define HWCTX_MAX_CMDS 4
+#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1))
struct amdxdna_hwctx_priv {
struct amdxdna_gem_obj *heap;
void *mbox_chann;
+
+ struct drm_gpu_scheduler sched;
+ struct drm_sched_entity entity;
+
+ struct mutex io_lock; /* protect seq and cmd order */
+ struct wait_queue_head job_free_wq;
+ struct amdxdna_sched_job *pending[HWCTX_MAX_CMDS];
+ u32 num_pending;
+ u64 seq;
+ /* Completed job counter */
+ u64 completed;
+
+ struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS];
};
struct amdxdna_dev_hdl {
@@ -199,10 +219,25 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
int aie2_config_cu(struct amdxdna_hwctx *hwctx);
+int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+ int (*notify_cb)(void *, const u32 *, size_t));
/* aie2_hwctx.c */
int aie2_hwctx_init(struct amdxdna_hwctx *hwctx);
void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx);
int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
+int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
+int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
+void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
+void aie2_restart_ctx(struct amdxdna_client *client);
#endif /* _AIE2_PCI_H_ */
diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c
index 2efcfd1941bf..35ba55e2ab1e 100644
--- a/drivers/accel/amdxdna/aie2_psp.c
+++ b/drivers/accel/amdxdna/aie2_psp.c
@@ -4,8 +4,10 @@
*/
#include <drm/drm_device.h>
+#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_managed.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
#include <linux/iopoll.h>
#include "aie2_pci.h"
diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c
index 3fa7064649aa..91893d438da7 100644
--- a/drivers/accel/amdxdna/aie2_smu.c
+++ b/drivers/accel/amdxdna/aie2_smu.c
@@ -4,7 +4,9 @@
*/
#include <drm/drm_device.h>
+#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
#include <linux/iopoll.h>
#include "aie2_pci.h"
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index 8acf8bfe0db9..b76640e1fdd0 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -7,17 +7,65 @@
#include <drm/drm_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <trace/events/amdxdna.h>
#include "amdxdna_ctx.h"
+#include "amdxdna_gem.h"
#include "amdxdna_pci_drv.h"
#define MAX_HWCTX_ID 255
+#define MAX_ARG_COUNT 4095
-static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
+struct amdxdna_fence {
+ struct dma_fence base;
+ spinlock_t lock; /* for base */
+ struct amdxdna_hwctx *hwctx;
+};
+
+static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence)
+{
+ return KBUILD_MODNAME;
+}
+
+static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence)
+{
+ struct amdxdna_fence *xdna_fence;
+
+ xdna_fence = container_of(fence, struct amdxdna_fence, base);
+
+ return xdna_fence->hwctx->name;
+}
+
+static const struct dma_fence_ops fence_ops = {
+ .get_driver_name = amdxdna_fence_get_driver_name,
+ .get_timeline_name = amdxdna_fence_get_timeline_name,
+};
+
+static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
+{
+ struct amdxdna_fence *fence;
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ return NULL;
+
+ fence->hwctx = hwctx;
+ spin_lock_init(&fence->lock);
+ dma_fence_init(&fence->base, &fence_ops, &fence->lock, hwctx->id, 0);
+ return &fence->base;
+}
+
+static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
+ struct srcu_struct *ss)
{
struct amdxdna_dev *xdna = hwctx->client->xdna;
+ synchronize_srcu(ss);
+
/* At this point, user is not able to submit new commands */
mutex_lock(&xdna->dev_lock);
xdna->dev_info->ops->hwctx_fini(hwctx);
@@ -27,6 +75,46 @@ static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
kfree(hwctx);
}
+void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size)
+{
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+ u32 num_masks, count;
+
+ if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
+ num_masks = 0;
+ else
+ num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
+
+ if (size) {
+ count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header);
+ if (unlikely(count <= num_masks)) {
+ *size = 0;
+ return NULL;
+ }
+ *size = (count - num_masks) * sizeof(u32);
+ }
+ return &cmd->data[num_masks];
+}
+
+int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+ u32 num_masks, i;
+ u32 *cu_mask;
+
+ if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
+ return -1;
+
+ num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
+ cu_mask = cmd->data;
+ for (i = 0; i < num_masks; i++) {
+ if (cu_mask[i])
+ return ffs(cu_mask[i]) - 1;
+ }
+
+ return -1;
+}
+
/*
* This should be called in close() and remove(). DO NOT call in other syscalls.
* This guarantee that when hwctx and resources will be released, if user
@@ -43,7 +131,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client)
client->pid, hwctx->id);
idr_remove(&client->hwctx_idr, hwctx->id);
mutex_unlock(&client->hwctx_lock);
- amdxdna_hwctx_destroy(hwctx);
+ amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
mutex_lock(&client->hwctx_lock);
}
mutex_unlock(&client->hwctx_lock);
@@ -134,6 +222,12 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
if (!drm_dev_enter(dev, &idx))
return -ENODEV;
+ /*
+ * Use hwctx_lock to achieve exclusion with other hwctx writers,
+ * SRCU to synchronize with exec/wait command ioctls.
+ *
+ * The pushed jobs are handled by DRM scheduler during destroy.
+ */
mutex_lock(&client->hwctx_lock);
hwctx = idr_find(&client->hwctx_idr, args->handle);
if (!hwctx) {
@@ -146,7 +240,7 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
idr_remove(&client->hwctx_idr, hwctx->id);
mutex_unlock(&client->hwctx_lock);
- amdxdna_hwctx_destroy(hwctx);
+ amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, args->handle);
out:
@@ -160,10 +254,10 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
struct amdxdna_drm_config_hwctx *args = data;
struct amdxdna_dev *xdna = to_xdna_dev(dev);
struct amdxdna_hwctx *hwctx;
+ int ret, idx;
u32 buf_size;
void *buf;
u64 val;
- int ret;
if (!xdna->dev_info->ops->hwctx_config)
return -EOPNOTSUPP;
@@ -202,17 +296,286 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
}
mutex_lock(&xdna->dev_lock);
+ idx = srcu_read_lock(&client->hwctx_srcu);
hwctx = idr_find(&client->hwctx_idr, args->handle);
if (!hwctx) {
XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle);
ret = -EINVAL;
- goto unlock;
+ goto unlock_srcu;
}
ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size);
-unlock:
+unlock_srcu:
+ srcu_read_unlock(&client->hwctx_srcu, idx);
mutex_unlock(&xdna->dev_lock);
kfree(buf);
return ret;
}
+
+static void
+amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
+{
+ int i;
+
+ for (i = 0; i < job->bo_cnt; i++) {
+ if (!job->bos[i])
+ break;
+ drm_gem_object_put(job->bos[i]);
+ }
+}
+
+static int
+amdxdna_arg_bos_lookup(struct amdxdna_client *client,
+ struct amdxdna_sched_job *job,
+ u32 *bo_hdls, u32 bo_cnt)
+{
+ struct drm_gem_object *gobj;
+ int i, ret;
+
+ job->bo_cnt = bo_cnt;
+ for (i = 0; i < job->bo_cnt; i++) {
+ struct amdxdna_gem_obj *abo;
+
+ gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]);
+ if (!gobj) {
+ ret = -ENOENT;
+ goto put_shmem_bo;
+ }
+ abo = to_xdna_obj(gobj);
+
+ mutex_lock(&abo->lock);
+ if (abo->pinned) {
+ mutex_unlock(&abo->lock);
+ job->bos[i] = gobj;
+ continue;
+ }
+
+ ret = amdxdna_gem_pin_nolock(abo);
+ if (ret) {
+ mutex_unlock(&abo->lock);
+ drm_gem_object_put(gobj);
+ goto put_shmem_bo;
+ }
+ abo->pinned = true;
+ mutex_unlock(&abo->lock);
+
+ job->bos[i] = gobj;
+ }
+
+ return 0;
+
+put_shmem_bo:
+ amdxdna_arg_bos_put(job);
+ return ret;
+}
+
+static void amdxdna_sched_job_release(struct kref *ref)
+{
+ struct amdxdna_sched_job *job;
+
+ job = container_of(ref, struct amdxdna_sched_job, refcnt);
+
+ trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release");
+ amdxdna_arg_bos_put(job);
+ amdxdna_gem_put_obj(job->cmd_bo);
+ kfree(job);
+}
+
+void amdxdna_job_put(struct amdxdna_sched_job *job)
+{
+ kref_put(&job->refcnt, amdxdna_sched_job_release);
+}
+
+int amdxdna_cmd_submit(struct amdxdna_client *client,
+ u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
+ u32 hwctx_hdl, u64 *seq)
+{
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_sched_job *job;
+ struct amdxdna_hwctx *hwctx;
+ int ret, idx;
+
+ XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d", cmd_bo_hdl, arg_bo_cnt);
+ job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL);
+ if (!job)
+ return -ENOMEM;
+
+ if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) {
+ job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD);
+ if (!job->cmd_bo) {
+ XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl);
+ ret = -EINVAL;
+ goto free_job;
+ }
+ } else {
+ job->cmd_bo = NULL;
+ }
+
+ ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt);
+ if (ret) {
+ XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret);
+ goto cmd_put;
+ }
+
+ idx = srcu_read_lock(&client->hwctx_srcu);
+ hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
+ if (!hwctx) {
+ XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
+ client->pid, hwctx_hdl);
+ ret = -EINVAL;
+ goto unlock_srcu;
+ }
+
+ if (hwctx->status != HWCTX_STAT_READY) {
+ XDNA_ERR(xdna, "HW Context is not ready");
+ ret = -EINVAL;
+ goto unlock_srcu;
+ }
+
+ job->hwctx = hwctx;
+ job->mm = current->mm;
+
+ job->fence = amdxdna_fence_create(hwctx);
+ if (!job->fence) {
+ XDNA_ERR(xdna, "Failed to create fence");
+ ret = -ENOMEM;
+ goto unlock_srcu;
+ }
+ kref_init(&job->refcnt);
+
+ ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq);
+ if (ret)
+ goto put_fence;
+
+ /*
+ * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated
+ * resource after synchronize_srcu(). The submitted jobs should be
+ * handled by the queue, for example DRM scheduler, in device layer.
+ * For here we can unlock SRCU.
+ */
+ srcu_read_unlock(&client->hwctx_srcu, idx);
+ trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed");
+
+ return 0;
+
+put_fence:
+ dma_fence_put(job->fence);
+unlock_srcu:
+ srcu_read_unlock(&client->hwctx_srcu, idx);
+ amdxdna_arg_bos_put(job);
+cmd_put:
+ amdxdna_gem_put_obj(job->cmd_bo);
+free_job:
+ kfree(job);
+ return ret;
+}
+
+/*
+ * The submit command ioctl submits a command to firmware. One firmware command
+ * may contain multiple command BOs for processing as a whole.
+ * The command sequence number is returned which can be used for wait command ioctl.
+ */
+static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client,
+ struct amdxdna_drm_exec_cmd *args)
+{
+ struct amdxdna_dev *xdna = client->xdna;
+ u32 *arg_bo_hdls;
+ u32 cmd_bo_hdl;
+ int ret;
+
+ if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) {
+ XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count);
+ return -EINVAL;
+ }
+
+ /* Only support single command for now. */
+ if (args->cmd_count != 1) {
+ XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count);
+ return -EINVAL;
+ }
+
+ cmd_bo_hdl = (u32)args->cmd_handles;
+ arg_bo_hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL);
+ if (!arg_bo_hdls)
+ return -ENOMEM;
+ ret = copy_from_user(arg_bo_hdls, u64_to_user_ptr(args->args),
+ args->arg_count * sizeof(u32));
+ if (ret) {
+ ret = -EFAULT;
+ goto free_cmd_bo_hdls;
+ }
+
+ ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls,
+ args->arg_count, args->hwctx, &args->seq);
+ if (ret)
+ XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret);
+
+free_cmd_bo_hdls:
+ kfree(arg_bo_hdls);
+ if (!ret)
+ XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq);
+ return ret;
+}
+
+int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+ struct amdxdna_client *client = filp->driver_priv;
+ struct amdxdna_drm_exec_cmd *args = data;
+
+ if (args->ext || args->ext_flags)
+ return -EINVAL;
+
+ switch (args->type) {
+ case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
+ return amdxdna_drm_submit_execbuf(client, args);
+ }
+
+ XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
+ return -EINVAL;
+}
+
+int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
+ u64 seq, u32 timeout)
+{
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_hwctx *hwctx;
+ int ret, idx;
+
+ if (!xdna->dev_info->ops->cmd_wait)
+ return -EOPNOTSUPP;
+
+ /* For locking concerns, see amdxdna_drm_exec_cmd_ioctl. */
+ idx = srcu_read_lock(&client->hwctx_srcu);
+ hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
+ if (!hwctx) {
+ XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
+ client->pid, hwctx_hdl);
+ ret = -EINVAL;
+ goto unlock_hwctx_srcu;
+ }
+
+ ret = xdna->dev_info->ops->cmd_wait(hwctx, seq, timeout);
+
+unlock_hwctx_srcu:
+ srcu_read_unlock(&client->hwctx_srcu, idx);
+ return ret;
+}
+
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+ struct amdxdna_client *client = filp->driver_priv;
+ struct amdxdna_dev *xdna = to_xdna_dev(dev);
+ struct amdxdna_drm_wait_cmd *args = data;
+ int ret;
+
+ XDNA_DBG(xdna, "PID %d hwctx %d timeout set %d ms for cmd %lld",
+ client->pid, args->hwctx, args->timeout, args->seq);
+
+ ret = amdxdna_cmd_wait(client, args->hwctx, args->seq, args->timeout);
+
+ XDNA_DBG(xdna, "PID %d hwctx %d cmd %lld wait finished, ret %d",
+ client->pid, args->hwctx, args->seq, ret);
+
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index 665b3208897d..65f9c1dfe32c 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -6,6 +6,52 @@
#ifndef _AMDXDNA_CTX_H_
#define _AMDXDNA_CTX_H_
+#include "amdxdna_gem.h"
+
+struct amdxdna_hwctx_priv;
+
+enum ert_cmd_opcode {
+ ERT_START_CU = 0,
+ ERT_CMD_CHAIN = 19,
+ ERT_START_NPU = 20,
+};
+
+enum ert_cmd_state {
+ ERT_CMD_STATE_INVALID,
+ ERT_CMD_STATE_NEW,
+ ERT_CMD_STATE_QUEUED,
+ ERT_CMD_STATE_RUNNING,
+ ERT_CMD_STATE_COMPLETED,
+ ERT_CMD_STATE_ERROR,
+ ERT_CMD_STATE_ABORT,
+ ERT_CMD_STATE_SUBMITTED,
+ ERT_CMD_STATE_TIMEOUT,
+ ERT_CMD_STATE_NORESPONSE,
+};
+
+/*
+ * Interpretation of the beginning of data payload for ERT_START_NPU in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args.
+ */
+struct amdxdna_cmd_start_npu {
+ u64 buffer; /* instruction buffer address */
+ u32 buffer_size; /* size of buffer in bytes */
+ u32 prop_count; /* properties count */
+ u32 prop_args[]; /* properties and regular kernel arguments */
+};
+
+/*
+ * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
+ */
+struct amdxdna_cmd_chain {
+ u32 command_count;
+ u32 submit_index;
+ u32 error_index;
+ u32 reserved[3];
+ u64 data[] __counted_by(command_count);
+};
+
/* Exec buffer command header format */
#define AMDXDNA_CMD_STATE GENMASK(3, 0)
#define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
@@ -40,9 +86,73 @@ struct amdxdna_hwctx {
struct amdxdna_hwctx_param_config_cu *cus;
};
+#define drm_job_to_xdna_job(j) \
+ container_of(j, struct amdxdna_sched_job, base)
+
+struct amdxdna_sched_job {
+ struct drm_sched_job base;
+ struct kref refcnt;
+ struct amdxdna_hwctx *hwctx;
+ struct mm_struct *mm;
+ /* The fence to notice DRM scheduler that job is done by hardware */
+ struct dma_fence *fence;
+ /* user can wait on this fence */
+ struct dma_fence *out_fence;
+ u64 seq;
+ struct amdxdna_gem_obj *cmd_bo;
+ size_t bo_cnt;
+ struct drm_gem_object *bos[] __counted_by(bo_cnt);
+};
+
+static inline u32
+amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+
+ return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header);
+}
+
+static inline void
+amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s)
+{
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+
+ cmd->header &= ~AMDXDNA_CMD_STATE;
+ cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s);
+}
+
+static inline enum ert_cmd_state
+amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+
+ return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header);
+}
+
+void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
+int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
+
+static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx)
+{
+ return GENMASK(hwctx->start_col + hwctx->num_col - 1,
+ hwctx->start_col);
+}
+
+void amdxdna_job_put(struct amdxdna_sched_job *job);
+
void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
+
+int amdxdna_cmd_submit(struct amdxdna_client *client,
+ u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
+ u32 hwctx_hdl, u64 *seq);
+
+int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
+ u64 seq, u32 timeout);
+
int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
+int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
#endif /* _AMDXDNA_CTX_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
index 66373baa4600..091674a4bbfa 100644
--- a/drivers/accel/amdxdna/amdxdna_gem.c
+++ b/drivers/accel/amdxdna/amdxdna_gem.c
@@ -8,6 +8,7 @@
#include <drm/drm_device.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_shmem_helper.h>
+#include <drm/gpu_scheduler.h>
#include <linux/iosys-map.h>
#include <linux/vmalloc.h>
diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
index 42b615394605..5139a9c96a91 100644
--- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
+++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
@@ -3,10 +3,15 @@
* Copyright (C) 2024, Advanced Micro Devices, Inc.
*/
+#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/gpu_scheduler.h>
#include <linux/completion.h>
+#include "amdxdna_gem.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_mailbox_helper.h"
#include "amdxdna_pci_drv.h"
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
index 47ea79d4a021..5c1e863825e0 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -10,6 +10,7 @@
#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_ioctl.h>
#include <drm/drm_managed.h>
+#include <drm/gpu_scheduler.h>
#include <linux/iommu.h>
#include <linux/pci.h>
@@ -64,6 +65,7 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
goto unbind_sva;
}
mutex_init(&client->hwctx_lock);
+ init_srcu_struct(&client->hwctx_srcu);
idr_init_base(&client->hwctx_idr, AMDXDNA_INVALID_CTX_HANDLE + 1);
mutex_init(&client->mm_lock);
@@ -93,6 +95,7 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp)
XDNA_DBG(xdna, "closing pid %d", client->pid);
idr_destroy(&client->hwctx_idr);
+ cleanup_srcu_struct(&client->hwctx_srcu);
mutex_destroy(&client->hwctx_lock);
mutex_destroy(&client->mm_lock);
if (client->dev_heap)
@@ -133,6 +136,9 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0),
DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 0),
DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
+ /* Exectuion */
+ DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
+ DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
};
static const struct file_operations amdxdna_fops = {
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
index 3dddde4ac12a..0324e73094b2 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
@@ -20,6 +20,7 @@ extern const struct drm_driver amdxdna_drm_drv;
struct amdxdna_dev;
struct amdxdna_gem_obj;
struct amdxdna_hwctx;
+struct amdxdna_sched_job;
/*
* struct amdxdna_dev_ops - Device hardware operation callbacks
@@ -31,6 +32,8 @@ struct amdxdna_dev_ops {
void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
+ int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
+ int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
};
/*
@@ -88,6 +91,8 @@ struct amdxdna_client {
struct list_head node;
pid_t pid;
struct mutex hwctx_lock; /* protect hwctx */
+ /* do NOT wait this srcu when hwctx_lock is hold */
+ struct srcu_struct hwctx_srcu;
struct idr hwctx_idr;
struct amdxdna_dev *xdna;
struct drm_file *filp;
diff --git a/drivers/accel/amdxdna/amdxdna_sysfs.c b/drivers/accel/amdxdna/amdxdna_sysfs.c
index 668b94b92714..f27e4ee960a0 100644
--- a/drivers/accel/amdxdna/amdxdna_sysfs.c
+++ b/drivers/accel/amdxdna/amdxdna_sysfs.c
@@ -3,9 +3,14 @@
* Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
*/
+#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/drm_gem_shmem_helper.h>
#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/types.h>
+#include "amdxdna_gem.h"
#include "amdxdna_pci_drv.h"
static ssize_t vbnv_show(struct device *dev, struct device_attribute *attr, char *buf)
diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
index 720aab0ed7c4..f00c50461b09 100644
--- a/drivers/accel/amdxdna/npu1_regs.c
+++ b/drivers/accel/amdxdna/npu1_regs.c
@@ -5,6 +5,7 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/gpu_scheduler.h>
#include <linux/sizes.h>
#include "aie2_pci.h"
diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c
index f3ea18bcf294..00cb381031d2 100644
--- a/drivers/accel/amdxdna/npu2_regs.c
+++ b/drivers/accel/amdxdna/npu2_regs.c
@@ -5,6 +5,7 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/gpu_scheduler.h>
#include <linux/sizes.h>
#include "aie2_pci.h"
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index db61142f0d4e..b6dae9667cca 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -5,6 +5,7 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/gpu_scheduler.h>
#include <linux/sizes.h>
#include "aie2_pci.h"
diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
index debf4e95b9bb..bed1baf8e160 100644
--- a/drivers/accel/amdxdna/npu5_regs.c
+++ b/drivers/accel/amdxdna/npu5_regs.c
@@ -5,6 +5,7 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_device.h>
+#include <drm/gpu_scheduler.h>
#include <linux/sizes.h>
#include "aie2_pci.h"
diff --git a/include/trace/events/amdxdna.h b/include/trace/events/amdxdna.h
index 33343d8f0622..c6cb2da7b706 100644
--- a/include/trace/events/amdxdna.h
+++ b/include/trace/events/amdxdna.h
@@ -9,8 +9,49 @@
#if !defined(_TRACE_AMDXDNA_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_AMDXDNA_H
+#include <drm/gpu_scheduler.h>
#include <linux/tracepoint.h>
+TRACE_EVENT(amdxdna_debug_point,
+ TP_PROTO(const char *name, u64 number, const char *str),
+
+ TP_ARGS(name, number, str),
+
+ TP_STRUCT__entry(__string(name, name)
+ __field(u64, number)
+ __string(str, str)),
+
+ TP_fast_assign(__assign_str(name);
+ __entry->number = number;
+ __assign_str(str);),
+
+ TP_printk("%s:%llu %s", __get_str(name), __entry->number,
+ __get_str(str))
+);
+
+TRACE_EVENT(xdna_job,
+ TP_PROTO(struct drm_sched_job *sched_job, const char *name, const char *str, u64 seq),
+
+ TP_ARGS(sched_job, name, str, seq),
+
+ TP_STRUCT__entry(__string(name, name)
+ __string(str, str)
+ __field(u64, fence_context)
+ __field(u64, fence_seqno)
+ __field(u64, seq)),
+
+ TP_fast_assign(__assign_str(name);
+ __assign_str(str);
+ __entry->fence_context = sched_job->s_fence->finished.context;
+ __entry->fence_seqno = sched_job->s_fence->finished.seqno;
+ __entry->seq = seq;),
+
+ TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld %s",
+ __entry->fence_context, __entry->fence_seqno,
+ __get_str(name), __entry->seq,
+ __get_str(str))
+);
+
DECLARE_EVENT_CLASS(xdna_mbox_msg,
TP_PROTO(char *name, u8 chann_id, u32 opcode, u32 msg_id),
diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
index 3792750834b2..08f3ec7146ab 100644
--- a/include/uapi/drm/amdxdna_accel.h
+++ b/include/uapi/drm/amdxdna_accel.h
@@ -13,6 +13,7 @@
extern "C" {
#endif
+#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
#define AMDXDNA_INVALID_ADDR (~0UL)
#define AMDXDNA_INVALID_CTX_HANDLE 0
#define AMDXDNA_INVALID_BO_HANDLE 0
@@ -29,6 +30,8 @@ enum amdxdna_drm_ioctl_id {
DRM_AMDXDNA_CREATE_BO,
DRM_AMDXDNA_GET_BO_INFO,
DRM_AMDXDNA_SYNC_BO,
+ DRM_AMDXDNA_EXEC_CMD,
+ DRM_AMDXDNA_WAIT_CMD,
};
/**
@@ -201,6 +204,54 @@ struct amdxdna_drm_sync_bo {
__u64 size;
};
+enum amdxdna_cmd_type {
+ AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
+ AMDXDNA_CMD_SUBMIT_DEPENDENCY,
+ AMDXDNA_CMD_SUBMIT_SIGNAL,
+};
+
+/**
+ * struct amdxdna_drm_exec_cmd - Execute command.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @hwctx: Hardware context handle.
+ * @type: One of command type in enum amdxdna_cmd_type.
+ * @cmd_handles: Array of command handles or the command handle itself
+ * in case of just one.
+ * @args: Array of arguments for all command handles.
+ * @cmd_count: Number of command handles in the cmd_handles array.
+ * @arg_count: Number of arguments in the args array.
+ * @seq: Returned sequence number for this command.
+ */
+struct amdxdna_drm_exec_cmd {
+ __u64 ext;
+ __u64 ext_flags;
+ __u32 hwctx;
+ __u32 type;
+ __u64 cmd_handles;
+ __u64 args;
+ __u32 cmd_count;
+ __u32 arg_count;
+ __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_wait_cmd - Wait exectuion command.
+ *
+ * @hwctx: hardware context handle.
+ * @timeout: timeout in ms, 0 implies infinite wait.
+ * @seq: sequence number of the command returned by execute command.
+ *
+ * Wait a command specified by seq to be completed.
+ * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
+ * to submit a new command.
+ */
+struct amdxdna_drm_wait_cmd {
+ __u32 hwctx;
+ __u32 timeout;
+ __u64 seq;
+};
+
#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
struct amdxdna_drm_create_hwctx)
@@ -225,6 +276,14 @@ struct amdxdna_drm_sync_bo {
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \
struct amdxdna_drm_sync_bo)
+#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \
+ struct amdxdna_drm_exec_cmd)
+
+#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
+ struct amdxdna_drm_wait_cmd)
+
#if defined(__cplusplus)
} /* extern c end */
#endif
--
2.34.1
On Fri, Oct 11, 2024 at 04:12:41PM -0700, Lizhi Hou wrote:
> Add interfaces for user application to submit command and wait for its
> completion.
>
> Co-developed-by: Min Ma <min.ma@amd.com>
> Signed-off-by: Min Ma <min.ma@amd.com>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
> drivers/accel/amdxdna/aie2_ctx.c | 624 +++++++++++++++++-
> drivers/accel/amdxdna/aie2_message.c | 343 ++++++++++
> drivers/accel/amdxdna/aie2_pci.c | 6 +
> drivers/accel/amdxdna/aie2_pci.h | 35 +
> drivers/accel/amdxdna/aie2_psp.c | 2 +
> drivers/accel/amdxdna/aie2_smu.c | 2 +
> drivers/accel/amdxdna/amdxdna_ctx.c | 375 ++++++++++-
> drivers/accel/amdxdna/amdxdna_ctx.h | 110 +++
> drivers/accel/amdxdna/amdxdna_gem.c | 1 +
> .../accel/amdxdna/amdxdna_mailbox_helper.c | 5 +
> drivers/accel/amdxdna/amdxdna_pci_drv.c | 6 +
> drivers/accel/amdxdna/amdxdna_pci_drv.h | 5 +
> drivers/accel/amdxdna/amdxdna_sysfs.c | 5 +
> drivers/accel/amdxdna/npu1_regs.c | 1 +
> drivers/accel/amdxdna/npu2_regs.c | 1 +
> drivers/accel/amdxdna/npu4_regs.c | 1 +
> drivers/accel/amdxdna/npu5_regs.c | 1 +
> include/trace/events/amdxdna.h | 41 ++
> include/uapi/drm/amdxdna_accel.h | 59 ++
> 19 files changed, 1614 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index 617fc05077d9..f9010a902c99 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -8,8 +8,11 @@
> #include <drm/drm_gem.h>
> #include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_print.h>
> +#include <linux/hmm.h>
> #include <linux/types.h>
> +#include <trace/events/amdxdna.h>
>
> +#include "aie2_msg_priv.h"
> #include "aie2_pci.h"
> #include "aie2_solver.h"
> #include "amdxdna_ctx.h"
> @@ -17,6 +20,361 @@
> #include "amdxdna_mailbox.h"
> #include "amdxdna_pci_drv.h"
>
> +bool force_cmdlist;
> +module_param(force_cmdlist, bool, 0600);
> +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
> +
> +#define HWCTX_MAX_TIMEOUT 60000 /* miliseconds */
> +
> +static int
> +aie2_hwctx_add_job(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job)
> +{
> + struct amdxdna_sched_job *other;
> + int idx;
> +
> + idx = get_job_idx(hwctx->priv->seq);
> + /* When pending list full, hwctx->seq points to oldest fence */
> + other = hwctx->priv->pending[idx];
> + if (other && other->fence)
> + return -EAGAIN;
> +
> + if (other) {
> + dma_fence_put(other->out_fence);
> + amdxdna_job_put(other);
> + }
> +
> + hwctx->priv->pending[idx] = job;
> + job->seq = hwctx->priv->seq++;
> + kref_get(&job->refcnt);
> +
> + return 0;
> +}
> +
> +static struct amdxdna_sched_job *
> +aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
> +{
> + int idx;
> +
> + /* Special sequence number for oldest fence if exist */
> + if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
> + idx = get_job_idx(hwctx->priv->seq);
> + goto out;
> + }
> +
> + if (seq >= hwctx->priv->seq)
> + return ERR_PTR(-EINVAL);
> +
> + if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
> + return NULL;
> +
> + idx = get_job_idx(seq);
> +
> +out:
> + return hwctx->priv->pending[idx];
> +}
> +
> +/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
> +static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
> + struct drm_sched_job *bad_job)
> +{
> + drm_sched_stop(&hwctx->priv->sched, bad_job);
> + aie2_destroy_context(xdna->dev_handle, hwctx);
> +}
> +
> +static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_gem_obj *heap = hwctx->priv->heap;
> + int ret;
> +
> + ret = aie2_create_context(xdna->dev_handle, hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
> + goto out;
> + }
> +
> + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
> + heap->mem.userptr, heap->mem.size);
> + if (ret) {
> + XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
> + goto out;
> + }
> +
> + if (hwctx->status != HWCTX_STAT_READY) {
> + XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
> + goto out;
> + }
> +
> + ret = aie2_config_cu(hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
> + goto out;
> + }
> +
> +out:
> + drm_sched_start(&hwctx->priv->sched);
> + XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
> + return ret;
> +}
> +
> +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_hwctx *hwctx;
> + int next = 0;
> +
> + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> + mutex_lock(&client->hwctx_lock);
> + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
> + /* check if the HW context uses the error column */
> + if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
> + continue;
> +
> + aie2_hwctx_stop(xdna, hwctx, NULL);
> + hwctx->old_status = hwctx->status;
> + hwctx->status = HWCTX_STAT_STOP;
> + XDNA_DBG(xdna, "Stop %s", hwctx->name);
> + }
> + mutex_unlock(&client->hwctx_lock);
> +}
> +
> +void aie2_restart_ctx(struct amdxdna_client *client)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_hwctx *hwctx;
> + int next = 0;
> +
> + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> + mutex_lock(&client->hwctx_lock);
> + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
> + if (hwctx->status != HWCTX_STAT_STOP)
> + continue;
> +
> + hwctx->status = hwctx->old_status;
> + XDNA_DBG(xdna, "Resetting %s", hwctx->name);
> + aie2_hwctx_restart(xdna, hwctx);
> + }
> + mutex_unlock(&client->hwctx_lock);
> +}
> +
> +static int aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_sched_job *job;
> +
> + mutex_lock(&hwctx->priv->io_lock);
> + if (!hwctx->priv->seq) {
> + mutex_unlock(&hwctx->priv->io_lock);
> + return 0;
> + }
> +
> + job = aie2_hwctx_get_job(hwctx, hwctx->priv->seq - 1);
> + if (IS_ERR_OR_NULL(job)) {
> + mutex_unlock(&hwctx->priv->io_lock);
> + XDNA_WARN(hwctx->client->xdna, "Corrupted pending list");
> + return 0;
> + }
> + mutex_unlock(&hwctx->priv->io_lock);
> +
> + wait_event(hwctx->priv->job_free_wq, !job->fence);
> +
> + return 0;
> +}
> +
> +static void
> +aie2_sched_notify(struct amdxdna_sched_job *job)
> +{
> + struct dma_fence *fence = job->fence;
> +
> + job->hwctx->priv->completed++;
> + dma_fence_signal(fence);
> + trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
> + dma_fence_put(fence);
> + mmput(job->mm);
> + amdxdna_job_put(job);
> +}
> +
> +static int
> +aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
> +{
> + struct amdxdna_sched_job *job = handle;
> + struct amdxdna_gem_obj *cmd_abo;
> + u32 ret = 0;
> + u32 status;
> +
> + cmd_abo = job->cmd_bo;
> +
> + if (unlikely(!data))
> + goto out;
> +
> + if (unlikely(size != sizeof(u32))) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + status = *data;
> + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
> + if (status == AIE2_STATUS_SUCCESS)
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
> + else
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
> +
> +out:
> + aie2_sched_notify(job);
> + return ret;
> +}
> +
> +static int
> +aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
> +{
> + struct amdxdna_sched_job *job = handle;
> + u32 ret = 0;
> + u32 status;
> +
> + if (unlikely(!data))
> + goto out;
> +
> + if (unlikely(size != sizeof(u32))) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + status = *data;
> + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
> +
> +out:
> + aie2_sched_notify(job);
> + return ret;
> +}
> +
> +static int
> +aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size)
> +{
> + struct amdxdna_sched_job *job = handle;
> + struct amdxdna_gem_obj *cmd_abo;
> + struct cmd_chain_resp *resp;
> + struct amdxdna_dev *xdna;
> + u32 fail_cmd_status;
> + u32 fail_cmd_idx;
> + u32 ret = 0;
> +
> + cmd_abo = job->cmd_bo;
> + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + resp = (struct cmd_chain_resp *)data;
> + xdna = job->hwctx->client->xdna;
> + XDNA_DBG(xdna, "Status 0x%x", resp->status);
> + if (resp->status == AIE2_STATUS_SUCCESS) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
> + goto out;
> + }
> +
> + /* Slow path to handle error, read from ringbuf on BAR */
> + fail_cmd_idx = resp->fail_cmd_idx;
> + fail_cmd_status = resp->fail_cmd_status;
> + XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x",
> + fail_cmd_idx, fail_cmd_status);
> +
> + if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> + ret = -EINVAL;
> + goto out;
> + }
> + amdxdna_cmd_set_state(cmd_abo, fail_cmd_status);
> +
> + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
> + struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
> +
> + cc->error_index = fail_cmd_idx;
> + if (cc->error_index >= cc->command_count)
> + cc->error_index = 0;
> + }
> +out:
> + aie2_sched_notify(job);
> + return ret;
> +}
> +
> +static struct dma_fence *
> +aie2_sched_job_run(struct drm_sched_job *sched_job)
> +{
> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> + struct amdxdna_hwctx *hwctx = job->hwctx;
> + struct dma_fence *fence;
> + int ret;
> +
> + if (!mmget_not_zero(job->mm))
> + return ERR_PTR(-ESRCH);
> +
> + kref_get(&job->refcnt);
> + fence = dma_fence_get(job->fence);
> +
> + if (unlikely(!cmd_abo)) {
> + ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler);
> + goto out;
> + }
> +
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW);
> +
> + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN)
> + ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
> + else if (force_cmdlist)
> + ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
> + else
> + ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler);
> +
> +out:
> + if (ret) {
> + dma_fence_put(job->fence);
> + amdxdna_job_put(job);
> + mmput(job->mm);
> + fence = ERR_PTR(ret);
> + }
> + trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
> +
> + return fence;
> +}
> +
> +static void aie2_sched_job_free(struct drm_sched_job *sched_job)
> +{
> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> + struct amdxdna_hwctx *hwctx = job->hwctx;
> +
> + trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
> + drm_sched_job_cleanup(sched_job);
> + job->fence = NULL;
> + amdxdna_job_put(job);
> +
> + wake_up(&hwctx->priv->job_free_wq);
> +}
> +
> +static enum drm_gpu_sched_stat
> +aie2_sched_job_timedout(struct drm_sched_job *sched_job)
> +{
> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> + struct amdxdna_hwctx *hwctx = job->hwctx;
> + struct amdxdna_dev *xdna;
> +
> + xdna = hwctx->client->xdna;
> + trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> + mutex_lock(&xdna->dev_lock);
> + aie2_hwctx_stop(xdna, hwctx, sched_job);
> +
> + aie2_hwctx_restart(xdna, hwctx);
> + mutex_unlock(&xdna->dev_lock);
> +
> + return DRM_GPU_SCHED_STAT_NOMINAL;
> +}
> +
> +const struct drm_sched_backend_ops sched_ops = {
> + .run_job = aie2_sched_job_run,
> + .free_job = aie2_sched_job_free,
> + .timedout_job = aie2_sched_job_timedout,
> +};
> +
> static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx)
> {
> struct amdxdna_dev *xdna = hwctx->client->xdna;
> @@ -130,9 +488,10 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> {
> struct amdxdna_client *client = hwctx->client;
> struct amdxdna_dev *xdna = client->xdna;
> + struct drm_gpu_scheduler *sched;
> struct amdxdna_hwctx_priv *priv;
> struct amdxdna_gem_obj *heap;
> - int ret;
> + int i, ret;
>
> priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
> if (!priv)
> @@ -157,10 +516,48 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> goto put_heap;
> }
>
> + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
> + struct amdxdna_gem_obj *abo;
> + struct amdxdna_drm_create_bo args = {
> + .flags = 0,
> + .type = AMDXDNA_BO_DEV,
> + .vaddr = 0,
> + .size = MAX_CHAIN_CMDBUF_SIZE,
> + };
> +
> + abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, client->filp, true);
> + if (IS_ERR(abo)) {
> + ret = PTR_ERR(abo);
> + goto free_cmd_bufs;
> + }
> +
> + XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx",
> + i, abo->mem.dev_addr, abo->mem.size);
> + priv->cmd_buf[i] = abo;
> + }
> +
> + sched = &priv->sched;
> + mutex_init(&priv->io_lock);
> + ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
> + HWCTX_MAX_CMDS, 0, msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
> + NULL, NULL, hwctx->name, xdna->ddev.dev);
> + if (ret) {
> + XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
> + goto free_cmd_bufs;
> + }
> +
> + ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
> + &sched, 1, NULL);
> + if (ret) {
> + XDNA_ERR(xdna, "Failed to initial sched entiry. ret %d", ret);
> + goto free_sched;
> + }
> + init_waitqueue_head(&priv->job_free_wq);
> +
> ret = aie2_hwctx_col_list(hwctx);
> if (ret) {
> XDNA_ERR(xdna, "Create col list failed, ret %d", ret);
> - goto unpin;
> + goto free_entity;
> }
>
> ret = aie2_alloc_resource(hwctx);
> @@ -185,7 +582,16 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> aie2_release_resource(hwctx);
> free_col_list:
> kfree(hwctx->col_list);
> -unpin:
> +free_entity:
> + drm_sched_entity_destroy(&priv->entity);
> +free_sched:
> + drm_sched_fini(&priv->sched);
> +free_cmd_bufs:
> + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
> + if (!priv->cmd_buf[i])
> + continue;
> + drm_gem_object_put(to_gobj(priv->cmd_buf[i]));
> + }
> amdxdna_gem_unpin(heap);
> put_heap:
> drm_gem_object_put(to_gobj(heap));
> @@ -196,11 +602,43 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>
> void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
> {
> + struct amdxdna_sched_job *job;
> + struct amdxdna_dev *xdna;
> + int idx;
> +
> + xdna = hwctx->client->xdna;
> + drm_sched_wqueue_stop(&hwctx->priv->sched);
> +
> + /* Now, scheduler will not send command to device. */
> aie2_release_resource(hwctx);
>
> + /*
> + * All submitted commands are aborted.
> + * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run()
> + * will return NODEV if it is called.
> + */
> + drm_sched_wqueue_start(&hwctx->priv->sched);
> +
> + aie2_hwctx_wait_for_idle(hwctx);
> + drm_sched_entity_destroy(&hwctx->priv->entity);
> + drm_sched_fini(&hwctx->priv->sched);
> +
> + for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
> + job = hwctx->priv->pending[idx];
> + if (!job)
> + continue;
> +
> + dma_fence_put(job->out_fence);
> + amdxdna_job_put(job);
> + }
> + XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq);
> +
> + for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++)
> + drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx]));
> amdxdna_gem_unpin(hwctx->priv->heap);
> drm_gem_object_put(to_gobj(hwctx->priv->heap));
>
> + mutex_destroy(&hwctx->priv->io_lock);
> kfree(hwctx->col_list);
> kfree(hwctx->priv);
> kfree(hwctx->cus);
> @@ -267,3 +705,183 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu
> return -EOPNOTSUPP;
> }
> }
> +
> +static int aie2_populate_range(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> + struct mm_struct *mm = abo->mem.notifier.mm;
> + struct hmm_range range = { 0 };
> + unsigned long timeout;
> + int ret;
> +
> + XDNA_INFO_ONCE(xdna, "populate memory range %llx size %lx",
> + abo->mem.userptr, abo->mem.size);
> + range.notifier = &abo->mem.notifier;
> + range.start = abo->mem.userptr;
> + range.end = abo->mem.userptr + abo->mem.size;
> + range.hmm_pfns = abo->mem.pfns;
> + range.default_flags = HMM_PFN_REQ_FAULT;
> +
> + if (!mmget_not_zero(mm))
> + return -EFAULT;
> +
> + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +again:
> + range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier);
> + mmap_read_lock(mm);
> + ret = hmm_range_fault(&range);
> + mmap_read_unlock(mm);
> + if (ret) {
> + if (time_after(jiffies, timeout)) {
> + ret = -ETIME;
> + goto put_mm;
> + }
> +
> + if (ret == -EBUSY)
> + goto again;
> +
> + goto put_mm;
> + }
> +
> + dma_resv_lock(to_gobj(abo)->resv, NULL);
> + if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) {
> + dma_resv_unlock(to_gobj(abo)->resv);
> + goto again;
> + }
> + abo->mem.map_invalid = false;
> + dma_resv_unlock(to_gobj(abo)->resv);
> +
> +put_mm:
> + mmput(mm);
> + return ret;
> +}
> +
> +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
> +{
> + struct amdxdna_dev *xdna = hwctx->client->xdna;
> + struct ww_acquire_ctx acquire_ctx;
> + struct amdxdna_gem_obj *abo;
> + unsigned long timeout = 0;
> + int ret, i;
> +
> + ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
> + return ret;
> + }
> +
> + drm_sched_job_arm(&job->base);
> + job->out_fence = dma_fence_get(&job->base.s_fence->finished);
> +
> +retry:
> + ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> + if (ret) {
> + XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret);
> + goto put_fence;
> + }
> +
> + for (i = 0; i < job->bo_cnt; i++) {
> + abo = to_xdna_obj(job->bos[i]);
> + if (abo->mem.map_invalid) {
> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> + if (!timeout) {
> + timeout = jiffies +
> + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> + } else if (time_after(jiffies, timeout)) {
> + ret = -ETIME;
> + goto put_fence;
> + }
> +
> + ret = aie2_populate_range(abo);
> + if (ret)
> + goto put_fence;
> + goto retry;
> + }
> +
> + ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
> + if (ret) {
> + XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> + goto put_fence;
> + }
> + }
> +
> + for (i = 0; i < job->bo_cnt; i++)
> + dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> +
> + mutex_lock(&hwctx->priv->io_lock);
> + ret = aie2_hwctx_add_job(hwctx, job);
> + if (ret) {
> + mutex_unlock(&hwctx->priv->io_lock);
> + goto signal_fence;
> + }
> +
> + *seq = job->seq;
> + drm_sched_entity_push_job(&job->base);
> + mutex_unlock(&hwctx->priv->io_lock);
> +
> + return 0;
> +
> +signal_fence:
> + dma_fence_signal(job->out_fence);
> +put_fence:
> + dma_fence_put(job->out_fence);
> + drm_sched_job_cleanup(&job->base);
> + return ret;
> +}
> +
> +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
> +{
> + signed long remaining = MAX_SCHEDULE_TIMEOUT;
> + struct amdxdna_sched_job *job;
> + struct dma_fence *out_fence;
> + long ret;
> +
> + mutex_lock(&hwctx->priv->io_lock);
> + job = aie2_hwctx_get_job(hwctx, seq);
> + if (IS_ERR(job)) {
> + mutex_unlock(&hwctx->priv->io_lock);
> + ret = PTR_ERR(job);
> + goto out;
> + }
> +
> + if (unlikely(!job)) {
> + mutex_unlock(&hwctx->priv->io_lock);
> + ret = 0;
> + goto out;
> + }
> + out_fence = dma_fence_get(job->out_fence);
> + mutex_unlock(&hwctx->priv->io_lock);
> +
> + if (timeout)
> + remaining = msecs_to_jiffies(timeout);
> +
> + ret = dma_fence_wait_timeout(out_fence, true, remaining);
> + if (!ret)
> + ret = -ETIME;
> + else if (ret > 0)
> + ret = 0;
> +
> + dma_fence_put(out_fence);
> +out:
> + return ret;
> +}
> +
> +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
> + unsigned long cur_seq)
> +{
> + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> + struct drm_gem_object *gobj = to_gobj(abo);
> + long ret;
> +
> + dma_resv_lock(gobj->resv, NULL);
Was randomly looking as I was interested - drive by comment...
I think you have a locking inversion here.
MMU notifiers are in the path of reclaim (Can't allocate memory),
dma-resv lock can allocate memory while being held, thus you can
deadlock.
Lockdep should blow up here. See [1] [2].
[1] https://elixir.bootlin.com/linux/v6.11.3/source/mm/page_alloc.c#L3833
[2] https://elixir.bootlin.com/linux/v6.11.3/source/drivers/dma-buf/dma-resv.c#L773
Matt
> + abo->mem.map_invalid = true;
> + mmu_interval_set_seq(&abo->mem.notifier, cur_seq);
> + ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP,
> + true, MAX_SCHEDULE_TIMEOUT);
> + dma_resv_unlock(gobj->resv);
> +
> + if (!ret || ret == -ERESTARTSYS)
> + XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret);
> +}
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 28bd0560db61..3dc4a9a8571e 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -4,10 +4,12 @@
> */
>
> #include <drm/amdxdna_accel.h>
> +#include <drm/drm_cache.h>
> #include <drm/drm_device.h>
> #include <drm/drm_gem.h>
> #include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/errno.h>
> #include <linux/pci.h>
> #include <linux/types.h>
> @@ -361,3 +363,344 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx)
> msg.opcode, resp.status, ret);
> return ret;
> }
> +
> +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t))
> +{
> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> + struct amdxdna_dev *xdna = hwctx->client->xdna;
> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> + union {
> + struct execute_buffer_req ebuf;
> + struct exec_dpu_req dpu;
> + } req;
> + struct xdna_mailbox_msg msg;
> + u32 payload_len;
> + void *payload;
> + int cu_idx;
> + int ret;
> + u32 op;
> +
> + if (!chann)
> + return -ENODEV;
> +
> + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
> + if (!payload) {
> + XDNA_ERR(xdna, "Invalid command, cannot get payload");
> + return -EINVAL;
> + }
> +
> + cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo);
> + if (cu_idx < 0) {
> + XDNA_DBG(xdna, "Invalid cu idx");
> + return -EINVAL;
> + }
> +
> + op = amdxdna_cmd_get_op(cmd_abo);
> + switch (op) {
> + case ERT_START_CU:
> + if (unlikely(payload_len > sizeof(req.ebuf.payload)))
> + XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len);
> + req.ebuf.cu_idx = cu_idx;
> + memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload));
> + msg.send_size = sizeof(req.ebuf);
> + msg.opcode = MSG_OP_EXECUTE_BUFFER_CF;
> + break;
> + case ERT_START_NPU: {
> + struct amdxdna_cmd_start_npu *sn = payload;
> +
> + if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload)))
> + XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len);
> + req.dpu.inst_buf_addr = sn->buffer;
> + req.dpu.inst_size = sn->buffer_size;
> + req.dpu.inst_prop_cnt = sn->prop_count;
> + req.dpu.cu_idx = cu_idx;
> + memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload));
> + msg.send_size = sizeof(req.dpu);
> + msg.opcode = MSG_OP_EXEC_DPU;
> + break;
> + }
> + default:
> + XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op);
> + return -EINVAL;
> + }
> + msg.handle = job;
> + msg.notify_cb = notify_cb;
> + msg.send_data = (u8 *)&req;
> + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req,
> + 0x40, false);
> +
> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> + if (ret) {
> + XDNA_ERR(xdna, "Send message failed");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset,
> + struct amdxdna_gem_obj *abo, u32 *size)
> +{
> + struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset;
> + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
> + u32 payload_len;
> + void *payload;
> +
> + if (cu_idx < 0)
> + return -EINVAL;
> +
> + payload = amdxdna_cmd_get_payload(abo, &payload_len);
> + if (!payload)
> + return -EINVAL;
> +
> + if (!slot_cf_has_space(offset, payload_len))
> + return -ENOSPC;
> +
> + buf->cu_idx = cu_idx;
> + buf->arg_cnt = payload_len / sizeof(u32);
> + memcpy(buf->args, payload, payload_len);
> + /* Accurate buf size to hint firmware to do necessary copy */
> + *size = sizeof(*buf) + payload_len;
> + return 0;
> +}
> +
> +static int
> +aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset,
> + struct amdxdna_gem_obj *abo, u32 *size)
> +{
> + struct cmd_chain_slot_dpu *buf = cmd_buf + offset;
> + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
> + struct amdxdna_cmd_start_npu *sn;
> + u32 payload_len;
> + void *payload;
> + u32 arg_sz;
> +
> + if (cu_idx < 0)
> + return -EINVAL;
> +
> + payload = amdxdna_cmd_get_payload(abo, &payload_len);
> + if (!payload)
> + return -EINVAL;
> + sn = payload;
> + arg_sz = payload_len - sizeof(*sn);
> + if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE)
> + return -EINVAL;
> +
> + if (!slot_dpu_has_space(offset, arg_sz))
> + return -ENOSPC;
> +
> + buf->inst_buf_addr = sn->buffer;
> + buf->inst_size = sn->buffer_size;
> + buf->inst_prop_cnt = sn->prop_count;
> + buf->cu_idx = cu_idx;
> + buf->arg_cnt = arg_sz / sizeof(u32);
> + memcpy(buf->args, sn->prop_args, arg_sz);
> +
> + /* Accurate buf size to hint firmware to do necessary copy */
> + *size += sizeof(*buf) + arg_sz;
> + return 0;
> +}
> +
> +static int
> +aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset,
> + struct amdxdna_gem_obj *abo, u32 *size)
> +{
> + u32 this_op = amdxdna_cmd_get_op(abo);
> + void *cmd_buf = cmdbuf_abo->mem.kva;
> + int ret;
> +
> + if (this_op != op) {
> + ret = -EINVAL;
> + goto done;
> + }
> +
> + switch (op) {
> + case ERT_START_CU:
> + ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size);
> + break;
> + case ERT_START_NPU:
> + ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size);
> + break;
> + default:
> + ret = -EOPNOTSUPP;
> + }
> +
> +done:
> + if (ret) {
> + XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d",
> + op, ret);
> + }
> + return ret;
> +}
> +
> +static inline struct amdxdna_gem_obj *
> +aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
> +{
> + int idx = get_job_idx(job->seq);
> +
> + return job->hwctx->priv->cmd_buf[idx];
> +}
> +
> +static void
> +aie2_cmdlist_prepare_request(struct cmd_chain_req *req,
> + struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt)
> +{
> + req->buf_addr = cmdbuf_abo->mem.dev_addr;
> + req->buf_size = size;
> + req->count = cnt;
> + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size);
> + XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d",
> + req->buf_addr, size, cnt);
> +}
> +
> +static inline u32
> +aie2_cmd_op_to_msg_op(u32 op)
> +{
> + switch (op) {
> + case ERT_START_CU:
> + return MSG_OP_CHAIN_EXEC_BUFFER_CF;
> + case ERT_START_NPU:
> + return MSG_OP_CHAIN_EXEC_DPU;
> + default:
> + return MSG_OP_MAX_OPCODE;
> + }
> +}
> +
> +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
> + struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t))
> +{
> + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> + struct amdxdna_client *client = hwctx->client;
> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> + struct amdxdna_cmd_chain *payload;
> + struct xdna_mailbox_msg msg;
> + struct cmd_chain_req req;
> + u32 payload_len;
> + u32 offset = 0;
> + u32 size;
> + int ret;
> + u32 op;
> + u32 i;
> +
> + op = amdxdna_cmd_get_op(cmd_abo);
> + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
> + if (op != ERT_CMD_CHAIN || !payload ||
> + payload_len < struct_size(payload, data, payload->command_count))
> + return -EINVAL;
> +
> + for (i = 0; i < payload->command_count; i++) {
> + u32 boh = (u32)(payload->data[i]);
> + struct amdxdna_gem_obj *abo;
> +
> + abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD);
> + if (!abo) {
> + XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh);
> + return -ENOENT;
> + }
> +
> + /* All sub-cmd should have same op, use the first one. */
> + if (i == 0)
> + op = amdxdna_cmd_get_op(abo);
> +
> + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size);
> + amdxdna_gem_put_obj(abo);
> + if (ret)
> + return -EINVAL;
> +
> + offset += size;
> + }
> +
> + /* The offset is the accumulated total size of the cmd buffer */
> + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count);
> +
> + msg.opcode = aie2_cmd_op_to_msg_op(op);
> + if (msg.opcode == MSG_OP_MAX_OPCODE)
> + return -EOPNOTSUPP;
> + msg.handle = job;
> + msg.notify_cb = notify_cb;
> + msg.send_data = (u8 *)&req;
> + msg.send_size = sizeof(req);
> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> + if (ret) {
> + XDNA_ERR(hwctx->client->xdna, "Send message failed");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
> + struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t))
> +{
> + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> + struct xdna_mailbox_msg msg;
> + struct cmd_chain_req req;
> + u32 size;
> + int ret;
> + u32 op;
> +
> + op = amdxdna_cmd_get_op(cmd_abo);
> + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size);
> + if (ret)
> + return ret;
> +
> + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1);
> +
> + msg.opcode = aie2_cmd_op_to_msg_op(op);
> + if (msg.opcode == MSG_OP_MAX_OPCODE)
> + return -EOPNOTSUPP;
> + msg.handle = job;
> + msg.notify_cb = notify_cb;
> + msg.send_data = (u8 *)&req;
> + msg.send_size = sizeof(req);
> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> + if (ret) {
> + XDNA_ERR(hwctx->client->xdna, "Send message failed");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t))
> +{
> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]);
> + struct amdxdna_dev *xdna = hwctx->client->xdna;
> + struct xdna_mailbox_msg msg;
> + struct sync_bo_req req;
> + int ret = 0;
> +
> + req.src_addr = 0;
> + req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr;
> + req.size = abo->mem.size;
> +
> + /* Device to Host */
> + req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) |
> + FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM);
> +
> + XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed",
> + req.size, req.src_addr, req.dst_addr);
> +
> + msg.handle = job;
> + msg.notify_cb = notify_cb;
> + msg.send_data = (u8 *)&req;
> + msg.send_size = sizeof(req);
> + msg.opcode = MSG_OP_SYNC_BO;
> +
> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> + if (ret) {
> + XDNA_ERR(xdna, "Send message failed");
> + return ret;
> + }
> +
> + return 0;
> +}
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index ee9f114bc229..6017826a7104 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -5,8 +5,10 @@
>
> #include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_managed.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/errno.h>
> #include <linux/firmware.h>
> #include <linux/iommu.h>
> @@ -17,6 +19,7 @@
> #include "aie2_pci.h"
> #include "aie2_solver.h"
> #include "amdxdna_ctx.h"
> +#include "amdxdna_gem.h"
> #include "amdxdna_mailbox.h"
> #include "amdxdna_pci_drv.h"
>
> @@ -499,4 +502,7 @@ const struct amdxdna_dev_ops aie2_ops = {
> .hwctx_init = aie2_hwctx_init,
> .hwctx_fini = aie2_hwctx_fini,
> .hwctx_config = aie2_hwctx_config,
> + .cmd_submit = aie2_cmd_submit,
> + .cmd_wait = aie2_cmd_wait,
> + .hmm_invalidate = aie2_hmm_invalidate,
> };
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 3ac936e2c9d1..81877d9c0542 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -76,6 +76,7 @@ enum psp_reg_idx {
> PSP_MAX_REGS /* Keep this at the end */
> };
>
> +struct amdxdna_client;
> struct amdxdna_fw_ver;
> struct amdxdna_hwctx;
>
> @@ -118,9 +119,28 @@ struct rt_config {
> u32 value;
> };
>
> +/*
> + * Define the maximum number of pending commands in a hardware context.
> + * Must be power of 2!
> + */
> +#define HWCTX_MAX_CMDS 4
> +#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1))
> struct amdxdna_hwctx_priv {
> struct amdxdna_gem_obj *heap;
> void *mbox_chann;
> +
> + struct drm_gpu_scheduler sched;
> + struct drm_sched_entity entity;
> +
> + struct mutex io_lock; /* protect seq and cmd order */
> + struct wait_queue_head job_free_wq;
> + struct amdxdna_sched_job *pending[HWCTX_MAX_CMDS];
> + u32 num_pending;
> + u64 seq;
> + /* Completed job counter */
> + u64 completed;
> +
> + struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS];
> };
>
> struct amdxdna_dev_hdl {
> @@ -199,10 +219,25 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
> int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> int aie2_config_cu(struct amdxdna_hwctx *hwctx);
> +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t));
> +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
> + struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t));
> +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
> + struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t));
> +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> + int (*notify_cb)(void *, const u32 *, size_t));
>
> /* aie2_hwctx.c */
> int aie2_hwctx_init(struct amdxdna_hwctx *hwctx);
> void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx);
> int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
> +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
> +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
> +void aie2_restart_ctx(struct amdxdna_client *client);
>
> #endif /* _AIE2_PCI_H_ */
> diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c
> index 2efcfd1941bf..35ba55e2ab1e 100644
> --- a/drivers/accel/amdxdna/aie2_psp.c
> +++ b/drivers/accel/amdxdna/aie2_psp.c
> @@ -4,8 +4,10 @@
> */
>
> #include <drm/drm_device.h>
> +#include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_managed.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/iopoll.h>
>
> #include "aie2_pci.h"
> diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c
> index 3fa7064649aa..91893d438da7 100644
> --- a/drivers/accel/amdxdna/aie2_smu.c
> +++ b/drivers/accel/amdxdna/aie2_smu.c
> @@ -4,7 +4,9 @@
> */
>
> #include <drm/drm_device.h>
> +#include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/iopoll.h>
>
> #include "aie2_pci.h"
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
> index 8acf8bfe0db9..b76640e1fdd0 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
> @@ -7,17 +7,65 @@
> #include <drm/drm_device.h>
> #include <drm/drm_drv.h>
> #include <drm/drm_file.h>
> +#include <drm/drm_gem.h>
> +#include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> +#include <trace/events/amdxdna.h>
>
> #include "amdxdna_ctx.h"
> +#include "amdxdna_gem.h"
> #include "amdxdna_pci_drv.h"
>
> #define MAX_HWCTX_ID 255
> +#define MAX_ARG_COUNT 4095
>
> -static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
> +struct amdxdna_fence {
> + struct dma_fence base;
> + spinlock_t lock; /* for base */
> + struct amdxdna_hwctx *hwctx;
> +};
> +
> +static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence)
> +{
> + return KBUILD_MODNAME;
> +}
> +
> +static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence)
> +{
> + struct amdxdna_fence *xdna_fence;
> +
> + xdna_fence = container_of(fence, struct amdxdna_fence, base);
> +
> + return xdna_fence->hwctx->name;
> +}
> +
> +static const struct dma_fence_ops fence_ops = {
> + .get_driver_name = amdxdna_fence_get_driver_name,
> + .get_timeline_name = amdxdna_fence_get_timeline_name,
> +};
> +
> +static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_fence *fence;
> +
> + fence = kzalloc(sizeof(*fence), GFP_KERNEL);
> + if (!fence)
> + return NULL;
> +
> + fence->hwctx = hwctx;
> + spin_lock_init(&fence->lock);
> + dma_fence_init(&fence->base, &fence_ops, &fence->lock, hwctx->id, 0);
> + return &fence->base;
> +}
> +
> +static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
> + struct srcu_struct *ss)
> {
> struct amdxdna_dev *xdna = hwctx->client->xdna;
>
> + synchronize_srcu(ss);
> +
> /* At this point, user is not able to submit new commands */
> mutex_lock(&xdna->dev_lock);
> xdna->dev_info->ops->hwctx_fini(hwctx);
> @@ -27,6 +75,46 @@ static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
> kfree(hwctx);
> }
>
> +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size)
> +{
> + struct amdxdna_cmd *cmd = abo->mem.kva;
> + u32 num_masks, count;
> +
> + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
> + num_masks = 0;
> + else
> + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
> +
> + if (size) {
> + count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header);
> + if (unlikely(count <= num_masks)) {
> + *size = 0;
> + return NULL;
> + }
> + *size = (count - num_masks) * sizeof(u32);
> + }
> + return &cmd->data[num_masks];
> +}
> +
> +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_cmd *cmd = abo->mem.kva;
> + u32 num_masks, i;
> + u32 *cu_mask;
> +
> + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
> + return -1;
> +
> + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
> + cu_mask = cmd->data;
> + for (i = 0; i < num_masks; i++) {
> + if (cu_mask[i])
> + return ffs(cu_mask[i]) - 1;
> + }
> +
> + return -1;
> +}
> +
> /*
> * This should be called in close() and remove(). DO NOT call in other syscalls.
> * This guarantee that when hwctx and resources will be released, if user
> @@ -43,7 +131,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client)
> client->pid, hwctx->id);
> idr_remove(&client->hwctx_idr, hwctx->id);
> mutex_unlock(&client->hwctx_lock);
> - amdxdna_hwctx_destroy(hwctx);
> + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
> mutex_lock(&client->hwctx_lock);
> }
> mutex_unlock(&client->hwctx_lock);
> @@ -134,6 +222,12 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
> if (!drm_dev_enter(dev, &idx))
> return -ENODEV;
>
> + /*
> + * Use hwctx_lock to achieve exclusion with other hwctx writers,
> + * SRCU to synchronize with exec/wait command ioctls.
> + *
> + * The pushed jobs are handled by DRM scheduler during destroy.
> + */
> mutex_lock(&client->hwctx_lock);
> hwctx = idr_find(&client->hwctx_idr, args->handle);
> if (!hwctx) {
> @@ -146,7 +240,7 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
> idr_remove(&client->hwctx_idr, hwctx->id);
> mutex_unlock(&client->hwctx_lock);
>
> - amdxdna_hwctx_destroy(hwctx);
> + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
>
> XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, args->handle);
> out:
> @@ -160,10 +254,10 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> struct amdxdna_drm_config_hwctx *args = data;
> struct amdxdna_dev *xdna = to_xdna_dev(dev);
> struct amdxdna_hwctx *hwctx;
> + int ret, idx;
> u32 buf_size;
> void *buf;
> u64 val;
> - int ret;
>
> if (!xdna->dev_info->ops->hwctx_config)
> return -EOPNOTSUPP;
> @@ -202,17 +296,286 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> }
>
> mutex_lock(&xdna->dev_lock);
> + idx = srcu_read_lock(&client->hwctx_srcu);
> hwctx = idr_find(&client->hwctx_idr, args->handle);
> if (!hwctx) {
> XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle);
> ret = -EINVAL;
> - goto unlock;
> + goto unlock_srcu;
> }
>
> ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size);
>
> -unlock:
> +unlock_srcu:
> + srcu_read_unlock(&client->hwctx_srcu, idx);
> mutex_unlock(&xdna->dev_lock);
> kfree(buf);
> return ret;
> }
> +
> +static void
> +amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
> +{
> + int i;
> +
> + for (i = 0; i < job->bo_cnt; i++) {
> + if (!job->bos[i])
> + break;
> + drm_gem_object_put(job->bos[i]);
> + }
> +}
> +
> +static int
> +amdxdna_arg_bos_lookup(struct amdxdna_client *client,
> + struct amdxdna_sched_job *job,
> + u32 *bo_hdls, u32 bo_cnt)
> +{
> + struct drm_gem_object *gobj;
> + int i, ret;
> +
> + job->bo_cnt = bo_cnt;
> + for (i = 0; i < job->bo_cnt; i++) {
> + struct amdxdna_gem_obj *abo;
> +
> + gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]);
> + if (!gobj) {
> + ret = -ENOENT;
> + goto put_shmem_bo;
> + }
> + abo = to_xdna_obj(gobj);
> +
> + mutex_lock(&abo->lock);
> + if (abo->pinned) {
> + mutex_unlock(&abo->lock);
> + job->bos[i] = gobj;
> + continue;
> + }
> +
> + ret = amdxdna_gem_pin_nolock(abo);
> + if (ret) {
> + mutex_unlock(&abo->lock);
> + drm_gem_object_put(gobj);
> + goto put_shmem_bo;
> + }
> + abo->pinned = true;
> + mutex_unlock(&abo->lock);
> +
> + job->bos[i] = gobj;
> + }
> +
> + return 0;
> +
> +put_shmem_bo:
> + amdxdna_arg_bos_put(job);
> + return ret;
> +}
> +
> +static void amdxdna_sched_job_release(struct kref *ref)
> +{
> + struct amdxdna_sched_job *job;
> +
> + job = container_of(ref, struct amdxdna_sched_job, refcnt);
> +
> + trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release");
> + amdxdna_arg_bos_put(job);
> + amdxdna_gem_put_obj(job->cmd_bo);
> + kfree(job);
> +}
> +
> +void amdxdna_job_put(struct amdxdna_sched_job *job)
> +{
> + kref_put(&job->refcnt, amdxdna_sched_job_release);
> +}
> +
> +int amdxdna_cmd_submit(struct amdxdna_client *client,
> + u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
> + u32 hwctx_hdl, u64 *seq)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_sched_job *job;
> + struct amdxdna_hwctx *hwctx;
> + int ret, idx;
> +
> + XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d", cmd_bo_hdl, arg_bo_cnt);
> + job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL);
> + if (!job)
> + return -ENOMEM;
> +
> + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) {
> + job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD);
> + if (!job->cmd_bo) {
> + XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl);
> + ret = -EINVAL;
> + goto free_job;
> + }
> + } else {
> + job->cmd_bo = NULL;
> + }
> +
> + ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt);
> + if (ret) {
> + XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret);
> + goto cmd_put;
> + }
> +
> + idx = srcu_read_lock(&client->hwctx_srcu);
> + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
> + if (!hwctx) {
> + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
> + client->pid, hwctx_hdl);
> + ret = -EINVAL;
> + goto unlock_srcu;
> + }
> +
> + if (hwctx->status != HWCTX_STAT_READY) {
> + XDNA_ERR(xdna, "HW Context is not ready");
> + ret = -EINVAL;
> + goto unlock_srcu;
> + }
> +
> + job->hwctx = hwctx;
> + job->mm = current->mm;
> +
> + job->fence = amdxdna_fence_create(hwctx);
> + if (!job->fence) {
> + XDNA_ERR(xdna, "Failed to create fence");
> + ret = -ENOMEM;
> + goto unlock_srcu;
> + }
> + kref_init(&job->refcnt);
> +
> + ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq);
> + if (ret)
> + goto put_fence;
> +
> + /*
> + * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated
> + * resource after synchronize_srcu(). The submitted jobs should be
> + * handled by the queue, for example DRM scheduler, in device layer.
> + * For here we can unlock SRCU.
> + */
> + srcu_read_unlock(&client->hwctx_srcu, idx);
> + trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed");
> +
> + return 0;
> +
> +put_fence:
> + dma_fence_put(job->fence);
> +unlock_srcu:
> + srcu_read_unlock(&client->hwctx_srcu, idx);
> + amdxdna_arg_bos_put(job);
> +cmd_put:
> + amdxdna_gem_put_obj(job->cmd_bo);
> +free_job:
> + kfree(job);
> + return ret;
> +}
> +
> +/*
> + * The submit command ioctl submits a command to firmware. One firmware command
> + * may contain multiple command BOs for processing as a whole.
> + * The command sequence number is returned which can be used for wait command ioctl.
> + */
> +static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client,
> + struct amdxdna_drm_exec_cmd *args)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + u32 *arg_bo_hdls;
> + u32 cmd_bo_hdl;
> + int ret;
> +
> + if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) {
> + XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count);
> + return -EINVAL;
> + }
> +
> + /* Only support single command for now. */
> + if (args->cmd_count != 1) {
> + XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count);
> + return -EINVAL;
> + }
> +
> + cmd_bo_hdl = (u32)args->cmd_handles;
> + arg_bo_hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL);
> + if (!arg_bo_hdls)
> + return -ENOMEM;
> + ret = copy_from_user(arg_bo_hdls, u64_to_user_ptr(args->args),
> + args->arg_count * sizeof(u32));
> + if (ret) {
> + ret = -EFAULT;
> + goto free_cmd_bo_hdls;
> + }
> +
> + ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls,
> + args->arg_count, args->hwctx, &args->seq);
> + if (ret)
> + XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret);
> +
> +free_cmd_bo_hdls:
> + kfree(arg_bo_hdls);
> + if (!ret)
> + XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq);
> + return ret;
> +}
> +
> +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> +{
> + struct amdxdna_client *client = filp->driver_priv;
> + struct amdxdna_drm_exec_cmd *args = data;
> +
> + if (args->ext || args->ext_flags)
> + return -EINVAL;
> +
> + switch (args->type) {
> + case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
> + return amdxdna_drm_submit_execbuf(client, args);
> + }
> +
> + XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
> + return -EINVAL;
> +}
> +
> +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
> + u64 seq, u32 timeout)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_hwctx *hwctx;
> + int ret, idx;
> +
> + if (!xdna->dev_info->ops->cmd_wait)
> + return -EOPNOTSUPP;
> +
> + /* For locking concerns, see amdxdna_drm_exec_cmd_ioctl. */
> + idx = srcu_read_lock(&client->hwctx_srcu);
> + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
> + if (!hwctx) {
> + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
> + client->pid, hwctx_hdl);
> + ret = -EINVAL;
> + goto unlock_hwctx_srcu;
> + }
> +
> + ret = xdna->dev_info->ops->cmd_wait(hwctx, seq, timeout);
> +
> +unlock_hwctx_srcu:
> + srcu_read_unlock(&client->hwctx_srcu, idx);
> + return ret;
> +}
> +
> +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> +{
> + struct amdxdna_client *client = filp->driver_priv;
> + struct amdxdna_dev *xdna = to_xdna_dev(dev);
> + struct amdxdna_drm_wait_cmd *args = data;
> + int ret;
> +
> + XDNA_DBG(xdna, "PID %d hwctx %d timeout set %d ms for cmd %lld",
> + client->pid, args->hwctx, args->timeout, args->seq);
> +
> + ret = amdxdna_cmd_wait(client, args->hwctx, args->seq, args->timeout);
> +
> + XDNA_DBG(xdna, "PID %d hwctx %d cmd %lld wait finished, ret %d",
> + client->pid, args->hwctx, args->seq, ret);
> +
> + return ret;
> +}
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index 665b3208897d..65f9c1dfe32c 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -6,6 +6,52 @@
> #ifndef _AMDXDNA_CTX_H_
> #define _AMDXDNA_CTX_H_
>
> +#include "amdxdna_gem.h"
> +
> +struct amdxdna_hwctx_priv;
> +
> +enum ert_cmd_opcode {
> + ERT_START_CU = 0,
> + ERT_CMD_CHAIN = 19,
> + ERT_START_NPU = 20,
> +};
> +
> +enum ert_cmd_state {
> + ERT_CMD_STATE_INVALID,
> + ERT_CMD_STATE_NEW,
> + ERT_CMD_STATE_QUEUED,
> + ERT_CMD_STATE_RUNNING,
> + ERT_CMD_STATE_COMPLETED,
> + ERT_CMD_STATE_ERROR,
> + ERT_CMD_STATE_ABORT,
> + ERT_CMD_STATE_SUBMITTED,
> + ERT_CMD_STATE_TIMEOUT,
> + ERT_CMD_STATE_NORESPONSE,
> +};
> +
> +/*
> + * Interpretation of the beginning of data payload for ERT_START_NPU in
> + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args.
> + */
> +struct amdxdna_cmd_start_npu {
> + u64 buffer; /* instruction buffer address */
> + u32 buffer_size; /* size of buffer in bytes */
> + u32 prop_count; /* properties count */
> + u32 prop_args[]; /* properties and regular kernel arguments */
> +};
> +
> +/*
> + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
> + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
> + */
> +struct amdxdna_cmd_chain {
> + u32 command_count;
> + u32 submit_index;
> + u32 error_index;
> + u32 reserved[3];
> + u64 data[] __counted_by(command_count);
> +};
> +
> /* Exec buffer command header format */
> #define AMDXDNA_CMD_STATE GENMASK(3, 0)
> #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
> @@ -40,9 +86,73 @@ struct amdxdna_hwctx {
> struct amdxdna_hwctx_param_config_cu *cus;
> };
>
> +#define drm_job_to_xdna_job(j) \
> + container_of(j, struct amdxdna_sched_job, base)
> +
> +struct amdxdna_sched_job {
> + struct drm_sched_job base;
> + struct kref refcnt;
> + struct amdxdna_hwctx *hwctx;
> + struct mm_struct *mm;
> + /* The fence to notice DRM scheduler that job is done by hardware */
> + struct dma_fence *fence;
> + /* user can wait on this fence */
> + struct dma_fence *out_fence;
> + u64 seq;
> + struct amdxdna_gem_obj *cmd_bo;
> + size_t bo_cnt;
> + struct drm_gem_object *bos[] __counted_by(bo_cnt);
> +};
> +
> +static inline u32
> +amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_cmd *cmd = abo->mem.kva;
> +
> + return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header);
> +}
> +
> +static inline void
> +amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s)
> +{
> + struct amdxdna_cmd *cmd = abo->mem.kva;
> +
> + cmd->header &= ~AMDXDNA_CMD_STATE;
> + cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s);
> +}
> +
> +static inline enum ert_cmd_state
> +amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_cmd *cmd = abo->mem.kva;
> +
> + return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header);
> +}
> +
> +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
> +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
> +
> +static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx)
> +{
> + return GENMASK(hwctx->start_col + hwctx->num_col - 1,
> + hwctx->start_col);
> +}
> +
> +void amdxdna_job_put(struct amdxdna_sched_job *job);
> +
> void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
> +
> +int amdxdna_cmd_submit(struct amdxdna_client *client,
> + u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
> + u32 hwctx_hdl, u64 *seq);
> +
> +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
> + u64 seq, u32 timeout);
> +
> int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>
> #endif /* _AMDXDNA_CTX_H_ */
> diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
> index 66373baa4600..091674a4bbfa 100644
> --- a/drivers/accel/amdxdna/amdxdna_gem.c
> +++ b/drivers/accel/amdxdna/amdxdna_gem.c
> @@ -8,6 +8,7 @@
> #include <drm/drm_device.h>
> #include <drm/drm_gem.h>
> #include <drm/drm_gem_shmem_helper.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/iosys-map.h>
> #include <linux/vmalloc.h>
>
> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> index 42b615394605..5139a9c96a91 100644
> --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> @@ -3,10 +3,15 @@
> * Copyright (C) 2024, Advanced Micro Devices, Inc.
> */
>
> +#include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> #include <drm/drm_print.h>
> +#include <drm/drm_gem.h>
> +#include <drm/drm_gem_shmem_helper.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/completion.h>
>
> +#include "amdxdna_gem.h"
> #include "amdxdna_mailbox.h"
> #include "amdxdna_mailbox_helper.h"
> #include "amdxdna_pci_drv.h"
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> index 47ea79d4a021..5c1e863825e0 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> @@ -10,6 +10,7 @@
> #include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_ioctl.h>
> #include <drm/drm_managed.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/iommu.h>
> #include <linux/pci.h>
>
> @@ -64,6 +65,7 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
> goto unbind_sva;
> }
> mutex_init(&client->hwctx_lock);
> + init_srcu_struct(&client->hwctx_srcu);
> idr_init_base(&client->hwctx_idr, AMDXDNA_INVALID_CTX_HANDLE + 1);
> mutex_init(&client->mm_lock);
>
> @@ -93,6 +95,7 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp)
> XDNA_DBG(xdna, "closing pid %d", client->pid);
>
> idr_destroy(&client->hwctx_idr);
> + cleanup_srcu_struct(&client->hwctx_srcu);
> mutex_destroy(&client->hwctx_lock);
> mutex_destroy(&client->mm_lock);
> if (client->dev_heap)
> @@ -133,6 +136,9 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
> DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0),
> DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 0),
> DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
> + /* Exectuion */
> + DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
> + DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
> };
>
> static const struct file_operations amdxdna_fops = {
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> index 3dddde4ac12a..0324e73094b2 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> @@ -20,6 +20,7 @@ extern const struct drm_driver amdxdna_drm_drv;
> struct amdxdna_dev;
> struct amdxdna_gem_obj;
> struct amdxdna_hwctx;
> +struct amdxdna_sched_job;
>
> /*
> * struct amdxdna_dev_ops - Device hardware operation callbacks
> @@ -31,6 +32,8 @@ struct amdxdna_dev_ops {
> void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
> int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
> void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> + int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> + int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
> };
>
> /*
> @@ -88,6 +91,8 @@ struct amdxdna_client {
> struct list_head node;
> pid_t pid;
> struct mutex hwctx_lock; /* protect hwctx */
> + /* do NOT wait this srcu when hwctx_lock is hold */
> + struct srcu_struct hwctx_srcu;
> struct idr hwctx_idr;
> struct amdxdna_dev *xdna;
> struct drm_file *filp;
> diff --git a/drivers/accel/amdxdna/amdxdna_sysfs.c b/drivers/accel/amdxdna/amdxdna_sysfs.c
> index 668b94b92714..f27e4ee960a0 100644
> --- a/drivers/accel/amdxdna/amdxdna_sysfs.c
> +++ b/drivers/accel/amdxdna/amdxdna_sysfs.c
> @@ -3,9 +3,14 @@
> * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
> */
>
> +#include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/drm_gem_shmem_helper.h>
> #include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> +#include <linux/types.h>
>
> +#include "amdxdna_gem.h"
> #include "amdxdna_pci_drv.h"
>
> static ssize_t vbnv_show(struct device *dev, struct device_attribute *attr, char *buf)
> diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
> index 720aab0ed7c4..f00c50461b09 100644
> --- a/drivers/accel/amdxdna/npu1_regs.c
> +++ b/drivers/accel/amdxdna/npu1_regs.c
> @@ -5,6 +5,7 @@
>
> #include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/sizes.h>
>
> #include "aie2_pci.h"
> diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c
> index f3ea18bcf294..00cb381031d2 100644
> --- a/drivers/accel/amdxdna/npu2_regs.c
> +++ b/drivers/accel/amdxdna/npu2_regs.c
> @@ -5,6 +5,7 @@
>
> #include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/sizes.h>
>
> #include "aie2_pci.h"
> diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
> index db61142f0d4e..b6dae9667cca 100644
> --- a/drivers/accel/amdxdna/npu4_regs.c
> +++ b/drivers/accel/amdxdna/npu4_regs.c
> @@ -5,6 +5,7 @@
>
> #include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/sizes.h>
>
> #include "aie2_pci.h"
> diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
> index debf4e95b9bb..bed1baf8e160 100644
> --- a/drivers/accel/amdxdna/npu5_regs.c
> +++ b/drivers/accel/amdxdna/npu5_regs.c
> @@ -5,6 +5,7 @@
>
> #include <drm/amdxdna_accel.h>
> #include <drm/drm_device.h>
> +#include <drm/gpu_scheduler.h>
> #include <linux/sizes.h>
>
> #include "aie2_pci.h"
> diff --git a/include/trace/events/amdxdna.h b/include/trace/events/amdxdna.h
> index 33343d8f0622..c6cb2da7b706 100644
> --- a/include/trace/events/amdxdna.h
> +++ b/include/trace/events/amdxdna.h
> @@ -9,8 +9,49 @@
> #if !defined(_TRACE_AMDXDNA_H) || defined(TRACE_HEADER_MULTI_READ)
> #define _TRACE_AMDXDNA_H
>
> +#include <drm/gpu_scheduler.h>
> #include <linux/tracepoint.h>
>
> +TRACE_EVENT(amdxdna_debug_point,
> + TP_PROTO(const char *name, u64 number, const char *str),
> +
> + TP_ARGS(name, number, str),
> +
> + TP_STRUCT__entry(__string(name, name)
> + __field(u64, number)
> + __string(str, str)),
> +
> + TP_fast_assign(__assign_str(name);
> + __entry->number = number;
> + __assign_str(str);),
> +
> + TP_printk("%s:%llu %s", __get_str(name), __entry->number,
> + __get_str(str))
> +);
> +
> +TRACE_EVENT(xdna_job,
> + TP_PROTO(struct drm_sched_job *sched_job, const char *name, const char *str, u64 seq),
> +
> + TP_ARGS(sched_job, name, str, seq),
> +
> + TP_STRUCT__entry(__string(name, name)
> + __string(str, str)
> + __field(u64, fence_context)
> + __field(u64, fence_seqno)
> + __field(u64, seq)),
> +
> + TP_fast_assign(__assign_str(name);
> + __assign_str(str);
> + __entry->fence_context = sched_job->s_fence->finished.context;
> + __entry->fence_seqno = sched_job->s_fence->finished.seqno;
> + __entry->seq = seq;),
> +
> + TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld %s",
> + __entry->fence_context, __entry->fence_seqno,
> + __get_str(name), __entry->seq,
> + __get_str(str))
> +);
> +
> DECLARE_EVENT_CLASS(xdna_mbox_msg,
> TP_PROTO(char *name, u8 chann_id, u32 opcode, u32 msg_id),
>
> diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
> index 3792750834b2..08f3ec7146ab 100644
> --- a/include/uapi/drm/amdxdna_accel.h
> +++ b/include/uapi/drm/amdxdna_accel.h
> @@ -13,6 +13,7 @@
> extern "C" {
> #endif
>
> +#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
> #define AMDXDNA_INVALID_ADDR (~0UL)
> #define AMDXDNA_INVALID_CTX_HANDLE 0
> #define AMDXDNA_INVALID_BO_HANDLE 0
> @@ -29,6 +30,8 @@ enum amdxdna_drm_ioctl_id {
> DRM_AMDXDNA_CREATE_BO,
> DRM_AMDXDNA_GET_BO_INFO,
> DRM_AMDXDNA_SYNC_BO,
> + DRM_AMDXDNA_EXEC_CMD,
> + DRM_AMDXDNA_WAIT_CMD,
> };
>
> /**
> @@ -201,6 +204,54 @@ struct amdxdna_drm_sync_bo {
> __u64 size;
> };
>
> +enum amdxdna_cmd_type {
> + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
> + AMDXDNA_CMD_SUBMIT_DEPENDENCY,
> + AMDXDNA_CMD_SUBMIT_SIGNAL,
> +};
> +
> +/**
> + * struct amdxdna_drm_exec_cmd - Execute command.
> + * @ext: MBZ.
> + * @ext_flags: MBZ.
> + * @hwctx: Hardware context handle.
> + * @type: One of command type in enum amdxdna_cmd_type.
> + * @cmd_handles: Array of command handles or the command handle itself
> + * in case of just one.
> + * @args: Array of arguments for all command handles.
> + * @cmd_count: Number of command handles in the cmd_handles array.
> + * @arg_count: Number of arguments in the args array.
> + * @seq: Returned sequence number for this command.
> + */
> +struct amdxdna_drm_exec_cmd {
> + __u64 ext;
> + __u64 ext_flags;
> + __u32 hwctx;
> + __u32 type;
> + __u64 cmd_handles;
> + __u64 args;
> + __u32 cmd_count;
> + __u32 arg_count;
> + __u64 seq;
> +};
> +
> +/**
> + * struct amdxdna_drm_wait_cmd - Wait exectuion command.
> + *
> + * @hwctx: hardware context handle.
> + * @timeout: timeout in ms, 0 implies infinite wait.
> + * @seq: sequence number of the command returned by execute command.
> + *
> + * Wait a command specified by seq to be completed.
> + * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
> + * to submit a new command.
> + */
> +struct amdxdna_drm_wait_cmd {
> + __u32 hwctx;
> + __u32 timeout;
> + __u64 seq;
> +};
> +
> #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
> struct amdxdna_drm_create_hwctx)
> @@ -225,6 +276,14 @@ struct amdxdna_drm_sync_bo {
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \
> struct amdxdna_drm_sync_bo)
>
> +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
> + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \
> + struct amdxdna_drm_exec_cmd)
> +
> +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
> + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
> + struct amdxdna_drm_wait_cmd)
> +
> #if defined(__cplusplus)
> } /* extern c end */
> #endif
> --
> 2.34.1
>
On 10/14/24 19:13, Matthew Brost wrote:
> On Fri, Oct 11, 2024 at 04:12:41PM -0700, Lizhi Hou wrote:
>> Add interfaces for user application to submit command and wait for its
>> completion.
>>
>> Co-developed-by: Min Ma <min.ma@amd.com>
>> Signed-off-by: Min Ma <min.ma@amd.com>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>> drivers/accel/amdxdna/aie2_ctx.c | 624 +++++++++++++++++-
>> drivers/accel/amdxdna/aie2_message.c | 343 ++++++++++
>> drivers/accel/amdxdna/aie2_pci.c | 6 +
>> drivers/accel/amdxdna/aie2_pci.h | 35 +
>> drivers/accel/amdxdna/aie2_psp.c | 2 +
>> drivers/accel/amdxdna/aie2_smu.c | 2 +
>> drivers/accel/amdxdna/amdxdna_ctx.c | 375 ++++++++++-
>> drivers/accel/amdxdna/amdxdna_ctx.h | 110 +++
>> drivers/accel/amdxdna/amdxdna_gem.c | 1 +
>> .../accel/amdxdna/amdxdna_mailbox_helper.c | 5 +
>> drivers/accel/amdxdna/amdxdna_pci_drv.c | 6 +
>> drivers/accel/amdxdna/amdxdna_pci_drv.h | 5 +
>> drivers/accel/amdxdna/amdxdna_sysfs.c | 5 +
>> drivers/accel/amdxdna/npu1_regs.c | 1 +
>> drivers/accel/amdxdna/npu2_regs.c | 1 +
>> drivers/accel/amdxdna/npu4_regs.c | 1 +
>> drivers/accel/amdxdna/npu5_regs.c | 1 +
>> include/trace/events/amdxdna.h | 41 ++
>> include/uapi/drm/amdxdna_accel.h | 59 ++
>> 19 files changed, 1614 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
>> index 617fc05077d9..f9010a902c99 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -8,8 +8,11 @@
>> #include <drm/drm_gem.h>
>> #include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_print.h>
>> +#include <linux/hmm.h>
>> #include <linux/types.h>
>> +#include <trace/events/amdxdna.h>
>>
>> +#include "aie2_msg_priv.h"
>> #include "aie2_pci.h"
>> #include "aie2_solver.h"
>> #include "amdxdna_ctx.h"
>> @@ -17,6 +20,361 @@
>> #include "amdxdna_mailbox.h"
>> #include "amdxdna_pci_drv.h"
>>
>> +bool force_cmdlist;
>> +module_param(force_cmdlist, bool, 0600);
>> +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
>> +
>> +#define HWCTX_MAX_TIMEOUT 60000 /* miliseconds */
>> +
>> +static int
>> +aie2_hwctx_add_job(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job)
>> +{
>> + struct amdxdna_sched_job *other;
>> + int idx;
>> +
>> + idx = get_job_idx(hwctx->priv->seq);
>> + /* When pending list full, hwctx->seq points to oldest fence */
>> + other = hwctx->priv->pending[idx];
>> + if (other && other->fence)
>> + return -EAGAIN;
>> +
>> + if (other) {
>> + dma_fence_put(other->out_fence);
>> + amdxdna_job_put(other);
>> + }
>> +
>> + hwctx->priv->pending[idx] = job;
>> + job->seq = hwctx->priv->seq++;
>> + kref_get(&job->refcnt);
>> +
>> + return 0;
>> +}
>> +
>> +static struct amdxdna_sched_job *
>> +aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
>> +{
>> + int idx;
>> +
>> + /* Special sequence number for oldest fence if exist */
>> + if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
>> + idx = get_job_idx(hwctx->priv->seq);
>> + goto out;
>> + }
>> +
>> + if (seq >= hwctx->priv->seq)
>> + return ERR_PTR(-EINVAL);
>> +
>> + if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
>> + return NULL;
>> +
>> + idx = get_job_idx(seq);
>> +
>> +out:
>> + return hwctx->priv->pending[idx];
>> +}
>> +
>> +/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
>> +static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
>> + struct drm_sched_job *bad_job)
>> +{
>> + drm_sched_stop(&hwctx->priv->sched, bad_job);
>> + aie2_destroy_context(xdna->dev_handle, hwctx);
>> +}
>> +
>> +static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
>> +{
>> + struct amdxdna_gem_obj *heap = hwctx->priv->heap;
>> + int ret;
>> +
>> + ret = aie2_create_context(xdna->dev_handle, hwctx);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
>> + goto out;
>> + }
>> +
>> + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
>> + heap->mem.userptr, heap->mem.size);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
>> + goto out;
>> + }
>> +
>> + if (hwctx->status != HWCTX_STAT_READY) {
>> + XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
>> + goto out;
>> + }
>> +
>> + ret = aie2_config_cu(hwctx);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
>> + goto out;
>> + }
>> +
>> +out:
>> + drm_sched_start(&hwctx->priv->sched);
>> + XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
>> + return ret;
>> +}
>> +
>> +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
>> +{
>> + struct amdxdna_dev *xdna = client->xdna;
>> + struct amdxdna_hwctx *hwctx;
>> + int next = 0;
>> +
>> + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>> + mutex_lock(&client->hwctx_lock);
>> + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
>> + /* check if the HW context uses the error column */
>> + if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
>> + continue;
>> +
>> + aie2_hwctx_stop(xdna, hwctx, NULL);
>> + hwctx->old_status = hwctx->status;
>> + hwctx->status = HWCTX_STAT_STOP;
>> + XDNA_DBG(xdna, "Stop %s", hwctx->name);
>> + }
>> + mutex_unlock(&client->hwctx_lock);
>> +}
>> +
>> +void aie2_restart_ctx(struct amdxdna_client *client)
>> +{
>> + struct amdxdna_dev *xdna = client->xdna;
>> + struct amdxdna_hwctx *hwctx;
>> + int next = 0;
>> +
>> + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>> + mutex_lock(&client->hwctx_lock);
>> + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
>> + if (hwctx->status != HWCTX_STAT_STOP)
>> + continue;
>> +
>> + hwctx->status = hwctx->old_status;
>> + XDNA_DBG(xdna, "Resetting %s", hwctx->name);
>> + aie2_hwctx_restart(xdna, hwctx);
>> + }
>> + mutex_unlock(&client->hwctx_lock);
>> +}
>> +
>> +static int aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx)
>> +{
>> + struct amdxdna_sched_job *job;
>> +
>> + mutex_lock(&hwctx->priv->io_lock);
>> + if (!hwctx->priv->seq) {
>> + mutex_unlock(&hwctx->priv->io_lock);
>> + return 0;
>> + }
>> +
>> + job = aie2_hwctx_get_job(hwctx, hwctx->priv->seq - 1);
>> + if (IS_ERR_OR_NULL(job)) {
>> + mutex_unlock(&hwctx->priv->io_lock);
>> + XDNA_WARN(hwctx->client->xdna, "Corrupted pending list");
>> + return 0;
>> + }
>> + mutex_unlock(&hwctx->priv->io_lock);
>> +
>> + wait_event(hwctx->priv->job_free_wq, !job->fence);
>> +
>> + return 0;
>> +}
>> +
>> +static void
>> +aie2_sched_notify(struct amdxdna_sched_job *job)
>> +{
>> + struct dma_fence *fence = job->fence;
>> +
>> + job->hwctx->priv->completed++;
>> + dma_fence_signal(fence);
>> + trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
>> + dma_fence_put(fence);
>> + mmput(job->mm);
>> + amdxdna_job_put(job);
>> +}
>> +
>> +static int
>> +aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
>> +{
>> + struct amdxdna_sched_job *job = handle;
>> + struct amdxdna_gem_obj *cmd_abo;
>> + u32 ret = 0;
>> + u32 status;
>> +
>> + cmd_abo = job->cmd_bo;
>> +
>> + if (unlikely(!data))
>> + goto out;
>> +
>> + if (unlikely(size != sizeof(u32))) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + status = *data;
>> + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
>> + if (status == AIE2_STATUS_SUCCESS)
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
>> + else
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
>> +
>> +out:
>> + aie2_sched_notify(job);
>> + return ret;
>> +}
>> +
>> +static int
>> +aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
>> +{
>> + struct amdxdna_sched_job *job = handle;
>> + u32 ret = 0;
>> + u32 status;
>> +
>> + if (unlikely(!data))
>> + goto out;
>> +
>> + if (unlikely(size != sizeof(u32))) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + status = *data;
>> + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
>> +
>> +out:
>> + aie2_sched_notify(job);
>> + return ret;
>> +}
>> +
>> +static int
>> +aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size)
>> +{
>> + struct amdxdna_sched_job *job = handle;
>> + struct amdxdna_gem_obj *cmd_abo;
>> + struct cmd_chain_resp *resp;
>> + struct amdxdna_dev *xdna;
>> + u32 fail_cmd_status;
>> + u32 fail_cmd_idx;
>> + u32 ret = 0;
>> +
>> + cmd_abo = job->cmd_bo;
>> + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + resp = (struct cmd_chain_resp *)data;
>> + xdna = job->hwctx->client->xdna;
>> + XDNA_DBG(xdna, "Status 0x%x", resp->status);
>> + if (resp->status == AIE2_STATUS_SUCCESS) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
>> + goto out;
>> + }
>> +
>> + /* Slow path to handle error, read from ringbuf on BAR */
>> + fail_cmd_idx = resp->fail_cmd_idx;
>> + fail_cmd_status = resp->fail_cmd_status;
>> + XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x",
>> + fail_cmd_idx, fail_cmd_status);
>> +
>> + if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> + amdxdna_cmd_set_state(cmd_abo, fail_cmd_status);
>> +
>> + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
>> + struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
>> +
>> + cc->error_index = fail_cmd_idx;
>> + if (cc->error_index >= cc->command_count)
>> + cc->error_index = 0;
>> + }
>> +out:
>> + aie2_sched_notify(job);
>> + return ret;
>> +}
>> +
>> +static struct dma_fence *
>> +aie2_sched_job_run(struct drm_sched_job *sched_job)
>> +{
>> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> + struct amdxdna_hwctx *hwctx = job->hwctx;
>> + struct dma_fence *fence;
>> + int ret;
>> +
>> + if (!mmget_not_zero(job->mm))
>> + return ERR_PTR(-ESRCH);
>> +
>> + kref_get(&job->refcnt);
>> + fence = dma_fence_get(job->fence);
>> +
>> + if (unlikely(!cmd_abo)) {
>> + ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler);
>> + goto out;
>> + }
>> +
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW);
>> +
>> + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN)
>> + ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
>> + else if (force_cmdlist)
>> + ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
>> + else
>> + ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler);
>> +
>> +out:
>> + if (ret) {
>> + dma_fence_put(job->fence);
>> + amdxdna_job_put(job);
>> + mmput(job->mm);
>> + fence = ERR_PTR(ret);
>> + }
>> + trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
>> +
>> + return fence;
>> +}
>> +
>> +static void aie2_sched_job_free(struct drm_sched_job *sched_job)
>> +{
>> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>> + struct amdxdna_hwctx *hwctx = job->hwctx;
>> +
>> + trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
>> + drm_sched_job_cleanup(sched_job);
>> + job->fence = NULL;
>> + amdxdna_job_put(job);
>> +
>> + wake_up(&hwctx->priv->job_free_wq);
>> +}
>> +
>> +static enum drm_gpu_sched_stat
>> +aie2_sched_job_timedout(struct drm_sched_job *sched_job)
>> +{
>> + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>> + struct amdxdna_hwctx *hwctx = job->hwctx;
>> + struct amdxdna_dev *xdna;
>> +
>> + xdna = hwctx->client->xdna;
>> + trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>> + mutex_lock(&xdna->dev_lock);
>> + aie2_hwctx_stop(xdna, hwctx, sched_job);
>> +
>> + aie2_hwctx_restart(xdna, hwctx);
>> + mutex_unlock(&xdna->dev_lock);
>> +
>> + return DRM_GPU_SCHED_STAT_NOMINAL;
>> +}
>> +
>> +const struct drm_sched_backend_ops sched_ops = {
>> + .run_job = aie2_sched_job_run,
>> + .free_job = aie2_sched_job_free,
>> + .timedout_job = aie2_sched_job_timedout,
>> +};
>> +
>> static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx)
>> {
>> struct amdxdna_dev *xdna = hwctx->client->xdna;
>> @@ -130,9 +488,10 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>> {
>> struct amdxdna_client *client = hwctx->client;
>> struct amdxdna_dev *xdna = client->xdna;
>> + struct drm_gpu_scheduler *sched;
>> struct amdxdna_hwctx_priv *priv;
>> struct amdxdna_gem_obj *heap;
>> - int ret;
>> + int i, ret;
>>
>> priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
>> if (!priv)
>> @@ -157,10 +516,48 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>> goto put_heap;
>> }
>>
>> + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
>> + struct amdxdna_gem_obj *abo;
>> + struct amdxdna_drm_create_bo args = {
>> + .flags = 0,
>> + .type = AMDXDNA_BO_DEV,
>> + .vaddr = 0,
>> + .size = MAX_CHAIN_CMDBUF_SIZE,
>> + };
>> +
>> + abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, client->filp, true);
>> + if (IS_ERR(abo)) {
>> + ret = PTR_ERR(abo);
>> + goto free_cmd_bufs;
>> + }
>> +
>> + XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx",
>> + i, abo->mem.dev_addr, abo->mem.size);
>> + priv->cmd_buf[i] = abo;
>> + }
>> +
>> + sched = &priv->sched;
>> + mutex_init(&priv->io_lock);
>> + ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
>> + HWCTX_MAX_CMDS, 0, msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
>> + NULL, NULL, hwctx->name, xdna->ddev.dev);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
>> + goto free_cmd_bufs;
>> + }
>> +
>> + ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
>> + &sched, 1, NULL);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Failed to initial sched entiry. ret %d", ret);
>> + goto free_sched;
>> + }
>> + init_waitqueue_head(&priv->job_free_wq);
>> +
>> ret = aie2_hwctx_col_list(hwctx);
>> if (ret) {
>> XDNA_ERR(xdna, "Create col list failed, ret %d", ret);
>> - goto unpin;
>> + goto free_entity;
>> }
>>
>> ret = aie2_alloc_resource(hwctx);
>> @@ -185,7 +582,16 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>> aie2_release_resource(hwctx);
>> free_col_list:
>> kfree(hwctx->col_list);
>> -unpin:
>> +free_entity:
>> + drm_sched_entity_destroy(&priv->entity);
>> +free_sched:
>> + drm_sched_fini(&priv->sched);
>> +free_cmd_bufs:
>> + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
>> + if (!priv->cmd_buf[i])
>> + continue;
>> + drm_gem_object_put(to_gobj(priv->cmd_buf[i]));
>> + }
>> amdxdna_gem_unpin(heap);
>> put_heap:
>> drm_gem_object_put(to_gobj(heap));
>> @@ -196,11 +602,43 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
>>
>> void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
>> {
>> + struct amdxdna_sched_job *job;
>> + struct amdxdna_dev *xdna;
>> + int idx;
>> +
>> + xdna = hwctx->client->xdna;
>> + drm_sched_wqueue_stop(&hwctx->priv->sched);
>> +
>> + /* Now, scheduler will not send command to device. */
>> aie2_release_resource(hwctx);
>>
>> + /*
>> + * All submitted commands are aborted.
>> + * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run()
>> + * will return NODEV if it is called.
>> + */
>> + drm_sched_wqueue_start(&hwctx->priv->sched);
>> +
>> + aie2_hwctx_wait_for_idle(hwctx);
>> + drm_sched_entity_destroy(&hwctx->priv->entity);
>> + drm_sched_fini(&hwctx->priv->sched);
>> +
>> + for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
>> + job = hwctx->priv->pending[idx];
>> + if (!job)
>> + continue;
>> +
>> + dma_fence_put(job->out_fence);
>> + amdxdna_job_put(job);
>> + }
>> + XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq);
>> +
>> + for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++)
>> + drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx]));
>> amdxdna_gem_unpin(hwctx->priv->heap);
>> drm_gem_object_put(to_gobj(hwctx->priv->heap));
>>
>> + mutex_destroy(&hwctx->priv->io_lock);
>> kfree(hwctx->col_list);
>> kfree(hwctx->priv);
>> kfree(hwctx->cus);
>> @@ -267,3 +705,183 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu
>> return -EOPNOTSUPP;
>> }
>> }
>> +
>> +static int aie2_populate_range(struct amdxdna_gem_obj *abo)
>> +{
>> + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
>> + struct mm_struct *mm = abo->mem.notifier.mm;
>> + struct hmm_range range = { 0 };
>> + unsigned long timeout;
>> + int ret;
>> +
>> + XDNA_INFO_ONCE(xdna, "populate memory range %llx size %lx",
>> + abo->mem.userptr, abo->mem.size);
>> + range.notifier = &abo->mem.notifier;
>> + range.start = abo->mem.userptr;
>> + range.end = abo->mem.userptr + abo->mem.size;
>> + range.hmm_pfns = abo->mem.pfns;
>> + range.default_flags = HMM_PFN_REQ_FAULT;
>> +
>> + if (!mmget_not_zero(mm))
>> + return -EFAULT;
>> +
>> + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
>> +again:
>> + range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier);
>> + mmap_read_lock(mm);
>> + ret = hmm_range_fault(&range);
>> + mmap_read_unlock(mm);
>> + if (ret) {
>> + if (time_after(jiffies, timeout)) {
>> + ret = -ETIME;
>> + goto put_mm;
>> + }
>> +
>> + if (ret == -EBUSY)
>> + goto again;
>> +
>> + goto put_mm;
>> + }
>> +
>> + dma_resv_lock(to_gobj(abo)->resv, NULL);
>> + if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) {
>> + dma_resv_unlock(to_gobj(abo)->resv);
>> + goto again;
>> + }
>> + abo->mem.map_invalid = false;
>> + dma_resv_unlock(to_gobj(abo)->resv);
>> +
>> +put_mm:
>> + mmput(mm);
>> + return ret;
>> +}
>> +
>> +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
>> +{
>> + struct amdxdna_dev *xdna = hwctx->client->xdna;
>> + struct ww_acquire_ctx acquire_ctx;
>> + struct amdxdna_gem_obj *abo;
>> + unsigned long timeout = 0;
>> + int ret, i;
>> +
>> + ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx);
>> + if (ret) {
>> + XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
>> + return ret;
>> + }
>> +
>> + drm_sched_job_arm(&job->base);
>> + job->out_fence = dma_fence_get(&job->base.s_fence->finished);
>> +
>> +retry:
>> + ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
>> + if (ret) {
>> + XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret);
>> + goto put_fence;
>> + }
>> +
>> + for (i = 0; i < job->bo_cnt; i++) {
>> + abo = to_xdna_obj(job->bos[i]);
>> + if (abo->mem.map_invalid) {
>> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
>> + if (!timeout) {
>> + timeout = jiffies +
>> + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
>> + } else if (time_after(jiffies, timeout)) {
>> + ret = -ETIME;
>> + goto put_fence;
>> + }
>> +
>> + ret = aie2_populate_range(abo);
>> + if (ret)
>> + goto put_fence;
>> + goto retry;
>> + }
>> +
>> + ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
>> + if (ret) {
>> + XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
>> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
>> + goto put_fence;
>> + }
>> + }
>> +
>> + for (i = 0; i < job->bo_cnt; i++)
>> + dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
>> + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
>> +
>> + mutex_lock(&hwctx->priv->io_lock);
>> + ret = aie2_hwctx_add_job(hwctx, job);
>> + if (ret) {
>> + mutex_unlock(&hwctx->priv->io_lock);
>> + goto signal_fence;
>> + }
>> +
>> + *seq = job->seq;
>> + drm_sched_entity_push_job(&job->base);
>> + mutex_unlock(&hwctx->priv->io_lock);
>> +
>> + return 0;
>> +
>> +signal_fence:
>> + dma_fence_signal(job->out_fence);
>> +put_fence:
>> + dma_fence_put(job->out_fence);
>> + drm_sched_job_cleanup(&job->base);
>> + return ret;
>> +}
>> +
>> +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
>> +{
>> + signed long remaining = MAX_SCHEDULE_TIMEOUT;
>> + struct amdxdna_sched_job *job;
>> + struct dma_fence *out_fence;
>> + long ret;
>> +
>> + mutex_lock(&hwctx->priv->io_lock);
>> + job = aie2_hwctx_get_job(hwctx, seq);
>> + if (IS_ERR(job)) {
>> + mutex_unlock(&hwctx->priv->io_lock);
>> + ret = PTR_ERR(job);
>> + goto out;
>> + }
>> +
>> + if (unlikely(!job)) {
>> + mutex_unlock(&hwctx->priv->io_lock);
>> + ret = 0;
>> + goto out;
>> + }
>> + out_fence = dma_fence_get(job->out_fence);
>> + mutex_unlock(&hwctx->priv->io_lock);
>> +
>> + if (timeout)
>> + remaining = msecs_to_jiffies(timeout);
>> +
>> + ret = dma_fence_wait_timeout(out_fence, true, remaining);
>> + if (!ret)
>> + ret = -ETIME;
>> + else if (ret > 0)
>> + ret = 0;
>> +
>> + dma_fence_put(out_fence);
>> +out:
>> + return ret;
>> +}
>> +
>> +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
>> + unsigned long cur_seq)
>> +{
>> + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
>> + struct drm_gem_object *gobj = to_gobj(abo);
>> + long ret;
>> +
>> + dma_resv_lock(gobj->resv, NULL);
> Was randomly looking as I was interested - drive by comment...
>
> I think you have a locking inversion here.
>
> MMU notifiers are in the path of reclaim (Can't allocate memory),
> dma-resv lock can allocate memory while being held, thus you can
> deadlock.
Thanks for point this out, I will use a mutex lock instead.
Lizhi
>
> Lockdep should blow up here. See [1] [2].
>
> [1] https://elixir.bootlin.com/linux/v6.11.3/source/mm/page_alloc.c#L3833
> [2] https://elixir.bootlin.com/linux/v6.11.3/source/drivers/dma-buf/dma-resv.c#L773
>
> Matt
>
>> + abo->mem.map_invalid = true;
>> + mmu_interval_set_seq(&abo->mem.notifier, cur_seq);
>> + ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP,
>> + true, MAX_SCHEDULE_TIMEOUT);
>> + dma_resv_unlock(gobj->resv);
>> +
>> + if (!ret || ret == -ERESTARTSYS)
>> + XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret);
>> +}
>> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
>> index 28bd0560db61..3dc4a9a8571e 100644
>> --- a/drivers/accel/amdxdna/aie2_message.c
>> +++ b/drivers/accel/amdxdna/aie2_message.c
>> @@ -4,10 +4,12 @@
>> */
>>
>> #include <drm/amdxdna_accel.h>
>> +#include <drm/drm_cache.h>
>> #include <drm/drm_device.h>
>> #include <drm/drm_gem.h>
>> #include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/errno.h>
>> #include <linux/pci.h>
>> #include <linux/types.h>
>> @@ -361,3 +363,344 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx)
>> msg.opcode, resp.status, ret);
>> return ret;
>> }
>> +
>> +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t))
>> +{
>> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
>> + struct amdxdna_dev *xdna = hwctx->client->xdna;
>> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> + union {
>> + struct execute_buffer_req ebuf;
>> + struct exec_dpu_req dpu;
>> + } req;
>> + struct xdna_mailbox_msg msg;
>> + u32 payload_len;
>> + void *payload;
>> + int cu_idx;
>> + int ret;
>> + u32 op;
>> +
>> + if (!chann)
>> + return -ENODEV;
>> +
>> + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
>> + if (!payload) {
>> + XDNA_ERR(xdna, "Invalid command, cannot get payload");
>> + return -EINVAL;
>> + }
>> +
>> + cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo);
>> + if (cu_idx < 0) {
>> + XDNA_DBG(xdna, "Invalid cu idx");
>> + return -EINVAL;
>> + }
>> +
>> + op = amdxdna_cmd_get_op(cmd_abo);
>> + switch (op) {
>> + case ERT_START_CU:
>> + if (unlikely(payload_len > sizeof(req.ebuf.payload)))
>> + XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len);
>> + req.ebuf.cu_idx = cu_idx;
>> + memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload));
>> + msg.send_size = sizeof(req.ebuf);
>> + msg.opcode = MSG_OP_EXECUTE_BUFFER_CF;
>> + break;
>> + case ERT_START_NPU: {
>> + struct amdxdna_cmd_start_npu *sn = payload;
>> +
>> + if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload)))
>> + XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len);
>> + req.dpu.inst_buf_addr = sn->buffer;
>> + req.dpu.inst_size = sn->buffer_size;
>> + req.dpu.inst_prop_cnt = sn->prop_count;
>> + req.dpu.cu_idx = cu_idx;
>> + memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload));
>> + msg.send_size = sizeof(req.dpu);
>> + msg.opcode = MSG_OP_EXEC_DPU;
>> + break;
>> + }
>> + default:
>> + XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op);
>> + return -EINVAL;
>> + }
>> + msg.handle = job;
>> + msg.notify_cb = notify_cb;
>> + msg.send_data = (u8 *)&req;
>> + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req,
>> + 0x40, false);
>> +
>> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Send message failed");
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int
>> +aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset,
>> + struct amdxdna_gem_obj *abo, u32 *size)
>> +{
>> + struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset;
>> + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
>> + u32 payload_len;
>> + void *payload;
>> +
>> + if (cu_idx < 0)
>> + return -EINVAL;
>> +
>> + payload = amdxdna_cmd_get_payload(abo, &payload_len);
>> + if (!payload)
>> + return -EINVAL;
>> +
>> + if (!slot_cf_has_space(offset, payload_len))
>> + return -ENOSPC;
>> +
>> + buf->cu_idx = cu_idx;
>> + buf->arg_cnt = payload_len / sizeof(u32);
>> + memcpy(buf->args, payload, payload_len);
>> + /* Accurate buf size to hint firmware to do necessary copy */
>> + *size = sizeof(*buf) + payload_len;
>> + return 0;
>> +}
>> +
>> +static int
>> +aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset,
>> + struct amdxdna_gem_obj *abo, u32 *size)
>> +{
>> + struct cmd_chain_slot_dpu *buf = cmd_buf + offset;
>> + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
>> + struct amdxdna_cmd_start_npu *sn;
>> + u32 payload_len;
>> + void *payload;
>> + u32 arg_sz;
>> +
>> + if (cu_idx < 0)
>> + return -EINVAL;
>> +
>> + payload = amdxdna_cmd_get_payload(abo, &payload_len);
>> + if (!payload)
>> + return -EINVAL;
>> + sn = payload;
>> + arg_sz = payload_len - sizeof(*sn);
>> + if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE)
>> + return -EINVAL;
>> +
>> + if (!slot_dpu_has_space(offset, arg_sz))
>> + return -ENOSPC;
>> +
>> + buf->inst_buf_addr = sn->buffer;
>> + buf->inst_size = sn->buffer_size;
>> + buf->inst_prop_cnt = sn->prop_count;
>> + buf->cu_idx = cu_idx;
>> + buf->arg_cnt = arg_sz / sizeof(u32);
>> + memcpy(buf->args, sn->prop_args, arg_sz);
>> +
>> + /* Accurate buf size to hint firmware to do necessary copy */
>> + *size += sizeof(*buf) + arg_sz;
>> + return 0;
>> +}
>> +
>> +static int
>> +aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset,
>> + struct amdxdna_gem_obj *abo, u32 *size)
>> +{
>> + u32 this_op = amdxdna_cmd_get_op(abo);
>> + void *cmd_buf = cmdbuf_abo->mem.kva;
>> + int ret;
>> +
>> + if (this_op != op) {
>> + ret = -EINVAL;
>> + goto done;
>> + }
>> +
>> + switch (op) {
>> + case ERT_START_CU:
>> + ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size);
>> + break;
>> + case ERT_START_NPU:
>> + ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size);
>> + break;
>> + default:
>> + ret = -EOPNOTSUPP;
>> + }
>> +
>> +done:
>> + if (ret) {
>> + XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d",
>> + op, ret);
>> + }
>> + return ret;
>> +}
>> +
>> +static inline struct amdxdna_gem_obj *
>> +aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
>> +{
>> + int idx = get_job_idx(job->seq);
>> +
>> + return job->hwctx->priv->cmd_buf[idx];
>> +}
>> +
>> +static void
>> +aie2_cmdlist_prepare_request(struct cmd_chain_req *req,
>> + struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt)
>> +{
>> + req->buf_addr = cmdbuf_abo->mem.dev_addr;
>> + req->buf_size = size;
>> + req->count = cnt;
>> + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size);
>> + XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d",
>> + req->buf_addr, size, cnt);
>> +}
>> +
>> +static inline u32
>> +aie2_cmd_op_to_msg_op(u32 op)
>> +{
>> + switch (op) {
>> + case ERT_START_CU:
>> + return MSG_OP_CHAIN_EXEC_BUFFER_CF;
>> + case ERT_START_NPU:
>> + return MSG_OP_CHAIN_EXEC_DPU;
>> + default:
>> + return MSG_OP_MAX_OPCODE;
>> + }
>> +}
>> +
>> +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
>> + struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t))
>> +{
>> + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
>> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
>> + struct amdxdna_client *client = hwctx->client;
>> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> + struct amdxdna_cmd_chain *payload;
>> + struct xdna_mailbox_msg msg;
>> + struct cmd_chain_req req;
>> + u32 payload_len;
>> + u32 offset = 0;
>> + u32 size;
>> + int ret;
>> + u32 op;
>> + u32 i;
>> +
>> + op = amdxdna_cmd_get_op(cmd_abo);
>> + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
>> + if (op != ERT_CMD_CHAIN || !payload ||
>> + payload_len < struct_size(payload, data, payload->command_count))
>> + return -EINVAL;
>> +
>> + for (i = 0; i < payload->command_count; i++) {
>> + u32 boh = (u32)(payload->data[i]);
>> + struct amdxdna_gem_obj *abo;
>> +
>> + abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD);
>> + if (!abo) {
>> + XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh);
>> + return -ENOENT;
>> + }
>> +
>> + /* All sub-cmd should have same op, use the first one. */
>> + if (i == 0)
>> + op = amdxdna_cmd_get_op(abo);
>> +
>> + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size);
>> + amdxdna_gem_put_obj(abo);
>> + if (ret)
>> + return -EINVAL;
>> +
>> + offset += size;
>> + }
>> +
>> + /* The offset is the accumulated total size of the cmd buffer */
>> + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count);
>> +
>> + msg.opcode = aie2_cmd_op_to_msg_op(op);
>> + if (msg.opcode == MSG_OP_MAX_OPCODE)
>> + return -EOPNOTSUPP;
>> + msg.handle = job;
>> + msg.notify_cb = notify_cb;
>> + msg.send_data = (u8 *)&req;
>> + msg.send_size = sizeof(req);
>> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>> + if (ret) {
>> + XDNA_ERR(hwctx->client->xdna, "Send message failed");
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
>> + struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t))
>> +{
>> + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
>> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
>> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> + struct xdna_mailbox_msg msg;
>> + struct cmd_chain_req req;
>> + u32 size;
>> + int ret;
>> + u32 op;
>> +
>> + op = amdxdna_cmd_get_op(cmd_abo);
>> + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size);
>> + if (ret)
>> + return ret;
>> +
>> + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1);
>> +
>> + msg.opcode = aie2_cmd_op_to_msg_op(op);
>> + if (msg.opcode == MSG_OP_MAX_OPCODE)
>> + return -EOPNOTSUPP;
>> + msg.handle = job;
>> + msg.notify_cb = notify_cb;
>> + msg.send_data = (u8 *)&req;
>> + msg.send_size = sizeof(req);
>> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>> + if (ret) {
>> + XDNA_ERR(hwctx->client->xdna, "Send message failed");
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t))
>> +{
>> + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
>> + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]);
>> + struct amdxdna_dev *xdna = hwctx->client->xdna;
>> + struct xdna_mailbox_msg msg;
>> + struct sync_bo_req req;
>> + int ret = 0;
>> +
>> + req.src_addr = 0;
>> + req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr;
>> + req.size = abo->mem.size;
>> +
>> + /* Device to Host */
>> + req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) |
>> + FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM);
>> +
>> + XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed",
>> + req.size, req.src_addr, req.dst_addr);
>> +
>> + msg.handle = job;
>> + msg.notify_cb = notify_cb;
>> + msg.send_data = (u8 *)&req;
>> + msg.send_size = sizeof(req);
>> + msg.opcode = MSG_OP_SYNC_BO;
>> +
>> + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Send message failed");
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
>> index ee9f114bc229..6017826a7104 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.c
>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>> @@ -5,8 +5,10 @@
>>
>> #include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_managed.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/errno.h>
>> #include <linux/firmware.h>
>> #include <linux/iommu.h>
>> @@ -17,6 +19,7 @@
>> #include "aie2_pci.h"
>> #include "aie2_solver.h"
>> #include "amdxdna_ctx.h"
>> +#include "amdxdna_gem.h"
>> #include "amdxdna_mailbox.h"
>> #include "amdxdna_pci_drv.h"
>>
>> @@ -499,4 +502,7 @@ const struct amdxdna_dev_ops aie2_ops = {
>> .hwctx_init = aie2_hwctx_init,
>> .hwctx_fini = aie2_hwctx_fini,
>> .hwctx_config = aie2_hwctx_config,
>> + .cmd_submit = aie2_cmd_submit,
>> + .cmd_wait = aie2_cmd_wait,
>> + .hmm_invalidate = aie2_hmm_invalidate,
>> };
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
>> index 3ac936e2c9d1..81877d9c0542 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -76,6 +76,7 @@ enum psp_reg_idx {
>> PSP_MAX_REGS /* Keep this at the end */
>> };
>>
>> +struct amdxdna_client;
>> struct amdxdna_fw_ver;
>> struct amdxdna_hwctx;
>>
>> @@ -118,9 +119,28 @@ struct rt_config {
>> u32 value;
>> };
>>
>> +/*
>> + * Define the maximum number of pending commands in a hardware context.
>> + * Must be power of 2!
>> + */
>> +#define HWCTX_MAX_CMDS 4
>> +#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1))
>> struct amdxdna_hwctx_priv {
>> struct amdxdna_gem_obj *heap;
>> void *mbox_chann;
>> +
>> + struct drm_gpu_scheduler sched;
>> + struct drm_sched_entity entity;
>> +
>> + struct mutex io_lock; /* protect seq and cmd order */
>> + struct wait_queue_head job_free_wq;
>> + struct amdxdna_sched_job *pending[HWCTX_MAX_CMDS];
>> + u32 num_pending;
>> + u64 seq;
>> + /* Completed job counter */
>> + u64 completed;
>> +
>> + struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS];
>> };
>>
>> struct amdxdna_dev_hdl {
>> @@ -199,10 +219,25 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
>> int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
>> int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
>> int aie2_config_cu(struct amdxdna_hwctx *hwctx);
>> +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t));
>> +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
>> + struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t));
>> +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
>> + struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t));
>> +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
>> + int (*notify_cb)(void *, const u32 *, size_t));
>>
>> /* aie2_hwctx.c */
>> int aie2_hwctx_init(struct amdxdna_hwctx *hwctx);
>> void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx);
>> int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
>> +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
>> +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
>> +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
>> +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
>> +void aie2_restart_ctx(struct amdxdna_client *client);
>>
>> #endif /* _AIE2_PCI_H_ */
>> diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c
>> index 2efcfd1941bf..35ba55e2ab1e 100644
>> --- a/drivers/accel/amdxdna/aie2_psp.c
>> +++ b/drivers/accel/amdxdna/aie2_psp.c
>> @@ -4,8 +4,10 @@
>> */
>>
>> #include <drm/drm_device.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_managed.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/iopoll.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c
>> index 3fa7064649aa..91893d438da7 100644
>> --- a/drivers/accel/amdxdna/aie2_smu.c
>> +++ b/drivers/accel/amdxdna/aie2_smu.c
>> @@ -4,7 +4,9 @@
>> */
>>
>> #include <drm/drm_device.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/iopoll.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
>> index 8acf8bfe0db9..b76640e1fdd0 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
>> @@ -7,17 +7,65 @@
>> #include <drm/drm_device.h>
>> #include <drm/drm_drv.h>
>> #include <drm/drm_file.h>
>> +#include <drm/drm_gem.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> +#include <trace/events/amdxdna.h>
>>
>> #include "amdxdna_ctx.h"
>> +#include "amdxdna_gem.h"
>> #include "amdxdna_pci_drv.h"
>>
>> #define MAX_HWCTX_ID 255
>> +#define MAX_ARG_COUNT 4095
>>
>> -static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
>> +struct amdxdna_fence {
>> + struct dma_fence base;
>> + spinlock_t lock; /* for base */
>> + struct amdxdna_hwctx *hwctx;
>> +};
>> +
>> +static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence)
>> +{
>> + return KBUILD_MODNAME;
>> +}
>> +
>> +static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence)
>> +{
>> + struct amdxdna_fence *xdna_fence;
>> +
>> + xdna_fence = container_of(fence, struct amdxdna_fence, base);
>> +
>> + return xdna_fence->hwctx->name;
>> +}
>> +
>> +static const struct dma_fence_ops fence_ops = {
>> + .get_driver_name = amdxdna_fence_get_driver_name,
>> + .get_timeline_name = amdxdna_fence_get_timeline_name,
>> +};
>> +
>> +static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
>> +{
>> + struct amdxdna_fence *fence;
>> +
>> + fence = kzalloc(sizeof(*fence), GFP_KERNEL);
>> + if (!fence)
>> + return NULL;
>> +
>> + fence->hwctx = hwctx;
>> + spin_lock_init(&fence->lock);
>> + dma_fence_init(&fence->base, &fence_ops, &fence->lock, hwctx->id, 0);
>> + return &fence->base;
>> +}
>> +
>> +static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
>> + struct srcu_struct *ss)
>> {
>> struct amdxdna_dev *xdna = hwctx->client->xdna;
>>
>> + synchronize_srcu(ss);
>> +
>> /* At this point, user is not able to submit new commands */
>> mutex_lock(&xdna->dev_lock);
>> xdna->dev_info->ops->hwctx_fini(hwctx);
>> @@ -27,6 +75,46 @@ static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
>> kfree(hwctx);
>> }
>>
>> +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size)
>> +{
>> + struct amdxdna_cmd *cmd = abo->mem.kva;
>> + u32 num_masks, count;
>> +
>> + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
>> + num_masks = 0;
>> + else
>> + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
>> +
>> + if (size) {
>> + count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header);
>> + if (unlikely(count <= num_masks)) {
>> + *size = 0;
>> + return NULL;
>> + }
>> + *size = (count - num_masks) * sizeof(u32);
>> + }
>> + return &cmd->data[num_masks];
>> +}
>> +
>> +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
>> +{
>> + struct amdxdna_cmd *cmd = abo->mem.kva;
>> + u32 num_masks, i;
>> + u32 *cu_mask;
>> +
>> + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
>> + return -1;
>> +
>> + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
>> + cu_mask = cmd->data;
>> + for (i = 0; i < num_masks; i++) {
>> + if (cu_mask[i])
>> + return ffs(cu_mask[i]) - 1;
>> + }
>> +
>> + return -1;
>> +}
>> +
>> /*
>> * This should be called in close() and remove(). DO NOT call in other syscalls.
>> * This guarantee that when hwctx and resources will be released, if user
>> @@ -43,7 +131,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client)
>> client->pid, hwctx->id);
>> idr_remove(&client->hwctx_idr, hwctx->id);
>> mutex_unlock(&client->hwctx_lock);
>> - amdxdna_hwctx_destroy(hwctx);
>> + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
>> mutex_lock(&client->hwctx_lock);
>> }
>> mutex_unlock(&client->hwctx_lock);
>> @@ -134,6 +222,12 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
>> if (!drm_dev_enter(dev, &idx))
>> return -ENODEV;
>>
>> + /*
>> + * Use hwctx_lock to achieve exclusion with other hwctx writers,
>> + * SRCU to synchronize with exec/wait command ioctls.
>> + *
>> + * The pushed jobs are handled by DRM scheduler during destroy.
>> + */
>> mutex_lock(&client->hwctx_lock);
>> hwctx = idr_find(&client->hwctx_idr, args->handle);
>> if (!hwctx) {
>> @@ -146,7 +240,7 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
>> idr_remove(&client->hwctx_idr, hwctx->id);
>> mutex_unlock(&client->hwctx_lock);
>>
>> - amdxdna_hwctx_destroy(hwctx);
>> + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
>>
>> XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, args->handle);
>> out:
>> @@ -160,10 +254,10 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
>> struct amdxdna_drm_config_hwctx *args = data;
>> struct amdxdna_dev *xdna = to_xdna_dev(dev);
>> struct amdxdna_hwctx *hwctx;
>> + int ret, idx;
>> u32 buf_size;
>> void *buf;
>> u64 val;
>> - int ret;
>>
>> if (!xdna->dev_info->ops->hwctx_config)
>> return -EOPNOTSUPP;
>> @@ -202,17 +296,286 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
>> }
>>
>> mutex_lock(&xdna->dev_lock);
>> + idx = srcu_read_lock(&client->hwctx_srcu);
>> hwctx = idr_find(&client->hwctx_idr, args->handle);
>> if (!hwctx) {
>> XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle);
>> ret = -EINVAL;
>> - goto unlock;
>> + goto unlock_srcu;
>> }
>>
>> ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size);
>>
>> -unlock:
>> +unlock_srcu:
>> + srcu_read_unlock(&client->hwctx_srcu, idx);
>> mutex_unlock(&xdna->dev_lock);
>> kfree(buf);
>> return ret;
>> }
>> +
>> +static void
>> +amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < job->bo_cnt; i++) {
>> + if (!job->bos[i])
>> + break;
>> + drm_gem_object_put(job->bos[i]);
>> + }
>> +}
>> +
>> +static int
>> +amdxdna_arg_bos_lookup(struct amdxdna_client *client,
>> + struct amdxdna_sched_job *job,
>> + u32 *bo_hdls, u32 bo_cnt)
>> +{
>> + struct drm_gem_object *gobj;
>> + int i, ret;
>> +
>> + job->bo_cnt = bo_cnt;
>> + for (i = 0; i < job->bo_cnt; i++) {
>> + struct amdxdna_gem_obj *abo;
>> +
>> + gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]);
>> + if (!gobj) {
>> + ret = -ENOENT;
>> + goto put_shmem_bo;
>> + }
>> + abo = to_xdna_obj(gobj);
>> +
>> + mutex_lock(&abo->lock);
>> + if (abo->pinned) {
>> + mutex_unlock(&abo->lock);
>> + job->bos[i] = gobj;
>> + continue;
>> + }
>> +
>> + ret = amdxdna_gem_pin_nolock(abo);
>> + if (ret) {
>> + mutex_unlock(&abo->lock);
>> + drm_gem_object_put(gobj);
>> + goto put_shmem_bo;
>> + }
>> + abo->pinned = true;
>> + mutex_unlock(&abo->lock);
>> +
>> + job->bos[i] = gobj;
>> + }
>> +
>> + return 0;
>> +
>> +put_shmem_bo:
>> + amdxdna_arg_bos_put(job);
>> + return ret;
>> +}
>> +
>> +static void amdxdna_sched_job_release(struct kref *ref)
>> +{
>> + struct amdxdna_sched_job *job;
>> +
>> + job = container_of(ref, struct amdxdna_sched_job, refcnt);
>> +
>> + trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release");
>> + amdxdna_arg_bos_put(job);
>> + amdxdna_gem_put_obj(job->cmd_bo);
>> + kfree(job);
>> +}
>> +
>> +void amdxdna_job_put(struct amdxdna_sched_job *job)
>> +{
>> + kref_put(&job->refcnt, amdxdna_sched_job_release);
>> +}
>> +
>> +int amdxdna_cmd_submit(struct amdxdna_client *client,
>> + u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
>> + u32 hwctx_hdl, u64 *seq)
>> +{
>> + struct amdxdna_dev *xdna = client->xdna;
>> + struct amdxdna_sched_job *job;
>> + struct amdxdna_hwctx *hwctx;
>> + int ret, idx;
>> +
>> + XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d", cmd_bo_hdl, arg_bo_cnt);
>> + job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL);
>> + if (!job)
>> + return -ENOMEM;
>> +
>> + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) {
>> + job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD);
>> + if (!job->cmd_bo) {
>> + XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl);
>> + ret = -EINVAL;
>> + goto free_job;
>> + }
>> + } else {
>> + job->cmd_bo = NULL;
>> + }
>> +
>> + ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret);
>> + goto cmd_put;
>> + }
>> +
>> + idx = srcu_read_lock(&client->hwctx_srcu);
>> + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
>> + if (!hwctx) {
>> + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
>> + client->pid, hwctx_hdl);
>> + ret = -EINVAL;
>> + goto unlock_srcu;
>> + }
>> +
>> + if (hwctx->status != HWCTX_STAT_READY) {
>> + XDNA_ERR(xdna, "HW Context is not ready");
>> + ret = -EINVAL;
>> + goto unlock_srcu;
>> + }
>> +
>> + job->hwctx = hwctx;
>> + job->mm = current->mm;
>> +
>> + job->fence = amdxdna_fence_create(hwctx);
>> + if (!job->fence) {
>> + XDNA_ERR(xdna, "Failed to create fence");
>> + ret = -ENOMEM;
>> + goto unlock_srcu;
>> + }
>> + kref_init(&job->refcnt);
>> +
>> + ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq);
>> + if (ret)
>> + goto put_fence;
>> +
>> + /*
>> + * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated
>> + * resource after synchronize_srcu(). The submitted jobs should be
>> + * handled by the queue, for example DRM scheduler, in device layer.
>> + * For here we can unlock SRCU.
>> + */
>> + srcu_read_unlock(&client->hwctx_srcu, idx);
>> + trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed");
>> +
>> + return 0;
>> +
>> +put_fence:
>> + dma_fence_put(job->fence);
>> +unlock_srcu:
>> + srcu_read_unlock(&client->hwctx_srcu, idx);
>> + amdxdna_arg_bos_put(job);
>> +cmd_put:
>> + amdxdna_gem_put_obj(job->cmd_bo);
>> +free_job:
>> + kfree(job);
>> + return ret;
>> +}
>> +
>> +/*
>> + * The submit command ioctl submits a command to firmware. One firmware command
>> + * may contain multiple command BOs for processing as a whole.
>> + * The command sequence number is returned which can be used for wait command ioctl.
>> + */
>> +static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client,
>> + struct amdxdna_drm_exec_cmd *args)
>> +{
>> + struct amdxdna_dev *xdna = client->xdna;
>> + u32 *arg_bo_hdls;
>> + u32 cmd_bo_hdl;
>> + int ret;
>> +
>> + if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) {
>> + XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count);
>> + return -EINVAL;
>> + }
>> +
>> + /* Only support single command for now. */
>> + if (args->cmd_count != 1) {
>> + XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count);
>> + return -EINVAL;
>> + }
>> +
>> + cmd_bo_hdl = (u32)args->cmd_handles;
>> + arg_bo_hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL);
>> + if (!arg_bo_hdls)
>> + return -ENOMEM;
>> + ret = copy_from_user(arg_bo_hdls, u64_to_user_ptr(args->args),
>> + args->arg_count * sizeof(u32));
>> + if (ret) {
>> + ret = -EFAULT;
>> + goto free_cmd_bo_hdls;
>> + }
>> +
>> + ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls,
>> + args->arg_count, args->hwctx, &args->seq);
>> + if (ret)
>> + XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret);
>> +
>> +free_cmd_bo_hdls:
>> + kfree(arg_bo_hdls);
>> + if (!ret)
>> + XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq);
>> + return ret;
>> +}
>> +
>> +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>> +{
>> + struct amdxdna_client *client = filp->driver_priv;
>> + struct amdxdna_drm_exec_cmd *args = data;
>> +
>> + if (args->ext || args->ext_flags)
>> + return -EINVAL;
>> +
>> + switch (args->type) {
>> + case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
>> + return amdxdna_drm_submit_execbuf(client, args);
>> + }
>> +
>> + XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
>> + return -EINVAL;
>> +}
>> +
>> +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
>> + u64 seq, u32 timeout)
>> +{
>> + struct amdxdna_dev *xdna = client->xdna;
>> + struct amdxdna_hwctx *hwctx;
>> + int ret, idx;
>> +
>> + if (!xdna->dev_info->ops->cmd_wait)
>> + return -EOPNOTSUPP;
>> +
>> + /* For locking concerns, see amdxdna_drm_exec_cmd_ioctl. */
>> + idx = srcu_read_lock(&client->hwctx_srcu);
>> + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
>> + if (!hwctx) {
>> + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
>> + client->pid, hwctx_hdl);
>> + ret = -EINVAL;
>> + goto unlock_hwctx_srcu;
>> + }
>> +
>> + ret = xdna->dev_info->ops->cmd_wait(hwctx, seq, timeout);
>> +
>> +unlock_hwctx_srcu:
>> + srcu_read_unlock(&client->hwctx_srcu, idx);
>> + return ret;
>> +}
>> +
>> +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>> +{
>> + struct amdxdna_client *client = filp->driver_priv;
>> + struct amdxdna_dev *xdna = to_xdna_dev(dev);
>> + struct amdxdna_drm_wait_cmd *args = data;
>> + int ret;
>> +
>> + XDNA_DBG(xdna, "PID %d hwctx %d timeout set %d ms for cmd %lld",
>> + client->pid, args->hwctx, args->timeout, args->seq);
>> +
>> + ret = amdxdna_cmd_wait(client, args->hwctx, args->seq, args->timeout);
>> +
>> + XDNA_DBG(xdna, "PID %d hwctx %d cmd %lld wait finished, ret %d",
>> + client->pid, args->hwctx, args->seq, ret);
>> +
>> + return ret;
>> +}
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
>> index 665b3208897d..65f9c1dfe32c 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>> @@ -6,6 +6,52 @@
>> #ifndef _AMDXDNA_CTX_H_
>> #define _AMDXDNA_CTX_H_
>>
>> +#include "amdxdna_gem.h"
>> +
>> +struct amdxdna_hwctx_priv;
>> +
>> +enum ert_cmd_opcode {
>> + ERT_START_CU = 0,
>> + ERT_CMD_CHAIN = 19,
>> + ERT_START_NPU = 20,
>> +};
>> +
>> +enum ert_cmd_state {
>> + ERT_CMD_STATE_INVALID,
>> + ERT_CMD_STATE_NEW,
>> + ERT_CMD_STATE_QUEUED,
>> + ERT_CMD_STATE_RUNNING,
>> + ERT_CMD_STATE_COMPLETED,
>> + ERT_CMD_STATE_ERROR,
>> + ERT_CMD_STATE_ABORT,
>> + ERT_CMD_STATE_SUBMITTED,
>> + ERT_CMD_STATE_TIMEOUT,
>> + ERT_CMD_STATE_NORESPONSE,
>> +};
>> +
>> +/*
>> + * Interpretation of the beginning of data payload for ERT_START_NPU in
>> + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args.
>> + */
>> +struct amdxdna_cmd_start_npu {
>> + u64 buffer; /* instruction buffer address */
>> + u32 buffer_size; /* size of buffer in bytes */
>> + u32 prop_count; /* properties count */
>> + u32 prop_args[]; /* properties and regular kernel arguments */
>> +};
>> +
>> +/*
>> + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
>> + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
>> + */
>> +struct amdxdna_cmd_chain {
>> + u32 command_count;
>> + u32 submit_index;
>> + u32 error_index;
>> + u32 reserved[3];
>> + u64 data[] __counted_by(command_count);
>> +};
>> +
>> /* Exec buffer command header format */
>> #define AMDXDNA_CMD_STATE GENMASK(3, 0)
>> #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
>> @@ -40,9 +86,73 @@ struct amdxdna_hwctx {
>> struct amdxdna_hwctx_param_config_cu *cus;
>> };
>>
>> +#define drm_job_to_xdna_job(j) \
>> + container_of(j, struct amdxdna_sched_job, base)
>> +
>> +struct amdxdna_sched_job {
>> + struct drm_sched_job base;
>> + struct kref refcnt;
>> + struct amdxdna_hwctx *hwctx;
>> + struct mm_struct *mm;
>> + /* The fence to notice DRM scheduler that job is done by hardware */
>> + struct dma_fence *fence;
>> + /* user can wait on this fence */
>> + struct dma_fence *out_fence;
>> + u64 seq;
>> + struct amdxdna_gem_obj *cmd_bo;
>> + size_t bo_cnt;
>> + struct drm_gem_object *bos[] __counted_by(bo_cnt);
>> +};
>> +
>> +static inline u32
>> +amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
>> +{
>> + struct amdxdna_cmd *cmd = abo->mem.kva;
>> +
>> + return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header);
>> +}
>> +
>> +static inline void
>> +amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s)
>> +{
>> + struct amdxdna_cmd *cmd = abo->mem.kva;
>> +
>> + cmd->header &= ~AMDXDNA_CMD_STATE;
>> + cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s);
>> +}
>> +
>> +static inline enum ert_cmd_state
>> +amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
>> +{
>> + struct amdxdna_cmd *cmd = abo->mem.kva;
>> +
>> + return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header);
>> +}
>> +
>> +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
>> +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
>> +
>> +static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx)
>> +{
>> + return GENMASK(hwctx->start_col + hwctx->num_col - 1,
>> + hwctx->start_col);
>> +}
>> +
>> +void amdxdna_job_put(struct amdxdna_sched_job *job);
>> +
>> void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
>> +
>> +int amdxdna_cmd_submit(struct amdxdna_client *client,
>> + u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
>> + u32 hwctx_hdl, u64 *seq);
>> +
>> +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
>> + u64 seq, u32 timeout);
>> +
>> int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>> int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>> int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>> +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>> +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
>>
>> #endif /* _AMDXDNA_CTX_H_ */
>> diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
>> index 66373baa4600..091674a4bbfa 100644
>> --- a/drivers/accel/amdxdna/amdxdna_gem.c
>> +++ b/drivers/accel/amdxdna/amdxdna_gem.c
>> @@ -8,6 +8,7 @@
>> #include <drm/drm_device.h>
>> #include <drm/drm_gem.h>
>> #include <drm/drm_gem_shmem_helper.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/iosys-map.h>
>> #include <linux/vmalloc.h>
>>
>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
>> index 42b615394605..5139a9c96a91 100644
>> --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
>> @@ -3,10 +3,15 @@
>> * Copyright (C) 2024, Advanced Micro Devices, Inc.
>> */
>>
>> +#include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> #include <drm/drm_print.h>
>> +#include <drm/drm_gem.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/completion.h>
>>
>> +#include "amdxdna_gem.h"
>> #include "amdxdna_mailbox.h"
>> #include "amdxdna_mailbox_helper.h"
>> #include "amdxdna_pci_drv.h"
>> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> index 47ea79d4a021..5c1e863825e0 100644
>> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> @@ -10,6 +10,7 @@
>> #include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_ioctl.h>
>> #include <drm/drm_managed.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/iommu.h>
>> #include <linux/pci.h>
>>
>> @@ -64,6 +65,7 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
>> goto unbind_sva;
>> }
>> mutex_init(&client->hwctx_lock);
>> + init_srcu_struct(&client->hwctx_srcu);
>> idr_init_base(&client->hwctx_idr, AMDXDNA_INVALID_CTX_HANDLE + 1);
>> mutex_init(&client->mm_lock);
>>
>> @@ -93,6 +95,7 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp)
>> XDNA_DBG(xdna, "closing pid %d", client->pid);
>>
>> idr_destroy(&client->hwctx_idr);
>> + cleanup_srcu_struct(&client->hwctx_srcu);
>> mutex_destroy(&client->hwctx_lock);
>> mutex_destroy(&client->mm_lock);
>> if (client->dev_heap)
>> @@ -133,6 +136,9 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
>> DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0),
>> DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 0),
>> DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
>> + /* Exectuion */
>> + DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
>> + DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
>> };
>>
>> static const struct file_operations amdxdna_fops = {
>> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
>> index 3dddde4ac12a..0324e73094b2 100644
>> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
>> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
>> @@ -20,6 +20,7 @@ extern const struct drm_driver amdxdna_drm_drv;
>> struct amdxdna_dev;
>> struct amdxdna_gem_obj;
>> struct amdxdna_hwctx;
>> +struct amdxdna_sched_job;
>>
>> /*
>> * struct amdxdna_dev_ops - Device hardware operation callbacks
>> @@ -31,6 +32,8 @@ struct amdxdna_dev_ops {
>> void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
>> int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
>> void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
>> + int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
>> + int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
>> };
>>
>> /*
>> @@ -88,6 +91,8 @@ struct amdxdna_client {
>> struct list_head node;
>> pid_t pid;
>> struct mutex hwctx_lock; /* protect hwctx */
>> + /* do NOT wait this srcu when hwctx_lock is hold */
>> + struct srcu_struct hwctx_srcu;
>> struct idr hwctx_idr;
>> struct amdxdna_dev *xdna;
>> struct drm_file *filp;
>> diff --git a/drivers/accel/amdxdna/amdxdna_sysfs.c b/drivers/accel/amdxdna/amdxdna_sysfs.c
>> index 668b94b92714..f27e4ee960a0 100644
>> --- a/drivers/accel/amdxdna/amdxdna_sysfs.c
>> +++ b/drivers/accel/amdxdna/amdxdna_sysfs.c
>> @@ -3,9 +3,14 @@
>> * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
>> */
>>
>> +#include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> #include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> +#include <linux/types.h>
>>
>> +#include "amdxdna_gem.h"
>> #include "amdxdna_pci_drv.h"
>>
>> static ssize_t vbnv_show(struct device *dev, struct device_attribute *attr, char *buf)
>> diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
>> index 720aab0ed7c4..f00c50461b09 100644
>> --- a/drivers/accel/amdxdna/npu1_regs.c
>> +++ b/drivers/accel/amdxdna/npu1_regs.c
>> @@ -5,6 +5,7 @@
>>
>> #include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/sizes.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c
>> index f3ea18bcf294..00cb381031d2 100644
>> --- a/drivers/accel/amdxdna/npu2_regs.c
>> +++ b/drivers/accel/amdxdna/npu2_regs.c
>> @@ -5,6 +5,7 @@
>>
>> #include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/sizes.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
>> index db61142f0d4e..b6dae9667cca 100644
>> --- a/drivers/accel/amdxdna/npu4_regs.c
>> +++ b/drivers/accel/amdxdna/npu4_regs.c
>> @@ -5,6 +5,7 @@
>>
>> #include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/sizes.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
>> index debf4e95b9bb..bed1baf8e160 100644
>> --- a/drivers/accel/amdxdna/npu5_regs.c
>> +++ b/drivers/accel/amdxdna/npu5_regs.c
>> @@ -5,6 +5,7 @@
>>
>> #include <drm/amdxdna_accel.h>
>> #include <drm/drm_device.h>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/sizes.h>
>>
>> #include "aie2_pci.h"
>> diff --git a/include/trace/events/amdxdna.h b/include/trace/events/amdxdna.h
>> index 33343d8f0622..c6cb2da7b706 100644
>> --- a/include/trace/events/amdxdna.h
>> +++ b/include/trace/events/amdxdna.h
>> @@ -9,8 +9,49 @@
>> #if !defined(_TRACE_AMDXDNA_H) || defined(TRACE_HEADER_MULTI_READ)
>> #define _TRACE_AMDXDNA_H
>>
>> +#include <drm/gpu_scheduler.h>
>> #include <linux/tracepoint.h>
>>
>> +TRACE_EVENT(amdxdna_debug_point,
>> + TP_PROTO(const char *name, u64 number, const char *str),
>> +
>> + TP_ARGS(name, number, str),
>> +
>> + TP_STRUCT__entry(__string(name, name)
>> + __field(u64, number)
>> + __string(str, str)),
>> +
>> + TP_fast_assign(__assign_str(name);
>> + __entry->number = number;
>> + __assign_str(str);),
>> +
>> + TP_printk("%s:%llu %s", __get_str(name), __entry->number,
>> + __get_str(str))
>> +);
>> +
>> +TRACE_EVENT(xdna_job,
>> + TP_PROTO(struct drm_sched_job *sched_job, const char *name, const char *str, u64 seq),
>> +
>> + TP_ARGS(sched_job, name, str, seq),
>> +
>> + TP_STRUCT__entry(__string(name, name)
>> + __string(str, str)
>> + __field(u64, fence_context)
>> + __field(u64, fence_seqno)
>> + __field(u64, seq)),
>> +
>> + TP_fast_assign(__assign_str(name);
>> + __assign_str(str);
>> + __entry->fence_context = sched_job->s_fence->finished.context;
>> + __entry->fence_seqno = sched_job->s_fence->finished.seqno;
>> + __entry->seq = seq;),
>> +
>> + TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld %s",
>> + __entry->fence_context, __entry->fence_seqno,
>> + __get_str(name), __entry->seq,
>> + __get_str(str))
>> +);
>> +
>> DECLARE_EVENT_CLASS(xdna_mbox_msg,
>> TP_PROTO(char *name, u8 chann_id, u32 opcode, u32 msg_id),
>>
>> diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
>> index 3792750834b2..08f3ec7146ab 100644
>> --- a/include/uapi/drm/amdxdna_accel.h
>> +++ b/include/uapi/drm/amdxdna_accel.h
>> @@ -13,6 +13,7 @@
>> extern "C" {
>> #endif
>>
>> +#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
>> #define AMDXDNA_INVALID_ADDR (~0UL)
>> #define AMDXDNA_INVALID_CTX_HANDLE 0
>> #define AMDXDNA_INVALID_BO_HANDLE 0
>> @@ -29,6 +30,8 @@ enum amdxdna_drm_ioctl_id {
>> DRM_AMDXDNA_CREATE_BO,
>> DRM_AMDXDNA_GET_BO_INFO,
>> DRM_AMDXDNA_SYNC_BO,
>> + DRM_AMDXDNA_EXEC_CMD,
>> + DRM_AMDXDNA_WAIT_CMD,
>> };
>>
>> /**
>> @@ -201,6 +204,54 @@ struct amdxdna_drm_sync_bo {
>> __u64 size;
>> };
>>
>> +enum amdxdna_cmd_type {
>> + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
>> + AMDXDNA_CMD_SUBMIT_DEPENDENCY,
>> + AMDXDNA_CMD_SUBMIT_SIGNAL,
>> +};
>> +
>> +/**
>> + * struct amdxdna_drm_exec_cmd - Execute command.
>> + * @ext: MBZ.
>> + * @ext_flags: MBZ.
>> + * @hwctx: Hardware context handle.
>> + * @type: One of command type in enum amdxdna_cmd_type.
>> + * @cmd_handles: Array of command handles or the command handle itself
>> + * in case of just one.
>> + * @args: Array of arguments for all command handles.
>> + * @cmd_count: Number of command handles in the cmd_handles array.
>> + * @arg_count: Number of arguments in the args array.
>> + * @seq: Returned sequence number for this command.
>> + */
>> +struct amdxdna_drm_exec_cmd {
>> + __u64 ext;
>> + __u64 ext_flags;
>> + __u32 hwctx;
>> + __u32 type;
>> + __u64 cmd_handles;
>> + __u64 args;
>> + __u32 cmd_count;
>> + __u32 arg_count;
>> + __u64 seq;
>> +};
>> +
>> +/**
>> + * struct amdxdna_drm_wait_cmd - Wait exectuion command.
>> + *
>> + * @hwctx: hardware context handle.
>> + * @timeout: timeout in ms, 0 implies infinite wait.
>> + * @seq: sequence number of the command returned by execute command.
>> + *
>> + * Wait a command specified by seq to be completed.
>> + * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
>> + * to submit a new command.
>> + */
>> +struct amdxdna_drm_wait_cmd {
>> + __u32 hwctx;
>> + __u32 timeout;
>> + __u64 seq;
>> +};
>> +
>> #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
>> struct amdxdna_drm_create_hwctx)
>> @@ -225,6 +276,14 @@ struct amdxdna_drm_sync_bo {
>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \
>> struct amdxdna_drm_sync_bo)
>>
>> +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
>> + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \
>> + struct amdxdna_drm_exec_cmd)
>> +
>> +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
>> + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
>> + struct amdxdna_drm_wait_cmd)
>> +
>> #if defined(__cplusplus)
>> } /* extern c end */
>> #endif
>> --
>> 2.34.1
>>
On Wed, Oct 16, 2024 at 08:53:05PM -0700, Lizhi Hou wrote:
>
> On 10/14/24 19:13, Matthew Brost wrote:
> > On Fri, Oct 11, 2024 at 04:12:41PM -0700, Lizhi Hou wrote:
> > > Add interfaces for user application to submit command and wait for its
> > > completion.
> > >
> > > Co-developed-by: Min Ma <min.ma@amd.com>
> > > Signed-off-by: Min Ma <min.ma@amd.com>
> > > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > > ---
> > > drivers/accel/amdxdna/aie2_ctx.c | 624 +++++++++++++++++-
> > > drivers/accel/amdxdna/aie2_message.c | 343 ++++++++++
> > > drivers/accel/amdxdna/aie2_pci.c | 6 +
> > > drivers/accel/amdxdna/aie2_pci.h | 35 +
> > > drivers/accel/amdxdna/aie2_psp.c | 2 +
> > > drivers/accel/amdxdna/aie2_smu.c | 2 +
> > > drivers/accel/amdxdna/amdxdna_ctx.c | 375 ++++++++++-
> > > drivers/accel/amdxdna/amdxdna_ctx.h | 110 +++
> > > drivers/accel/amdxdna/amdxdna_gem.c | 1 +
> > > .../accel/amdxdna/amdxdna_mailbox_helper.c | 5 +
> > > drivers/accel/amdxdna/amdxdna_pci_drv.c | 6 +
> > > drivers/accel/amdxdna/amdxdna_pci_drv.h | 5 +
> > > drivers/accel/amdxdna/amdxdna_sysfs.c | 5 +
> > > drivers/accel/amdxdna/npu1_regs.c | 1 +
> > > drivers/accel/amdxdna/npu2_regs.c | 1 +
> > > drivers/accel/amdxdna/npu4_regs.c | 1 +
> > > drivers/accel/amdxdna/npu5_regs.c | 1 +
> > > include/trace/events/amdxdna.h | 41 ++
> > > include/uapi/drm/amdxdna_accel.h | 59 ++
> > > 19 files changed, 1614 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> > > index 617fc05077d9..f9010a902c99 100644
> > > --- a/drivers/accel/amdxdna/aie2_ctx.c
> > > +++ b/drivers/accel/amdxdna/aie2_ctx.c
> > > @@ -8,8 +8,11 @@
> > > #include <drm/drm_gem.h>
> > > #include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_print.h>
> > > +#include <linux/hmm.h>
> > > #include <linux/types.h>
> > > +#include <trace/events/amdxdna.h>
> > > +#include "aie2_msg_priv.h"
> > > #include "aie2_pci.h"
> > > #include "aie2_solver.h"
> > > #include "amdxdna_ctx.h"
> > > @@ -17,6 +20,361 @@
> > > #include "amdxdna_mailbox.h"
> > > #include "amdxdna_pci_drv.h"
> > > +bool force_cmdlist;
> > > +module_param(force_cmdlist, bool, 0600);
> > > +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
> > > +
> > > +#define HWCTX_MAX_TIMEOUT 60000 /* miliseconds */
> > > +
> > > +static int
> > > +aie2_hwctx_add_job(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job)
> > > +{
> > > + struct amdxdna_sched_job *other;
> > > + int idx;
> > > +
> > > + idx = get_job_idx(hwctx->priv->seq);
> > > + /* When pending list full, hwctx->seq points to oldest fence */
> > > + other = hwctx->priv->pending[idx];
> > > + if (other && other->fence)
> > > + return -EAGAIN;
> > > +
> > > + if (other) {
> > > + dma_fence_put(other->out_fence);
> > > + amdxdna_job_put(other);
> > > + }
> > > +
> > > + hwctx->priv->pending[idx] = job;
> > > + job->seq = hwctx->priv->seq++;
> > > + kref_get(&job->refcnt);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static struct amdxdna_sched_job *
> > > +aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
> > > +{
> > > + int idx;
> > > +
> > > + /* Special sequence number for oldest fence if exist */
> > > + if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
> > > + idx = get_job_idx(hwctx->priv->seq);
> > > + goto out;
> > > + }
> > > +
> > > + if (seq >= hwctx->priv->seq)
> > > + return ERR_PTR(-EINVAL);
> > > +
> > > + if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
> > > + return NULL;
> > > +
> > > + idx = get_job_idx(seq);
> > > +
> > > +out:
> > > + return hwctx->priv->pending[idx];
> > > +}
> > > +
> > > +/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
> > > +static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
> > > + struct drm_sched_job *bad_job)
> > > +{
> > > + drm_sched_stop(&hwctx->priv->sched, bad_job);
> > > + aie2_destroy_context(xdna->dev_handle, hwctx);
> > > +}
> > > +
> > > +static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
> > > +{
> > > + struct amdxdna_gem_obj *heap = hwctx->priv->heap;
> > > + int ret;
> > > +
> > > + ret = aie2_create_context(xdna->dev_handle, hwctx);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
> > > + goto out;
> > > + }
> > > +
> > > + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
> > > + heap->mem.userptr, heap->mem.size);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
> > > + goto out;
> > > + }
> > > +
> > > + if (hwctx->status != HWCTX_STAT_READY) {
> > > + XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
> > > + goto out;
> > > + }
> > > +
> > > + ret = aie2_config_cu(hwctx);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
> > > + goto out;
> > > + }
> > > +
> > > +out:
> > > + drm_sched_start(&hwctx->priv->sched);
> > > + XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
> > > + return ret;
> > > +}
> > > +
> > > +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
> > > +{
> > > + struct amdxdna_dev *xdna = client->xdna;
> > > + struct amdxdna_hwctx *hwctx;
> > > + int next = 0;
> > > +
> > > + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> > > + mutex_lock(&client->hwctx_lock);
> > > + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
> > > + /* check if the HW context uses the error column */
> > > + if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
> > > + continue;
> > > +
> > > + aie2_hwctx_stop(xdna, hwctx, NULL);
> > > + hwctx->old_status = hwctx->status;
> > > + hwctx->status = HWCTX_STAT_STOP;
> > > + XDNA_DBG(xdna, "Stop %s", hwctx->name);
> > > + }
> > > + mutex_unlock(&client->hwctx_lock);
> > > +}
> > > +
> > > +void aie2_restart_ctx(struct amdxdna_client *client)
> > > +{
> > > + struct amdxdna_dev *xdna = client->xdna;
> > > + struct amdxdna_hwctx *hwctx;
> > > + int next = 0;
> > > +
> > > + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> > > + mutex_lock(&client->hwctx_lock);
> > > + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
> > > + if (hwctx->status != HWCTX_STAT_STOP)
> > > + continue;
> > > +
> > > + hwctx->status = hwctx->old_status;
> > > + XDNA_DBG(xdna, "Resetting %s", hwctx->name);
> > > + aie2_hwctx_restart(xdna, hwctx);
> > > + }
> > > + mutex_unlock(&client->hwctx_lock);
> > > +}
> > > +
> > > +static int aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx)
> > > +{
> > > + struct amdxdna_sched_job *job;
> > > +
> > > + mutex_lock(&hwctx->priv->io_lock);
> > > + if (!hwctx->priv->seq) {
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > + return 0;
> > > + }
> > > +
> > > + job = aie2_hwctx_get_job(hwctx, hwctx->priv->seq - 1);
> > > + if (IS_ERR_OR_NULL(job)) {
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > + XDNA_WARN(hwctx->client->xdna, "Corrupted pending list");
> > > + return 0;
> > > + }
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > +
> > > + wait_event(hwctx->priv->job_free_wq, !job->fence);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static void
> > > +aie2_sched_notify(struct amdxdna_sched_job *job)
> > > +{
> > > + struct dma_fence *fence = job->fence;
> > > +
> > > + job->hwctx->priv->completed++;
> > > + dma_fence_signal(fence);
> > > + trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
> > > + dma_fence_put(fence);
> > > + mmput(job->mm);
> > > + amdxdna_job_put(job);
> > > +}
> > > +
> > > +static int
> > > +aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
> > > +{
> > > + struct amdxdna_sched_job *job = handle;
> > > + struct amdxdna_gem_obj *cmd_abo;
> > > + u32 ret = 0;
> > > + u32 status;
> > > +
> > > + cmd_abo = job->cmd_bo;
> > > +
> > > + if (unlikely(!data))
> > > + goto out;
> > > +
> > > + if (unlikely(size != sizeof(u32))) {
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> > > + ret = -EINVAL;
> > > + goto out;
> > > + }
> > > +
> > > + status = *data;
> > > + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
> > > + if (status == AIE2_STATUS_SUCCESS)
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
> > > + else
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
> > > +
> > > +out:
> > > + aie2_sched_notify(job);
> > > + return ret;
> > > +}
> > > +
> > > +static int
> > > +aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
> > > +{
> > > + struct amdxdna_sched_job *job = handle;
> > > + u32 ret = 0;
> > > + u32 status;
> > > +
> > > + if (unlikely(!data))
> > > + goto out;
> > > +
> > > + if (unlikely(size != sizeof(u32))) {
> > > + ret = -EINVAL;
> > > + goto out;
> > > + }
> > > +
> > > + status = *data;
> > > + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
> > > +
> > > +out:
> > > + aie2_sched_notify(job);
> > > + return ret;
> > > +}
> > > +
> > > +static int
> > > +aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size)
> > > +{
> > > + struct amdxdna_sched_job *job = handle;
> > > + struct amdxdna_gem_obj *cmd_abo;
> > > + struct cmd_chain_resp *resp;
> > > + struct amdxdna_dev *xdna;
> > > + u32 fail_cmd_status;
> > > + u32 fail_cmd_idx;
> > > + u32 ret = 0;
> > > +
> > > + cmd_abo = job->cmd_bo;
> > > + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> > > + ret = -EINVAL;
> > > + goto out;
> > > + }
> > > +
> > > + resp = (struct cmd_chain_resp *)data;
> > > + xdna = job->hwctx->client->xdna;
> > > + XDNA_DBG(xdna, "Status 0x%x", resp->status);
> > > + if (resp->status == AIE2_STATUS_SUCCESS) {
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
> > > + goto out;
> > > + }
> > > +
> > > + /* Slow path to handle error, read from ringbuf on BAR */
> > > + fail_cmd_idx = resp->fail_cmd_idx;
> > > + fail_cmd_status = resp->fail_cmd_status;
> > > + XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x",
> > > + fail_cmd_idx, fail_cmd_status);
> > > +
> > > + if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> > > + ret = -EINVAL;
> > > + goto out;
> > > + }
> > > + amdxdna_cmd_set_state(cmd_abo, fail_cmd_status);
> > > +
> > > + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
> > > + struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
> > > +
> > > + cc->error_index = fail_cmd_idx;
> > > + if (cc->error_index >= cc->command_count)
> > > + cc->error_index = 0;
> > > + }
> > > +out:
> > > + aie2_sched_notify(job);
> > > + return ret;
> > > +}
> > > +
> > > +static struct dma_fence *
> > > +aie2_sched_job_run(struct drm_sched_job *sched_job)
> > > +{
> > > + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> > > + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> > > + struct amdxdna_hwctx *hwctx = job->hwctx;
> > > + struct dma_fence *fence;
> > > + int ret;
> > > +
> > > + if (!mmget_not_zero(job->mm))
> > > + return ERR_PTR(-ESRCH);
> > > +
> > > + kref_get(&job->refcnt);
> > > + fence = dma_fence_get(job->fence);
> > > +
> > > + if (unlikely(!cmd_abo)) {
> > > + ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler);
> > > + goto out;
> > > + }
> > > +
> > > + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW);
> > > +
> > > + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN)
> > > + ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
> > > + else if (force_cmdlist)
> > > + ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
> > > + else
> > > + ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler);
> > > +
> > > +out:
> > > + if (ret) {
> > > + dma_fence_put(job->fence);
> > > + amdxdna_job_put(job);
> > > + mmput(job->mm);
> > > + fence = ERR_PTR(ret);
> > > + }
> > > + trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
> > > +
> > > + return fence;
> > > +}
> > > +
> > > +static void aie2_sched_job_free(struct drm_sched_job *sched_job)
> > > +{
> > > + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> > > + struct amdxdna_hwctx *hwctx = job->hwctx;
> > > +
> > > + trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
> > > + drm_sched_job_cleanup(sched_job);
> > > + job->fence = NULL;
> > > + amdxdna_job_put(job);
> > > +
> > > + wake_up(&hwctx->priv->job_free_wq);
> > > +}
> > > +
> > > +static enum drm_gpu_sched_stat
> > > +aie2_sched_job_timedout(struct drm_sched_job *sched_job)
> > > +{
> > > + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> > > + struct amdxdna_hwctx *hwctx = job->hwctx;
> > > + struct amdxdna_dev *xdna;
> > > +
> > > + xdna = hwctx->client->xdna;
> > > + trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> > > + mutex_lock(&xdna->dev_lock);
> > > + aie2_hwctx_stop(xdna, hwctx, sched_job);
> > > +
> > > + aie2_hwctx_restart(xdna, hwctx);
> > > + mutex_unlock(&xdna->dev_lock);
> > > +
> > > + return DRM_GPU_SCHED_STAT_NOMINAL;
> > > +}
> > > +
> > > +const struct drm_sched_backend_ops sched_ops = {
> > > + .run_job = aie2_sched_job_run,
> > > + .free_job = aie2_sched_job_free,
> > > + .timedout_job = aie2_sched_job_timedout,
> > > +};
> > > +
> > > static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx)
> > > {
> > > struct amdxdna_dev *xdna = hwctx->client->xdna;
> > > @@ -130,9 +488,10 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> > > {
> > > struct amdxdna_client *client = hwctx->client;
> > > struct amdxdna_dev *xdna = client->xdna;
> > > + struct drm_gpu_scheduler *sched;
> > > struct amdxdna_hwctx_priv *priv;
> > > struct amdxdna_gem_obj *heap;
> > > - int ret;
> > > + int i, ret;
> > > priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
> > > if (!priv)
> > > @@ -157,10 +516,48 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> > > goto put_heap;
> > > }
> > > + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
> > > + struct amdxdna_gem_obj *abo;
> > > + struct amdxdna_drm_create_bo args = {
> > > + .flags = 0,
> > > + .type = AMDXDNA_BO_DEV,
> > > + .vaddr = 0,
> > > + .size = MAX_CHAIN_CMDBUF_SIZE,
> > > + };
> > > +
> > > + abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, client->filp, true);
> > > + if (IS_ERR(abo)) {
> > > + ret = PTR_ERR(abo);
> > > + goto free_cmd_bufs;
> > > + }
> > > +
> > > + XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx",
> > > + i, abo->mem.dev_addr, abo->mem.size);
> > > + priv->cmd_buf[i] = abo;
> > > + }
> > > +
> > > + sched = &priv->sched;
> > > + mutex_init(&priv->io_lock);
> > > + ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
> > > + HWCTX_MAX_CMDS, 0, msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
> > > + NULL, NULL, hwctx->name, xdna->ddev.dev);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
> > > + goto free_cmd_bufs;
> > > + }
> > > +
> > > + ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
> > > + &sched, 1, NULL);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Failed to initial sched entiry. ret %d", ret);
> > > + goto free_sched;
> > > + }
> > > + init_waitqueue_head(&priv->job_free_wq);
> > > +
> > > ret = aie2_hwctx_col_list(hwctx);
> > > if (ret) {
> > > XDNA_ERR(xdna, "Create col list failed, ret %d", ret);
> > > - goto unpin;
> > > + goto free_entity;
> > > }
> > > ret = aie2_alloc_resource(hwctx);
> > > @@ -185,7 +582,16 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> > > aie2_release_resource(hwctx);
> > > free_col_list:
> > > kfree(hwctx->col_list);
> > > -unpin:
> > > +free_entity:
> > > + drm_sched_entity_destroy(&priv->entity);
> > > +free_sched:
> > > + drm_sched_fini(&priv->sched);
> > > +free_cmd_bufs:
> > > + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
> > > + if (!priv->cmd_buf[i])
> > > + continue;
> > > + drm_gem_object_put(to_gobj(priv->cmd_buf[i]));
> > > + }
> > > amdxdna_gem_unpin(heap);
> > > put_heap:
> > > drm_gem_object_put(to_gobj(heap));
> > > @@ -196,11 +602,43 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> > > void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
> > > {
> > > + struct amdxdna_sched_job *job;
> > > + struct amdxdna_dev *xdna;
> > > + int idx;
> > > +
> > > + xdna = hwctx->client->xdna;
> > > + drm_sched_wqueue_stop(&hwctx->priv->sched);
> > > +
> > > + /* Now, scheduler will not send command to device. */
> > > aie2_release_resource(hwctx);
> > > + /*
> > > + * All submitted commands are aborted.
> > > + * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run()
> > > + * will return NODEV if it is called.
> > > + */
> > > + drm_sched_wqueue_start(&hwctx->priv->sched);
> > > +
> > > + aie2_hwctx_wait_for_idle(hwctx);
> > > + drm_sched_entity_destroy(&hwctx->priv->entity);
> > > + drm_sched_fini(&hwctx->priv->sched);
> > > +
> > > + for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
> > > + job = hwctx->priv->pending[idx];
> > > + if (!job)
> > > + continue;
> > > +
> > > + dma_fence_put(job->out_fence);
> > > + amdxdna_job_put(job);
> > > + }
> > > + XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq);
> > > +
> > > + for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++)
> > > + drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx]));
> > > amdxdna_gem_unpin(hwctx->priv->heap);
> > > drm_gem_object_put(to_gobj(hwctx->priv->heap));
> > > + mutex_destroy(&hwctx->priv->io_lock);
> > > kfree(hwctx->col_list);
> > > kfree(hwctx->priv);
> > > kfree(hwctx->cus);
> > > @@ -267,3 +705,183 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu
> > > return -EOPNOTSUPP;
> > > }
> > > }
> > > +
> > > +static int aie2_populate_range(struct amdxdna_gem_obj *abo)
> > > +{
> > > + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> > > + struct mm_struct *mm = abo->mem.notifier.mm;
> > > + struct hmm_range range = { 0 };
> > > + unsigned long timeout;
> > > + int ret;
> > > +
> > > + XDNA_INFO_ONCE(xdna, "populate memory range %llx size %lx",
> > > + abo->mem.userptr, abo->mem.size);
> > > + range.notifier = &abo->mem.notifier;
> > > + range.start = abo->mem.userptr;
> > > + range.end = abo->mem.userptr + abo->mem.size;
> > > + range.hmm_pfns = abo->mem.pfns;
> > > + range.default_flags = HMM_PFN_REQ_FAULT;
> > > +
> > > + if (!mmget_not_zero(mm))
> > > + return -EFAULT;
> > > +
> > > + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +again:
> > > + range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier);
> > > + mmap_read_lock(mm);
> > > + ret = hmm_range_fault(&range);
> > > + mmap_read_unlock(mm);
> > > + if (ret) {
> > > + if (time_after(jiffies, timeout)) {
> > > + ret = -ETIME;
> > > + goto put_mm;
> > > + }
> > > +
> > > + if (ret == -EBUSY)
> > > + goto again;
> > > +
> > > + goto put_mm;
> > > + }
> > > +
> > > + dma_resv_lock(to_gobj(abo)->resv, NULL);
> > > + if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) {
> > > + dma_resv_unlock(to_gobj(abo)->resv);
> > > + goto again;
> > > + }
> > > + abo->mem.map_invalid = false;
> > > + dma_resv_unlock(to_gobj(abo)->resv);
> > > +
> > > +put_mm:
> > > + mmput(mm);
> > > + return ret;
> > > +}
> > > +
> > > +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
> > > +{
> > > + struct amdxdna_dev *xdna = hwctx->client->xdna;
> > > + struct ww_acquire_ctx acquire_ctx;
> > > + struct amdxdna_gem_obj *abo;
> > > + unsigned long timeout = 0;
> > > + int ret, i;
> > > +
> > > + ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
> > > + return ret;
> > > + }
> > > +
> > > + drm_sched_job_arm(&job->base);
> > > + job->out_fence = dma_fence_get(&job->base.s_fence->finished);
> > > +
> > > +retry:
> > > + ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> > > + if (ret) {
> > > + XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret);
> > > + goto put_fence;
> > > + }
> > > +
> > > + for (i = 0; i < job->bo_cnt; i++) {
> > > + abo = to_xdna_obj(job->bos[i]);
> > > + if (abo->mem.map_invalid) {
> > > + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> > > + if (!timeout) {
> > > + timeout = jiffies +
> > > + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > + } else if (time_after(jiffies, timeout)) {
> > > + ret = -ETIME;
> > > + goto put_fence;
> > > + }
> > > +
> > > + ret = aie2_populate_range(abo);
> > > + if (ret)
> > > + goto put_fence;
> > > + goto retry;
> > > + }
> > > +
> > > + ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
> > > + if (ret) {
> > > + XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
> > > + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> > > + goto put_fence;
> > > + }
> > > + }
> > > +
> > > + for (i = 0; i < job->bo_cnt; i++)
> > > + dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
> > > + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
> > > +
> > > + mutex_lock(&hwctx->priv->io_lock);
> > > + ret = aie2_hwctx_add_job(hwctx, job);
> > > + if (ret) {
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > + goto signal_fence;
> > > + }
> > > +
> > > + *seq = job->seq;
> > > + drm_sched_entity_push_job(&job->base);
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > +
> > > + return 0;
> > > +
> > > +signal_fence:
> > > + dma_fence_signal(job->out_fence);
> > > +put_fence:
> > > + dma_fence_put(job->out_fence);
> > > + drm_sched_job_cleanup(&job->base);
> > > + return ret;
> > > +}
> > > +
> > > +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
> > > +{
> > > + signed long remaining = MAX_SCHEDULE_TIMEOUT;
> > > + struct amdxdna_sched_job *job;
> > > + struct dma_fence *out_fence;
> > > + long ret;
> > > +
> > > + mutex_lock(&hwctx->priv->io_lock);
> > > + job = aie2_hwctx_get_job(hwctx, seq);
> > > + if (IS_ERR(job)) {
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > + ret = PTR_ERR(job);
> > > + goto out;
> > > + }
> > > +
> > > + if (unlikely(!job)) {
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > + ret = 0;
> > > + goto out;
> > > + }
> > > + out_fence = dma_fence_get(job->out_fence);
> > > + mutex_unlock(&hwctx->priv->io_lock);
> > > +
> > > + if (timeout)
> > > + remaining = msecs_to_jiffies(timeout);
> > > +
> > > + ret = dma_fence_wait_timeout(out_fence, true, remaining);
> > > + if (!ret)
> > > + ret = -ETIME;
> > > + else if (ret > 0)
> > > + ret = 0;
> > > +
> > > + dma_fence_put(out_fence);
> > > +out:
> > > + return ret;
> > > +}
> > > +
> > > +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
> > > + unsigned long cur_seq)
> > > +{
> > > + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> > > + struct drm_gem_object *gobj = to_gobj(abo);
> > > + long ret;
> > > +
> > > + dma_resv_lock(gobj->resv, NULL);
> > Was randomly looking as I was interested - drive by comment...
> >
> > I think you have a locking inversion here.
> >
> > MMU notifiers are in the path of reclaim (Can't allocate memory),
> > dma-resv lock can allocate memory while being held, thus you can
> > deadlock.
>
> Thanks for point this out, I will use a mutex lock instead.
>
No problem, this locking is pretty easy to get wrong. I also suggest
priming your mutex with reclaim too to catch bug early that only pop
after your notifier fires.
e.g.
fs_reclaim_acquire(GFP_KERNEL);
might_lock(your notifier mutex);
fs_reclaim_release(GFP_KERNEL);
Matt
>
> Lizhi
>
> >
> > Lockdep should blow up here. See [1] [2].
> >
> > [1] https://elixir.bootlin.com/linux/v6.11.3/source/mm/page_alloc.c#L3833
> > [2] https://elixir.bootlin.com/linux/v6.11.3/source/drivers/dma-buf/dma-resv.c#L773
> >
> > Matt
> >
> > > + abo->mem.map_invalid = true;
> > > + mmu_interval_set_seq(&abo->mem.notifier, cur_seq);
> > > + ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP,
> > > + true, MAX_SCHEDULE_TIMEOUT);
> > > + dma_resv_unlock(gobj->resv);
> > > +
> > > + if (!ret || ret == -ERESTARTSYS)
> > > + XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret);
> > > +}
> > > diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> > > index 28bd0560db61..3dc4a9a8571e 100644
> > > --- a/drivers/accel/amdxdna/aie2_message.c
> > > +++ b/drivers/accel/amdxdna/aie2_message.c
> > > @@ -4,10 +4,12 @@
> > > */
> > > #include <drm/amdxdna_accel.h>
> > > +#include <drm/drm_cache.h>
> > > #include <drm/drm_device.h>
> > > #include <drm/drm_gem.h>
> > > #include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/errno.h>
> > > #include <linux/pci.h>
> > > #include <linux/types.h>
> > > @@ -361,3 +363,344 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx)
> > > msg.opcode, resp.status, ret);
> > > return ret;
> > > }
> > > +
> > > +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t))
> > > +{
> > > + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> > > + struct amdxdna_dev *xdna = hwctx->client->xdna;
> > > + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> > > + union {
> > > + struct execute_buffer_req ebuf;
> > > + struct exec_dpu_req dpu;
> > > + } req;
> > > + struct xdna_mailbox_msg msg;
> > > + u32 payload_len;
> > > + void *payload;
> > > + int cu_idx;
> > > + int ret;
> > > + u32 op;
> > > +
> > > + if (!chann)
> > > + return -ENODEV;
> > > +
> > > + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
> > > + if (!payload) {
> > > + XDNA_ERR(xdna, "Invalid command, cannot get payload");
> > > + return -EINVAL;
> > > + }
> > > +
> > > + cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo);
> > > + if (cu_idx < 0) {
> > > + XDNA_DBG(xdna, "Invalid cu idx");
> > > + return -EINVAL;
> > > + }
> > > +
> > > + op = amdxdna_cmd_get_op(cmd_abo);
> > > + switch (op) {
> > > + case ERT_START_CU:
> > > + if (unlikely(payload_len > sizeof(req.ebuf.payload)))
> > > + XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len);
> > > + req.ebuf.cu_idx = cu_idx;
> > > + memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload));
> > > + msg.send_size = sizeof(req.ebuf);
> > > + msg.opcode = MSG_OP_EXECUTE_BUFFER_CF;
> > > + break;
> > > + case ERT_START_NPU: {
> > > + struct amdxdna_cmd_start_npu *sn = payload;
> > > +
> > > + if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload)))
> > > + XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len);
> > > + req.dpu.inst_buf_addr = sn->buffer;
> > > + req.dpu.inst_size = sn->buffer_size;
> > > + req.dpu.inst_prop_cnt = sn->prop_count;
> > > + req.dpu.cu_idx = cu_idx;
> > > + memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload));
> > > + msg.send_size = sizeof(req.dpu);
> > > + msg.opcode = MSG_OP_EXEC_DPU;
> > > + break;
> > > + }
> > > + default:
> > > + XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op);
> > > + return -EINVAL;
> > > + }
> > > + msg.handle = job;
> > > + msg.notify_cb = notify_cb;
> > > + msg.send_data = (u8 *)&req;
> > > + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req,
> > > + 0x40, false);
> > > +
> > > + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Send message failed");
> > > + return ret;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int
> > > +aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset,
> > > + struct amdxdna_gem_obj *abo, u32 *size)
> > > +{
> > > + struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset;
> > > + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
> > > + u32 payload_len;
> > > + void *payload;
> > > +
> > > + if (cu_idx < 0)
> > > + return -EINVAL;
> > > +
> > > + payload = amdxdna_cmd_get_payload(abo, &payload_len);
> > > + if (!payload)
> > > + return -EINVAL;
> > > +
> > > + if (!slot_cf_has_space(offset, payload_len))
> > > + return -ENOSPC;
> > > +
> > > + buf->cu_idx = cu_idx;
> > > + buf->arg_cnt = payload_len / sizeof(u32);
> > > + memcpy(buf->args, payload, payload_len);
> > > + /* Accurate buf size to hint firmware to do necessary copy */
> > > + *size = sizeof(*buf) + payload_len;
> > > + return 0;
> > > +}
> > > +
> > > +static int
> > > +aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset,
> > > + struct amdxdna_gem_obj *abo, u32 *size)
> > > +{
> > > + struct cmd_chain_slot_dpu *buf = cmd_buf + offset;
> > > + int cu_idx = amdxdna_cmd_get_cu_idx(abo);
> > > + struct amdxdna_cmd_start_npu *sn;
> > > + u32 payload_len;
> > > + void *payload;
> > > + u32 arg_sz;
> > > +
> > > + if (cu_idx < 0)
> > > + return -EINVAL;
> > > +
> > > + payload = amdxdna_cmd_get_payload(abo, &payload_len);
> > > + if (!payload)
> > > + return -EINVAL;
> > > + sn = payload;
> > > + arg_sz = payload_len - sizeof(*sn);
> > > + if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE)
> > > + return -EINVAL;
> > > +
> > > + if (!slot_dpu_has_space(offset, arg_sz))
> > > + return -ENOSPC;
> > > +
> > > + buf->inst_buf_addr = sn->buffer;
> > > + buf->inst_size = sn->buffer_size;
> > > + buf->inst_prop_cnt = sn->prop_count;
> > > + buf->cu_idx = cu_idx;
> > > + buf->arg_cnt = arg_sz / sizeof(u32);
> > > + memcpy(buf->args, sn->prop_args, arg_sz);
> > > +
> > > + /* Accurate buf size to hint firmware to do necessary copy */
> > > + *size += sizeof(*buf) + arg_sz;
> > > + return 0;
> > > +}
> > > +
> > > +static int
> > > +aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset,
> > > + struct amdxdna_gem_obj *abo, u32 *size)
> > > +{
> > > + u32 this_op = amdxdna_cmd_get_op(abo);
> > > + void *cmd_buf = cmdbuf_abo->mem.kva;
> > > + int ret;
> > > +
> > > + if (this_op != op) {
> > > + ret = -EINVAL;
> > > + goto done;
> > > + }
> > > +
> > > + switch (op) {
> > > + case ERT_START_CU:
> > > + ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size);
> > > + break;
> > > + case ERT_START_NPU:
> > > + ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size);
> > > + break;
> > > + default:
> > > + ret = -EOPNOTSUPP;
> > > + }
> > > +
> > > +done:
> > > + if (ret) {
> > > + XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d",
> > > + op, ret);
> > > + }
> > > + return ret;
> > > +}
> > > +
> > > +static inline struct amdxdna_gem_obj *
> > > +aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
> > > +{
> > > + int idx = get_job_idx(job->seq);
> > > +
> > > + return job->hwctx->priv->cmd_buf[idx];
> > > +}
> > > +
> > > +static void
> > > +aie2_cmdlist_prepare_request(struct cmd_chain_req *req,
> > > + struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt)
> > > +{
> > > + req->buf_addr = cmdbuf_abo->mem.dev_addr;
> > > + req->buf_size = size;
> > > + req->count = cnt;
> > > + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size);
> > > + XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d",
> > > + req->buf_addr, size, cnt);
> > > +}
> > > +
> > > +static inline u32
> > > +aie2_cmd_op_to_msg_op(u32 op)
> > > +{
> > > + switch (op) {
> > > + case ERT_START_CU:
> > > + return MSG_OP_CHAIN_EXEC_BUFFER_CF;
> > > + case ERT_START_NPU:
> > > + return MSG_OP_CHAIN_EXEC_DPU;
> > > + default:
> > > + return MSG_OP_MAX_OPCODE;
> > > + }
> > > +}
> > > +
> > > +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
> > > + struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t))
> > > +{
> > > + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
> > > + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> > > + struct amdxdna_client *client = hwctx->client;
> > > + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> > > + struct amdxdna_cmd_chain *payload;
> > > + struct xdna_mailbox_msg msg;
> > > + struct cmd_chain_req req;
> > > + u32 payload_len;
> > > + u32 offset = 0;
> > > + u32 size;
> > > + int ret;
> > > + u32 op;
> > > + u32 i;
> > > +
> > > + op = amdxdna_cmd_get_op(cmd_abo);
> > > + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
> > > + if (op != ERT_CMD_CHAIN || !payload ||
> > > + payload_len < struct_size(payload, data, payload->command_count))
> > > + return -EINVAL;
> > > +
> > > + for (i = 0; i < payload->command_count; i++) {
> > > + u32 boh = (u32)(payload->data[i]);
> > > + struct amdxdna_gem_obj *abo;
> > > +
> > > + abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD);
> > > + if (!abo) {
> > > + XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh);
> > > + return -ENOENT;
> > > + }
> > > +
> > > + /* All sub-cmd should have same op, use the first one. */
> > > + if (i == 0)
> > > + op = amdxdna_cmd_get_op(abo);
> > > +
> > > + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size);
> > > + amdxdna_gem_put_obj(abo);
> > > + if (ret)
> > > + return -EINVAL;
> > > +
> > > + offset += size;
> > > + }
> > > +
> > > + /* The offset is the accumulated total size of the cmd buffer */
> > > + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count);
> > > +
> > > + msg.opcode = aie2_cmd_op_to_msg_op(op);
> > > + if (msg.opcode == MSG_OP_MAX_OPCODE)
> > > + return -EOPNOTSUPP;
> > > + msg.handle = job;
> > > + msg.notify_cb = notify_cb;
> > > + msg.send_data = (u8 *)&req;
> > > + msg.send_size = sizeof(req);
> > > + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> > > + if (ret) {
> > > + XDNA_ERR(hwctx->client->xdna, "Send message failed");
> > > + return ret;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
> > > + struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t))
> > > +{
> > > + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
> > > + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> > > + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> > > + struct xdna_mailbox_msg msg;
> > > + struct cmd_chain_req req;
> > > + u32 size;
> > > + int ret;
> > > + u32 op;
> > > +
> > > + op = amdxdna_cmd_get_op(cmd_abo);
> > > + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1);
> > > +
> > > + msg.opcode = aie2_cmd_op_to_msg_op(op);
> > > + if (msg.opcode == MSG_OP_MAX_OPCODE)
> > > + return -EOPNOTSUPP;
> > > + msg.handle = job;
> > > + msg.notify_cb = notify_cb;
> > > + msg.send_data = (u8 *)&req;
> > > + msg.send_size = sizeof(req);
> > > + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> > > + if (ret) {
> > > + XDNA_ERR(hwctx->client->xdna, "Send message failed");
> > > + return ret;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t))
> > > +{
> > > + struct mailbox_channel *chann = hwctx->priv->mbox_chann;
> > > + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]);
> > > + struct amdxdna_dev *xdna = hwctx->client->xdna;
> > > + struct xdna_mailbox_msg msg;
> > > + struct sync_bo_req req;
> > > + int ret = 0;
> > > +
> > > + req.src_addr = 0;
> > > + req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr;
> > > + req.size = abo->mem.size;
> > > +
> > > + /* Device to Host */
> > > + req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) |
> > > + FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM);
> > > +
> > > + XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed",
> > > + req.size, req.src_addr, req.dst_addr);
> > > +
> > > + msg.handle = job;
> > > + msg.notify_cb = notify_cb;
> > > + msg.send_data = (u8 *)&req;
> > > + msg.send_size = sizeof(req);
> > > + msg.opcode = MSG_OP_SYNC_BO;
> > > +
> > > + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Send message failed");
> > > + return ret;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> > > index ee9f114bc229..6017826a7104 100644
> > > --- a/drivers/accel/amdxdna/aie2_pci.c
> > > +++ b/drivers/accel/amdxdna/aie2_pci.c
> > > @@ -5,8 +5,10 @@
> > > #include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_managed.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/errno.h>
> > > #include <linux/firmware.h>
> > > #include <linux/iommu.h>
> > > @@ -17,6 +19,7 @@
> > > #include "aie2_pci.h"
> > > #include "aie2_solver.h"
> > > #include "amdxdna_ctx.h"
> > > +#include "amdxdna_gem.h"
> > > #include "amdxdna_mailbox.h"
> > > #include "amdxdna_pci_drv.h"
> > > @@ -499,4 +502,7 @@ const struct amdxdna_dev_ops aie2_ops = {
> > > .hwctx_init = aie2_hwctx_init,
> > > .hwctx_fini = aie2_hwctx_fini,
> > > .hwctx_config = aie2_hwctx_config,
> > > + .cmd_submit = aie2_cmd_submit,
> > > + .cmd_wait = aie2_cmd_wait,
> > > + .hmm_invalidate = aie2_hmm_invalidate,
> > > };
> > > diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> > > index 3ac936e2c9d1..81877d9c0542 100644
> > > --- a/drivers/accel/amdxdna/aie2_pci.h
> > > +++ b/drivers/accel/amdxdna/aie2_pci.h
> > > @@ -76,6 +76,7 @@ enum psp_reg_idx {
> > > PSP_MAX_REGS /* Keep this at the end */
> > > };
> > > +struct amdxdna_client;
> > > struct amdxdna_fw_ver;
> > > struct amdxdna_hwctx;
> > > @@ -118,9 +119,28 @@ struct rt_config {
> > > u32 value;
> > > };
> > > +/*
> > > + * Define the maximum number of pending commands in a hardware context.
> > > + * Must be power of 2!
> > > + */
> > > +#define HWCTX_MAX_CMDS 4
> > > +#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1))
> > > struct amdxdna_hwctx_priv {
> > > struct amdxdna_gem_obj *heap;
> > > void *mbox_chann;
> > > +
> > > + struct drm_gpu_scheduler sched;
> > > + struct drm_sched_entity entity;
> > > +
> > > + struct mutex io_lock; /* protect seq and cmd order */
> > > + struct wait_queue_head job_free_wq;
> > > + struct amdxdna_sched_job *pending[HWCTX_MAX_CMDS];
> > > + u32 num_pending;
> > > + u64 seq;
> > > + /* Completed job counter */
> > > + u64 completed;
> > > +
> > > + struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS];
> > > };
> > > struct amdxdna_dev_hdl {
> > > @@ -199,10 +219,25 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
> > > int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> > > int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> > > int aie2_config_cu(struct amdxdna_hwctx *hwctx);
> > > +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t));
> > > +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
> > > + struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t));
> > > +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
> > > + struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t));
> > > +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
> > > + int (*notify_cb)(void *, const u32 *, size_t));
> > > /* aie2_hwctx.c */
> > > int aie2_hwctx_init(struct amdxdna_hwctx *hwctx);
> > > void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx);
> > > int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
> > > +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> > > +int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
> > > +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> > > +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
> > > +void aie2_restart_ctx(struct amdxdna_client *client);
> > > #endif /* _AIE2_PCI_H_ */
> > > diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c
> > > index 2efcfd1941bf..35ba55e2ab1e 100644
> > > --- a/drivers/accel/amdxdna/aie2_psp.c
> > > +++ b/drivers/accel/amdxdna/aie2_psp.c
> > > @@ -4,8 +4,10 @@
> > > */
> > > #include <drm/drm_device.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_managed.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/iopoll.h>
> > > #include "aie2_pci.h"
> > > diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c
> > > index 3fa7064649aa..91893d438da7 100644
> > > --- a/drivers/accel/amdxdna/aie2_smu.c
> > > +++ b/drivers/accel/amdxdna/aie2_smu.c
> > > @@ -4,7 +4,9 @@
> > > */
> > > #include <drm/drm_device.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/iopoll.h>
> > > #include "aie2_pci.h"
> > > diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
> > > index 8acf8bfe0db9..b76640e1fdd0 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_ctx.c
> > > +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
> > > @@ -7,17 +7,65 @@
> > > #include <drm/drm_device.h>
> > > #include <drm/drm_drv.h>
> > > #include <drm/drm_file.h>
> > > +#include <drm/drm_gem.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > +#include <trace/events/amdxdna.h>
> > > #include "amdxdna_ctx.h"
> > > +#include "amdxdna_gem.h"
> > > #include "amdxdna_pci_drv.h"
> > > #define MAX_HWCTX_ID 255
> > > +#define MAX_ARG_COUNT 4095
> > > -static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
> > > +struct amdxdna_fence {
> > > + struct dma_fence base;
> > > + spinlock_t lock; /* for base */
> > > + struct amdxdna_hwctx *hwctx;
> > > +};
> > > +
> > > +static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence)
> > > +{
> > > + return KBUILD_MODNAME;
> > > +}
> > > +
> > > +static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence)
> > > +{
> > > + struct amdxdna_fence *xdna_fence;
> > > +
> > > + xdna_fence = container_of(fence, struct amdxdna_fence, base);
> > > +
> > > + return xdna_fence->hwctx->name;
> > > +}
> > > +
> > > +static const struct dma_fence_ops fence_ops = {
> > > + .get_driver_name = amdxdna_fence_get_driver_name,
> > > + .get_timeline_name = amdxdna_fence_get_timeline_name,
> > > +};
> > > +
> > > +static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
> > > +{
> > > + struct amdxdna_fence *fence;
> > > +
> > > + fence = kzalloc(sizeof(*fence), GFP_KERNEL);
> > > + if (!fence)
> > > + return NULL;
> > > +
> > > + fence->hwctx = hwctx;
> > > + spin_lock_init(&fence->lock);
> > > + dma_fence_init(&fence->base, &fence_ops, &fence->lock, hwctx->id, 0);
> > > + return &fence->base;
> > > +}
> > > +
> > > +static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
> > > + struct srcu_struct *ss)
> > > {
> > > struct amdxdna_dev *xdna = hwctx->client->xdna;
> > > + synchronize_srcu(ss);
> > > +
> > > /* At this point, user is not able to submit new commands */
> > > mutex_lock(&xdna->dev_lock);
> > > xdna->dev_info->ops->hwctx_fini(hwctx);
> > > @@ -27,6 +75,46 @@ static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
> > > kfree(hwctx);
> > > }
> > > +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size)
> > > +{
> > > + struct amdxdna_cmd *cmd = abo->mem.kva;
> > > + u32 num_masks, count;
> > > +
> > > + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
> > > + num_masks = 0;
> > > + else
> > > + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
> > > +
> > > + if (size) {
> > > + count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header);
> > > + if (unlikely(count <= num_masks)) {
> > > + *size = 0;
> > > + return NULL;
> > > + }
> > > + *size = (count - num_masks) * sizeof(u32);
> > > + }
> > > + return &cmd->data[num_masks];
> > > +}
> > > +
> > > +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
> > > +{
> > > + struct amdxdna_cmd *cmd = abo->mem.kva;
> > > + u32 num_masks, i;
> > > + u32 *cu_mask;
> > > +
> > > + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
> > > + return -1;
> > > +
> > > + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
> > > + cu_mask = cmd->data;
> > > + for (i = 0; i < num_masks; i++) {
> > > + if (cu_mask[i])
> > > + return ffs(cu_mask[i]) - 1;
> > > + }
> > > +
> > > + return -1;
> > > +}
> > > +
> > > /*
> > > * This should be called in close() and remove(). DO NOT call in other syscalls.
> > > * This guarantee that when hwctx and resources will be released, if user
> > > @@ -43,7 +131,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client)
> > > client->pid, hwctx->id);
> > > idr_remove(&client->hwctx_idr, hwctx->id);
> > > mutex_unlock(&client->hwctx_lock);
> > > - amdxdna_hwctx_destroy(hwctx);
> > > + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
> > > mutex_lock(&client->hwctx_lock);
> > > }
> > > mutex_unlock(&client->hwctx_lock);
> > > @@ -134,6 +222,12 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
> > > if (!drm_dev_enter(dev, &idx))
> > > return -ENODEV;
> > > + /*
> > > + * Use hwctx_lock to achieve exclusion with other hwctx writers,
> > > + * SRCU to synchronize with exec/wait command ioctls.
> > > + *
> > > + * The pushed jobs are handled by DRM scheduler during destroy.
> > > + */
> > > mutex_lock(&client->hwctx_lock);
> > > hwctx = idr_find(&client->hwctx_idr, args->handle);
> > > if (!hwctx) {
> > > @@ -146,7 +240,7 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct d
> > > idr_remove(&client->hwctx_idr, hwctx->id);
> > > mutex_unlock(&client->hwctx_lock);
> > > - amdxdna_hwctx_destroy(hwctx);
> > > + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
> > > XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, args->handle);
> > > out:
> > > @@ -160,10 +254,10 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> > > struct amdxdna_drm_config_hwctx *args = data;
> > > struct amdxdna_dev *xdna = to_xdna_dev(dev);
> > > struct amdxdna_hwctx *hwctx;
> > > + int ret, idx;
> > > u32 buf_size;
> > > void *buf;
> > > u64 val;
> > > - int ret;
> > > if (!xdna->dev_info->ops->hwctx_config)
> > > return -EOPNOTSUPP;
> > > @@ -202,17 +296,286 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> > > }
> > > mutex_lock(&xdna->dev_lock);
> > > + idx = srcu_read_lock(&client->hwctx_srcu);
> > > hwctx = idr_find(&client->hwctx_idr, args->handle);
> > > if (!hwctx) {
> > > XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle);
> > > ret = -EINVAL;
> > > - goto unlock;
> > > + goto unlock_srcu;
> > > }
> > > ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size);
> > > -unlock:
> > > +unlock_srcu:
> > > + srcu_read_unlock(&client->hwctx_srcu, idx);
> > > mutex_unlock(&xdna->dev_lock);
> > > kfree(buf);
> > > return ret;
> > > }
> > > +
> > > +static void
> > > +amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
> > > +{
> > > + int i;
> > > +
> > > + for (i = 0; i < job->bo_cnt; i++) {
> > > + if (!job->bos[i])
> > > + break;
> > > + drm_gem_object_put(job->bos[i]);
> > > + }
> > > +}
> > > +
> > > +static int
> > > +amdxdna_arg_bos_lookup(struct amdxdna_client *client,
> > > + struct amdxdna_sched_job *job,
> > > + u32 *bo_hdls, u32 bo_cnt)
> > > +{
> > > + struct drm_gem_object *gobj;
> > > + int i, ret;
> > > +
> > > + job->bo_cnt = bo_cnt;
> > > + for (i = 0; i < job->bo_cnt; i++) {
> > > + struct amdxdna_gem_obj *abo;
> > > +
> > > + gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]);
> > > + if (!gobj) {
> > > + ret = -ENOENT;
> > > + goto put_shmem_bo;
> > > + }
> > > + abo = to_xdna_obj(gobj);
> > > +
> > > + mutex_lock(&abo->lock);
> > > + if (abo->pinned) {
> > > + mutex_unlock(&abo->lock);
> > > + job->bos[i] = gobj;
> > > + continue;
> > > + }
> > > +
> > > + ret = amdxdna_gem_pin_nolock(abo);
> > > + if (ret) {
> > > + mutex_unlock(&abo->lock);
> > > + drm_gem_object_put(gobj);
> > > + goto put_shmem_bo;
> > > + }
> > > + abo->pinned = true;
> > > + mutex_unlock(&abo->lock);
> > > +
> > > + job->bos[i] = gobj;
> > > + }
> > > +
> > > + return 0;
> > > +
> > > +put_shmem_bo:
> > > + amdxdna_arg_bos_put(job);
> > > + return ret;
> > > +}
> > > +
> > > +static void amdxdna_sched_job_release(struct kref *ref)
> > > +{
> > > + struct amdxdna_sched_job *job;
> > > +
> > > + job = container_of(ref, struct amdxdna_sched_job, refcnt);
> > > +
> > > + trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release");
> > > + amdxdna_arg_bos_put(job);
> > > + amdxdna_gem_put_obj(job->cmd_bo);
> > > + kfree(job);
> > > +}
> > > +
> > > +void amdxdna_job_put(struct amdxdna_sched_job *job)
> > > +{
> > > + kref_put(&job->refcnt, amdxdna_sched_job_release);
> > > +}
> > > +
> > > +int amdxdna_cmd_submit(struct amdxdna_client *client,
> > > + u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
> > > + u32 hwctx_hdl, u64 *seq)
> > > +{
> > > + struct amdxdna_dev *xdna = client->xdna;
> > > + struct amdxdna_sched_job *job;
> > > + struct amdxdna_hwctx *hwctx;
> > > + int ret, idx;
> > > +
> > > + XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d", cmd_bo_hdl, arg_bo_cnt);
> > > + job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL);
> > > + if (!job)
> > > + return -ENOMEM;
> > > +
> > > + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) {
> > > + job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD);
> > > + if (!job->cmd_bo) {
> > > + XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl);
> > > + ret = -EINVAL;
> > > + goto free_job;
> > > + }
> > > + } else {
> > > + job->cmd_bo = NULL;
> > > + }
> > > +
> > > + ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt);
> > > + if (ret) {
> > > + XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret);
> > > + goto cmd_put;
> > > + }
> > > +
> > > + idx = srcu_read_lock(&client->hwctx_srcu);
> > > + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
> > > + if (!hwctx) {
> > > + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
> > > + client->pid, hwctx_hdl);
> > > + ret = -EINVAL;
> > > + goto unlock_srcu;
> > > + }
> > > +
> > > + if (hwctx->status != HWCTX_STAT_READY) {
> > > + XDNA_ERR(xdna, "HW Context is not ready");
> > > + ret = -EINVAL;
> > > + goto unlock_srcu;
> > > + }
> > > +
> > > + job->hwctx = hwctx;
> > > + job->mm = current->mm;
> > > +
> > > + job->fence = amdxdna_fence_create(hwctx);
> > > + if (!job->fence) {
> > > + XDNA_ERR(xdna, "Failed to create fence");
> > > + ret = -ENOMEM;
> > > + goto unlock_srcu;
> > > + }
> > > + kref_init(&job->refcnt);
> > > +
> > > + ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq);
> > > + if (ret)
> > > + goto put_fence;
> > > +
> > > + /*
> > > + * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated
> > > + * resource after synchronize_srcu(). The submitted jobs should be
> > > + * handled by the queue, for example DRM scheduler, in device layer.
> > > + * For here we can unlock SRCU.
> > > + */
> > > + srcu_read_unlock(&client->hwctx_srcu, idx);
> > > + trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed");
> > > +
> > > + return 0;
> > > +
> > > +put_fence:
> > > + dma_fence_put(job->fence);
> > > +unlock_srcu:
> > > + srcu_read_unlock(&client->hwctx_srcu, idx);
> > > + amdxdna_arg_bos_put(job);
> > > +cmd_put:
> > > + amdxdna_gem_put_obj(job->cmd_bo);
> > > +free_job:
> > > + kfree(job);
> > > + return ret;
> > > +}
> > > +
> > > +/*
> > > + * The submit command ioctl submits a command to firmware. One firmware command
> > > + * may contain multiple command BOs for processing as a whole.
> > > + * The command sequence number is returned which can be used for wait command ioctl.
> > > + */
> > > +static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client,
> > > + struct amdxdna_drm_exec_cmd *args)
> > > +{
> > > + struct amdxdna_dev *xdna = client->xdna;
> > > + u32 *arg_bo_hdls;
> > > + u32 cmd_bo_hdl;
> > > + int ret;
> > > +
> > > + if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) {
> > > + XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count);
> > > + return -EINVAL;
> > > + }
> > > +
> > > + /* Only support single command for now. */
> > > + if (args->cmd_count != 1) {
> > > + XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count);
> > > + return -EINVAL;
> > > + }
> > > +
> > > + cmd_bo_hdl = (u32)args->cmd_handles;
> > > + arg_bo_hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL);
> > > + if (!arg_bo_hdls)
> > > + return -ENOMEM;
> > > + ret = copy_from_user(arg_bo_hdls, u64_to_user_ptr(args->args),
> > > + args->arg_count * sizeof(u32));
> > > + if (ret) {
> > > + ret = -EFAULT;
> > > + goto free_cmd_bo_hdls;
> > > + }
> > > +
> > > + ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls,
> > > + args->arg_count, args->hwctx, &args->seq);
> > > + if (ret)
> > > + XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret);
> > > +
> > > +free_cmd_bo_hdls:
> > > + kfree(arg_bo_hdls);
> > > + if (!ret)
> > > + XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq);
> > > + return ret;
> > > +}
> > > +
> > > +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> > > +{
> > > + struct amdxdna_client *client = filp->driver_priv;
> > > + struct amdxdna_drm_exec_cmd *args = data;
> > > +
> > > + if (args->ext || args->ext_flags)
> > > + return -EINVAL;
> > > +
> > > + switch (args->type) {
> > > + case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
> > > + return amdxdna_drm_submit_execbuf(client, args);
> > > + }
> > > +
> > > + XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
> > > + return -EINVAL;
> > > +}
> > > +
> > > +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
> > > + u64 seq, u32 timeout)
> > > +{
> > > + struct amdxdna_dev *xdna = client->xdna;
> > > + struct amdxdna_hwctx *hwctx;
> > > + int ret, idx;
> > > +
> > > + if (!xdna->dev_info->ops->cmd_wait)
> > > + return -EOPNOTSUPP;
> > > +
> > > + /* For locking concerns, see amdxdna_drm_exec_cmd_ioctl. */
> > > + idx = srcu_read_lock(&client->hwctx_srcu);
> > > + hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
> > > + if (!hwctx) {
> > > + XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
> > > + client->pid, hwctx_hdl);
> > > + ret = -EINVAL;
> > > + goto unlock_hwctx_srcu;
> > > + }
> > > +
> > > + ret = xdna->dev_info->ops->cmd_wait(hwctx, seq, timeout);
> > > +
> > > +unlock_hwctx_srcu:
> > > + srcu_read_unlock(&client->hwctx_srcu, idx);
> > > + return ret;
> > > +}
> > > +
> > > +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> > > +{
> > > + struct amdxdna_client *client = filp->driver_priv;
> > > + struct amdxdna_dev *xdna = to_xdna_dev(dev);
> > > + struct amdxdna_drm_wait_cmd *args = data;
> > > + int ret;
> > > +
> > > + XDNA_DBG(xdna, "PID %d hwctx %d timeout set %d ms for cmd %lld",
> > > + client->pid, args->hwctx, args->timeout, args->seq);
> > > +
> > > + ret = amdxdna_cmd_wait(client, args->hwctx, args->seq, args->timeout);
> > > +
> > > + XDNA_DBG(xdna, "PID %d hwctx %d cmd %lld wait finished, ret %d",
> > > + client->pid, args->hwctx, args->seq, ret);
> > > +
> > > + return ret;
> > > +}
> > > diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> > > index 665b3208897d..65f9c1dfe32c 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> > > +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> > > @@ -6,6 +6,52 @@
> > > #ifndef _AMDXDNA_CTX_H_
> > > #define _AMDXDNA_CTX_H_
> > > +#include "amdxdna_gem.h"
> > > +
> > > +struct amdxdna_hwctx_priv;
> > > +
> > > +enum ert_cmd_opcode {
> > > + ERT_START_CU = 0,
> > > + ERT_CMD_CHAIN = 19,
> > > + ERT_START_NPU = 20,
> > > +};
> > > +
> > > +enum ert_cmd_state {
> > > + ERT_CMD_STATE_INVALID,
> > > + ERT_CMD_STATE_NEW,
> > > + ERT_CMD_STATE_QUEUED,
> > > + ERT_CMD_STATE_RUNNING,
> > > + ERT_CMD_STATE_COMPLETED,
> > > + ERT_CMD_STATE_ERROR,
> > > + ERT_CMD_STATE_ABORT,
> > > + ERT_CMD_STATE_SUBMITTED,
> > > + ERT_CMD_STATE_TIMEOUT,
> > > + ERT_CMD_STATE_NORESPONSE,
> > > +};
> > > +
> > > +/*
> > > + * Interpretation of the beginning of data payload for ERT_START_NPU in
> > > + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args.
> > > + */
> > > +struct amdxdna_cmd_start_npu {
> > > + u64 buffer; /* instruction buffer address */
> > > + u32 buffer_size; /* size of buffer in bytes */
> > > + u32 prop_count; /* properties count */
> > > + u32 prop_args[]; /* properties and regular kernel arguments */
> > > +};
> > > +
> > > +/*
> > > + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
> > > + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
> > > + */
> > > +struct amdxdna_cmd_chain {
> > > + u32 command_count;
> > > + u32 submit_index;
> > > + u32 error_index;
> > > + u32 reserved[3];
> > > + u64 data[] __counted_by(command_count);
> > > +};
> > > +
> > > /* Exec buffer command header format */
> > > #define AMDXDNA_CMD_STATE GENMASK(3, 0)
> > > #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
> > > @@ -40,9 +86,73 @@ struct amdxdna_hwctx {
> > > struct amdxdna_hwctx_param_config_cu *cus;
> > > };
> > > +#define drm_job_to_xdna_job(j) \
> > > + container_of(j, struct amdxdna_sched_job, base)
> > > +
> > > +struct amdxdna_sched_job {
> > > + struct drm_sched_job base;
> > > + struct kref refcnt;
> > > + struct amdxdna_hwctx *hwctx;
> > > + struct mm_struct *mm;
> > > + /* The fence to notice DRM scheduler that job is done by hardware */
> > > + struct dma_fence *fence;
> > > + /* user can wait on this fence */
> > > + struct dma_fence *out_fence;
> > > + u64 seq;
> > > + struct amdxdna_gem_obj *cmd_bo;
> > > + size_t bo_cnt;
> > > + struct drm_gem_object *bos[] __counted_by(bo_cnt);
> > > +};
> > > +
> > > +static inline u32
> > > +amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
> > > +{
> > > + struct amdxdna_cmd *cmd = abo->mem.kva;
> > > +
> > > + return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header);
> > > +}
> > > +
> > > +static inline void
> > > +amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s)
> > > +{
> > > + struct amdxdna_cmd *cmd = abo->mem.kva;
> > > +
> > > + cmd->header &= ~AMDXDNA_CMD_STATE;
> > > + cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s);
> > > +}
> > > +
> > > +static inline enum ert_cmd_state
> > > +amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
> > > +{
> > > + struct amdxdna_cmd *cmd = abo->mem.kva;
> > > +
> > > + return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header);
> > > +}
> > > +
> > > +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
> > > +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
> > > +
> > > +static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx)
> > > +{
> > > + return GENMASK(hwctx->start_col + hwctx->num_col - 1,
> > > + hwctx->start_col);
> > > +}
> > > +
> > > +void amdxdna_job_put(struct amdxdna_sched_job *job);
> > > +
> > > void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
> > > +
> > > +int amdxdna_cmd_submit(struct amdxdna_client *client,
> > > + u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
> > > + u32 hwctx_hdl, u64 *seq);
> > > +
> > > +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
> > > + u64 seq, u32 timeout);
> > > +
> > > int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> > > int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> > > int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> > > +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> > > +int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
> > > #endif /* _AMDXDNA_CTX_H_ */
> > > diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
> > > index 66373baa4600..091674a4bbfa 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_gem.c
> > > +++ b/drivers/accel/amdxdna/amdxdna_gem.c
> > > @@ -8,6 +8,7 @@
> > > #include <drm/drm_device.h>
> > > #include <drm/drm_gem.h>
> > > #include <drm/drm_gem_shmem_helper.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/iosys-map.h>
> > > #include <linux/vmalloc.h>
> > > diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> > > index 42b615394605..5139a9c96a91 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> > > +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
> > > @@ -3,10 +3,15 @@
> > > * Copyright (C) 2024, Advanced Micro Devices, Inc.
> > > */
> > > +#include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/drm_gem.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/completion.h>
> > > +#include "amdxdna_gem.h"
> > > #include "amdxdna_mailbox.h"
> > > #include "amdxdna_mailbox_helper.h"
> > > #include "amdxdna_pci_drv.h"
> > > diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> > > index 47ea79d4a021..5c1e863825e0 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
> > > +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> > > @@ -10,6 +10,7 @@
> > > #include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_ioctl.h>
> > > #include <drm/drm_managed.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/iommu.h>
> > > #include <linux/pci.h>
> > > @@ -64,6 +65,7 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
> > > goto unbind_sva;
> > > }
> > > mutex_init(&client->hwctx_lock);
> > > + init_srcu_struct(&client->hwctx_srcu);
> > > idr_init_base(&client->hwctx_idr, AMDXDNA_INVALID_CTX_HANDLE + 1);
> > > mutex_init(&client->mm_lock);
> > > @@ -93,6 +95,7 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp)
> > > XDNA_DBG(xdna, "closing pid %d", client->pid);
> > > idr_destroy(&client->hwctx_idr);
> > > + cleanup_srcu_struct(&client->hwctx_srcu);
> > > mutex_destroy(&client->hwctx_lock);
> > > mutex_destroy(&client->mm_lock);
> > > if (client->dev_heap)
> > > @@ -133,6 +136,9 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
> > > DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0),
> > > DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 0),
> > > DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
> > > + /* Exectuion */
> > > + DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
> > > + DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
> > > };
> > > static const struct file_operations amdxdna_fops = {
> > > diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> > > index 3dddde4ac12a..0324e73094b2 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
> > > +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> > > @@ -20,6 +20,7 @@ extern const struct drm_driver amdxdna_drm_drv;
> > > struct amdxdna_dev;
> > > struct amdxdna_gem_obj;
> > > struct amdxdna_hwctx;
> > > +struct amdxdna_sched_job;
> > > /*
> > > * struct amdxdna_dev_ops - Device hardware operation callbacks
> > > @@ -31,6 +32,8 @@ struct amdxdna_dev_ops {
> > > void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
> > > int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
> > > void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> > > + int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> > > + int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
> > > };
> > > /*
> > > @@ -88,6 +91,8 @@ struct amdxdna_client {
> > > struct list_head node;
> > > pid_t pid;
> > > struct mutex hwctx_lock; /* protect hwctx */
> > > + /* do NOT wait this srcu when hwctx_lock is hold */
> > > + struct srcu_struct hwctx_srcu;
> > > struct idr hwctx_idr;
> > > struct amdxdna_dev *xdna;
> > > struct drm_file *filp;
> > > diff --git a/drivers/accel/amdxdna/amdxdna_sysfs.c b/drivers/accel/amdxdna/amdxdna_sysfs.c
> > > index 668b94b92714..f27e4ee960a0 100644
> > > --- a/drivers/accel/amdxdna/amdxdna_sysfs.c
> > > +++ b/drivers/accel/amdxdna/amdxdna_sysfs.c
> > > @@ -3,9 +3,14 @@
> > > * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
> > > */
> > > +#include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/drm_gem_shmem_helper.h>
> > > #include <drm/drm_print.h>
> > > +#include <drm/gpu_scheduler.h>
> > > +#include <linux/types.h>
> > > +#include "amdxdna_gem.h"
> > > #include "amdxdna_pci_drv.h"
> > > static ssize_t vbnv_show(struct device *dev, struct device_attribute *attr, char *buf)
> > > diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
> > > index 720aab0ed7c4..f00c50461b09 100644
> > > --- a/drivers/accel/amdxdna/npu1_regs.c
> > > +++ b/drivers/accel/amdxdna/npu1_regs.c
> > > @@ -5,6 +5,7 @@
> > > #include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/sizes.h>
> > > #include "aie2_pci.h"
> > > diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c
> > > index f3ea18bcf294..00cb381031d2 100644
> > > --- a/drivers/accel/amdxdna/npu2_regs.c
> > > +++ b/drivers/accel/amdxdna/npu2_regs.c
> > > @@ -5,6 +5,7 @@
> > > #include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/sizes.h>
> > > #include "aie2_pci.h"
> > > diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
> > > index db61142f0d4e..b6dae9667cca 100644
> > > --- a/drivers/accel/amdxdna/npu4_regs.c
> > > +++ b/drivers/accel/amdxdna/npu4_regs.c
> > > @@ -5,6 +5,7 @@
> > > #include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/sizes.h>
> > > #include "aie2_pci.h"
> > > diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
> > > index debf4e95b9bb..bed1baf8e160 100644
> > > --- a/drivers/accel/amdxdna/npu5_regs.c
> > > +++ b/drivers/accel/amdxdna/npu5_regs.c
> > > @@ -5,6 +5,7 @@
> > > #include <drm/amdxdna_accel.h>
> > > #include <drm/drm_device.h>
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/sizes.h>
> > > #include "aie2_pci.h"
> > > diff --git a/include/trace/events/amdxdna.h b/include/trace/events/amdxdna.h
> > > index 33343d8f0622..c6cb2da7b706 100644
> > > --- a/include/trace/events/amdxdna.h
> > > +++ b/include/trace/events/amdxdna.h
> > > @@ -9,8 +9,49 @@
> > > #if !defined(_TRACE_AMDXDNA_H) || defined(TRACE_HEADER_MULTI_READ)
> > > #define _TRACE_AMDXDNA_H
> > > +#include <drm/gpu_scheduler.h>
> > > #include <linux/tracepoint.h>
> > > +TRACE_EVENT(amdxdna_debug_point,
> > > + TP_PROTO(const char *name, u64 number, const char *str),
> > > +
> > > + TP_ARGS(name, number, str),
> > > +
> > > + TP_STRUCT__entry(__string(name, name)
> > > + __field(u64, number)
> > > + __string(str, str)),
> > > +
> > > + TP_fast_assign(__assign_str(name);
> > > + __entry->number = number;
> > > + __assign_str(str);),
> > > +
> > > + TP_printk("%s:%llu %s", __get_str(name), __entry->number,
> > > + __get_str(str))
> > > +);
> > > +
> > > +TRACE_EVENT(xdna_job,
> > > + TP_PROTO(struct drm_sched_job *sched_job, const char *name, const char *str, u64 seq),
> > > +
> > > + TP_ARGS(sched_job, name, str, seq),
> > > +
> > > + TP_STRUCT__entry(__string(name, name)
> > > + __string(str, str)
> > > + __field(u64, fence_context)
> > > + __field(u64, fence_seqno)
> > > + __field(u64, seq)),
> > > +
> > > + TP_fast_assign(__assign_str(name);
> > > + __assign_str(str);
> > > + __entry->fence_context = sched_job->s_fence->finished.context;
> > > + __entry->fence_seqno = sched_job->s_fence->finished.seqno;
> > > + __entry->seq = seq;),
> > > +
> > > + TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld %s",
> > > + __entry->fence_context, __entry->fence_seqno,
> > > + __get_str(name), __entry->seq,
> > > + __get_str(str))
> > > +);
> > > +
> > > DECLARE_EVENT_CLASS(xdna_mbox_msg,
> > > TP_PROTO(char *name, u8 chann_id, u32 opcode, u32 msg_id),
> > > diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
> > > index 3792750834b2..08f3ec7146ab 100644
> > > --- a/include/uapi/drm/amdxdna_accel.h
> > > +++ b/include/uapi/drm/amdxdna_accel.h
> > > @@ -13,6 +13,7 @@
> > > extern "C" {
> > > #endif
> > > +#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
> > > #define AMDXDNA_INVALID_ADDR (~0UL)
> > > #define AMDXDNA_INVALID_CTX_HANDLE 0
> > > #define AMDXDNA_INVALID_BO_HANDLE 0
> > > @@ -29,6 +30,8 @@ enum amdxdna_drm_ioctl_id {
> > > DRM_AMDXDNA_CREATE_BO,
> > > DRM_AMDXDNA_GET_BO_INFO,
> > > DRM_AMDXDNA_SYNC_BO,
> > > + DRM_AMDXDNA_EXEC_CMD,
> > > + DRM_AMDXDNA_WAIT_CMD,
> > > };
> > > /**
> > > @@ -201,6 +204,54 @@ struct amdxdna_drm_sync_bo {
> > > __u64 size;
> > > };
> > > +enum amdxdna_cmd_type {
> > > + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
> > > + AMDXDNA_CMD_SUBMIT_DEPENDENCY,
> > > + AMDXDNA_CMD_SUBMIT_SIGNAL,
> > > +};
> > > +
> > > +/**
> > > + * struct amdxdna_drm_exec_cmd - Execute command.
> > > + * @ext: MBZ.
> > > + * @ext_flags: MBZ.
> > > + * @hwctx: Hardware context handle.
> > > + * @type: One of command type in enum amdxdna_cmd_type.
> > > + * @cmd_handles: Array of command handles or the command handle itself
> > > + * in case of just one.
> > > + * @args: Array of arguments for all command handles.
> > > + * @cmd_count: Number of command handles in the cmd_handles array.
> > > + * @arg_count: Number of arguments in the args array.
> > > + * @seq: Returned sequence number for this command.
> > > + */
> > > +struct amdxdna_drm_exec_cmd {
> > > + __u64 ext;
> > > + __u64 ext_flags;
> > > + __u32 hwctx;
> > > + __u32 type;
> > > + __u64 cmd_handles;
> > > + __u64 args;
> > > + __u32 cmd_count;
> > > + __u32 arg_count;
> > > + __u64 seq;
> > > +};
> > > +
> > > +/**
> > > + * struct amdxdna_drm_wait_cmd - Wait exectuion command.
> > > + *
> > > + * @hwctx: hardware context handle.
> > > + * @timeout: timeout in ms, 0 implies infinite wait.
> > > + * @seq: sequence number of the command returned by execute command.
> > > + *
> > > + * Wait a command specified by seq to be completed.
> > > + * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
> > > + * to submit a new command.
> > > + */
> > > +struct amdxdna_drm_wait_cmd {
> > > + __u32 hwctx;
> > > + __u32 timeout;
> > > + __u64 seq;
> > > +};
> > > +
> > > #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
> > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
> > > struct amdxdna_drm_create_hwctx)
> > > @@ -225,6 +276,14 @@ struct amdxdna_drm_sync_bo {
> > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \
> > > struct amdxdna_drm_sync_bo)
> > > +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
> > > + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \
> > > + struct amdxdna_drm_exec_cmd)
> > > +
> > > +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
> > > + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
> > > + struct amdxdna_drm_wait_cmd)
> > > +
> > > #if defined(__cplusplus)
> > > } /* extern c end */
> > > #endif
> > > --
> > > 2.34.1
> > >
© 2016 - 2026 Red Hat, Inc.