For any remote call to the DSP, after sending an invocation message,
the fastRPC driver waits for a glink response, and during this time
the CPU can enter low power modes. Add polling mode support, with
which the fastRPC driver polls continuously on a memory location
after sending a message to the remote subsystem. This eliminates CPU
wakeup and scheduling latencies and reduces fastRPC overhead. With
this change, the DSP always sends a glink response, which gets
ignored if polling mode did not time out.
Signed-off-by: Ekansh Gupta <quic_ekangupt@quicinc.com>
---
drivers/misc/fastrpc.c | 122 +++++++++++++++++++++++++++++++++---
include/uapi/misc/fastrpc.h | 3 +-
2 files changed, 114 insertions(+), 11 deletions(-)
diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index cfacee0dded5..257a741af115 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -19,6 +19,7 @@
#include <linux/rpmsg.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
+#include <linux/delay.h>
#include <linux/firmware/qcom/qcom_scm.h>
#include <uapi/misc/fastrpc.h>
#include <linux/of_reserved_mem.h>
@@ -38,6 +39,7 @@
#define FASTRPC_CTX_MAX (256)
#define FASTRPC_INIT_HANDLE 1
#define FASTRPC_DSP_UTILITIES_HANDLE 2
+#define FASTRPC_MAX_STATIC_HANDLE (20)
#define FASTRPC_CTXID_MASK (0xFF0)
#define INIT_FILELEN_MAX (2 * 1024 * 1024)
#define INIT_FILE_NAMELEN_MAX (128)
@@ -106,6 +108,19 @@
#define miscdev_to_fdevice(d) container_of(d, struct fastrpc_device, miscdev)
+/* Poll response number from remote processor for call completion */
+#define FASTRPC_POLL_RESPONSE (0xdecaf)
+/* timeout in us for polling until memory barrier */
+#define FASTRPC_POLL_TIME_MEM_UPDATE (500)
+
+/* Response types supported for RPC calls */
+enum fastrpc_response_flags {
+ /* normal job completion glink response */
+ NORMAL_RESPONSE = 0,
+ /* process updates poll memory instead of glink response */
+ POLL_MODE = 1,
+};
+
static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp",
"sdsp", "cdsp", "cdsp1" };
struct fastrpc_phy_page {
@@ -238,9 +253,16 @@ struct fastrpc_invoke_ctx {
u32 sc;
u64 *fdlist;
u32 *crclist;
+ u32 *poll;
void __user *crc;
u64 ctxid;
u64 msg_sz;
+ /* Threads poll for specified timeout and fall back to glink wait */
+ u64 poll_timeout;
+ /* work done status flag */
+ bool is_work_done;
+ /* response flags from remote processor */
+ enum fastrpc_response_flags rsp_flags;
struct kref refcount;
struct list_head node; /* list of ctxs */
struct completion work;
@@ -258,6 +280,7 @@ struct fastrpc_invoke_ctx {
struct fastrpc_ctx_args {
struct fastrpc_invoke_args *args;
void __user *crc;
+ u64 poll_timeout;
};
struct fastrpc_session_ctx {
@@ -619,11 +642,14 @@ static struct fastrpc_invoke_ctx *fastrpc_context_alloc(
fastrpc_channel_ctx_get(cctx);
ctx->crc = cargs->crc;
+ ctx->poll_timeout = cargs->poll_timeout;
ctx->sc = sc;
ctx->retval = -1;
ctx->pid = current->pid;
ctx->client_id = user->client_id;
ctx->cctx = cctx;
+ ctx->rsp_flags = NORMAL_RESPONSE;
+ ctx->is_work_done = false;
init_completion(&ctx->work);
INIT_WORK(&ctx->put_work, fastrpc_context_put_wq);
@@ -882,7 +908,8 @@ static int fastrpc_get_meta_size(struct fastrpc_invoke_ctx *ctx)
sizeof(struct fastrpc_invoke_buf) +
sizeof(struct fastrpc_phy_page)) * ctx->nscalars +
sizeof(u64) * FASTRPC_MAX_FDLIST +
- sizeof(u32) * FASTRPC_MAX_CRCLIST;
+ sizeof(u32) * FASTRPC_MAX_CRCLIST +
+ sizeof(u32);
return size;
}
@@ -975,6 +1002,8 @@ static int fastrpc_get_args(u32 kernel, struct fastrpc_invoke_ctx *ctx)
pages = fastrpc_phy_page_start(list, ctx->nscalars);
ctx->fdlist = (u64 *)(pages + ctx->nscalars);
ctx->crclist = (u32 *)(ctx->fdlist + FASTRPC_MAX_FDLIST);
+ ctx->poll = (u32 *)(ctx->crclist + FASTRPC_MAX_CRCLIST);
+
args = (uintptr_t)ctx->buf->virt + metalen;
rlen = pkt_size - metalen;
ctx->rpra = rpra;
@@ -1145,6 +1174,72 @@ static int fastrpc_invoke_send(struct fastrpc_session_ctx *sctx,
}
+static int poll_for_remote_response(struct fastrpc_invoke_ctx *ctx, u64 timeout)
+{
+ int err = -EIO, i, j;
+
+ /* poll on memory for DSP response. Return failure on timeout */
+ for (i = 0, j = 0; i < timeout; i++, j++) {
+ if (*ctx->poll == FASTRPC_POLL_RESPONSE) {
+ err = 0;
+ ctx->is_work_done = true;
+ ctx->retval = 0;
+ break;
+ }
+ if (j == FASTRPC_POLL_TIME_MEM_UPDATE) {
+ /* make sure that all poll memory writes by DSP are seen by CPU */
+ dma_rmb();
+ j = 0;
+ }
+ udelay(1);
+ }
+ return err;
+}
+
+static inline int fastrpc_wait_for_response(struct fastrpc_invoke_ctx *ctx,
+ u32 kernel)
+{
+ int err = 0;
+
+ if (kernel) {
+ if (!wait_for_completion_timeout(&ctx->work, 10 * HZ))
+ err = -ETIMEDOUT;
+ } else {
+ err = wait_for_completion_interruptible(&ctx->work);
+ }
+
+ return err;
+}
+
+static int fastrpc_wait_for_completion(struct fastrpc_invoke_ctx *ctx,
+ u32 kernel)
+{
+ int err;
+
+ do {
+ switch (ctx->rsp_flags) {
+ case NORMAL_RESPONSE:
+ err = fastrpc_wait_for_response(ctx, kernel);
+ if (err || ctx->is_work_done)
+ return err;
+ break;
+ case POLL_MODE:
+ err = poll_for_remote_response(ctx, ctx->poll_timeout);
+ /* If polling timed out, move to normal response mode */
+ if (err)
+ ctx->rsp_flags = NORMAL_RESPONSE;
+ break;
+ default:
+ err = -EBADR;
+ dev_dbg(ctx->fl->sctx->dev,
+ "unsupported response type:0x%x\n", ctx->rsp_flags);
+ break;
+ }
+ } while (!ctx->is_work_done);
+
+ return err;
+}
+
static int fastrpc_internal_invoke(struct fastrpc_user *fl, u32 kernel,
u32 handle, u32 sc,
struct fastrpc_ctx_args *cargs)
@@ -1180,16 +1275,20 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl, u32 kernel,
if (err)
goto bail;
- if (kernel) {
- if (!wait_for_completion_timeout(&ctx->work, 10 * HZ))
- err = -ETIMEDOUT;
- } else {
- err = wait_for_completion_interruptible(&ctx->work);
- }
+ if (ctx->poll_timeout != 0 && handle > FASTRPC_MAX_STATIC_HANDLE
+ && fl->pd == USER_PD)
+ ctx->rsp_flags = POLL_MODE;
+ err = fastrpc_wait_for_completion(ctx, kernel);
if (err)
goto bail;
+ if (!ctx->is_work_done) {
+ err = -ETIMEDOUT;
+ dev_dbg(fl->sctx->dev, "Invalid workdone state for handle 0x%x, sc 0x%x\n",
+ handle, sc);
+ goto bail;
+ }
/* make sure that all memory writes by DSP are seen by CPU */
dma_rmb();
/* populate all the output buffers with results */
@@ -1769,7 +1868,7 @@ static int fastrpc_invokev2(struct fastrpc_user *fl, char __user *argp)
return -EFAULT;
/* Check if all reserved fields are zero */
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 14; i++) {
if (inv2.reserved[i] != 0)
return -EINVAL;
}
@@ -1779,6 +1878,7 @@ static int fastrpc_invokev2(struct fastrpc_user *fl, char __user *argp)
return -ENOMEM;
cargs->crc = (void __user *)(uintptr_t)inv2.crc;
+ cargs->poll_timeout = inv2.poll_timeout;
err = fastrpc_remote_invoke(fl, &inv2.inv, cargs);
kfree(cargs);
@@ -2581,12 +2681,14 @@ static int fastrpc_rpmsg_callback(struct rpmsg_device *rpdev, void *data,
ctx = idr_find(&cctx->ctx_idr, ctxid);
spin_unlock_irqrestore(&cctx->lock, flags);
+ /* Ignore this failure as context returned will be NULL for polling mode */
if (!ctx) {
- dev_err(&rpdev->dev, "No context ID matches response\n");
- return -ENOENT;
+ dev_dbg(&rpdev->dev, "No context ID matches response\n");
+ return 0;
}
ctx->retval = rsp->retval;
+ ctx->is_work_done = true;
complete(&ctx->work);
/*
diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h
index 406b80555d41..1920c537bbbf 100644
--- a/include/uapi/misc/fastrpc.h
+++ b/include/uapi/misc/fastrpc.h
@@ -84,7 +84,8 @@ struct fastrpc_invoke {
struct fastrpc_invoke_v2 {
struct fastrpc_invoke inv;
__u64 crc;
- __u32 reserved[16];
+ __u64 poll_timeout;
+ __u32 reserved[14];
};
struct fastrpc_init_create {
--
2.34.1
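
For reference, a minimal userspace sketch of how the extended struct
fastrpc_invoke_v2 might be driven. The FASTRPC_IOCTL_INVOKEV2 command name
and number below are assumptions (they are not shown in this patch), and the
driver consumes poll_timeout as iterations of udelay(1), i.e. roughly
microseconds:

#include <string.h>
#include <sys/ioctl.h>
#include <misc/fastrpc.h>	/* struct fastrpc_invoke, struct fastrpc_invoke_v2 */

/* Hypothetical command number, for illustration only; the real v2 invoke
 * ioctl definition is introduced elsewhere in this series. */
#ifndef FASTRPC_IOCTL_INVOKEV2
#define FASTRPC_IOCTL_INVOKEV2	_IOWR('R', 14, struct fastrpc_invoke_v2)
#endif

static int invoke_with_polling(int fd, const struct fastrpc_invoke *inv)
{
	struct fastrpc_invoke_v2 inv2;

	memset(&inv2, 0, sizeof(inv2));	/* reserved[] must stay zero */
	inv2.inv = *inv;		/* handle, sc, args as for a v1 invoke */
	inv2.poll_timeout = 100;	/* poll ~100 us, then fall back to the glink wait */

	return ioctl(fd, FASTRPC_IOCTL_INVOKEV2, &inv2);
}

Note that, per the check added in fastrpc_internal_invoke(), polling only
takes effect for USER_PD calls to handles above FASTRPC_MAX_STATIC_HANDLE;
other calls silently stay on the normal glink response path.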
On Mon, Jan 27, 2025 at 10:12:38AM +0530, Ekansh Gupta wrote:
> For any remote call to DSP, after sending an invocation message,
> fastRPC driver waits for glink response and during this time the
> CPU can go into low power modes. Adding a polling mode support
> with which fastRPC driver will poll continuously on a memory
> after sending a message to remote subsystem which will eliminate
> CPU wakeup and scheduling latencies and reduce fastRPC overhead.
> With this change, DSP always sends a glink response which will
> get ignored if polling mode didn't time out.
Is there a chance to implement an actual async I/O protocol with the help
of the poll() call instead of hiding the polling / wait inside the
invoke2?
--
With best wishes
Dmitry
On 1/29/2025 4:59 AM, Dmitry Baryshkov wrote:
> On Mon, Jan 27, 2025 at 10:12:38AM +0530, Ekansh Gupta wrote:
>> For any remote call to DSP, after sending an invocation message,
>> fastRPC driver waits for glink response and during this time the
>> CPU can go into low power modes. Adding a polling mode support
>> with which fastRPC driver will poll continuously on a memory
>> after sending a message to remote subsystem which will eliminate
>> CPU wakeup and scheduling latencies and reduce fastRPC overhead.
>> With this change, DSP always sends a glink response which will
>> get ignored if polling mode didn't time out.
> Is there a chance to implement actual async I/O protocol with the help
> of the poll() call instead of hiding the polling / wait inside the
> invoke2?
This design is based on the implementation on DSP firmware as of today:
Call flow: https://github.com/quic-ekangupt/fastrpc/blob/invokev2/Docs/invoke_v2.md#5-polling-mode
Can you please give some reference to the async I/O protocol that you've
suggested? I can check if it can be implemented here.
--ekansh
On Wed, Jan 29, 2025 at 11:12:16AM +0530, Ekansh Gupta wrote:
> Can you please give some reference to the async I/O protocol that you've
> suggested? I can check if it can be implemented here.

As with the typical poll() call implementation:
- write some data using ioctl
- call poll() / select() to wait for the data to be processed
- read data using another ioctl

Getting back to your patch. From your commit message it is not clear
which SoCs support this feature. Reminding you that we are supporting
all kinds of platforms, including the ones that are EoLed by Qualcomm.

Next, you wrote that in-driver polling eliminates CPU wakeup and
scheduling. However, this should also increase power consumption. Is
there any measurable difference in the latencies, given that you already
use the ioctl() syscall and so there will be two context switches anyway?
What is the actual impact?

--
With best wishes
Dmitry
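
A minimal sketch of the submit / poll() / collect flow described above. Both
ioctl commands and their numbers here are purely hypothetical, since no such
asynchronous fastrpc UAPI exists today:

#include <poll.h>
#include <sys/ioctl.h>

/* Placeholder command numbers, for illustration only. */
#define HYPOTHETICAL_IOCTL_SUBMIT	0x1000UL
#define HYPOTHETICAL_IOCTL_COLLECT	0x1001UL

static int async_invoke(int fd, void *request, void *result)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int ret;

	/* 1. write some data using an ioctl: queue the invocation */
	ret = ioctl(fd, HYPOTHETICAL_IOCTL_SUBMIT, request);
	if (ret)
		return ret;

	/* 2. poll()/select(): sleep until the driver reports completion */
	ret = poll(&pfd, 1, -1);
	if (ret < 0)
		return ret;

	/* 3. read the processed data back with another ioctl */
	return ioctl(fd, HYPOTHETICAL_IOCTL_COLLECT, result);
}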
On 1/29/2025 4:10 PM, Dmitry Baryshkov wrote:
> Getting back to your patch. From your commit message it is not clear
> which SoCs support this feature. Reminding you that we are supporting
> all kinds of platforms, including the ones that are EoLed by Qualcomm.
>
> Next, you wrote that in-driver polling eliminates CPU wakeup and
> scheduling. However, this should also increase power consumption. Is
> there any measurable difference in the latencies, given that you already
> use the ioctl() syscall and so there will be two context switches anyway?
> What is the actual impact?

Hi Dmitry,

Thank you for your feedback.

I'm currently reworking this change and adding testing details. Regarding
the SoC support, I'll add all the necessary information. For now, with
in-driver polling, we are seeing significant performance improvements for
calls with different sized buffers. On a platform that supports polling,
I've observed an ~80us improvement in latency. You can find more details
in the test results here:
https://github.com/quic/fastrpc/pull/134/files#diff-7dbc6537cd3ade7fea5766229cf585db585704e02730efd72e7afc9b148e28ed

Regarding your concerns about power consumption: while in-driver polling
eliminates CPU wakeup and scheduling, it does increase power consumption.
However, the performance gains seem to outweigh this increase.

Do you think the poll implementation that you suggested above could
provide similar improvements?

Thanks,
Ekansh
On Thu, Mar 20, 2025 at 07:19:31PM +0530, Ekansh Gupta wrote:
> I'm currently reworking this change and adding testing details. Regarding
> the SoC support, I'll add all the necessary information.

Please make sure that both the kernel and the userspace can handle the
'non-supported' case properly.

> For now, with in-driver polling, we are seeing significant performance
> improvements for calls with different sized buffers. On a platform that
> supports polling, I've observed an ~80us improvement in latency. You can
> find more details in the test results here:
> https://github.com/quic/fastrpc/pull/134/files#diff-7dbc6537cd3ade7fea5766229cf585db585704e02730efd72e7afc9b148e28ed

Does the improvement come from the CPU not going to idle or from the
glink response processing?

> Regarding your concerns about power consumption: while in-driver polling
> eliminates CPU wakeup and scheduling, it does increase power consumption.
> However, the performance gains seem to outweigh this increase.
>
> Do you think the poll implementation that you suggested above could
> provide similar improvements?

No, I agree here. I was more focused on userspace polling rather than
hw polling.

--
With best wishes
Dmitry
On 3/20/2025 7:45 PM, Dmitry Baryshkov wrote:
> Please make sure that both the kernel and the userspace can handle the
> 'non-supported' case properly.

Yes, I will include changes to handle this in both userspace and kernel.

> Does the improvement come from the CPU not going to idle or from the
> glink response processing?

Although both contribute to the performance improvement, the major
improvement comes from the CPU not going to the idle state.

Thanks,
Ekansh
On 3/20/2025 9:27 PM, Ekansh Gupta wrote:
> Yes, I will include changes to handle this in both userspace and kernel.

I am seeking additional suggestions on handling "non-supported" cases
before making the changes.

Userspace: To enable DSP-side polling, a remote call is made as defined
in the DSP image. If this call fails, polling mode will not be enabled
from userspace.

Kernel: Since this is a DSP-specific feature, I plan to add a devicetree
property, such as "qcom,polling-supported", under the fastrpc node if the
DSP supports polling mode.

Does this approach seem appropriate, or is there a better way to handle
this?

Thanks,
Ekansh
On Fri, 21 Mar 2025 at 12:18, Ekansh Gupta <ekansh.gupta@oss.qualcomm.com> wrote:
> Userspace: To enable DSP-side polling, a remote call is made as defined
> in the DSP image. If this call fails, polling mode will not be enabled
> from userspace.

No. Instead, userspace should check with the kernel which capabilities
are supported. Don't perform API calls that you already know can fail.

> Kernel: Since this is a DSP-specific feature, I plan to add a devicetree
> property, such as "qcom,polling-supported", under the fastrpc node if the
> DSP supports polling mode.

This doesn't sound like a logical solution. The kernel already knows the
hardware that it is running on. As such, there should be no need to
further describe the hardware in DT. If the DSP firmware can report its
capabilities, use that. If not, extend the schema to add an SoC-specific
compatible string. As a last resort we can use of_machine_is_compatible().

--
With best wishes
Dmitry
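
A sketch of the last-resort option mentioned above: of_machine_is_compatible()
is an existing kernel helper, but the compatible string and the idea of gating
POLL_MODE on such a check are assumptions, not something this series
implements:

#include <linux/of.h>

/* Illustrative only: prefer a DSP firmware capability query when the
 * firmware can report one; fall back to an explicit SoC allow-list. */
static bool fastrpc_soc_supports_dsp_polling(void)
{
	return of_machine_is_compatible("qcom,sm8550");	/* example SoC only */
}

fastrpc_internal_invoke() could then require this check, in addition to a
non-zero poll_timeout, before switching ctx->rsp_flags to POLL_MODE.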
On 3/21/2025 4:47 PM, Dmitry Baryshkov wrote:
> This doesn't sound like a logical solution. The kernel already knows the
> hardware that it is running on. As such, there should be no need to
> further describe the hardware in DT. If the DSP firmware can report its
> capabilities, use that. If not, extend the schema to add an SoC-specific
> compatible string. As a last resort we can use of_machine_is_compatible().

Thanks for your suggestions. I'll try these out.

--Ekansh