misc: fastrpc: fix context leak and hang on signal-interrupted invoke

[PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke

Posted by Anandu Krishnan E 2 weeks ago

fastrpc invokes work by sending an RPC message to the DSP and blocking
in wait_for_completion_interruptible() until the DSP responds. If a
signal arrives during this wait, the syscall returns -ERESTARTSYS and
the invoke context which holds the in-flight DMA buffers and
completion state is left stranded in fl->pending.

On the next syscall attempt (either auto-restarted by the kernel via
SA_RESTART or manually retried by user-space after EINTR), a fresh
context is allocated and the RPC message is re-sent to the DSP. This
has two consequences:

  - The original context leaks in fl->pending until the file is closed.
  - The DSP receives a duplicate invocation. If the DSP was mid-way
    through processing the first request and had issued a reverse RPC
    call back to the host, the retry sends a new forward request
    instead of the expected reverse-RPC response. The DSP thread
    waiting for that response is never woken, causing a hang.

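Fix this by saving the interrupted context to a new fl->interrupted
list on -ERESTARTSYS. When the same thread retries the invoke with a
matching sc, restore the context and jump directly to the wait,
skipping context allocation and message re-send. If the same thread
instead issues a user-space invoke with a different sc while its
interrupted context is still saved, the call fails with -EINVAL.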

Also drain fl->interrupted on process exit and complete any sleeping
contexts with -EPIPE when the rpmsg channel is removed.

Fixes: 387f625585d1 ("misc: fastrpc: handle interrupted contexts")
Cc: stable@kernel.org
Signed-off-by: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
---
 drivers/misc/fastrpc.c | 69 ++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index 1080f9acf70a..22d0b0592c10 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -280,7 +280,6 @@ struct fastrpc_channel_ctx {
 	struct fastrpc_device *secure_fdevice;
 	struct fastrpc_device *fdevice;
 	struct fastrpc_buf *remote_heap;
-	struct list_head invoke_interrupted_mmaps;
 	bool secure;
 	bool unsigned_support;
 	u64 dma_mask;
@@ -297,6 +296,7 @@ struct fastrpc_user {
 	struct list_head user;
 	struct list_head maps;
 	struct list_head pending;
+	struct list_head interrupted;
 	struct list_head mmaps;
 
 	struct fastrpc_channel_ctx *cctx;
@@ -542,6 +542,36 @@ static void fastrpc_context_put_wq(struct work_struct *work)
 	fastrpc_context_put(ctx);
 }
 
+static void fastrpc_context_save_interrupted(struct fastrpc_invoke_ctx *ctx)
+{
+	spin_lock(&ctx->fl->lock);
+	list_del(&ctx->node);
+	list_add_tail(&ctx->node, &ctx->fl->interrupted);
+	spin_unlock(&ctx->fl->lock);
+}
+
+static struct fastrpc_invoke_ctx *fastrpc_context_restore_interrupted(
+			struct fastrpc_user *fl, u32 sc)
+{
+	struct fastrpc_invoke_ctx *ctx = NULL, *ictx, *n;
+
+	spin_lock(&fl->lock);
+	list_for_each_entry_safe(ictx, n, &fl->interrupted, node) {
+		if (ictx->pid != current->pid)
+			continue;
+		if (ictx->sc != sc || ictx->fl != fl) {
+			spin_unlock(&fl->lock);
+			return ERR_PTR(-EINVAL);
+		}
+		ctx = ictx;
+		list_del(&ctx->node);
+		list_add_tail(&ctx->node, &fl->pending);
+		break;
+	}
+	spin_unlock(&fl->lock);
+	return ctx;
+}
+
 #define CMP(aa, bb) ((aa) == (bb) ? 0 : (aa) < (bb) ? -1 : 1)
 static int olaps_cmp(const void *a, const void *b)
 {
@@ -1197,8 +1227,6 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 				   struct fastrpc_invoke_args *args)
 {
 	struct fastrpc_invoke_ctx *ctx = NULL;
-	struct fastrpc_buf *buf, *b;
-
 	int err = 0;
 
 	if (!fl->sctx)
@@ -1212,6 +1240,14 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		return -EPERM;
 	}
 
+	if (!kernel) {
+		ctx = fastrpc_context_restore_interrupted(fl, sc);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+		if (ctx)
+			goto wait;
+	}
+
 	ctx = fastrpc_context_alloc(fl, kernel, sc, args);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
@@ -1227,6 +1263,7 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 	if (err)
 		goto bail;
 
+wait:
 	if (kernel) {
 		if (!wait_for_completion_timeout(&ctx->work, 10 * HZ))
 			err = -ETIMEDOUT;
@@ -1250,7 +1287,9 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		goto bail;
 
 bail:
-	if (err != -ERESTARTSYS && err != -ETIMEDOUT) {
+	if (ctx && err == -ERESTARTSYS) {
+		fastrpc_context_save_interrupted(ctx);
+	} else if (ctx && err != -ETIMEDOUT) {
 		/* We are done with this compute context */
 		spin_lock(&fl->lock);
 		list_del(&ctx->node);
@@ -1258,13 +1297,6 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		fastrpc_context_put(ctx);
 	}
 
-	if (err == -ERESTARTSYS) {
-		list_for_each_entry_safe(buf, b, &fl->mmaps, node) {
-			list_del(&buf->node);
-			list_add_tail(&buf->node, &fl->cctx->invoke_interrupted_mmaps);
-		}
-	}
-
 	if (err)
 		dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err);
 
@@ -1598,6 +1630,11 @@ static int fastrpc_device_release(struct inode *inode, struct file *file)
 		fastrpc_context_put(ctx);
 	}
 
+	list_for_each_entry_safe(ctx, n, &fl->interrupted, node) {
+		list_del(&ctx->node);
+		fastrpc_context_put(ctx);
+	}
+
 	list_for_each_entry_safe(map, m, &fl->maps, node)
 		fastrpc_map_put(map);
 
@@ -1637,6 +1674,7 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	spin_lock_init(&fl->lock);
 	mutex_init(&fl->mutex);
 	INIT_LIST_HEAD(&fl->pending);
+	INIT_LIST_HEAD(&fl->interrupted);
 	INIT_LIST_HEAD(&fl->maps);
 	INIT_LIST_HEAD(&fl->mmaps);
 	INIT_LIST_HEAD(&fl->user);
@@ -2435,7 +2473,6 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
 	rdev->dma_mask = &data->dma_mask;
 	dma_set_mask_and_coherent(rdev, DMA_BIT_MASK(32));
 	INIT_LIST_HEAD(&data->users);
-	INIT_LIST_HEAD(&data->invoke_interrupted_mmaps);
 	spin_lock_init(&data->lock);
 	idr_init(&data->ctx_idr);
 	data->domain_id = domain_id;
@@ -2467,13 +2504,16 @@ static void fastrpc_notify_users(struct fastrpc_user *user)
 		ctx->retval = -EPIPE;
 		complete(&ctx->work);
 	}
+	list_for_each_entry(ctx, &user->interrupted, node) {
+		ctx->retval = -EPIPE;
+		complete(&ctx->work);
+	}
 	spin_unlock(&user->lock);
 }
 
 static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 {
 	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
-	struct fastrpc_buf *buf, *b;
 	struct fastrpc_user *user;
 	unsigned long flags;
 
@@ -2490,9 +2530,6 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 	if (cctx->secure_fdevice)
 		misc_deregister(&cctx->secure_fdevice->miscdev);
 
-	list_for_each_entry_safe(buf, b, &cctx->invoke_interrupted_mmaps, node)
-		list_del(&buf->node);
-
 	if (cctx->remote_heap)
 		fastrpc_buf_free(cctx->remote_heap);
 
-- 
2.34.1

Re: [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke

Posted by Dmitry Baryshkov 2 weeks ago

On Mon, May 25, 2026 at 06:12:22PM +0530, Anandu Krishnan E wrote:
> fastrpc invokes work by sending an RPC message to the DSP and blocking
> in wait_for_completion_interruptible() until the DSP responds. If a
> signal arrives during this wait, the syscall returns -ERESTARTSYS and
> the invoke context which holds the in-flight DMA buffers and
> completion state is left stranded in fl->pending.
> 
> On the next syscall attempt (either auto-restarted by the kernel via
> SA_RESTART or manually retried by user-space after EINTR), a fresh
> context is allocated and the RPC message is re-sent to the DSP. This
> has two consequences:
> 
>   - The original context leaks in fl->pending until the file is closed.
>   - The DSP receives a duplicate invocation. If the DSP was mid-way
>     through processing the first request and had issued a reverse RPC
>     call back to the host, the retry sends a new forward request
>     instead of the expected reverse-RPC response. The DSP thread
>     waiting for that response is never woken, causing a hang.
> 
> Fix this by saving the interrupted context to a new fl->interrupted
> list on -ERESTARTSYS. When the same thread retries the invoke with a
> matching sc, restore the context and jump directly to the wait,
> skipping context allocation and message re-send.

What if the userspace doesn't honour -ERESTARTSYS and submits a new
workload?

> 
> Also drain fl->interrupted on process exit and complete any sleeping
> contexts with -EPIPE when the rpmsg channel is removed.
> 
> Fixes: 387f625585d1 ("misc: fastrpc: handle interrupted contexts")
> Cc: stable@kernel.org
> Signed-off-by: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
> ---
>  drivers/misc/fastrpc.c | 69 ++++++++++++++++++++++++++++++++----------
>  1 file changed, 53 insertions(+), 16 deletions(-)
> 

-- 
With best wishes
Dmitry

Re: [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke

Posted by Anandu Krishnan E 1 week, 6 days ago


On 25-05-2026 06:58 pm, Dmitry Baryshkov wrote:
> On Mon, May 25, 2026 at 06:12:22PM +0530, Anandu Krishnan E wrote:
>> fastrpc invokes work by sending an RPC message to the DSP and blocking
>> in wait_for_completion_interruptible() until the DSP responds. If a
>> signal arrives during this wait, the syscall returns -ERESTARTSYS and
>> the invoke context which holds the in-flight DMA buffers and
>> completion state is left stranded in fl->pending.
>>
>> On the next syscall attempt (either auto-restarted by the kernel via
>> SA_RESTART or manually retried by user-space after EINTR), a fresh
>> context is allocated and the RPC message is re-sent to the DSP. This
>> has two consequences:
>>
>>   - The original context leaks in fl->pending until the file is closed.
>>   - The DSP receives a duplicate invocation. If the DSP was mid-way
>>     through processing the first request and had issued a reverse RPC
>>     call back to the host, the retry sends a new forward request
>>     instead of the expected reverse-RPC response. The DSP thread
>>     waiting for that response is never woken, causing a hang.
>>
>> Fix this by saving the interrupted context to a new fl->interrupted
>> list on -ERESTARTSYS. When the same thread retries the invoke with a
>> matching sc, restore the context and jump directly to the wait,
>> skipping context allocation and message re-send.
> What if the userspace doesn't honour -ERESTARTSYS and submits a new
> workload?
If the same thread submits a new workload with a different sc while an
interrupted context is pending, fastrpc_context_restore_interrupted()
detects the mismatch (ictx->sc != sc) and returns -EINVAL. This blocks
the new submission and forces the thread to resolve its in-flight
interrupted context first.

This is intentional. The original DSP request is still in flight, and
the DSP may issue a reverse RPC back to the host as part of completing
it. If we allowed a new forward request from the same thread in
parallel, the DSP would receive an unexpected message instead of the
reverse-RPC response it is waiting for, causing a hang or other
unexpected behavior.

I am open to other suggestions as well. Could you please suggest the
correct way to handle these interrupted scenarios?
>
>> Also drain fl->interrupted on process exit and complete any sleeping
>> contexts with -EPIPE when the rpmsg channel is removed.
>>
>> Fixes: 387f625585d1 ("misc: fastrpc: handle interrupted contexts")
>> Cc: stable@kernel.org
>> Signed-off-by: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
>> ---
>>  drivers/misc/fastrpc.c | 69 ++++++++++++++++++++++++++++++++----------
>>  1 file changed, 53 insertions(+), 16 deletions(-)
>>