On buffer starvation the DMA is marked IDLE, and the stale data in the
internal FIFOs gets drained only on the next VIDIOC_QBUF call from the
userspace. This approach works fine for a single stream case.
But in multistream scenarios, buffer starvation for one stream i.e. one
virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can
stall the pipeline for all other virtual channels, even if buffers are
available for them.
This patch introduces a new architecture that continuously drains data
from the shared HW FIFO into a small (32KiB) buffer if no buffers are made
available to the driver from userspace. This ensures independence
between different streams, where a slower downstream element for one
camera does not block streaming for other cameras.
Additionally, after a drain is done for a VC, the next frame will be a
partial frame, as a portion of its data will have already been drained
before a valid buffer is queued by user space to the driver.
Use a wait-for-completion barrier to make sure the shared hardware FIFO
is cleared of the data at the end of stream after the source has stopped
sending data.
Reviewed-by: Jai Luthra <jai.luthra@ideasonboard.com>
Reviewed-by: Yemike Abhilash Chandra <y-abhilashchandra@ti.com>
Signed-off-by: Rishikesh Donadkar <r-donadkar@ti.com>
---
.../platform/ti/j721e-csi2rx/j721e-csi2rx.c | 112 ++++++++----------
1 file changed, 50 insertions(+), 62 deletions(-)
diff --git a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
index fa6152464d4b6..e713293696eb1 100644
--- a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
+++ b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
@@ -82,7 +82,6 @@ struct ti_csi2rx_buffer {
enum ti_csi2rx_dma_state {
TI_CSI2RX_DMA_STOPPED, /* Streaming not started yet. */
- TI_CSI2RX_DMA_IDLE, /* Streaming but no pending DMA operation. */
TI_CSI2RX_DMA_ACTIVE, /* Streaming and pending DMA operation. */
};
@@ -109,6 +108,7 @@ struct ti_csi2rx_ctx {
struct v4l2_format v_fmt;
struct ti_csi2rx_dma dma;
struct media_pad pad;
+ struct completion drain_complete;
u32 sequence;
u32 idx;
u32 vc;
@@ -251,6 +251,10 @@ static const struct ti_csi2rx_fmt ti_csi2rx_formats[] = {
static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
struct ti_csi2rx_buffer *buf);
+/* Forward declarations needed by ti_csi2rx_drain_callback. */
+static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx);
+static int ti_csi2rx_dma_submit_pending(struct ti_csi2rx_ctx *ctx);
+
static const struct ti_csi2rx_fmt *find_format_by_fourcc(u32 pixelformat)
{
unsigned int i;
@@ -617,9 +621,32 @@ static void ti_csi2rx_setup_shim(struct ti_csi2rx_ctx *ctx)
static void ti_csi2rx_drain_callback(void *param)
{
- struct completion *drain_complete = param;
+ struct ti_csi2rx_ctx *ctx = param;
+ struct ti_csi2rx_dma *dma = &ctx->dma;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dma->lock, flags);
+
+ if (dma->state == TI_CSI2RX_DMA_STOPPED) {
+ complete(&ctx->drain_complete);
+ spin_unlock_irqrestore(&dma->lock, flags);
+ return;
+ }
- complete(drain_complete);
+ /*
+ * If dma->queue is empty, it indicates that no buffer has been
+ * provided by user space. In this case, initiate a transactions
+ * to drain the DMA. Since one drain of size DRAIN_BUFFER_SIZE
+ * will be done here, the subsequent frame will be a
+ * partial frame, with a size of frame_size - DRAIN_BUFFER_SIZE
+ */
+ if (list_empty(&dma->queue)) {
+ if (ti_csi2rx_drain_dma(ctx))
+ dev_warn(ctx->csi->dev, "DMA drain failed\n");
+ } else {
+ ti_csi2rx_dma_submit_pending(ctx);
+ }
+ spin_unlock_irqrestore(&dma->lock, flags);
}
/*
@@ -637,12 +664,9 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
{
struct ti_csi2rx_dev *csi = ctx->csi;
struct dma_async_tx_descriptor *desc;
- struct completion drain_complete;
dma_cookie_t cookie;
int ret;
- init_completion(&drain_complete);
-
desc = dmaengine_prep_slave_single(ctx->dma.chan, csi->drain.paddr,
csi->drain.len, DMA_DEV_TO_MEM,
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
@@ -652,7 +676,7 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
}
desc->callback = ti_csi2rx_drain_callback;
- desc->callback_param = &drain_complete;
+ desc->callback_param = ctx;
cookie = dmaengine_submit(desc);
ret = dma_submit_error(cookie);
@@ -661,13 +685,6 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
dma_async_issue_pending(ctx->dma.chan);
- if (!wait_for_completion_timeout(&drain_complete,
- msecs_to_jiffies(DRAIN_TIMEOUT_MS))) {
- dmaengine_terminate_sync(ctx->dma.chan);
- dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
- ret = -ETIMEDOUT;
- goto out;
- }
out:
return ret;
}
@@ -716,9 +733,11 @@ static void ti_csi2rx_dma_callback(void *param)
ti_csi2rx_dma_submit_pending(ctx);
- if (list_empty(&dma->submitted))
- dma->state = TI_CSI2RX_DMA_IDLE;
-
+ if (list_empty(&dma->submitted)) {
+ if (ti_csi2rx_drain_dma(ctx))
+ dev_warn(ctx->csi->dev,
+ "DMA drain failed on one of the transactions\n");
+ }
spin_unlock_irqrestore(&dma->lock, flags);
}
@@ -754,6 +773,7 @@ static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
{
struct ti_csi2rx_dma *dma = &ctx->dma;
+ struct ti_csi2rx_dev *csi = ctx->csi;
enum ti_csi2rx_dma_state state;
unsigned long flags;
int ret;
@@ -763,6 +783,8 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
dma->state = TI_CSI2RX_DMA_STOPPED;
spin_unlock_irqrestore(&dma->lock, flags);
+ init_completion(&ctx->drain_complete);
+
if (state != TI_CSI2RX_DMA_STOPPED) {
/*
* Normal DMA termination does not clean up pending data on
@@ -771,11 +793,20 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
* enforced before terminating DMA.
*/
ret = ti_csi2rx_drain_dma(ctx);
- if (ret && ret != -ETIMEDOUT)
+ if (ret)
dev_warn(ctx->csi->dev,
"Failed to drain DMA. Next frame might be bogus\n");
}
+ /* We wait for the drain to complete so that the stream stops
+ * cleanly, making sure the shared hardware FIFO is cleared of
+ * data from the current stream. No more data will be coming from
+ * the source after this.
+ */
+ if (!wait_for_completion_timeout(&ctx->drain_complete,
+ msecs_to_jiffies(DRAIN_TIMEOUT_MS)))
+ dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
+
ret = dmaengine_terminate_sync(ctx->dma.chan);
if (ret)
dev_err(ctx->csi->dev, "Failed to stop DMA: %d\n", ret);
@@ -838,57 +869,14 @@ static void ti_csi2rx_buffer_queue(struct vb2_buffer *vb)
struct ti_csi2rx_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
struct ti_csi2rx_buffer *buf;
struct ti_csi2rx_dma *dma = &ctx->dma;
- bool restart_dma = false;
unsigned long flags = 0;
- int ret;
buf = container_of(vb, struct ti_csi2rx_buffer, vb.vb2_buf);
buf->ctx = ctx;
spin_lock_irqsave(&dma->lock, flags);
- /*
- * Usually the DMA callback takes care of queueing the pending buffers.
- * But if DMA has stalled due to lack of buffers, restart it now.
- */
- if (dma->state == TI_CSI2RX_DMA_IDLE) {
- /*
- * Do not restart DMA with the lock held because
- * ti_csi2rx_drain_dma() might block for completion.
- * There won't be a race on queueing DMA anyway since the
- * callback is not being fired.
- */
- restart_dma = true;
- dma->state = TI_CSI2RX_DMA_ACTIVE;
- } else {
- list_add_tail(&buf->list, &dma->queue);
- }
+ list_add_tail(&buf->list, &dma->queue);
spin_unlock_irqrestore(&dma->lock, flags);
-
- if (restart_dma) {
- /*
- * Once frames start dropping, some data gets stuck in the DMA
- * pipeline somewhere. So the first DMA transfer after frame
- * drops gives a partial frame. This is obviously not useful to
- * the application and will only confuse it. Issue a DMA
- * transaction to drain that up.
- */
- ret = ti_csi2rx_drain_dma(ctx);
- if (ret && ret != -ETIMEDOUT)
- dev_warn(ctx->csi->dev,
- "Failed to drain DMA. Next frame might be bogus\n");
-
- spin_lock_irqsave(&dma->lock, flags);
- ret = ti_csi2rx_start_dma(ctx, buf);
- if (ret) {
- vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
- dma->state = TI_CSI2RX_DMA_IDLE;
- spin_unlock_irqrestore(&dma->lock, flags);
- dev_err(ctx->csi->dev, "Failed to start DMA: %d\n", ret);
- } else {
- list_add_tail(&buf->list, &dma->submitted);
- spin_unlock_irqrestore(&dma->lock, flags);
- }
- }
}
static int ti_csi2rx_get_stream(struct ti_csi2rx_ctx *ctx)
--
2.34.1
Hi,
On 30/12/2025 10:32, Rishikesh Donadkar wrote:
> On buffer starvation the DMA is marked IDLE, and the stale data in the
> internal FIFOs gets drained only on the next VIDIOC_QBUF call from the
> userspace. This approach works fine for a single stream case.
>
> But in multistream scenarios, buffer starvation for one stream i.e. one
> virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can
> stall the pipeline for all other virtual channels, even if buffers are
> available for them.
One stream is filtered based on VC & DT, but the above only mentions VC.
And then later uses VC when referring to the stream. I think you can
drop the VC parts, and just talk about streams.
> This patch introduces a new architecture, that continuously drains data
> from the shared HW FIFO into a small (32KiB) buffer if no buffers are made
> available to the driver from the userspace. This ensures independence
> between different streams, where a slower downstream element for one
> camera does not block streaming for other cameras.
>
> Additionally, after a drain is done for a VC, the next frame will be a
> partial frame, as a portion of its data will have already been drained
> before a valid buffer is queued by user space to the driver.
This makes it sound like we drain a single 32KB piece. But won't we continue
draining that stream until the stream is stopped or the user provides a
buffer?
Also, does the DMA not offer us ways to drain a full frame? There's no
way to e.g. set the DMA TX increment to 0, so that it would just write
to a single location in memory? Or just set the target to void.
Tomi
> Use wait for completion barrier to make sure the shared hardware FIFO
> is cleared of the data at the end of stream after the source has stopped
> sending data.
>
> Reviewed-by: Jai Luthra <jai.luthra@ideasonboard.com>
> Reviewed-by: Yemike Abhilash Chandra <y-abhilashchandra@ti.com>
> Signed-off-by: Rishikesh Donadkar <r-donadkar@ti.com>
> ---
> .../platform/ti/j721e-csi2rx/j721e-csi2rx.c | 112 ++++++++----------
> 1 file changed, 50 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
> index fa6152464d4b6..e713293696eb1 100644
> --- a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
> +++ b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
> @@ -82,7 +82,6 @@ struct ti_csi2rx_buffer {
>
> enum ti_csi2rx_dma_state {
> TI_CSI2RX_DMA_STOPPED, /* Streaming not started yet. */
> - TI_CSI2RX_DMA_IDLE, /* Streaming but no pending DMA operation. */
> TI_CSI2RX_DMA_ACTIVE, /* Streaming and pending DMA operation. */
> };
>
> @@ -109,6 +108,7 @@ struct ti_csi2rx_ctx {
> struct v4l2_format v_fmt;
> struct ti_csi2rx_dma dma;
> struct media_pad pad;
> + struct completion drain_complete;
> u32 sequence;
> u32 idx;
> u32 vc;
> @@ -251,6 +251,10 @@ static const struct ti_csi2rx_fmt ti_csi2rx_formats[] = {
> static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
> struct ti_csi2rx_buffer *buf);
>
> +/* Forward declarations needed by ti_csi2rx_drain_callback. */
> +static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx);
> +static int ti_csi2rx_dma_submit_pending(struct ti_csi2rx_ctx *ctx);
> +
> static const struct ti_csi2rx_fmt *find_format_by_fourcc(u32 pixelformat)
> {
> unsigned int i;
> @@ -617,9 +621,32 @@ static void ti_csi2rx_setup_shim(struct ti_csi2rx_ctx *ctx)
>
> static void ti_csi2rx_drain_callback(void *param)
> {
> - struct completion *drain_complete = param;
> + struct ti_csi2rx_ctx *ctx = param;
> + struct ti_csi2rx_dma *dma = &ctx->dma;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&dma->lock, flags);
> +
> + if (dma->state == TI_CSI2RX_DMA_STOPPED) {
> + complete(&ctx->drain_complete);
> + spin_unlock_irqrestore(&dma->lock, flags);
> + return;
> + }
>
> - complete(drain_complete);
> + /*
> + * If dma->queue is empty, it indicates that no buffer has been
> + * provided by user space. In this case, initiate a transactions
> + * to drain the DMA. Since one drain of size DRAIN_BUFFER_SIZE
> + * will be done here, the subsequent frame will be a
> + * partial frame, with a size of frame_size - DRAIN_BUFFER_SIZE
> + */
> + if (list_empty(&dma->queue)) {
> + if (ti_csi2rx_drain_dma(ctx))
> + dev_warn(ctx->csi->dev, "DMA drain failed\n");
> + } else {
> + ti_csi2rx_dma_submit_pending(ctx);
> + }
> + spin_unlock_irqrestore(&dma->lock, flags);
> }
>
> /*
> @@ -637,12 +664,9 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
> {
> struct ti_csi2rx_dev *csi = ctx->csi;
> struct dma_async_tx_descriptor *desc;
> - struct completion drain_complete;
> dma_cookie_t cookie;
> int ret;
>
> - init_completion(&drain_complete);
> -
> desc = dmaengine_prep_slave_single(ctx->dma.chan, csi->drain.paddr,
> csi->drain.len, DMA_DEV_TO_MEM,
> DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> @@ -652,7 +676,7 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
> }
>
> desc->callback = ti_csi2rx_drain_callback;
> - desc->callback_param = &drain_complete;
> + desc->callback_param = ctx;
>
> cookie = dmaengine_submit(desc);
> ret = dma_submit_error(cookie);
> @@ -661,13 +685,6 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
>
> dma_async_issue_pending(ctx->dma.chan);
>
> - if (!wait_for_completion_timeout(&drain_complete,
> - msecs_to_jiffies(DRAIN_TIMEOUT_MS))) {
> - dmaengine_terminate_sync(ctx->dma.chan);
> - dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
> - ret = -ETIMEDOUT;
> - goto out;
> - }
> out:
> return ret;
> }
> @@ -716,9 +733,11 @@ static void ti_csi2rx_dma_callback(void *param)
>
> ti_csi2rx_dma_submit_pending(ctx);
>
> - if (list_empty(&dma->submitted))
> - dma->state = TI_CSI2RX_DMA_IDLE;
> -
> + if (list_empty(&dma->submitted)) {
> + if (ti_csi2rx_drain_dma(ctx))
> + dev_warn(ctx->csi->dev,
> + "DMA drain failed on one of the transactions\n");
> + }
> spin_unlock_irqrestore(&dma->lock, flags);
> }
>
> @@ -754,6 +773,7 @@ static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
> static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
> {
> struct ti_csi2rx_dma *dma = &ctx->dma;
> + struct ti_csi2rx_dev *csi = ctx->csi;
> enum ti_csi2rx_dma_state state;
> unsigned long flags;
> int ret;
> @@ -763,6 +783,8 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
> dma->state = TI_CSI2RX_DMA_STOPPED;
> spin_unlock_irqrestore(&dma->lock, flags);
>
> + init_completion(&ctx->drain_complete);
> +
> if (state != TI_CSI2RX_DMA_STOPPED) {
> /*
> * Normal DMA termination does not clean up pending data on
> @@ -771,11 +793,20 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
> * enforced before terminating DMA.
> */
> ret = ti_csi2rx_drain_dma(ctx);
> - if (ret && ret != -ETIMEDOUT)
> + if (ret)
> dev_warn(ctx->csi->dev,
> "Failed to drain DMA. Next frame might be bogus\n");
> }
>
> + /* We wait for the drain to complete so that the stream stops
> + * cleanly, making sure the shared hardware FIFO is cleared of
> + * data from the current stream. No more data will be coming from
> + * the source after this.
> + */
> + if (!wait_for_completion_timeout(&ctx->drain_complete,
> + msecs_to_jiffies(DRAIN_TIMEOUT_MS)))
> + dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
> +
> ret = dmaengine_terminate_sync(ctx->dma.chan);
> if (ret)
> dev_err(ctx->csi->dev, "Failed to stop DMA: %d\n", ret);
> @@ -838,57 +869,14 @@ static void ti_csi2rx_buffer_queue(struct vb2_buffer *vb)
> struct ti_csi2rx_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
> struct ti_csi2rx_buffer *buf;
> struct ti_csi2rx_dma *dma = &ctx->dma;
> - bool restart_dma = false;
> unsigned long flags = 0;
> - int ret;
>
> buf = container_of(vb, struct ti_csi2rx_buffer, vb.vb2_buf);
> buf->ctx = ctx;
>
> spin_lock_irqsave(&dma->lock, flags);
> - /*
> - * Usually the DMA callback takes care of queueing the pending buffers.
> - * But if DMA has stalled due to lack of buffers, restart it now.
> - */
> - if (dma->state == TI_CSI2RX_DMA_IDLE) {
> - /*
> - * Do not restart DMA with the lock held because
> - * ti_csi2rx_drain_dma() might block for completion.
> - * There won't be a race on queueing DMA anyway since the
> - * callback is not being fired.
> - */
> - restart_dma = true;
> - dma->state = TI_CSI2RX_DMA_ACTIVE;
> - } else {
> - list_add_tail(&buf->list, &dma->queue);
> - }
> + list_add_tail(&buf->list, &dma->queue);
> spin_unlock_irqrestore(&dma->lock, flags);
> -
> - if (restart_dma) {
> - /*
> - * Once frames start dropping, some data gets stuck in the DMA
> - * pipeline somewhere. So the first DMA transfer after frame
> - * drops gives a partial frame. This is obviously not useful to
> - * the application and will only confuse it. Issue a DMA
> - * transaction to drain that up.
> - */
> - ret = ti_csi2rx_drain_dma(ctx);
> - if (ret && ret != -ETIMEDOUT)
> - dev_warn(ctx->csi->dev,
> - "Failed to drain DMA. Next frame might be bogus\n");
> -
> - spin_lock_irqsave(&dma->lock, flags);
> - ret = ti_csi2rx_start_dma(ctx, buf);
> - if (ret) {
> - vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
> - dma->state = TI_CSI2RX_DMA_IDLE;
> - spin_unlock_irqrestore(&dma->lock, flags);
> - dev_err(ctx->csi->dev, "Failed to start DMA: %d\n", ret);
> - } else {
> - list_add_tail(&buf->list, &dma->submitted);
> - spin_unlock_irqrestore(&dma->lock, flags);
> - }
> - }
> }
>
> static int ti_csi2rx_get_stream(struct ti_csi2rx_ctx *ctx)
On 15/01/26 18:07, Tomi Valkeinen wrote:
> Hi,
Hi Tomi,
Thank you for the review!
>
> On 30/12/2025 10:32, Rishikesh Donadkar wrote:
>> On buffer starvation the DMA is marked IDLE, and the stale data in the
>> internal FIFOs gets drained only on the next VIDIOC_QBUF call from the
>> userspace. This approach works fine for a single stream case.
>>
>> But in multistream scenarios, buffer starvation for one stream i.e. one
>> virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can
>> stall the pipeline for all other virtual channels, even if buffers are
>> available for them.
> One stream is filtered based on VC & DT, but the above only mentions VC.
> And then later uses VC when referring to the stream. I think you can
> drop the VC parts, and just talk about streams.
Okay, will do!
>
>> This patch introduces a new architecture, that continuously drains data
>> from the shared HW FIFO into a small (32KiB) buffer if no buffers are made
>> available to the driver from the userspace. This ensures independence
>> between different streams, where a slower downstream element for one
>> camera does not block streaming for other cameras.
>>
>> Additionally, after a drain is done for a VC, the next frame will be a
>> partial frame, as a portion of its data will have already been drained
>> before a valid buffer is queued by user space to the driver.
> This makes it sounds we drain a single 32KB piece. But won't we continue
> draining that stream until the stream is stopped or the user provides a
> buffer?
Thanks for pointing out, I will change it to talk about continuous draining.
Rishikesh
>
> Also, does the DMA not offer us ways to drain a full frame? There's no
> way to e.g. set the DMA TX increment to 0, so that it would just write
> to a single location in memory? Or just set the target to void.
>
> Tomi
>
>> Use wait for completion barrier to make sure the shared hardware FIFO
>> is cleared of the data at the end of stream after the source has stopped
>> sending data.
>>
>> Reviewed-by: Jai Luthra <jai.luthra@ideasonboard.com>
>> Reviewed-by: Yemike Abhilash Chandra <y-abhilashchandra@ti.com>
>> Signed-off-by: Rishikesh Donadkar <r-donadkar@ti.com>
>> ---
>> .../platform/ti/j721e-csi2rx/j721e-csi2rx.c | 112 ++++++++----------
>> 1 file changed, 50 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
>> index fa6152464d4b6..e713293696eb1 100644
>> --- a/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
>> +++ b/drivers/media/platform/ti/j721e-csi2rx/j721e-csi2rx.c
>> @@ -82,7 +82,6 @@ struct ti_csi2rx_buffer {
>>
>> enum ti_csi2rx_dma_state {
>> TI_CSI2RX_DMA_STOPPED, /* Streaming not started yet. */
>> - TI_CSI2RX_DMA_IDLE, /* Streaming but no pending DMA operation. */
>> TI_CSI2RX_DMA_ACTIVE, /* Streaming and pending DMA operation. */
>> };
>>
>> @@ -109,6 +108,7 @@ struct ti_csi2rx_ctx {
>> struct v4l2_format v_fmt;
>> struct ti_csi2rx_dma dma;
>> struct media_pad pad;
>> + struct completion drain_complete;
>> u32 sequence;
>> u32 idx;
>> u32 vc;
>> @@ -251,6 +251,10 @@ static const struct ti_csi2rx_fmt ti_csi2rx_formats[] = {
>> static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
>> struct ti_csi2rx_buffer *buf);
>>
>> +/* Forward declarations needed by ti_csi2rx_drain_callback. */
>> +static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx);
>> +static int ti_csi2rx_dma_submit_pending(struct ti_csi2rx_ctx *ctx);
>> +
>> static const struct ti_csi2rx_fmt *find_format_by_fourcc(u32 pixelformat)
>> {
>> unsigned int i;
>> @@ -617,9 +621,32 @@ static void ti_csi2rx_setup_shim(struct ti_csi2rx_ctx *ctx)
>>
>> static void ti_csi2rx_drain_callback(void *param)
>> {
>> - struct completion *drain_complete = param;
>> + struct ti_csi2rx_ctx *ctx = param;
>> + struct ti_csi2rx_dma *dma = &ctx->dma;
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&dma->lock, flags);
>> +
>> + if (dma->state == TI_CSI2RX_DMA_STOPPED) {
>> + complete(&ctx->drain_complete);
>> + spin_unlock_irqrestore(&dma->lock, flags);
>> + return;
>> + }
>>
>> - complete(drain_complete);
>> + /*
>> + * If dma->queue is empty, it indicates that no buffer has been
>> + * provided by user space. In this case, initiate a transactions
>> + * to drain the DMA. Since one drain of size DRAIN_BUFFER_SIZE
>> + * will be done here, the subsequent frame will be a
>> + * partial frame, with a size of frame_size - DRAIN_BUFFER_SIZE
>> + */
>> + if (list_empty(&dma->queue)) {
>> + if (ti_csi2rx_drain_dma(ctx))
>> + dev_warn(ctx->csi->dev, "DMA drain failed\n");
>> + } else {
>> + ti_csi2rx_dma_submit_pending(ctx);
>> + }
>> + spin_unlock_irqrestore(&dma->lock, flags);
>> }
>>
>> /*
>> @@ -637,12 +664,9 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
>> {
>> struct ti_csi2rx_dev *csi = ctx->csi;
>> struct dma_async_tx_descriptor *desc;
>> - struct completion drain_complete;
>> dma_cookie_t cookie;
>> int ret;
>>
>> - init_completion(&drain_complete);
>> -
>> desc = dmaengine_prep_slave_single(ctx->dma.chan, csi->drain.paddr,
>> csi->drain.len, DMA_DEV_TO_MEM,
>> DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
>> @@ -652,7 +676,7 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
>> }
>>
>> desc->callback = ti_csi2rx_drain_callback;
>> - desc->callback_param = &drain_complete;
>> + desc->callback_param = ctx;
>>
>> cookie = dmaengine_submit(desc);
>> ret = dma_submit_error(cookie);
>> @@ -661,13 +685,6 @@ static int ti_csi2rx_drain_dma(struct ti_csi2rx_ctx *ctx)
>>
>> dma_async_issue_pending(ctx->dma.chan);
>>
>> - if (!wait_for_completion_timeout(&drain_complete,
>> - msecs_to_jiffies(DRAIN_TIMEOUT_MS))) {
>> - dmaengine_terminate_sync(ctx->dma.chan);
>> - dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
>> - ret = -ETIMEDOUT;
>> - goto out;
>> - }
>> out:
>> return ret;
>> }
>> @@ -716,9 +733,11 @@ static void ti_csi2rx_dma_callback(void *param)
>>
>> ti_csi2rx_dma_submit_pending(ctx);
>>
>> - if (list_empty(&dma->submitted))
>> - dma->state = TI_CSI2RX_DMA_IDLE;
>> -
>> + if (list_empty(&dma->submitted)) {
>> + if (ti_csi2rx_drain_dma(ctx))
>> + dev_warn(ctx->csi->dev,
>> + "DMA drain failed on one of the transactions\n");
>> + }
>> spin_unlock_irqrestore(&dma->lock, flags);
>> }
>>
>> @@ -754,6 +773,7 @@ static int ti_csi2rx_start_dma(struct ti_csi2rx_ctx *ctx,
>> static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
>> {
>> struct ti_csi2rx_dma *dma = &ctx->dma;
>> + struct ti_csi2rx_dev *csi = ctx->csi;
>> enum ti_csi2rx_dma_state state;
>> unsigned long flags;
>> int ret;
>> @@ -763,6 +783,8 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
>> dma->state = TI_CSI2RX_DMA_STOPPED;
>> spin_unlock_irqrestore(&dma->lock, flags);
>>
>> + init_completion(&ctx->drain_complete);
>> +
>> if (state != TI_CSI2RX_DMA_STOPPED) {
>> /*
>> * Normal DMA termination does not clean up pending data on
>> @@ -771,11 +793,20 @@ static void ti_csi2rx_stop_dma(struct ti_csi2rx_ctx *ctx)
>> * enforced before terminating DMA.
>> */
>> ret = ti_csi2rx_drain_dma(ctx);
>> - if (ret && ret != -ETIMEDOUT)
>> + if (ret)
>> dev_warn(ctx->csi->dev,
>> "Failed to drain DMA. Next frame might be bogus\n");
>> }
>>
>> + /* We wait for the drain to complete so that the stream stops
>> + * cleanly, making sure the shared hardware FIFO is cleared of
>> + * data from the current stream. No more data will be coming from
>> + * the source after this.
>> + */
>> + if (!wait_for_completion_timeout(&ctx->drain_complete,
>> + msecs_to_jiffies(DRAIN_TIMEOUT_MS)))
>> + dev_dbg(csi->dev, "DMA transfer timed out for drain buffer\n");
>> +
>> ret = dmaengine_terminate_sync(ctx->dma.chan);
>> if (ret)
>> dev_err(ctx->csi->dev, "Failed to stop DMA: %d\n", ret);
>> @@ -838,57 +869,14 @@ static void ti_csi2rx_buffer_queue(struct vb2_buffer *vb)
>> struct ti_csi2rx_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
>> struct ti_csi2rx_buffer *buf;
>> struct ti_csi2rx_dma *dma = &ctx->dma;
>> - bool restart_dma = false;
>> unsigned long flags = 0;
>> - int ret;
>>
>> buf = container_of(vb, struct ti_csi2rx_buffer, vb.vb2_buf);
>> buf->ctx = ctx;
>>
>> spin_lock_irqsave(&dma->lock, flags);
>> - /*
>> - * Usually the DMA callback takes care of queueing the pending buffers.
>> - * But if DMA has stalled due to lack of buffers, restart it now.
>> - */
>> - if (dma->state == TI_CSI2RX_DMA_IDLE) {
>> - /*
>> - * Do not restart DMA with the lock held because
>> - * ti_csi2rx_drain_dma() might block for completion.
>> - * There won't be a race on queueing DMA anyway since the
>> - * callback is not being fired.
>> - */
>> - restart_dma = true;
>> - dma->state = TI_CSI2RX_DMA_ACTIVE;
>> - } else {
>> - list_add_tail(&buf->list, &dma->queue);
>> - }
>> + list_add_tail(&buf->list, &dma->queue);
>> spin_unlock_irqrestore(&dma->lock, flags);
>> -
>> - if (restart_dma) {
>> - /*
>> - * Once frames start dropping, some data gets stuck in the DMA
>> - * pipeline somewhere. So the first DMA transfer after frame
>> - * drops gives a partial frame. This is obviously not useful to
>> - * the application and will only confuse it. Issue a DMA
>> - * transaction to drain that up.
>> - */
>> - ret = ti_csi2rx_drain_dma(ctx);
>> - if (ret && ret != -ETIMEDOUT)
>> - dev_warn(ctx->csi->dev,
>> - "Failed to drain DMA. Next frame might be bogus\n");
>> -
>> - spin_lock_irqsave(&dma->lock, flags);
>> - ret = ti_csi2rx_start_dma(ctx, buf);
>> - if (ret) {
>> - vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
>> - dma->state = TI_CSI2RX_DMA_IDLE;
>> - spin_unlock_irqrestore(&dma->lock, flags);
>> - dev_err(ctx->csi->dev, "Failed to start DMA: %d\n", ret);
>> - } else {
>> - list_add_tail(&buf->list, &dma->submitted);
>> - spin_unlock_irqrestore(&dma->lock, flags);
>> - }
>> - }
>> }
>>
>> static int ti_csi2rx_get_stream(struct ti_csi2rx_ctx *ctx)
Hi Tomi,
Quoting Tomi Valkeinen (2026-01-15 18:07:07)
> Hi,
>
> On 30/12/2025 10:32, Rishikesh Donadkar wrote:
> > On buffer starvation the DMA is marked IDLE, and the stale data in the
> > internal FIFOs gets drained only on the next VIDIOC_QBUF call from the
> > userspace. This approach works fine for a single stream case.
> >
> > But in multistream scenarios, buffer starvation for one stream i.e. one
> > virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can
> > stall the pipeline for all other virtual channels, even if buffers are
> > available for them.
>
> One stream is filtered based on VC & DT, but the above only mentions VC.
> And then later uses VC when referring to the stream. I think you can
> drop the VC parts, and just talk about streams.
>
> > This patch introduces a new architecture, that continuously drains data
> > from the shared HW FIFO into a small (32KiB) buffer if no buffers are made
> > available to the driver from the userspace. This ensures independence
> > between different streams, where a slower downstream element for one
> > camera does not block streaming for other cameras.
> >
> > Additionally, after a drain is done for a VC, the next frame will be a
> > partial frame, as a portion of its data will have already been drained
> > before a valid buffer is queued by user space to the driver.
>
> This makes it sounds we drain a single 32KB piece. But won't we continue
> draining that stream until the stream is stopped or the user provides a
> buffer?
>
> Also, does the DMA not offer us ways to drain a full frame? There's no
> way to e.g. set the DMA TX increment to 0, so that it would just write
> to a single location in memory? Or just set the target to void.
>
+ Vignesh
AFAIU the dmaengine API is the first limitation here, and the actual
Transfer Record (TR) structure for BCDMA might support writing to a single
address. It also might be possible to use dmaengine_prep_dma_cyclic to
set up an auto-repeating TR like it's used for audio... maybe that can be
explored separately from this series.
But yes, ideally we need to set the target to void, which I don't think is
supported by the HW (TI folks please correct me if I'm wrong :)
Thanks,
Jai
> Tomi
>
> > Use wait for completion barrier to make sure the shared hardware FIFO
> > is cleared of the data at the end of stream after the source has stopped
> > sending data.
> >
> > Reviewed-by: Jai Luthra <jai.luthra@ideasonboard.com>
> > Reviewed-by: Yemike Abhilash Chandra <y-abhilashchandra@ti.com>
> > Signed-off-by: Rishikesh Donadkar <r-donadkar@ti.com>
[...]
On 15/01/26 21:32, Jai Luthra wrote: > Hi Tomi, > > Quoting Tomi Valkeinen (2026-01-15 18:07:07) >> Hi, >> >> On 30/12/2025 10:32, Rishikesh Donadkar wrote: >>> On buffer starvation the DMA is marked IDLE, and the stale data in the >>> internal FIFOs gets drained only on the next VIDIOC_QBUF call from the >>> userspace. This approach works fine for a single stream case. >>> >>> But in multistream scenarios, buffer starvation for one stream i.e. one >>> virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can >>> stall the pipeline for all other virtual channels, even if buffers are >>> available for them. >> One stream is filtered based on VC & DT, but the above only mentions VC. >> And then later uses VC when referring to the stream. I think you can >> drop the VC parts, and just talk about streams. >> >>> This patch introduces a new architecture, that continuously drains data >>> from the shared HW FIFO into a small (32KiB) buffer if no buffers are made >>> available to the driver from the userspace. This ensures independence >>> between different streams, where a slower downstream element for one >>> camera does not block streaming for other cameras. >>> >>> Additionally, after a drain is done for a VC, the next frame will be a >>> partial frame, as a portion of its data will have already been drained >>> before a valid buffer is queued by user space to the driver. >> This makes it sounds we drain a single 32KB piece. But won't we continue >> draining that stream until the stream is stopped or the user provides a >> buffer? >> >> Also, does the DMA not offer us ways to drain a full frame? There's no >> way to e.g. set the DMA TX increment to 0, so that it would just write >> to a single location in memory? Or just set the target to void. >> > + Vignesh > > AFAIU the dmaengine API is the first limitation here, and the actual > Transfer Record (TR) structure for BCDMA might support writing to a single > address. 
It also might be possible to use dmaengine_prep_dma_cyclic to > setup a auto-repeating TR like it's used for audio.. maybe that can be > explored separate from this series. Makes sense, if we have such API we can drain a full frame as Tomi suggested and also not care about the next frame being a partial one. > > But yes, ideally we need to set the target to void, which I don't think is > supported by the HW (TI folks please correct me if I'm wrong :) > > Thanks, > Jai > >> Tomi >> >>> Use wait for completion barrier to make sure the shared hardware FIFO >>> is cleared of the data at the end of stream after the source has stopped >>> sending data. >>> >>> Reviewed-by: Jai Luthra <jai.luthra@ideasonboard.com> >>> Reviewed-by: Yemike Abhilash Chandra <y-abhilashchandra@ti.com> >>> Signed-off-by: Rishikesh Donadkar <r-donadkar@ti.com> > [...]
On 15/01/26 21:32, Jai Luthra wrote: > Hi Tomi, > > Quoting Tomi Valkeinen (2026-01-15 18:07:07) >> Hi, >> >> On 30/12/2025 10:32, Rishikesh Donadkar wrote: >>> On buffer starvation the DMA is marked IDLE, and the stale data in the >>> internal FIFOs gets drained only on the next VIDIOC_QBUF call from the >>> userspace. This approach works fine for a single stream case. >>> >>> But in multistream scenarios, buffer starvation for one stream i.e. one >>> virtual channel, can block the shared HW FIFO of the CSI2RX IP. This can >>> stall the pipeline for all other virtual channels, even if buffers are >>> available for them. >> >> One stream is filtered based on VC & DT, but the above only mentions VC. >> And then later uses VC when referring to the stream. I think you can >> drop the VC parts, and just talk about streams. >> >>> This patch introduces a new architecture, that continuously drains data >>> from the shared HW FIFO into a small (32KiB) buffer if no buffers are made >>> available to the driver from the userspace. This ensures independence >>> between different streams, where a slower downstream element for one >>> camera does not block streaming for other cameras. >>> >>> Additionally, after a drain is done for a VC, the next frame will be a >>> partial frame, as a portion of its data will have already been drained >>> before a valid buffer is queued by user space to the driver. >> >> This makes it sounds we drain a single 32KB piece. But won't we continue >> draining that stream until the stream is stopped or the user provides a >> buffer? >> >> Also, does the DMA not offer us ways to drain a full frame? There's no >> way to e.g. set the DMA TX increment to 0, so that it would just write >> to a single location in memory? Or just set the target to void. >> > > + Vignesh > > AFAIU the dmaengine API is the first limitation here, and the actual > Transfer Record (TR) structure for BCDMA might support writing to a single > address. 
It also might be possible to use dmaengine_prep_dma_cyclic to > setup a auto-repeating TR like it's used for audio.. maybe that can be > explored separate from this series. > Yeah. there is no dmaengine API that can do mem-to-mem transfer with constant addressing mode on src or dst. But the DMA HW on TI K3 SoCs are capable of doing so. [...] -- Regards Vignesh https://ti.com/opensource
© 2016 - 2026 Red Hat, Inc.