[PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking

Bartosz Golaszewski posted 12 patches 4 weeks ago
There is a newer version of this series
[PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking
Posted by Bartosz Golaszewski 4 weeks ago
Add support for BAM pipe locking. To that end: when starting DMA on an RX
channel - prepend the existing queue of issued descriptors with an
additional "dummy" command descriptor with the LOCK bit set. Once the
transaction is done (no more issued descriptors), issue one more dummy
descriptor with the UNLOCK bit.

We *must* wait until the transaction is signalled as done because we
must not perform any writes into config registers while the engine is
busy.

The dummy writes must be issued into a scratchpad register of the client
so provide a mechanism to communicate the right address via descriptor
metadata.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/dma/qcom/bam_dma.c       | 175 ++++++++++++++++++++++++++++++++++++++-
 include/linux/dma/qcom_bam_dma.h |   4 +
 2 files changed, 176 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 83491e7c2f17d8c9d12a1a055baea7e3a0a75a53..627c85a2df4dcdbac247d831a4aef047c2189456 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -28,11 +28,13 @@
 #include <linux/clk.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/dma/qcom_bam_dma.h>
 #include <linux/dmaengine.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/lockdep.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_dma.h>
@@ -60,6 +62,8 @@ struct bam_desc_hw {
 #define DESC_FLAG_EOB BIT(13)
 #define DESC_FLAG_NWD BIT(12)
 #define DESC_FLAG_CMD BIT(11)
+#define DESC_FLAG_LOCK BIT(10)
+#define DESC_FLAG_UNLOCK BIT(9)
 
 struct bam_async_desc {
 	struct virt_dma_desc vd;
@@ -391,6 +395,14 @@ struct bam_chan {
 	struct list_head desc_list;
 
 	struct list_head node;
+
+	/* BAM locking infrastructure */
+	bool locked;
+	phys_addr_t scratchpad_addr;
+	struct scatterlist lock_sg;
+	struct scatterlist unlock_sg;
+	struct bam_cmd_element lock_ce;
+	struct bam_cmd_element unlock_ce;
 };
 
 static inline struct bam_chan *to_bam_chan(struct dma_chan *common)
@@ -652,6 +664,27 @@ static int bam_slave_config(struct dma_chan *chan,
 	return 0;
 }
 
+static int bam_metadata_attach(struct dma_async_tx_descriptor *desc, void *data, size_t len)
+{
+	struct bam_chan *bchan = to_bam_chan(desc->chan);
+	const struct bam_device_data *bdata = bchan->bdev->dev_data;
+	struct bam_desc_metadata *metadata = data;
+
+	if (!data)
+		return -EINVAL;
+
+	if (!bdata->pipe_lock_supported)
+		return -EOPNOTSUPP;
+
+	bchan->scratchpad_addr = metadata->scratchpad_addr;
+
+	return 0;
+}
+
+static const struct dma_descriptor_metadata_ops bam_metadata_ops = {
+	.attach = bam_metadata_attach,
+};
+
 /**
  * bam_prep_slave_sg - Prep slave sg transaction
  *
@@ -668,6 +701,7 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 	void *context)
 {
 	struct bam_chan *bchan = to_bam_chan(chan);
+	struct dma_async_tx_descriptor *tx_desc;
 	struct bam_device *bdev = bchan->bdev;
 	struct bam_async_desc *async_desc;
 	struct scatterlist *sg;
@@ -723,7 +757,12 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 		} while (remainder > 0);
 	}
 
-	return vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+	tx_desc = vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+	if (!tx_desc)
+		return NULL;
+
+	tx_desc->metadata_ops = &bam_metadata_ops;
+	return tx_desc;
 }
 
 /**
@@ -1012,6 +1051,112 @@ static void bam_apply_new_config(struct bam_chan *bchan,
 	bchan->reconfigure = 0;
 }
 
+static struct bam_async_desc *
+bam_make_lock_desc(struct bam_chan *bchan, struct scatterlist *sg,
+		   struct bam_cmd_element *ce, unsigned long flag)
+{
+	struct dma_chan *chan = &bchan->vc.chan;
+	struct bam_async_desc *async_desc;
+	struct bam_desc_hw *desc;
+	struct virt_dma_desc *vd;
+	struct virt_dma_chan *vc;
+	unsigned int mapped;
+	dma_cookie_t cookie;
+	int ret;
+
+	sg_init_table(sg, 1);
+
+	async_desc = kzalloc_flex(*async_desc, desc, 1, GFP_NOWAIT);
+	if (!async_desc) {
+		dev_err(bchan->bdev->dev, "failed to allocate the BAM lock descriptor\n");
+		return NULL;
+	}
+
+	async_desc->num_desc = 1;
+	async_desc->curr_desc = async_desc->desc;
+	async_desc->dir = DMA_MEM_TO_DEV;
+
+	desc = async_desc->desc;
+
+	bam_prep_ce_le32(ce, bchan->scratchpad_addr, BAM_WRITE_COMMAND, 0);
+	sg_set_buf(sg, ce, sizeof(*ce));
+
+	mapped = dma_map_sg_attrs(chan->slave, sg, 1, DMA_TO_DEVICE, DMA_PREP_CMD);
+	if (!mapped) {
+		kfree(async_desc);
+		return NULL;
+	}
+
+	desc->flags |= cpu_to_le16(DESC_FLAG_CMD | flag);
+	desc->addr = sg_dma_address(sg);
+	desc->size = sizeof(struct bam_cmd_element);
+
+	vc = &bchan->vc;
+	vd = &async_desc->vd;
+
+	dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
+	vd->tx.flags = DMA_PREP_CMD;
+	vd->tx.desc_free = vchan_tx_desc_free;
+	vd->tx_result.result = DMA_TRANS_NOERROR;
+	vd->tx_result.residue = 0;
+
+	cookie = dma_cookie_assign(&vd->tx);
+	ret = dma_submit_error(cookie);
+	if (ret)
+		return NULL;
+
+	return async_desc;
+}
+
+static int bam_do_setup_pipe_lock(struct bam_chan *bchan, bool lock)
+{
+	struct bam_device *bdev = bchan->bdev;
+	const struct bam_device_data *bdata = bdev->dev_data;
+	struct bam_async_desc *lock_desc;
+	struct bam_cmd_element *ce;
+	struct scatterlist *sgl;
+	unsigned long flag;
+
+	lockdep_assert_held(&bchan->vc.lock);
+
+	if (!bdata->pipe_lock_supported || !bchan->scratchpad_addr ||
+	    bchan->slave.direction != DMA_MEM_TO_DEV)
+		return 0;
+
+	if (lock) {
+		sgl = &bchan->lock_sg;
+		ce = &bchan->lock_ce;
+		flag = DESC_FLAG_LOCK;
+	} else {
+		sgl = &bchan->unlock_sg;
+		ce = &bchan->unlock_ce;
+		flag = DESC_FLAG_UNLOCK;
+	}
+
+	lock_desc = bam_make_lock_desc(bchan, sgl, ce, flag);
+	if (!lock_desc)
+		return -ENOMEM;
+
+	if (lock)
+		list_add(&lock_desc->vd.node, &bchan->vc.desc_issued);
+	else
+		list_add_tail(&lock_desc->vd.node, &bchan->vc.desc_issued);
+
+	bchan->locked = lock;
+
+	return 0;
+}
+
+static int bam_setup_pipe_lock(struct bam_chan *bchan)
+{
+	return bam_do_setup_pipe_lock(bchan, true);
+}
+
+static int bam_setup_pipe_unlock(struct bam_chan *bchan)
+{
+	return bam_do_setup_pipe_lock(bchan, false);
+}
+
 /**
  * bam_start_dma - start next transaction
  * @bchan: bam dma channel
@@ -1121,6 +1266,7 @@ static void bam_dma_work(struct work_struct *work)
 	struct bam_device *bdev = from_work(bdev, work, work);
 	struct bam_chan *bchan;
 	unsigned int i;
+	int ret;
 
 	/* go through the channels and kick off transactions */
 	for (i = 0; i < bdev->num_channels; i++) {
@@ -1128,6 +1274,13 @@ static void bam_dma_work(struct work_struct *work)
 
 		guard(spinlock_irqsave)(&bchan->vc.lock);
 
+		if (list_empty(&bchan->vc.desc_issued) && bchan->locked) {
+			ret = bam_setup_pipe_unlock(bchan);
+			if (ret)
+				dev_err(bchan->vc.chan.slave,
+					"Failed to set up the pipe unlock descriptor\n");
+		}
+
 		if (!list_empty(&bchan->vc.desc_issued) && !IS_BUSY(bchan))
 			bam_start_dma(bchan);
 	}
@@ -1142,9 +1295,17 @@ static void bam_dma_work(struct work_struct *work)
 static void bam_issue_pending(struct dma_chan *chan)
 {
 	struct bam_chan *bchan = to_bam_chan(chan);
+	int ret;
 
 	guard(spinlock_irqsave)(&bchan->vc.lock);
 
+	if (!bchan->locked) {
+		ret = bam_setup_pipe_lock(bchan);
+		if (ret)
+			dev_err(bchan->vc.chan.slave,
+				"Failed to set up the pipe lock descriptor\n");
+	}
+
 	/* if work pending and idle, start a transaction */
 	if (vchan_issue_pending(&bchan->vc) && !IS_BUSY(bchan))
 		bam_start_dma(bchan);
@@ -1157,8 +1318,15 @@ static void bam_issue_pending(struct dma_chan *chan)
  */
 static void bam_dma_free_desc(struct virt_dma_desc *vd)
 {
-	struct bam_async_desc *async_desc = container_of(vd,
-			struct bam_async_desc, vd);
+	struct bam_async_desc *async_desc = container_of(vd, struct bam_async_desc, vd);
+	struct bam_desc_hw *desc = async_desc->desc;
+	struct dma_chan *chan = vd->tx.chan;
+	struct bam_chan *bchan = to_bam_chan(chan);
+
+	if (le16_to_cpu(desc->flags) & DESC_FLAG_LOCK)
+		dma_unmap_sg(chan->slave, &bchan->lock_sg, 1, DMA_TO_DEVICE);
+	else if (le16_to_cpu(desc->flags) & DESC_FLAG_UNLOCK)
+		dma_unmap_sg(chan->slave, &bchan->unlock_sg, 1, DMA_TO_DEVICE);
 
 	kfree(async_desc);
 }
@@ -1350,6 +1518,7 @@ static int bam_dma_probe(struct platform_device *pdev)
 	bdev->common.device_terminate_all = bam_dma_terminate_all;
 	bdev->common.device_issue_pending = bam_issue_pending;
 	bdev->common.device_tx_status = bam_tx_status;
+	bdev->common.desc_metadata_modes = DESC_METADATA_CLIENT;
 	bdev->common.dev = bdev->dev;
 
 	ret = dma_async_device_register(&bdev->common);
diff --git a/include/linux/dma/qcom_bam_dma.h b/include/linux/dma/qcom_bam_dma.h
index 68fc0e643b1b97fe4520d5878daa322b81f4f559..f85e0c72407b5e1a733750ac87bbaba6af6e8c78 100644
--- a/include/linux/dma/qcom_bam_dma.h
+++ b/include/linux/dma/qcom_bam_dma.h
@@ -34,6 +34,10 @@ enum bam_command_type {
 	BAM_READ_COMMAND,
 };
 
+struct bam_desc_metadata {
+	phys_addr_t scratchpad_addr;
+};
+
 /*
  * prep_bam_ce_le32 - Wrapper function to prepare a single BAM command
  * element with the data already in le32 format.

-- 
2.47.3
Re: [PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking
Posted by Manivannan Sadhasivam 3 weeks, 6 days ago
On Tue, Mar 10, 2026 at 04:44:19PM +0100, Bartosz Golaszewski wrote:
> Add support for BAM pipe locking. To that end: when starting DMA on an RX
> channel - prepend the existing queue of issued descriptors with an
> additional "dummy" command descriptor with the LOCK bit set. Once the
> transaction is done (no more issued descriptors), issue one more dummy
> descriptor with the UNLOCK bit.
> 
> We *must* wait until the transaction is signalled as done because we
> must not perform any writes into config registers while the engine is
> busy.
> 
> The dummy writes must be issued into a scratchpad register of the client
> so provide a mechanism to communicate the right address via descriptor
> metadata.
> 
> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
> ---
>  drivers/dma/qcom/bam_dma.c       | 175 ++++++++++++++++++++++++++++++++++++++-
>  include/linux/dma/qcom_bam_dma.h |   4 +
>  2 files changed, 176 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
> index 83491e7c2f17d8c9d12a1a055baea7e3a0a75a53..627c85a2df4dcdbac247d831a4aef047c2189456 100644
> --- a/drivers/dma/qcom/bam_dma.c
> +++ b/drivers/dma/qcom/bam_dma.c
> @@ -28,11 +28,13 @@
>  #include <linux/clk.h>
>  #include <linux/device.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/dma/qcom_bam_dma.h>
>  #include <linux/dmaengine.h>
>  #include <linux/init.h>
>  #include <linux/interrupt.h>
>  #include <linux/io.h>
>  #include <linux/kernel.h>
> +#include <linux/lockdep.h>
>  #include <linux/module.h>
>  #include <linux/of_address.h>
>  #include <linux/of_dma.h>
> @@ -60,6 +62,8 @@ struct bam_desc_hw {
>  #define DESC_FLAG_EOB BIT(13)
>  #define DESC_FLAG_NWD BIT(12)
>  #define DESC_FLAG_CMD BIT(11)
> +#define DESC_FLAG_LOCK BIT(10)
> +#define DESC_FLAG_UNLOCK BIT(9)
>  
>  struct bam_async_desc {
>  	struct virt_dma_desc vd;
> @@ -391,6 +395,14 @@ struct bam_chan {
>  	struct list_head desc_list;
>  
>  	struct list_head node;
> +
> +	/* BAM locking infrastructure */
> +	bool locked;

Nit: Move this boolean at the end to avoid hole in-between.

> +	phys_addr_t scratchpad_addr;
> +	struct scatterlist lock_sg;
> +	struct scatterlist unlock_sg;
> +	struct bam_cmd_element lock_ce;
> +	struct bam_cmd_element unlock_ce;
>  };
>  
>  static inline struct bam_chan *to_bam_chan(struct dma_chan *common)
> @@ -652,6 +664,27 @@ static int bam_slave_config(struct dma_chan *chan,
>  	return 0;
>  }
>  
> +static int bam_metadata_attach(struct dma_async_tx_descriptor *desc, void *data, size_t len)
> +{
> +	struct bam_chan *bchan = to_bam_chan(desc->chan);
> +	const struct bam_device_data *bdata = bchan->bdev->dev_data;
> +	struct bam_desc_metadata *metadata = data;
> +
> +	if (!data)
> +		return -EINVAL;
> +
> +	if (!bdata->pipe_lock_supported)
> +		return -EOPNOTSUPP;

I don't think you should error out if pipe lock is not supported. You can safely
return 0 so that the client can continue to do DMA. Otherwise, if the client
tries to do DMA on a non-pipe lock supported platform (a valid case), DMA will
fail.

There is also no incentive for the clients to know whether pipe lock is
supported or not as they can proceed anyway.

> +
> +	bchan->scratchpad_addr = metadata->scratchpad_addr;
> +
> +	return 0;
> +}
> +
> +static const struct dma_descriptor_metadata_ops bam_metadata_ops = {
> +	.attach = bam_metadata_attach,
> +};
> +
>  /**
>   * bam_prep_slave_sg - Prep slave sg transaction
>   *
> @@ -668,6 +701,7 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
>  	void *context)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	struct dma_async_tx_descriptor *tx_desc;
>  	struct bam_device *bdev = bchan->bdev;
>  	struct bam_async_desc *async_desc;
>  	struct scatterlist *sg;
> @@ -723,7 +757,12 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
>  		} while (remainder > 0);
>  	}
>  
> -	return vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
> +	tx_desc = vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
> +	if (!tx_desc)
> +		return NULL;
> +
> +	tx_desc->metadata_ops = &bam_metadata_ops;
> +	return tx_desc;
>  }
>  
>  /**
> @@ -1012,6 +1051,112 @@ static void bam_apply_new_config(struct bam_chan *bchan,
>  	bchan->reconfigure = 0;
>  }
>  
> +static struct bam_async_desc *
> +bam_make_lock_desc(struct bam_chan *bchan, struct scatterlist *sg,
> +		   struct bam_cmd_element *ce, unsigned long flag)
> +{
> +	struct dma_chan *chan = &bchan->vc.chan;
> +	struct bam_async_desc *async_desc;
> +	struct bam_desc_hw *desc;
> +	struct virt_dma_desc *vd;
> +	struct virt_dma_chan *vc;
> +	unsigned int mapped;
> +	dma_cookie_t cookie;
> +	int ret;
> +
> +	sg_init_table(sg, 1);
> +
> +	async_desc = kzalloc_flex(*async_desc, desc, 1, GFP_NOWAIT);
> +	if (!async_desc) {
> +		dev_err(bchan->bdev->dev, "failed to allocate the BAM lock descriptor\n");
> +		return NULL;
> +	}
> +
> +	async_desc->num_desc = 1;
> +	async_desc->curr_desc = async_desc->desc;
> +	async_desc->dir = DMA_MEM_TO_DEV;
> +
> +	desc = async_desc->desc;
> +
> +	bam_prep_ce_le32(ce, bchan->scratchpad_addr, BAM_WRITE_COMMAND, 0);
> +	sg_set_buf(sg, ce, sizeof(*ce));
> +
> +	mapped = dma_map_sg_attrs(chan->slave, sg, 1, DMA_TO_DEVICE, DMA_PREP_CMD);
> +	if (!mapped) {
> +		kfree(async_desc);
> +		return NULL;
> +	}
> +
> +	desc->flags |= cpu_to_le16(DESC_FLAG_CMD | flag);
> +	desc->addr = sg_dma_address(sg);
> +	desc->size = sizeof(struct bam_cmd_element);
> +
> +	vc = &bchan->vc;
> +	vd = &async_desc->vd;
> +
> +	dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
> +	vd->tx.flags = DMA_PREP_CMD;
> +	vd->tx.desc_free = vchan_tx_desc_free;
> +	vd->tx_result.result = DMA_TRANS_NOERROR;
> +	vd->tx_result.residue = 0;
> +
> +	cookie = dma_cookie_assign(&vd->tx);
> +	ret = dma_submit_error(cookie);
> +	if (ret)
> +		return NULL;

Why can't you pass the actual error pointer to the caller. Right now, caller
just treats all failures as -ENOMEM.

> +
> +	return async_desc;
> +}
> +
> +static int bam_do_setup_pipe_lock(struct bam_chan *bchan, bool lock)
> +{
> +	struct bam_device *bdev = bchan->bdev;
> +	const struct bam_device_data *bdata = bdev->dev_data;
> +	struct bam_async_desc *lock_desc;
> +	struct bam_cmd_element *ce;
> +	struct scatterlist *sgl;
> +	unsigned long flag;
> +
> +	lockdep_assert_held(&bchan->vc.lock);
> +
> +	if (!bdata->pipe_lock_supported || !bchan->scratchpad_addr ||
> +	    bchan->slave.direction != DMA_MEM_TO_DEV)
> +		return 0;
> +
> +	if (lock) {
> +		sgl = &bchan->lock_sg;
> +		ce = &bchan->lock_ce;
> +		flag = DESC_FLAG_LOCK;
> +	} else {
> +		sgl = &bchan->unlock_sg;
> +		ce = &bchan->unlock_ce;
> +		flag = DESC_FLAG_UNLOCK;
> +	}
> +
> +	lock_desc = bam_make_lock_desc(bchan, sgl, ce, flag);
> +	if (!lock_desc)
> +		return -ENOMEM;
> +
> +	if (lock)
> +		list_add(&lock_desc->vd.node, &bchan->vc.desc_issued);
> +	else
> +		list_add_tail(&lock_desc->vd.node, &bchan->vc.desc_issued);
> +
> +	bchan->locked = lock;
> +
> +	return 0;
> +}
> +
> +static int bam_setup_pipe_lock(struct bam_chan *bchan)
> +{
> +	return bam_do_setup_pipe_lock(bchan, true);
> +}
> +
> +static int bam_setup_pipe_unlock(struct bam_chan *bchan)
> +{
> +	return bam_do_setup_pipe_lock(bchan, false);
> +}
> +
>  /**
>   * bam_start_dma - start next transaction
>   * @bchan: bam dma channel
> @@ -1121,6 +1266,7 @@ static void bam_dma_work(struct work_struct *work)
>  	struct bam_device *bdev = from_work(bdev, work, work);
>  	struct bam_chan *bchan;
>  	unsigned int i;
> +	int ret;
>  
>  	/* go through the channels and kick off transactions */
>  	for (i = 0; i < bdev->num_channels; i++) {
> @@ -1128,6 +1274,13 @@ static void bam_dma_work(struct work_struct *work)
>  
>  		guard(spinlock_irqsave)(&bchan->vc.lock);
>  
> +		if (list_empty(&bchan->vc.desc_issued) && bchan->locked) {

I fully agree with Stephan that we cannot rely on this to ensure completion of
previous commands. But I can't help with the NWD behavior :/

> +			ret = bam_setup_pipe_unlock(bchan);

if (bam_setup_pipe_unlock())?

> +			if (ret)
> +				dev_err(bchan->vc.chan.slave,
> +					"Failed to set up the pipe unlock descriptor\n");
> +		}
> +
>  		if (!list_empty(&bchan->vc.desc_issued) && !IS_BUSY(bchan))
>  			bam_start_dma(bchan);
>  	}
> @@ -1142,9 +1295,17 @@ static void bam_dma_work(struct work_struct *work)
>  static void bam_issue_pending(struct dma_chan *chan)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	int ret;
>  
>  	guard(spinlock_irqsave)(&bchan->vc.lock);
>  
> +	if (!bchan->locked) {
> +		ret = bam_setup_pipe_lock(bchan);

if (bam_setup_pipe_lock())?

- Mani

-- 
மணிவண்ணன் சதாசிவம்
Re: [PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking
Posted by Stephan Gerhold 3 weeks, 6 days ago
On Tue, Mar 10, 2026 at 04:44:19PM +0100, Bartosz Golaszewski wrote:
> Add support for BAM pipe locking. To that end: when starting DMA on an RX
> channel - prepend the existing queue of issued descriptors with an
> additional "dummy" command descriptor with the LOCK bit set. Once the
> transaction is done (no more issued descriptors), issue one more dummy
> descriptor with the UNLOCK bit.
> 
> We *must* wait until the transaction is signalled as done because we
> must not perform any writes into config registers while the engine is
> busy.
> 
> [...]
> diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
> index 83491e7c2f17d8c9d12a1a055baea7e3a0a75a53..627c85a2df4dcdbac247d831a4aef047c2189456 100644
> --- a/drivers/dma/qcom/bam_dma.c
> +++ b/drivers/dma/qcom/bam_dma.c
> [...]
> +static int bam_do_setup_pipe_lock(struct bam_chan *bchan, bool lock)
> +{
> +	struct bam_device *bdev = bchan->bdev;
> +	const struct bam_device_data *bdata = bdev->dev_data;
> +	struct bam_async_desc *lock_desc;
> +	struct bam_cmd_element *ce;
> +	struct scatterlist *sgl;
> +	unsigned long flag;
> +
> +	lockdep_assert_held(&bchan->vc.lock);
> +
> +	if (!bdata->pipe_lock_supported || !bchan->scratchpad_addr ||
> +	    bchan->slave.direction != DMA_MEM_TO_DEV)
> +		return 0;
> +
> +	if (lock) {
> +		sgl = &bchan->lock_sg;
> +		ce = &bchan->lock_ce;
> +		flag = DESC_FLAG_LOCK;
> +	} else {
> +		sgl = &bchan->unlock_sg;
> +		ce = &bchan->unlock_ce;
> +		flag = DESC_FLAG_UNLOCK;
> +	}
> +
> +	lock_desc = bam_make_lock_desc(bchan, sgl, ce, flag);
> +	if (!lock_desc)
> +		return -ENOMEM;
> +
> +	if (lock)
> +		list_add(&lock_desc->vd.node, &bchan->vc.desc_issued);
> +	else
> +		list_add_tail(&lock_desc->vd.node, &bchan->vc.desc_issued);
> +
> +	bchan->locked = lock;
> +
> +	return 0;
> +}
> +
> +static int bam_setup_pipe_lock(struct bam_chan *bchan)
> +{
> +	return bam_do_setup_pipe_lock(bchan, true);
> +}
> +
> +static int bam_setup_pipe_unlock(struct bam_chan *bchan)
> +{
> +	return bam_do_setup_pipe_lock(bchan, false);
> +}
> +
>  /**
>   * bam_start_dma - start next transaction
>   * @bchan: bam dma channel
> @@ -1121,6 +1266,7 @@ static void bam_dma_work(struct work_struct *work)
>  	struct bam_device *bdev = from_work(bdev, work, work);
>  	struct bam_chan *bchan;
>  	unsigned int i;
> +	int ret;
>  
>  	/* go through the channels and kick off transactions */
>  	for (i = 0; i < bdev->num_channels; i++) {
> @@ -1128,6 +1274,13 @@ static void bam_dma_work(struct work_struct *work)
>  
>  		guard(spinlock_irqsave)(&bchan->vc.lock);
>  
> +		if (list_empty(&bchan->vc.desc_issued) && bchan->locked) {
> +			ret = bam_setup_pipe_unlock(bchan);
> +			if (ret)
> +				dev_err(bchan->vc.chan.slave,
> +					"Failed to set up the pipe unlock descriptor\n");
> +		}
> +
>  		if (!list_empty(&bchan->vc.desc_issued) && !IS_BUSY(bchan))
>  			bam_start_dma(bchan);
>  	}

I'm not entirely sure if this actually guarantees waiting with the
unlock until the transaction is "done", for two reasons:

 1. &bchan->vc.desc_issued looks like a "TODO" list for descriptors we
    haven't fully managed to squeeze into the BAM FIFO yet. It doesn't
    tell you which descriptors have been consumed and finished
    processing inside the FIFO.

    Consider e.g. the following case: The client has issued a number of
    descriptors, they all fit into the FIFO. The first descriptor has a
    callback assigned, so we ask the BAM to send us an interrupt when it
    has been consumed. We get the interrupt for the first descriptor and
    process_channel_irqs() marks it as completed, the rest of the
    descriptors are still pending. &bchan->vc.desc_issued is empty, so
    you queue the unlock command before the rest of the descriptors have
    finished.

 2. From reading the BAM chapter in the APQ8016E TRM I get the
    impression that by default an interrupt for a descriptor just tells
    you that the descriptor was consumed by the BAM (and forwarded to
    the peripheral). If you want to guarantee that the transaction is
    actually done on the peripheral side before allowing writes into
    config registers, you would need to set the NWD (Notify When Done)
    bit (aka DMA_PREP_FENCE) on the last descriptor before the unlock
    command.

    NWD seems to stall descriptor processing until the peripheral
    signals completion, so this might allow you to immediately queue the
    unlock command like in v11. The downside is that you would need to
    make assumptions about the set of commands submitted by the client
    again... The downstream driver seems to set NWD on the data
    descriptor immediately before the UNLOCK command [1].

    The chapter in the APQ8016E TRM kind of contradicts itself
    sometimes, but there is this sentence for example: "On the data
    descriptor preceding command descriptor, NWD bit must be asserted in
    order to assure that all the data has been transferred and the
    peripheral is ready to be re-configured."

Thanks,
Stephan

[1]: https://git.codelinaro.org/clo/la/platform/vendor/qcom/opensource/securemsm-kernel/-/blob/fa55a96773d3fbfcd96beb2965efcaaae5697816/crypto-qti/qce50.c#L5361-5362
Re: [PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking
Posted by Bartosz Golaszewski 3 weeks, 6 days ago
On Wed, Mar 11, 2026 at 11:00 AM Stephan Gerhold
<stephan.gerhold@linaro.org> wrote:
>
> I'm not entirely sure if this actually guarantees waiting with the
> unlock until the transaction is "done", for two reasons:
>
>  1. &bchan->vc.desc_issued looks like a "TODO" list for descriptors we
>     haven't fully managed to squeeze into the BAM FIFO yet. It doesn't
>     tell you which descriptors have been consumed and finished
>     processing inside the FIFO.
>
>     Consider e.g. the following case: The client has issued a number of
>     descriptors, they all fit into the FIFO. The first descriptor has a
>     callback assigned, so we ask the BAM to send us an interrupt when it
>     has been consumed. We get the interrupt for the first descriptor and
>     process_channel_irqs() marks it as completed, the rest of the
>     descriptors are still pending. &bchan->vc.desc_issued is empty, so
>     you queue the unlock command before the rest of the descriptors have
>     finished.
>

Thanks for looking into it. Good catch, I think you're right.

>  2. From reading the BAM chapter in the APQ8016E TRM I get the
>     impression that by default an interrupt for a descriptor just tells
>     you that the descriptor was consumed by the BAM (and forwarded to
>     the peripheral). If you want to guarantee that the transaction is
>     actually done on the peripheral side before allowing writes into
>     config registers, you would need to set the NWD (Notify When Done)
>     bit (aka DMA_PREP_FENCE) on the last descriptor before the unlock
>     command.
>
>     NWD seems to stall descriptor processing until the peripheral
>     signals completion, so this might allow you to immediately queue the
>     unlock command like in v11. The downside is that you would need to
>     make assumptions about the set of commands submitted by the client
>     again... The downstream driver seems to set NWD on the data
>     descriptor immediately before the UNLOCK command [1].
>

If what we have in the queue is:

  [DATA] [DATA] [DATA] [CMD]

And we want to extend it with LOCK/UNLOCK like so:

  [LOCK] [DATA] [DATA] [DATA] [CMD] [UNLOCK]

Should the NWD go with the last DATA descriptor, or with the last descriptor,
period — whether data or command?

It's, again, not very clear from reading that part.

Bart

>     The chapter in the APQ8016E TRM kind of contradicts itself
>     sometimes, but there is this sentence for example: "On the data
>     descriptor preceding command descriptor, NWD bit must be asserted in
>     order to assure that all the data has been transferred and the
>     peripheral is ready to be re-configured."
>
> Thanks,
> Stephan
>
> [1]: https://git.codelinaro.org/clo/la/platform/vendor/qcom/opensource/securemsm-kernel/-/blob/fa55a96773d3fbfcd96beb2965efcaaae5697816/crypto-qti/qce50.c#L5361-5362
Re: [PATCH v12 05/12] dmaengine: qcom: bam_dma: add support for BAM locking
Posted by Stephan Gerhold 3 weeks, 6 days ago
On Wed, Mar 11, 2026 at 03:32:38AM -0700, Bartosz Golaszewski wrote:
> On Wed, Mar 11, 2026 at 11:00 AM Stephan Gerhold
> <stephan.gerhold@linaro.org> wrote:
> >
> > I'm not entirely sure if this actually guarantees waiting with the
> > unlock until the transaction is "done", for two reasons:
> >
> >  1. &bchan->vc.desc_issued looks like a "TODO" list for descriptors we
> >     haven't fully managed to squeeze into the BAM FIFO yet. It doesn't
> >     tell you which descriptors have been consumed and finished
> >     processing inside the FIFO.
> >
> >     Consider e.g. the following case: The client has issued a number of
> >     descriptors, they all fit into the FIFO. The first descriptor has a
> >     callback assigned, so we ask the BAM to send us an interrupt when it
> >     has been consumed. We get the interrupt for the first descriptor and
> >     process_channel_irqs() marks it as completed, the rest of the
> >     descriptors are still pending. &bchan->vc.desc_issued is empty, so
> >     you queue the unlock command before the rest of the descriptors have
> >     finished.
> >
> 
> Thanks for looking into it. Good catch, I think you're right.
> 
> >  2. From reading the BAM chapter in the APQ8016E TRM I get the
> >     impression that by default an interrupt for a descriptor just tells
> >     you that the descriptor was consumed by the BAM (and forwarded to
> >     the peripheral). If you want to guarantee that the transaction is
> >     actually done on the peripheral side before allowing writes into
> >     config registers, you would need to set the NWD (Notify When Done)
> >     bit (aka DMA_PREP_FENCE) on the last descriptor before the unlock
> >     command.
> >
> >     NWD seems to stall descriptor processing until the peripheral
> >     signals completion, so this might allow you to immediately queue the
> >     unlock command like in v11. The downside is that you would need to
> >     make assumptions about the set of commands submitted by the client
> >     again... The downstream driver seems to set NWD on the data
> >     descriptor immediately before the UNLOCK command [1].
> >
> 
> If what we have in the queue is:
> 
>   [DATA] [DATA] [DATA] [CMD]
> 
> And we want to extend it with LOCK/UNLOCK like so:
> 
>   [LOCK] [DATA] [DATA] [DATA] [CMD] [UNLOCK]
> 
> Should the NWD go with the last DATA descriptor or the last descriptor period
> whether data or command?
> 
> It's, again, not very clear from reading that part.
> 

I'm not sure, my impression is that the exact behavior of NWD is quite
specific to the actual peripheral (i.e. QCE, QPIC NAND, etc). In the
downstream drivers:

 - QCE seems to add NWD to the last data descriptor before the UNLOCK
   (as I wrote, it seems to queue command descriptors before data).

 - QPIC NAND has a dedicated "cmd" pipe that doesn't get any data
   descriptors, it specifies NWD always for the EXEC_CMD register write,
   which isn't even the last descriptor. This is also done in mainline
   already (see NAND_BAM_NWD in qcom_write_reg_dma() [1]).

It is possible that NWD works only when attached to certain descriptors
(when there is an actual operation running that gets completed by a
certain descriptor), so we might not be able to simply add NWD to the
last descriptor. :/

I suppose you could argue that "make sure engine does not get
re-configured while busy" is a requirement of QCE and not BAM, so
perhaps it would be easiest and safest if you just add DMA_PREP_FENCE to
the right descriptor inside the QCE driver. qcom_nandc has that already.

Thanks,
Stephan

[1]: https://elixir.bootlin.com/linux/v7.0-rc3/source/drivers/mtd/nand/qpic_common.c#L484