From: Kanchan Joshi <joshi.k@samsung.com>
blk_rq_dma_map API is costly for single-segment requests.
Avoid using it and map the bio_vec directly.
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
1 file changed, 60 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8d99a8f871ea..cf020de82962 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -216,6 +216,11 @@ struct nvme_queue {
struct completion delete_done;
};
+enum {
+ IOD_LARGE_DESCRIPTORS = 1, /* uses the full page sized descriptor pool */
+ IOD_SINGLE_SEGMENT = 2, /* single segment dma mapping */
+};
+
/*
* The nvme_iod describes the data in an I/O.
*/
@@ -224,7 +229,7 @@ struct nvme_iod {
struct nvme_command cmd;
bool aborted;
u8 nr_descriptors; /* # of PRP/SGL descriptors */
- bool large_descriptors; /* uses the full page sized descriptor pool */
+ unsigned int flags;
unsigned int total_len; /* length of the entire transfer */
unsigned int total_meta_len; /* length of the entire metadata transfer */
dma_addr_t meta_dma;
@@ -529,7 +534,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
struct nvme_iod *iod)
{
- if (iod->large_descriptors)
+ if (iod->flags & IOD_LARGE_DESCRIPTORS)
return dev->prp_page_pool;
return dev->prp_small_pool;
}
@@ -630,6 +635,15 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
+ dma_addr_t dma_addr;
+
+ if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
+ dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
+ dma_unmap_page(dev->dev, dma_addr, iod->total_len,
+ rq_dma_dir(req));
+ return;
+ }
if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
if (iod->cmd.common.flags & NVME_CMD_SGL_METABUF)
@@ -642,6 +656,41 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
nvme_free_descriptors(dev, req);
}
+static bool nvme_try_setup_prp_simple(struct nvme_dev *dev, struct request *req,
+ struct nvme_rw_command *cmnd,
+ struct blk_dma_iter *iter)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = req_bvec(req);
+ unsigned int first_prp_len;
+
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return false;
+ if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv.bv_len >
+ NVME_CTRL_PAGE_SIZE * 2)
+ return false;
+
+ iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iter->addr)) {
+ iter->status = BLK_STS_RESOURCE;
+ goto out;
+ }
+ iod->total_len = bv.bv_len;
+ cmnd->dptr.prp1 = cpu_to_le64(iter->addr);
+
+ first_prp_len = NVME_CTRL_PAGE_SIZE -
+ (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1));
+ if (bv.bv_len > first_prp_len)
+ cmnd->dptr.prp2 = cpu_to_le64(iter->addr + first_prp_len);
+ else
+ cmnd->dptr.prp2 = 0;
+
+ iter->status = BLK_STS_OK;
+ iod->flags |= IOD_SINGLE_SEGMENT;
+out:
+ return true;
+}
+
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct request *req)
{
@@ -652,6 +701,12 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
dma_addr_t prp1_dma, prp2_dma = 0;
unsigned int prp_len, i;
__le64 *prp_list;
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
+
+ if (nr_segments == 1) {
+ if (nvme_try_setup_prp_simple(dev, req, cmnd, &iter))
+ return iter.status;
+ }
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
@@ -693,7 +748,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) >
NVME_SMALL_DESCRIPTOR_SIZE / sizeof(__le64))
- iod->large_descriptors = true;
+ iod->flags |= IOD_LARGE_DESCRIPTORS;
prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
&prp2_dma);
@@ -808,7 +863,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
}
if (entries > NVME_SMALL_DESCRIPTOR_SIZE / sizeof(*sg_list))
- iod->large_descriptors = true;
+ iod->flags |= IOD_LARGE_DESCRIPTORS;
sg_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC, &sgl_dma);
if (!sg_list)
@@ -932,7 +987,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
iod->aborted = false;
iod->nr_descriptors = 0;
- iod->large_descriptors = false;
+ iod->flags = 0;
iod->total_len = 0;
iod->total_meta_len = 0;
--
2.49.0
On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> blk_rq_dma_map API is costly for single-segment requests.
> Avoid using it and map the bio_vec directly.

This needs to be folded into the earlier patches or split into prep patches
instead of undoing work done earlier, preferably combined with a bit of code
movement so that the new nvme_try_setup_prp_simple stays in the same place as
before and the diff shows it reusing code.

E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
to already use the flags instead of my boolean, and maybe include abort in
the flags instead of using a separate bool so that we don't increase the iod
size.

Slot in a new patch after that dropping the single SGL segment fast path if
we think we don't need it, although if we need the PRP one I suspect it would
still be very helpful as well.

Add a patch for the try_ version if we want it, although when keeping the
optimization for SGLs as well it will look a bit different.

I'm happy to give away my patch authorship credits if that helps with the
folding.
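For example, something along these lines (untested sketch just to illustrate
the idea; IOD_ABORTED is an assumed name and the exact bit values don't
matter):

enum {
	IOD_ABORTED		= 1U << 0,	/* request was aborted */
	IOD_LARGE_DESCRIPTORS	= 1U << 1,	/* full page sized descriptor pool */
	IOD_SINGLE_SEGMENT	= 1U << 2,	/* single segment dma mapping */
};

so that both "bool aborted" and "bool large_descriptors" collapse into one
flags word and the iod doesn't grow.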
On Tue, Apr 22, 2025 at 06:39:56AM +0200, Christoph Hellwig wrote:
> On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
>
> This needs to be folded into the earlier patches or split into prep patches
> instead of undoing work done earlier, preferably combined with a bit of code
> movement so that the new nvme_try_setup_prp_simple stays in the same place
> as before and the diff shows it reusing code.
>
> E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
> to already use the flags instead of my boolean, and maybe include abort in
> the flags instead of using a separate bool so that we don't increase the iod
> size.
>
> Slot in a new patch after that dropping the single SGL segment fast path if
> we think we don't need it, although if we need the PRP one I suspect it
> would still be very helpful as well.
>
> Add a patch for the try_ version if we want it, although when keeping the
> optimization for SGLs as well it will look a bit different.

I uploaded a new tag, dma-split-Apr-22, with the changes. In the end, I
decided to keep the optimized SGL path too, but didn't create a separate
patch for the try_* variants as they require the block iterator, which is
only introduced in the conversion patch.

> I'm happy to give away my patch authorship credits if that helps with the
> folding.

I kept you as the author. Please advise if I need to change it to someone
else.

Thanks
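For reference, a single-segment SGL fast path along the lines of the PRP one
would look roughly like this (untested sketch, not necessarily the exact code
in the tag):

static bool nvme_try_setup_sgl_simple(struct nvme_dev *dev, struct request *req,
				      struct nvme_rw_command *cmnd,
				      struct blk_dma_iter *iter)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct bio_vec bv = req_bvec(req);

	if (is_pci_p2pdma_page(bv.bv_page))
		return false;

	iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return true;
	}
	iod->total_len = bv.bv_len;

	/* a single SGL data block descriptor covers the whole transfer */
	cmnd->flags = NVME_CMD_SGL_METABUF;
	cmnd->dptr.sgl.addr = cpu_to_le64(iter->addr);
	cmnd->dptr.sgl.length = cpu_to_le32(bv.bv_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;

	iter->status = BLK_STS_OK;
	iod->flags |= IOD_SINGLE_SEGMENT;
	return true;
}

The unmap side for this case then has to take the address from
iod->cmd.common.dptr.sgl.addr instead of prp1.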
On Tue, Apr 22, 2025 at 06:39:56AM +0200, Christoph Hellwig wrote:
> On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
>
> This needs to be folded into the earlier patches or split into prep patches
> instead of undoing work done earlier, preferably combined with a bit of code
> movement so that the new nvme_try_setup_prp_simple stays in the same place
> as before and the diff shows it reusing code.
>
> E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
> to already use the flags instead of my boolean, and maybe include abort in
> the flags instead of using a separate bool so that we don't increase the iod
> size.
>
> Slot in a new patch after that dropping the single SGL segment fast path if
> we think we don't need it, although if we need the PRP one I suspect it
> would still be very helpful as well.
>
> Add a patch for the try_ version if we want it, although when keeping the
> optimization for SGLs as well it will look a bit different.
>
> I'm happy to give away my patch authorship credits if that helps with the
> folding.

I used this patch separation for two reasons:
1. Your conversion patch is absolutely correct and this patch is an
   optimization.
2. I wanted to make sure that I post to the ML exactly the code that was
   tested.

I'll try to split/fold this patch now.

Thanks
On 4/18/25 15:47, Leon Romanovsky wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> blk_rq_dma_map API is costly for single-segment requests.
> Avoid using it and map the bio_vec directly.
>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
> 1 file changed, 60 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 8d99a8f871ea..cf020de82962 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -216,6 +216,11 @@ struct nvme_queue {
> struct completion delete_done;
> };
>
> +enum {
> + IOD_LARGE_DESCRIPTORS = 1, /* uses the full page sized descriptor pool */
> + IOD_SINGLE_SEGMENT = 2, /* single segment dma mapping */
> +};
> +
> /*
> * The nvme_iod describes the data in an I/O.
> */
> @@ -224,7 +229,7 @@ struct nvme_iod {
> struct nvme_command cmd;
> bool aborted;
> u8 nr_descriptors; /* # of PRP/SGL descriptors */
> - bool large_descriptors; /* uses the full page sized descriptor pool */
> + unsigned int flags;
> unsigned int total_len; /* length of the entire transfer */
> unsigned int total_meta_len; /* length of the entire metadata transfer */
> dma_addr_t meta_dma;
> @@ -529,7 +534,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
> static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
> struct nvme_iod *iod)
> {
> - if (iod->large_descriptors)
> + if (iod->flags & IOD_LARGE_DESCRIPTORS)
> return dev->prp_page_pool;
> return dev->prp_small_pool;
> }
> @@ -630,6 +635,15 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
> static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> {
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> + dma_addr_t dma_addr;
> +
> + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
you need to count the segments again here ? Looking at the flag only should be
enough, no ?
> + dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> + dma_unmap_page(dev->dev, dma_addr, iod->total_len,
> + rq_dma_dir(req));
> + return;
> + }
>
> if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
> if (iod->cmd.common.flags & NVME_CMD_SGL_METABUF)
> @@ -642,6 +656,41 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> nvme_free_descriptors(dev, req);
> }
>
> +static bool nvme_try_setup_prp_simple(struct nvme_dev *dev, struct request *req,
> + struct nvme_rw_command *cmnd,
> + struct blk_dma_iter *iter)
> +{
> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> + struct bio_vec bv = req_bvec(req);
> + unsigned int first_prp_len;
> +
> + if (is_pci_p2pdma_page(bv.bv_page))
> + return false;
> + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv.bv_len >
> + NVME_CTRL_PAGE_SIZE * 2)
> + return false;
> +
> + iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> + if (dma_mapping_error(dev->dev, iter->addr)) {
> + iter->status = BLK_STS_RESOURCE;
> + goto out;
> + }
> + iod->total_len = bv.bv_len;
> + cmnd->dptr.prp1 = cpu_to_le64(iter->addr);
> +
> + first_prp_len = NVME_CTRL_PAGE_SIZE -
> + (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1));
> + if (bv.bv_len > first_prp_len)
> + cmnd->dptr.prp2 = cpu_to_le64(iter->addr + first_prp_len);
> + else
> + cmnd->dptr.prp2 = 0;
> +
> + iter->status = BLK_STS_OK;
> + iod->flags |= IOD_SINGLE_SEGMENT;
> +out:
> + return true;
> +}
> +
> static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
> struct request *req)
> {
> @@ -652,6 +701,12 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
> dma_addr_t prp1_dma, prp2_dma = 0;
> unsigned int prp_len, i;
> __le64 *prp_list;
> + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> +
> + if (nr_segments == 1) {
> + if (nvme_try_setup_prp_simple(dev, req, cmnd, &iter))
> + return iter.status;
> + }
>
> if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
> return iter.status;
> @@ -693,7 +748,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
>
> if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) >
> NVME_SMALL_DESCRIPTOR_SIZE / sizeof(__le64))
> - iod->large_descriptors = true;
> + iod->flags |= IOD_LARGE_DESCRIPTORS;
>
> prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
> &prp2_dma);
> @@ -808,7 +863,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
> }
>
> if (entries > NVME_SMALL_DESCRIPTOR_SIZE / sizeof(*sg_list))
> - iod->large_descriptors = true;
> + iod->flags |= IOD_LARGE_DESCRIPTORS;
>
> sg_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC, &sgl_dma);
> if (!sg_list)
> @@ -932,7 +987,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
>
> iod->aborted = false;
> iod->nr_descriptors = 0;
> - iod->large_descriptors = false;
> + iod->flags = 0;
> iod->total_len = 0;
> iod->total_meta_len = 0;
>
--
Damien Le Moal
Western Digital Research
> > + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
>
> nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
> IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
> you need to count the segments again here ? Looking at the flag only should be
> enough, no ?
Yes, the flag will be enough.
I started with nr_segments first, but felt the need for a flag when I got to
handling the unmap part.
This can be changed if the series requires another iteration.
Or I can do it as part of the cleanup I need to do anyway to replace the
"iod->aborted" field with a flag.
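Something like this, i.e. the check at the top of nvme_unmap_data() would
reduce to (untested; same body as in the patch, only the recount dropped):

	if (iod->flags & IOD_SINGLE_SEGMENT) {
		dma_unmap_page(dev->dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
			       iod->total_len, rq_dma_dir(req));
		return;
	}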
--
Kanchan
On Fri, Apr 18, 2025 at 05:02:38PM +0900, Damien Le Moal wrote:
> On 4/18/25 15:47, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
> >
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 60 insertions(+), 5 deletions(-)
<...>
> > static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> > {
> > struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> > + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> > + dma_addr_t dma_addr;
> > +
> > + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
>
> nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
> IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
> you need to count the segments again here ? Looking at the flag only should be
> enough, no ?
Yes, you are right. There is no need for the extra check of nr_segments;
it is enough to rely on IOD_SINGLE_SEGMENT.
Thanks