From: Kanchan Joshi <joshi.k@samsung.com>
blk_rq_dma_map API is costly for single-segment requests.
Avoid using it and map the bio_vec directly.
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
1 file changed, 60 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8d99a8f871ea..cf020de82962 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -216,6 +216,11 @@ struct nvme_queue {
struct completion delete_done;
};
+enum {
+ IOD_LARGE_DESCRIPTORS = 1, /* uses the full page sized descriptor pool */
+ IOD_SINGLE_SEGMENT = 2, /* single segment dma mapping */
+};
+
/*
* The nvme_iod describes the data in an I/O.
*/
@@ -224,7 +229,7 @@ struct nvme_iod {
struct nvme_command cmd;
bool aborted;
u8 nr_descriptors; /* # of PRP/SGL descriptors */
- bool large_descriptors; /* uses the full page sized descriptor pool */
+ unsigned int flags;
unsigned int total_len; /* length of the entire transfer */
unsigned int total_meta_len; /* length of the entire metadata transfer */
dma_addr_t meta_dma;
@@ -529,7 +534,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
struct nvme_iod *iod)
{
- if (iod->large_descriptors)
+ if (iod->flags & IOD_LARGE_DESCRIPTORS)
return dev->prp_page_pool;
return dev->prp_small_pool;
}
@@ -630,6 +635,15 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
+ dma_addr_t dma_addr;
+
+ if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
+ dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
+ dma_unmap_page(dev->dev, dma_addr, iod->total_len,
+ rq_dma_dir(req));
+ return;
+ }
if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
if (iod->cmd.common.flags & NVME_CMD_SGL_METABUF)
@@ -642,6 +656,41 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
nvme_free_descriptors(dev, req);
}
+static bool nvme_try_setup_prp_simple(struct nvme_dev *dev, struct request *req,
+ struct nvme_rw_command *cmnd,
+ struct blk_dma_iter *iter)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = req_bvec(req);
+ unsigned int first_prp_len;
+
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return false;
+ if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv.bv_len >
+ NVME_CTRL_PAGE_SIZE * 2)
+ return false;
+
+ iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iter->addr)) {
+ iter->status = BLK_STS_RESOURCE;
+ goto out;
+ }
+ iod->total_len = bv.bv_len;
+ cmnd->dptr.prp1 = cpu_to_le64(iter->addr);
+
+ first_prp_len = NVME_CTRL_PAGE_SIZE -
+ (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1));
+ if (bv.bv_len > first_prp_len)
+ cmnd->dptr.prp2 = cpu_to_le64(iter->addr + first_prp_len);
+ else
+ cmnd->dptr.prp2 = 0;
+
+ iter->status = BLK_STS_OK;
+ iod->flags |= IOD_SINGLE_SEGMENT;
+out:
+ return true;
+}
+
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct request *req)
{
@@ -652,6 +701,12 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
dma_addr_t prp1_dma, prp2_dma = 0;
unsigned int prp_len, i;
__le64 *prp_list;
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
+
+ if (nr_segments == 1) {
+ if (nvme_try_setup_prp_simple(dev, req, cmnd, &iter))
+ return iter.status;
+ }
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
@@ -693,7 +748,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) >
NVME_SMALL_DESCRIPTOR_SIZE / sizeof(__le64))
- iod->large_descriptors = true;
+ iod->flags |= IOD_LARGE_DESCRIPTORS;
prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
&prp2_dma);
@@ -808,7 +863,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
}
if (entries > NVME_SMALL_DESCRIPTOR_SIZE / sizeof(*sg_list))
- iod->large_descriptors = true;
+ iod->flags |= IOD_LARGE_DESCRIPTORS;
sg_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC, &sgl_dma);
if (!sg_list)
@@ -932,7 +987,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
iod->aborted = false;
iod->nr_descriptors = 0;
- iod->large_descriptors = false;
+ iod->flags = 0;
iod->total_len = 0;
iod->total_meta_len = 0;
--
2.49.0
On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> blk_rq_dma_map API is costly for single-segment requests.
> Avoid using it and map the bio_vec directly.

This needs to be folded into the earlier patches or split into prep patches
instead of undoing work done earlier, preferably combined with a bit of code
movement so that the new nvme_try_setup_prp_simple stays in the same place as
before and the diff shows it reusing code.

E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
to already use the flags instead of my boolean, and maybe include abort in
the flags instead of using a separate bool so that we don't increase the iod
size.

Slot in a new patch after that dropping the single SGL segment fast path if
we think we don't need it, although if we need the PRP one I suspect it would
still be very helpful as well.

Add a patch for the try_ version if we want it, although when keeping the
optimization for SGLs as well it will look a bit different.

I'm happy to give away my patch authorship credits if that helps with the
folding.
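For example, something along these lines (untested sketch just to illustrate
the idea; IOD_ABORTED is an assumed name and the exact bit values don't
matter):

enum {
	IOD_ABORTED		= 1U << 0,	/* request was aborted */
	IOD_LARGE_DESCRIPTORS	= 1U << 1,	/* full page sized descriptor pool */
	IOD_SINGLE_SEGMENT	= 1U << 2,	/* single segment dma mapping */
};

so that both "bool aborted" and "bool large_descriptors" collapse into one
flags word and the iod doesn't grow.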
On Tue, Apr 22, 2025 at 06:39:56AM +0200, Christoph Hellwig wrote:
> On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
>
> This needs to be folded into the earlier patches or split into prep patches
> instead of undoing work done earlier, preferably combined with a bit of code
> movement so that the new nvme_try_setup_prp_simple stays in the same place
> as before and the diff shows it reusing code.
>
> E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
> to already use the flags instead of my boolean, and maybe include abort in
> the flags instead of using a separate bool so that we don't increase the iod
> size.
>
> Slot in a new patch after that dropping the single SGL segment fast path if
> we think we don't need it, although if we need the PRP one I suspect it
> would still be very helpful as well.
>
> Add a patch for the try_ version if we want it, although when keeping the
> optimization for SGLs as well it will look a bit different.

I uploaded a new tag, dma-split-Apr-22, with the changes. In the end, I
decided to keep the optimized SGL path too, but didn't create a separate
patch for the try_* variants as they require the block iterator, which is
only introduced in the conversion patch.

> I'm happy to give away my patch authorship credits if that helps with the
> folding.

I kept you as the author. Please advise if I need to change it to someone
else.

Thanks
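For reference, a single-segment SGL fast path along the lines of the PRP one
would look roughly like this (untested sketch, not necessarily the exact code
in the tag):

static bool nvme_try_setup_sgl_simple(struct nvme_dev *dev, struct request *req,
				      struct nvme_rw_command *cmnd,
				      struct blk_dma_iter *iter)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct bio_vec bv = req_bvec(req);

	if (is_pci_p2pdma_page(bv.bv_page))
		return false;

	iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return true;
	}
	iod->total_len = bv.bv_len;

	/* a single SGL data block descriptor covers the whole transfer */
	cmnd->flags = NVME_CMD_SGL_METABUF;
	cmnd->dptr.sgl.addr = cpu_to_le64(iter->addr);
	cmnd->dptr.sgl.length = cpu_to_le32(bv.bv_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;

	iter->status = BLK_STS_OK;
	iod->flags |= IOD_SINGLE_SEGMENT;
	return true;
}

The unmap side for this case then has to take the address from
iod->cmd.common.dptr.sgl.addr instead of prp1.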
On Tue, Apr 22, 2025 at 06:39:56AM +0200, Christoph Hellwig wrote:
> On Fri, Apr 18, 2025 at 09:47:54AM +0300, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
>
> This needs to be folded into the earlier patches or split into prep patches
> instead of undoing work done earlier, preferably combined with a bit of code
> movement so that the new nvme_try_setup_prp_simple stays in the same place
> as before and the diff shows it reusing code.
>
> E.g. change "nvme-pci: use a better encoding for small prp pool allocations"
> to already use the flags instead of my boolean, and maybe include abort in
> the flags instead of using a separate bool so that we don't increase the iod
> size.
>
> Slot in a new patch after that dropping the single SGL segment fast path if
> we think we don't need it, although if we need the PRP one I suspect it
> would still be very helpful as well.
>
> Add a patch for the try_ version if we want it, although when keeping the
> optimization for SGLs as well it will look a bit different.
>
> I'm happy to give away my patch authorship credits if that helps with the
> folding.

I used this patch separation for two reasons:
1. Your conversion patch is absolutely correct and this patch is an
   optimization.
2. I wanted to make sure that I post to the ML exactly the code that was
   tested.

I'll try to split/fold this patch now.

Thanks
On 4/18/25 15:47, Leon Romanovsky wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> blk_rq_dma_map API is costly for single-segment requests.
> Avoid using it and map the bio_vec directly.
>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
> 1 file changed, 60 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 8d99a8f871ea..cf020de82962 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -216,6 +216,11 @@ struct nvme_queue {
> struct completion delete_done;
> };
>
> +enum {
> + IOD_LARGE_DESCRIPTORS = 1, /* uses the full page sized descriptor pool */
> + IOD_SINGLE_SEGMENT = 2, /* single segment dma mapping */
> +};
> +
> /*
> * The nvme_iod describes the data in an I/O.
> */
> @@ -224,7 +229,7 @@ struct nvme_iod {
> struct nvme_command cmd;
> bool aborted;
> u8 nr_descriptors; /* # of PRP/SGL descriptors */
> - bool large_descriptors; /* uses the full page sized descriptor pool */
> + unsigned int flags;
> unsigned int total_len; /* length of the entire transfer */
> unsigned int total_meta_len; /* length of the entire metadata transfer */
> dma_addr_t meta_dma;
> @@ -529,7 +534,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
> static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
> struct nvme_iod *iod)
> {
> - if (iod->large_descriptors)
> + if (iod->flags & IOD_LARGE_DESCRIPTORS)
> return dev->prp_page_pool;
> return dev->prp_small_pool;
> }
> @@ -630,6 +635,15 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
> static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> {
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> + dma_addr_t dma_addr;
> +
> + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
you need to count the segments again here ? Looking at the flag only should be
enough, no ?
> + dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> + dma_unmap_page(dev->dev, dma_addr, iod->total_len,
> + rq_dma_dir(req));
> + return;
> + }
>
> if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
> if (iod->cmd.common.flags & NVME_CMD_SGL_METABUF)
> @@ -642,6 +656,41 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> nvme_free_descriptors(dev, req);
> }
>
> +static bool nvme_try_setup_prp_simple(struct nvme_dev *dev, struct request *req,
> + struct nvme_rw_command *cmnd,
> + struct blk_dma_iter *iter)
> +{
> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> + struct bio_vec bv = req_bvec(req);
> + unsigned int first_prp_len;
> +
> + if (is_pci_p2pdma_page(bv.bv_page))
> + return false;
> + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv.bv_len >
> + NVME_CTRL_PAGE_SIZE * 2)
> + return false;
> +
> + iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> + if (dma_mapping_error(dev->dev, iter->addr)) {
> + iter->status = BLK_STS_RESOURCE;
> + goto out;
> + }
> + iod->total_len = bv.bv_len;
> + cmnd->dptr.prp1 = cpu_to_le64(iter->addr);
> +
> + first_prp_len = NVME_CTRL_PAGE_SIZE -
> + (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1));
> + if (bv.bv_len > first_prp_len)
> + cmnd->dptr.prp2 = cpu_to_le64(iter->addr + first_prp_len);
> + else
> + cmnd->dptr.prp2 = 0;
> +
> + iter->status = BLK_STS_OK;
> + iod->flags |= IOD_SINGLE_SEGMENT;
> +out:
> + return true;
> +}
> +
> static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
> struct request *req)
> {
> @@ -652,6 +701,12 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
> dma_addr_t prp1_dma, prp2_dma = 0;
> unsigned int prp_len, i;
> __le64 *prp_list;
> + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> +
> + if (nr_segments == 1) {
> + if (nvme_try_setup_prp_simple(dev, req, cmnd, &iter))
> + return iter.status;
> + }
>
> if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
> return iter.status;
> @@ -693,7 +748,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
>
> if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) >
> NVME_SMALL_DESCRIPTOR_SIZE / sizeof(__le64))
> - iod->large_descriptors = true;
> + iod->flags |= IOD_LARGE_DESCRIPTORS;
>
> prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
> &prp2_dma);
> @@ -808,7 +863,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
> }
>
> if (entries > NVME_SMALL_DESCRIPTOR_SIZE / sizeof(*sg_list))
> - iod->large_descriptors = true;
> + iod->flags |= IOD_LARGE_DESCRIPTORS;
>
> sg_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC, &sgl_dma);
> if (!sg_list)
> @@ -932,7 +987,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
>
> iod->aborted = false;
> iod->nr_descriptors = 0;
> - iod->large_descriptors = false;
> + iod->flags = 0;
> iod->total_len = 0;
> iod->total_meta_len = 0;
>
--
Damien Le Moal
Western Digital Research
> > + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
>
> nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
> IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
> you need to count the segments again here ? Looking at the flag only should be
> enough, no ?
Yes, the flag will be enough.
I started with nr_segments first, but felt the need for a flag when I got to
handling the unmap part.
This can be changed if the series requires another iteration.
Or I can do it as part of the cleanup I need to do anyway to replace the
"iod->aborted" field with a flag.
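Something like this, i.e. the check at the top of nvme_unmap_data() would
reduce to (untested; same body as in the patch, only the recount dropped):

	if (iod->flags & IOD_SINGLE_SEGMENT) {
		dma_unmap_page(dev->dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
			       iod->total_len, rq_dma_dir(req));
		return;
	}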
--
Kanchan
On Fri, Apr 18, 2025 at 05:02:38PM +0900, Damien Le Moal wrote:
> On 4/18/25 15:47, Leon Romanovsky wrote:
> > From: Kanchan Joshi <joshi.k@samsung.com>
> >
> > blk_rq_dma_map API is costly for single-segment requests.
> > Avoid using it and map the bio_vec directly.
> >
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 60 insertions(+), 5 deletions(-)
<...>
> > static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
> > {
> > struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> > + unsigned int nr_segments = blk_rq_nr_phys_segments(req);
> > + dma_addr_t dma_addr;
> > +
> > + if (nr_segments == 1 && (iod->flags & IOD_SINGLE_SEGMENT)) {
>
> nvme_pci_setup_prps() calls nvme_try_setup_prp_simple() which sets
> IOD_SINGLE_SEGMENT if and only if the req has a single phys segment. So why do
> you need to count the segments again here ? Looking at the flag only should be
> enough, no ?
Yes, you are right. There is no need for the extra check of nr_segments;
it is enough to rely on IOD_SINGLE_SEGMENT.
Thanks