drivers/nvme/host/pci.c | 51 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 37 deletions(-)
Fixes kernel block errors originating from the hard-coded 256-byte
alignment of dma_pool_create(). The NVMe specification requires a PRP
List PBAO offset field of 0h, i.e. a PRP List must be aligned to the
configured 4096-byte memory page size.
Essentially reverts commit 99802a7aee2b ("NVMe: Optimise memory usage
for I/Os between 4k and 128k")
Resolved by using default PRP pool which is properly aligned.
Signed-off-by: Matthew Waltz <matthewwaltzis@gmail.com>
---
drivers/nvme/host/pci.c | 51 +++++++++++------------------------------
1 file changed, 14 insertions(+), 37 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a99ed680915..4fcb159c7df3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -119,7 +119,6 @@ struct nvme_dev {
u32 __iomem *dbs;
struct device *dev;
struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -228,7 +227,7 @@ struct nvme_iod {
struct nvme_queue *nvmeq;
bool use_sgl;
int aborted;
- int npages; /* In the PRP list. 0 means small pool in use */
+ int npages; /* In the PRP list. */
int nents; /* Used in scatterlist */
dma_addr_t first_dma;
unsigned int dma_len; /* length of single DMA segment mapping */
@@ -598,10 +597,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->nents);
nvme_unmap_sg(dev, req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- iod->first_dma);
- else if (iod->use_sgl)
+ if (iod->use_sgl)
nvme_free_sgls(dev, req);
else
nvme_free_prps(dev, req);
@@ -635,7 +631,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
__le64 *prp_list;
void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
- int nprps, i;
+ int i;
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
@@ -657,14 +653,8 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->npages = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->npages = 1;
- }
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
@@ -753,13 +743,8 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->npages = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->npages = 1;
- }
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list) {
@@ -2727,7 +2712,7 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
+static int nvme_setup_prp_pool(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
NVME_CTRL_PAGE_SIZE,
@@ -2735,20 +2720,12 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev)
if (!dev->prp_page_pool)
return -ENOMEM;
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, 256, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
return 0;
}
-static void nvme_release_prp_pools(struct nvme_dev *dev)
+static void nvme_release_prp_pool(struct nvme_dev *dev)
{
dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3080,7 +3057,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
- result = nvme_setup_prp_pools(dev);
+ result = nvme_setup_prp_pool(dev);
if (result)
goto unmap;
@@ -3109,7 +3086,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
GFP_KERNEL, node);
if (!dev->iod_mempool) {
result = -ENOMEM;
- goto release_pools;
+ goto release_pool;
}
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
@@ -3126,8 +3103,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
release_mempool:
mempool_destroy(dev->iod_mempool);
- release_pools:
- nvme_release_prp_pools(dev);
+ release_pool:
+ nvme_release_prp_pool(dev);
unmap:
nvme_dev_unmap(dev);
put_pci:
@@ -3198,7 +3175,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
- nvme_release_prp_pools(dev);
+ nvme_release_prp_pool(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
--
2.25.1
On Sat, Feb 12, 2022 at 01:06:49PM -0700, Matthew Waltz wrote: > Fixes kernel block errors originating from the hard-coded 256-byte > alignment of dma_pool_create(). The NVMe specification requires a PRP > List PBAO offset field of 0h, i.e. a PRP List must be aligned to the > configured 4096-byte memory page size. That is not correct. The spec (2.0a section 4.1.1) says: The first PRP List entry [...] shall be qword aligned and may also have a non-zero offset within the memory page.
That is the wrong part of the specification, describing the PRP entries,
not the list pointer. The relevant portion of the spec (2.0a figure
109 page 129) says:
If this entry is not the first PRP entry in the command or a
PRP List pointer in a command, then the Offset portion of this
field shall be cleared to 0h
On Sat, Feb 12, 2022 at 1:48 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Sat, Feb 12, 2022 at 01:06:49PM -0700, Matthew Waltz wrote:
> > Fixes kernel block errors originating from the hard-coded 256-byte
> > alignment of dma_pool_create(). The NVMe specification requires a PRP
> > List PBAO offset field of 0h, i.e. a PRP List must be aligned to the
> > configured 4096-byte memory page size.
>
> That is not correct. The spec (2.0a section 4.1.1) says:
>
> The first PRP List entry [...] shall be qword aligned and may also
> have a non-zero offset within the memory page.
On Sat, Feb 12, 2022 at 01:53:59PM -0700, Matt Waltz wrote: > That is the wrong part of the specification, describing the PRP entries, > not the list pointer. The text for qword alignment rules specifically says "PRP List entry", as is associated with command's prp2 field. > The relevant portion of the spec (2.0a figure > 109 page 129) says: > > If this entry is not the first PRP entry in the command or a > PRP List pointer in a command, then the Offset portion of this > field shall be cleared to 0h That only applies to chaining list elements. It does not apply to the "PRP list pointer in a command". The driver allocates from the 256B dma pool only for the command's PRP list pointer, so we're fine.
On Sat, Feb 12, 2022 at 2:03 PM Keith Busch <kbusch@kernel.org> wrote: > That only applies to chaining list elements. It does not apply to the > "PRP list pointer in a command". The driver allocates from the 256B dma > pool only for the command's PRP list pointer, so we're fine. Ah yes, I was confused by the PRP2 definition which says it can be a "PRP list pointer", now I feel extremely silly. Guess I'll need to look elsewhere for a fix for this bug I am encountering, thank you for the clarity!
© 2016 - 2026 Red Hat, Inc.