Use async function calls to make namespace scanning happen in parallel.
Without the patch, NVMe namespaces are scanned serially, so it can take
a long time for all of a controller's namespaces to become available,
especially with a slower (TCP) interface and a large number of
namespaces.
It is not uncommon to have large numbers (hundreds or thousands) of
namespaces on nvme-of storage servers.
The time it took for all namespaces to show up after connecting (via
TCP) to a controller with 1002 namespaces was measured on one system:
  network latency   without patch   with patch
         0                 6s            1s
        50ms             210s           10s
       100ms             417s           18s
Measurements taken on another system show the effect of the patch on the
time nvme_scan_work() took to complete, when connecting to a Linux
nvme-of target with varying numbers of namespaces, on a network with
400us of latency.
  namespaces   without patch   with patch
        1            16ms          14ms
        2            24ms          16ms
        4            49ms          22ms
        8           101ms          33ms
       16           207ms          56ms
      100            1.4s          0.6s
     1000           12.9s          2.0s
On the same system, connecting to a local PCIe NVMe drive (a Samsung
PM1733) instead of a network target:
  namespaces   without patch   with patch
        1            13ms          12ms
        2            41ms          13ms
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
---
changes from V3:
* changed "next_idx" to "next_nsid" in async_scan_info (cosmetic)
* added comments to struct async_scan_info
* changed nvme_scan_ns_list() to use local ns_list pointer
changes from V2:
* make a separate function nvme_scan_ns_async() that calls
nvme_scan_ns(), instead of modifying nvme_scan_ns()
* only scan asynchronously from nvme_scan_ns_list(), not from
nvme_scan_ns_sequential()
* provide more timing data in the commit message
changes from V1:
* remove module param to enable/disable async scanning
* add scan time measurements to commit message
drivers/nvme/host/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 782090ce0bc1..dc43146dc03d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
+#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
@@ -3952,6 +3953,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
}
+/*
+ * struct async_scan_info - keeps track of controller & NSIDs to scan
+ * @ctrl: Controller on which namespaces are being scanned
+ * @next_nsid: Index of next NSID to scan in ns_list
+ * @ns_list: Pointer to list of NSIDs to scan
+ *
+ * Note: There is a single async_scan_info structure shared by all instances
+ * of nvme_scan_ns_async() scanning a given controller, so the atomic
+ * operations on next_nsid are critical to ensure each instance scans a unique
+ * NSID.
+ */
+struct async_scan_info {
+ struct nvme_ctrl *ctrl;
+ atomic_t next_nsid;
+ __le32 *ns_list;
+};
+
+static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
+{
+ struct async_scan_info *scan_info = data;
+ int idx;
+ u32 nsid;
+
+ idx = (u32)atomic_fetch_add(1, &scan_info->next_nsid);
+ nsid = le32_to_cpu(scan_info->ns_list[idx]);
+
+ nvme_scan_ns(scan_info->ctrl, nsid);
+}
+
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
@@ -3978,11 +4008,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
__le32 *ns_list;
u32 prev = 0;
int ret = 0, i;
+ ASYNC_DOMAIN(domain);
+ struct async_scan_info scan_info;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
+ scan_info.ctrl = ctrl;
+ scan_info.ns_list = ns_list;
for (;;) {
struct nvme_command cmd = {
.identify.opcode = nvme_admin_identify,
@@ -3998,19 +4032,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
goto free;
}
+ atomic_set(&scan_info.next_nsid, 0);
for (i = 0; i < nr_entries; i++) {
u32 nsid = le32_to_cpu(ns_list[i]);
if (!nsid) /* end of the list? */
goto out;
- nvme_scan_ns(ctrl, nsid);
+ async_schedule_domain(nvme_scan_ns_async, &scan_info,
+ &domain);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
+ async_synchronize_full_domain(&domain);
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
+ async_synchronize_full_domain(&domain);
kfree(ns_list);
return ret;
}
--
2.39.3
Hi,
two tiny nitpicks that can probably be fixed during patch application.
On 2024-07-17 13:55:50+0000, Stuart Hayes wrote:
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 782090ce0bc1..dc43146dc03d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4,6 +4,7 @@
> * Copyright (c) 2011-2014, Intel Corporation.
> */
>
> +#include <linux/async.h>
> #include <linux/blkdev.h>
> #include <linux/blk-mq.h>
> #include <linux/blk-integrity.h>
> @@ -3952,6 +3953,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> }
> }
>
> +/*
Kernel-doc with /** ?
> + * struct async_scan_info - keeps track of controller & NSIDs to scan
> + * @ctrl: Controller on which namespaces are being scanned
> + * @next_nsid: Index of next NSID to scan in ns_list
> + * @ns_list: Pointer to list of NSIDs to scan
> + *
> + * Note: There is a single async_scan_info structure shared by all instances
> + * of nvme_scan_ns_async() scanning a given controller, so the atomic
> + * operations on next_nsid are critical to ensure each instance scans a unique
> + * NSID.
> + */
> +struct async_scan_info {
> + struct nvme_ctrl *ctrl;
> + atomic_t next_nsid;
> + __le32 *ns_list;
> +};
> +
> +static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
> +{
> + struct async_scan_info *scan_info = data;
> + int idx;
> + u32 nsid;
> +
> + idx = (u32)atomic_fetch_add(1, &scan_info->next_nsid);
atomic_fetch_inc() ?
> + nsid = le32_to_cpu(scan_info->ns_list[idx]);
> +
> + nvme_scan_ns(scan_info->ctrl, nsid);
> +}
> +
On Wed, Jul 17, 2024 at 11:16:19PM +0200, Thomas Weißschuh wrote:
> On 2024-07-17 13:55:50+0000, Stuart Hayes wrote:
> > +static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
> > +{
> > + struct async_scan_info *scan_info = data;
> > + int idx;
> > + u32 nsid;
> > +
> > + idx = (u32)atomic_fetch_add(1, &scan_info->next_nsid);
>
> atomic_fetch_inc() ?
Good call. Also, that returns an int, and 'idx' is an int too, so the
(u32) cast in between is unnecessary. The highest value 'next_nsid'
could possibly reach is 1023 anyway, so int is fine.
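For reference, with both of these suggestions folded in, the helper
would end up looking roughly like this (a sketch, not necessarily the
exact code that gets applied):

static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
{
	struct async_scan_info *scan_info = data;
	int idx;
	u32 nsid;

	/* atomic_fetch_inc() returns the prior value as an int, so no cast */
	idx = atomic_fetch_inc(&scan_info->next_nsid);
	nsid = le32_to_cpu(scan_info->ns_list[idx]);

	nvme_scan_ns(scan_info->ctrl, nsid);
}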
On Wed, Jul 17, 2024 at 01:55:50PM -0500, Stuart Hayes wrote:
Looks good to me. Just one minor comment below.
> @@ -3978,11 +4008,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
> __le32 *ns_list;
> u32 prev = 0;
> int ret = 0, i;
> + ASYNC_DOMAIN(domain);
> + struct async_scan_info scan_info;
>
> ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
> if (!ns_list)
> return -ENOMEM;
>
> + scan_info.ctrl = ctrl;
> + scan_info.ns_list = ns_list;
> for (;;) {
> struct nvme_command cmd = {
> .identify.opcode = nvme_admin_identify,
> @@ -3998,19 +4032,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
> goto free;
> }
>
> + atomic_set(&scan_info.next_nsid, 0);
> for (i = 0; i < nr_entries; i++) {
> u32 nsid = le32_to_cpu(ns_list[i]);
>
> if (!nsid) /* end of the list? */
> goto out;
> - nvme_scan_ns(ctrl, nsid);
> + async_schedule_domain(nvme_scan_ns_async, &scan_info,
> + &domain);
> while (++prev < nsid)
> nvme_ns_remove_by_nsid(ctrl, prev);
> }
> + async_synchronize_full_domain(&domain);
> }
> out:
> nvme_remove_invalid_namespaces(ctrl, prev);
> free:
> + async_synchronize_full_domain(&domain);
A goto to this "free" label appears to mean the async domain has nothing
scheduled, so synchronizing won't be necessary. This should be moved up
to the "out" label, I think.
I can change that up when applying; just wanted to mention it in case
I'm missing something.
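In other words, the tail of nvme_scan_ns_list() would become something
like this (a sketch of one possible reading of the suggestion; whether
the flush lands before or after nvme_remove_invalid_namespaces() is a
detail to settle when applying):

out:
	/* wait for any in-flight scans before pruning stale namespaces */
	async_synchronize_full_domain(&domain);
	nvme_remove_invalid_namespaces(ctrl, prev);
free:
	kfree(ns_list);
	return ret;

with no synchronize call left under the "free" label, since the error
paths that jump there never scheduled any async work.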
On 7/17/2024 3:46 PM, Keith Busch wrote:
> On Wed, Jul 17, 2024 at 01:55:50PM -0500, Stuart Hayes wrote:
>
> Looks good to me. Just one minor comment below.
>
>> @@ -3978,11 +4008,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>> __le32 *ns_list;
>> u32 prev = 0;
>> int ret = 0, i;
>> + ASYNC_DOMAIN(domain);
>> + struct async_scan_info scan_info;
>>
>> ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
>> if (!ns_list)
>> return -ENOMEM;
>>
>> + scan_info.ctrl = ctrl;
>> + scan_info.ns_list = ns_list;
>> for (;;) {
>> struct nvme_command cmd = {
>> .identify.opcode = nvme_admin_identify,
>> @@ -3998,19 +4032,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>> goto free;
>> }
>>
>> + atomic_set(&scan_info.next_nsid, 0);
>> for (i = 0; i < nr_entries; i++) {
>> u32 nsid = le32_to_cpu(ns_list[i]);
>>
>> if (!nsid) /* end of the list? */
>> goto out;
>> - nvme_scan_ns(ctrl, nsid);
>> + async_schedule_domain(nvme_scan_ns_async, &scan_info,
>> + &domain);
>> while (++prev < nsid)
>> nvme_ns_remove_by_nsid(ctrl, prev);
>> }
>> + async_synchronize_full_domain(&domain);
>> }
>> out:
>> nvme_remove_invalid_namespaces(ctrl, prev);
>> free:
>> + async_synchronize_full_domain(&domain);
>
> A goto to this "free" label appears to mean the async domain has nothing
> scheduled, so synchronizing won't be necessary. This should be moved up
> to the "out" label, I think.
>
> I can change that up when applying; just wanted to mention it in case
> I'm missing something.
I agree, you aren't missing anything. Thank you very much! If you want
me to submit a new patch with this and Thomas' changes, let me know.
On Wed, Jul 17, 2024 at 09:10:23PM -0500, stuart hayes wrote:
> I agree, you aren't missing anything. Thank you very much! If you want
> me to submit a new patch with this and Thomas' changes, let me know.
No problem, I've folded in Thomas's suggestions as well, pushed to
nvme-6.11. Thanks!
On Thu, Jul 18, 2024 at 01:14:20PM -0600, Keith Busch wrote:
> On Wed, Jul 17, 2024 at 09:10:23PM -0500, stuart hayes wrote:
> > I agree, you aren't missing anything. Thank you very much! If you want
> > me to submit a new patch with this and Thomas' changes, let me know.
>
> No problem, I've folded in Thomas's suggestions as well, pushed to
> nvme-6.11. Thanks!
On further consideration, this is a bit late for 6.11 inclusion. This
one could use time in linux-next, so it's top of my stack for 6.12 once
that branch gets created.