The FC transport is offloading the connect attempt to a workqueue. When
the attempt fails the transport is starting to cleanup resources. It is
possible for user space to trigger a crash because nvmf_ctrl_options are
exposed to sysfs.
This crash wasn't observed with blktests nvme/041 until now because the
retry loop was usually trying for several times (e.g. with defaults
600s) and the test would trigger the cleanup itself. Though we the
recent change not retrying to use invalid credentials the crash can be
easily triggered.
The simplest way to control the life time of nvmf_ctrl_options is by
using ref counting.
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
drivers/nvme/host/fabrics.c | 22 +++++++++++++++++++---
drivers/nvme/host/fabrics.h | 6 +++++-
drivers/nvme/host/fc.c | 14 +++++++++-----
drivers/nvme/host/rdma.c | 18 +++++++++++++-----
drivers/nvme/host/tcp.c | 21 ++++++++++++++-------
drivers/nvme/target/loop.c | 19 +++++++++++++------
6 files changed, 73 insertions(+), 27 deletions(-)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 3499acbf6a82..888285fe2289 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1222,8 +1222,11 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
return 0;
}
-void nvmf_free_options(struct nvmf_ctrl_options *opts)
+static void nvmf_free_options(struct kref *ref)
{
+ struct nvmf_ctrl_options *opts =
+ container_of(ref, struct nvmf_ctrl_options, ref);
+
nvmf_host_put(opts->host);
key_put(opts->keyring);
key_put(opts->tls_key);
@@ -1237,7 +1240,18 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
kfree(opts->dhchap_ctrl_secret);
kfree(opts);
}
-EXPORT_SYMBOL_GPL(nvmf_free_options);
+
+int nvmf_ctrl_options_get(struct nvmf_ctrl_options *opts)
+{
+ return kref_get_unless_zero(&opts->ref);
+}
+EXPORT_SYMBOL_GPL(nvmf_ctrl_options_get);
+
+void nvmf_ctrl_options_put(struct nvmf_ctrl_options *opts)
+{
+ kref_put(&opts->ref, nvmf_free_options);
+}
+EXPORT_SYMBOL_GPL(nvmf_ctrl_options_put);
#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
@@ -1259,6 +1273,8 @@ nvmf_create_ctrl(struct device *dev, const char *buf)
if (!opts)
return ERR_PTR(-ENOMEM);
+ kref_init(&opts->ref);
+
ret = nvmf_parse_options(opts, buf);
if (ret)
goto out_free_opts;
@@ -1314,7 +1330,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf)
out_unlock:
up_read(&nvmf_transports_rwsem);
out_free_opts:
- nvmf_free_options(opts);
+ nvmf_ctrl_options_put(opts);
return ERR_PTR(ret);
}
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 06cc54851b1b..8436533aed16 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -78,6 +78,7 @@ enum {
/**
* struct nvmf_ctrl_options - Used to hold the options specified
* with the parsing opts enum.
+ * @ref: for reference count of the data structure
* @mask: Used by the fabrics library to parse through sysfs options
* on adding a NVMe controller.
* @max_reconnects: maximum number of allowed reconnect attempts before removing
@@ -117,6 +118,7 @@ enum {
* @fast_io_fail_tmo: Fast I/O fail timeout in seconds
*/
struct nvmf_ctrl_options {
+ struct kref ref;
unsigned mask;
int max_reconnects;
char *transport;
@@ -146,6 +148,9 @@ struct nvmf_ctrl_options {
int fast_io_fail_tmo;
};
+int nvmf_ctrl_options_get(struct nvmf_ctrl_options *opts);
+void nvmf_ctrl_options_put(struct nvmf_ctrl_options *opts);
+
/*
* struct nvmf_transport_ops - used to register a specific
* fabric implementation of NVMe fabrics.
@@ -228,7 +233,6 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
-void nvmf_free_options(struct nvmf_ctrl_options *opts);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b81046c9f171..ddbc5b21af5b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2406,8 +2406,7 @@ nvme_fc_ctrl_free(struct kref *ref)
nvme_fc_rport_put(ctrl->rport);
ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
- if (ctrl->ctrl.opts)
- nvmf_free_options(ctrl->ctrl.opts);
+ nvmf_ctrl_options_put(ctrl->ctrl.opts);
kfree(ctrl);
}
@@ -3474,10 +3473,15 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_fail;
}
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
idx = ida_alloc(&nvme_fc_ctrl_cnt, GFP_KERNEL);
if (idx < 0) {
ret = -ENOSPC;
- goto out_free_ctrl;
+ goto out_free_opts;
}
/*
@@ -3583,8 +3587,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
cancel_work_sync(&ctrl->ctrl.reset_work);
cancel_delayed_work_sync(&ctrl->connect_work);
- ctrl->ctrl.opts = NULL;
-
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);
@@ -3607,6 +3609,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
out_free_ida:
put_device(ctrl->dev);
ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
out_fail:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 20fdd40b1879..d3747795ad80 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -976,8 +976,8 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
list_del(&ctrl->list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl->queues);
kfree(ctrl);
}
@@ -2236,6 +2236,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
+
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
@@ -2244,7 +2250,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
if (!opts->trsvcid) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}
opts->mask |= NVMF_OPT_TRSVCID;
}
@@ -2263,13 +2269,13 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed src address passed: %s\n",
opts->host_traddr);
- goto out_free_ctrl;
+ goto out_free_opts;
}
}
if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
ret = -EALREADY;
- goto out_free_ctrl;
+ goto out_free_opts;
}
INIT_DELAYED_WORK(&ctrl->reconnect_work,
@@ -2286,7 +2292,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues)
- goto out_free_ctrl;
+ goto out_free_opts;
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
@@ -2317,6 +2323,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
return ERR_PTR(ret);
out_kfree_queues:
kfree(ctrl->queues);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
return ERR_PTR(ret);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a6d596e05602..3b20c5ed033f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2349,8 +2349,8 @@ static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
list_del(&ctrl->list);
mutex_unlock(&nvme_tcp_ctrl_mutex);
- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl->queues);
kfree(ctrl);
}
@@ -2678,6 +2678,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (!ctrl)
return ERR_PTR(-ENOMEM);
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
@@ -2695,7 +2700,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
if (!opts->trsvcid) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}
opts->mask |= NVMF_OPT_TRSVCID;
}
@@ -2705,7 +2710,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed address passed: %s:%s\n",
opts->traddr, opts->trsvcid);
- goto out_free_ctrl;
+ goto out_free_opts;
}
if (opts->mask & NVMF_OPT_HOST_TRADDR) {
@@ -2714,7 +2719,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed src address passed: %s\n",
opts->host_traddr);
- goto out_free_ctrl;
+ goto out_free_opts;
}
}
@@ -2723,20 +2728,20 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
pr_err("invalid interface passed: %s\n",
opts->host_iface);
ret = -ENODEV;
- goto out_free_ctrl;
+ goto out_free_opts;
}
}
if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
ret = -EALREADY;
- goto out_free_ctrl;
+ goto out_free_opts;
}
ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
@@ -2770,6 +2775,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
return ERR_PTR(ret);
out_kfree_queues:
kfree(ctrl->queues);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
return ERR_PTR(ret);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index e589915ddef8..de2ff7ed0657 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -283,8 +283,8 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
if (nctrl->tagset)
nvme_remove_io_tag_set(nctrl);
kfree(ctrl->queues);
- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl);
}
@@ -543,6 +543,12 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
+
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
@@ -550,10 +556,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
0 /* no quirks, we're perfect! */);
- if (ret) {
- kfree(ctrl);
- goto out;
- }
+ if (ret)
+ goto out_free_opts;
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
WARN_ON_ONCE(1);
@@ -612,7 +616,10 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
-out:
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
+out_free_ctrl:
+ kfree(ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
--
2.43.1
On 21/02/2024 15:24, Daniel Wagner wrote: > The FC transport is offloading the connect attempt to a workqueue. When > the attempt fails the transport is starting to cleanup resources. It is > possible for user space to trigger a crash because nvmf_ctrl_options are > exposed to sysfs. > > This crash wasn't observed with blktests nvme/041 until now because the > retry loop was usually trying for several times (e.g. with defaults > 600s) and the test would trigger the cleanup itself. Though we the > recent change not retrying to use invalid credentials the crash can be > easily triggered. > > The simplest way to control the life time of nvmf_ctrl_options is by > using ref counting. Why do we need a refcount for an object that has the same exact lifetime as the ctrl itself? It just feels like unneeded complication.
On Thu, Mar 07, 2024 at 12:27:43PM +0200, Sagi Grimberg wrote:
> Why do we need a refcount for an object that has the same exact lifetime
> as the ctrl itself? It just feels like unneeded complication.
My claim the UAF is also possible with the current code is not correct.
Or at least not easy to reproduce. I've re-tested a lot and I couldn't
reproduce it.
Though, the UAF is very simple to reproduce with the sync connect patch
applied (nvme-fc: wait for initial connect attempt to finish) together
with Hannes' patch (nvme: authentication error are always
non-retryable):
In this case, the initial connect fails and the resources are removed,
while we are waiting in
+ if (!opts->connect_async) {
+ enum nvme_ctrl_state state;
+
+ wait_for_completion(&ctrl->connect_completion);
+ state = nvme_ctrl_state(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
+
+ if (state != NVME_CTRL_LIVE) {
+ /* Cleanup is handled by the connect state machine */
+ return ERR_PTR(-EIO);
+ }
+ }
This opens up the race window. While we are waiting here for the
completion, the ctrl entry in sysfs is still reachable. Unfortunately,
we also fire an uevent which starts another instance of nvme-cli. And
the new instance of nvme-cli iterates over sysfs and reads the already
freed options object.
run blktests nvme/041 at 2024-03-11 18:13:38
nvmet: adding nsid 1 to subsystem blktests-subsystem-1
nvme nvme0: NVME-FC{0}: create association : host wwpn 0x20001100aa000002 rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
(NULL device *): {0:0} Association created
[8167] nvmet: ctrl 1 start keep-alive timer for 5 secs
[8167] nvmet: check nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349
[8167] nvmet: nvmet_setup_dhgroup: ctrl 1 selecting dhgroup 0
[8167] nvmet: nvmet_setup_auth: using hash none key fb 28 d3 79 af 04 ba 36 95 3b e5 89 6c bf 42 90 4a dd dd 1b d4 e8 ba ce b2 7c 16 d4 01 7d 4f 20
nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349 with DH-HMAC-CHAP.
nvme nvme0: qid 0: no key
nvme nvme0: qid 0: authentication setup failed
nvme nvme0: NVME-FC{0}: create_assoc failed, assoc_id f2139b60a42c0000 ret 16785
nvme nvme0: NVME-FC{0}: reset: Reconnect attempt failed (16785)
nvme nvme0: NVME-FC{0}: reconnect failure
nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
==================================================================
BUG: KASAN: slab-use-after-free in nvme_class_uevent+0xb9/0x1a0 [nvme_core]
Read of size 8 at addr ffff888107229698 by task systemd-journal/578
CPU: 1 PID: 578 Comm: systemd-journal Tainted: G W 6.8.0-rc6+ #43 106200e85ab1e5c3399a68beb80cc63ca4823f3a
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022
Call Trace:
<TASK>
dump_stack_lvl+0x5b/0x80
print_report+0x163/0x800
? __virt_addr_valid+0x2f3/0x340
? nvme_class_uevent+0xb9/0x1a0 [nvme_core a5a8fc3d48e3ec2a76ff6521d70aebe532cfd700]
kasan_report+0xd0/0x110
? nvme_class_uevent+0xb9/0x1a0 [nvme_core a5a8fc3d48e3ec2a76ff6521d70aebe532cfd700]
nvme_class_uevent+0xb9/0x1a0 [nvme_core a5a8fc3d48e3ec2a76ff6521d70aebe532cfd700]
dev_uevent+0x374/0x640
uevent_show+0x187/0x2a0
dev_attr_show+0x5f/0xb0
sysfs_kf_seq_show+0x2a8/0x3f0
? __cfi_dev_attr_show+0x10/0x10
seq_read_iter+0x3f1/0xc00
vfs_read+0x6cf/0x960
ksys_read+0xd7/0x1a0
do_syscall_64+0xb1/0x180
? do_syscall_64+0xc0/0x180
entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x7f4297b0a3dc
Code: ec 28 48 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 97 18 f8 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 34 44 89 c7 48 89 44 24 08 e8 fd
RSP: 002b:00007ffd945ec430 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000564732197e60 RCX: 00007f4297b0a3dc
RDX: 0000000000001008 RSI: 0000564732197e60 RDI: 000000000000001a
RBP: 000000000000001a R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000004 R11: 0000000000000246 R12: 0000000000001007
R13: 0000000000001008 R14: ffffffffffffffff R15: 0000000000000002
</TASK>
Allocated by task 31249 on cpu 0 at 5508.645525s:
kasan_save_track+0x2c/0x90
__kasan_kmalloc+0x89/0xa0
kmalloc_trace+0x1f3/0x3c0
nvmf_dev_write+0x15c/0x2990 [nvme_fabrics]
vfs_write+0x1cd/0xb60
ksys_write+0xd7/0x1a0
do_syscall_64+0xb1/0x180
entry_SYSCALL_64_after_hwframe+0x6e/0x76
Freed by task 31249 on cpu 2 at 5508.686805s:
kasan_save_track+0x2c/0x90
kasan_save_free_info+0x4a/0x60
poison_slab_object+0x108/0x180
__kasan_slab_free+0x33/0x80
kfree+0x119/0x310
nvmf_dev_write+0x23e0/0x2990 [nvme_fabrics]
vfs_write+0x1cd/0xb60
ksys_write+0xd7/0x1a0
do_syscall_64+0xb1/0x180
entry_SYSCALL_64_after_hwframe+0x6e/0x76
The buggy address belongs to the object at ffff888107229680
which belongs to the cache kmalloc-192 of size 192
The buggy address is located 24 bytes inside of
freed 192-byte region [ffff888107229680, ffff888107229740)
The buggy address belongs to the physical page:
page:0000000070cf556f refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x107228
head:0000000070cf556f order:1 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0x17ffffc0000840(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
page_type: 0xffffffff()
raw: 0017ffffc0000840 ffff888100042c00 ffffea0004363500 dead000000000004
raw: 0000000000000000 00000000001c001c 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff888107229580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc
ffff888107229600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff888107229680: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff888107229700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
ffff888107229780: fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb fb
==================================================================
On 3/11/24 18:36, Daniel Wagner wrote:
> On Thu, Mar 07, 2024 at 12:27:43PM +0200, Sagi Grimberg wrote:
>> Why do we need a refcount for an object that has the same exact lifetime
>> as the ctrl itself? It just feels like unneeded complication.
>
> My claim the UAF is also possible with the current code is not correct.
> Or at least not easy to reproduce. I've re-tested a lot and I couldn't
> reproduce it.
>
> Though, the UAF is very simple to reproduce with the sync connect patch
> applied (nvme-fc: wait for initial connect attempt to finish) together
> with Hannes' patch (nvme: authentication error are always
> non-retryable):
>
> In this case, the initial connect fails and the resources are removed,
> while we are waiting in
>
> + if (!opts->connect_async) {
> + enum nvme_ctrl_state state;
> +
> + wait_for_completion(&ctrl->connect_completion);
> + state = nvme_ctrl_state(&ctrl->ctrl);
> + nvme_fc_ctrl_put(ctrl);
> +
> + if (state != NVME_CTRL_LIVE) {
> + /* Cleanup is handled by the connect state machine */
> + return ERR_PTR(-EIO);
> + }
> + }
>
> This opens up the race window. While we are waiting here for the
> completion, the ctrl entry in sysfs is still reachable. Unfortunately,
> we also fire an uevent which starts another instance of nvme-cli. And
> the new instance of nvme-cli iterates over sysfs and reads the already
> freed options object.
>
Curiously enough, I had been digging into better error reporting for
nvme-fabrics. And the one thing I came up with is to make the controller
_options_ as a private pointer to seq_file.
With that we can allocate and initialize the options during open(),
and then have write() do the parsing and calling create_ctrl() as usual.
But read() would then always have access to the option structure, and
we can use this structure to pass any errors. EG parsing errors could
be reported by an 'err_mask' field and so on.
That would allow us to report errors back to nvme-cli, and,
incidentally, also require reference counting.
Two stones with a bird and all that.
Patch is in testing, and I'll be posting it once I get confirmation.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On 2/21/24 14:24, Daniel Wagner wrote: > The FC transport is offloading the connect attempt to a workqueue. When > the attempt fails the transport is starting to cleanup resources. It is > possible for user space to trigger a crash because nvmf_ctrl_options are > exposed to sysfs. > > This crash wasn't observed with blktests nvme/041 until now because the > retry loop was usually trying for several times (e.g. with defaults > 600s) and the test would trigger the cleanup itself. Though we the > recent change not retrying to use invalid credentials the crash can be > easily triggered. > > The simplest way to control the life time of nvmf_ctrl_options is by > using ref counting. > > Signed-off-by: Daniel Wagner <dwagner@suse.de> > --- > drivers/nvme/host/fabrics.c | 22 +++++++++++++++++++--- > drivers/nvme/host/fabrics.h | 6 +++++- > drivers/nvme/host/fc.c | 14 +++++++++----- > drivers/nvme/host/rdma.c | 18 +++++++++++++----- > drivers/nvme/host/tcp.c | 21 ++++++++++++++------- > drivers/nvme/target/loop.c | 19 +++++++++++++------ > 6 files changed, 73 insertions(+), 27 deletions(-) > Reviewed-by: Hannes Reinecke <hare@suse.de> Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
© 2016 - 2026 Red Hat, Inc.