nvme: bound the freeze drain in passthrough commands

[PATCH] nvme: bound the freeze drain in passthrough commands

Posted by Chao Shi 1 week, 5 days ago

nvme_passthru_start() drains in-flight I/O via the unbounded
nvme_wait_freeze() before submitting a command with command-set
effects (Format NVM, Sanitize, Namespace Management, vendor unique).
If a completion is silently dropped or the device hangs, the calling
task wedges with ctrl->scan_lock and ctrl->subsys->lock held, fanning
out into hung-task reports on any concurrent open/close/passthru on
the same controller:

  INFO: task syz-executor:NNNN blocked for more than 123 seconds.
   nvme_wait_freeze+0x82/0x100
   nvme_passthru_start drivers/nvme/host/core.c:1249 [inline]
   nvme_submit_user_cmd+0x1ee/0x3d0 drivers/nvme/host/ioctl.c:189

The other freeze-drain sites (pci shutdown, tcp/rdma reset) already
bound the wait with nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT).
Apply it here too; on timeout, unwind the freeze, drop scan_lock and
the subsys lock, and return -EBUSY (or complete the request with
NVME_SC_INTERNAL on the nvmet path) instead of submitting the command.

Found by FuzzNvme (Syzkaller with the FEMU fuzzing framework).

Acked-by: Sungwoo Kim <iam@sung-woo.kim>
Acked-by: Dave Tian <daveti@purdue.edu>
Acked-by: Weidong Zhu <weizhu@fiu.edu>
Signed-off-by: Chao Shi <coshi036@gmail.com>
---
 drivers/nvme/host/core.c       | 26 ++++++++++++++++++++------
 drivers/nvme/host/ioctl.c      |  7 ++++++-
 drivers/nvme/host/nvme.h       |  3 ++-
 drivers/nvme/target/passthru.c |  7 ++++++-
 4 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001..575f98b9a6cc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1232,23 +1232,37 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
 }
 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
 
-u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
+int nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode,
+			u32 *effects)
 {
-	u32 effects = nvme_command_effects(ctrl, ns, opcode);
+	*effects = nvme_command_effects(ctrl, ns, opcode);
 
 	/*
 	 * For simplicity, IO to all namespaces is quiesced even if the command
-	 * effects say only one namespace is affected.
+	 * effects say only one namespace is affected.  Bound the drain wait so
+	 * a stuck I/O cannot wedge the passthrough caller (and any task on the
+	 * scan_lock or subsys lock) indefinitely; the other in-tree callers of
+	 * the freeze drain (pci shutdown, tcp/rdma reset) already use this same
+	 * NVME_IO_TIMEOUT bound.
 	 */
-	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
+	if (*effects & NVME_CMD_EFFECTS_CSE_MASK) {
 		mutex_lock(&ctrl->scan_lock);
 		mutex_lock(&ctrl->subsys->lock);
 		nvme_mpath_start_freeze(ctrl->subsys);
 		nvme_mpath_wait_freeze(ctrl->subsys);
 		nvme_start_freeze(ctrl);
-		nvme_wait_freeze(ctrl);
+		if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+			dev_warn(ctrl->device,
+				 "I/O did not drain in %u seconds; aborting passthrough\n",
+				 nvme_io_timeout);
+			nvme_unfreeze(ctrl);
+			nvme_mpath_unfreeze(ctrl->subsys);
+			mutex_unlock(&ctrl->subsys->lock);
+			mutex_unlock(&ctrl->scan_lock);
+			return -EBUSY;
+		}
 	}
-	return effects;
+	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..762458a23b38 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -186,7 +186,12 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	bio = req->bio;
 	ctrl = nvme_req(req)->ctrl;
 
-	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+	ret = nvme_passthru_start(ctrl, ns, cmd->common.opcode, &effects);
+	if (ret) {
+		if (bio)
+			blk_rq_unmap_user(bio);
+		goto out_free_req;
+	}
 	ret = nvme_execute_rq(req, false);
 	if (result)
 		*result = le64_to_cpu(nvme_req(req)->result.u64);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a5f28c5103c..665d75de044e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1211,7 +1211,8 @@ static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {};
 
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
-u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode);
+int nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode,
+			u32 *effects);
 int nvme_execute_rq(struct request *rq, bool at_head);
 void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
 		       struct nvme_command *cmd, int status);
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 67c423a8b052..7b97bfc1ace6 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -220,7 +220,12 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 	u32 effects;
 	int status;
 
-	effects = nvme_passthru_start(ctrl, ns, req->cmd->common.opcode);
+	status = nvme_passthru_start(ctrl, ns, req->cmd->common.opcode, &effects);
+	if (status) {
+		nvmet_req_complete(req, NVME_SC_INTERNAL);
+		blk_mq_free_request(rq);
+		return;
+	}
 	status = nvme_execute_rq(rq, false);
 	if (status == NVME_SC_SUCCESS &&
 	    req->cmd->common.opcode == nvme_admin_identify) {
-- 
2.43.0

Re: [PATCH] nvme: bound the freeze drain in passthrough commands

Posted by Keith Busch 1 week, 5 days ago

On Wed, May 27, 2026 at 01:59:23AM -0400, Chao Shi wrote:
> If a completion is silently dropped or the device hangs, the calling
> task wedges with ctrl->scan_lock and ctrl->subsys->lock held, fanning
> out into hung-task reports on any concurrent open/close/passthru on
> the same controller:

The IO timeout callbacks that nvme drivers provide are supposed to
forcefully reclaim any IO no matter what state the device is in. Is that
not happening for some reason?

Re: [PATCH] nvme: bound the freeze drain in passthrough commands

Posted by Christoph Hellwig 1 week, 5 days ago

On Wed, May 27, 2026 at 01:59:23AM -0400, Chao Shi wrote:
> nvme_passthru_start() drains in-flight I/O via the unbounded
> nvme_wait_freeze() before submitting a command with command-set
> effects (Format NVM, Sanitize, Namespace Management, vendor unique).
> If a completion is silently dropped or the device hangs, the calling
> task wedges with ctrl->scan_lock and ctrl->subsys->lock held, fanning
> out into hung-task reports on any concurrent open/close/passthru on
> the same controller:
> 
>   INFO: task syz-executor:NNNN blocked for more than 123 seconds.
>    nvme_wait_freeze+0x82/0x100
>    nvme_passthru_start drivers/nvme/host/core.c:1249 [inline]
>    nvme_submit_user_cmd+0x1ee/0x3d0 drivers/nvme/host/ioctl.c:189
> 
> The other freeze-drain sites (pci shutdown, tcp/rdma reset) already
> bound the wait with nvme_wait_freeze_timeout(NVME_IO_TIMEOUT).  Apply
> it here too; on timeout, unwind the freeze and return -EBUSY (or
> NVME_SC_INTERNAL on the nvmet path) instead of submitting the command.
> 
> Found by FuzzNvme(Syzkaller with FEMU fuzzing framework).

So not blocking forever sounds useful, but this might break existing
uses.  I guess we could do it based on the O_NONBLOCK flag if people
really cared.

Note that the hung-task message itself is not a problem, but by the
time it shows up we should already have done a controller reset and
fixed up the issue.  Does that not happen for your test case?