[PATCH v5] nvme: reject passthrough of driver-managed Set Features

Chao Shi posted 1 patch 18 hours ago
drivers/nvme/host/ioctl.c | 33 +++++++++++++++++++++++++++------
1 file changed, 27 insertions(+), 6 deletions(-)
[PATCH v5] nvme: reject passthrough of driver-managed Set Features
Posted by Chao Shi 18 hours ago
Since commit b58da2d270db ("nvme: update keep alive interval when kato
is modified"), userspace can start keep-alive on any transport via a
Set Features (KATO) passthrough command. nvme_keep_alive_work() then
allocates with BLK_MQ_REQ_RESERVED, but nvme_alloc_admin_tag_set()
only reserves admin tags for fabrics, so the allocation trips
WARN_ON_ONCE() in blk_mq_get_tag() and fails:

  nvme nvme0: keep-alive failed: -11

More generally, several Set Features change controller state that the
driver manages itself and cannot react to correctly when set behind
its back from userspace. Reject these in nvme_cmd_allowed():

  - KATO on non-fabrics (keep-alive is only armed for fabrics; on PCIe
    it has no reserved tag and an active keep-alive harms idle power
    states)
  - Host Behavior Support, Host Memory Buffer, Number of Queues, and
    Autonomous Power State Transition (all driver-managed)

Keep Alive on fabrics is unchanged. I/O commands are unaffected as the
check is confined to the admin path (ns == NULL).

Link: https://lore.kernel.org/linux-nvme/20260522162639.395802-1-coshi036@gmail.com/

Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")

Found by FuzzNvme(Syzkaller with FEMU fuzzing framework).

Acked-by: Sungwoo Kim <iam@sung-woo.kim>
Acked-by: Dave Tian <daveti@purdue.edu>
Acked-by: Weidong Zhu <weizhu@fiu.edu>
Signed-off-by: Chao Shi <coshi036@gmail.com>
---

Reproducer for the keep-alive case (run as root on a PCIe NVMe device):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    int main(void)
    {
            struct nvme_admin_cmd cmd = {0};
            int fd = open("/dev/nvme0", O_RDWR);
            if (fd < 0) { perror("open"); return 1; }
            cmd.opcode = 0x09;       /* SET_FEATURES */
            cmd.cdw10  = 0x0f;       /* Feature ID: KATO */
            cmd.cdw11  = 5;          /* KATO = 5 seconds */
            if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
                    perror("ioctl");
                    return 1;
            }
            return 0;
    }

On an unpatched kernel, within ~kato/2 seconds after the program exits,
dmesg shows:

    nvme nvme0: keep alive interval updated from 0 ms to 5000 ms
    WARNING: CPU: 0 PID: ... at block/blk-mq-tag.c:148 blk_mq_get_tag+...
    nvme nvme0: keep-alive failed: -11

With this patch the ioctl fails with EACCES on non-fabrics.

Changes since v4:
- Fold the check into the existing nvme_cmd_allowed() instead of a
  separate helper, and reject additional driver-managed Set Features
  (Host Behavior, Host Memory Buffer, Number of Queues, Autonomous
  Power State Transition) in the same switch (Keith Busch). The admin
  vs I/O distinction is now structural: the switch lives in the
  ns == NULL branch, so I/O commands (e.g. Dataset Management, which
  shares opcode 0x09 with Set Features) are never inspected.

Changes since v3:
- Only inspect admin commands so a DSM I/O command is not wrongly
  rejected (Keith Busch).

Changes since v2:
- Reject the KATO passthrough on non-fabrics instead of reserving an
  admin tag for all transports (Keith Busch, Christoph Hellwig).

Changes since v1:
- v2 added a spec citation and quirk discussion, superseded by the
  reject approach.

 drivers/nvme/host/ioctl.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..31784506e845 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -14,8 +14,9 @@ enum {
 	NVME_IOCTL_PARTITION	= (1 << 1),
 };
 
-static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
-		unsigned int flags, bool open_for_write)
+static bool nvme_cmd_allowed(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			     struct nvme_command *c, unsigned int flags,
+			     bool open_for_write)
 {
 	u32 effects;
 
@@ -50,6 +51,26 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
 			case NVME_ID_CNS_CTRL:
 				return true;
 			}
+		} else if (c->common.opcode == nvme_admin_set_features) {
+			/*
+			 * Reject Set Features that change controller state the
+			 * driver manages itself; setting them behind the driver's
+			 * back from userspace leaves it unable to react correctly.
+			 * Keep Alive is only armed for fabrics - on other
+			 * transports it has no reserved tag and harms idle power
+			 * states.
+			 */
+			switch (le32_to_cpu(c->features.fid) & 0xff) {
+			case NVME_FEAT_KATO:
+				if (ctrl->ops->flags & NVME_F_FABRICS)
+					break;
+				fallthrough;
+			case NVME_FEAT_HOST_BEHAVIOR:
+			case NVME_FEAT_HOST_MEM_BUF:
+			case NVME_FEAT_NUM_QUEUES:
+			case NVME_FEAT_AUTO_PST:
+				return false;
+			}
 		}
 		goto admin;
 	}
@@ -59,7 +80,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
 	 * and marks this command as supported.  If not reject unprivileged
 	 * passthrough.
 	 */
-	effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
+	effects = nvme_command_effects(ctrl, ns, c->common.opcode);
 	if (!(effects & NVME_CMD_EFFECTS_CSUPP))
 		goto admin;
 
@@ -308,7 +329,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
 	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
-	if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
+	if (!nvme_cmd_allowed(ctrl, ns, &c, 0, open_for_write))
 		return -EACCES;
 
 	if (cmd.timeout_ms)
@@ -355,7 +376,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
 	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
-	if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
+	if (!nvme_cmd_allowed(ctrl, ns, &c, flags, open_for_write))
 		return -EACCES;
 
 	if (cmd.timeout_ms)
@@ -472,7 +493,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
 	c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
 
-	if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
+	if (!nvme_cmd_allowed(ctrl, ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
 		return -EACCES;
 
 	d.metadata = READ_ONCE(cmd->metadata);
-- 
2.43.0