[PATCH v3] nvme: reject keep-alive passthrough on non-fabrics

Chao Shi posted 1 patch 2 days, 4 hours ago
There is a newer version of this series
drivers/nvme/host/ioctl.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
[PATCH v3] nvme: reject keep-alive passthrough on non-fabrics
Posted by Chao Shi 2 days, 4 hours ago
Since commit b58da2d270db ("nvme: update keep alive interval when kato
is modified"), userspace can start keep-alive on any transport via a
Set Features (KATO) passthrough command. nvme_keep_alive_work() then
allocates with BLK_MQ_REQ_RESERVED, but nvme_alloc_admin_tag_set()
only reserves admin tags for fabrics, so the allocation trips
WARN_ON_ONCE() in blk_mq_get_tag() and fails:

  nvme nvme0: keep-alive failed: -11

Keep Alive is optional on PCIe (NVMe 2.0a section 5.27.1.12) and the
driver only arms keep-alive for fabrics; on other transports no admin
tag is reserved for it, and an active keep-alive command only harms
idle power states.

Reject Set Features commands the driver is not prepared to handle from
userspace passthrough, starting with KATO on non-fabrics. The check
can be extended to other problematic features as they are identified.

This guards the userspace passthrough paths (ioctl and io_uring); the
nvmet target passthru path is out of scope and is not changed here.

Link: https://lore.kernel.org/linux-nvme/20260515071248.2689513-1-coshi036@gmail.com/

Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")

Found by FuzzNvme (Syzkaller with FEMU fuzzing framework).

Acked-by: Sungwoo Kim <iam@sung-woo.kim>
Acked-by: Dave Tian <daveti@purdue.edu>
Acked-by: Weidong Zhu <weizhu@fiu.edu>
Signed-off-by: Chao Shi <coshi036@gmail.com>
---

Reproducer (run as root on a PCIe NVMe device):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    /*
     * Send a Set Features (Feature ID 0x0f, KATO) admin passthrough command
     * with a 5 second timeout value to /dev/nvme0.
     *
     * Returns 1 if open() or ioctl() reports a negative value (after printing
     * the error with perror()), 0 otherwise.
     */
    int main(void)
    {
            /* All fields not named here are zero-initialized. */
            struct nvme_admin_cmd cmd = {
                    .opcode = 0x09, /* SET_FEATURES */
                    .cdw10  = 0x0f, /* Feature ID: KATO */
                    .cdw11  = 5,    /* KATO = 5 seconds */
            };
            int fd;

            fd = open("/dev/nvme0", O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
                    perror("ioctl");
                    return 1;
            }

            return 0;
    }

On an unpatched kernel, within ~kato/2 seconds after the program exits,
dmesg shows:

    nvme nvme0: keep alive interval updated from 0 ms to 5000 ms
    WARNING: CPU: 0 PID: ... at block/blk-mq-tag.c:148 blk_mq_get_tag+...
    nvme nvme0: keep-alive failed: -11

With this patch the ioctl fails with EOPNOTSUPP on non-fabrics and
keep-alive is never started.

Changes since v2:
- Reject the KATO Set Features passthrough on non-fabrics instead of
  reserving an admin tag for all transports (Keith Busch, Christoph
  Hellwig). PCIe does not need keep-alive, and an active keep-alive
  command only harms idle power states.
- Implement as an extensible passthrough filter for Set Features
  commands the driver cannot handle.
- Drop the core.c reserved_tags change.

Changes since v1:
- v2 added a spec citation and a quirk discussion; both are superseded
  by the filter approach above.

 drivers/nvme/host/ioctl.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..7705d9408396 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -86,6 +86,33 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
 	return capable(CAP_SYS_ADMIN);
 }
 
+/*
+ * Some Set Features commands change controller behaviour that the driver is
+ * not prepared to handle on every transport.  Reject such commands from
+ * userspace passthrough rather than letting them put the controller into a
+ * state the driver cannot deal with.  The list can be extended as other
+ * problematic features are identified.
+ */
+static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
+				      struct nvme_command *c)
+{
+	if (c->common.opcode != nvme_admin_set_features)
+		return true;
+
+	switch (le32_to_cpu(c->common.cdw10) & 0xff) {
+	case NVME_FEAT_KATO:
+		/*
+		 * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
+		 * driver only arms keep-alive for fabrics.  Enabling it on
+		 * other transports starts a keep-alive command the driver is
+		 * not set up for and harms idle power states, so reject it.
+		 */
+		return ctrl->ops->flags & NVME_F_FABRICS;
+	default:
+		return true;
+	}
+}
+
 /*
  * Convert integer values from ioctl structures to user pointers, silently
  * ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
 		return -EACCES;
 
+	if (!nvme_passthru_cmd_allowed(ctrl, &c))
+		return -EOPNOTSUPP;
+
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
@@ -358,6 +388,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
 		return -EACCES;
 
+	if (!nvme_passthru_cmd_allowed(ctrl, &c))
+		return -EOPNOTSUPP;
+
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
@@ -475,6 +508,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
 		return -EACCES;
 
+	if (!nvme_passthru_cmd_allowed(ctrl, &c))
+		return -EOPNOTSUPP;
+
 	d.metadata = READ_ONCE(cmd->metadata);
 	d.addr = READ_ONCE(cmd->addr);
 	d.data_len = READ_ONCE(cmd->data_len);
-- 
2.43.0
Re: [PATCH v3] nvme: reject keep-alive passthrough on non-fabrics
Posted by Keith Busch 2 days, 3 hours ago
On Fri, May 22, 2026 at 11:28:07AM -0400, Chao Shi wrote:
> +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> +				      struct nvme_command *c)
> +{
> +	if (c->common.opcode != nvme_admin_set_features)
> +		return true;
> +
> +	switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> +	case NVME_FEAT_KATO:
> +		/*
> +		 * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> +		 * driver only arms keep-alive for fabrics.  Enabling it on
> +		 * other transports starts a keep-alive command the driver is
> +		 * not set up for and harms idle power states, so reject it.
> +		 */
> +		return ctrl->ops->flags & NVME_F_FABRICS;
> +	default:
> +		return true;
> +	}
> +}
> +
>  /*
>   * Convert integer values from ioctl structures to user pointers, silently
>   * ignoring the upper bits in the compat case to match behaviour of 32-bit
> @@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>  	if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
>  		return -EACCES;
>  
> +	if (!nvme_passthru_cmd_allowed(ctrl, &c))
> +		return -EOPNOTSUPP;

You have to check if it's an admin command first. This is going to break
the "Data Set Management" IO command.
Re: [PATCH v3] nvme: reject keep-alive passthrough on non-fabrics
Posted by Chao S 2 days, 3 hours ago
Hi Keith,

Thanks! Changed in v4.

Chao

On Fri, May 22, 2026 at 11:46 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, May 22, 2026 at 11:28:07AM -0400, Chao Shi wrote:
> > +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> > +                                   struct nvme_command *c)
> > +{
> > +     if (c->common.opcode != nvme_admin_set_features)
> > +             return true;
> > +
> > +     switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> > +     case NVME_FEAT_KATO:
> > +             /*
> > +              * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> > +              * driver only arms keep-alive for fabrics.  Enabling it on
> > +              * other transports starts a keep-alive command the driver is
> > +              * not set up for and harms idle power states, so reject it.
> > +              */
> > +             return ctrl->ops->flags & NVME_F_FABRICS;
> > +     default:
> > +             return true;
> > +     }
> > +}
> > +
> >  /*
> >   * Convert integer values from ioctl structures to user pointers, silently
> >   * ignoring the upper bits in the compat case to match behaviour of 32-bit
> > @@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> >       if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
> >               return -EACCES;
> >
> > +     if (!nvme_passthru_cmd_allowed(ctrl, &c))
> > +             return -EOPNOTSUPP;
>
> You have to check if it's an admin command first. This is going to break
> the "Data Set Management" IO command.