drivers/nvme/host/ioctl.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+)
Since commit b58da2d270db ("nvme: update keep alive interval when kato
is modified"), userspace can start keep-alive on any transport via a
Set Features (KATO) passthrough command. nvme_keep_alive_work() then
allocates with BLK_MQ_REQ_RESERVED, but nvme_alloc_admin_tag_set()
only reserves admin tags for fabrics, so the allocation trips
WARN_ON_ONCE() in blk_mq_get_tag() and fails:
nvme nvme0: keep-alive failed: -11
Keep Alive is optional on PCIe (NVMe 2.0a section 5.27.1.12) and the
driver only arms keep-alive for fabrics. On other transports no admin
tag is reserved for the keep-alive request, and an active keep-alive
command only harms idle power states.
Reject Set Features commands the driver is not prepared to handle from
userspace passthrough, starting with KATO on non-fabrics. The check
can be extended to other problematic features as they are identified.
This guards the userspace passthrough paths (ioctl and io_uring); the
nvmet target passthru path is out of scope and is not changed here.
Link: https://lore.kernel.org/linux-nvme/20260515071248.2689513-1-coshi036@gmail.com/
Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")
Found by FuzzNvme (Syzkaller with the FEMU fuzzing framework).
Acked-by: Sungwoo Kim <iam@sung-woo.kim>
Acked-by: Dave Tian <daveti@purdue.edu>
Acked-by: Weidong Zhu <weizhu@fiu.edu>
Signed-off-by: Chao Shi <coshi036@gmail.com>
---
Reproducer (run as root on a PCIe NVMe device):
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>
/*
 * Reproducer: send a Set Features (KATO) admin command to /dev/nvme0 through
 * the NVME_IOCTL_ADMIN_CMD passthrough ioctl.
 *
 * Returns 1 if open() fails or ioctl() returns a negative value, 0 otherwise.
 */
int main(void)
{
	struct nvme_admin_cmd cmd = {0};
	int ret = 0;
	int fd = open("/dev/nvme0", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	cmd.opcode = 0x09;	/* SET_FEATURES */
	cmd.cdw10 = 0x0f;	/* Feature ID: KATO */
	cmd.cdw11 = 5;		/* KATO = 5 seconds */

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
		perror("ioctl");
		ret = 1;
	}

	/* Close the descriptor on both paths rather than leaking it. */
	close(fd);
	return ret;
}
On an unpatched kernel, within ~kato/2 seconds after the program exits,
dmesg shows:
nvme nvme0: keep alive interval updated from 0 ms to 5000 ms
WARNING: CPU: 0 PID: ... at block/blk-mq-tag.c:148 blk_mq_get_tag+...
nvme nvme0: keep-alive failed: -11
With this patch the ioctl fails with EOPNOTSUPP on non-fabrics and
keep-alive is never started.
Changes since v2:
- Reject the KATO Set Features passthrough on non-fabrics instead of
reserving an admin tag for all transports (Keith Busch, Christoph
Hellwig). PCIe does not need keep-alive, and an active keep-alive
command only harms idle power states.
- Implement as an extensible passthrough filter for Set Features
commands the driver cannot handle.
- Drop the core.c reserved_tags change.
Changes since v1:
- v2 added a spec citation and a quirk discussion; both are superseded
by the filter approach above.
drivers/nvme/host/ioctl.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..7705d9408396 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -86,6 +86,33 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
return capable(CAP_SYS_ADMIN);
}
+/*
+ * Some Set Features commands change controller behaviour that the driver is
+ * not prepared to handle on every transport. Reject such commands from
+ * userspace passthrough rather than letting them put the controller into a
+ * state the driver cannot deal with. The list can be extended as other
+ * problematic features are identified.
+ */
+static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
+ struct nvme_command *c)
+{
+ if (c->common.opcode != nvme_admin_set_features)
+ return true;
+
+ switch (le32_to_cpu(c->common.cdw10) & 0xff) {
+ case NVME_FEAT_KATO:
+ /*
+ * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
+ * driver only arms keep-alive for fabrics. Enabling it on
+ * other transports starts a keep-alive command the driver is
+ * not set up for and harms idle power states, so reject it.
+ */
+ return ctrl->ops->flags & NVME_F_FABRICS;
+ default:
+ return true;
+ }
+}
+
/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, &c))
+ return -EOPNOTSUPP;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -358,6 +388,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, &c))
+ return -EOPNOTSUPP;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -475,6 +508,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, &c))
+ return -EOPNOTSUPP;
+
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
--
2.43.0
On Fri, May 22, 2026 at 11:28:07AM -0400, Chao Shi wrote:
> +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> + struct nvme_command *c)
> +{
> + if (c->common.opcode != nvme_admin_set_features)
> + return true;
> +
> + switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> + case NVME_FEAT_KATO:
> + /*
> + * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> + * driver only arms keep-alive for fabrics. Enabling it on
> + * other transports starts a keep-alive command the driver is
> + * not set up for and harms idle power states, so reject it.
> + */
> + return ctrl->ops->flags & NVME_F_FABRICS;
> + default:
> + return true;
> + }
> +}
> +
> /*
> * Convert integer values from ioctl structures to user pointers, silently
> * ignoring the upper bits in the compat case to match behaviour of 32-bit
> @@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
> return -EACCES;
>
> + if (!nvme_passthru_cmd_allowed(ctrl, &c))
> + return -EOPNOTSUPP;
You have to check if it's an admin command first. This is going to break
the "Data Set Management" IO command.
Hi Keith,
Thanks! Changed in v4.
Chao
On Fri, May 22, 2026 at 11:46 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, May 22, 2026 at 11:28:07AM -0400, Chao Shi wrote:
> > +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> > + struct nvme_command *c)
> > +{
> > + if (c->common.opcode != nvme_admin_set_features)
> > + return true;
> > +
> > + switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> > + case NVME_FEAT_KATO:
> > + /*
> > + * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> > + * driver only arms keep-alive for fabrics. Enabling it on
> > + * other transports starts a keep-alive command the driver is
> > + * not set up for and harms idle power states, so reject it.
> > + */
> > + return ctrl->ops->flags & NVME_F_FABRICS;
> > + default:
> > + return true;
> > + }
> > +}
> > +
> > /*
> > * Convert integer values from ioctl structures to user pointers, silently
> > * ignoring the upper bits in the compat case to match behaviour of 32-bit
> > @@ -311,6 +338,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
> > return -EACCES;
> >
> > + if (!nvme_passthru_cmd_allowed(ctrl, &c))
> > + return -EOPNOTSUPP;
>
> You have to check if it's an admin command first. This is going to break
> the "Data Set Management" IO command.
© 2016 - 2026 Red Hat, Inc.