drivers/nvme/host/ioctl.c | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+)
Since commit b58da2d270db ("nvme: update keep alive interval when kato
is modified"), userspace can start keep-alive on any transport via a
Set Features (KATO) passthrough command. nvme_keep_alive_work() then
allocates with BLK_MQ_REQ_RESERVED, but nvme_alloc_admin_tag_set()
only reserves admin tags for fabrics, so the allocation trips
WARN_ON_ONCE() in blk_mq_get_tag() and fails:
nvme nvme0: keep-alive failed: -11
Keep Alive is optional on PCIe (NVMe 2.0a section 5.27.1.12) and the
driver only arms keep-alive for fabrics; enabling it on any other
transport leaves the keep-alive request without a reserved tag, and an
active keep-alive command only harms idle power states.
Reject Set Features commands the driver is not prepared to handle from
userspace passthrough, starting with KATO on non-fabrics. The check
can be extended to other problematic features as they are identified.
This guards the userspace passthrough paths (ioctl and io_uring); the
nvmet target passthru path is out of scope and is not changed here.
Link: https://lore.kernel.org/linux-nvme/20260515071248.2689513-1-coshi036@gmail.com/
Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")
Found by FuzzNvme (Syzkaller with FEMU fuzzing framework).
Acked-by: Sungwoo Kim <iam@sung-woo.kim>
Acked-by: Dave Tian <daveti@purdue.edu>
Acked-by: Weidong Zhu <weizhu@fiu.edu>
Signed-off-by: Chao Shi <coshi036@gmail.com>
---
Reproducer (run as root on a PCIe NVMe device):
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>
/*
 * Reproducer: send Set Features (KATO = 5 seconds) to /dev/nvme0 through
 * the admin passthrough ioctl.
 *
 * Exit status is 0 only when the command completed successfully.  It is 1
 * when the device cannot be opened, when the ioctl fails with an errno
 * (e.g. EOPNOTSUPP on a patched kernel), or when the controller completes
 * the command with a non-zero NVMe status.
 */
int main(void)
{
	struct nvme_admin_cmd cmd = {0};
	int fd = open("/dev/nvme0", O_RDWR);
	int ret;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	cmd.opcode = 0x09;	/* SET_FEATURES */
	cmd.cdw10 = 0x0f;	/* Feature ID: KATO */
	cmd.cdw11 = 5;		/* KATO = 5 seconds */

	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	if (ret < 0) {
		perror("ioctl");
		return 1;
	}
	if (ret > 0) {
		/*
		 * A positive return value is the NVMe status reported by the
		 * controller: the feature was not set, so do not report this
		 * run as a successful reproduction.
		 */
		fprintf(stderr, "ioctl: NVMe status 0x%x\n", ret);
		return 1;
	}
	return 0;
}
On an unpatched kernel, within ~kato/2 seconds after the program exits,
dmesg shows:
nvme nvme0: keep alive interval updated from 0 ms to 5000 ms
WARNING: CPU: 0 PID: ... at block/blk-mq-tag.c:148 blk_mq_get_tag+...
nvme nvme0: keep-alive failed: -11
With this patch the ioctl fails with EOPNOTSUPP on non-fabrics and
keep-alive is never started.
Changes since v3:
- Only inspect admin commands (ns == NULL). I/O commands share the
opcode space with admin commands (Dataset Management is 0x09, same
as Set Features), so the previous version could wrongly reject a DSM
I/O command. Pass ns to the helper and bail out for I/O (Keith Busch).
Changes since v2:
- Reject the KATO Set Features passthrough on non-fabrics instead of
reserving an admin tag for all transports (Keith Busch, Christoph
Hellwig). PCIe does not need keep-alive, and an active keep-alive
command only harms idle power states.
- Implement as an extensible passthrough filter for Set Features
commands the driver cannot handle.
- Drop the core.c reserved_tags change.
Changes since v1:
- v2 added a spec citation and a quirk discussion; both are superseded
by the filter approach above.
drivers/nvme/host/ioctl.c | 42 +++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..33caa3ae79e5 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -86,6 +86,39 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
return capable(CAP_SYS_ADMIN);
}
+/*
+ * Some Set Features commands change controller behaviour that the driver is
+ * not prepared to handle on every transport. Reject such commands from
+ * userspace passthrough rather than letting them put the controller into a
+ * state the driver cannot deal with. The list can be extended as other
+ * problematic features are identified.
+ */
+static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
+ struct nvme_ns *ns,
+ struct nvme_command *c)
+{
+ /*
+ * This only filters admin commands (ns == NULL). I/O commands share
+ * the opcode space with admin commands - Dataset Management is 0x09,
+ * the same value as Set Features - so they must not be inspected here.
+ */
+ if (ns || c->common.opcode != nvme_admin_set_features)
+ return true;
+
+ switch (le32_to_cpu(c->common.cdw10) & 0xff) {
+ case NVME_FEAT_KATO:
+ /*
+ * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
+ * driver only arms keep-alive for fabrics. Enabling it on
+ * other transports starts a keep-alive command the driver is
+ * not set up for and harms idle power states, so reject it.
+ */
+ return ctrl->ops->flags & NVME_F_FABRICS;
+ default:
+ return true;
+ }
+}
+
/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -311,6 +344,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, ns, &c))
+ return -EOPNOTSUPP;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -358,6 +394,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, ns, &c))
+ return -EOPNOTSUPP;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -475,6 +514,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
return -EACCES;
+ if (!nvme_passthru_cmd_allowed(ctrl, ns, &c))
+ return -EOPNOTSUPP;
+
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
--
2.43.0
On Fri, May 22, 2026 at 12:26:39PM -0400, Chao Shi wrote:
> +/*
> + * Some Set Features commands change controller behaviour that the driver is
> + * not prepared to handle on every transport. Reject such commands from
> + * userspace passthrough rather than letting them put the controller into a
> + * state the driver cannot deal with. The list can be extended as other
> + * problematic features are identified.
> + */
> +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> + struct nvme_ns *ns,
> + struct nvme_command *c)
> +{
> + /*
> + * This only filters admin commands (ns == NULL). I/O commands share
> + * the opcode space with admin commands - Dataset Management is 0x09,
> + * the same value as Set Features - so they must not be inspected here.
> + */
> + if (ns || c->common.opcode != nvme_admin_set_features)
> + return true;
> +
> + switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> + case NVME_FEAT_KATO:
> + /*
> + * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> + * driver only arms keep-alive for fabrics. Enabling it on
> + * other transports starts a keep-alive command the driver is
> + * not set up for and harms idle power states, so reject it.
> + */
> + return ctrl->ops->flags & NVME_F_FABRICS;
> + default:
> + return true;
> + }
> +}
This doesn't need to be its own function. You can add these checks to
the existing nvme_cmd_allowed():
---
@@ -50,6 +53,18 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
case NVME_ID_CNS_CTRL:
return true;
}
+ } else if (c->common.opcode == nvme_admin_set_features) {
+ switch (cpu_to_le32(c->features.fid) & 0xff) {
+ case NVME_FEAT_KATO:
+ if (ctrl->ops->flags & NVME_F_FABRICS)
+ break;
+ fallthrough;
+ case NVME_FEAT_HOST_BEHAVIOR:
+ case NVME_FEAT_HOST_MEM_BUF:
+ case NVME_FEAT_NUM_QUEUES:
+ case NVME_FEAT_AUTO_PST:
+ return false;
+ }
}
goto admin;
}
--
Hi Keith,
Thanks for the feedback, I understand your idea now. Patch v5 will
incorporate the advice.
Thanks again!
Chao
On Fri, May 22, 2026 at 3:32 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, May 22, 2026 at 12:26:39PM -0400, Chao Shi wrote:
> > +/*
> > + * Some Set Features commands change controller behaviour that the driver is
> > + * not prepared to handle on every transport. Reject such commands from
> > + * userspace passthrough rather than letting them put the controller into a
> > + * state the driver cannot deal with. The list can be extended as other
> > + * problematic features are identified.
> > + */
> > +static bool nvme_passthru_cmd_allowed(struct nvme_ctrl *ctrl,
> > + struct nvme_ns *ns,
> > + struct nvme_command *c)
> > +{
> > + /*
> > + * This only filters admin commands (ns == NULL). I/O commands share
> > + * the opcode space with admin commands - Dataset Management is 0x09,
> > + * the same value as Set Features - so they must not be inspected here.
> > + */
> > + if (ns || c->common.opcode != nvme_admin_set_features)
> > + return true;
> > +
> > + switch (le32_to_cpu(c->common.cdw10) & 0xff) {
> > + case NVME_FEAT_KATO:
> > + /*
> > + * Keep Alive is optional on PCIe (NVMe 2.0a 5.27.1.12) and the
> > + * driver only arms keep-alive for fabrics. Enabling it on
> > + * other transports starts a keep-alive command the driver is
> > + * not set up for and harms idle power states, so reject it.
> > + */
> > + return ctrl->ops->flags & NVME_F_FABRICS;
> > + default:
> > + return true;
> > + }
> > +}
>
> This doesn't need to be its own function. You can add these checks to
> the existing nvme_cmd_allowed():
>
> ---
> @@ -50,6 +53,18 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
> case NVME_ID_CNS_CTRL:
> return true;
> }
> + } else if (c->common.opcode == nvme_admin_set_features) {
> + switch (cpu_to_le32(c->features.fid) & 0xff) {
> + case NVME_FEAT_KATO:
> + if (ctrl->ops->flags & NVME_F_FABRICS)
> + break;
> + fallthrough;
> + case NVME_FEAT_HOST_BEHAVIOR:
> + case NVME_FEAT_HOST_MEM_BUF:
> + case NVME_FEAT_NUM_QUEUES:
> + case NVME_FEAT_AUTO_PST:
> + return false;
> + }
> }
> goto admin;
> }
> --
© 2016 - 2026 Red Hat, Inc.