From: Alistair Francis <alistair.francis@wdc.com>
If the nvme_tcp_try_send() or nvme_tcp_try_recv() functions return
EKEYEXPIRED, then the underlying TLS keys need to be updated. This occurs
on a KeyUpdate event as described in RFC 8446:
https://datatracker.ietf.org/doc/html/rfc8446#section-4.6.3.

If the NVMe Target (TLS server) initiates a KeyUpdate, this patch allows
the NVMe layer to process the KeyUpdate request and forward it to
userspace. Userspace must then update the key to keep the connection
alive.

This patch allows us to handle the NVMe target sending a KeyUpdate
request without aborting the connection. At this time we don't support
initiating a KeyUpdate.
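
For reference, the KeyUpdate handshake message itself is tiny. A rough C
rendering of the wire format from RFC 8446 section 4.6.3 (illustrative
only; these identifiers are not kernel definitions and are not part of
this patch):

	/* TLS 1.3 KeyUpdate handshake message, RFC 8446 section 4.6.3 */
	enum tls13_key_update_request {
		TLS13_UPDATE_NOT_REQUESTED = 0,
		TLS13_UPDATE_REQUESTED = 1,
	};

	struct tls13_key_update {
		u8 request_update;	/* enum tls13_key_update_request */
	};

If the target sets update_requested, the host must answer with a
KeyUpdate of its own before sending more application data, which is why
the request has to be handed off to userspace here.
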
Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
---
v5:
- Cleanup code flow
- Check for MSG_CTRUNC in the msg_flags return from recvmsg
and use that to determine if it's a control message
v4:
- Remove all support for initiating KeyUpdate
- Don't call cancel_work() when updating keys
v3:
- Don't cancel existing handshake requests
v2:
- Don't change the state
- Use a helper function for KeyUpdates
- Continue sending in nvme_tcp_send_all() after a KeyUpdate
- Remove command message using recvmsg
drivers/nvme/host/tcp.c | 85 +++++++++++++++++++++++++++++++++--------
1 file changed, 70 insertions(+), 15 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4797a4532b0d..5cec5a974bbf 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -172,6 +172,7 @@ struct nvme_tcp_queue {
bool tls_enabled;
u32 rcv_crc;
u32 snd_crc;
+ key_serial_t handshake_session_id;
__le32 exp_ddgst;
__le32 recv_ddgst;
struct completion tls_complete;
@@ -858,7 +859,10 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
{
char *pdu = queue->pdu;
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
struct msghdr msg = {
+ .msg_control = cbuf,
+ .msg_controllen = sizeof(cbuf),
.msg_flags = MSG_DONTWAIT,
};
struct kvec iov = {
@@ -873,12 +877,17 @@ static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
if (ret <= 0)
return ret;
+ hdr = queue->pdu;
+ if (hdr->type == TLS_HANDSHAKE_KEYUPDATE) {
+ dev_err(queue->ctrl->ctrl.device, "KeyUpdate message\n");
+ return 1;
+ }
+
queue->pdu_remaining -= ret;
queue->pdu_offset += ret;
if (queue->pdu_remaining)
return 0;
- hdr = queue->pdu;
if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
if (!nvme_tcp_recv_pdu_supported(hdr->type))
goto unsupported_pdu;
@@ -944,6 +953,7 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
struct request *rq =
nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
return 0;
@@ -976,10 +986,26 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
if (ret < 0) {
- dev_err(queue->ctrl->ctrl.device,
- "queue %d failed to receive request %#x data",
- nvme_tcp_queue_id(queue), rq->tag);
- return ret;
+ /* If MSG_CTRUNC is set, it's a control message,
+ * so let's read the control message.
+ */
+ if (msg.msg_flags & MSG_CTRUNC) {
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+
+ ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
+ }
+
+ if (ret < 0) {
+ dev_dbg(queue->ctrl->ctrl.device,
+ "queue %d failed to receive request %#x data, %d",
+ nvme_tcp_queue_id(queue), rq->tag, ret);
+ return ret;
+ }
+
+ return 0;
}
if (queue->data_digest)
nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
@@ -1384,15 +1410,39 @@ static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
}
} while (result >= 0);
- if (result < 0 && result != -EAGAIN) {
- dev_err(queue->ctrl->ctrl.device,
- "receive failed: %d\n", result);
- queue->rd_enabled = false;
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
- } else if (result == -EAGAIN)
- result = 0;
+ if (result < 0) {
+ if (result != -EKEYEXPIRED && result != -EAGAIN) {
+ dev_err(queue->ctrl->ctrl.device,
+ "receive failed: %d\n", result);
+ queue->rd_enabled = false;
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ }
+ return result;
+ }
+
+ queue->nr_cqe = nr_cqe;
+ return nr_cqe;
+}
+
+static void update_tls_keys(struct nvme_tcp_queue *queue)
+{
+ int qid = nvme_tcp_queue_id(queue);
+ int ret;
+
+ dev_dbg(queue->ctrl->ctrl.device,
+ "updating key for queue %d\n", qid);
- return result < 0 ? result : (queue->nr_cqe = nr_cqe);
+ flush_work(&(queue->ctrl->ctrl).async_event_work);
+
+ ret = nvme_tcp_start_tls(&(queue->ctrl->ctrl),
+ queue, queue->ctrl->ctrl.tls_pskid,
+ HANDSHAKE_KEY_UPDATE_TYPE_RECEIVED);
+
+ if (ret < 0) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to update the keys %d\n", ret);
+ nvme_tcp_fail_request(queue->request);
+ }
}
static void nvme_tcp_io_work(struct work_struct *w)
@@ -1417,8 +1467,11 @@ static void nvme_tcp_io_work(struct work_struct *w)
result = nvme_tcp_try_recvmsg(queue);
if (result > 0)
pending = true;
- else if (unlikely(result < 0))
- return;
+ else if (unlikely(result < 0)) {
+ if (result == -EKEYEXPIRED)
+ update_tls_keys(queue);
+ break;
+ }
/* did we get some space after spending time in recv? */
if (nvme_tcp_queue_has_pending(queue) &&
@@ -1726,6 +1779,7 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid,
ctrl->ctrl.tls_pskid = key_serial(tls_key);
key_put(tls_key);
queue->tls_err = 0;
+ queue->handshake_session_id = handshake_session_id;
}
out_complete:
@@ -1755,6 +1809,7 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
keyring = key_serial(nctrl->opts->keyring);
args.ta_keyring = keyring;
args.ta_timeout_ms = tls_handshake_timeout * 1000;
+ args.handshake_session_id = queue->handshake_session_id;
queue->tls_err = -EOPNOTSUPP;
init_completion(&queue->tls_complete);
if (keyupdate == HANDSHAKE_KEY_UPDATE_TYPE_UNSPEC)
--
2.51.1
On 12/11/2025 6:27, alistair23@gmail.com wrote:
> From: Alistair Francis <alistair.francis@wdc.com>
>
> If the nvme_tcp_try_send() or nvme_tcp_try_recv() functions return
> EKEYEXPIRED then the underlying TLS keys need to be updated. This occurs
> on an KeyUpdate event as described in RFC8446
> https://datatracker.ietf.org/doc/html/rfc8446#section-4.6.3.
>
> If the NVMe Target (TLS server) initiates a KeyUpdate this patch will
> allow the NVMe layer to process the KeyUpdate request and forward the
> request to userspace. Userspace must then update the key to keep the
> connection alive.
>
> This patch allows us to handle the NVMe target sending a KeyUpdate
> request without aborting the connection. At this time we don't support
> initiating a KeyUpdate.
>
> Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> ---
I see this is on top of Hannes' recvmsg patch. Worth noting that in the
patch here, at least.
[ .. ]
> +static void update_tls_keys(struct nvme_tcp_queue *queue)
> +{
> + int qid = nvme_tcp_queue_id(queue);
> + int ret;
> +
> + dev_dbg(queue->ctrl->ctrl.device,
> + "updating key for queue %d\n", qid);
>
> - return result < 0 ? result : (queue->nr_cqe = nr_cqe);
> + flush_work(&(queue->ctrl->ctrl).async_event_work);
> +
> + ret = nvme_tcp_start_tls(&(queue->ctrl->ctrl),
> + queue, queue->ctrl->ctrl.tls_pskid,
> + HANDSHAKE_KEY_UPDATE_TYPE_RECEIVED);
No need to quiesce the queue or anything like that? Everything continues
as usual?

Why are you flushing async_event_work?
On Mon, Dec 1, 2025 at 8:31 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
>
> On 12/11/2025 6:27, alistair23@gmail.com wrote:
[ .. ]
> > - return result < 0 ? result : (queue->nr_cqe = nr_cqe);
> > + flush_work(&(queue->ctrl->ctrl).async_event_work);
> > +
> > + ret = nvme_tcp_start_tls(&(queue->ctrl->ctrl),
> > + queue, queue->ctrl->ctrl.tls_pskid,
> > + HANDSHAKE_KEY_UPDATE_TYPE_RECEIVED);
>
> No need to quiesce the queue or anything like that? everything continues
> as usual?
Everything just continues as usual. Besides a drop in throughput while
waiting for the KeyUpdate, you don't notice it. For testing I trigger
an update every 30 packets; it's slow, but it works.
> Why are you flushing async_event_work?
I think that's just left over from earlier; I'll remove it.
Alistair
On 11/12/25 05:27, alistair23@gmail.com wrote:
> From: Alistair Francis <alistair.francis@wdc.com>
>
> If the nvme_tcp_try_send() or nvme_tcp_try_recv() functions return
> EKEYEXPIRED then the underlying TLS keys need to be updated. This occurs
> on an KeyUpdate event as described in RFC8446
> https://datatracker.ietf.org/doc/html/rfc8446#section-4.6.3.
>
> If the NVMe Target (TLS server) initiates a KeyUpdate this patch will
> allow the NVMe layer to process the KeyUpdate request and forward the
> request to userspace. Userspace must then update the key to keep the
> connection alive.
>
> This patch allows us to handle the NVMe target sending a KeyUpdate
> request without aborting the connection. At this time we don't support
> initiating a KeyUpdate.
>
> Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> ---
[ .. ]
> @@ -873,12 +877,17 @@ static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
> if (ret <= 0)
> return ret;
>
> + hdr = queue->pdu;
> + if (hdr->type == TLS_HANDSHAKE_KEYUPDATE) {
> + dev_err(queue->ctrl->ctrl.device, "KeyUpdate message\n");
> + return 1;
> + }
> +
Errm. 'hdr' is of type 'struct nvme_tcp_hdr', and that most certainly
does not define TLS_HANDSHAKE_KEYUPDATE. I think you should evaluate the
cmsg type here.
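
Something like this, presumably (just a sketch; it assumes the TLS layer
reports the record type via a SOL_TLS / TLS_GET_RECORD_TYPE control
message, and that TLS_RECORD_TYPE_HANDSHAKE from include/net/tls_prot.h
is what a KeyUpdate shows up as):

	struct cmsghdr *cmsg = (struct cmsghdr *)cbuf;

	if (cmsg->cmsg_level == SOL_TLS &&
	    cmsg->cmsg_type == TLS_GET_RECORD_TYPE &&
	    *(u8 *)CMSG_DATA(cmsg) == TLS_RECORD_TYPE_HANDSHAKE) {
		/* post-handshake record (e.g. KeyUpdate) pending */
		dev_dbg(queue->ctrl->ctrl.device, "KeyUpdate message\n");
		return 1;
	}

The zero-initialised cbuf keeps the check safe when no control message
was attached.
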
> queue->pdu_remaining -= ret;
> queue->pdu_offset += ret;
> if (queue->pdu_remaining)
> return 0;
>
> - hdr = queue->pdu;
> if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
> if (!nvme_tcp_recv_pdu_supported(hdr->type))
> goto unsupported_pdu;
> @@ -944,6 +953,7 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> struct request *rq =
> nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
> struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> + char cbuf[CMSG_LEN(sizeof(char))] = {};
>
> if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> return 0;
> @@ -976,10 +986,26 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>
> ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
> if (ret < 0) {
> - dev_err(queue->ctrl->ctrl.device,
> - "queue %d failed to receive request %#x data",
> - nvme_tcp_queue_id(queue), rq->tag);
> - return ret;
> + /* If MSG_CTRUNC is set, it's a control message,
> + * so let's read the control message.
> + */
> + if (msg.msg_flags & MSG_CTRUNC) {
> + memset(&msg, 0, sizeof(msg));
> + msg.msg_flags = MSG_DONTWAIT;
> + msg.msg_control = cbuf;
> + msg.msg_controllen = sizeof(cbuf);
> +
This is not correct; reading the control message implies a kernel
memory allocation as the message buffer, not an iterator (as is the
case here).
[ .. ]
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Thu, Nov 27, 2025 at 11:31 PM Hannes Reinecke <hare@suse.de> wrote:
>
> On 11/12/25 05:27, alistair23@gmail.com wrote:
> > From: Alistair Francis <alistair.francis@wdc.com>
> >
[ .. ]
> > @@ -873,12 +877,17 @@ static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
> > if (ret <= 0)
> > return ret;
> >
> > + hdr = queue->pdu;
> > + if (hdr->type == TLS_HANDSHAKE_KEYUPDATE) {
> > + dev_err(queue->ctrl->ctrl.device, "KeyUpdate message\n");
> > + return 1;
> > + }
> > +
>
> Errm. 'hdr' is of type 'struct nvme_tcp_hdr', and that most certainly
> does not define TLS_HANDSHAKE_KEYUPDATE. I think you should evaluate the
> cmsg type here.
>
> > queue->pdu_remaining -= ret;
> > queue->pdu_offset += ret;
> > if (queue->pdu_remaining)
> > return 0;
> >
> > - hdr = queue->pdu;
> > if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
> > if (!nvme_tcp_recv_pdu_supported(hdr->type))
> > goto unsupported_pdu;
> > @@ -944,6 +953,7 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> > struct request *rq =
> > nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
> > struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> > + char cbuf[CMSG_LEN(sizeof(char))] = {};
> >
> > if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> > return 0;
> > @@ -976,10 +986,26 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> >
> > ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
> > if (ret < 0) {
> > - dev_err(queue->ctrl->ctrl.device,
> > - "queue %d failed to receive request %#x data",
> > - nvme_tcp_queue_id(queue), rq->tag);
> > - return ret;
> > + /* If MSG_CTRUNC is set, it's a control message,
> > + * so let's read the control message.
> > + */
> > + if (msg.msg_flags & MSG_CTRUNC) {
> > + memset(&msg, 0, sizeof(msg));
> > + msg.msg_flags = MSG_DONTWAIT;
> > + msg.msg_control = cbuf;
> > + msg.msg_controllen = sizeof(cbuf);
> > +
> This is not correct; reading the control message implies a kernel
> memory allocation as the message buffer, not an iterator (as is the
> case here).
I don't follow what you mean
Alistair
On 12/1/25 05:18, Alistair Francis wrote:
> On Thu, Nov 27, 2025 at 11:31 PM Hannes Reinecke <hare@suse.de> wrote:
>>
>> On 11/12/25 05:27, alistair23@gmail.com wrote:
>>> From: Alistair Francis <alistair.francis@wdc.com>
[ .. ]
>>> @@ -976,10 +986,26 @@ static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>>>
>>> ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
>>> if (ret < 0) {
>>> - dev_err(queue->ctrl->ctrl.device,
>>> - "queue %d failed to receive request %#x data",
>>> - nvme_tcp_queue_id(queue), rq->tag);
>>> - return ret;
>>> + /* If MSG_CTRUNC is set, it's a control message,
>>> + * so let's read the control message.
>>> + */
>>> + if (msg.msg_flags & MSG_CTRUNC) {
>>> + memset(&msg, 0, sizeof(msg));
>>> + msg.msg_flags = MSG_DONTWAIT;
>>> + msg.msg_control = cbuf;
>>> + msg.msg_controllen = sizeof(cbuf);
>>> +
>> This is not correct; reading the control message implies a kernel
>> memory allocation as the message buffer, not an iterator (as is the
>> case here).
>
> I don't follow what you mean
>
Ah, right. My comment refers to users of tls_alert_recv(), which we
don't do here.
Sorry for the noise.
You can add my:
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Wed, Nov 12, 2025 at 02:27:19PM +1000, alistair23@gmail.com wrote:
>
> ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
> if (ret < 0) {
> + /* If MSG_CTRUNC is set, it's a control message,
> + * so let's read the control message.
> + */
This is not the normal kernel comment style, which would be:
/*
* If MSG_CTRUNC is set, it's a control message, so
* let's read the control message.
 */
> + if (msg.msg_flags & MSG_CTRUNC) {
> + memset(&msg, 0, sizeof(msg));
> + msg.msg_flags = MSG_DONTWAIT;
> + msg.msg_control = cbuf;
> + msg.msg_controllen = sizeof(cbuf);
> +
> + ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
Overly long line. Also given that we're in the main receive handler
it would be nice to have this outside the main flow using a goto and
an unlikely label anyway.
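
Something along these lines, presumably (a sketch only, reusing the
names from the patch, not tested):

	ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
	if (unlikely(ret < 0))
		goto recv_error;
	/* ... normal data path continues unchanged ... */

recv_error:
	/*
	 * If MSG_CTRUNC is set, a control message is queued, so
	 * re-issue recvmsg() with only a control buffer to fetch it.
	 */
	if (msg.msg_flags & MSG_CTRUNC) {
		memset(&msg, 0, sizeof(msg));
		msg.msg_flags = MSG_DONTWAIT;
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof(cbuf);
		ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
		if (ret >= 0)
			return 0;
	}
	dev_dbg(queue->ctrl->ctrl.device,
		"queue %d failed to receive request %#x data, %d\n",
		nvme_tcp_queue_id(queue), rq->tag, ret);
	return ret;
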
On 11/12/25 1:59 AM, Christoph Hellwig wrote:
> On Wed, Nov 12, 2025 at 02:27:19PM +1000, alistair23@gmail.com wrote:
>>
>> ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
>> if (ret < 0) {
>> + /* If MSG_CTRUNC is set, it's a control message,
>> + * so let's read the control message.
>> + */
> This is not the normal kernel comment style, which would be:
>
> /*
> * If MSG_CTRUNC is set, it's a control message, so
> * let's read the control message.
>  */
But it is correct style for net/ .
--
Chuck Lever
On Wed, Nov 12, 2025 at 09:31:35AM -0500, Chuck Lever wrote:
> But it is correct style for net/ .

Maybe. But why would non-net/ code care about their odd preference?
On 11/12/25 9:38 AM, Christoph Hellwig wrote:
> On Wed, Nov 12, 2025 at 09:31:35AM -0500, Chuck Lever wrote:
>> But it is correct style for net/ .
>
> Maybe. But why would non-net/ code care about their odd preference?

Yes. My bad. Carry on!

--
Chuck Lever