[v2] net/tls: support maximum record size limit

[PATCH v2] net/tls: support maximum record size limit

Posted by Wilfred Mallawa 1 month ago

From: Wilfred Mallawa <wilfred.mallawa@wdc.com>

During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.

Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.

This patch adds support for retrieving the negotiated record size limit
during a handshake, and enforcing it at the TLS layer such that outgoing
records are no larger than the size negotiated. This patch depends on
the respective userspace support in tlshd and GnuTLS [2].

[1] https://www.rfc-editor.org/rfc/rfc8449
[2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005

Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
---
 Documentation/networking/tls.rst |  7 ++++++
 include/net/tls.h                |  1 +
 include/uapi/linux/tls.h         |  2 ++
 net/tls/tls_main.c               | 39 ++++++++++++++++++++++++++++++--
 net/tls/tls_sw.c                 |  4 ++++
 5 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..0232df902320 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,13 @@ If the record decrypted turns out to had been padded or is not a data
 record it will be decrypted again into a kernel buffer without zero copy.
 Such events are counted in the ``TlsDecryptRetry`` statistic.
 
+TLS_TX_RECORD_SIZE_LIM
+~~~~~~~~~~~~~~~~~~~~~~
+
+During a TLS handshake, an endpoint may use the record size limit extension
+to specify a maximum record size. This allows enforcing the specified record
+size limit, such that outgoing records do not exceed the limit specified.
+
 Statistics
 ==========
 
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..c9a3759f27ca 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -226,6 +226,7 @@ struct tls_context {
 	u8 rx_conf:3;
 	u8 zerocopy_sendfile:1;
 	u8 rx_no_pad:1;
+	u16 record_size_limit;
 
 	int (*push_pending_record)(struct sock *sk, int flags);
 	void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..3add266d5916 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
 #define TLS_RX			2	/* Set receive parameters */
 #define TLS_TX_ZEROCOPY_RO	3	/* TX zerocopy (only sendfile now) */
 #define TLS_RX_EXPECT_NO_PAD	4	/* Attempt opportunistic zero-copy */
+#define TLS_TX_RECORD_SIZE_LIM	5	/* Maximum record size */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
 	TLS_INFO_RXCONF,
 	TLS_INFO_ZC_RO_TX,
 	TLS_INFO_RX_NO_PAD,
+	TLS_INFO_TX_RECORD_SIZE_LIM,
 	__TLS_INFO_MAX,
 };
 #define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..1098c01f2749 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
 	return rc;
 }
 
+static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval,
+					 unsigned int optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	u16 value;
+
+	if (sockptr_is_null(optval) || optlen != sizeof(value))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&value, optval, sizeof(value)))
+		return -EFAULT;
+
+	if (ctx->prot_info.version == TLS_1_2_VERSION &&
+	    value > TLS_MAX_PAYLOAD_SIZE)
+		return -EINVAL;
+
+	if (ctx->prot_info.version == TLS_1_3_VERSION &&
+	    value > TLS_MAX_PAYLOAD_SIZE + 1)
+		return -EINVAL;
+
+	ctx->record_size_limit = value;
+
+	return 0;
+}
+
 static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 			     unsigned int optlen)
 {
@@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 	case TLS_RX_EXPECT_NO_PAD:
 		rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
 		break;
+	case TLS_TX_RECORD_SIZE_LIM:
+		rc = do_tls_setsockopt_record_size(sk, optval, optlen);
+		break;
 	default:
 		rc = -ENOPROTOOPT;
 		break;
@@ -1065,7 +1093,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx)
 
 static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
 {
-	u16 version, cipher_type;
+	u16 version, cipher_type, record_size_limit;
 	struct tls_context *ctx;
 	struct nlattr *start;
 	int err;
@@ -1110,7 +1138,13 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
 		if (err)
 			goto nla_failure;
 	}
-
+	record_size_limit = ctx->record_size_limit;
+	if (record_size_limit) {
+		err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM,
+				  record_size_limit);
+		if (err)
+			goto nla_failure;
+	}
 	rcu_read_unlock();
 	nla_nest_end(skb, start);
 	return 0;
@@ -1132,6 +1166,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
 		nla_total_size(sizeof(u16)) +	/* TLS_INFO_TXCONF */
 		nla_total_size(0) +		/* TLS_INFO_ZC_RO_TX */
 		nla_total_size(0) +		/* TLS_INFO_RX_NO_PAD */
+		nla_total_size(sizeof(u16)) +   /* TLS_INFO_TX_RECORD_SIZE_LIM */
 		0;
 
 	return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bac65d0d4e3e..9f9359f591d3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
 	unsigned char record_type = TLS_RECORD_TYPE_DATA;
 	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
 	bool eor = !(msg->msg_flags & MSG_MORE);
+	u16 record_size_limit;
 	size_t try_to_copy;
 	ssize_t copied = 0;
 	struct sk_msg *msg_pl, *msg_en;
@@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
 		}
 	}
 
+	record_size_limit = tls_ctx->record_size_limit ?
+			    tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
+
 	while (msg_data_left(msg)) {
 		if (sk->sk_err) {
 			ret = -sk->sk_err;
-- 
2.51.0

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by kernel test robot 1 month ago

Hi Wilfred,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]
[also build test WARNING on net/main linus/master horms-ipvs/master v6.17-rc4 next-20250902]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Wilfred-Mallawa/net-tls-support-maximum-record-size-limit/20250902-114005
base:   net-next/main
patch link:    https://lore.kernel.org/r/20250902033809.177182-2-wilfred.opensource%40gmail.com
patch subject: [PATCH v2] net/tls: support maximum record size limit
config: i386-buildonly-randconfig-001-20250903 (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509030542.ZW9r1S1c-lkp@intel.com/

All warnings (new ones prefixed by >>):

   net/tls/tls_sw.c: In function 'tls_sw_sendmsg_locked':
>> net/tls/tls_sw.c:1036:13: warning: variable 'record_size_limit' set but not used [-Wunused-but-set-variable]
    1036 |         u16 record_size_limit;
         |             ^~~~~~~~~~~~~~~~~


vim +/record_size_limit +1036 net/tls/tls_sw.c

  1024	
  1025	static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
  1026					 size_t size)
  1027	{
  1028		long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
  1029		struct tls_context *tls_ctx = tls_get_ctx(sk);
  1030		struct tls_prot_info *prot = &tls_ctx->prot_info;
  1031		struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
  1032		bool async_capable = ctx->async_capable;
  1033		unsigned char record_type = TLS_RECORD_TYPE_DATA;
  1034		bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
  1035		bool eor = !(msg->msg_flags & MSG_MORE);
> 1036		u16 record_size_limit;
  1037		size_t try_to_copy;
  1038		ssize_t copied = 0;
  1039		struct sk_msg *msg_pl, *msg_en;
  1040		struct tls_rec *rec;
  1041		int required_size;
  1042		int num_async = 0;
  1043		bool full_record;
  1044		int record_room;
  1045		int num_zc = 0;
  1046		int orig_size;
  1047		int ret = 0;
  1048	
  1049		if (!eor && (msg->msg_flags & MSG_EOR))
  1050			return -EINVAL;
  1051	
  1052		if (unlikely(msg->msg_controllen)) {
  1053			ret = tls_process_cmsg(sk, msg, &record_type);
  1054			if (ret) {
  1055				if (ret == -EINPROGRESS)
  1056					num_async++;
  1057				else if (ret != -EAGAIN)
  1058					goto send_end;
  1059			}
  1060		}
  1061	
  1062		record_size_limit = tls_ctx->record_size_limit ?
  1063				    tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
  1064	
  1065		while (msg_data_left(msg)) {
  1066			if (sk->sk_err) {
  1067				ret = -sk->sk_err;
  1068				goto send_end;
  1069			}
  1070	
  1071			if (ctx->open_rec)
  1072				rec = ctx->open_rec;
  1073			else
  1074				rec = ctx->open_rec = tls_get_rec(sk);
  1075			if (!rec) {
  1076				ret = -ENOMEM;
  1077				goto send_end;
  1078			}
  1079	
  1080			msg_pl = &rec->msg_plaintext;
  1081			msg_en = &rec->msg_encrypted;
  1082	
  1083			orig_size = msg_pl->sg.size;
  1084			full_record = false;
  1085			try_to_copy = msg_data_left(msg);
  1086			record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
  1087			if (try_to_copy >= record_room) {
  1088				try_to_copy = record_room;
  1089				full_record = true;
  1090			}
  1091	
  1092			required_size = msg_pl->sg.size + try_to_copy +
  1093					prot->overhead_size;
  1094	
  1095			if (!sk_stream_memory_free(sk))
  1096				goto wait_for_sndbuf;
  1097	
  1098	alloc_encrypted:
  1099			ret = tls_alloc_encrypted_msg(sk, required_size);
  1100			if (ret) {
  1101				if (ret != -ENOSPC)
  1102					goto wait_for_memory;
  1103	
  1104				/* Adjust try_to_copy according to the amount that was
  1105				 * actually allocated. The difference is due
  1106				 * to max sg elements limit
  1107				 */
  1108				try_to_copy -= required_size - msg_en->sg.size;
  1109				full_record = true;
  1110			}
  1111	
  1112			if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) {
  1113				ret = tls_sw_sendmsg_splice(sk, msg, msg_pl,
  1114							    try_to_copy, &copied);
  1115				if (ret < 0)
  1116					goto send_end;
  1117				tls_ctx->pending_open_record_frags = true;
  1118	
  1119				if (sk_msg_full(msg_pl))
  1120					full_record = true;
  1121	
  1122				if (full_record || eor)
  1123					goto copied;
  1124				continue;
  1125			}
  1126	
  1127			if (!is_kvec && (full_record || eor) && !async_capable) {
  1128				u32 first = msg_pl->sg.end;
  1129	
  1130				ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
  1131								msg_pl, try_to_copy);
  1132				if (ret)
  1133					goto fallback_to_reg_send;
  1134	
  1135				num_zc++;
  1136				copied += try_to_copy;
  1137	
  1138				sk_msg_sg_copy_set(msg_pl, first);
  1139				ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
  1140							  record_type, &copied,
  1141							  msg->msg_flags);
  1142				if (ret) {
  1143					if (ret == -EINPROGRESS)
  1144						num_async++;
  1145					else if (ret == -ENOMEM)
  1146						goto wait_for_memory;
  1147					else if (ctx->open_rec && ret == -ENOSPC) {
  1148						if (msg_pl->cork_bytes) {
  1149							ret = 0;
  1150							goto send_end;
  1151						}
  1152						goto rollback_iter;
  1153					} else if (ret != -EAGAIN)
  1154						goto send_end;
  1155				}
  1156				continue;
  1157	rollback_iter:
  1158				copied -= try_to_copy;
  1159				sk_msg_sg_copy_clear(msg_pl, first);
  1160				iov_iter_revert(&msg->msg_iter,
  1161						msg_pl->sg.size - orig_size);
  1162	fallback_to_reg_send:
  1163				sk_msg_trim(sk, msg_pl, orig_size);
  1164			}
  1165	
  1166			required_size = msg_pl->sg.size + try_to_copy;
  1167	
  1168			ret = tls_clone_plaintext_msg(sk, required_size);
  1169			if (ret) {
  1170				if (ret != -ENOSPC)
  1171					goto send_end;
  1172	
  1173				/* Adjust try_to_copy according to the amount that was
  1174				 * actually allocated. The difference is due
  1175				 * to max sg elements limit
  1176				 */
  1177				try_to_copy -= required_size - msg_pl->sg.size;
  1178				full_record = true;
  1179				sk_msg_trim(sk, msg_en,
  1180					    msg_pl->sg.size + prot->overhead_size);
  1181			}
  1182	
  1183			if (try_to_copy) {
  1184				ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter,
  1185							       msg_pl, try_to_copy);
  1186				if (ret < 0)
  1187					goto trim_sgl;
  1188			}
  1189	
  1190			/* Open records defined only if successfully copied, otherwise
  1191			 * we would trim the sg but not reset the open record frags.
  1192			 */
  1193			tls_ctx->pending_open_record_frags = true;
  1194			copied += try_to_copy;
  1195	copied:
  1196			if (full_record || eor) {
  1197				ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
  1198							  record_type, &copied,
  1199							  msg->msg_flags);
  1200				if (ret) {
  1201					if (ret == -EINPROGRESS)
  1202						num_async++;
  1203					else if (ret == -ENOMEM)
  1204						goto wait_for_memory;
  1205					else if (ret != -EAGAIN) {
  1206						if (ret == -ENOSPC)
  1207							ret = 0;
  1208						goto send_end;
  1209					}
  1210				}
  1211			}
  1212	
  1213			continue;
  1214	
  1215	wait_for_sndbuf:
  1216			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  1217	wait_for_memory:
  1218			ret = sk_stream_wait_memory(sk, &timeo);
  1219			if (ret) {
  1220	trim_sgl:
  1221				if (ctx->open_rec)
  1222					tls_trim_both_msgs(sk, orig_size);
  1223				goto send_end;
  1224			}
  1225	
  1226			if (ctx->open_rec && msg_en->sg.size < required_size)
  1227				goto alloc_encrypted;
  1228		}
  1229	
  1230		if (!num_async) {
  1231			goto send_end;
  1232		} else if (num_zc || eor) {
  1233			int err;
  1234	
  1235			/* Wait for pending encryptions to get completed */
  1236			err = tls_encrypt_async_wait(ctx);
  1237			if (err) {
  1238				ret = err;
  1239				copied = 0;
  1240			}
  1241		}
  1242	
  1243		/* Transmit if any encryptions have completed */
  1244		if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
  1245			cancel_delayed_work(&ctx->tx_work.work);
  1246			tls_tx_records(sk, msg->msg_flags);
  1247		}
  1248	
  1249	send_end:
  1250		ret = sk_stream_error(sk, msg->msg_flags, ret);
  1251		return copied > 0 ? copied : ret;
  1252	}
  1253	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Sabrina Dubroca 1 month ago

2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> 
> During a handshake, an endpoint may specify a maximum record size limit.
> Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
> maximum record size. Meaning that, the outgoing records from the kernel
> can exceed a lower size negotiated during the handshake. In such a case,
> the TLS endpoint must send a fatal "record_overflow" alert [1], and
> thus the record is discarded.
> 
> Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
> support. For these devices, supporting TLS record size negotiation is
> necessary because the maximum TLS record size supported by the controller
> is less than the default 16KB currently used by the kernel.
> 
> This patch adds support for retrieving the negotiated record size limit
> during a handshake, and enforcing it at the TLS layer such that outgoing
> records are no larger than the size negotiated. This patch depends on
> the respective userspace support in tlshd and GnuTLS [2].
> 
> [1] https://www.rfc-editor.org/rfc/rfc8449
> [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
> 
> Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> ---
>  Documentation/networking/tls.rst |  7 ++++++
>  include/net/tls.h                |  1 +
>  include/uapi/linux/tls.h         |  2 ++
>  net/tls/tls_main.c               | 39 ++++++++++++++++++++++++++++++--
>  net/tls/tls_sw.c                 |  4 ++++
>  5 files changed, 51 insertions(+), 2 deletions(-)

A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
not sure what we could do on the "RX" side to check that we are
respecting the size restriction. Use a basic TCP socket and try to
parse (and then discard without decrypting) records manually out of
the stream and see if we got the length we wanted?


> diff --git a/include/net/tls.h b/include/net/tls.h
> index 857340338b69..c9a3759f27ca 100644
> --- a/include/net/tls.h
> +++ b/include/net/tls.h
> @@ -226,6 +226,7 @@ struct tls_context {
>  	u8 rx_conf:3;
>  	u8 zerocopy_sendfile:1;
>  	u8 rx_no_pad:1;
> +	u16 record_size_limit;

Maybe "tx_record_size_limit", since it's not intended for RX?

I don't know if the kernel will ever have a need to enforce the RX
record size, but it would maybe avoid future head-scratching "why is
this not used on the RX path?"



> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> index a3ccb3135e51..1098c01f2749 100644
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
>  	return rc;
>  }
>  
> +static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval,
> +					 unsigned int optlen)
> +{
> +	struct tls_context *ctx = tls_get_ctx(sk);
> +	u16 value;
> +
> +	if (sockptr_is_null(optval) || optlen != sizeof(value))
> +		return -EINVAL;
> +
> +	if (copy_from_sockptr(&value, optval, sizeof(value)))
> +		return -EFAULT;
> +
> +	if (ctx->prot_info.version == TLS_1_2_VERSION &&
> +	    value > TLS_MAX_PAYLOAD_SIZE)
> +		return -EINVAL;
> +
> +	if (ctx->prot_info.version == TLS_1_3_VERSION &&
> +	    value > TLS_MAX_PAYLOAD_SIZE + 1)
> +		return -EINVAL;
> +
> +	ctx->record_size_limit = value;
> +
> +	return 0;
> +}
> +
>  static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
>  			     unsigned int optlen)
>  {
> @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
>  	case TLS_RX_EXPECT_NO_PAD:
>  		rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
>  		break;
> +	case TLS_TX_RECORD_SIZE_LIM:
> +		rc = do_tls_setsockopt_record_size(sk, optval, optlen);
> +		break;

Adding the corresponding changes to do_tls_getsockopt would also be good.


> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index bac65d0d4e3e..9f9359f591d3 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
>  	unsigned char record_type = TLS_RECORD_TYPE_DATA;
>  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
>  	bool eor = !(msg->msg_flags & MSG_MORE);
> +	u16 record_size_limit;
>  	size_t try_to_copy;
>  	ssize_t copied = 0;
>  	struct sk_msg *msg_pl, *msg_en;
> @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
>  		}
>  	}
>  
> +	record_size_limit = tls_ctx->record_size_limit ?
> +			    tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;

As Simon said (good catch Simon :)), this isn't used anywhere. Are you
sure this patch works? The previous version had a hunk in
tls_sw_sendmsg_locked that looks like what I would expect.

And the the offloaded TX path (in net/tls/tls_device.c) would also
need similar changes.


I'm wondering if it's better to add this conditional, or just
initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the
tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with
tls_ctx->record_size_limit in a few places?

-- 
Sabrina

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Wilfred Mallawa 1 month ago

On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> > 
> > During a handshake, an endpoint may specify a maximum record size
> > limit.
> > Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for
> > the
> > maximum record size. Meaning that, the outgoing records from the
> > kernel
> > can exceed a lower size negotiated during the handshake. In such a
> > case,
> > the TLS endpoint must send a fatal "record_overflow" alert [1], and
> > thus the record is discarded.
> > 
> > Upcoming Western Digital NVMe-TCP hardware controllers implement
> > TLS
> > support. For these devices, supporting TLS record size negotiation
> > is
> > necessary because the maximum TLS record size supported by the
> > controller
> > is less than the default 16KB currently used by the kernel.
> > 
> > This patch adds support for retrieving the negotiated record size
> > limit
> > during a handshake, and enforcing it at the TLS layer such that
> > outgoing
> > records are no larger than the size negotiated. This patch depends
> > on
> > the respective userspace support in tlshd and GnuTLS [2].
> > 
> > [1] https://www.rfc-editor.org/rfc/rfc8449
> > [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
> > 
> > Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> > ---
> >  Documentation/networking/tls.rst |  7 ++++++
> >  include/net/tls.h                |  1 +
> >  include/uapi/linux/tls.h         |  2 ++
> >  net/tls/tls_main.c               | 39
> > ++++++++++++++++++++++++++++++--
> >  net/tls/tls_sw.c                 |  4 ++++
> >  5 files changed, 51 insertions(+), 2 deletions(-)
> 
Hey Sabrina,
> A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
> not sure what we could do on the "RX" side to check that we are
> respecting the size restriction. Use a basic TCP socket and try to
> parse (and then discard without decrypting) records manually out of
> the stream and see if we got the length we wanted?
> 
So far I have just been using an NVMe TCP Target with TLS enabled and
checking that the targets RX record sizes are <= negotiated size in
tls_rx_one_record(). I didn't check for this patch and the bug below
got through...my bad!

Is it possible to get the exact record length into the testing layer?
Wouldn't the socket just return N bytes received which doesn't
necessarily correlate to a record size?
> 
> > diff --git a/include/net/tls.h b/include/net/tls.h
> > index 857340338b69..c9a3759f27ca 100644
> > --- a/include/net/tls.h
> > +++ b/include/net/tls.h
> > @@ -226,6 +226,7 @@ struct tls_context {
> >  	u8 rx_conf:3;
> >  	u8 zerocopy_sendfile:1;
> >  	u8 rx_no_pad:1;
> > +	u16 record_size_limit;
> 
> Maybe "tx_record_size_limit", since it's not intended for RX?
> 
> I don't know if the kernel will ever have a need to enforce the RX
> record size, but it would maybe avoid future head-scratching "why is
> this not used on the RX path?"
Ah good point, I think this makes sense.
> 
> 
> 
> > diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> > index a3ccb3135e51..1098c01f2749 100644
> > --- a/net/tls/tls_main.c
> > +++ b/net/tls/tls_main.c
> > @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct
> > sock *sk, sockptr_t optval,
> >  	return rc;
> >  }
> >  
> > +static int do_tls_setsockopt_record_size(struct sock *sk,
> > sockptr_t optval,
> > +					 unsigned int optlen)
> > +{
> > +	struct tls_context *ctx = tls_get_ctx(sk);
> > +	u16 value;
> > +
> > +	if (sockptr_is_null(optval) || optlen != sizeof(value))
> > +		return -EINVAL;
> > +
> > +	if (copy_from_sockptr(&value, optval, sizeof(value)))
> > +		return -EFAULT;
> > +
> > +	if (ctx->prot_info.version == TLS_1_2_VERSION &&
> > +	    value > TLS_MAX_PAYLOAD_SIZE)
> > +		return -EINVAL;
> > +
> > +	if (ctx->prot_info.version == TLS_1_3_VERSION &&
> > +	    value > TLS_MAX_PAYLOAD_SIZE + 1)
> > +		return -EINVAL;
> > +
> > +	ctx->record_size_limit = value;
> > +
> > +	return 0;
> > +}
> > +
> >  static int do_tls_setsockopt(struct sock *sk, int optname,
> > sockptr_t optval,
> >  			     unsigned int optlen)
> >  {
> > @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk,
> > int optname, sockptr_t optval,
> >  	case TLS_RX_EXPECT_NO_PAD:
> >  		rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
> >  		break;
> > +	case TLS_TX_RECORD_SIZE_LIM:
> > +		rc = do_tls_setsockopt_record_size(sk, optval,
> > optlen);
> > +		break;
> 
> Adding the corresponding changes to do_tls_getsockopt would also be
> good.
> 
okay, I will add that.
> 
> > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> > index bac65d0d4e3e..9f9359f591d3 100644
> > --- a/net/tls/tls_sw.c
> > +++ b/net/tls/tls_sw.c
> > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> >  	unsigned char record_type = TLS_RECORD_TYPE_DATA;
> >  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> >  	bool eor = !(msg->msg_flags & MSG_MORE);
> > +	u16 record_size_limit;
> >  	size_t try_to_copy;
> >  	ssize_t copied = 0;
> >  	struct sk_msg *msg_pl, *msg_en;
> > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> >  		}
> >  	}
> >  
> > +	record_size_limit = tls_ctx->record_size_limit ?
> > +			    tls_ctx->record_size_limit :
> > TLS_MAX_PAYLOAD_SIZE;
> 
> As Simon said (good catch Simon :)), this isn't used anywhere. Are
> you
> sure this patch works? The previous version had a hunk in
> tls_sw_sendmsg_locked that looks like what I would expect.
> 
This is a bug! I missed adding that hunk.
> And the the offloaded TX path (in net/tls/tls_device.c) would also
> need similar changes.
> 
Okay, will add in V3.
> 
> I'm wondering if it's better to add this conditional, or just
> initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the
> tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with
> tls_ctx->record_size_limit in a few places?
Yeah I think sounds better, will add for V3.

Thanks for the feedback!

Regards,
Wilfred

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Sabrina Dubroca 1 month ago

2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote:
> On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> Hey Sabrina,
> > A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
> > not sure what we could do on the "RX" side to check that we are
> > respecting the size restriction. Use a basic TCP socket and try to
> > parse (and then discard without decrypting) records manually out of
> > the stream and see if we got the length we wanted?
> > 
> So far I have just been using an NVMe TCP Target with TLS enabled and
> checking that the targets RX record sizes are <= negotiated size in
> tls_rx_one_record(). I didn't check for this patch and the bug below
> got through...my bad!
> 
> Is it possible to get the exact record length into the testing layer?

Not really, unless we come up with some mechanism using probes. I
wouldn't go that route unless we don't have any other choice.

> Wouldn't the socket just return N bytes received which doesn't
> necessarily correlate to a record size?

Yes. That's why I suggested only using ktls on one side of the test,
and parsing the records out of the raw stream of bytes on the RX side.

Actually, control records don't get aggregated on read, so sending a
large non-data buffer should result in separate limit-sized reads. But
this makes me wonder if this limit is supposed to apply to control
records, and how the userspace library/application is supposed to deal
with the possible splitting of those records?


Here's a rough example of what I had in mind. The hardcoded cipher
overhead is a bit ugly but I don't see a way around it. Sanity check
at the end is probably not needed. I didn't write the loop because I
haven't had enough coffee yet to get that right :)


TEST(tx_record_size)
{
	struct tls_crypto_info_keys tls12;
	int cfd, ret, fd, len, overhead;
	char buf[1000], buf2[2000];
	__u16 limit = 100;
	bool notls;

	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128,
			     &tls12, 0);

	ulp_sock_pair(_metadata, &fd, &cfd, &notls);

	if (notls)
		exit(KSFT_SKIP);

	/* Don't install keys on fd, we'll parse raw records */
	ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len);
	ASSERT_EQ(ret, 0);

	ret = setsockopt(cfd, SOL_TLS, TLS_TX_RECORD_SIZE_LIM, &limit, sizeof(limit));
	ASSERT_EQ(ret, 0);

	EXPECT_EQ(send(cfd, buf, sizeof(buf), 0), sizeof(buf));
	close(cfd);

	ret = recv(fd, buf2, sizeof(buf2), 0);
	memcpy(&len, buf2 + 3, 2);
	len = htons(len);

	/* 16B tag + 8B IV -- record header (5B) is not counted but we'll need it to walk the record stream */
	overhead = 16 + 8;

	// TODO should be <= limit since we may not have filled every
	// record (especially the last one), and loop over all the
	// records we got
	// next record starts at buf2 + (limit + overhead + 5)
	ASSERT_EQ(len, limit + overhead);
	/* sanity check that it's a TLS header for application data */
	ASSERT_EQ(buf2[0], 23);
	ASSERT_EQ(buf2[1], 0x3);
	ASSERT_EQ(buf2[2], 0x3);

	close(fd);
}


-- 
Sabrina

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Wilfred Mallawa 4 weeks ago

On Wed, 2025-09-03 at 10:21 +0200, Sabrina Dubroca wrote:
> 2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote:
> > On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> > > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> > Hey Sabrina,
> > > A selftest would be nice (tools/testing/selftests/net/tls.c), but
> > > I'm
> > > not sure what we could do on the "RX" side to check that we are
> > > respecting the size restriction. Use a basic TCP socket and try
> > > to
> > > parse (and then discard without decrypting) records manually out
> > > of
> > > the stream and see if we got the length we wanted?
> > > 
> > So far I have just been using an NVMe TCP Target with TLS enabled
> > and
> > checking that the targets RX record sizes are <= negotiated size in
> > tls_rx_one_record(). I didn't check for this patch and the bug
> > below
> > got through...my bad!
> > 
> > Is it possible to get the exact record length into the testing
> > layer?
> 
> Not really, unless we come up with some mechanism using probes. I
> wouldn't go that route unless we don't have any other choice.
> 
> > Wouldn't the socket just return N bytes received which doesn't
> > necessarily correlate to a record size?
> 
> Yes. That's why I suggested only using ktls on one side of the test,
> and parsing the records out of the raw stream of bytes on the RX
> side.
> 
Ah okay I see.
> Actually, control records don't get aggregated on read, so sending a
> large non-data buffer should result in separate limit-sized reads.
> But
> this makes me wonder if this limit is supposed to apply to control
> records, and how the userspace library/application is supposed to
> deal
> with the possible splitting of those records?
> 
Good point, from the spec, "When the "record_size_limit" extension is
negotiated, an endpoint MUST NOT generate a protected record with
plaintext that is larger than the RecordSizeLimit value it receives
from its peer. Unprotected messages are not subject to this limit." [1]

From what I understand, as long as it in encrypted. It must respect the
record size limit?

In regards to user-space, do you mean for TX or RX? For TX, there
shouldn't need to be any changes as record splitting occurs in the
kernel. For RX, I am not too sure, but this patch shouldn't change
anything for that case?

[1] https://datatracker.ietf.org/doc/html/rfc8449#section-4
> 
> Here's a rough example of what I had in mind. The hardcoded cipher
> overhead is a bit ugly but I don't see a way around it. Sanity check
> at the end is probably not needed. I didn't write the loop because I
> haven't had enough coffee yet to get that right :)
> 
Ha! Great! Thanks for the example. I am not too familiar with the self
tests in the kernel. But will try to it for the next round.

Thanks,
Wilfred
> 
> TEST(tx_record_size)
> {
> 	struct tls_crypto_info_keys tls12;
> 	int cfd, ret, fd, len, overhead;
> 	char buf[1000], buf2[2000];
> 	__u16 limit = 100;
> 	bool notls;
> 
> 	tls_crypto_info_init(TLS_1_2_VERSION,
> TLS_CIPHER_AES_CCM_128,
> 			     &tls12, 0);
> 
> 	ulp_sock_pair(_metadata, &fd, &cfd, &notls);
> 
> 	if (notls)
> 		exit(KSFT_SKIP);
> 
> 	/* Don't install keys on fd, we'll parse raw records */
> 	ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len);
> 	ASSERT_EQ(ret, 0);
> 
> 	ret = setsockopt(cfd, SOL_TLS, TLS_TX_RECORD_SIZE_LIM,
> &limit, sizeof(limit));
> 	ASSERT_EQ(ret, 0);
> 
> 	EXPECT_EQ(send(cfd, buf, sizeof(buf), 0), sizeof(buf));
> 	close(cfd);
> 
> 	ret = recv(fd, buf2, sizeof(buf2), 0);
> 	memcpy(&len, buf2 + 3, 2);
> 	len = htons(len);
> 
> 	/* 16B tag + 8B IV -- record header (5B) is not counted but
> we'll need it to walk the record stream */
> 	overhead = 16 + 8;
> 
> 	// TODO should be <= limit since we may not have filled
> every
> 	// record (especially the last one), and loop over all the
> 	// records we got
> 	// next record starts at buf2 + (limit + overhead + 5)
> 	ASSERT_EQ(len, limit + overhead);
> 	/* sanity check that it's a TLS header for application data
> */
> 	ASSERT_EQ(buf2[0], 23);
> 	ASSERT_EQ(buf2[1], 0x3);
> 	ASSERT_EQ(buf2[2], 0x3);
> 
> 	close(fd);
> }
>

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Sabrina Dubroca 3 weeks, 4 days ago

2025-09-04, 23:31:23 +0000, Wilfred Mallawa wrote:
> On Wed, 2025-09-03 at 10:21 +0200, Sabrina Dubroca wrote:
> > 2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote:
> > > On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> > > > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > > > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> > > Hey Sabrina,
> > > > A selftest would be nice (tools/testing/selftests/net/tls.c), but
> > > > I'm
> > > > not sure what we could do on the "RX" side to check that we are
> > > > respecting the size restriction. Use a basic TCP socket and try
> > > > to
> > > > parse (and then discard without decrypting) records manually out
> > > > of
> > > > the stream and see if we got the length we wanted?
> > > > 
> > > So far I have just been using an NVMe TCP Target with TLS enabled
> > > and
> > > checking that the targets RX record sizes are <= negotiated size in
> > > tls_rx_one_record(). I didn't check for this patch and the bug
> > > below
> > > got through...my bad!
> > > 
> > > Is it possible to get the exact record length into the testing
> > > layer?
> > 
> > Not really, unless we come up with some mechanism using probes. I
> > wouldn't go that route unless we don't have any other choice.
> > 
> > > Wouldn't the socket just return N bytes received which doesn't
> > > necessarily correlate to a record size?
> > 
> > Yes. That's why I suggested only using ktls on one side of the test,
> > and parsing the records out of the raw stream of bytes on the RX
> > side.
> > 
> Ah okay I see.
> > Actually, control records don't get aggregated on read, so sending a
> > large non-data buffer should result in separate limit-sized reads.
> > But
> > this makes me wonder if this limit is supposed to apply to control
> > records, and how the userspace library/application is supposed to
> > deal
> > with the possible splitting of those records?
> > 
> Good point, from the spec, "When the "record_size_limit" extension is
> negotiated, an endpoint MUST NOT generate a protected record with
> plaintext that is larger than the RecordSizeLimit value it receives
> from its peer. Unprotected messages are not subject to this limit." [1]
> 
> From what I understand, as long as it in encrypted. It must respect the
> record size limit?

Yes, and the kernel will make sure to split all the data it sends over
records of the maximum acceptable length (currently
TLS_MAX_PAYLOAD_SIZE, with your patch tx_record_size_limit). The
question was more about what happens if userspace does a send(!DATA,
length > tx_record_size_limit). The kernel will happily split that
over N consecutive records of tx_record_size_limit (or fewer) bytes,
and the peer will receive N separate messages. But this could already
happen with a non-DATA record larger than TLS_MAX_PAYLOAD_SIZE, so
it's not really something we need to worry about here. It's a concern
for the userspace library (reconstructing the original message from
consecutive records read separately from the ktls socket). So, my
comment here was pretty much noise, sorry.

> In regards to user-space, do you mean for TX or RX? For TX, there
> shouldn't need to be any changes as record splitting occurs in the
> kernel. For RX, I am not too sure, but this patch shouldn't change
> anything for that case?

Yes, I'm talking about TX here.

-- 
Sabrina

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Simon Horman 1 month ago

On Tue, Sep 02, 2025 at 01:38:10PM +1000, Wilfred Mallawa wrote:
> From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> 
> During a handshake, an endpoint may specify a maximum record size limit.
> Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
> maximum record size. Meaning that, the outgoing records from the kernel
> can exceed a lower size negotiated during the handshake. In such a case,
> the TLS endpoint must send a fatal "record_overflow" alert [1], and
> thus the record is discarded.
> 
> Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
> support. For these devices, supporting TLS record size negotiation is
> necessary because the maximum TLS record size supported by the controller
> is less than the default 16KB currently used by the kernel.
> 
> This patch adds support for retrieving the negotiated record size limit
> during a handshake, and enforcing it at the TLS layer such that outgoing
> records are no larger than the size negotiated. This patch depends on
> the respective userspace support in tlshd and GnuTLS [2].
> 
> [1] https://www.rfc-editor.org/rfc/rfc8449
> [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
> 
> Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>

Hi Wilfred,

I'll leave review of this approach to others.
But in the meantime I wanted to pass on a minor problem I noticed in the code

> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index bac65d0d4e3e..9f9359f591d3 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
>  	unsigned char record_type = TLS_RECORD_TYPE_DATA;
>  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
>  	bool eor = !(msg->msg_flags & MSG_MORE);
> +	u16 record_size_limit;
>  	size_t try_to_copy;
>  	ssize_t copied = 0;
>  	struct sk_msg *msg_pl, *msg_en;
> @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
>  		}
>  	}
>  
> +	record_size_limit = tls_ctx->record_size_limit ?
> +			    tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
> +
>  	while (msg_data_left(msg)) {
>  		if (sk->sk_err) {
>  			ret = -sk->sk_err;

record_size_limit is set but otherwise unused.
Did you forget to add something here?

If not, please remove record_size_limit from this function.

-- 
pw-bot: changes-requested

Re: [PATCH v2] net/tls: support maximum record size limit

Posted by Wilfred Mallawa 1 month ago

On Tue, 2025-09-02 at 12:40 +0100, Simon Horman wrote:
> 
[snip]
> Hi Wilfred,
> 
> I'll leave review of this approach to others.
> But in the meantime I wanted to pass on a minor problem I noticed in
> the code
> 
> > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> > index bac65d0d4e3e..9f9359f591d3 100644
> > --- a/net/tls/tls_sw.c
> > +++ b/net/tls/tls_sw.c
> > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> >  	unsigned char record_type = TLS_RECORD_TYPE_DATA;
> >  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> >  	bool eor = !(msg->msg_flags & MSG_MORE);
> > +	u16 record_size_limit;
> >  	size_t try_to_copy;
> >  	ssize_t copied = 0;
> >  	struct sk_msg *msg_pl, *msg_en;
> > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> >  		}
> >  	}
> >  
> > +	record_size_limit = tls_ctx->record_size_limit ?
> > +			    tls_ctx->record_size_limit :
> > TLS_MAX_PAYLOAD_SIZE;
> > +
> >  	while (msg_data_left(msg)) {
> >  		if (sk->sk_err) {
> >  			ret = -sk->sk_err;
> 
> record_size_limit is set but otherwise unused.
> Did you forget to add something here?
> 
> If not, please remove record_size_limit from this function.
Hey Simon,

Yes, this is an error. I will fixup. Thank you.

Regards,
Wilfred