Documentation/networking/tls.rst | 7 ++++++ include/net/tls.h | 1 + include/uapi/linux/tls.h | 2 ++ net/tls/tls_main.c | 39 ++++++++++++++++++++++++++++++-- net/tls/tls_sw.c | 4 ++++ 5 files changed, 51 insertions(+), 2 deletions(-)
From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
This patch adds support for retrieving the negotiated record size limit
during a handshake, and enforcing it at the TLS layer such that outgoing
records are no larger than the size negotiated. This patch depends on
the respective userspace support in tlshd and GnuTLS [2].
[1] https://www.rfc-editor.org/rfc/rfc8449
[2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
---
Documentation/networking/tls.rst | 7 ++++++
include/net/tls.h | 1 +
include/uapi/linux/tls.h | 2 ++
net/tls/tls_main.c | 39 ++++++++++++++++++++++++++++++--
net/tls/tls_sw.c | 4 ++++
5 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..0232df902320 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,13 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_RECORD_SIZE_LIM
+~~~~~~~~~~~~~~~~~~~~~~
+
+During a TLS handshake, an endpoint may use the record size limit extension
+to specify a maximum record size. This allows enforcing the specified record
+size limit, such that outgoing records do not exceed the limit specified.
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..c9a3759f27ca 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -226,6 +226,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 record_size_limit;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..3add266d5916 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_RECORD_SIZE_LIM 5 /* Maximum record size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_RECORD_SIZE_LIM,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..1098c01f2749 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 value;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (ctx->prot_info.version == TLS_1_2_VERSION &&
+ value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ if (ctx->prot_info.version == TLS_1_3_VERSION &&
+ value > TLS_MAX_PAYLOAD_SIZE + 1)
+ return -EINVAL;
+
+ ctx->record_size_limit = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_RECORD_SIZE_LIM:
+ rc = do_tls_setsockopt_record_size(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1065,7 +1093,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx)
static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
{
- u16 version, cipher_type;
+ u16 version, cipher_type, record_size_limit;
struct tls_context *ctx;
struct nlattr *start;
int err;
@@ -1110,7 +1138,13 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
if (err)
goto nla_failure;
}
-
+ record_size_limit = ctx->record_size_limit;
+ if (record_size_limit) {
+ err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM,
+ record_size_limit);
+ if (err)
+ goto nla_failure;
+ }
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1132,6 +1166,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_RECORD_SIZE_LIM */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bac65d0d4e3e..9f9359f591d3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
unsigned char record_type = TLS_RECORD_TYPE_DATA;
bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
bool eor = !(msg->msg_flags & MSG_MORE);
+ u16 record_size_limit;
size_t try_to_copy;
ssize_t copied = 0;
struct sk_msg *msg_pl, *msg_en;
@@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
}
}
+ record_size_limit = tls_ctx->record_size_limit ?
+ tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
+
while (msg_data_left(msg)) {
if (sk->sk_err) {
ret = -sk->sk_err;
--
2.51.0
Hi Wilfred, kernel test robot noticed the following build warnings: [auto build test WARNING on net-next/main] [also build test WARNING on net/main linus/master horms-ipvs/master v6.17-rc4 next-20250902] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Wilfred-Mallawa/net-tls-support-maximum-record-size-limit/20250902-114005 base: net-next/main patch link: https://lore.kernel.org/r/20250902033809.177182-2-wilfred.opensource%40gmail.com patch subject: [PATCH v2] net/tls: support maximum record size limit config: i386-buildonly-randconfig-001-20250903 (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202509030542.ZW9r1S1c-lkp@intel.com/ All warnings (new ones prefixed by >>): net/tls/tls_sw.c: In function 'tls_sw_sendmsg_locked': >> net/tls/tls_sw.c:1036:13: warning: variable 'record_size_limit' set but not used [-Wunused-but-set-variable] 1036 | u16 record_size_limit; | ^~~~~~~~~~~~~~~~~ vim +/record_size_limit +1036 net/tls/tls_sw.c 1024 1025 static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, 1026 size_t size) 1027 { 1028 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1029 struct tls_context *tls_ctx = tls_get_ctx(sk); 1030 struct tls_prot_info *prot = &tls_ctx->prot_info; 1031 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 1032 bool async_capable = ctx->async_capable; 1033 unsigned char record_type = TLS_RECORD_TYPE_DATA; 1034 bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); 1035 bool eor = !(msg->msg_flags & MSG_MORE); > 1036 u16 record_size_limit; 1037 size_t try_to_copy; 1038 ssize_t copied = 0; 1039 struct sk_msg *msg_pl, *msg_en; 1040 struct tls_rec *rec; 1041 int required_size; 1042 int num_async = 0; 1043 bool full_record; 1044 int record_room; 1045 int num_zc = 0; 1046 int orig_size; 1047 int ret = 0; 1048 1049 if (!eor && (msg->msg_flags & MSG_EOR)) 1050 return -EINVAL; 1051 1052 if (unlikely(msg->msg_controllen)) { 1053 ret = tls_process_cmsg(sk, msg, &record_type); 1054 if (ret) { 1055 if (ret == -EINPROGRESS) 1056 num_async++; 1057 else if (ret != -EAGAIN) 1058 goto send_end; 1059 } 1060 } 1061 1062 record_size_limit = tls_ctx->record_size_limit ? 1063 tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE; 1064 1065 while (msg_data_left(msg)) { 1066 if (sk->sk_err) { 1067 ret = -sk->sk_err; 1068 goto send_end; 1069 } 1070 1071 if (ctx->open_rec) 1072 rec = ctx->open_rec; 1073 else 1074 rec = ctx->open_rec = tls_get_rec(sk); 1075 if (!rec) { 1076 ret = -ENOMEM; 1077 goto send_end; 1078 } 1079 1080 msg_pl = &rec->msg_plaintext; 1081 msg_en = &rec->msg_encrypted; 1082 1083 orig_size = msg_pl->sg.size; 1084 full_record = false; 1085 try_to_copy = msg_data_left(msg); 1086 record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; 1087 if (try_to_copy >= record_room) { 1088 try_to_copy = record_room; 1089 full_record = true; 1090 } 1091 1092 required_size = msg_pl->sg.size + try_to_copy + 1093 prot->overhead_size; 1094 1095 if (!sk_stream_memory_free(sk)) 1096 goto wait_for_sndbuf; 1097 1098 alloc_encrypted: 1099 ret = tls_alloc_encrypted_msg(sk, required_size); 1100 if (ret) { 1101 if (ret != -ENOSPC) 1102 goto wait_for_memory; 1103 1104 /* Adjust try_to_copy according to the amount that was 1105 * actually allocated. The difference is due 1106 * to max sg elements limit 1107 */ 1108 try_to_copy -= required_size - msg_en->sg.size; 1109 full_record = true; 1110 } 1111 1112 if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) { 1113 ret = tls_sw_sendmsg_splice(sk, msg, msg_pl, 1114 try_to_copy, &copied); 1115 if (ret < 0) 1116 goto send_end; 1117 tls_ctx->pending_open_record_frags = true; 1118 1119 if (sk_msg_full(msg_pl)) 1120 full_record = true; 1121 1122 if (full_record || eor) 1123 goto copied; 1124 continue; 1125 } 1126 1127 if (!is_kvec && (full_record || eor) && !async_capable) { 1128 u32 first = msg_pl->sg.end; 1129 1130 ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, 1131 msg_pl, try_to_copy); 1132 if (ret) 1133 goto fallback_to_reg_send; 1134 1135 num_zc++; 1136 copied += try_to_copy; 1137 1138 sk_msg_sg_copy_set(msg_pl, first); 1139 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, 1140 record_type, &copied, 1141 msg->msg_flags); 1142 if (ret) { 1143 if (ret == -EINPROGRESS) 1144 num_async++; 1145 else if (ret == -ENOMEM) 1146 goto wait_for_memory; 1147 else if (ctx->open_rec && ret == -ENOSPC) { 1148 if (msg_pl->cork_bytes) { 1149 ret = 0; 1150 goto send_end; 1151 } 1152 goto rollback_iter; 1153 } else if (ret != -EAGAIN) 1154 goto send_end; 1155 } 1156 continue; 1157 rollback_iter: 1158 copied -= try_to_copy; 1159 sk_msg_sg_copy_clear(msg_pl, first); 1160 iov_iter_revert(&msg->msg_iter, 1161 msg_pl->sg.size - orig_size); 1162 fallback_to_reg_send: 1163 sk_msg_trim(sk, msg_pl, orig_size); 1164 } 1165 1166 required_size = msg_pl->sg.size + try_to_copy; 1167 1168 ret = tls_clone_plaintext_msg(sk, required_size); 1169 if (ret) { 1170 if (ret != -ENOSPC) 1171 goto send_end; 1172 1173 /* Adjust try_to_copy according to the amount that was 1174 * actually allocated. The difference is due 1175 * to max sg elements limit 1176 */ 1177 try_to_copy -= required_size - msg_pl->sg.size; 1178 full_record = true; 1179 sk_msg_trim(sk, msg_en, 1180 msg_pl->sg.size + prot->overhead_size); 1181 } 1182 1183 if (try_to_copy) { 1184 ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, 1185 msg_pl, try_to_copy); 1186 if (ret < 0) 1187 goto trim_sgl; 1188 } 1189 1190 /* Open records defined only if successfully copied, otherwise 1191 * we would trim the sg but not reset the open record frags. 1192 */ 1193 tls_ctx->pending_open_record_frags = true; 1194 copied += try_to_copy; 1195 copied: 1196 if (full_record || eor) { 1197 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, 1198 record_type, &copied, 1199 msg->msg_flags); 1200 if (ret) { 1201 if (ret == -EINPROGRESS) 1202 num_async++; 1203 else if (ret == -ENOMEM) 1204 goto wait_for_memory; 1205 else if (ret != -EAGAIN) { 1206 if (ret == -ENOSPC) 1207 ret = 0; 1208 goto send_end; 1209 } 1210 } 1211 } 1212 1213 continue; 1214 1215 wait_for_sndbuf: 1216 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1217 wait_for_memory: 1218 ret = sk_stream_wait_memory(sk, &timeo); 1219 if (ret) { 1220 trim_sgl: 1221 if (ctx->open_rec) 1222 tls_trim_both_msgs(sk, orig_size); 1223 goto send_end; 1224 } 1225 1226 if (ctx->open_rec && msg_en->sg.size < required_size) 1227 goto alloc_encrypted; 1228 } 1229 1230 if (!num_async) { 1231 goto send_end; 1232 } else if (num_zc || eor) { 1233 int err; 1234 1235 /* Wait for pending encryptions to get completed */ 1236 err = tls_encrypt_async_wait(ctx); 1237 if (err) { 1238 ret = err; 1239 copied = 0; 1240 } 1241 } 1242 1243 /* Transmit if any encryptions have completed */ 1244 if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { 1245 cancel_delayed_work(&ctx->tx_work.work); 1246 tls_tx_records(sk, msg->msg_flags); 1247 } 1248 1249 send_end: 1250 ret = sk_stream_error(sk, msg->msg_flags, ret); 1251 return copied > 0 ? copied : ret; 1252 } 1253 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote: > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > During a handshake, an endpoint may specify a maximum record size limit. > Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the > maximum record size. Meaning that, the outgoing records from the kernel > can exceed a lower size negotiated during the handshake. In such a case, > the TLS endpoint must send a fatal "record_overflow" alert [1], and > thus the record is discarded. > > Upcoming Western Digital NVMe-TCP hardware controllers implement TLS > support. For these devices, supporting TLS record size negotiation is > necessary because the maximum TLS record size supported by the controller > is less than the default 16KB currently used by the kernel. > > This patch adds support for retrieving the negotiated record size limit > during a handshake, and enforcing it at the TLS layer such that outgoing > records are no larger than the size negotiated. This patch depends on > the respective userspace support in tlshd and GnuTLS [2]. > > [1] https://www.rfc-editor.org/rfc/rfc8449 > [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005 > > Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com> > --- > Documentation/networking/tls.rst | 7 ++++++ > include/net/tls.h | 1 + > include/uapi/linux/tls.h | 2 ++ > net/tls/tls_main.c | 39 ++++++++++++++++++++++++++++++-- > net/tls/tls_sw.c | 4 ++++ > 5 files changed, 51 insertions(+), 2 deletions(-) A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm not sure what we could do on the "RX" side to check that we are respecting the size restriction. Use a basic TCP socket and try to parse (and then discard without decrypting) records manually out of the stream and see if we got the length we wanted? > diff --git a/include/net/tls.h b/include/net/tls.h > index 857340338b69..c9a3759f27ca 100644 > --- a/include/net/tls.h > +++ b/include/net/tls.h > @@ -226,6 +226,7 @@ struct tls_context { > u8 rx_conf:3; > u8 zerocopy_sendfile:1; > u8 rx_no_pad:1; > + u16 record_size_limit; Maybe "tx_record_size_limit", since it's not intended for RX? I don't know if the kernel will ever have a need to enforce the RX record size, but it would maybe avoid future head-scratching "why is this not used on the RX path?" > diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c > index a3ccb3135e51..1098c01f2749 100644 > --- a/net/tls/tls_main.c > +++ b/net/tls/tls_main.c > @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, > return rc; > } > > +static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval, > + unsigned int optlen) > +{ > + struct tls_context *ctx = tls_get_ctx(sk); > + u16 value; > + > + if (sockptr_is_null(optval) || optlen != sizeof(value)) > + return -EINVAL; > + > + if (copy_from_sockptr(&value, optval, sizeof(value))) > + return -EFAULT; > + > + if (ctx->prot_info.version == TLS_1_2_VERSION && > + value > TLS_MAX_PAYLOAD_SIZE) > + return -EINVAL; > + > + if (ctx->prot_info.version == TLS_1_3_VERSION && > + value > TLS_MAX_PAYLOAD_SIZE + 1) > + return -EINVAL; > + > + ctx->record_size_limit = value; > + > + return 0; > +} > + > static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, > unsigned int optlen) > { > @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, > case TLS_RX_EXPECT_NO_PAD: > rc = do_tls_setsockopt_no_pad(sk, optval, optlen); > break; > + case TLS_TX_RECORD_SIZE_LIM: > + rc = do_tls_setsockopt_record_size(sk, optval, optlen); > + break; Adding the corresponding changes to do_tls_getsockopt would also be good. > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c > index bac65d0d4e3e..9f9359f591d3 100644 > --- a/net/tls/tls_sw.c > +++ b/net/tls/tls_sw.c > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, > unsigned char record_type = TLS_RECORD_TYPE_DATA; > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); > bool eor = !(msg->msg_flags & MSG_MORE); > + u16 record_size_limit; > size_t try_to_copy; > ssize_t copied = 0; > struct sk_msg *msg_pl, *msg_en; > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, > } > } > > + record_size_limit = tls_ctx->record_size_limit ? > + tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE; As Simon said (good catch Simon :)), this isn't used anywhere. Are you sure this patch works? The previous version had a hunk in tls_sw_sendmsg_locked that looks like what I would expect. And the the offloaded TX path (in net/tls/tls_device.c) would also need similar changes. I'm wondering if it's better to add this conditional, or just initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with tls_ctx->record_size_limit in a few places? -- Sabrina
On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote: > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote: > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > > > During a handshake, an endpoint may specify a maximum record size > > limit. > > Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for > > the > > maximum record size. Meaning that, the outgoing records from the > > kernel > > can exceed a lower size negotiated during the handshake. In such a > > case, > > the TLS endpoint must send a fatal "record_overflow" alert [1], and > > thus the record is discarded. > > > > Upcoming Western Digital NVMe-TCP hardware controllers implement > > TLS > > support. For these devices, supporting TLS record size negotiation > > is > > necessary because the maximum TLS record size supported by the > > controller > > is less than the default 16KB currently used by the kernel. > > > > This patch adds support for retrieving the negotiated record size > > limit > > during a handshake, and enforcing it at the TLS layer such that > > outgoing > > records are no larger than the size negotiated. This patch depends > > on > > the respective userspace support in tlshd and GnuTLS [2]. > > > > [1] https://www.rfc-editor.org/rfc/rfc8449 > > [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005 > > > > Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > --- > > Documentation/networking/tls.rst | 7 ++++++ > > include/net/tls.h | 1 + > > include/uapi/linux/tls.h | 2 ++ > > net/tls/tls_main.c | 39 > > ++++++++++++++++++++++++++++++-- > > net/tls/tls_sw.c | 4 ++++ > > 5 files changed, 51 insertions(+), 2 deletions(-) > Hey Sabrina, > A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm > not sure what we could do on the "RX" side to check that we are > respecting the size restriction. Use a basic TCP socket and try to > parse (and then discard without decrypting) records manually out of > the stream and see if we got the length we wanted? > So far I have just been using an NVMe TCP Target with TLS enabled and checking that the targets RX record sizes are <= negotiated size in tls_rx_one_record(). I didn't check for this patch and the bug below got through...my bad! Is it possible to get the exact record length into the testing layer? Wouldn't the socket just return N bytes received which doesn't necessarily correlate to a record size? > > > diff --git a/include/net/tls.h b/include/net/tls.h > > index 857340338b69..c9a3759f27ca 100644 > > --- a/include/net/tls.h > > +++ b/include/net/tls.h > > @@ -226,6 +226,7 @@ struct tls_context { > > u8 rx_conf:3; > > u8 zerocopy_sendfile:1; > > u8 rx_no_pad:1; > > + u16 record_size_limit; > > Maybe "tx_record_size_limit", since it's not intended for RX? > > I don't know if the kernel will ever have a need to enforce the RX > record size, but it would maybe avoid future head-scratching "why is > this not used on the RX path?" Ah good point, I think this makes sense. > > > > > diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c > > index a3ccb3135e51..1098c01f2749 100644 > > --- a/net/tls/tls_main.c > > +++ b/net/tls/tls_main.c > > @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct > > sock *sk, sockptr_t optval, > > return rc; > > } > > > > +static int do_tls_setsockopt_record_size(struct sock *sk, > > sockptr_t optval, > > + unsigned int optlen) > > +{ > > + struct tls_context *ctx = tls_get_ctx(sk); > > + u16 value; > > + > > + if (sockptr_is_null(optval) || optlen != sizeof(value)) > > + return -EINVAL; > > + > > + if (copy_from_sockptr(&value, optval, sizeof(value))) > > + return -EFAULT; > > + > > + if (ctx->prot_info.version == TLS_1_2_VERSION && > > + value > TLS_MAX_PAYLOAD_SIZE) > > + return -EINVAL; > > + > > + if (ctx->prot_info.version == TLS_1_3_VERSION && > > + value > TLS_MAX_PAYLOAD_SIZE + 1) > > + return -EINVAL; > > + > > + ctx->record_size_limit = value; > > + > > + return 0; > > +} > > + > > static int do_tls_setsockopt(struct sock *sk, int optname, > > sockptr_t optval, > > unsigned int optlen) > > { > > @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, > > int optname, sockptr_t optval, > > case TLS_RX_EXPECT_NO_PAD: > > rc = do_tls_setsockopt_no_pad(sk, optval, optlen); > > break; > > + case TLS_TX_RECORD_SIZE_LIM: > > + rc = do_tls_setsockopt_record_size(sk, optval, > > optlen); > > + break; > > Adding the corresponding changes to do_tls_getsockopt would also be > good. > okay, I will add that. > > > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c > > index bac65d0d4e3e..9f9359f591d3 100644 > > --- a/net/tls/tls_sw.c > > +++ b/net/tls/tls_sw.c > > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock > > *sk, struct msghdr *msg, > > unsigned char record_type = TLS_RECORD_TYPE_DATA; > > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); > > bool eor = !(msg->msg_flags & MSG_MORE); > > + u16 record_size_limit; > > size_t try_to_copy; > > ssize_t copied = 0; > > struct sk_msg *msg_pl, *msg_en; > > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock > > *sk, struct msghdr *msg, > > } > > } > > > > + record_size_limit = tls_ctx->record_size_limit ? > > + tls_ctx->record_size_limit : > > TLS_MAX_PAYLOAD_SIZE; > > As Simon said (good catch Simon :)), this isn't used anywhere. Are > you > sure this patch works? The previous version had a hunk in > tls_sw_sendmsg_locked that looks like what I would expect. > This is a bug! I missed adding that hunk. > And the the offloaded TX path (in net/tls/tls_device.c) would also > need similar changes. > Okay, will add in V3. > > I'm wondering if it's better to add this conditional, or just > initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the > tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with > tls_ctx->record_size_limit in a few places? Yeah I think sounds better, will add for V3. Thanks for the feedback! Regards, Wilfred
2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote: > On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote: > > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote: > > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > Hey Sabrina, > > A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm > > not sure what we could do on the "RX" side to check that we are > > respecting the size restriction. Use a basic TCP socket and try to > > parse (and then discard without decrypting) records manually out of > > the stream and see if we got the length we wanted? > > > So far I have just been using an NVMe TCP Target with TLS enabled and > checking that the targets RX record sizes are <= negotiated size in > tls_rx_one_record(). I didn't check for this patch and the bug below > got through...my bad! > > Is it possible to get the exact record length into the testing layer? Not really, unless we come up with some mechanism using probes. I wouldn't go that route unless we don't have any other choice. > Wouldn't the socket just return N bytes received which doesn't > necessarily correlate to a record size? Yes. That's why I suggested only using ktls on one side of the test, and parsing the records out of the raw stream of bytes on the RX side. Actually, control records don't get aggregated on read, so sending a large non-data buffer should result in separate limit-sized reads. But this makes me wonder if this limit is supposed to apply to control records, and how the userspace library/application is supposed to deal with the possible splitting of those records? Here's a rough example of what I had in mind. The hardcoded cipher overhead is a bit ugly but I don't see a way around it. Sanity check at the end is probably not needed. I didn't write the loop because I haven't had enough coffee yet to get that right :) TEST(tx_record_size) { struct tls_crypto_info_keys tls12; int cfd, ret, fd, len, overhead; char buf[1000], buf2[2000]; __u16 limit = 100; bool notls; tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, &tls12, 0); ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); if (notls) exit(KSFT_SKIP); /* Don't install keys on fd, we'll parse raw records */ ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); ASSERT_EQ(ret, 0); ret = setsockopt(cfd, SOL_TLS, TLS_TX_RECORD_SIZE_LIM, &limit, sizeof(limit)); ASSERT_EQ(ret, 0); EXPECT_EQ(send(cfd, buf, sizeof(buf), 0), sizeof(buf)); close(cfd); ret = recv(fd, buf2, sizeof(buf2), 0); memcpy(&len, buf2 + 3, 2); len = htons(len); /* 16B tag + 8B IV -- record header (5B) is not counted but we'll need it to walk the record stream */ overhead = 16 + 8; // TODO should be <= limit since we may not have filled every // record (especially the last one), and loop over all the // records we got // next record starts at buf2 + (limit + overhead + 5) ASSERT_EQ(len, limit + overhead); /* sanity check that it's a TLS header for application data */ ASSERT_EQ(buf2[0], 23); ASSERT_EQ(buf2[1], 0x3); ASSERT_EQ(buf2[2], 0x3); close(fd); } -- Sabrina
On Wed, 2025-09-03 at 10:21 +0200, Sabrina Dubroca wrote: > 2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote: > > On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote: > > > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote: > > > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > Hey Sabrina, > > > A selftest would be nice (tools/testing/selftests/net/tls.c), but > > > I'm > > > not sure what we could do on the "RX" side to check that we are > > > respecting the size restriction. Use a basic TCP socket and try > > > to > > > parse (and then discard without decrypting) records manually out > > > of > > > the stream and see if we got the length we wanted? > > > > > So far I have just been using an NVMe TCP Target with TLS enabled > > and > > checking that the targets RX record sizes are <= negotiated size in > > tls_rx_one_record(). I didn't check for this patch and the bug > > below > > got through...my bad! > > > > Is it possible to get the exact record length into the testing > > layer? > > Not really, unless we come up with some mechanism using probes. I > wouldn't go that route unless we don't have any other choice. > > > Wouldn't the socket just return N bytes received which doesn't > > necessarily correlate to a record size? > > Yes. That's why I suggested only using ktls on one side of the test, > and parsing the records out of the raw stream of bytes on the RX > side. > Ah okay I see. > Actually, control records don't get aggregated on read, so sending a > large non-data buffer should result in separate limit-sized reads. > But > this makes me wonder if this limit is supposed to apply to control > records, and how the userspace library/application is supposed to > deal > with the possible splitting of those records? > Good point, from the spec, "When the "record_size_limit" extension is negotiated, an endpoint MUST NOT generate a protected record with plaintext that is larger than the RecordSizeLimit value it receives from its peer. Unprotected messages are not subject to this limit." [1] From what I understand, as long as it in encrypted. It must respect the record size limit? In regards to user-space, do you mean for TX or RX? For TX, there shouldn't need to be any changes as record splitting occurs in the kernel. For RX, I am not too sure, but this patch shouldn't change anything for that case? [1] https://datatracker.ietf.org/doc/html/rfc8449#section-4 > > Here's a rough example of what I had in mind. The hardcoded cipher > overhead is a bit ugly but I don't see a way around it. Sanity check > at the end is probably not needed. I didn't write the loop because I > haven't had enough coffee yet to get that right :) > Ha! Great! Thanks for the example. I am not too familiar with the self tests in the kernel. But will try to it for the next round. Thanks, Wilfred > > TEST(tx_record_size) > { > struct tls_crypto_info_keys tls12; > int cfd, ret, fd, len, overhead; > char buf[1000], buf2[2000]; > __u16 limit = 100; > bool notls; > > tls_crypto_info_init(TLS_1_2_VERSION, > TLS_CIPHER_AES_CCM_128, > &tls12, 0); > > ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); > > if (notls) > exit(KSFT_SKIP); > > /* Don't install keys on fd, we'll parse raw records */ > ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); > ASSERT_EQ(ret, 0); > > ret = setsockopt(cfd, SOL_TLS, TLS_TX_RECORD_SIZE_LIM, > &limit, sizeof(limit)); > ASSERT_EQ(ret, 0); > > EXPECT_EQ(send(cfd, buf, sizeof(buf), 0), sizeof(buf)); > close(cfd); > > ret = recv(fd, buf2, sizeof(buf2), 0); > memcpy(&len, buf2 + 3, 2); > len = htons(len); > > /* 16B tag + 8B IV -- record header (5B) is not counted but > we'll need it to walk the record stream */ > overhead = 16 + 8; > > // TODO should be <= limit since we may not have filled > every > // record (especially the last one), and loop over all the > // records we got > // next record starts at buf2 + (limit + overhead + 5) > ASSERT_EQ(len, limit + overhead); > /* sanity check that it's a TLS header for application data > */ > ASSERT_EQ(buf2[0], 23); > ASSERT_EQ(buf2[1], 0x3); > ASSERT_EQ(buf2[2], 0x3); > > close(fd); > } >
2025-09-04, 23:31:23 +0000, Wilfred Mallawa wrote: > On Wed, 2025-09-03 at 10:21 +0200, Sabrina Dubroca wrote: > > 2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote: > > > On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote: > > > > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote: > > > > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > > Hey Sabrina, > > > > A selftest would be nice (tools/testing/selftests/net/tls.c), but > > > > I'm > > > > not sure what we could do on the "RX" side to check that we are > > > > respecting the size restriction. Use a basic TCP socket and try > > > > to > > > > parse (and then discard without decrypting) records manually out > > > > of > > > > the stream and see if we got the length we wanted? > > > > > > > So far I have just been using an NVMe TCP Target with TLS enabled > > > and > > > checking that the targets RX record sizes are <= negotiated size in > > > tls_rx_one_record(). I didn't check for this patch and the bug > > > below > > > got through...my bad! > > > > > > Is it possible to get the exact record length into the testing > > > layer? > > > > Not really, unless we come up with some mechanism using probes. I > > wouldn't go that route unless we don't have any other choice. > > > > > Wouldn't the socket just return N bytes received which doesn't > > > necessarily correlate to a record size? > > > > Yes. That's why I suggested only using ktls on one side of the test, > > and parsing the records out of the raw stream of bytes on the RX > > side. > > > Ah okay I see. > > Actually, control records don't get aggregated on read, so sending a > > large non-data buffer should result in separate limit-sized reads. > > But > > this makes me wonder if this limit is supposed to apply to control > > records, and how the userspace library/application is supposed to > > deal > > with the possible splitting of those records? > > > Good point, from the spec, "When the "record_size_limit" extension is > negotiated, an endpoint MUST NOT generate a protected record with > plaintext that is larger than the RecordSizeLimit value it receives > from its peer. Unprotected messages are not subject to this limit." [1] > > From what I understand, as long as it in encrypted. It must respect the > record size limit? Yes, and the kernel will make sure to split all the data it sends over records of the maximum acceptable length (currently TLS_MAX_PAYLOAD_SIZE, with your patch tx_record_size_limit). The question was more about what happens if userspace does a send(!DATA, length > tx_record_size_limit). The kernel will happily split that over N consecutive records of tx_record_size_limit (or fewer) bytes, and the peer will receive N separate messages. But this could already happen with a non-DATA record larger than TLS_MAX_PAYLOAD_SIZE, so it's not really something we need to worry about here. It's a concern for the userspace library (reconstructing the original message from consecutive records read separately from the ktls socket). So, my comment here was pretty much noise, sorry. > In regards to user-space, do you mean for TX or RX? For TX, there > shouldn't need to be any changes as record splitting occurs in the > kernel. For RX, I am not too sure, but this patch shouldn't change > anything for that case? Yes, I'm talking about TX here. -- Sabrina
On Tue, Sep 02, 2025 at 01:38:10PM +1000, Wilfred Mallawa wrote: > From: Wilfred Mallawa <wilfred.mallawa@wdc.com> > > During a handshake, an endpoint may specify a maximum record size limit. > Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the > maximum record size. Meaning that, the outgoing records from the kernel > can exceed a lower size negotiated during the handshake. In such a case, > the TLS endpoint must send a fatal "record_overflow" alert [1], and > thus the record is discarded. > > Upcoming Western Digital NVMe-TCP hardware controllers implement TLS > support. For these devices, supporting TLS record size negotiation is > necessary because the maximum TLS record size supported by the controller > is less than the default 16KB currently used by the kernel. > > This patch adds support for retrieving the negotiated record size limit > during a handshake, and enforcing it at the TLS layer such that outgoing > records are no larger than the size negotiated. This patch depends on > the respective userspace support in tlshd and GnuTLS [2]. > > [1] https://www.rfc-editor.org/rfc/rfc8449 > [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005 > > Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com> Hi Wilfred, I'll leave review of this approach to others. But in the meantime I wanted to pass on a minor problem I noticed in the code > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c > index bac65d0d4e3e..9f9359f591d3 100644 > --- a/net/tls/tls_sw.c > +++ b/net/tls/tls_sw.c > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, > unsigned char record_type = TLS_RECORD_TYPE_DATA; > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); > bool eor = !(msg->msg_flags & MSG_MORE); > + u16 record_size_limit; > size_t try_to_copy; > ssize_t copied = 0; > struct sk_msg *msg_pl, *msg_en; > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, > } > } > > + record_size_limit = tls_ctx->record_size_limit ? > + tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE; > + > while (msg_data_left(msg)) { > if (sk->sk_err) { > ret = -sk->sk_err; record_size_limit is set but otherwise unused. Did you forget to add something here? If not, please remove record_size_limit from this function. -- pw-bot: changes-requested
On Tue, 2025-09-02 at 12:40 +0100, Simon Horman wrote: > [snip] > Hi Wilfred, > > I'll leave review of this approach to others. > But in the meantime I wanted to pass on a minor problem I noticed in > the code > > > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c > > index bac65d0d4e3e..9f9359f591d3 100644 > > --- a/net/tls/tls_sw.c > > +++ b/net/tls/tls_sw.c > > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock > > *sk, struct msghdr *msg, > > unsigned char record_type = TLS_RECORD_TYPE_DATA; > > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); > > bool eor = !(msg->msg_flags & MSG_MORE); > > + u16 record_size_limit; > > size_t try_to_copy; > > ssize_t copied = 0; > > struct sk_msg *msg_pl, *msg_en; > > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock > > *sk, struct msghdr *msg, > > } > > } > > > > + record_size_limit = tls_ctx->record_size_limit ? > > + tls_ctx->record_size_limit : > > TLS_MAX_PAYLOAD_SIZE; > > + > > while (msg_data_left(msg)) { > > if (sk->sk_err) { > > ret = -sk->sk_err; > > record_size_limit is set but otherwise unused. > Did you forget to add something here? > > If not, please remove record_size_limit from this function. Hey Simon, Yes, this is an error. I will fixup. Thank you. Regards, Wilfred
© 2016 - 2025 Red Hat, Inc.