[PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Paolo Abeni posted 1 patch 1 month, 1 week ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/multipath-tcp/mptcp_net-next tags/patchew/dfbdb183399851cfa695fa4c965a8d146d436e52.1639567831.git.pabeni@redhat.com
Maintainers: Mat Martineau <mathew.j.martineau@linux.intel.com>, Shuah Khan <shuah@kernel.org>, Jakub Kicinski <kuba@kernel.org>, Matthieu Baerts <matthieu.baerts@tessares.net>, "David S. Miller" <davem@davemloft.net>
.../testing/selftests/net/mptcp/mptcp_join.sh | 127 ++++++++++--------
1 file changed, 69 insertions(+), 58 deletions(-)

[PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Posted by Paolo Abeni 1 month, 1 week ago
MPTCP join self-tests are a bit fragile as they reply on
delays instead of events to catch-up with the expected
sockets states.

Replace the delay with state checking where possible and
reduce the number of sleeps in the most complex scenarios.

This will both reduce the tests run-time and will improve
stability.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: I still see frequent failures in "remove subflow and signal IPv6",
but that looks unrelated from this patch: sk lookup fails on the
client side for the MPJ syn-ack. Funnily enough, I can't reproduce
the failure with pcap capture on...

v2 -> v3:
 - fix wait_for_address
 - consolidate stats dumping, add TCP stats

v1 -> v2:
 - fix ipv6 addr handling
---
 .../testing/selftests/net/mptcp/mptcp_join.sh | 127 ++++++++++--------
 1 file changed, 69 insertions(+), 58 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 2684ef9c0d42..c85ef7987785 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -238,6 +238,45 @@ is_v6()
 	[ -z "${1##*:*}" ]
 }
 
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+	local listener_ns="${1}"
+	local port="${2}"
+
+	local port_hex i
+
+	port_hex="$(printf "%04X" "${port}")"
+	for i in $(seq 10); do
+		ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+			awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+			break
+		sleep 0.1
+	done
+}
+
+rm_addr_count()
+{
+	ns=${1}
+
+	ip netns exec ${ns} nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'
+}
+
+# $1: ns, $2: old rm_addr counter in $ns
+wait_rm_addr()
+{
+	local ns="${1}"
+	local old_cnt="${2}"
+	local cnt
+	local i
+
+	for i in $(seq 10); do
+		cnt=$(rm_addr_count ${ns})
+		[ "$cnt" = "${old_cnt}" ] || break
+		sleep 0.1
+	done
+}
+
 do_transfer()
 {
 	listener_ns="$1"
@@ -307,7 +346,7 @@ do_transfer()
 	fi
 	spid=$!
 
-	sleep 1
+	wait_local_port_listen "${listener_ns}" "${port}"
 
 	if [ "$test_link_fail" -eq 0 ];then
 		timeout ${timeout_test} \
@@ -324,10 +363,13 @@ do_transfer()
 	fi
 	cpid=$!
 
+	# let the mptcp subflow be established in background before
+	# do endpoint manipulation
+	[ $addr_nr_ns1 = "0" -a $addr_nr_ns2 = "0" ] || sleep 1
+
 	if [ $addr_nr_ns1 -gt 0 ]; then
 		let add_nr_ns1=addr_nr_ns1
 		counter=2
-		sleep 1
 		while [ $add_nr_ns1 -gt 0 ]; do
 			local addr
 			if is_v6 "${connect_addr}"; then
@@ -339,7 +381,6 @@ do_transfer()
 			let counter+=1
 			let add_nr_ns1-=1
 		done
-		sleep 1
 	elif [ $addr_nr_ns1 -lt 0 ]; then
 		let rm_nr_ns1=-addr_nr_ns1
 		if [ $rm_nr_ns1 -lt 8 ]; then
@@ -347,22 +388,19 @@ do_transfer()
 			pos=1
 			dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
 			if [ ${#dump[@]} -gt 0 ]; then
-				sleep 1
-
 				while [ $counter -le $rm_nr_ns1 ]
 				do
 					id=${dump[$pos]}
+					rm_addr=$(rm_addr_count ${connector_ns})
 					ip netns exec ${listener_ns} ./pm_nl_ctl del $id
-					sleep 1
+					wait_rm_addr ${connector_ns} ${rm_addr}
 					let counter+=1
 					let pos+=5
 				done
 			fi
 		elif [ $rm_nr_ns1 -eq 8 ]; then
-			sleep 1
 			ip netns exec ${listener_ns} ./pm_nl_ctl flush
 		elif [ $rm_nr_ns1 -eq 9 ]; then
-			sleep 1
 			ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr}
 		fi
 	fi
@@ -373,10 +411,13 @@ do_transfer()
 		addr_nr_ns2=${addr_nr_ns2:9}
 	fi
 
+	# if newly added endpoints must be deleted, give the background msk
+	# some time to created them
+	[ $addr_nr_ns1 -gt 0 -a $addr_nr_ns2 -lt 0 ] && sleep 1
+
 	if [ $addr_nr_ns2 -gt 0 ]; then
 		let add_nr_ns2=addr_nr_ns2
 		counter=3
-		sleep 1
 		while [ $add_nr_ns2 -gt 0 ]; do
 			local addr
 			if is_v6 "${connect_addr}"; then
@@ -388,7 +429,6 @@ do_transfer()
 			let counter+=1
 			let add_nr_ns2-=1
 		done
-		sleep 1
 	elif [ $addr_nr_ns2 -lt 0 ]; then
 		let rm_nr_ns2=-addr_nr_ns2
 		if [ $rm_nr_ns2 -lt 8 ]; then
@@ -396,19 +436,18 @@ do_transfer()
 			pos=1
 			dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
 			if [ ${#dump[@]} -gt 0 ]; then
-				sleep 1
-
 				while [ $counter -le $rm_nr_ns2 ]
 				do
+					# rm_addr are serialized, allow the previous one to complete
 					id=${dump[$pos]}
+					rm_addr=$(rm_addr_count ${listener_ns})
 					ip netns exec ${connector_ns} ./pm_nl_ctl del $id
-					sleep 1
+					wait_rm_addr ${listener_ns} ${rm_addr}
 					let counter+=1
 					let pos+=5
 				done
 			fi
 		elif [ $rm_nr_ns2 -eq 8 ]; then
-			sleep 1
 			ip netns exec ${connector_ns} ./pm_nl_ctl flush
 		elif [ $rm_nr_ns2 -eq 9 ]; then
 			local addr
@@ -417,7 +456,6 @@ do_transfer()
 			else
 				addr="10.0.1.2"
 			fi
-			sleep 1
 			ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr
 		fi
 	fi
@@ -539,6 +577,14 @@ run_tests()
 	lret=$?
 }
 
+dump_stats()
+{
+	echo Server ns stats
+	ip netns exec $ns1 nstat -as | grep Tcp
+	echo Client ns stats
+	ip netns exec $ns2 nstat -as | grep Tcp
+}
+
 chk_csum_nr()
 {
 	local msg=${1:-""}
@@ -570,12 +616,7 @@ chk_csum_nr()
 	else
 		echo "[ ok ]"
 	fi
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_fail_nr()
@@ -607,12 +648,7 @@ chk_fail_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_infi_nr()
@@ -644,12 +680,7 @@ chk_infi_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_join_nr()
@@ -693,12 +724,7 @@ chk_join_nr()
 	else
 		echo "[ ok ]"
 	fi
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 	if [ $checksum -eq 1 ]; then
 		chk_csum_nr
 		chk_fail_nr 0 0
@@ -861,12 +887,7 @@ chk_add_nr()
 		echo ""
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_rm_nr()
@@ -909,12 +930,7 @@ chk_rm_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_prio_nr()
@@ -946,12 +962,7 @@ chk_prio_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_link_usage()
@@ -1615,7 +1626,7 @@ add_addr_ports_tests()
 	ip netns exec $ns2 ./pm_nl_ctl limits 1 3
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
-	run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
+	run_tests $ns1 $ns2 10.0.1.1 0 -8 -2 slow
 	chk_join_nr "flush subflows and signal with port" 3 3 3
 	chk_add_nr 1 1
 	chk_rm_nr 2 2
-- 
2.33.1


Re: [PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Posted by Matthieu Baerts 1 month, 1 week ago
Hi Paolo, Mat,

On 15/12/2021 12:36, Paolo Abeni wrote:
> MPTCP join self-tests are a bit fragile as they reply on
> delays instead of events to catch-up with the expected
> sockets states.
> 
> Replace the delay with state checking where possible and
> reduce the number of sleeps in the most complex scenarios.
> 
> This will both reduce the tests run-time and will improve
> stability.

Thank you for the patch and the review!

Now in our tree with Mat's RvB tag:

- e5056a064c6f: selftests: mptcp: more stable join tests-cases
- Results: f83839879847..aa92c1bdf999

Builds and tests are now in progress:

https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20211216T170147
https://github.com/multipath-tcp/mptcp_net-next/actions/workflows/build-validation.yml?query=branch:export

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net

Re: [PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Posted by Matthieu Baerts 1 month, 1 week ago
Hi Paolo,

On 15/12/2021 12:36, Paolo Abeni wrote:
> MPTCP join self-tests are a bit fragile as they reply on
> delays instead of events to catch-up with the expected
> sockets states.
> 
> Replace the delay with state checking where possible and
> reduce the number of sleeps in the most complex scenarios.
> 
> This will both reduce the tests run-time and will improve
> stability.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: I still see frequent failures in "remove subflow and signal IPv6",
> but that looks unrelated from this patch: sk lookup fails on the
> client side for the MPJ syn-ack. Funnily enough, I can't reproduce
> the failure with pcap capture on...

Thank you for the new version!

I'm trying it in a loop. So far, so good!


It looks like the CI didn't spot any issues in debug mode but different
ones in non-debug mode:

- KVM Validation: normal:
  - Unstable: 1 failed test(s): selftest_mptcp_join 🔴:
  - Task: https://cirrus-ci.com/task/6235909728239616
  - Summary:
https://api.cirrus-ci.com/v1/artifact/task/6235909728239616/summary/summary.txt

- KVM Validation: debug:
  - Unstable: 1 failed test(s): selftest_diag 🔴:
  - Task: https://cirrus-ci.com/task/4828534844686336
  - Summary:
https://api.cirrus-ci.com/v1/artifact/task/4828534844686336/summary/summary.txt


I guess it is not related to that.


> +rm_addr_count()
> +{
> +	ns=${1}

Small detail: it is not "local". I can fix that when applying the patch
if it helps.

> +# $1: ns, $2: old rm_addr counter in $ns
> +wait_rm_addr()
> +{
> +	local ns="${1}"
> +	local old_cnt="${2}"
> +	local cnt
> +	local i
> +
> +	for i in $(seq 10); do
> +		cnt=$(rm_addr_count ${ns})
> +		[ "$cnt" = "${old_cnt}" ] || break

I was confused by this: why do you stop if it is the same??? But then I
realised it was "||" and not "&&" :)

    [ "$cnt" != "${old_cnt}" ] && break

> +dump_stats()
> +{
> +	echo Server ns stats
> +	ip netns exec $ns1 nstat -as | grep Tcp
> +	echo Client ns stats
> +	ip netns exec $ns2 nstat -as | grep Tcp
> +}

Good idea!

-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net

Re: [PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Posted by Mat Martineau 1 month, 1 week ago
On Wed, 15 Dec 2021, Matthieu Baerts wrote:

> Hi Paolo,
>
> On 15/12/2021 12:36, Paolo Abeni wrote:
>> MPTCP join self-tests are a bit fragile as they reply on
>> delays instead of events to catch-up with the expected
>> sockets states.
>>
>> Replace the delay with state checking where possible and
>> reduce the number of sleeps in the most complex scenarios.
>>
>> This will both reduce the tests run-time and will improve
>> stability.
>>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> ---
>> Note: I still see frequent failures in "remove subflow and signal IPv6",
>> but that looks unrelated from this patch: sk lookup fails on the
>> client side for the MPJ syn-ack. Funnily enough, I can't reproduce
>> the failure with pcap capture on...
>
> Thank you for the new version!
>
> I'm trying it in a loop. So far, so good!
>

Looks ok here too - just seeing the same ipv6 failure as Paolo, and also 
that it always succeeds when trying to capture.

- Mat

>
> It looks like the CI didn't spot any issues in debug mode but different
> ones in non-debug mode:
>
> - KVM Validation: normal:
>  - Unstable: 1 failed test(s): selftest_mptcp_join 🔴:
>  - Task: https://cirrus-ci.com/task/6235909728239616
>  - Summary:
> https://api.cirrus-ci.com/v1/artifact/task/6235909728239616/summary/summary.txt
>
> - KVM Validation: debug:
>  - Unstable: 1 failed test(s): selftest_diag 🔴:
>  - Task: https://cirrus-ci.com/task/4828534844686336
>  - Summary:
> https://api.cirrus-ci.com/v1/artifact/task/4828534844686336/summary/summary.txt
>
>
> I guess it is not related to that.
>
>
>> +rm_addr_count()
>> +{
>> +	ns=${1}
>
> Small detail: it is not "local". I can fix that when applying the patch
> if it helps.
>
>> +# $1: ns, $2: old rm_addr counter in $ns
>> +wait_rm_addr()
>> +{
>> +	local ns="${1}"
>> +	local old_cnt="${2}"
>> +	local cnt
>> +	local i
>> +
>> +	for i in $(seq 10); do
>> +		cnt=$(rm_addr_count ${ns})
>> +		[ "$cnt" = "${old_cnt}" ] || break
>
> I was confused by this: why do you stop if it is the same??? But then I
> realised it was "||" and not "&&" :)
>
>    [ "$cnt" != "${old_cnt}" ] && break
>
>> +dump_stats()
>> +{
>> +	echo Server ns stats
>> +	ip netns exec $ns1 nstat -as | grep Tcp
>> +	echo Client ns stats
>> +	ip netns exec $ns2 nstat -as | grep Tcp
>> +}
>
> Good idea!
>
> -- 
> Tessares | Belgium | Hybrid Access Solutions
> www.tessares.net
>
>

--
Mat Martineau
Intel

Re: [PATCH v3 mptcp-next] selftests: mptcp: more stable join tests-cases.

Posted by Mat Martineau 1 month, 1 week ago
On Wed, 15 Dec 2021, Mat Martineau wrote:

> On Wed, 15 Dec 2021, Matthieu Baerts wrote:
>
>> Hi Paolo,
>> 
>> On 15/12/2021 12:36, Paolo Abeni wrote:
>>> MPTCP join self-tests are a bit fragile as they reply on
>>> delays instead of events to catch-up with the expected
>>> sockets states.
>>> 
>>> Replace the delay with state checking where possible and
>>> reduce the number of sleeps in the most complex scenarios.
>>> 
>>> This will both reduce the tests run-time and will improve
>>> stability.
>>> 
>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>> ---
>>> Note: I still see frequent failures in "remove subflow and signal IPv6",
>>> but that looks unrelated from this patch: sk lookup fails on the
>>> client side for the MPJ syn-ack. Funnily enough, I can't reproduce
>>> the failure with pcap capture on...
>> 
>> Thank you for the new version!
>> 
>> I'm trying it in a loop. So far, so good!
>> 
>
> Looks ok here too - just seeing the same ipv6 failure as Paolo, and also that 
> it always succeeds when trying to capture.
>

As discussed in the meeting today:

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>

--
Mat Martineau
Intel