net/mptcp/protocol.c | 72 +++++++++++++++++++++++++++++--------------- net/mptcp/protocol.h | 1 + 2 files changed, 48 insertions(+), 25 deletions(-)
The MPTCP packet scheduler has sub-optimal behavior with asymmetric
subflows: if the faster subflow-level cwin is closed, the packet
scheduler can enqueue "too much" data on a slower subflow.
When all the data on the faster subflow is acked, if the mptcp-level
cwin is closed, link utilization becomes suboptimal.
The solution is implementing blest-like[1] HoL-blocking estimation,
transmitting only on the subflow with the shorter estimated time to
flush the queued memory. If such subflow's cwin is closed, we wait
even if other subflows are available.
This is quite simpler than the original blest implementation, as we
leverage the pacing rate provided by the TCP socket. To get a more
accurate estimation for the subflow linger-time, we maintain a
per-subflow weighted average of such info.
Additionally drop magic numbers usage in favor of newly defined
macros and use more meaningful names for status variable.
[1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- fix checkpatch issue (mat)
- rename ratio as linger_time (mat)
---
net/mptcp/protocol.c | 72 +++++++++++++++++++++++++++++---------------
net/mptcp/protocol.h | 1 +
2 files changed, 48 insertions(+), 25 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7803b0dbb1be..582f5f88d6ef 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1370,7 +1370,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct subflow_send_info {
struct sock *ssk;
- u64 ratio;
+ u64 linger_time;
};
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1395,20 +1395,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return __mptcp_subflow_active(subflow);
}
+#define SSK_MODE_ACTIVE 0
+#define SSK_MODE_BACKUP 1
+#define SSK_MODE_MAX 2
+
/* implement the mptcp packet scheduler;
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
- struct subflow_send_info send_info[2];
+ struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
+ u32 pace, burst, wmem;
int i, nr_active = 0;
struct sock *ssk;
+ u64 linger_time;
long tout = 0;
- u64 ratio;
- u32 pace;
sock_owned_by_me(sk);
@@ -1427,10 +1431,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
/* pick the subflow with the lower wmem/wspace ratio */
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
- send_info[i].ratio = -1;
+ send_info[i].linger_time = -1;
}
+
mptcp_for_each_subflow(msk, subflow) {
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
@@ -1439,34 +1444,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
- if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
- continue;
-
- pace = READ_ONCE(ssk->sk_pacing_rate);
- if (!pace)
- continue;
+ pace = subflow->avg_pacing_rate;
+ if (unlikely(!pace)) {
+ /* init pacing rate from socket */
+ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+ pace = subflow->avg_pacing_rate;
+ if (!pace)
+ continue;
+ }
- ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
- pace);
- if (ratio < send_info[subflow->backup].ratio) {
+ linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+ if (linger_time < send_info[subflow->backup].linger_time) {
send_info[subflow->backup].ssk = ssk;
- send_info[subflow->backup].ratio = ratio;
+ send_info[subflow->backup].linger_time = linger_time;
}
}
__mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
- send_info[0].ssk = send_info[1].ssk;
-
- if (send_info[0].ssk) {
- msk->last_snd = send_info[0].ssk;
- msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- tcp_sk(msk->last_snd)->snd_wnd);
- return msk->last_snd;
- }
+ send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+ /* According to the blest algorithm, to avoid HoL blocking for the
+ * faster flow, we need to:
+ * - estimate the faster flow linger time
+ * - use the above to estimate the amount of bytes transferred
+ * by the faster flow
+ * - check that the amount of queued data is greater than the above,
+ * otherwise do not use the picked, slower, subflow
+ * We select the subflow with the shorter estimated time to flush
+ * the queued mem, which basically ensures the above. We just need
+ * to check that subflow has a non empty cwin.
+ */
+ ssk = send_info[SSK_MODE_ACTIVE].ssk;
+ if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+ return NULL;
- return NULL;
+ burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+ wmem = READ_ONCE(ssk->sk_wmem_queued);
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+ READ_ONCE(ssk->sk_pacing_rate) * burst,
+ burst + wmem);
+ msk->last_snd = ssk;
+ msk->snd_burst = burst;
+ return ssk;
}
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 67a61ac48b20..46691acdea24 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -391,6 +391,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
+ unsigned long avg_pacing_rate; /* protected by msk socket lock */
u64 local_key;
u64 remote_key;
u64 idsn;
--
2.26.3
Hello, On Tue, 2021-10-26 at 11:42 +0200, Paolo Abeni wrote: > The MPTCP packet scheduler has sub-optimal behavior with asymmetric > subflows: if the faster subflow-level cwin is closed, the packet > scheduler can enqueue "too much" data on a slower subflow. > > When all the data on the faster subflow is acked, if the mptcp-level > cwin is closed, and link utilization becomes suboptimal. > > The solution is implementing blest-like[1] HoL-blocking estimation, > transmitting only on the subflow with the shorter estimated time to > flush the queued memory. If such subflows cwin is closed, we wait > even if other subflows are available. > > This is quite simpler than the original blest implementation, as we > leverage the pacing rate provided by the TCP socket. To get a more > accurate estimation for the subflow linger-time, we maintain a > per-subflow weighted average of such info. > > Additionally drop magic numbers usage in favor of newly defined > macros and use more meaningful names for status variable. > > [1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > v1 -> v2: > - fix checkpatch issue (mat) > - rename ratio as linger_time (mat) As this patch empirically improves the situation quite a bit, what about applying it on the export branch, and staging it for a bit? If we find issues or improvements we can revert/drop/update. Otherwise we can keep it. Meanwhile it will get a better testing... WDYT? Paolo
On Tue, 2 Nov 2021, Paolo Abeni wrote: > Hello, > > On Tue, 2021-10-26 at 11:42 +0200, Paolo Abeni wrote: >> The MPTCP packet scheduler has sub-optimal behavior with asymmetric >> subflows: if the faster subflow-level cwin is closed, the packet >> scheduler can enqueue "too much" data on a slower subflow. >> >> When all the data on the faster subflow is acked, if the mptcp-level >> cwin is closed, and link utilization becomes suboptimal. >> >> The solution is implementing blest-like[1] HoL-blocking estimation, >> transmitting only on the subflow with the shorter estimated time to >> flush the queued memory. If such subflows cwin is closed, we wait >> even if other subflows are available. >> >> This is quite simpler than the original blest implementation, as we >> leverage the pacing rate provided by the TCP socket. To get a more >> accurate estimation for the subflow linger-time, we maintain a >> per-subflow weighted average of such info. >> >> Additionally drop magic numbers usage in favor of newly defined >> macros and use more meaningful names for status variable. >> >> [1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf >> >> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >> --- >> v1 -> v2: >> - fix checkpatch issue (mat) >> - rename ratio as linger_time (mat) > > As this patch empirically improves the situation quite a bit, what > about applying it on the export branch, and staging it for a bit? > > If we find issues or improvements we can revert/drop/update. Otherwise > we can keep it. Meanwhile it will get a better testing... > > WDYT? Hi Paolo - I finally (sorry about the delay) did capture some data to see what was going on with the pacing rate. 
I added a pr_debug() in mptcp_subflow_get_send() to dump the following: timestamp socket ptr sk_pacing_rate sk_pacing_shift mss_cache snd_cwnd packets_out srtt_us I chose those values based on what is used to calculate sk_pacing_rate - I should have also included snd_wnd and sk_wmem_queued, since those are used in the weighted average in this patch. But I ran out of time to update everything today and still get this reply written. The debug output didn't appear to interfere much with the simult_flows.sh tests, they still completed in the normal time (except for one that ran just a little long). I observed some patterns with the output I captured during a simult_flows.sh run: 1. When snd_cwnd and srtt_us were small (when data had not been sent yet or there was a time gap), sk_pacing_rate had very large spikes. Often 10x higher or more. 2. When packets were streaming out, sk_pacing_rate was quite consistent Some sockets just had a large initial value, then settled in to a stable range. An example is the attached normal-stable.png graph (socket ecab33d5 in the attached data). Here's the beginning of that "normal" case: socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us 00000000ecab33d5 1115.798277 73877551 10 1448 10 0 3136 00000000ecab33d5 1115.859702 5711316 10 1448 21 17 51112 00000000ecab33d5 1115.8793 2691321 10 1448 22 18 113631 00000000ecab33d5 1115.888531 2428742 10 1448 22 22 125916 00000000ecab33d5 1115.905641 2048124 10 1448 23 22 156103 00000000ecab33d5 1115.952846 1687326 10 1448 25 14 205959 00000000ecab33d5 1115.972365 1731531 10 1448 26 26 208729 The average pacing rate settles in after the first few points. But when snd_cwnd and srtt_us are small, the pacing rate was not reliable. Other sockets, which seemed to encounter gaps where packets were not sent, had spikes in sk_pacing_rate each time sending resumed. pacing_spikes.png shows this. 
An excerpt of that (socket 59324953): socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us 0000000059324953 1099.545071 1535124 10 1448 333 333 3015368 0000000059324953 1100.638369 28960000 10 1448 10 0 8000 0000000059324953 1108.17501 101614035 10 1448 10 0 2280 0000000059324953 1123.068772 212941176 10 1448 10 0 1088 0000000059324953 1123.277791 218360037 10 1448 10 7 1061 0000000059324953 1123.325756 5016147 10 1448 19 0 52653 0000000059324953 1123.406518 9498001 10 1448 20 1 29271 0000000059324953 1123.440064 4935750 10 1448 20 1 56327 Here, the pacing rate had settled to around 1.5 million, then suddenly jumped to ~29 million and ~218 million. Two orders of magnitude is a pretty huge change! If I had sampled sk_wmem_queued, I'm guessing that would show a small (or zero) value which would minimize the impact of these spikes on the weighted average. If this data is representative of typical behavior of sk_pacing_rate (and simult_flows.sh is representative enough of real-world behavior!), this does definitely confirm that the way the existing upstream code uses sk_pacing_rate is likely making some very sub-optimal choices when there are outliers in sk_pacing_rate. I think this also shows that we need to be careful about how these spikes affect a weighted average, because one sk_pacing_rate spike could end up influencing the scheduler for an even longer time depending on how it gets weighted. Before we merge a change here, we should develop a clear understanding of what the data going in to the scheduling decision looks like, and make sure that outliers in that input data are handled appropriately in a variety of scenarios. -- Mat Martineau Intel
On Tue, 2021-11-02 at 18:07 -0700, Mat Martineau wrote: > On Tue, 2 Nov 2021, Paolo Abeni wrote: > > > Hello, > > > > On Tue, 2021-10-26 at 11:42 +0200, Paolo Abeni wrote: > > > The MPTCP packet scheduler has sub-optimal behavior with asymmetric > > > subflows: if the faster subflow-level cwin is closed, the packet > > > scheduler can enqueue "too much" data on a slower subflow. > > > > > > When all the data on the faster subflow is acked, if the mptcp-level > > > cwin is closed, and link utilization becomes suboptimal. > > > > > > The solution is implementing blest-like[1] HoL-blocking estimation, > > > transmitting only on the subflow with the shorter estimated time to > > > flush the queued memory. If such subflows cwin is closed, we wait > > > even if other subflows are available. > > > > > > This is quite simpler than the original blest implementation, as we > > > leverage the pacing rate provided by the TCP socket. To get a more > > > accurate estimation for the subflow linger-time, we maintain a > > > per-subflow weighted average of such info. > > > > > > Additionally drop magic numbers usage in favor of newly defined > > > macros and use more meaningful names for status variable. > > > > > > [1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf > > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > --- > > > v1 -> v2: > > > - fix checkpatch issue (mat) > > > - rename ratio as linger_time (mat) > > > > As this patch empirically improves the situation quite a bit, what > > about applying it on the export branch, and staging it for a bit? > > > > If we find issues or improvements we can revert/drop/update. Otherwise > > we can keep it. Meanwhile it will get a better testing... > > > > WDYT? > > Hi Paolo - > > I finally (sorry about the delay) did capture some data to see what was > going on with the pacing rate. 
I added a pr_debug() in > mptcp_subflow_get_send() to dump the following: > > timestamp > socket ptr > sk_pacing_rate > sk_pacing_shift > mss_cache > snd_cwnd > packets_out > srtt_us > > I chose those values based on what is used to calculate sk_pacing_rate - I > should have also included snd_wnd and sk_wmem_queued, since those are used > in the weighted average in this patch. But I ran out of time to update > everything today and still get this reply written. > > The debug output didn't appear to interfere much with the simult_flows.sh > tests, they still completed in the normal time (except for one that ran > just a little long). > > I observed some patterns with the output I captured during a > simult_flows.sh run: > > 1. When snd_cwnd and srtt_us were small (when data had not been sent yet > or there was a time gap), sk_pacing_rate had very large spikes. Often 10x > higher or more. > > 2. When packets were streaming out, sk_pacing_rate was quite consistent > > > Some sockets just had a large initial value, then settled in to a stable > range. An example is the attached normal-stable.png graph (socket ecab33d5 > in the attached data). > > Here's the beginning of that "normal" case: > > socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us > > 00000000ecab33d5 1115.798277 73877551 10 1448 10 0 3136 > 00000000ecab33d5 1115.859702 5711316 10 1448 21 17 51112 > 00000000ecab33d5 1115.8793 2691321 10 1448 22 18 113631 > 00000000ecab33d5 1115.888531 2428742 10 1448 22 22 125916 > 00000000ecab33d5 1115.905641 2048124 10 1448 23 22 156103 > 00000000ecab33d5 1115.952846 1687326 10 1448 25 14 205959 > 00000000ecab33d5 1115.972365 1731531 10 1448 26 26 208729 > > The average pacing rate settles in after the first few points. But when > snd_cwnd and srtt_us are small, the pacing rate was not reliable. > > > Other sockets, which seemed to encounter gaps where packets were not sent, > had spikes in sk_pacing_rate each time sending resumed. 
pacing_spikes.png > shows this. > > An excerpt of that (socket 59324953): > > socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us > > 0000000059324953 1099.545071 1535124 10 1448 333 333 3015368 > 0000000059324953 1100.638369 28960000 10 1448 10 0 8000 > 0000000059324953 1108.17501 101614035 10 1448 10 0 2280 > 0000000059324953 1123.068772 212941176 10 1448 10 0 1088 > 0000000059324953 1123.277791 218360037 10 1448 10 7 1061 > 0000000059324953 1123.325756 5016147 10 1448 19 0 52653 > 0000000059324953 1123.406518 9498001 10 1448 20 1 29271 > 0000000059324953 1123.440064 4935750 10 1448 20 1 56327 > > Here, the pacing rate had settled to around 1.5 million, then suddenly > jumped to ~29 million and ~218 million. Two orders of magnitude is a > pretty huge change! If I had sampled sk_wmem_queued, I'm guessing that > would show a small (or zero) value which would minimize the impact of > these spikes on the weighted average. > > > > If this data is representative of typical behavior of sk_pacing_rate (and > simult_flows.sh is representative enough of real-world behavior!), this > does definitely confirm that the way the existing upstream code uses > sk_pacing_rate is likely making some very sub-optimal choices when there > are outliers in sk_pacing_rate. > > I think this also shows that we need to be careful about how these spikes > affect a weighted average, because one sk_pacing_rate spike could end up > influencing the scheduler for an even longer time depending on how it gets > weighted. Thank you for the detailed analisys and all the info shared! I think there is an important point I was unable to clarify so far: the average pacing rate is _NOT_ used to make any prediction or estimation on the future. It computes an approximation of the time to flush the queued data. Note due to pacing and the mptcp scheduler enqueuing data only if the cwin is open, we could compute such time with absolute accuracy, e.g. 
if the packets present into the tx queue have the following data: pkt len pacing 1 2000 100K 2 1000 1M 3 500 10K The time to flush will be: 1000/ 10K + 2000/1M + 500/100K regardless of the high fluctuation in the pacing rate. If the TCP code selects a suboptimal pacing rate value, the _TCP_ tput will be suboptimal, but the above equation will still be exact. The average schema used here is an approximation of the above. Why don't using the exact formula instead? it would require more overhead, more accounting (we would need to store a timing or pace info in the CB of each packet) and possibly even additional hooks (we would need to update the accounting when a packet transmission completes). Empirical results showed that the avg approximation is a good enough approximation. > Before we merge a change here, we should develop a clear > understanding of what the data going in to the scheduling decision looks > like, and make sure that outliers in that input data are handled > appropriately in a variety of scenarios. Due to the above, I still propose to go ahead with this version and ev., as needed, improve incrementally. Cheers, Paolo
On Wed, 3 Nov 2021, Paolo Abeni wrote: > On Tue, 2021-11-02 at 18:07 -0700, Mat Martineau wrote: >> On Tue, 2 Nov 2021, Paolo Abeni wrote: >> >>> Hello, >>> >>> On Tue, 2021-10-26 at 11:42 +0200, Paolo Abeni wrote: >>>> The MPTCP packet scheduler has sub-optimal behavior with asymmetric >>>> subflows: if the faster subflow-level cwin is closed, the packet >>>> scheduler can enqueue "too much" data on a slower subflow. >>>> >>>> When all the data on the faster subflow is acked, if the mptcp-level >>>> cwin is closed, and link utilization becomes suboptimal. >>>> >>>> The solution is implementing blest-like[1] HoL-blocking estimation, >>>> transmitting only on the subflow with the shorter estimated time to >>>> flush the queued memory. If such subflows cwin is closed, we wait >>>> even if other subflows are available. >>>> >>>> This is quite simpler than the original blest implementation, as we >>>> leverage the pacing rate provided by the TCP socket. To get a more >>>> accurate estimation for the subflow linger-time, we maintain a >>>> per-subflow weighted average of such info. >>>> >>>> Additionally drop magic numbers usage in favor of newly defined >>>> macros and use more meaningful names for status variable. >>>> >>>> [1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf >>>> >>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >>>> --- >>>> v1 -> v2: >>>> - fix checkpatch issue (mat) >>>> - rename ratio as linger_time (mat) >>> >>> As this patch empirically improves the situation quite a bit, what >>> about applying it on the export branch, and staging it for a bit? >>> >>> If we find issues or improvements we can revert/drop/update. Otherwise >>> we can keep it. Meanwhile it will get a better testing... >>> >>> WDYT? >> >> Hi Paolo - >> >> I finally (sorry about the delay) did capture some data to see what was >> going on with the pacing rate. 
I added a pr_debug() in >> mptcp_subflow_get_send() to dump the following: >> >> timestamp >> socket ptr >> sk_pacing_rate >> sk_pacing_shift >> mss_cache >> snd_cwnd >> packets_out >> srtt_us >> >> I chose those values based on what is used to calculate sk_pacing_rate - I >> should have also included snd_wnd and sk_wmem_queued, since those are used >> in the weighted average in this patch. But I ran out of time to update >> everything today and still get this reply written. >> >> The debug output didn't appear to interfere much with the simult_flows.sh >> tests, they still completed in the normal time (except for one that ran >> just a little long). >> >> I observed some patterns with the output I captured during a >> simult_flows.sh run: >> >> 1. When snd_cwnd and srtt_us were small (when data had not been sent yet >> or there was a time gap), sk_pacing_rate had very large spikes. Often 10x >> higher or more. >> >> 2. When packets were streaming out, sk_pacing_rate was quite consistent >> >> >> Some sockets just had a large initial value, then settled in to a stable >> range. An example is the attached normal-stable.png graph (socket ecab33d5 >> in the attached data). >> >> Here's the beginning of that "normal" case: >> >> socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us >> >> 00000000ecab33d5 1115.798277 73877551 10 1448 10 0 3136 >> 00000000ecab33d5 1115.859702 5711316 10 1448 21 17 51112 >> 00000000ecab33d5 1115.8793 2691321 10 1448 22 18 113631 >> 00000000ecab33d5 1115.888531 2428742 10 1448 22 22 125916 >> 00000000ecab33d5 1115.905641 2048124 10 1448 23 22 156103 >> 00000000ecab33d5 1115.952846 1687326 10 1448 25 14 205959 >> 00000000ecab33d5 1115.972365 1731531 10 1448 26 26 208729 >> >> The average pacing rate settles in after the first few points. But when >> snd_cwnd and srtt_us are small, the pacing rate was not reliable. 
>> >> >> Other sockets, which seemed to encounter gaps where packets were not sent, >> had spikes in sk_pacing_rate each time sending resumed. pacing_spikes.png >> shows this. >> >> An excerpt of that (socket 59324953): >> >> socket,timestamp,pacing_rate,pacing_shift,mss_cache,snd_cwnd,packets_out,srtt_us >> >> 0000000059324953 1099.545071 1535124 10 1448 333 333 3015368 >> 0000000059324953 1100.638369 28960000 10 1448 10 0 8000 >> 0000000059324953 1108.17501 101614035 10 1448 10 0 2280 >> 0000000059324953 1123.068772 212941176 10 1448 10 0 1088 >> 0000000059324953 1123.277791 218360037 10 1448 10 7 1061 >> 0000000059324953 1123.325756 5016147 10 1448 19 0 52653 >> 0000000059324953 1123.406518 9498001 10 1448 20 1 29271 >> 0000000059324953 1123.440064 4935750 10 1448 20 1 56327 >> >> Here, the pacing rate had settled to around 1.5 million, then suddenly >> jumped to ~29 million and ~218 million. Two orders of magnitude is a >> pretty huge change! If I had sampled sk_wmem_queued, I'm guessing that >> would show a small (or zero) value which would minimize the impact of >> these spikes on the weighted average. >> >> >> >> If this data is representative of typical behavior of sk_pacing_rate (and >> simult_flows.sh is representative enough of real-world behavior!), this >> does definitely confirm that the way the existing upstream code uses >> sk_pacing_rate is likely making some very sub-optimal choices when there >> are outliers in sk_pacing_rate. >> >> I think this also shows that we need to be careful about how these spikes >> affect a weighted average, because one sk_pacing_rate spike could end up >> influencing the scheduler for an even longer time depending on how it gets >> weighted. > > Thank you for the detailed analisys and all the info shared! > > I think there is an important point I was unable to clarify so far: the > average pacing rate is _NOT_ used to make any prediction or estimation > on the future. 
> > It computes an approximation of the time to flush the queued data. Note > due to pacing and the mptcp scheduler enqueuing data only if the cwin > is open, we could compute such time with absolute accuracy, e.g. if the > packets present into the tx queue have the following data: > > pkt len pacing > 1 2000 100K > 2 1000 1M > 3 500 10K > > The time to flush will be: > > 1000/ 10K + 2000/1M + 500/100K > > regardless of the high fluctuation in the pacing rate. If the TCP code > selects a suboptimal pacing rate value, the _TCP_ tput will be > suboptimal, but the above equation will still be exact. > > The average schema used here is an approximation of the above. Why > don't using the exact formula instead? it would require more overhead, > more accounting (we would need to store a timing or pace info in the CB > of each packet) and possibly even additional hooks (we would need to > update the accounting when a packet transmission completes). > > Empirical results showed that the avg approximation is a good enough > approximation. Thanks for the info and explanation. I do agree it's likely an improvement - I'll still collect some more data (this time with the avg, snd_wnd, and sk_wmem_queued) to see if more adjustments are needed before upstreaming. > >> Before we merge a change here, we should develop a clear >> understanding of what the data going in to the scheduling decision looks >> like, and make sure that outliers in that input data are handled >> appropriately in a variety of scenarios. > > Due to the above, I still propose to go ahead with this version and > ev., as needed, improve incrementally. I'm ok to provisionally add it to the export branch now to get some more test exposure beyond the tests I'm running locally. I haven't convinced myself it's ready for net-next yet but we can sort that part out later. -- Mat Martineau Intel
Hi Paolo, Mat, On 04/11/2021 00:28, Mat Martineau wrote: > On Wed, 3 Nov 2021, Paolo Abeni wrote: >> On Tue, 2021-11-02 at 18:07 -0700, Mat Martineau wrote: >>> On Tue, 2 Nov 2021, Paolo Abeni wrote: >>>> On Tue, 2021-10-26 at 11:42 +0200, Paolo Abeni wrote: >>>>> The MPTCP packet scheduler has sub-optimal behavior with asymmetric >>>>> subflows: if the faster subflow-level cwin is closed, the packet >>>>> scheduler can enqueue "too much" data on a slower subflow. >>>>> >>>>> When all the data on the faster subflow is acked, if the mptcp-level >>>>> cwin is closed, and link utilization becomes suboptimal. >>>>> >>>>> The solution is implementing blest-like[1] HoL-blocking estimation, >>>>> transmitting only on the subflow with the shorter estimated time to >>>>> flush the queued memory. If such subflows cwin is closed, we wait >>>>> even if other subflows are available. >>>>> >>>>> This is quite simpler than the original blest implementation, as we >>>>> leverage the pacing rate provided by the TCP socket. To get a more >>>>> accurate estimation for the subflow linger-time, we maintain a >>>>> per-subflow weighted average of such info. >>>>> >>>>> Additionally drop magic numbers usage in favor of newly defined >>>>> macros and use more meaningful names for status variable. Thank you for the new version of the patch! (...) > I do agree it's likely an improvement - I'll still collect some more > data (this time with the avg, snd_wnd, and sk_wmem_queued) to see if > more adjustments are needed before upstreaming. Small idea: if you redo the measurements, I do recommand you to use trace_printk() (or ebpf) instead of pr_debug() to collect data to avoid being impacted by printk() processing if you have a lot of stuff to display. 
>>> Before we merge a change here, we should develop a clear >>> understanding of what the data going in to the scheduling decision looks >>> like, and make sure that outliers in that input data are handled >>> appropriately in a variety of scenarios. >> >> Due to the above, I still propose to go ahead with this version and >> ev., as needed, improve incrementally. > > I'm ok to provisionally add it to the export branch now to get some more > test exposure beyond the tests I'm running locally. I haven't convinced > myself it's ready for net-next yet but we can sort that part out later. Great! Now applied in our tree with a Closes tag for issue /137! - 43868e0b4755: mptcp: enforce HoL-blocking estimation - Results: 1e4fe04f517d..6a2b7fa0be7c Builds and tests are now in progress: https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20211104T140457 https://github.com/multipath-tcp/mptcp_net-next/actions/workflows/build-validation.yml?query=branch:export Cheers, Matt -- Tessares | Belgium | Hybrid Access Solutions www.tessares.net
© 2016 - 2024 Red Hat, Inc.