seq_printf() is costly. When reading /proc/net/snmp, profiling indicates
that seq_printf() accounts for more than 50% of the samples in snmp_seq_show():
snmp_seq_show(97.751% 158722/162373)
    snmp_seq_show_tcp_udp.isra.0(40.017% 63515/158722)
        seq_printf(83.451% 53004/63515)
        seq_write(1.170% 743/63515)
        _find_next_bit(0.727% 462/63515)
        ...
    seq_printf(24.762% 39303/158722)
    snmp_seq_show_ipstats.isra.0(21.487% 34104/158722)
        seq_printf(85.788% 29257/34104)
        _find_next_bit(0.331% 113/34104)
        seq_write(0.235% 80/34104)
        ...
    icmpmsg_put(7.235% 11483/158722)
        seq_printf(41.714% 4790/11483)
        seq_write(2.630% 302/11483)
        ...
Time for a million rounds of stress reading /proc/net/snmp:
real 0m24.323s
user 0m0.293s
sys 0m23.679s
On average, reading /proc/net/snmp takes 0.023ms.
With this patch, the extra cost of seq_printf() is avoided, and a million
rounds of reading /proc/net/snmp now take only ~15.853s of system time:
real 0m16.386s
user 0m0.280s
sys 0m15.853s
On average, one read takes 0.015ms, a ~35% improvement.
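
The exact stress harness is not shown here; a minimal reproduction (an
illustrative sketch only -- the file name and buffer size are assumptions,
the loop count follows the "million rounds" above) could be a tight read
loop like:

/* stress_snmp.c - illustrative reproduction of the benchmark above.
 * Build:  gcc -O2 -o stress_snmp stress_snmp.c
 * Run under "time ./stress_snmp" to obtain real/user/sys figures
 * comparable to the ones quoted in this message.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16384];	/* large enough for one /proc/net/snmp dump */
	long i;

	for (i = 0; i < 1000000; i++) {		/* "a million rounds" */
		int fd = open("/proc/net/snmp", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* read to EOF; the data itself is discarded */
		while (read(fd, buf, sizeof(buf)) > 0)
			;
		close(fd);
	}
	return 0;
}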
Signed-off-by: David Wang <00107082@163.com>
---
net/ipv4/proc.c | 116 ++++++++++++++++++++++++++++--------------------
1 file changed, 69 insertions(+), 47 deletions(-)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40053a02bae1..3a4586103b5e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -315,13 +315,14 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
if (count) {
seq_puts(seq, "\nIcmpMsg:");
- for (j = 0; j < count; ++j)
- seq_printf(seq, " %sType%u",
- type[j] & 0x100 ? "Out" : "In",
- type[j] & 0xff);
+ for (j = 0; j < count; ++j) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, type[j] & 0x100 ? "Out" : "In");
+ seq_put_decimal_ull(seq, "Type", type[j] & 0xff);
+ }
seq_puts(seq, "\nIcmpMsg:");
for (j = 0; j < count; ++j)
- seq_printf(seq, " %lu", vals[j]);
+ seq_put_decimal_ull(seq, " ", vals[j]);
}
}
@@ -358,26 +359,35 @@ static void icmp_put(struct seq_file *seq)
atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
- for (i = 0; icmpmibmap[i].name; i++)
- seq_printf(seq, " In%s", icmpmibmap[i].name);
+ for (i = 0; icmpmibmap[i].name; i++) {
+ seq_puts(seq, " In");
+ seq_puts(seq, icmpmibmap[i].name);
+ }
seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
+ for (i = 0; icmpmibmap[i].name; i++) {
+ seq_puts(seq, " Out");
+ seq_puts(seq, icmpmibmap[i].name);
+ }
+ seq_puts(seq, "\nIcmp:");
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
for (i = 0; icmpmibmap[i].name; i++)
- seq_printf(seq, " Out%s", icmpmibmap[i].name);
- seq_printf(seq, "\nIcmp: %lu %lu %lu",
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ seq_put_decimal_ull(seq, " ", atomic_long_read(ptr + icmpmibmap[i].index));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
for (i = 0; icmpmibmap[i].name; i++)
- seq_printf(seq, " %lu",
- atomic_long_read(ptr + icmpmibmap[i].index));
- seq_printf(seq, " %lu %lu %lu %lu",
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
- for (i = 0; icmpmibmap[i].name; i++)
- seq_printf(seq, " %lu",
- atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+ seq_put_decimal_ull(seq, " ",
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
/*
@@ -392,8 +402,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
seq_puts(seq, "Ip: Forwarding DefaultTTL");
- for (i = 0; snmp4_ipstats_list[i].name; i++)
- seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+ for (i = 0; snmp4_ipstats_list[i].name; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_ipstats_list[i].name);
+ }
seq_printf(seq, "\nIp: %d %d",
IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
@@ -404,7 +416,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
net->mib.ip_statistics,
offsetof(struct ipstats_mib, syncp));
for (i = 0; snmp4_ipstats_list[i].name; i++)
- seq_printf(seq, " %llu", buff64[i]);
+ seq_put_decimal_ull(seq, " ", buff64[i]);
return 0;
}
@@ -418,8 +430,10 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name; i++)
- seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+ for (i = 0; snmp4_tcp_list[i].name; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_tcp_list[i].name);
+ }
seq_puts(seq, "\nTcp:");
snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
@@ -429,7 +443,7 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld", buff[i]);
else
- seq_printf(seq, " %lu", buff[i]);
+ seq_put_decimal_ull(seq, " ", buff[i]);
}
memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
@@ -437,11 +451,13 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
snmp_get_cpu_field_batch(buff, snmp4_udp_list,
net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
- seq_printf(seq, " %s", snmp4_udp_list[i].name);
+ for (i = 0; snmp4_udp_list[i].name; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_udp_list[i].name);
+ }
seq_puts(seq, "\nUdp:");
for (i = 0; snmp4_udp_list[i].name; i++)
- seq_printf(seq, " %lu", buff[i]);
+ seq_put_decimal_ull(seq, " ", buff[i]);
memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
@@ -449,11 +465,13 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdpLite:");
snmp_get_cpu_field_batch(buff, snmp4_udp_list,
net->mib.udplite_statistics);
- for (i = 0; snmp4_udp_list[i].name; i++)
- seq_printf(seq, " %s", snmp4_udp_list[i].name);
+ for (i = 0; snmp4_udp_list[i].name; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_udp_list[i].name);
+ }
seq_puts(seq, "\nUdpLite:");
for (i = 0; snmp4_udp_list[i].name; i++)
- seq_printf(seq, " %lu", buff[i]);
+ seq_put_decimal_ull(seq, " ", buff[i]);
seq_putc(seq, '\n');
return 0;
@@ -483,8 +501,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
int i;
seq_puts(seq, "TcpExt:");
- for (i = 0; i < tcp_cnt; i++)
- seq_printf(seq, " %s", snmp4_net_list[i].name);
+ for (i = 0; i < tcp_cnt; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_net_list[i].name);
+ }
seq_puts(seq, "\nTcpExt:");
buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
@@ -493,16 +513,18 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
snmp_get_cpu_field_batch(buff, snmp4_net_list,
net->mib.net_statistics);
for (i = 0; i < tcp_cnt; i++)
- seq_printf(seq, " %lu", buff[i]);
+ seq_put_decimal_ull(seq, " ", buff[i]);
} else {
for (i = 0; i < tcp_cnt; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.net_statistics,
- snmp4_net_list[i].entry));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
}
seq_puts(seq, "\nIpExt:");
- for (i = 0; i < ip_cnt; i++)
- seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+ for (i = 0; i < ip_cnt; i++) {
+ seq_putc(seq, ' ');
+ seq_puts(seq, snmp4_ipextstats_list[i].name);
+ }
seq_puts(seq, "\nIpExt:");
if (buff) {
@@ -513,13 +535,13 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
net->mib.ip_statistics,
offsetof(struct ipstats_mib, syncp));
for (i = 0; i < ip_cnt; i++)
- seq_printf(seq, " %llu", buff64[i]);
+ seq_put_decimal_ull(seq, " ", buff64[i]);
} else {
for (i = 0; i < ip_cnt; i++)
- seq_printf(seq, " %llu",
- snmp_fold_field64(net->mib.ip_statistics,
- snmp4_ipextstats_list[i].entry,
- offsetof(struct ipstats_mib, syncp)));
+ seq_put_decimal_ull(seq, " ",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
}
kfree(buff);
seq_putc(seq, '\n');
--
2.39.2
On 11/11/24 05:56, David Wang wrote:
> seq_printf() is costly. When reading /proc/net/snmp, profiling indicates
> that seq_printf() accounts for more than 50% of the samples in
> snmp_seq_show():
> [...]
>
> Signed-off-by: David Wang <00107082@163.com>

If user space is really concerned with snmp access performance, I think
such information should be exposed via netlink.

Still, the goal of the optimization looks doubtful. The total number of
MIB domains is constant and limited (differently from the number of
network devices, which in specific setups can grow a lot). Stats polling
should be a low-frequency operation. Why do you need to optimize it?

I don't think we should accept this change either, and a solid
explanation would be needed to introduce a netlink MIB interface.

> ---
> net/ipv4/proc.c | 116 ++++++++++++++++++++++++++++--------------------

FTR you missed mptcp.

/P
At 2024-11-14 17:30:34, "Paolo Abeni" <pabeni@redhat.com> wrote:
> On 11/11/24 05:56, David Wang wrote:
> [...]
>
> If user space is really concerned with snmp access performance, I think
> such information should be exposed via netlink.
>
> Still, the goal of the optimization looks doubtful. The total number of
> MIB domains is constant and limited (differently from the number of
> network devices, which in specific setups can grow a lot). Stats polling
> should be a low-frequency operation. Why do you need to optimize it?

Well, one thing I think is worth mentioning: optimizing /proc entries can
help increase the sampling frequency, and hence allow more accurate rate
analysis, for monitoring tools with a fixed/limited CPU quota.

And for /proc/net/*, the optimization would be amplified when considering
network namespaces. I think it is worth optimizing.

> I don't think we should accept this change either, and a solid
> explanation would be needed to introduce a netlink MIB interface.
>
>> ---
>> net/ipv4/proc.c | 116 ++++++++++++++++++++++++++++--------------------
>
> FTR you missed mptcp.
>
> /P
On 11/14/24 11:19, David Wang wrote:
> At 2024-11-14 17:30:34, "Paolo Abeni" <pabeni@redhat.com> wrote:
>> [...]
>>
>> Still, the goal of the optimization looks doubtful. The total number of
>> MIB domains is constant and limited (differently from the number of
>> network devices, which in specific setups can grow a lot). Stats polling
>> should be a low-frequency operation. Why do you need to optimize it?
>
> Well, one thing I think is worth mentioning: optimizing /proc entries can
> help increase the sampling frequency, and hence allow more accurate rate
> analysis, for monitoring tools with a fixed/limited CPU quota.
>
> And for /proc/net/*, the optimization would be amplified when considering
> network namespaces. I think it is worth optimizing.

I guess scaling better with many namespaces could be useful.

Please try to implement this interface over netlink.

Thanks,

Paolo