[PATCH] net/ipv4/proc: Avoid use of seq_printf() when reading /proc/net/snmp

Posted by David Wang 1 week, 5 days ago
seq_printf() is costly: when reading /proc/net/snmp, profiling indicates
that seq_printf() accounts for more than 50% of the samples in snmp_seq_show():
	snmp_seq_show(97.751% 158722/162373)
	    snmp_seq_show_tcp_udp.isra.0(40.017% 63515/158722)
		seq_printf(83.451% 53004/63515)
		seq_write(1.170% 743/63515)
		_find_next_bit(0.727% 462/63515)
		...
	    seq_printf(24.762% 39303/158722)
	    snmp_seq_show_ipstats.isra.0(21.487% 34104/158722)
		seq_printf(85.788% 29257/34104)
		_find_next_bit(0.331% 113/34104)
		seq_write(0.235% 80/34104)
		...
	    icmpmsg_put(7.235% 11483/158722)
		seq_printf(41.714% 4790/11483)
		seq_write(2.630% 302/11483)
		...
Time for a million rounds of stress reading /proc/net/snmp:
	real	0m24.323s
	user	0m0.293s
	sys	0m23.679s
On average, reading /proc/net/snmp takes 0.023ms.
With this patch, the extra cost of seq_printf() is avoided, and a million
rounds of reading /proc/net/snmp now take only ~15.9s of system time:
	real	0m16.386s
	user	0m0.280s
	sys	0m15.853s
On average, one read takes 0.015ms, a ~35% improvement.
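
For reference, a stress reader of roughly this shape reproduces the
measurement; the exact harness was not posted, so this is an illustrative
sketch (build it and run ./snmp_read 1000000 under time(1)):

	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Open, drain and close /proc/net/snmp in a tight loop. */
	int main(int argc, char **argv)
	{
		char buf[16384];
		long i, rounds = argc > 1 ? atol(argv[1]) : 1000000;

		for (i = 0; i < rounds; i++) {
			int fd = open("/proc/net/snmp", O_RDONLY);

			if (fd < 0)
				return 1;
			while (read(fd, buf, sizeof(buf)) > 0)
				;
			close(fd);
		}
		return 0;
	}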

Signed-off-by: David Wang <00107082@163.com>
---
 net/ipv4/proc.c | 116 ++++++++++++++++++++++++++++--------------------
 1 file changed, 69 insertions(+), 47 deletions(-)

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40053a02bae1..3a4586103b5e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -315,13 +315,14 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
 
 	if (count) {
 		seq_puts(seq, "\nIcmpMsg:");
-		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %sType%u",
-				type[j] & 0x100 ? "Out" : "In",
-				type[j] & 0xff);
+		for (j = 0; j < count; ++j) {
+			seq_putc(seq, ' ');
+			seq_puts(seq, type[j] & 0x100 ? "Out" : "In");
+			seq_put_decimal_ull(seq, "Type", type[j] & 0xff);
+		}
 		seq_puts(seq, "\nIcmpMsg:");
 		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %lu", vals[j]);
+			seq_put_decimal_ull(seq, " ", vals[j]);
 	}
 }
 
@@ -358,26 +359,35 @@ static void icmp_put(struct seq_file *seq)
 	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
 
 	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
-	for (i = 0; icmpmibmap[i].name; i++)
-		seq_printf(seq, " In%s", icmpmibmap[i].name);
+	for (i = 0; icmpmibmap[i].name; i++) {
+		seq_puts(seq, " In");
+		seq_puts(seq, icmpmibmap[i].name);
+	}
 	seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
+	for (i = 0; icmpmibmap[i].name; i++) {
+		seq_puts(seq, " Out");
+		seq_puts(seq, icmpmibmap[i].name);
+	}
+	seq_puts(seq, "\nIcmp:");
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
 	for (i = 0; icmpmibmap[i].name; i++)
-		seq_printf(seq, " Out%s", icmpmibmap[i].name);
-	seq_printf(seq, "\nIcmp: %lu %lu %lu",
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+		seq_put_decimal_ull(seq, " ", atomic_long_read(ptr + icmpmibmap[i].index));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL));
+	seq_put_decimal_ull(seq, " ",
+			    snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
 	for (i = 0; icmpmibmap[i].name; i++)
-		seq_printf(seq, " %lu",
-			   atomic_long_read(ptr + icmpmibmap[i].index));
-	seq_printf(seq, " %lu %lu %lu %lu",
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
-		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
-	for (i = 0; icmpmibmap[i].name; i++)
-		seq_printf(seq, " %lu",
-			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+		seq_put_decimal_ull(seq, " ",
+				    atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
 }
 
 /*
@@ -392,8 +402,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
 	memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
 
 	seq_puts(seq, "Ip: Forwarding DefaultTTL");
-	for (i = 0; snmp4_ipstats_list[i].name; i++)
-		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+	for (i = 0; snmp4_ipstats_list[i].name; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_ipstats_list[i].name);
+	}
 
 	seq_printf(seq, "\nIp: %d %d",
 		   IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
@@ -404,7 +416,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
 				   net->mib.ip_statistics,
 				   offsetof(struct ipstats_mib, syncp));
 	for (i = 0; snmp4_ipstats_list[i].name; i++)
-		seq_printf(seq, " %llu", buff64[i]);
+		seq_put_decimal_ull(seq, " ", buff64[i]);
 
 	return 0;
 }
@@ -418,8 +430,10 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
 	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
 
 	seq_puts(seq, "\nTcp:");
-	for (i = 0; snmp4_tcp_list[i].name; i++)
-		seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+	for (i = 0; snmp4_tcp_list[i].name; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_tcp_list[i].name);
+	}
 
 	seq_puts(seq, "\nTcp:");
 	snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
@@ -429,7 +443,7 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
 		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
 			seq_printf(seq, " %ld", buff[i]);
 		else
-			seq_printf(seq, " %lu", buff[i]);
+			seq_put_decimal_ull(seq, " ", buff[i]);
 	}
 
 	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
@@ -437,11 +451,13 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
 	snmp_get_cpu_field_batch(buff, snmp4_udp_list,
 				 net->mib.udp_statistics);
 	seq_puts(seq, "\nUdp:");
-	for (i = 0; snmp4_udp_list[i].name; i++)
-		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+	for (i = 0; snmp4_udp_list[i].name; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_udp_list[i].name);
+	}
 	seq_puts(seq, "\nUdp:");
 	for (i = 0; snmp4_udp_list[i].name; i++)
-		seq_printf(seq, " %lu", buff[i]);
+		seq_put_decimal_ull(seq, " ", buff[i]);
 
 	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
 
@@ -449,11 +465,13 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nUdpLite:");
 	snmp_get_cpu_field_batch(buff, snmp4_udp_list,
 				 net->mib.udplite_statistics);
-	for (i = 0; snmp4_udp_list[i].name; i++)
-		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+	for (i = 0; snmp4_udp_list[i].name; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_udp_list[i].name);
+	}
 	seq_puts(seq, "\nUdpLite:");
 	for (i = 0; snmp4_udp_list[i].name; i++)
-		seq_printf(seq, " %lu", buff[i]);
+		seq_put_decimal_ull(seq, " ", buff[i]);
 
 	seq_putc(seq, '\n');
 	return 0;
@@ -483,8 +501,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	int i;
 
 	seq_puts(seq, "TcpExt:");
-	for (i = 0; i < tcp_cnt; i++)
-		seq_printf(seq, " %s", snmp4_net_list[i].name);
+	for (i = 0; i < tcp_cnt; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_net_list[i].name);
+	}
 
 	seq_puts(seq, "\nTcpExt:");
 	buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
@@ -493,16 +513,18 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 		snmp_get_cpu_field_batch(buff, snmp4_net_list,
 					 net->mib.net_statistics);
 		for (i = 0; i < tcp_cnt; i++)
-			seq_printf(seq, " %lu", buff[i]);
+			seq_put_decimal_ull(seq, " ", buff[i]);
 	} else {
 		for (i = 0; i < tcp_cnt; i++)
-			seq_printf(seq, " %lu",
-				   snmp_fold_field(net->mib.net_statistics,
-						   snmp4_net_list[i].entry));
+			seq_put_decimal_ull(seq, " ",
+					    snmp_fold_field(net->mib.net_statistics,
+							    snmp4_net_list[i].entry));
 	}
 	seq_puts(seq, "\nIpExt:");
-	for (i = 0; i < ip_cnt; i++)
-		seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+	for (i = 0; i < ip_cnt; i++) {
+		seq_putc(seq, ' ');
+		seq_puts(seq, snmp4_ipextstats_list[i].name);
+	}
 
 	seq_puts(seq, "\nIpExt:");
 	if (buff) {
@@ -513,13 +535,13 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 					   net->mib.ip_statistics,
 					   offsetof(struct ipstats_mib, syncp));
 		for (i = 0; i < ip_cnt; i++)
-			seq_printf(seq, " %llu", buff64[i]);
+			seq_put_decimal_ull(seq, " ", buff64[i]);
 	} else {
 		for (i = 0; i < ip_cnt; i++)
-			seq_printf(seq, " %llu",
-				   snmp_fold_field64(net->mib.ip_statistics,
-						     snmp4_ipextstats_list[i].entry,
-						     offsetof(struct ipstats_mib, syncp)));
+			seq_put_decimal_ull(seq, " ",
+					    snmp_fold_field64(net->mib.ip_statistics,
+							      snmp4_ipextstats_list[i].entry,
+							      offsetof(struct ipstats_mib, syncp)));
 	}
 	kfree(buff);
 	seq_putc(seq, '\n');
-- 
2.39.2
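
For context on the conversion: seq_put_decimal_ull() takes a delimiter
string and an unsigned long long, writes the delimiter and then converts
the number to decimal directly into the seq_file buffer, so the vsnprintf()
format parsing that dominates the profile above is skipped entirely. A
minimal sketch of the substitution pattern (illustrative only; the
emit_vals_* helpers are hypothetical and not part of the patch):

	#include <linux/seq_file.h>

	/* Before: one format-string parse per value. */
	static void emit_vals_printf(struct seq_file *seq,
				     const unsigned long *vals, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			seq_printf(seq, " %lu", vals[i]);
	}

	/* After: delimiter plus direct decimal conversion, no parsing. */
	static void emit_vals_decimal(struct seq_file *seq,
				      const unsigned long *vals, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			seq_put_decimal_ull(seq, " ", vals[i]);
	}

Note the one signed counter, TCP_MIB_MAXCONN (which can be negative),
keeps its seq_printf(" %ld") in the patch; seq_put_decimal_ll() would be
the matching helper if that call were ever converted as well.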
Re: [PATCH] net/ipv4/proc: Avoid use of seq_printf() when reading /proc/net/snmp
Posted by Paolo Abeni 1 week, 2 days ago
On 11/11/24 05:56, David Wang wrote:
> seq_printf() is costly: when reading /proc/net/snmp, profiling indicates
> that seq_printf() accounts for more than 50% of the samples in snmp_seq_show():
> [...]
> On average, one read takes 0.015ms, a ~35% improvement.
> 
> Signed-off-by: David Wang <00107082@163.com>

If user space is really concerned with SNMP access performance, I
think such information should be exposed via netlink.

Still, the goal of the optimization looks doubtful. The total number of
MIB domains is constant and limited (unlike the number of network
devices, which can grow a lot in specific setups). Stats polling
should be a low-frequency operation. Why do you need to optimize it?

I don't think we should accept this change either. And a solid explanation
would be needed to introduce a netlink MIB interface.

> ---
>  net/ipv4/proc.c | 116 ++++++++++++++++++++++++++++--------------------

FTR you missed mptcp.

/P
Re: [PATCH] net/ipv4/proc: Avoid use of seq_printf() when reading /proc/net/snmp
Posted by David Wang 1 week, 2 days ago
At 2024-11-14 17:30:34, "Paolo Abeni" <pabeni@redhat.com> wrote:
>On 11/11/24 05:56, David Wang wrote:
>> seq_printf() is costly: when reading /proc/net/snmp, profiling indicates
>> that seq_printf() accounts for more than 50% of the samples in snmp_seq_show():
>> [...]
>> Signed-off-by: David Wang <00107082@163.com>
>
>If user space is really concerned with SNMP access performance, I
>think such information should be exposed via netlink.
>
>Still, the goal of the optimization looks doubtful. The total number of
>MIB domains is constant and limited (unlike the number of network
>devices, which can grow a lot in specific setups). Stats polling
>should be a low-frequency operation. Why do you need to optimize it?

Well, one thing worth mentioning: optimizing /proc entries can help
increase the sampling frequency, and hence the accuracy of rate analysis,
for monitoring tools running under a fixed/limited CPU quota.

And for /proc/net/*, the optimization is amplified when considering
network namespaces (see the sketch below).

I think it is worth optimizing.
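
To make the amplification concrete: a monitoring tool sampling every
namespace pays the read cost once per namespace per tick, so with N
namespaces the per-read saving multiplies by N. A rough sketch, assuming
named namespaces under /var/run/netns and CAP_SYS_ADMIN (sample_all() is
a hypothetical helper, not an existing tool):

	#define _GNU_SOURCE
	#include <dirent.h>
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Enter each named netns and drain its /proc/net/snmp once.
	 * Sketch only: the original namespace is not restored, and
	 * error handling is trimmed. */
	static void sample_all(void)
	{
		char path[280], buf[16384];
		struct dirent *e;
		DIR *d = opendir("/var/run/netns");

		if (!d)
			return;
		while ((e = readdir(d)) != NULL) {
			int nsfd, fd;

			if (e->d_name[0] == '.')
				continue;
			snprintf(path, sizeof(path), "/var/run/netns/%s",
				 e->d_name);
			nsfd = open(path, O_RDONLY);
			if (nsfd < 0)
				continue;
			if (setns(nsfd, CLONE_NEWNET) == 0) {
				/* /proc/self/net reflects the current netns */
				fd = open("/proc/self/net/snmp", O_RDONLY);
				if (fd >= 0) {
					while (read(fd, buf, sizeof(buf)) > 0)
						;
					close(fd);
				}
			}
			close(nsfd);
		}
		closedir(d);
	}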

Re: [PATCH] net/ipv4/proc: Avoid use of seq_printf() when reading /proc/net/snmp
Posted by Paolo Abeni 1 week, 2 days ago
On 11/14/24 11:19, David Wang wrote:
> At 2024-11-14 17:30:34, "Paolo Abeni" <pabeni@redhat.com> wrote:
>> On 11/11/24 05:56, David Wang wrote:
>>> seq_printf() is costly: when reading /proc/net/snmp, profiling indicates
>>> [...]
>>> Signed-off-by: David Wang <00107082@163.com>
>>
>> If user space is really concerned with SNMP access performance, I
>> think such information should be exposed via netlink.
>>
>> Still, the goal of the optimization looks doubtful. The total number of
>> MIB domains is constant and limited (unlike the number of network
>> devices, which can grow a lot in specific setups). Stats polling
>> should be a low-frequency operation. Why do you need to optimize it?
> 
> Well, one thing worth mentioning: optimizing /proc entries can help
> increase the sampling frequency, and hence the accuracy of rate analysis,
> for monitoring tools running under a fixed/limited CPU quota.
> 
> And for /proc/net/*, the optimization is amplified when considering network namespaces.

I guess scaling better with many namespaces could be useful.

Please try to implement this interface over netlink.

Thanks,

Paolo