include/linux/netfilter_bridge/ebtables.h | 1 - net/bridge/netfilter/ebtables.c | 140 ++++++++++++++++------ 2 files changed, 102 insertions(+), 39 deletions(-)
When conducting WRK testing, the CPU usage of the testing machine reached
100%. When forwarding through a bridge, if the network load is too high, it
may cause abnormal load in ebt_do_table of the kernel ebtables module, leading
to excessive soft interrupts and sometimes even directly causing CPU soft
lockups.
After analysis, it was found that the ebtables code had not been optimized
for a long time and still used read-write locks. The arp/ip/ip6 tables, by
contrast, were optimized long ago, after the performance bottleneck of
read-write locks in this fast path was identified.
Ref link: https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/
So I referred to arp/ip/ip6 modification methods to optimize the read-write
lock in ebtables.c.
test method:
1) Test machine creates bridge :
``` bash
brctl addbr br-a
brctl addbr br-b
brctl addif br-a enp1s0f0 enp1s0f1
brctl addif br-b enp130s0f0 enp130s0f1
ifconfig br-a up
ifconfig br-b up
```
2) Testing with another machine:
``` bash
ulimit -n 2048
./wrk -t48 -c2000 -d6000 -R10000 -s request.lua http://4.4.4.2:80/4k.html &
./wrk -t48 -c2000 -d6000 -R10000 -s request.lua http://5.5.5.2:80/4k.html &
```
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: yushengjin <yushengjin@uniontech.com>
Link: https://lore.kernel.org/all/CANn89iJCBRCM3aHDy-7gxWu_+agXC9M1R=hwFuh2G9RSLu_6bg@mail.gmail.com/
---
include/linux/netfilter_bridge/ebtables.h | 1 -
net/bridge/netfilter/ebtables.c | 140 ++++++++++++++++------
2 files changed, 102 insertions(+), 39 deletions(-)
diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index fd533552a062..15aad1e479d7 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -93,7 +93,6 @@ struct ebt_table {
char name[EBT_TABLE_MAXNAMELEN];
struct ebt_replace_kernel *table;
unsigned int valid_hooks;
- rwlock_t lock;
/* the data used by the kernel */
struct ebt_table_info *private;
struct nf_hook_ops *ops;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 3e67d4aff419..08e430fcbe5a 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -204,11 +204,14 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
const char *base;
const struct ebt_table_info *private;
struct xt_action_param acpar;
+ unsigned int addend;
acpar.state = state;
acpar.hotdrop = false;
- read_lock_bh(&table->lock);
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+
private = table->private;
cb_base = COUNTER_BASE(private->counters, private->nentries,
smp_processor_id());
@@ -229,10 +232,8 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
goto letscontinue;
- if (acpar.hotdrop) {
- read_unlock_bh(&table->lock);
- return NF_DROP;
- }
+ if (acpar.hotdrop)
+ goto drop_out;
ADD_COUNTER(*(counter_base + i), skb->len, 1);
@@ -251,13 +252,13 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
verdict = t->u.target->target(skb, &acpar);
}
if (verdict == EBT_ACCEPT) {
- read_unlock_bh(&table->lock);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
return NF_ACCEPT;
}
- if (verdict == EBT_DROP) {
- read_unlock_bh(&table->lock);
- return NF_DROP;
- }
+ if (verdict == EBT_DROP)
+ goto drop_out;
+
if (verdict == EBT_RETURN) {
letsreturn:
if (WARN(sp == 0, "RETURN on base chain")) {
@@ -278,10 +279,8 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
if (verdict == EBT_CONTINUE)
goto letscontinue;
- if (WARN(verdict < 0, "bogus standard verdict\n")) {
- read_unlock_bh(&table->lock);
- return NF_DROP;
- }
+ if (WARN(verdict < 0, "bogus standard verdict\n"))
+ goto drop_out;
/* jump to a udc */
cs[sp].n = i + 1;
@@ -290,10 +289,8 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
i = 0;
chaininfo = (struct ebt_entries *) (base + verdict);
- if (WARN(chaininfo->distinguisher, "jump to non-chain\n")) {
- read_unlock_bh(&table->lock);
- return NF_DROP;
- }
+ if (WARN(chaininfo->distinguisher, "jump to non-chain\n"))
+ goto drop_out;
nentries = chaininfo->nentries;
point = (struct ebt_entry *)chaininfo->data;
@@ -309,10 +306,15 @@ unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
if (chaininfo->policy == EBT_RETURN)
goto letsreturn;
if (chaininfo->policy == EBT_ACCEPT) {
- read_unlock_bh(&table->lock);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
return NF_ACCEPT;
}
- read_unlock_bh(&table->lock);
+
+drop_out:
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
return NF_DROP;
}
@@ -983,12 +985,48 @@ static int translate_table(struct net *net, const char *name,
return ret;
}
-/* called under write_lock */
+
static void get_counters(const struct ebt_counter *oldcounters,
struct ebt_counter *counters, unsigned int nentries)
{
int i, cpu;
struct ebt_counter *counter_base;
+ seqcount_t *s;
+
+ /* counters of cpu 0 */
+ memcpy(counters, oldcounters,
+ sizeof(struct ebt_counter) * nentries);
+
+ /* add other counters to those of cpu 0 */
+ for_each_possible_cpu(cpu) {
+
+ if (cpu == 0)
+ continue;
+
+ s = &per_cpu(xt_recseq, cpu);
+ counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
+ for (i = 0; i < nentries; i++) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = counter_base[i].bcnt;
+ pcnt = counter_base[i].pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ cond_resched();
+ }
+ }
+}
+
+
+static void get_old_counters(const struct ebt_counter *oldcounters,
+ struct ebt_counter *counters, unsigned int nentries)
+{
+ int i, cpu;
+ struct ebt_counter *counter_base;
/* counters of cpu 0 */
memcpy(counters, oldcounters,
@@ -1013,6 +1051,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
/* used to be able to unlock earlier */
struct ebt_table_info *table;
struct ebt_table *t;
+ unsigned int cpu;
/* the user wants counters back
* the check on the size is done later, when we have the lock
@@ -1050,6 +1089,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
goto free_unlock;
}
+ local_bh_disable();
+
/* we have the mutex lock, so no danger in reading this pointer */
table = t->private;
/* make sure the table can only be rmmod'ed if it contains no rules */
@@ -1058,15 +1099,31 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
goto free_unlock;
} else if (table->nentries && !newinfo->nentries)
module_put(t->me);
- /* we need an atomic snapshot of the counters */
- write_lock_bh(&t->lock);
- if (repl->num_counters)
- get_counters(t->private->counters, counterstmp,
- t->private->nentries);
+ smp_wmb();
t->private = newinfo;
- write_unlock_bh(&t->lock);
+ smp_mb();
+
+ local_bh_enable();
+
+ /* wait for even xt_recseq on all cpus */
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+ u32 seq = raw_read_seqcount(s);
+
+ if (seq & 1) {
+ do {
+ cond_resched();
+ cpu_relax();
+ } while (seq == raw_read_seqcount(s));
+ }
+ }
+
mutex_unlock(&ebt_mutex);
+
+ if (repl->num_counters)
+ get_old_counters(table->counters, counterstmp, table->nentries);
+
/* so, a user can change the chains while having messed up her counter
* allocation. Only reason why this is done is because this way the lock
* is held only once, while this doesn't bring the kernel into a
@@ -1093,6 +1150,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
return 0;
free_unlock:
+ local_bh_enable();
mutex_unlock(&ebt_mutex);
free_iterate:
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
@@ -1235,7 +1293,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
goto free_chainstack;
table->private = newinfo;
- rwlock_init(&table->lock);
mutex_lock(&ebt_mutex);
list_for_each_entry(t, &ebt_net->tables, list) {
if (strcmp(t->name, table->name) == 0) {
@@ -1379,9 +1436,11 @@ static int do_update_counters(struct net *net, const char *name,
struct ebt_counter __user *counters,
unsigned int num_counters, unsigned int len)
{
- int i, ret;
- struct ebt_counter *tmp;
+ int i, ret, cpu;
+ struct ebt_counter *tmp, *counter_base;
struct ebt_table *t;
+ unsigned int addend;
+ const struct ebt_table_info *private;
if (num_counters == 0)
return -EINVAL;
@@ -1405,14 +1464,21 @@ static int do_update_counters(struct net *net, const char *name,
goto unlock_mutex;
}
- /* we want an atomic add of the counters */
- write_lock_bh(&t->lock);
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = t->private;
+ cpu = smp_processor_id();
+
+ /* we add to the counters of the current cpu */
+ for (i = 0; i < num_counters; i++) {
+ counter_base = COUNTER_BASE(private->counters,
+ private->nentries, cpu);
+ ADD_COUNTER(counter_base[i], tmp[i].bcnt, tmp[i].pcnt);
+ }
- /* we add to the counters of the first cpu */
- for (i = 0; i < num_counters; i++)
- ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
- write_unlock_bh(&t->lock);
ret = 0;
unlock_mutex:
mutex_unlock(&ebt_mutex);
@@ -1530,9 +1596,7 @@ static int copy_counters_to_user(struct ebt_table *t,
if (!counterstmp)
return -ENOMEM;
- write_lock_bh(&t->lock);
get_counters(oldcounters, counterstmp, nentries);
- write_unlock_bh(&t->lock);
if (copy_to_user(user, counterstmp,
array_size(nentries, sizeof(struct ebt_counter))))
--
2.43.0
On Tue, 24 Sep 2024 17:09:06 +0800 yushengjin <yushengjin@uniontech.com> wrote: > When conducting WRK testing, the CPU usage rate of the testing machine was > 100%. forwarding through a bridge, if the network load is too high, it may > cause abnormal load on the ebt_do_table of the kernel ebtable module, leading > to excessive soft interrupts and sometimes even directly causing CPU soft > deadlocks. > > After analysis, it was found that the code of ebtables had not been optimized > for a long time, and the read-write locks inside still existed. However, other > arp/ip/ip6 tables had already been optimized a lot, and performance bottlenecks > in read-write locks had been discovered a long time ago. > > Ref link: https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/ > > So I referred to arp/ip/ip6 modification methods to optimize the read-write > lock in ebtables.c. What about doing RCU instead, faster and safer.
On Tue, Sep 24, 2024 at 3:33 PM Stephen Hemminger <stephen@networkplumber.org> wrote: > > On Tue, 24 Sep 2024 17:09:06 +0800 > yushengjin <yushengjin@uniontech.com> wrote: > > > When conducting WRK testing, the CPU usage rate of the testing machine was > > 100%. forwarding through a bridge, if the network load is too high, it may > > cause abnormal load on the ebt_do_table of the kernel ebtable module, leading > > to excessive soft interrupts and sometimes even directly causing CPU soft > > deadlocks. > > > > After analysis, it was found that the code of ebtables had not been optimized > > for a long time, and the read-write locks inside still existed. However, other > > arp/ip/ip6 tables had already been optimized a lot, and performance bottlenecks > > in read-write locks had been discovered a long time ago. > > > > Ref link: https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/ > > > > So I referred to arp/ip/ip6 modification methods to optimize the read-write > > lock in ebtables.c. > > What about doing RCU instead, faster and safer. Safer ? How so ? Stephen, we have used this stuff already in other netfilter components since 2011 No performance issue at all. Honestly, this old link ( https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/ ) is quite confusing, please yushengjin do not include it next time, or we will get outdated feedback. Instead, point to the real useful commit : commit 7f5c6d4f665bb57a19a34ce1fb16cc708c04f219 netfilter: get rid of atomic ops in fast path This is the useful commit, because this ebtable patch simply adopts the solution already used in iptables. And please compile your patch, and boot it, test it before sending it again.
On Tue, 24 Sep 2024 15:46:17 +0200 Eric Dumazet <edumazet@google.com> wrote: > On Tue, Sep 24, 2024 at 3:33 PM Stephen Hemminger > <stephen@networkplumber.org> wrote: > > > > On Tue, 24 Sep 2024 17:09:06 +0800 > > yushengjin <yushengjin@uniontech.com> wrote: > > > > > When conducting WRK testing, the CPU usage rate of the testing machine was > > > 100%. forwarding through a bridge, if the network load is too high, it may > > > cause abnormal load on the ebt_do_table of the kernel ebtable module, leading > > > to excessive soft interrupts and sometimes even directly causing CPU soft > > > deadlocks. > > > > > > After analysis, it was found that the code of ebtables had not been optimized > > > for a long time, and the read-write locks inside still existed. However, other > > > arp/ip/ip6 tables had already been optimized a lot, and performance bottlenecks > > > in read-write locks had been discovered a long time ago. > > > > > > Ref link: https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/ > > > > > > So I referred to arp/ip/ip6 modification methods to optimize the read-write > > > lock in ebtables.c. > > > > What about doing RCU instead, faster and safer. > > Safer ? How so ? > > Stephen, we have used this stuff already in other netfilter components > since 2011 > > No performance issue at all. > I was thinking that lockdep and analysis tools do better job looking at RCU. Most likely, the number of users of ebtables was small enough that nobody looked hard at it until now.
在 25/9/2024 上午12:40, Stephen Hemminger 写道: > On Tue, 24 Sep 2024 15:46:17 +0200 > Eric Dumazet <edumazet@google.com> wrote: > >> On Tue, Sep 24, 2024 at 3:33 PM Stephen Hemminger >> <stephen@networkplumber.org> wrote: >>> On Tue, 24 Sep 2024 17:09:06 +0800 >>> yushengjin <yushengjin@uniontech.com> wrote: >>> >>>> When conducting WRK testing, the CPU usage rate of the testing machine was >>>> 100%. forwarding through a bridge, if the network load is too high, it may >>>> cause abnormal load on the ebt_do_table of the kernel ebtable module, leading >>>> to excessive soft interrupts and sometimes even directly causing CPU soft >>>> deadlocks. >>>> >>>> After analysis, it was found that the code of ebtables had not been optimized >>>> for a long time, and the read-write locks inside still existed. However, other >>>> arp/ip/ip6 tables had already been optimized a lot, and performance bottlenecks >>>> in read-write locks had been discovered a long time ago. >>>> >>>> Ref link: https://lore.kernel.org/lkml/20090428092411.5331c4a1@nehalam/ >>>> >>>> So I referred to arp/ip/ip6 modification methods to optimize the read-write >>>> lock in ebtables.c. >>> What about doing RCU instead, faster and safer. >> Safer ? How so ? >> >> Stephen, we have used this stuff already in other netfilter components >> since 2011 >> >> No performance issue at all. >> > I was thinking that lockdep and analysis tools do better job looking at RCU. > Most likely, the number of users of ebtables was small enough that nobody looked > hard at it until now. Even though there are few users of ebtables, there are still serious issues. 
This is the data running on the arm Kunpeng-920 (96 cpus) machine. When I only run wrk tests, the softirq of the system will rapidly increase to 25%: 02:50:07 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 02:50:25 PM all 0.00 0.00 0.05 0.00 0.72 23.20 0.00 0.00 0.00 76.03 02:50:26 PM all 0.00 0.00 0.08 0.00 0.72 24.53 0.00 0.00 0.00 74.67 02:50:27 PM all 0.01 0.00 0.13 0.00 0.75 24.89 0.00 0.00 0.00 74.23 If ebtables queries, updates, and other operations are continuously executed at this time, softirq will increase again to 50%: 02:52:23 PM all 0.00 0.00 1.18 0.00 0.54 48.91 0.00 0.00 0.00 49.36 02:52:24 PM all 0.00 0.00 1.19 0.00 0.43 48.23 0.00 0.00 0.00 50.15 02:52:25 PM all 0.00 0.00 1.20 0.00 0.50 48.29 0.00 0.00 0.00 50.01 More seriously, soft lockup may occur: Message from syslogd@localhost at Sep 25 14:52:22 ... kernel:watchdog: BUG: soft lockup - CPU#88 stuck for 23s! [ebtables:3896] So I think the soft lockup is even more unbearable than the performance loss.
© 2016 - 2024 Red Hat, Inc.