[PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl

lvxiafei posted 1 patch 1 month ago
There is a newer version of this series
include/net/netns/conntrack.h           |  1 +
net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c |  7 ++++---
3 files changed, 12 insertions(+), 8 deletions(-)
[PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
From: lvxiafei <lvxiafei@sensetime.com>

Support nf_conntrack_max settings in different netns,
nf_conntrack_max is used to more flexibly limit the
ct_count in different netns, which may be greater than
the value in the parent namespace. The default value
belongs to the global (ancestral) limit and no implicit
limit is inherited from the parent namespace.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 include/net/netns/conntrack.h           |  1 +
 net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
 net/netfilter/nf_conntrack_standalone.c |  7 ++++---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	u8			sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..77c9c01c7278 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = nf_conntrack_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1
Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> From: lvxiafei <lvxiafei@sensetime.com>
> 
> Support nf_conntrack_max settings in different netns,
> nf_conntrack_max is used to more flexibly limit the
> ct_count in different netns, which may be greater than
> the value in the parent namespace. The default value
> belongs to the global (ancestral) limit and no implicit
> limit is inherited from the parent namespace.

That seems the wrong thing to do.
There must be some way to limit the netns conntrack usage.

Whats the actual intent here?

You could apply max = min(init_net->max, net->max)
Or, you could relax it as long as netns are owned
by initial user ns, I guess.

Or perhaps its possible to make a guesstimate of
the maximum memory needed by the new limit, then
account that to memcg (at sysctl change time), and
reject if memcg is exhausted.

No other ideas at the moment, but I do not like the
"no limits" approach.
Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
On Tue, 8 Apr 2025 11:58:54 Florian Westphal <fw@strlen.de> wrote:
> That seems the wrong thing to do.
> There must be some way to limit the netns conntrack usage.
>
> Whats the actual intent here?
>
> You could apply max = min(init_net->max, net->max)
> Or, you could relax it as long as netns are owned
> by initial user ns, I guess.
>
> Or perhaps its possible to make a guesstimate of
> the maximum memory needed by the new limit, then
> account that to memcg (at sysctl change time), and
> reject if memcg is exhausted.
>
> No other ideas at the moment, but I do not like the
> "no limits" approach.

The original nf_conntrack_max is a global variable.
Modification will affect the connection tracking
limit in other netns, and the maximum memory
consumption = number of netns * nf_conntrack_max

This modification can make nf_conntrack_max support
the netns level to set the size of the connection
tracking table, and more flexibly limit the connection
tracking of each netns. For example, the initial user ns
has a default value (=max_factor*nf_conntrack_htable_size).
The nf_conntrack_max when netns 1 and netns 2 are created
is the same as the nf_conntrack_max in the initial user ns.
You can set it to netns 1 1k and netns 2 2k without
affecting each other.

If you are worried that different netns may exceed the
initial user limit and memory limit when setting,
apply max = min(init_net->max, net->max), the value in
netns is not greater than init_net->max, and the new
maximum memory consumption <= the original maximum memory
consumption, which limits memory consumption to a certain
extent. However, this will bring several problems:

1. Do not allow nf_conntrack_max in other netns to be greater
than nf_conntrack_max of the initial user. For example, when
other netns carry north-south traffic, the actual number of
connection tracking is greater than that of the initial user.

2. If nf_conntrack_max of the initial user is increased, the
maximum memory consumption will inevitably increase by n copies

3. If nf_conntrack_max of the initial user is reduced, will
the existing connections in other netns be affected?
Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> This modification can make nf_conntrack_max support
> the netns level to set the size of the connection
> tracking table, and more flexibly limit the connection
> tracking of each netns. For example, the initial user ns
> has a default value (=max_factor*nf_conntrack_htable_size).
> The nf_conntrack_max when netns 1 and netns 2 are created
> is the same as the nf_conntrack_max in the initial user ns.
> You can set it to netns 1 1k and netns 2 2k without
> affecting each other.

Netns 2 can also set it to 2**31 and cause machine go OOM.

> If you are worried that different netns may exceed the
> initial user limit and memory limit when setting,
> apply max = min(init_net->max, net->max), the value in
> netns is not greater than init_net->max, and the new
> maximum memory consumption <= the original maximum memory
> consumption, which limits memory consumption to a certain
> extent.

That was one of the suggestions that I see how one could have
tunable pernet variable without allowing netns2 go haywire.

> However, this will bring several problems:
> 
> 1. Do not allow nf_conntrack_max in other netns to be greater
> than nf_conntrack_max of the initial user. For example, when
> other netns carry north-south traffic, the actual number of
> connection tracking is greater than that of the initial user.

Sure.

> 2. If nf_conntrack_max of the initial user is increased, the
> maximum memory consumption will inevitably increase by n copies

How is that different to current state of affairs?

> 3. If nf_conntrack_max of the initial user is reduced, will
> the existing connections in other netns be affected?

No, but new ones will be blocked until its below init_net limit again.
Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
On 2025-04-08 13:28 Florian Westphal <fw@strlen.de> wrote:
> That was one of the suggestions that I see how one could have
> tunable pernet variable without allowing netns2 go haywire.

Yes, After net.netfilter.nf_conntrack_max is set in different
netns, it should be designed to not be allowed to be larger
than the global (ancestor) limit when working.