[PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl

lvxiafei posted 1 patch 1 month ago
There is a newer version of this series
include/net/netns/conntrack.h           |  1 +
net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c |  5 +++--
3 files changed, 11 insertions(+), 7 deletions(-)
[PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
From: lvxiafei <lvxiafei@sensetime.com>

Support per-netns net.netfilter.nf_conntrack_max settings.
net.netfilter.nf_conntrack_max can then be used to limit
the ct_count of each netns more flexibly. The default value
is taken from the global (ancestral) limit, and no implicit
limit is inherited from the parent namespace.

When net.netfilter.nf_conntrack_max is set in a netns, the
effective limit is still capped by the global (ancestral)
limit net.nf_conntrack_max: the lower of the two applies.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 include/net/netns/conntrack.h           |  1 +
 net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
 net/netfilter/nf_conntrack_standalone.c |  5 +++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	u8			sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..4116c2f2b57f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = min(nf_conntrack_max, net->ct.sysctl_max) / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..4a073c4de1b7 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = nf_conntrack_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> -	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +	if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
>  		if (!early_drop(net, hash)) {
>  			if (!conntrack_gc_work.early_drop)
>  				conntrack_gc_work.early_drop = true;
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..4a073c4de1b7 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
>  static struct ctl_table nf_ct_sysctl_table[] = {
>  	[NF_SYSCTL_CT_MAX] = {
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,

What's the function of nf_conntrack_max?
After this change it's always 0?
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
Florian Westphal <fw@strlen.de> wrote:
> Whats the function of nf_conntrack_max?
> After this change its always 0?

nf_conntrack_max is a global (ancestor) limit, by default
nf_conntrack_max = max_factor * nf_conntrack_htable_size.

init_net.ct.sysctl_max is a parameter for each netns, and
setting it will not affect the value of nf_conntrack_max.
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > Whats the function of nf_conntrack_max?
> > After this change its always 0?
> 
> nf_conntrack_max is a global (ancestor) limit, by default
> nf_conntrack_max = max_factor * nf_conntrack_htable_size.

Argh.

net.netfilter.nf_conntrack_max
is replaced by init_net.nf_conntrack_max in your patch.

But not net.nf_conntrack_max, so they are now different and not
related at all anymore except that the latter overrides the former
even in init_net.

I'm not sure this is sane.  And it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst

in any case.

Also:

-       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {


... can't be right, this allows a 0 setting in the netns.
So, setting 0 in non-init-net must be disallowed.

I suggest to remove nf_conntrack_max as a global variable,
make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
so in the init_net both sysctls remain the same.

Then, change __nf_conntrack_alloc() to do:

unsigned int nf_conntrack_max = min(net->ct.sysctl_max, init_net.ct.sysctl_max);

and leave the if-condition as is, i.e.:

if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...

It means:
each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).

When a new conntrack is allocated, then:

If the limit in the init_net is lower than the netns, then
that limit applies, so it provides upper cap.

If the limit in the init_net is higher, the lower pernet limit
is applied.

If the init_net has 0 setting, no limit is applied.

This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.

Or, alternatively, try the other suggestion I made
(memcg charge at sysctl change time,
 https://lore.kernel.org/netfilter-devel/20250408095854.GB536@breakpoint.cc/).

Or come up with a better proposal.
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
Florian Westphal <fw@strlen.de> wrote:
> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.

The nf_conntrack_max global variable is a system calculated
value and should not be removed.
nf_conntrack_max = max_factor * nf_conntrack_htable_size;

> When a new conntrack is allocated, then:
>
> If the limit in the init_net is lower than the netns, then
> that limit applies, so it provides upper cap.
>
> If the limit in the init_net is higher, the lower pernet limit
> is applied.
>
> If the init_net has 0 setting, no limit is applied.

If the init_net has 0 setting, it should depend on the
limit of other netns.

The netns Limit Behavior:
+------------------------+--------------------+-----------------------+
| init_net.ct.sysctl_max | net->ct.sysctl_max | netns Limit Behavior  |
+------------------------+--------------------+-----------------------+
| 0                      | 0                  | No limit              |
+------------------------+--------------------+-----------------------+
| 0                      | Non-zero           | net->ct.sysctl_max    |
+------------------------+--------------------+-----------------------+
| Non-zero               | 0                  | init_net.ct.sysctl_max|
+------------------------+--------------------+-----------------------+
| Non-zero               | Non-zero           | min                   |
+------------------------+--------------------+-----------------------+

net_ct_sysctl_max = likely(a && b) ? min(a, b) : max(a, b);
or
net_ct_sysctl_max = unlikely(a == 0 || b == 0) ? max(a, b) : min(a, b);

if (net_ct_sysctl_max && unlikely(ct_count > net_ct_sysctl_max)) { ...
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > I suggest to remove nf_conntrack_max as a global variable,
> > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > so in the init_net both sysctls remain the same.
> 
> The nf_conntrack_max global variable is a system calculated
> value and should not be removed.
> nf_conntrack_max = max_factor * nf_conntrack_htable_size;

That's the default calculation for the initial sysctl value:

net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,

You can make an initial patch that replaces all occurrences of
nf_conntrack_max with cnet->sysctl_conntrack_max

(adding a 'unsigned int sysctl_conntrack_max' to struct
 nf_conntrack_net).

Then, in a second patch, remove the '0444' readonly and redirect
the child netns to use the copy in its own pernet area rather than the
init_net one.
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
Florian Westphal <fw@strlen.de> wrote:
> lvxiafei <xiafei_xupt@163.com> wrote:
> > Florian Westphal <fw@strlen.de> wrote:
> > > I suggest to remove nf_conntrack_max as a global variable,
> > > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > > so in the init_net both sysctls remain the same.
> > 
> > The nf_conntrack_max global variable is a system calculated
> > value and should not be removed.
> > nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> 
> Thats the default calculation for the initial sysctl value:
> 
> net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
> net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
> 
> You can make an initial patch that replaces all occurences of
> nf_conntrack_max with cnet->sysctl_conntrack_max

Something like this:

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -360,6 +359,11 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
 
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+	return net->ct.sysctl_conntrack_max;
+}
+
 int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
 int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 			   u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	unsigned int		sysctl_conntrack_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..8ae9c22cfcb3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 
@@ -1509,8 +1507,7 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+		nf_conntrack_max95 = nf_conntrack_max(&init_net) / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1648,13 +1645,14 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	unsigned int ct_count;
+	unsigned int ct_max, ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
+	ct_max = nf_conntrack_max(&init_net);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (ct_max && unlikely(ct_count > ct_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2650,7 +2648,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.sysctl_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index db23876a6016..f1938204b827 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 502cf10aab41..8a185dfd3261 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -944,7 +944,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
Florian Westphal <fw@strlen.de> wrote:

> > You can make an initial patch that replaces all occurences of
> > nf_conntrack_max with cnet->sysctl_conntrack_max
>
> Something like this:
> ...

Agreed, I can submit the changes later.
First of all, a patch should do one thing clearly,
which is convenient for maintainers to review.
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by lvxiafei 1 month ago
Florian Westphal <fw@strlen.de> wrote:
> net.netfilter.nf_conntrack_max
> is replaced by init_net.nf_conntrack_max in your patch.
>
> But not net.nf_conntrack_max, so they are now different and not
> related at all anymore except that the latter overrides the former
> even in init_net.
>
> I'm not sure this is sane.  And it needs an update to
> Documentation/networking/nf_conntrack-sysctl.rst

Yes, it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst.

in different netns,
net.netfilter.nf_conntrack_max = init_net.ct.sysctl_max;
the global (ancestral) limit,
net.nf_conntrack_max = nf_conntrack_max = max_factor * nf_conntrack_htable_size;

> in any case.
>
> Also:
>
> -       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
>
>
> ... can't be right, this allows a 0 setting in the netns.
> So, setting 0 in non-init-net must be disallowed.

Yes, setting 0 in non-init-net must be disallowed.

Should be used:
unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {

min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count.
At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
but it can be limited by nf_conntrack_max in different netns.

> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.
>
> Then, change __nf_conntrack_alloc() to do:
>
> unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max);
>
> and leave the if-condition as is, i.e.:
>
> if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...

Yes, each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).

Should be used:
unsigned int nf_conntrack_max = max(min(net->ct.sysctl_max, init_net.ct.sysctl_max), 0);

This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.
Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Posted by Florian Westphal 1 month ago
lvxiafei <xiafei_xupt@163.com> wrote:
> > in any case.
> >
> > Also:
> >
> > -       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> > +       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
> >
> >
> > ... can't be right, this allows a 0 setting in the netns.
> > So, setting 0 in non-init-net must be disallowed.
> 
> Yes, setting 0 in non-init-net must be disallowed.
> 
> Should be used:
> unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
> if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {

That would work.  An alternative, probably preferable, is to do
something like this:

@@ -615,10 +615,10 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_douintvec_minmax,
+               .extra1         = SYSCTL_ZERO, /* 0 == no limit */
        },
        [NF_SYSCTL_CT_COUNT] = {
                .procname       = "nf_conntrack_count",
@@ -1081,9 +1082,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)

        /* Don't allow non-init_net ns to alter global sysctls */
        if (!net_eq(&init_net, net)) {
                table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
                table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+
+               /* 0 means no limit, only allowed in init_net */
+               table[NF_SYSCTL_CT_MAX].extra1 = SYSCTL_ONE;
        }

That will make setting a 0 value illegal for non-init net case:

sysctl net.netfilter.nf_conntrack_max=0
sysctl: setting key "net.netfilter.nf_conntrack_max": Invalid argument

> min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
> At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
> but it can be limited by nf_conntrack_max in different netns.

Sounds good to me.