include/net/netns/conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 12 +++++++----- net/netfilter/nf_conntrack_standalone.c | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-)
From: lvxiafei <lvxiafei@sensetime.com>
Support per-netns net.netfilter.nf_conntrack_max settings.
net.netfilter.nf_conntrack_max can now be used to limit
ct_count more flexibly in each netns. The default value of
a new netns is taken from the global (ancestral) limit; no
implicit limit is inherited from the parent
namespace.
After net.netfilter.nf_conntrack_max is set in a netns,
the effective limit still cannot exceed the global
(ancestral) limit net.nf_conntrack_max: the smaller of
the two values is the one enforced.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c | 5 +++--
3 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ u8 sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..4116c2f2b57f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = min(nf_conntrack_max, net->ct.sysctl_max) / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..4a073c4de1b7 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = nf_conntrack_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote: > - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { > + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) { > if (!early_drop(net, hash)) { > if (!conntrack_gc_work.early_drop) > conntrack_gc_work.early_drop = true; > diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c > index 2f666751c7e7..4a073c4de1b7 100644 > --- a/net/netfilter/nf_conntrack_standalone.c > +++ b/net/netfilter/nf_conntrack_standalone.c > @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index { > static struct ctl_table nf_ct_sysctl_table[] = { > [NF_SYSCTL_CT_MAX] = { > .procname = "nf_conntrack_max", > - .data = &nf_conntrack_max, > + .data = &init_net.ct.sysctl_max, Whats the function of nf_conntrack_max? After this change its always 0?
Florian Westphal <fw@strlen.de> wrote: > Whats the function of nf_conntrack_max? > After this change its always 0? nf_conntrack_max is a global (ancestor) limit, by default nf_conntrack_max = max_factor * nf_conntrack_htable_size. init_net.ct.sysctl_max is a parameter for each netns, and setting it will not affect the value of nf_conntrack_max.
lvxiafei <xiafei_xupt@163.com> wrote: > Florian Westphal <fw@strlen.de> wrote: > > Whats the function of nf_conntrack_max? > > After this change its always 0? > > nf_conntrack_max is a global (ancestor) limit, by default > nf_conntrack_max = max_factor * nf_conntrack_htable_size. Argh. net.netfilter.nf_conntrack_max is replaced by init_net.nf_conntrack_max in your patch. But not net.nf_conntrack_max, so they are now different and not related at all anymore except that the latter overrides the former even in init_net. I'm not sure this is sane. And it needs an update to Documentation/networking/nf_conntrack-sysctl.rst in any case. Also: - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) { ... can't be right, this allows a 0 setting in the netns. So, setting 0 in non-init-net must be disallowed. I suggest to remove nf_conntrack_max as a global variable, make net.nf_conntrack_max use init_net.nf_conntrack_max too internally, so in the init_net both sysctls remain the same. Then, change __nf_conntrack_alloc() to do: unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max); and leave the if-condition as is, i.e.: if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ... It means: each netns can pick an arbitrary value (but not 0, this ability needs to be removed). When a new conntrack is allocated, then: If the limit in the init_net is lower than the netns, then that limit applies, so it provides upper cap. If the limit in the init_net is higher, the lower pernet limit is applied. If the init_net has 0 setting, no limit is applied. This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst to explain the restrictions. Or, alternative, try the other suggestion I made (memcg charge at sysctl change time, https://lore.kernel.org/netfilter-devel/20250408095854.GB536@breakpoint.cc/). 
Or come up with a better proposal.
Florian Westphal <fw@strlen.de> wrote: > I suggest to remove nf_conntrack_max as a global variable, > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally, > so in the init_net both sysctls remain the same. The nf_conntrack_max global variable is a system calculated value and should not be removed. nf_conntrack_max = max_factor * nf_conntrack_htable_size; > When a new conntrack is allocated, then: > > If the limit in the init_net is lower than the netns, then > that limit applies, so it provides upper cap. > > If the limit in the init_net is higher, the lower pernet limit > is applied. > > If the init_net has 0 setting, no limit is applied. If the init_net has 0 setting, it should depend on the limit of other netns. The netns Limit Behavior: +------------------------+--------------------+-----------------------+ | init_net.ct.sysctl_max | net->ct.sysctl_max | netns Limit Behavior | +------------------------+--------------------+-----------------------+ | 0 | 0 | No limit | +------------------------+--------------------+-----------------------+ | 0 | Non-zero | net->ct.sysctl_max | +------------------------+--------------------+-----------------------+ | Non-zero | 0 | init_net.ct.sysctl_max| +------------------------+--------------------+-----------------------+ | Non-zero | Non-zero | min | +------------------------+--------------------+-----------------------+ net_ct_sysctl_max = likely(a && b) ? min(a, b) : max(a, b); or net_ct_sysctl_max = unlikely(a == 0 || b == 0) ? max(a, b) : min(a, b); if (net_ct_sysctl_max && unlikely(ct_count > net_ct_sysctl_max)) { ...
lvxiafei <xiafei_xupt@163.com> wrote: > Florian Westphal <fw@strlen.de> wrote: > > I suggest to remove nf_conntrack_max as a global variable, > > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally, > > so in the init_net both sysctls remain the same. > > The nf_conntrack_max global variable is a system calculated > value and should not be removed. > nf_conntrack_max = max_factor * nf_conntrack_htable_size; That's the default calculation for the initial sysctl value: net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max, net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max, You can make an initial patch that replaces all occurrences of nf_conntrack_max with cnet->sysctl_conntrack_max (adding an 'unsigned int sysctl_conntrack_max' to struct nf_conntrack_net). Then, in a second patch, remove the '0444' readonly and redirect the child netns to use the copy in its own pernet area rather than the init_net one.
Florian Westphal <fw@strlen.de> wrote: > lvxiafei <xiafei_xupt@163.com> wrote: > > Florian Westphal <fw@strlen.de> wrote: > > > I suggest to remove nf_conntrack_max as a global variable, > > > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally, > > > so in the init_net both sysctls remain the same. > > > > The nf_conntrack_max global variable is a system calculated > > value and should not be removed. > > nf_conntrack_max = max_factor * nf_conntrack_htable_size; > > Thats the default calculation for the initial sysctl value: > > net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max, > net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max, > > You can make an initial patch that replaces all occurences of > nf_conntrack_max with cnet->sysctl_conntrack_max Something like this: diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; -extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ static inline void @@ -360,6 +359,11 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) return net_generic(net, nf_conntrack_net_id); } +static inline unsigned int nf_conntrack_max(const struct net *net) +{ + return net->ct.sysctl_conntrack_max; +} + int nf_ct_skb_network_trim(struct sk_buff *skb, int family); int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u16 zone, u8 family, u8 *proto, u16 *mru); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -102,6 +102,7 @@ struct netns_ct { u8 sysctl_acct; u8 sysctl_tstamp; u8 sysctl_checksum; + 
unsigned int sysctl_conntrack_max; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7f8b245e287a..8ae9c22cfcb3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void) unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); -unsigned int nf_conntrack_max __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; static siphash_aligned_key_t nf_conntrack_hash_rnd; @@ -1509,8 +1507,7 @@ static void gc_worker(struct work_struct *work) gc_work = container_of(work, struct conntrack_gc_work, dwork.work); i = gc_work->next_bucket; - if (gc_work->early_drop) - nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; + nf_conntrack_max95 = nf_conntrack_max(&init_net) / 100u * 95u; if (i == 0) { gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; @@ -1648,13 +1645,14 @@ __nf_conntrack_alloc(struct net *net, gfp_t gfp, u32 hash) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - unsigned int ct_count; + unsigned int ct_max, ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); + ct_max = nf_conntrack_max(&init_net); - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { + if (ct_max && unlikely(ct_count > ct_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; @@ -2650,7 +2648,7 @@ int nf_conntrack_init_start(void) if (!nf_conntrack_hash) return -ENOMEM; - nf_conntrack_max = max_factor * nf_conntrack_htable_size; + init_net.ct.sysctl_conntrack_max = max_factor * nf_conntrack_htable_size; nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), diff --git a/net/netfilter/nf_conntrack_netlink.c 
b/net/netfilter/nf_conntrack_netlink.c index db23876a6016..f1938204b827 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; - if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max))) + if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net)))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 502cf10aab41..8a185dfd3261 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index { static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", - .data = &nf_conntrack_max, + .data = &init_net.ct.sysctl_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -944,7 +944,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", - .data = &nf_conntrack_max, + .data = &init_net.ct.sysctl_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec,
Florian Westphal <fw@strlen.de> wrote: > > You can make an initial patch that replaces all occurences of > > nf_conntrack_max with cnet->sysctl_conntrack_max > > Something like this: > ... Agreed, I can submit the changes later. First of all, a patch should do one thing clearly, which is convenient for maintainers to review.
Florian Westphal <fw@strlen.de> wrote: > net.netfilter.nf_conntrack_max > is replaced by init_net.nf_conntrack_max in your patch. > > But not net.nf_conntrack_max, so they are now different and not > related at all anymore except that the latter overrides the former > even in init_net. > > I'm not sure this is sane. And it needs an update to > Documentation/networking/nf_conntrack-sysctl.rst Yes, it needs an update to Documentation/networking/nf_conntrack-sysctl.rst. in different netns, net.netfilter.nf_conntrack_max = init_net.ct.sysctl_max; the global (ancestral) limit, net.nf_conntrack_max = nf_conntrack_max = max_factor * nf_conntrack_htable_size; > in any case. > > Also: > > - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { > + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) { > > > ... can't be right, this allows a 0 setting in the netns. > So, setting 0 in non-init-net must be disallowed. Yes, setting 0 in non-init-net must be disallowed. Should be used: unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0); if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) { min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count At the same time, when net->ct.sysctl_max == 0, the original intention is no limit, but it can be limited by nf_conntrack_max in different netns. > I suggest to remove nf_conntrack_max as a global variable, > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally, > so in the init_net both sysctls remain the same. > > Then, change __nf_conntrack_alloc() to do: > > unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max); > > and leave the if-condition as is, i.e.: > > if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ... Yes, each netns can pick an arbitrary value (but not 0, this ability needs to be removed). 
Should be used: unsigned int nf_conntrack_max = max(min(net->ct.sysctl_max, init_net.ct.sysctl_max), 0); This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst to explain the restrictions.
lvxiafei <xiafei_xupt@163.com> wrote: > > in any case. > > > > Also: > > > > - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { > > + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) { > > > > > > ... can't be right, this allows a 0 setting in the netns. > > So, setting 0 in non-init-net must be disallowed. > > Yes, setting 0 in non-init-net must be disallowed. > > Should be used: > unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0); > if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) { That would work. Alternative, probably preferrable, is to do something like this: @@ -615,10 +615,10 @@ enum nf_ct_sysctl_index { static struct ctl_table nf_ct_sysctl_table[] = { - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, /* 0 == no limit */ }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -1081,9 +1082,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) /* Don't allow non-init_net ns to alter global sysctls */ if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + + /* 0 means no limit, only allowed in init_net */ + table[NF_SYSCTL_CT_MAX].extra1 = SYSCTL_ONE; } That will make setting a 0 value illegal for non-init net case: sysctl net.netfilter.nf_conntrack_max=0 sysctl: setting key "net.netfilter.nf_conntrack_max": Invalid argument > min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count > At the same time, when net->ct.sysctl_max == 0, the original intention is no limit, > but it can be limited by nf_conntrack_max in different netns. Sounds good to me.
© 2016 - 2025 Red Hat, Inc.