include/net/netns/conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 12 +++++++----- net/netfilter/nf_conntrack_standalone.c | 7 ++++--- 3 files changed, 12 insertions(+), 8 deletions(-)
From: lvxiafei <lvxiafei@sensetime.com>
Support nf_conntrack_max settings in different netns,
nf_conntrack_max is used to more flexibly limit the
ct_count in different netns, which may be greater than
the value in the parent namespace. The default value
belongs to the global (ancestral) limit and no implicit
limit is inherited from the parent namespace.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c | 7 ++++---
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ u8 sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..77c9c01c7278 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = nf_conntrack_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote: > From: lvxiafei <lvxiafei@sensetime.com> > > Support nf_conntrack_max settings in different netns, > nf_conntrack_max is used to more flexibly limit the > ct_count in different netns, which may be greater than > the value in the parent namespace. The default value > belongs to the global (ancestral) limit and no implicit > limit is inherited from the parent namespace. That seems the wrong thing to do. There must be some way to limit the netns conntrack usage. Whats the actual intent here? You could apply max = min(init_net->max, net->max) Or, you could relax it as long as netns are owned by initial user ns, I guess. Or perhaps its possible to make a guesstimate of the maximum memory needed by the new limit, then account that to memcg (at sysctl change time), and reject if memcg is exhausted. No other ideas at the moment, but I do not like the "no limits" approach.
On Tue, 8 Apr 2025 11:58:54 Florian Westphal <fw@strlen.de> wrote: > That seems the wrong thing to do. > There must be some way to limit the netns conntrack usage. > > Whats the actual intent here? > > You could apply max = min(init_net->max, net->max) > Or, you could relax it as long as netns are owned > by initial user ns, I guess. > > Or perhaps its possible to make a guesstimate of > the maximum memory needed by the new limit, then > account that to memcg (at sysctl change time), and > reject if memcg is exhausted. > > No other ideas at the moment, but I do not like the > "no limits" approach. The original nf_conntrack_max is a global variable. Modification will affect the connection tracking limit in other netns, and the maximum memory consumption = number of netns * nf_conntrack_max This modification can make nf_conntrack_max support the netns level to set the size of the connection tracking table, and more flexibly limit the connection tracking of each netns. For example, the initial user ns has a default value (=max_factor*nf_conntrack_htable_size). The nf_conntrack_max when netns 1 and netns 2 are created is the same as the nf_conntrack_max in the initial user ns. You can set it to netns 1 1k and netns 2 2k without affecting each other. If you are worried that different netns may exceed the initial user limit and memory limit when setting, apply max = min(init_net->max, net->max), the value in netns is not greater than init_net->max, and the new maximum memory consumption <= the original maximum memory consumption, which limits memory consumption to a certain extent. However, this will bring several problems: 1. Do not allow nf_conntrack_max in other netns to be greater than nf_conntrack_max of the initial user. For example, when other netns carry north-south traffic, the actual number of connection tracking is greater than that of the initial user. 2. If nf_conntrack_max of the initial user is increased, the maximum memory consumption will inevitably increase by n copies 3. If nf_conntrack_max of the initial user is reduced, will the existing connections in other netns be affected?
lvxiafei <xiafei_xupt@163.com> wrote: > This modification can make nf_conntrack_max support > the netns level to set the size of the connection > tracking table, and more flexibly limit the connection > tracking of each netns. For example, the initial user ns > has a default value (=max_factor*nf_conntrack_htable_size). > The nf_conntrack_max when netns 1 and netns 2 are created > is the same as the nf_conntrack_max in the initial user ns. > You can set it to netns 1 1k and netns 2 2k without > affecting each other. Netns 2 can also set it to 2**31 and cause machine go OOM. > If you are worried that different netns may exceed the > initial user limit and memory limit when setting, > apply max = min(init_net->max, net->max), the value in > netns is not greater than init_net->max, and the new > maximum memory consumption <= the original maximum memory > consumption, which limits memory consumption to a certain > extent. That was one of the suggestions that I see how one could have tunable pernet variable without allowing netns2 go haywire. > However, this will bring several problems: > > 1. Do not allow nf_conntrack_max in other netns to be greater > than nf_conntrack_max of the initial user. For example, when > other netns carry north-south traffic, the actual number of > connection tracking is greater than that of the initial user. Sure. > 2. If nf_conntrack_max of the initial user is increased, the > maximum memory consumption will inevitably increase by n copies How is that different to current state of affairs? > 3. If nf_conntrack_max of the initial user is reduced, will > the existing connections in other netns be affected? No, but new ones will be blocked until its below init_net limit again.
On 2025-04-08 13:28 Florian Westphal <fw@strlen.de> wrote: > That was one of the suggestions that I see how one could have > tunable pernet variable without allowing netns2 go haywire. Yes, After net.netfilter.nf_conntrack_max is set in different netns, it should be designed to not be allowed to be larger than the global (ancestor) limit when working.
© 2016 - 2025 Red Hat, Inc.