include/net/netns/conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 12 +++++++----- net/netfilter/nf_conntrack_standalone.c | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-)
From: lvxiafei <lvxiafei@sensetime.com>
The modification of nf_conntrack_max in one netns
should not affect the value in another one.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c | 6 ++++--
3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ u8 sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5ac1893e9045 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1139,6 +1140,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = nf_conntrack_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote:
> The modification of nf_conntrack_max in one netns
> should not affect the value in another one.
nf_conntrack_max can only be changed in init_net.
Given the check isn't removed:
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
table[NF_SYSCTL_CT_MAX].mode = 0444;
... this patch seems untested?
But, removing this check would allow any netns to consume
arbitrary amount of kernel memory.
How do you prevent this?
On Monday 2025-04-07 12:13, Florian Westphal wrote:
>lvxiafei <xiafei_xupt@163.com> wrote:
>> The modification of nf_conntrack_max in one netns
>> should not affect the value in another one.
>
>nf_conntrack_max can only be changed in init_net.
>
>Given the check isn't removed:
> /* Don't allow non-init_net ns to alter global sysctls */
> if (!net_eq(&init_net, net)) {
> table[NF_SYSCTL_CT_MAX].mode = 0444;
>
>... this patch seems untested?
>
>But, removing this check would allow any netns to consume
>arbitrary amount of kernel memory.
>
>How do you prevent this?
Yes, this check needs to be deleted
All netns share the original nf_conntrack_max. The nf_conntrack_max
limit does not limit the total ct_count of all netns. When it is changed
to a netns-level parameter, the default value is the same as the original
default value (=max_factor*nf_conntrack_htable_size), which is a global
(ancestral) limit, and kernel memory consumption is not affected.
On Monday 2025-04-07 12:13, Florian Westphal wrote:
>lvxiafei <xiafei_xupt@163.com> wrote:
>> The modification of nf_conntrack_max in one netns
>> should not affect the value in another one.
>
>nf_conntrack_max can only be changed in init_net.
>
>Given the check isn't removed:
> /* Don't allow non-init_net ns to alter global sysctls */
> if (!net_eq(&init_net, net)) {
> table[NF_SYSCTL_CT_MAX].mode = 0444;
>
>... this patch seems untested?
>
>But, removing this check would allow any netns to consume
>arbitrary amount of kernel memory.
>
>How do you prevent this?
By inheriting an implicit limit from the parent namespace somehow.
For example, even if you set the kernel.pid_max sysctl in the initial
namespace to something like 9999, subordinate namespace have
kernel.pid_max=4million again, but nevertheless are unable to use
more than 9999 PIDs. Or so the documentation
from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
(I did not try to create so many processes by myself).
A similar logic would have to be applied for netfilter sysctls
if they are made modifiable in subordinate namespaces.
On Mon, 7 Apr 2025 12:56:33 Jan Engelhardt <ej@inai.de> wrote:
> By inheriting an implicit limit from the parent namespace somehow.
> For example, even if you set the kernel.pid_max sysctl in the initial
> namespace to something like 9999, subordinate namespace have
> kernel.pid_max=4million again, but nevertheless are unable to use
> more than 9999 PIDs. Or so documentation the documentation
> from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
> (I did not try to create so many processes by myself).
>
> A similar logic would have to be applied for netfilter sysctls
> if they are made modifiable in subordinate namespaces.
The patch is to use nf_conntrack_max to more flexibly limit the ct_count
in different netns, which may be greater than the parent namespace,
belonging to the global (ancestral) limit, and there is no implicit limit
inherited from the parent namespace.
On Mon, 7 Apr 2025 12:56:33 Jan Engelhardt <ej@inai.de> wrote:
> By inheriting an implicit limit from the parent namespace somehow.
> For example, even if you set the kernel.pid_max sysctl in the initial
> namespace to something like 9999, subordinate namespace have
> kernel.pid_max=4million again, but nevertheless are unable to use
> more than 9999 PIDs. Or so documentation the documentation
> from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
> (I did not try to create so many processes by myself).
>
> A similar logic would have to be applied for netfilter sysctls
> if they are made modifiable in subordinate namespaces.
The patch is to use nf_conntrack_max to more flexibly limit the ct_count
in different netns, which may be greater than the parent namespace,
belonging to the global (ancestral) limit, and there is no implicit limit
inherited from the parent namespace.
From: lvxiafei <lvxiafei@sensetime.com>
Support net.netfilter.nf_conntrack_max settings per
netns, net.netfilter.nf_conntrack_max is used to more
flexibly limit the ct_count in different netns. The
default value belongs to the init_net limit.
After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the init_net
limit when working.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
.../networking/nf_conntrack-sysctl.rst | 29 +++++++++++++++----
include/net/netfilter/nf_conntrack.h | 12 +++++++-
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 19 ++++++------
net/netfilter/nf_conntrack_netlink.c | 2 +-
net/netfilter/nf_conntrack_standalone.c | 7 +++--
6 files changed, 50 insertions(+), 20 deletions(-)
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..6e7f17f5959a 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
Log invalid packets of a type specified by value.
nf_conntrack_max - INTEGER
- Maximum number of allowed connection tracking entries. This value is set
- to nf_conntrack_buckets by default.
- Note that connection tracking entries are added to the table twice -- once
- for the original direction and once for the reply direction (i.e., with
- the reversed address). This means that with default settings a maxed-out
- table will have a average hash chain length of 2, not 1.
+ - 0 - disabled (unlimited)
+ - not 0 - enabled
+
+ Maximum number of allowed connection tracking entries per netns. This value
+ is set to nf_conntrack_buckets by default.
+
+ Note that connection tracking entries are added to the table twice -- once
+ for the original direction and once for the reply direction (i.e., with
+ the reversed address). This means that with default settings a maxed-out
+ table will have a average hash chain length of 2, not 1.
+
+ The limit of other netns cannot be greater than init_net netns.
+ +----------------+-------------+----------------+
+ | init_net netns | other netns | limit behavior |
+ +----------------+-------------+----------------+
+ | 0 | 0 | unlimited |
+ +----------------+-------------+----------------+
+ | 0 | not 0 | other |
+ +----------------+-------------+----------------+
+ | not 0 | 0 | init_net |
+ +----------------+-------------+----------------+
+ | not 0 | not 0 | min |
+ +----------------+-------------+----------------+
nf_conntrack_tcp_be_liberal - BOOLEAN
- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..594439b2f5a1 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
/* must be called with rcu read lock held */
static inline void
@@ -360,6 +359,17 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
return net_generic(net, nf_conntrack_net_id);
}
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+ min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+ max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+#else
+ return 0;
+#endif
+}
+
int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..d3fcd0b92b2d 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ unsigned int sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
gfp_t gfp, u32 hash)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- unsigned int ct_count;
+ unsigned int ct_max, ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
+ ct_max = nf_conntrack_max(net);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (ct_max && unlikely(ct_count > ct_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
if (!nf_conntrack_hash)
return -ENOMEM;
- nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
- if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
goto nla_put_failure;
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5db6df0e4eb3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = init_net.ct.sysctl_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
On Tue, Apr 15, 2025 at 05:08:34PM +0800, lvxiafei wrote:
> diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
> index 238b66d0e059..6e7f17f5959a 100644
> --- a/Documentation/networking/nf_conntrack-sysctl.rst
> +++ b/Documentation/networking/nf_conntrack-sysctl.rst
> @@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
> Log invalid packets of a type specified by value.
>
> nf_conntrack_max - INTEGER
> - Maximum number of allowed connection tracking entries. This value is set
> - to nf_conntrack_buckets by default.
> - Note that connection tracking entries are added to the table twice -- once
> - for the original direction and once for the reply direction (i.e., with
> - the reversed address). This means that with default settings a maxed-out
> - table will have a average hash chain length of 2, not 1.
> + - 0 - disabled (unlimited)
unlimited is too much, and the number of buckets is also global, how
does this work?
Is your goal to allow a netns to have a larger table than init_net? There
should be a cap for this.
> + - not 0 - enabled
> +
> + Maximum number of allowed connection tracking entries per netns. This value
> + is set to nf_conntrack_buckets by default.
> +
> + Note that connection tracking entries are added to the table twice -- once
> + for the original direction and once for the reply direction (i.e., with
> + the reversed address). This means that with default settings a maxed-out
> + table will have a average hash chain length of 2, not 1.
> +
> + The limit of other netns cannot be greater than init_net netns.
> + +----------------+-------------+----------------+
> + | init_net netns | other netns | limit behavior |
> + +----------------+-------------+----------------+
> + | 0 | 0 | unlimited |
> + +----------------+-------------+----------------+
> + | 0 | not 0 | other |
> + +----------------+-------------+----------------+
> + | not 0 | 0 | init_net |
> + +----------------+-------------+----------------+
> + | not 0 | not 0 | min |
> + +----------------+-------------+----------------+
>
> nf_conntrack_tcp_be_liberal - BOOLEAN
> - 0 - disabled (default)
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 3f02a45773e8..594439b2f5a1 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
> extern struct hlist_nulls_head *nf_conntrack_hash;
> extern unsigned int nf_conntrack_htable_size;
> extern seqcount_spinlock_t nf_conntrack_generation;
> -extern unsigned int nf_conntrack_max;
>
> /* must be called with rcu read lock held */
> static inline void
> @@ -360,6 +359,17 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
> return net_generic(net, nf_conntrack_net_id);
> }
>
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> + return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> + min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> + max(init_net.ct.sysctl_max, net->ct.sysctl_max);
> +#else
> + return 0;
> +#endif
> +}
> +
> int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
> int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
> u16 zone, u8 family, u8 *proto, u16 *mru);
> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> index bae914815aa3..d3fcd0b92b2d 100644
> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -102,6 +102,7 @@ struct netns_ct {
> u8 sysctl_acct;
> u8 sysctl_tstamp;
> u8 sysctl_checksum;
> + unsigned int sysctl_max;
>
> struct ip_conntrack_stat __percpu *stat;
> struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 7f8b245e287a..a738564923ec 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
> unsigned int nf_conntrack_htable_size __read_mostly;
> EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
>
> -unsigned int nf_conntrack_max __read_mostly;
> -EXPORT_SYMBOL_GPL(nf_conntrack_max);
> seqcount_spinlock_t nf_conntrack_generation __read_mostly;
> static siphash_aligned_key_t nf_conntrack_hash_rnd;
>
> @@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
>
> static void gc_worker(struct work_struct *work)
> {
> - unsigned int i, hashsz, nf_conntrack_max95 = 0;
> + unsigned int i, hashsz;
> u32 end_time, start_time = nfct_time_stamp;
> struct conntrack_gc_work *gc_work;
> unsigned int expired_count = 0;
> @@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
> gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
>
> i = gc_work->next_bucket;
> - if (gc_work->early_drop)
> - nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
>
> if (i == 0) {
> gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
> @@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
> }
>
> hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
> + unsigned int nf_conntrack_max95 = 0;
> struct nf_conntrack_net *cnet;
> struct net *net;
> long expires;
> @@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
> expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
> expires = (expires - (long)next_run) / ++count;
> next_run += expires;
> + net = nf_ct_net(tmp);
> +
> + if (gc_work->early_drop)
> + nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
>
> if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
> continue;
>
> - net = nf_ct_net(tmp);
> cnet = nf_ct_pernet(net);
> if (atomic_read(&cnet->count) < nf_conntrack_max95)
> continue;
> @@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
> gfp_t gfp, u32 hash)
> {
> struct nf_conntrack_net *cnet = nf_ct_pernet(net);
> - unsigned int ct_count;
> + unsigned int ct_max, ct_count;
> struct nf_conn *ct;
>
> /* We don't want any race condition at early drop stage */
> ct_count = atomic_inc_return(&cnet->count);
> + ct_max = nf_conntrack_max(net);
>
> - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> + if (ct_max && unlikely(ct_count > ct_max)) {
> if (!early_drop(net, hash)) {
> if (!conntrack_gc_work.early_drop)
> conntrack_gc_work.early_drop = true;
> @@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
> if (!nf_conntrack_hash)
> return -ENOMEM;
>
> - nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> + init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
>
> nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
> sizeof(struct nf_conn),
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 2cc0fde23344..73e6bb1e939b 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
> if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
> goto nla_put_failure;
>
> - if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
> + if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
> goto nla_put_failure;
>
> nlmsg_end(skb, nlh);
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..5db6df0e4eb3 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
> static struct ctl_table nf_ct_sysctl_table[] = {
> [NF_SYSCTL_CT_MAX] = {
> .procname = "nf_conntrack_max",
> - .data = &nf_conntrack_max,
> + .data = &init_net.ct.sysctl_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = proc_dointvec_minmax,
> @@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
> static struct ctl_table nf_ct_netfilter_table[] = {
> {
> .procname = "nf_conntrack_max",
> - .data = &nf_conntrack_max,
> + .data = &init_net.ct.sysctl_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = proc_dointvec_minmax,
> @@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>
> table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
> table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
> + table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
> table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
> table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
> #ifdef CONFIG_NF_CONNTRACK_EVENTS
> @@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>
> /* Don't allow non-init_net ns to alter global sysctls */
> if (!net_eq(&init_net, net)) {
> - table[NF_SYSCTL_CT_MAX].mode = 0444;
> table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
> table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
> }
> @@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
> int ret;
>
> net->ct.sysctl_checksum = 1;
> + net->ct.sysctl_max = init_net.ct.sysctl_max;
>
> ret = nf_conntrack_standalone_init_sysctl(net);
> if (ret < 0)
> --
> 2.40.1
>
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > - to nf_conntrack_buckets by default.
> > - Note that connection tracking entries are added to the table twice -- once
> > - for the original direction and once for the reply direction (i.e., with
> > - the reversed address). This means that with default settings a maxed-out
> > - table will have a average hash chain length of 2, not 1.
> > + - 0 - disabled (unlimited)
>
> unlimited is too much, and the number of buckets is also global, how
> does this work?
It's a historic wart going back to ip_conntrack - it was never the
default but you could disable any and all limits even in the original
version.
Whether it's time to disallow 0 is a different topic and not related to this patch.
I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
want and that should provide enough rope to strangle yourself.
> > + The limit of other netns cannot be greater than init_net netns.
> > + +----------------+-------------+----------------+
> > + | init_net netns | other netns | limit behavior |
> > + +----------------+-------------+----------------+
> > + | 0 | 0 | unlimited |
> > + +----------------+-------------+----------------+
> > + | 0 | not 0 | other |
> > + +----------------+-------------+----------------+
> > + | not 0 | 0 | init_net |
> > + +----------------+-------------+----------------+
> > + | not 0 | not 0 | min |
> > + +----------------+-------------+----------------+
I think this is fine, it doesn't really change things from init_net
point of view.
On Thu, May 22, 2025 at 09:32:23PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > - to nf_conntrack_buckets by default.
> > > - Note that connection tracking entries are added to the table twice -- once
> > > - for the original direction and once for the reply direction (i.e., with
> > > - the reversed address). This means that with default settings a maxed-out
> > > - table will have a average hash chain length of 2, not 1.
> > > + - 0 - disabled (unlimited)
> >
> > unlimited is too much, and the number of buckets is also global, how
> > does this work?
>
> Its an historic wart going back to ip_conntrack - it was never the
> default but you could disable any and all limits even in the original
> version.
Thanks, I was just sitting here clueless.
> Wether its time to disallow 0 is a different topic and not related to this patch.
>
> I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
> want and that should provide enough rope to strangle yourself.
The question is how to make it without breaking crazy people.
> > > + The limit of other netns cannot be greater than init_net netns.
> > > + +----------------+-------------+----------------+
> > > + | init_net netns | other netns | limit behavior |
> > > + +----------------+-------------+----------------+
> > > + | 0 | 0 | unlimited |
> > > + +----------------+-------------+----------------+
> > > + | 0 | not 0 | other |
> > > + +----------------+-------------+----------------+
> > > + | not 0 | 0 | init_net |
in this case above...
> > > + +----------------+-------------+----------------+
> > > + | not 0 | not 0 | min |
... and this case, init_net value is used as a cap for other netns.
Then, this is basically allowing to specify a maximum that is smaller
than init_netns.
IIUC, that sounds reasonable.
As for how to discontinue the unlimited in other netns, let me know if
you have any suggestions.
> > > + +----------------+-------------+----------------+
>
> I think this is fine, it doesn't really change things from init_net
> point of view.
Thanks for explaining.
On Thu, 22 May 2025 21:58:13 +0200, Pablo Neira Ayuso wrote:
> > Wether its time to disallow 0 is a different topic and not related to this patch.
> >
> > I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
> > want and that should provide enough rope to strangle yourself.
> The question is how to make it without breaking crazy people.
It seems that we need a new topic to discuss the maximum value that the system can
tolerate to ensure safety:
1. This value is a system limitation, not a user setting
2. This value should be calculated based on system resources
3. This value takes precedence over 0 and other larger values that the user sets
4. This value does not affect the value of the user setting, and 0 in the user
setting can still indicate that the user setting is unlimited, maintaining
compatibility with historical usage.
lvxiafei <xiafei_xupt@163.com> wrote:
> > > Wether its time to disallow 0 is a different topic and not related to this patch.
> > >
> > > I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
> > > want and that should provide enough rope to strangle yourself.
>
> > The question is how to make it without breaking crazy people.
>
> It seems that we need a new topic to discuss the maximum value that the system can
> tolerate to ensure safety:
>
> 1. This value is a system limitation, not a user setting
> 2. This value should be calculated based on system resources
> 3. This value takes precedence over 0 and other larger values that the user sets
> 4. This value does not affect the value of the user setting, and 0 in the user
> setting can still indicate that the user setting is unlimited, maintaining
> compatibility with historical usage.
I've applied a variant of this patch to nf-next:testing.
Could you please check that I adapted it correctly?
https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git/commit/?h=testing&id=b7bfa7d96fa5a7f3c2a69ad406ede520e658cb07
(I added a patch right before that rejects conntrack_max=0).
I wonder if we should update the sysctl path to reflect the
effective value, i.e., so that when netns sets
nf_conntrack_max=1000000
... but init_net is capped at 65536, then a listing
shows the sysctl at 65536.
It would be similar to what we do for max_buckets.
I also considered to make such a request fail at set time, but it
would make the sysctl fail/not fail 'randomly' and it also would
not do the right thing when init_net setting is reduced later.
> I've applied a variant of this patch to nf-next:testing. > > Could you please check that I adapted it correctly? > https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git/commit/?h=testing&id=b7bfa7d96fa5a7f3c2a69ad406ede520e658cb07 > > (I added a patch right before that rejects conntrack_max=0). Historically, some systems or scripts have used 0 to mean “unlimited”. In this way, some scripts that are set to 0 need to be adjusted. Rejecting this value may break compatibility, so it would be good to document this behavior change clearly in the commit message and/or changelog. > > I wonder if we should update the sysctl path to reflect the > effective value, i.e., so that when netns sets > > nf_conntrack_max=1000000 > > ... but init_net is capped at 65536, then a listing > shows the sysctl at 65536. > > It would be similar to what we do for max_buckets. I would argue against updating the sysctl path to reflect the effective value. Doing so could be misleading, as it would no longer show the value actually configured in the namespace, but rather the clamped or capped value. Users might feel that their explicit configuration has been silently altered, which can be frustrating for professional users who rely on precise control. If there is a genuine need to see the effective value, it can be computed on demand or exposed via a separate parameter specifically indicating the effective value. Keeping the sysctl path as-is preserves transparency, predictability, and user trust. > > I also considered to make such a request fail at set time, but it > would make the sysctl fail/not fail 'randomly' and it also would > not do the right thing when init_net setting is reduced later. I would be cautious about making the sysctl fail at set time. Doing so could lead to seemingly “random” failures depending on the current state of init_net, which would be confusing for users. 
Moreover, if init_net’s setting is reduced later, the sysctl behavior would not be consistent, and users might end up with invalid or unexpected values anyway. It seems safer to allow the set operation to succeed but let the effective value be determined by the existing limits, maintaining predictable behavior.
Hello! My name is Xiafei Lv, and I sent patch v6 on April 15, 2025. As of today, it has been nearly two weeks, and I have not received any feedback. However, the patch has been tested successfully, and you can view the details at this link: https://patchwork.kernel.org/project/netdevbpf/patch/20250415090834.24882-1-xiafei_xupt@163.com/ I understand that kernel development is busy, and maintainers may have many things to deal with and may not have time to review my patch. With this in mind, I would like to politely ask what steps I should take next? If you have any questions about my patch or need additional information from me, please feel free to let me know and I will cooperate as soon as possible. Thank you very much for taking the time to deal with my request, and I look forward to your response. Best wishes! Xiafei Lv
Hi! On Sun, Apr 27, 2025 at 04:14:20PM +0800, lvxiafei wrote: > Hello! > > My name is Xiafei Lv, and I sent patch v6 on April 15, 2025. As of today, it > has been nearly two weeks, and I have not received any feedback. However, the > patch has been tested successfully, and you can view the details at this link: > https://patchwork.kernel.org/project/netdevbpf/patch/20250415090834.24882-1-xiafei_xupt@163.com/ > > I understand that kernel development is busy, and maintainers may have many > things to deal with and may not have time to review my patch. With this in mind, > I would like to politely ask what steps I should take next? If you have any > questions about my patch or need additional information from me, please feel > free to let me know and I will cooperate as soon as possible. You just have to wait. We will get back to you with more questions if needed. > Thank you very much for taking the time to deal with my request, and I look > forward to your response. Thanks.
From: lvxiafei <lvxiafei@sensetime.com>
Support per-netns net.netfilter.nf_conntrack_max settings, so that
ct_count can be limited more flexibly in each netns. A newly created
netns takes its default value from init_net.
A value set in a non-init netns is never allowed to exceed the
init_net limit in effect: when both values are nonzero, the smaller
of the two is enforced, and a netns value of 0 falls back to the
init_net limit.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
.../networking/nf_conntrack-sysctl.rst | 29 +++++++++++++++----
include/net/netfilter/nf_conntrack.h | 8 ++++-
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 19 ++++++------
net/netfilter/nf_conntrack_netlink.c | 2 +-
net/netfilter/nf_conntrack_standalone.c | 7 +++--
6 files changed, 46 insertions(+), 20 deletions(-)
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..6e7f17f5959a 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
Log invalid packets of a type specified by value.
nf_conntrack_max - INTEGER
- Maximum number of allowed connection tracking entries. This value is set
- to nf_conntrack_buckets by default.
- Note that connection tracking entries are added to the table twice -- once
- for the original direction and once for the reply direction (i.e., with
- the reversed address). This means that with default settings a maxed-out
- table will have a average hash chain length of 2, not 1.
+ - 0 - disabled (unlimited)
+ - not 0 - enabled
+
+ Maximum number of allowed connection tracking entries per netns. This value
+ is set to nf_conntrack_buckets by default.
+
+ Note that connection tracking entries are added to the table twice -- once
+ for the original direction and once for the reply direction (i.e., with
+ the reversed address). This means that with default settings a maxed-out
+ table will have a average hash chain length of 2, not 1.
+
+ The limit of other netns cannot be greater than init_net netns.
+ +----------------+-------------+----------------+
+ | init_net netns | other netns | limit behavior |
+ +----------------+-------------+----------------+
+ | 0 | 0 | unlimited |
+ +----------------+-------------+----------------+
+ | 0 | not 0 | other |
+ +----------------+-------------+----------------+
+ | not 0 | 0 | init_net |
+ +----------------+-------------+----------------+
+ | not 0 | not 0 | min |
+ +----------------+-------------+----------------+
nf_conntrack_tcp_be_liberal - BOOLEAN
- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..062e67b9a5d7 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
/* must be called with rcu read lock held */
static inline void
@@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
return net_generic(net, nf_conntrack_net_id);
}
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+ return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+ min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+ max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+}
+
int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..d3fcd0b92b2d 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ unsigned int sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
gfp_t gfp, u32 hash)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- unsigned int ct_count;
+ unsigned int ct_max, ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
+ ct_max = nf_conntrack_max(net);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (ct_max && unlikely(ct_count > ct_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
if (!nf_conntrack_hash)
return -ENOMEM;
- nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
- if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
goto nla_put_failure;
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5db6df0e4eb3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = init_net.ct.sysctl_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote:
> + Maximum number of allowed connection tracking entries per netns. This value
> + is set to nf_conntrack_buckets by default.
> +
> + Note that connection tracking entries are added to the table twice -- once
> + for the original direction and once for the reply direction (i.e., with
> + the reversed address). This means that with default settings a maxed-out
> + table will have a average hash chain length of 2, not 1.
> +
> + The limit of other netns cannot be greater than init_net netns.
> + +----------------+-------------+----------------+
> + | init_net netns | other netns | limit behavior |
> + +----------------+-------------+----------------+
> + | 0 | 0 | unlimited |
> + +----------------+-------------+----------------+
> + | 0 | not 0 | other |
> + +----------------+-------------+----------------+
> + | not 0 | 0 | init_net |
> + +----------------+-------------+----------------+
> + | not 0 | not 0 | min |
> + +----------------+-------------+----------------+
>
> nf_conntrack_tcp_be_liberal - BOOLEAN
> - 0 - disabled (default)
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 3f02a45773e8..062e67b9a5d7 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
> extern struct hlist_nulls_head *nf_conntrack_hash;
> extern unsigned int nf_conntrack_htable_size;
> extern seqcount_spinlock_t nf_conntrack_generation;
> -extern unsigned int nf_conntrack_max;
>
> /* must be called with rcu read lock held */
> static inline void
> @@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
> return net_generic(net, nf_conntrack_net_id);
> }
>
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> + return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> + min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> + max(init_net.ct.sysctl_max, net->ct.sysctl_max);
> +}
Is there a reason you did not follow my suggestion in
https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/
to disable net->ct.sysctl_max == 0 for non init netns?
You could then make this
if (likely(init_net.ct.sysctl_max))
return min(init_net.ct.sysctl_max, net->ct.sysctl_max);
return net->ct.sysctl_max;
... or am I missing something?
Aside from that (and the needed #IS_ENABLED() guard) this change looks
good to me.
On Sun, 13 Apr 2025 11:07:55 +0200 Florian Westphal <fw@strlen.de> wrote:
> Is there a reason you did not follow my suggestion in
> https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/
>
> to disable net->ct.sysctl_max == 0 for non init netns?
in https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/
> > min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
> > At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
> > but it can be limited by nf_conntrack_max in different netns.
>
> Sounds good to me.
It seems that there are two ways to change it:
1. Disallow net->ct.sysctl_max == 0
2. Allow net->ct.sysctl_max == 0
With option 2, 0 keeps its original meaning of "no limit" for that netns,
but the netns is still capped by the init_net limit, and the effective
limit changes dynamically whenever the init_net value is modified.
+----------------+-------------+----------------+
| init_net netns | other netns | limit behavior |
+----------------+-------------+----------------+
| not 0 | 0 | init_net |
+----------------+-------------+----------------+
On Sun, 13 Apr 2025 01:26:10 +0800 lvxiafei wrote:
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> + return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> + min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> + max(init_net.ct.sysctl_max, net->ct.sysctl_max);
If you CC netdev@ please do not post multiple versions a day.
Please wait with posting v6 until you get some feedback (and
this email does not count).
You need to be careful with the Kconfig; this file may be included
when conntrack is not built:
In file included from ./include/linux/kernel.h:28,
from ./include/linux/cpumask.h:11,
from ./arch/x86/include/asm/cpumask.h:5,
from ./arch/x86/include/asm/msr.h:11,
from ./arch/x86/include/asm/tsc.h:10,
from ./arch/x86/include/asm/timex.h:6,
from ./include/linux/timex.h:67,
from ./include/linux/time32.h:13,
from ./include/linux/time.h:60,
from ./include/linux/compat.h:10,
from ./include/linux/ethtool.h:17,
from drivers/net/vrf.c:12:
include/net/netfilter/nf_conntrack.h:365:25: error: ‘struct net’ has no member named ‘ct’
365 | min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
| ^
> If you CC netdev@ please do not post multiple versions a day.
> Please wait with posting v6 until you get some feedback (and
> this email does not count).
Thanks for the reminder and the review.
I’ll hold off on posting v6 until I receive proper feedback.
Also, I’ll double-check the Kconfig dependencies and ensure the
file doesn’t break builds when conntrack is not enabled.
Appreciate your time!
Sincerely.
> You need to be careful with the Kconfig, this file may be included
> when conntrack is not built:
>
> In file included from ./include/linux/kernel.h:28,
> from ./include/linux/cpumask.h:11,
> from ./arch/x86/include/asm/cpumask.h:5,
> from ./arch/x86/include/asm/msr.h:11,
> from ./arch/x86/include/asm/tsc.h:10,
> from ./arch/x86/include/asm/timex.h:6,
> from ./include/linux/timex.h:67,
> from ./include/linux/time32.h:13,
> from ./include/linux/time.h:60,
> from ./include/linux/compat.h:10,
> from ./include/linux/ethtool.h:17,
> from drivers/net/vrf.c:12:
> include/net/netfilter/nf_conntrack.h:365:25: error: ‘struct net’ has no member named ‘ct’
> 365 | min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> | ^
Add conditional compilation protection:
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+ min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+ max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+#else
+ return 0;
+#endif
+}
The limit of other netns cannot be greater than init_net netns.
+----------------+-------------+----------------+
| init_net netns | other netns | limit behavior |
+----------------+-------------+----------------+
| 0 | 0 | unlimited |
+----------------+-------------+----------------+
| 0 | not 0 | other |
+----------------+-------------+----------------+
| not 0 | 0 | init_net |
+----------------+-------------+----------------+
| not 0 | not 0 | min |
+----------------+-------------+----------------+
0,0
[Apr12 17:16] init_net.ct.sysctl_max 0, net->ct.sysctl_max 0, ct_max 0
0,1
[Apr12 17:17] init_net.ct.sysctl_max 0, net->ct.sysctl_max 1, ct_max 1
[ +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet
1,0
[Apr12 17:18] init_net.ct.sysctl_max 1, net->ct.sysctl_max 0, ct_max 1
[ +0.000006] nf_conntrack: nf_conntrack: table full, dropping packet
1,2
[Apr12 17:19] init_net.ct.sysctl_max 1, net->ct.sysctl_max 2, ct_max 1
[ +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet
2,1
[Apr12 17:21] init_net.ct.sysctl_max 2, net->ct.sysctl_max 1, ct_max 1
[ +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet
From: lvxiafei <lvxiafei@sensetime.com>
Support net.netfilter.nf_conntrack_max settings per
netns, net.netfilter.nf_conntrack_max is used to more
flexibly limit the ct_count in different netns. The
default value belongs to the init_net limit.
After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the init_net
limit when working.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
.../networking/nf_conntrack-sysctl.rst | 29 +++++++++++++++----
include/net/netfilter/nf_conntrack.h | 8 ++++-
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 19 ++++++------
net/netfilter/nf_conntrack_netlink.c | 2 +-
net/netfilter/nf_conntrack_standalone.c | 5 ++--
6 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..58aad49ea179 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
Log invalid packets of a type specified by value.
nf_conntrack_max - INTEGER
- Maximum number of allowed connection tracking entries. This value is set
- to nf_conntrack_buckets by default.
- Note that connection tracking entries are added to the table twice -- once
- for the original direction and once for the reply direction (i.e., with
- the reversed address). This means that with default settings a maxed-out
- table will have a average hash chain length of 2, not 1.
+ - 0 - disabled (unlimited)
+ - not 0 - enabled
+
+ Maximum number of allowed connection tracking entries per netns. This value
+ is set to nf_conntrack_buckets by default.
+
+ Note that connection tracking entries are added to the table twice -- once
+ for the original direction and once for the reply direction (i.e., with
+ the reversed address). This means that with default settings a maxed-out
+ table will have a average hash chain length of 2, not 1.
+
+ The limit of other netns cannot be greater than init_net netns.
+ +----------------+-------------+----------------+
+ | init_net netns | other netns | limit behavior |
+ +----------------+-------------+----------------+
+ | 0 | 0 | unlimited |
+ +----------------+-------------+----------------+
+ | 0 | not 0 | other |
+ +----------------+-------------+----------------+
+ | not 0 | 0 | init_net |
+ +----------------+-------------+----------------+
+ | not 0 | not 0 | min |
+ +----------------+-------------+----------------+
nf_conntrack_tcp_be_liberal - BOOLEAN
- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..062e67b9a5d7 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
/* must be called with rcu read lock held */
static inline void
@@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
return net_generic(net, nf_conntrack_net_id);
}
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+ return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+ min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+ max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+}
+
int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..2f343842c2d9 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ unsigned int sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
gfp_t gfp, u32 hash)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- unsigned int ct_count;
+ unsigned int ct_max, ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
+ ct_max = nf_conntrack_max(net);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (ct_max && unlikely(ct_count > ct_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
if (!nf_conntrack_hash)
return -ENOMEM;
- nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
- if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
goto nla_put_failure;
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..8608aec88758 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1087,7 +1087,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
--
2.40.1
From: lvxiafei <lvxiafei@sensetime.com>
Support net.netfilter.nf_conntrack_max settings in
different netns, net.netfilter.nf_conntrack_max is
used to more flexibly limit the ct_count in different
netns. The default value belongs to the global (ancestral)
limit and no implicit limit is inherited from the parent
namespace.
After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the global
(ancestral) limit net.nf_conntrack_max when working.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c | 5 +++--
3 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ u8 sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..4116c2f2b57f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = min(nf_conntrack_max, net->ct.sysctl_max) / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..4a073c4de1b7 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = nf_conntrack_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote:
> - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
> if (!early_drop(net, hash)) {
> if (!conntrack_gc_work.early_drop)
> conntrack_gc_work.early_drop = true;
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..4a073c4de1b7 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
> static struct ctl_table nf_ct_sysctl_table[] = {
> [NF_SYSCTL_CT_MAX] = {
> .procname = "nf_conntrack_max",
> - .data = &nf_conntrack_max,
> + .data = &init_net.ct.sysctl_max,
What's the function of nf_conntrack_max?
After this change, is it always 0?
Florian Westphal <fw@strlen.de> wrote:
> Whats the function of nf_conntrack_max?
> After this change its always 0?
nf_conntrack_max is a global (ancestor) limit, by default
nf_conntrack_max = max_factor * nf_conntrack_htable_size.
init_net.ct.sysctl_max is a per-netns parameter, and setting it
will not affect the value of nf_conntrack_max.
lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > Whats the function of nf_conntrack_max?
> > After this change its always 0?
>
> nf_conntrack_max is a global (ancestor) limit, by default
> nf_conntrack_max = max_factor * nf_conntrack_htable_size.
Argh.
net.netfilter.nf_conntrack_max
is replaced by init_net.nf_conntrack_max in your patch.
But not net.nf_conntrack_max, so they are now different and not
related at all anymore except that the latter overrides the former
even in init_net.
I'm not sure this is sane. And it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst
in any case.
Also:
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
... can't be right, this allows a 0 setting in the netns.
So, setting 0 in non-init-net must be disallowed.
I suggest to remove nf_conntrack_max as a global variable,
make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
so in the init_net both sysctls remain the same.
Then, change __nf_conntrack_alloc() to do:
unsigned int nf_conntrack_max = min(net->ct.sysctl_max, init_net.ct.sysctl_max);
and leave the if-condition as is, i.e.:
if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...
It means:
each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).
When a new conntrack is allocated, then:
If the limit in the init_net is lower than the netns, then
that limit applies, so it provides upper cap.
If the limit in the init_net is higher, the lower pernet limit
is applied.
If the init_net has 0 setting, no limit is applied.
This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.
Or, alternatively, try the other suggestion I made
(memcg charge at sysctl change time,
https://lore.kernel.org/netfilter-devel/20250408095854.GB536@breakpoint.cc/).
Or come up with a better proposal.
Florian Westphal <fw@strlen.de> wrote:
> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.
The nf_conntrack_max global variable is a system calculated
value and should not be removed.
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> When a new conntrack is allocated, then:
>
> If the limit in the init_net is lower than the netns, then
> that limit applies, so it provides upper cap.
>
> If the limit in the init_net is higher, the lower pernet limit
> is applied.
>
> If the init_net has 0 setting, no limit is applied.
If the init_net has 0 setting, it should depend on the
limit of other netns.
The netns Limit Behavior:
+------------------------+--------------------+-----------------------+
| init_net.ct.sysctl_max | net->ct.sysctl_max | netns Limit Behavior |
+------------------------+--------------------+-----------------------+
| 0 | 0 | No limit |
+------------------------+--------------------+-----------------------+
| 0 | Non-zero | net->ct.sysctl_max |
+------------------------+--------------------+-----------------------+
| Non-zero | 0 | init_net.ct.sysctl_max|
+------------------------+--------------------+-----------------------+
| Non-zero | Non-zero | min |
+------------------------+--------------------+-----------------------+
net_ct_sysctl_max = likely(a && b) ? min(a, b) : max(a, b);
or
net_ct_sysctl_max = unlikely(a == 0 || b == 0) ? max(a, b) : min(a, b);
if (net_ct_sysctl_max && unlikely(ct_count > net_ct_sysctl_max)) { ...
lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > I suggest to remove nf_conntrack_max as a global variable,
> > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > so in the init_net both sysctls remain the same.
>
> The nf_conntrack_max global variable is a system calculated
> value and should not be removed.
> nf_conntrack_max = max_factor * nf_conntrack_htable_size;
That's the default calculation for the initial sysctl value:
net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max,
net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max,
You can make an initial patch that replaces all occurrences of
nf_conntrack_max with cnet->sysctl_conntrack_max
(adding a 'unsigned int sysctl_conntrack_max' to struct nf_conntrack_net).
Then, in a second patch, remove the '0444' readonly and redirect
the child netns to use the copy in its own pernet area rather than
the init_net one.
Florian Westphal <fw@strlen.de> wrote:
> lvxiafei <xiafei_xupt@163.com> wrote:
> > Florian Westphal <fw@strlen.de> wrote:
> > > I suggest to remove nf_conntrack_max as a global variable,
> > > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > > so in the init_net both sysctls remain the same.
> >
> > The nf_conntrack_max global variable is a system calculated
> > value and should not be removed.
> > nf_conntrack_max = max_factor * nf_conntrack_htable_size;
>
> Thats the default calculation for the initial sysctl value:
>
> net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max,
> net/netfilter/nf_conntrack_standalone.c: .data = &nf_conntrack_max,
>
> You can make an initial patch that replaces all occurences of
> nf_conntrack_max with cnet->sysctl_conntrack_max
Something like this:
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
/* must be called with rcu read lock held */
static inline void
@@ -360,6 +359,11 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
return net_generic(net, nf_conntrack_net_id);
}
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+ return net->ct.sysctl_conntrack_max;
+}
+
int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ unsigned int sysctl_conntrack_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..8ae9c22cfcb3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;
@@ -1509,8 +1507,7 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+ nf_conntrack_max95 = nf_conntrack_max(&init_net) / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1648,13 +1645,14 @@ __nf_conntrack_alloc(struct net *net,
gfp_t gfp, u32 hash)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- unsigned int ct_count;
+ unsigned int ct_max, ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
+ ct_max = nf_conntrack_max(&init_net);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (ct_max && unlikely(ct_count > ct_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
@@ -2650,7 +2648,7 @@ int nf_conntrack_init_start(void)
if (!nf_conntrack_hash)
return -ENOMEM;
- nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ init_net.ct.sysctl_conntrack_max = max_factor * nf_conntrack_htable_size;
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index db23876a6016..f1938204b827 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
- if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
goto nla_put_failure;
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 502cf10aab41..8a185dfd3261 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -944,7 +944,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
Florian Westphal <fw@strlen.de> wrote:
> > You can make an initial patch that replaces all occurences of
> > nf_conntrack_max with cnet->sysctl_conntrack_max
>
> Something like this:
> ...
Agreed, I can submit the changes later.
First of all, a patch should do one thing clearly, which makes it
easier for maintainers to review.
Florian Westphal <fw@strlen.de> wrote:
> net.netfilter.nf_conntrack_max
> is replaced by init_net.nf_conntrack_max in your patch.
>
> But not net.nf_conntrack_max, so they are now different and not
> related at all anymore except that the latter overrides the former
> even in init_net.
>
> I'm not sure this is sane. And it needs an update to
> Documentation/networking/nf_conntrack-sysctl.rst
Yes, it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst.
in different netns,
net.netfilter.nf_conntrack_max = init_net.ct.sysctl_max;
the global (ancestral) limit,
net.nf_conntrack_max = nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> in any case.
>
> Also:
>
> - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
>
>
> ... can't be right, this allows a 0 setting in the netns.
> So, setting 0 in non-init-net must be disallowed.
Yes, setting 0 in non-init-net must be disallowed.
Should be used:
unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {
min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
but it can be limited by nf_conntrack_max in different netns.
> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.
>
> Then, change __nf_conntrack_alloc() to do:
>
> unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max);
>
> and leave the if-condition as is, i.e.:
>
> if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...
Yes, each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).
Should be used:
unsigned int nf_conntrack_max = max(min(net->ct.sysctl_max, init_net.ct.sysctl_max), 0);
This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.
lvxiafei <xiafei_xupt@163.com> wrote:
> > in any case.
> >
> > Also:
> >
> > - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> > + if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
> >
> >
> > ... can't be right, this allows a 0 setting in the netns.
> > So, setting 0 in non-init-net must be disallowed.
>
> Yes, setting 0 in non-init-net must be disallowed.
>
> Should be used:
> unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
> if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {
That would work. An alternative, probably preferable, is to do
something like this:
@@ -615,10 +615,10 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO, /* 0 == no limit */
},
[NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
@@ -1081,9 +1082,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+
+ /* 0 means no limit, only allowed in init_net */
+ table[NF_SYSCTL_CT_MAX].extra1 = SYSCTL_ONE;
}
That will make setting a 0 value illegal for non-init net case:
sysctl net.netfilter.nf_conntrack_max=0
sysctl: setting key "net.netfilter.nf_conntrack_max": Invalid argument
> min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
> At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
> but it can be limited by nf_conntrack_max in different netns.
Sounds good to me.
From: lvxiafei <lvxiafei@sensetime.com>
Support nf_conntrack_max settings in different netns,
nf_conntrack_max is used to more flexibly limit the
ct_count in different netns, which may be greater than
the value in the parent namespace. The default value
belongs to the global (ancestral) limit and no implicit
limit is inherited from the parent namespace.
Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
include/net/netns/conntrack.h | 1 +
net/netfilter/nf_conntrack_core.c | 12 +++++++-----
net/netfilter/nf_conntrack_standalone.c | 7 ++++---
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
u8 sysctl_acct;
u8 sysctl_tstamp;
u8 sysctl_checksum;
+ u8 sysctl_max;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned int i, hashsz;
u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
- if (gc_work->early_drop)
- nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
if (i == 0) {
gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
}
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ unsigned int nf_conntrack_max95 = 0;
struct nf_conntrack_net *cnet;
struct net *net;
long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
expires = (expires - (long)next_run) / ++count;
next_run += expires;
+ net = nf_ct_net(tmp);
+
+ if (gc_work->early_drop)
+ nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
- net = nf_ct_net(tmp);
cnet = nf_ct_pernet(net);
if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
/* We don't want any race condition at early drop stage */
ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+ if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..77c9c01c7278 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
- .data = &nf_conntrack_max,
+ .data = &init_net.ct.sysctl_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+ table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
/* Don't allow non-init_net ns to alter global sysctls */
if (!net_eq(&init_net, net)) {
- table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
int ret;
net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_max = nf_conntrack_max;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
--
2.40.1
lvxiafei <xiafei_xupt@163.com> wrote:
> From: lvxiafei <lvxiafei@sensetime.com>
>
> Support nf_conntrack_max settings in different netns,
> nf_conntrack_max is used to more flexibly limit the
> ct_count in different netns, which may be greater than
> the value in the parent namespace. The default value
> belongs to the global (ancestral) limit and no implicit
> limit is inherited from the parent namespace.
That seems the wrong thing to do.
There must be some way to limit the netns conntrack usage.
What's the actual intent here?
You could apply max = min(init_net->max, net->max)
Or, you could relax it as long as netns are owned
by initial user ns, I guess.
Or perhaps it's possible to make a guesstimate of
the maximum memory needed by the new limit, then
account that to memcg (at sysctl change time), and
reject if memcg is exhausted.
No other ideas at the moment, but I do not like the
"no limits" approach.
On Tue, 8 Apr 2025 11:58:54 Florian Westphal <fw@strlen.de> wrote:
> That seems the wrong thing to do.
> There must be some way to limit the netns conntrack usage.
>
> Whats the actual intent here?
>
> You could apply max = min(init_net->max, net->max)
> Or, you could relax it as long as netns are owned
> by initial user ns, I guess.
>
> Or perhaps its possible to make a guesstimate of
> the maximum memory needed by the new limit, then
> account that to memcg (at sysctl change time), and
> reject if memcg is exhausted.
>
> No other ideas at the moment, but I do not like the
> "no limits" approach.
The original nf_conntrack_max is a global variable.
Modification will affect the connection tracking
limit in other netns, and the maximum memory
consumption = number of netns * nf_conntrack_max
This modification can make nf_conntrack_max support
the netns level to set the size of the connection
tracking table, and more flexibly limit the connection
tracking of each netns. For example, the initial user ns
has a default value (=max_factor*nf_conntrack_htable_size).
The nf_conntrack_max when netns 1 and netns 2 are created
is the same as the nf_conntrack_max in the initial user ns.
You can set it to netns 1 1k and netns 2 2k without
affecting each other.
If you are worried that different netns may exceed the
initial user limit and memory limit when setting,
apply max = min(init_net->max, net->max), the value in
netns is not greater than init_net->max, and the new
maximum memory consumption <= the original maximum memory
consumption, which limits memory consumption to a certain
extent.
However, this will bring several problems:
1. Do not allow nf_conntrack_max in other netns to be greater
than nf_conntrack_max of the initial user. For example, when
other netns carry north-south traffic, the actual number of
connection tracking is greater than that of the initial user.
2. If nf_conntrack_max of the initial user is increased, the
maximum memory consumption will inevitably increase by n copies
3. If nf_conntrack_max of the initial user is reduced, will
the existing connections in other netns be affected?
lvxiafei <xiafei_xupt@163.com> wrote:
> This modification can make nf_conntrack_max support
> the netns level to set the size of the connection
> tracking table, and more flexibly limit the connection
> tracking of each netns. For example, the initial user ns
> has a default value (=max_factor*nf_conntrack_htable_size).
> The nf_conntrack_max when netns 1 and netns 2 are created
> is the same as the nf_conntrack_max in the initial user ns.
> You can set it to netns 1 1k and netns 2 2k without
> affecting each other.
Netns 2 can also set it to 2**31 and cause the machine to go OOM.
> If you are worried that different netns may exceed the
> initial user limit and memory limit when setting,
> apply max = min(init_net->max, net->max), the value in
> netns is not greater than init_net->max, and the new
> maximum memory consumption <= the original maximum memory
> consumption, which limits memory consumption to a certain
> extent.
That was one of the suggestions for how one could have a
tunable pernet variable without allowing netns2 to go haywire.
> However, this will bring several problems:
>
> 1. Do not allow nf_conntrack_max in other netns to be greater
> than nf_conntrack_max of the initial user. For example, when
> other netns carry north-south traffic, the actual number of
> connection tracking is greater than that of the initial user.
Sure.
> 2. If nf_conntrack_max of the initial user is increased, the
> maximum memory consumption will inevitably increase by n copies
How is that different from the current state of affairs?
> 3. If nf_conntrack_max of the initial user is reduced, will
> the existing connections in other netns be affected?
No, but new ones will be blocked until it's below the init_net limit again.
On 2025-04-08 13:28 Florian Westphal <fw@strlen.de> wrote:
> That was one of the suggestions that I see how one could have
> tunable pernet variable without allowing netns2 go haywire.
Yes. After net.netfilter.nf_conntrack_max is set in different netns,
it should be designed so that the effective limit is never allowed
to be larger than the global (ancestor) limit.
© 2016 - 2026 Red Hat, Inc.