From nobody Sat Feb 7 15:11:14 2026 Received: from out-189.mta1.migadu.com (out-189.mta1.migadu.com [95.215.58.189]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BE5723016EB for ; Mon, 26 Jan 2026 09:03:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418200; cv=none; b=A3GE/lKiyLfaj4facJ6I2TauEEpODhixqV7m5MjazBtkMf8vxWyhgFscRamfkB9jHXc+QV3Ys1TvtkG+r66uIBoXjR7M1a1jrzU5CJCO/uCXlpnhMVPMF/Emfax/TwFLcnE89eA/FNN/a4mvzMF/inlM7kUQaULTYhKp4gvD1u8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418200; c=relaxed/simple; bh=ICuHoMZgwyeQHMZZvgMtyo6q66wG5W6MOEIDq951aKE=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=mlqsoiRK+XZlwJPIf92L95xfeOFb24mew5cqBaaLEvtRuu15g7/mlRTDCOtMJsJUxzooi26mSwK0SGkgz5SBeNwrjnIa+VHXlbTM7TBwkdjwDiO3zwLh4yIbGa9EzCbCczqBdtCYm6Ho2pgyZD6+gZ64zxRnkdsZJh2lMZBCZeg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=O10fHPKl; arc=none smtp.client-ip=95.215.58.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="O10fHPKl" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418196; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=RkVeShES4B2VgCiXTgzhTlETaaAj+olyI2gU3ClsRew=; b=O10fHPKlKdFZ/Yi788FotJKbzD4VjaoFeypk+NZQorir1kC9RdW6L3vR+CzlcnKOZbL8yG GCtjC1SsiJZikVD1hmcM24c3ab3v6CNMIN/9hLkkwSvw5a0c9mf/TEAGf7bCcPqtfKJpMk +4sGOOZ/TBVSqgB6QFsg56vkvSIZJV8= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v4 01/12] bpf: move bpf_struct_ops_link into bpf.h Date: Mon, 26 Jan 2026 17:02:25 +0800 Message-ID: <3a6694566eedbf17f84dbe5ffafe9aa0aa32108c.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Move struct bpf_struct_ops_link's definition into bpf.h, where other custom bpf links definitions are. 
It's necessary to access its members from outside of generic bpf_struct_ops implementation, which will be done by following patches in the series. Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 6 ++++++ kernel/bpf/bpf_struct_ops.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..899dd911dc82 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1891,6 +1891,12 @@ struct bpf_raw_tp_link { u64 cookie; }; =20 +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; + wait_queue_head_t wait_hup; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c43346cb3d76..de01cf3025b3 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -55,12 +55,6 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; =20 -struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; - wait_queue_head_t wait_hup; -}; - static DEFINE_MUTEX(update_mutex); =20 #define VALUE_PREFIX "bpf_struct_ops_" --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-173.mta1.migadu.com (out-173.mta1.migadu.com [95.215.58.173]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D7E9D22D9F7 for ; Mon, 26 Jan 2026 09:03:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.173 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418213; cv=none; b=dWDm5wG1Cfev0ooWmKKgJlIdHHs0hmZqzxG/N+vroVyZTwYcbH3xzj6t0+ely3adzdf3HcqLgIG3ppJhbxb6fVhu69VWqejHXTZY+jZsaCcMXKvJDPvOSAkXJETiNnDnpcCr0JTa1Oo1I1S0HwDMCtndm/KA7UlC5T8XAhdRy/U= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418213; c=relaxed/simple; 
bh=1QNC2bfHfbW9SQUgfvnKGdBr0YOvFpgcnrQNCW/0a+Y=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Gl+dZYdjkwjY9CgVlPZQaN5WdndlP/qcWZq6DK1Lj/RjNZhUBr8IbOhnPJaqmGx1ae9I4IvHhMRqB8n4fjFQBbbR8PO1ezTQ/Ez8CJnsvCvHMs0MwN0I7ixpFPcdkV0x/q92XtF3Itl88IRojEGKobwD/vME2wmnu2ZCqfh3dbk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=ud2bx3TE; arc=none smtp.client-ip=95.215.58.173 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="ud2bx3TE" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418209; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ycrFyw6aXaYmQPmQruEJsfLSWjPHoV8ar+xnlNHnd9w=; b=ud2bx3TEMtPgGp8z9l0owlnpj/NWdkQzdMWzUV6IHhmPv+afJYoJ7IIdwVRMloPKltQlvw FQmrztNLMufNKeF1BrzsNQbomsBldDjN2Hfb8Z2Z9HFOPwzyYDRyM+oobxVNJMwW2QDu6U /uEQiGHQD90+yXc7b4Gz4aNQFNrWppE= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro 
Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v4 02/12] bpf: initial support for attaching struct ops to cgroups Date: Mon, 26 Jan 2026 17:02:26 +0800 Message-ID: <1c5845208d235e5deb37807f3be93af325033ba5.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin When a struct ops is being attached and a bpf link is created, allow passing a cgroup fd via bpf attr, so that the struct ops can be attached to a cgroup instead of globally. An attached struct ops doesn't hold a reference to the cgroup; it only preserves the cgroup id. 
Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 899dd911dc82..720055d1dbce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1895,6 +1895,7 @@ struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; wait_queue_head_t wait_hup; + u64 cgroup_id; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index de01cf3025b3..c807793e7633 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,6 +13,7 @@ #include #include #include +#include =20 struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -1377,6 +1378,20 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_= lops, NULL, attr->link_create.attach_type); +#ifdef CONFIG_CGROUPS + if (attr->link_create.cgroup.relative_fd) { + struct cgroup *cgrp; + + cgrp =3D cgroup_get_from_fd(attr->link_create.cgroup.relative_fd); + if (IS_ERR(cgrp)) { + err =3D PTR_ERR(cgrp); + goto err_out; + } + + link->cgroup_id =3D cgroup_id(cgrp); + cgroup_put(cgrp); + } +#endif /* CONFIG_CGROUPS */ =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-187.mta1.migadu.com (out-187.mta1.migadu.com [95.215.58.187]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A1568302151; Mon, 26 Jan 2026 09:03:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.187 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418225; cv=none; 
b=jofJYs06TpiGhMz7pgCZNWam9KFbyi20tT7m8TqL3gszI927+444pYlJRQ+zehpH8OmCPCkvr3D7BNVszk40JqiAk+1G+n2TljFbKhTSbKzjtA7C8cTNTyomqMFStadPd+FwKf5hzUj03F03kh1tls9hLxl5HbOd/T1N3PWg9Pw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418225; c=relaxed/simple; bh=N8uerXNrtYXAXsXSofx5ACpCyBePZpfRxoCGMF6fYG0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=ffM5Hq+/NJi88/NMfnb1yICvS9aTINdmQ2ofHImaC4PxbCX2OMSev26du73QL1QNzgAT2dlcDc/sT2RCXu3kMVW8EYe+z/Y3G8HJdEE8tNeTwjr6C31Rh6HEphiiWSTfGiqtuWagkqzTTmseqX/QJvQbQMsZ8aQh8FmjZLhQIAc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=N5uumHNB; arc=none smtp.client-ip=95.215.58.187 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="N5uumHNB" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418221; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=R9sBuFUW725uaUgAoHBNUzbcXyyILwwKmBSUQhglIbE=; b=N5uumHNBSXM2Ihk+yOhHO/iUCVJo+bJ5XzjP49GZRH2DsYiVcQX3W6+QupeuqLR+/O/rVC B1YqdMC0pSUBBGy4GE+YVZFZZTfIGs8/WmTmGkfyfvxWbS/AkSnMT255S2EO3+MnhaqvZs GSDmBAv0gKJZuBYWlezKM/2Xc9qw0ZM= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Kumar Kartikeya Dwivedi Subject: [RFC PATCH bpf-next v4 03/12] bpf: mark struct oom_control's memcg field as TRUSTED_OR_NULL Date: Mon, 26 Jan 2026 17:02:27 +0800 Message-ID: <8c6d6d4751f1ad12582d3d9eabd549d5fd8925b0.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Struct oom_control is used to describe the OOM context. 
Its memcg field defines the scope of OOM: it's NULL for global OOMs and a valid memcg pointer for memcg-scoped OOMs. Teach bpf verifier to recognize it as a trusted or NULL pointer. It will provide the bpf OOM handler a trusted memcg pointer, which for example is required for iterating the memcg's subtree. Signed-off-by: Roman Gushchin Acked-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2f2650db9fd..cca36edb460d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7242,6 +7242,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)= { struct file *vm_file; }; =20 +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control) { + struct mem_cgroup *memcg; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7284,6 +7288,7 @@ static bool type_is_trusted_or_null(struct bpf_verifi= er_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control)); =20 return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-174.mta0.migadu.com (out-174.mta0.migadu.com [91.218.175.174]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 343183033D9 for ; Mon, 26 Jan 2026 09:05:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.174 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418335; cv=none; 
b=nKPN7waVw6uc+n2x8AOZcNSO0C6fWTGBz+bPVqAZgxxUSvFXU7ULmSYiTJJ/FLFz2nxpkDZWAdOG6irwI16xikctNa+Q33Offp/UyBMdwJ4II2Px5V49Lhm8F+WLdQTK5dLLCKiUCHIo2qT8SVjV5jXqIPhnA5YWXxm0rjbVShk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418335; c=relaxed/simple; bh=2k9LiI+M//qM4q+K2IyZ0CNDJt0ZnwZ2pxw6XNWmxxQ=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=CqMFkjLU0A6KFFydNV1CSruTJT7vSU1aOCDnDpjYQSuMAZjXkgn5xogdc5cUTtPA7dZa4fSJI2fMNRqb86SQaBtbyKdYWzIbTPOA/RH+DoixKLXo4BiBtYalAPsUSvHJK47EohZ5TtnO9xe6K8Vnos3ueJFDHgMCz8ISGYR0fX0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=aDOq1xmX; arc=none smtp.client-ip=91.218.175.174 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="aDOq1xmX" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418321; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=MZDHd5qG1jM6uT5/JnPwA06VDLOpkfHtWx6qC88p08g=; b=aDOq1xmX1htN3hO0feRYUDc3w9DQ2r1pila9TAuDHs2gzwvwuhQKew3H7l1iGkAqae1m+d 14ScCeAjq4WbJGmEoYbx8fpu+fFe+XBjV+Zcdr6Mu2ENKnJjCRbVoh9SQf4eVSjSQCwg4f MA7hyAoYo1P2OFkMvRn/S3+kwsgAky8= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v4 04/12] mm: define mem_cgroup_get_from_ino() outside of CONFIG_SHRINKER_DEBUG Date: Mon, 26 Jan 2026 17:04:47 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin mem_cgroup_get_from_ino() can be reused by the BPF OOM implementation, but currently depends on CONFIG_SHRINKER_DEBUG. Remove this dependency. 
Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 229ac9835adb..f3b8c71870d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -833,9 +833,9 @@ static inline unsigned long mem_cgroup_ino(struct mem_c= group *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } +#endif =20 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1298,12 +1298,12 @@ static inline unsigned long mem_cgroup_ino(struct m= em_cgroup *memcg) { return 0; } +#endif =20 static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3808845bc8cc..1f74fce27677 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3658,7 +3658,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short = id) return xa_load(&mem_cgroup_ids, id); } =20 -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3679,7 +3678,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned l= ong ino) =20 return memcg; } -#endif =20 static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-179.mta0.migadu.com (out-179.mta0.migadu.com [91.218.175.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6124B302741 for ; Mon, 26 Jan 2026 09:05:36 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1769418338; cv=none; b=IZ9kIJ4Jiu6ge1SfNUFTcfqAvZlF4XFyCsK/PHN+8+jvtrKvGIjYs15NxCHe2zi6d2SBV4xaWGxqHBVWQkld85oynToVSUdC7HSdXuYQuzVtaawIieqsKztjeyqRVow2R4Sr7NnBxPVxZoERZvdP7mt3n/Q9KIdiRfmoVGmAqj8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418338; c=relaxed/simple; bh=OGLI1S2l0PmNr2S4bsYnKqfqSXB+2A+DsfQrV5CIDWU=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=G2tS6sm3DI+I74IeC3QJzu4eJRomSR63FStOFmnUdjgW/WtGXihtczD32qfGYCdKtgLg09Z7w/WlREXMyH2794EOgAc4juFUkMZrHhHwlRxmpG0a48ZuvnOXwLRIZYSKE57ZQsRHxSDT8eL++k2atYea1VcwplKTV/0qpTqvWgM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=uy/RXibV; arc=none smtp.client-ip=91.218.175.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="uy/RXibV" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418333; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=FOthmRQx1S9k8S+lynuRz8PEGmhPgopf4vkxZNGup6E=; b=uy/RXibVJ3oVqmHcuUREIO42g9J6ZAIDsRa75aGhZMOzYi44E9zdwYLQ+penjI2tSnOZ+w LYhshtEuxxm3yPmlrUhEurCZH+tgHpic1Git0dNeJokwkgNF4kbjIyW7g7HRcL6RogvQ/w Xf01C8a/HuO2jBXWYYLKTeQ+JOUbD78= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v4 05/12] libbpf: introduce bpf_map__attach_struct_ops_opts() Date: Mon, 26 Jan 2026 17:04:48 +0800 Message-ID: <635923ceadf1899672e4f7727ddc52554c11a3ac.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Introduce bpf_map__attach_struct_ops_opts(), an extended version of bpf_map__attach_struct_ops(), which takes additional 
struct bpf_struct_ops_opts argument. struct bpf_struct_ops_opts has the relative_fd member, which allows to pass an additional file descriptor argument. It can be used to attach struct ops maps to cgroups. Signed-off-by: Roman Gushchin --- tools/lib/bpf/bpf.c | 8 ++++++++ tools/lib/bpf/libbpf.c | 18 ++++++++++++++++-- tools/lib/bpf/libbpf.h | 14 ++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5846de364209..84a53c594f48 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -884,6 +884,14 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, cgroup)) return libbpf_err(-EINVAL); break; + case BPF_STRUCT_OPS: + relative_fd =3D OPTS_GET(opts, cgroup.relative_fd, 0); + attr.link_create.cgroup.relative_fd =3D relative_fd; + attr.link_create.cgroup.expected_revision =3D + OPTS_GET(opts, cgroup.expected_revision, 0); + if (!OPTS_ZEROED(opts, cgroup)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0c8bf0b5cce4..70a00da54ff5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13462,12 +13462,19 @@ static int bpf_link__detach_struct_ops(struct bpf= _link *link) return close(link->fd); } =20 -struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts); struct bpf_link_struct_ops *link; __u32 zero =3D 0; int err, fd; =20 + if (!OPTS_VALID(opts, bpf_struct_ops_opts)) { + pr_warn("map '%s': invalid opts\n", map->name); + return libbpf_err_ptr(-EINVAL); + } + if (!bpf_map__is_struct_ops(map)) { pr_warn("map '%s': can't attach non-struct_ops map\n", map->name); return libbpf_err_ptr(-EINVAL); @@ -13503,7 +13510,9 @@ struct 
bpf_link *bpf_map__attach_struct_ops(const s= truct bpf_map *map) return &link->link; } =20 - fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + + fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { free(link); return libbpf_err_ptr(fd); @@ -13515,6 +13524,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const = struct bpf_map *map) return &link->link; } =20 +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + return bpf_map__attach_struct_ops_opts(map, NULL); +} + /* * Swap the back struct_ops of a link with a new struct_ops map. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index dfc37a615578..5aef44bcfcc2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -920,6 +920,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_progr= am *prog, int cgroup_fd, struct bpf_map; =20 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_ma= p *map); + +struct bpf_struct_ops_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u64 expected_revision; + size_t :0; +}; +#define bpf_struct_ops_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts); LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bp= f_map *map); =20 struct bpf_iter_attach_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d18fbcea7578..4779190c97b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -454,4 +454,5 @@ LIBBPF_1.7.0 { bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; btf__permute; + bpf_map__attach_struct_ops_opts; } LIBBPF_1.6.0; --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-189.mta0.migadu.com (out-189.mta0.migadu.com [91.218.175.189]) 
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5D1443033C0 for ; Mon, 26 Jan 2026 09:05:48 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418349; cv=none; b=tdRb23SAv8rk3LkDN3w8K4wOqWwtTbt1biP9jXaS5vSdbjeuygJ7H+zhC+ITPiS+2aMmCzolXslI7qOte54Fi6a9LpmRs6CmeBohiFEYeEQpz6+TUhfAmBh4hwl+gIsYkax7jtGqWJYBuXT0tV99noyeQDCaL1HC3JBW2ypcfiw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418349; c=relaxed/simple; bh=FST42ieyWdVzaafbTT/pjnBN12hvGiGjV8DHVrhn4G0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=lG5ksQX13HZ0Cr7iesP/5xKWfQdDHnmJ74/WiVMMo7/XjD32Rryv/clPwvHwz7FHiBemxIFONJsX9wnLQ6nW9dt46Kc2KKHyxQv9pB25R6rEuDCiAxJaCbSqcX8NWEynmws+ivPIlm3Nyrgn9pBcb1Wz8MSrko8UlfWT0x3iJ9w= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=w4Gy5WcZ; arc=none smtp.client-ip=91.218.175.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="w4Gy5WcZ" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418346; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=qJzfS1JLGvEK+FxWqw8ITpq79hNz1KKD0GGXddU0FBc=; b=w4Gy5WcZtlDZYkEO2TO4A9miXEi+bQ9j+mJmKZdHyxjM/yyMcz17Hm145nGY64rLCBLjEZ OaItV++yLzC5YIxcjk5+oVLXZKp0JTibbA0TJipVv9n5lFyJwMExVco1S5jtLB8/LsxqW7 xJQRKPMULmQa2T7sm/1Q/DaH5knBswc= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 06/12] bpf: Pass flags in bpf_link_create for struct_ops Date: Mon, 26 Jan 2026 17:04:49 +0800 Message-ID: <6b2d4fa8e5209d363f553d7851d5a1156137d9fb.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To support features like allowing overrides in cgroup hierarchies, we need a way to pass flags from 
userspace to the kernel when attaching a struct_ops. Extend `bpf_struct_ops_link` to include a `flags` field. This field is populated from `attr->link_create.flags` during link creation. This will allow struct_ops implementations, such as the upcoming memory controller ops, to interpret these flags and modify their attachment behavior accordingly. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 1 + tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 720055d1dbce..13c933cfc614 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1896,6 +1896,7 @@ struct bpf_struct_ops_link { struct bpf_map __rcu *map; wait_queue_head_t wait_hup; u64 cgroup_id; + u32 flags; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c807793e7633..0df608c88403 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1392,6 +1392,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) cgroup_put(cgrp); } #endif /* CONFIG_CGROUPS */ + link->flags =3D attr->link_create.flags; =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..4e1c5d6d91ae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1185,7 +1185,7 @@ enum bpf_perf_event_type { BPF_PERF_EVENT_EVENT =3D 6, }; =20 -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH and BPF_LINK_CREATE com= mand * * NONE(default): No further bpf programs allowed in the subtree. 
* --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-189.mta0.migadu.com (out-189.mta0.migadu.com [91.218.175.189]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E5D96303A15; Mon, 26 Jan 2026 09:06:00 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418362; cv=none; b=DxE+jftZN7r6PpwWF9lCol/Iq6LgP6F28/F1/Cvi+kVX5hfywGtsbgN6eewTONZgZUFAKRnwZHL6owByfUnexVA/MXJNlbtrqU00ckGpnugoA1Y4yE+lTOJK6TbVytHXGTrjHZjI9su4VsD3rBQRVZIs5KNJD1IxPeVyoaKkSqo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418362; c=relaxed/simple; bh=9PcymUN+ZCg2e50rgIRHOfTXaJvfmhQQChu88MTol/Q=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Jl7+1weyEs9mCyJEGstPSJzceH3LtV8AUuc3oy2xSgqN/xjuYZ37vP3/cm8sh2+sYG7pU3g4fRJtppbLeFrZ1v5+VsZ5XEDThckiy2NqLpAs/Ao2uHG8qnMn2R91+KhYsuLh/UDTbznFC/r63a7T3tIh381zHvMzNd/aIdnbTZE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=nhoZcECC; arc=none smtp.client-ip=91.218.175.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="nhoZcECC" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418358; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=li2p2tnFJJjJ5INNx6uZKzhFQ5sqAQSEauna4Y+cr5o=; b=nhoZcECCwGjpkvlB0XDUMiiT/3p24XMdQu72shwwcl/4SxBv63js7lYNeTcl601Ge8RwH6 JQbnu8HSmWdzuFcWDoKaOiTHAMoyMCrqaPcJ7TPCm3+Tj++MB2Iap6lHrgC+Wg7N4bsrqf cmeLrz3N1x4sSM6ZS41Sfyc7IpNbUhk= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 07/12] libbpf: Support passing user-defined flags for struct_ops Date: Mon, 26 Jan 2026 17:04:50 +0800 Message-ID: <63d543aec9ade803afcd95461e3089e3d44caca6.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Building on the previous change that added flags to the kernel's link creation path, this patch 
exposes this functionality through libbpf. The `bpf_struct_ops_opts` struct is extended with a `flags` member, which is then passed to the `bpf_link_create` syscall within `bpf_map__attach_struct_ops_opts`. This enables userspace applications to pass flags, such as `BPF_F_ALLOW_OVERRIDE`, when attaching struct_ops to cgroups, providing more control over the attachment behavior in nested hierarchies. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 70a00da54ff5..06c936bad211 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13511,6 +13511,7 @@ struct bpf_link *bpf_map__attach_struct_ops_opts(co= nst struct bpf_map *map, } =20 link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + link_opts.flags =3D OPTS_GET(opts, flags, 0); =20 fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-188.mta0.migadu.com (out-188.mta0.migadu.com [91.218.175.188]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BA4763033E4 for ; Mon, 26 Jan 2026 09:06:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.188 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418421; cv=none; b=oURSGJAvJb4HoDycVyD2O+EqJ+CcE8EdSbBNWEdBrPhCB45xPrexHdHEQEmVt41tUzcpW8fGTOn01lKNH1KrswBzI9VNuqR+7djdvs1w/97GQLUkrL+30HZCcXviW/iP2qMKwPlujCRjeI4EptKT1gmXzvrdRHRQ/jOsOTntqFo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418421; c=relaxed/simple; bh=Z+i0d7jjTwoApqNcItRxT91eFDYxqkraXoyVc4DJ0CM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=buk0B9rTvemymEe4wWPHzPiLWwSFajarphNZFcfk97AvHKkP31nq+X+hbzIiZm+Ws1QPC2T+8UuA1yso06uB2FkOXeqwXDTiNF/5sd1gmJN6Lny+w50sUL43fMhzx42xjLGlEFXTbw3820Tvk86i6ET7UcEkBEta8xcltolC1IQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=XE6XnTal; arc=none smtp.client-ip=91.218.175.188 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="XE6XnTal" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418416; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=PJBPex93YEuCb0q0aG640wDPNK+oPOEy5TRS3CklQGM=; b=XE6XnTalRr+tHzAKerEujxtH92UoWQV96aU7xiFUw5HDoCWbML+oFBH3eOCEIzhlyRJgzs Zpk94XXd1OKpxkdV47fvznMeJJ5cN98NLC+898eg0nmC7LRLb2/JfN+93QfdbbhNhvbj3+ hP6F+anzlKBLsdddpescggP1QoczuWI= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 08/12] mm: memcontrol: Add BPF struct_ops for memory controller Date: Mon, 26 Jan 2026 17:06:27 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Introduce BPF struct_ops support to the memory controller, enabling custom and dynamic control over memory pressure. This is achieved through a new struct_ops type, `memcg_bpf_ops`. This new interface allows a BPF program to implement hooks that influence a memory cgroup's behavior. The `memcg_bpf_ops` struct provides the following hooks: - `get_high_delay_ms`: Returns a custom throttling delay in milliseconds for a cgroup that has breached its `memory.high` limit. This is the primary mechanism for BPF-driven throttling. - `below_low`: Overrides the `memory.low` protection check. If this hook returns true, the cgroup is considered to be protected by its `memory.low` setting, regardless of its actual usage. - `below_min`: Similar to `below_low`, this overrides the `memory.min` protection check. - `handle_cgroup_online`/`offline`: Callbacks invoked when a cgroup with an attached program comes online or goes offline, allowing for state management. This patch integrates these hooks into the core memory control logic. The `get_high_delay_ms` value is incorporated into charge paths like `try_charge_memcg` and the high-limit handler `__mem_cgroup_handle_over_high`. The `below_low` and `below_min` hooks are checked within their respective protection functions. 
Lifecycle management is handled to ensure BPF programs are correctly inherited by child cgroups and cleaned up on detachment. SRCU is used to protect concurrent access to the `memcg->bpf_ops` pointer. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 106 ++++++++++++++- mm/bpf_memcontrol.c | 255 ++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 32 +++-- 3 files changed, 380 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f3b8c71870d8..1083be5d0362 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,6 +181,37 @@ struct obj_cgroup { }; }; =20 +#ifdef CONFIG_BPF_SYSCALL +/** + * struct memcg_bpf_ops - BPF callbacks for memory cgroup operations + * @handle_cgroup_online: Called when a cgroup comes online + * @handle_cgroup_offline: Called when a cgroup goes offline + * @below_low: Override memory.low protection check. If this callback retu= rns + * true, mem_cgroup_below_low() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.low threshold comparison will proceed norma= lly. + * @below_min: Override memory.min protection check. If this callback retu= rns + * true, mem_cgroup_below_min() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.min threshold comparison will proceed norma= lly. + * @get_high_delay_ms: Return custom throttle delay in milliseconds + * + * This structure defines the interface for BPF programs to customize + * memory cgroup behavior through struct_ops programs. 
+ */ +struct memcg_bpf_ops { + void (*handle_cgroup_online)(struct mem_cgroup *memcg); + + void (*handle_cgroup_offline)(struct mem_cgroup *memcg); + + bool (*below_low)(struct mem_cgroup *memcg); + + bool (*below_min)(struct mem_cgroup *memcg); + + unsigned int (*get_high_delay_ms)(struct mem_cgroup *memcg); +}; +#endif /* CONFIG_BPF_SYSCALL */ + /* * The memory controller data structure. The memory controller controls bo= th * page cache and RSS per cgroup. We would eventually like to provide @@ -321,6 +352,10 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ =20 +#ifdef CONFIG_BPF_SYSCALL + struct memcg_bpf_ops *bpf_ops; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; =20 @@ -554,6 +589,66 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } =20 +#ifdef CONFIG_BPF_SYSCALL + +/* SRCU for protecting concurrent access to memcg->bpf_ops */ +extern struct srcu_struct memcg_bpf_srcu; + +/** + * BPF_MEMCG_CALL - Safely invoke a BPF memcg callback + * @memcg: The memory cgroup + * @op: The operation name (struct member) + * @default_val: Default return value if no BPF program attached + * + * This macro safely calls a BPF callback under SRCU protection. 
+ */ +#define BPF_MEMCG_CALL(memcg, op, default_val) ({ \ + typeof(default_val) __ret =3D (default_val); \ + struct memcg_bpf_ops *__ops; \ + int __idx; \ + \ + __idx =3D srcu_read_lock(&memcg_bpf_srcu); \ + __ops =3D READ_ONCE((memcg)->bpf_ops); \ + if (__ops && __ops->op) \ + __ret =3D __ops->op(memcg); \ + srcu_read_unlock(&memcg_bpf_srcu, __idx); \ + __ret; \ +}) + +static inline bool bpf_memcg_below_low(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_low, false); +} + +static inline bool bpf_memcg_below_min(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_min, false); +} + +static inline unsigned long bpf_memcg_get_high_delay(struct mem_cgroup *me= mcg) +{ + unsigned int ret; + + ret =3D BPF_MEMCG_CALL(memcg, get_high_delay_ms, 0U); + return msecs_to_jiffies(ret); +} + +#undef BPF_MEMCG_CALL + +extern void memcontrol_bpf_online(struct mem_cgroup *memcg); +extern void memcontrol_bpf_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ + +static inline unsigned long +bpf_memcg_get_high_delay(struct mem_cgroup *memcg) { return 0; } +static inline bool bpf_memcg_below_low(struct mem_cgroup *memcg) { retur= n false; } +static inline bool bpf_memcg_below_min(struct mem_cgroup *memcg) { retur= n false; } +static inline void memcontrol_bpf_online(struct mem_cgroup *memcg) { } +static inline void memcontrol_bpf_offline(struct mem_cgroup *memcg) { } + +#endif /* CONFIG_BPF_SYSCALL */ + static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, @@ -625,6 +720,9 @@ static inline bool mem_cgroup_below_low(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if (bpf_memcg_below_low(memcg)) + return true; + return READ_ONCE(memcg->memory.elow) >=3D page_counter_read(&memcg->memory); } @@ -635,6 +733,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if
(bpf_memcg_below_min(memcg)) + return true; + return READ_ONCE(memcg->memory.emin) >=3D page_counter_read(&memcg->memory); } @@ -909,12 +1010,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lr= uvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } =20 -void __mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask, + unsigned long bpf_high_delay); =20 static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) - __mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask, 0); } =20 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 716df49d7647..20c5c3552ce3 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -8,6 +8,9 @@ #include #include =20 +/* Protects memcg->bpf_ops pointer for read and write. */ +DEFINE_SRCU(memcg_bpf_srcu); + __bpf_kfunc_start_defs(); =20 /** @@ -179,15 +182,263 @@ static const struct btf_kfunc_id_set bpf_memcontrol_= kfunc_set =3D { .set =3D &bpf_memcontrol_kfuncs, }; =20 +/** + * memcontrol_bpf_online - Inherit BPF programs for a new online cgroup. + * @memcg: The memory cgroup that is coming online. + * + * When a new memcg is brought online, it inherits the BPF programs + * attached to its parent. This ensures consistent BPF-based memory + * control policies throughout the cgroup hierarchy. + * + * After inheriting, if the BPF program has an online handler, it is + * invoked for the new memcg. + */ +void memcontrol_bpf_online(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + struct mem_cgroup *parent_memcg; + + /* The root cgroup does not inherit from a parent. */ + if (mem_cgroup_is_root(memcg)) + return; + + parent_memcg =3D parent_mem_cgroup(memcg); + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + /* Inherit the BPF program from the parent cgroup. 
*/ + ops =3D READ_ONCE(parent_memcg->bpf_ops); + if (!ops) + goto out; + + WRITE_ONCE(memcg->bpf_ops, ops); + + /* + * If the BPF program implements it, call the online handler to + * allow the program to perform setup tasks for the new cgroup. + */ + if (!ops->handle_cgroup_online) + goto out; + + ops->handle_cgroup_online(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +/** + * memcontrol_bpf_offline - Run BPF cleanup for an offline cgroup. + * @memcg: The memory cgroup that is going offline. + * + * If a BPF program is attached and implements an offline handler, + * it is invoked to perform cleanup tasks before the memcg goes + * completely offline. + */ +void memcontrol_bpf_offline(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + ops =3D READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->handle_cgroup_offline) + goto out; + + ops->handle_cgroup_offline(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return -EACCES; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_t= ype type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops =3D { + .get_func_proto =3D bpf_base_func_proto, + .btf_struct_access =3D memcg_ops_btf_struct_access, + .is_valid_access =3D memcg_ops_is_valid_access, +}; + +static void cfi_handle_cgroup_online(struct mem_cgroup *memcg) +{ +} + +static void cfi_handle_cgroup_offline(struct mem_cgroup *memcg) +{ +} + +static bool cfi_below_low(struct mem_cgroup *memcg) +{ + return false; +} + +static bool cfi_below_min(struct mem_cgroup *memcg) +{ + return false; +} + +static unsigned int cfi_get_high_delay_ms(struct mem_cgroup *memcg) +{ + return 0; +} + 
+static struct memcg_bpf_ops cfi_bpf_memcg_ops =3D { + .handle_cgroup_online =3D cfi_handle_cgroup_online, + .handle_cgroup_offline =3D cfi_handle_cgroup_offline, + .below_low =3D cfi_below_low, + .below_min =3D cfi_below_min, + .get_high_delay_ms =3D cfi_get_high_delay_ms, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff =3D __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_bpf_ops, handle_cgroup_online): + break; + case offsetof(struct memcg_bpf_ops, handle_cgroup_offline): + break; + case offsetof(struct memcg_bpf_ops, below_low): + break; + case offsetof(struct memcg_bpf_ops, below_min): + break; + case offsetof(struct memcg_bpf_ops, get_high_delay_ms): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +/** + * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy. + * @memcg: The root of the cgroup hierarchy to clean. + * @ops: The specific ops struct to detach. If NULL, detach any ops. + * + * Iterates through all descendant cgroups of @memcg (including itself) + * and clears their bpf_ops pointer. This is used when a BPF program + * is detached or if attachment fails midway. 
+ */ +static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, + struct memcg_bpf_ops *ops) +{ + struct mem_cgroup *iter =3D NULL; + + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (ops) { + if (!WARN_ON(READ_ONCE(iter->bpf_ops) !=3D ops)) + WRITE_ONCE(iter->bpf_ops, NULL); + } else + WRITE_ONCE(iter->bpf_ops, NULL); + } +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg, *iter =3D NULL; + int err =3D 0; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + return memcg ? PTR_ERR(memcg) : -ENOENT; + + cgroup_lock(); + /* Check first so a failed attach never clears another link's ops. */ + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops)) { + mem_cgroup_iter_break(memcg, iter); + err =3D -EBUSY; + break; + } + } + while (!err && (iter =3D mem_cgroup_iter(memcg, iter, NULL))) + WRITE_ONCE(iter->bpf_ops, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + return err; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto out; + + cgroup_lock(); + clean_memcg_bpf_ops(memcg, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + +out: + synchronize_srcu(&memcg_bpf_srcu); +} + +static struct bpf_struct_ops bpf_memcg_bpf_ops =3D { + .verifier_ops =3D &bpf_memcg_verifier_ops, + .init =3D bpf_memcg_ops_init, + .check_member =3D bpf_memcg_ops_check_member, + .init_member =3D bpf_memcg_ops_init_member, + .reg =3D bpf_memcg_ops_reg, + .unreg =3D bpf_memcg_ops_unreg, + .name =3D "memcg_bpf_ops", + .owner =3D THIS_MODULE, + .cfi_stubs =3D &cfi_bpf_memcg_ops, +}; + static int __init
bpf_memcontrol_init(void) { - int err; + int err, err2; =20 err =3D register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if (err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); =20 - return err; + err2 =3D register_bpf_struct_ops(&bpf_memcg_bpf_ops, memcg_bpf_ops); + if (err2) + pr_warn("error while registering memcontrol bpf ops: %d", err2); + + return err ? err : err2; } late_initcall(bpf_memcontrol_init); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f74fce27677..8d90575aa77d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2252,7 +2252,8 @@ static unsigned long calculate_high_delay(struct mem_= cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void __mem_cgroup_handle_over_high(gfp_t gfp_mask) +void +__mem_cgroup_handle_over_high(gfp_t gfp_mask, unsigned long bpf_high_delay) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2294,11 +2295,15 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. 
*/ - penalty_jiffies =3D calculate_high_delay(memcg, nr_pages, - mem_find_max_overage(memcg)); + if (nr_pages) { + penalty_jiffies =3D calculate_high_delay( + memcg, nr_pages, mem_find_max_overage(memcg)); =20 - penalty_jiffies +=3D calculate_high_delay(memcg, nr_pages, - swap_find_max_overage(memcg)); + penalty_jiffies +=3D calculate_high_delay( + memcg, nr_pages, swap_find_max_overage(memcg)); + } else + penalty_jiffies =3D 0; + penalty_jiffies =3D max(penalty_jiffies, bpf_high_delay); =20 /* * Clamp the max delay per usermode return so as to still keep the @@ -2356,6 +2361,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, bool raised_max_event =3D false; unsigned long pflags; bool allow_spinning =3D gfpflags_allow_spinning(gfp_mask); + struct mem_cgroup *orig_memcg; =20 retry: if (consume_stock(memcg, nr_pages)) @@ -2481,6 +2487,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); =20 + orig_memcg =3D memcg; /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here @@ -2530,10 +2537,14 @@ static int try_charge_memcg(struct mem_cgroup *memc= g, gfp_t gfp_mask, * kernel. If this is successful, the return path will see it * when it rechecks the overage and simply bail out. 
*/ - if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && - !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) - __mem_cgroup_handle_over_high(gfp_mask); + if (gfpflags_allow_blocking(gfp_mask)) { + unsigned long bpf_high_delay; + + bpf_high_delay =3D bpf_memcg_get_high_delay(orig_memcg); + if (bpf_high_delay || + current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH) + __mem_cgroup_handle_over_high(gfp_mask, bpf_high_delay); + } return 0; } =20 @@ -3906,6 +3917,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys= _state *css) */ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); =20 + memcontrol_bpf_online(memcg); + return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -3925,6 +3938,7 @@ static void mem_cgroup_css_offline(struct cgroup_subs= ys_state *css) =20 zswap_memcg_offline_cleanup(memcg); =20 + memcontrol_bpf_offline(memcg); memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-173.mta0.migadu.com (out-173.mta0.migadu.com [91.218.175.173]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 71CE5303A0A for ; Mon, 26 Jan 2026 09:07:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.173 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418434; cv=none; b=JsDfg6Ppws+IVdgsGRoiEr/pdvFDSVopEr3XS9lr9bOAdpPJGmeaIT71QdA2QXSFfVSlDni6BNHVXSbYOJ3dweuW1siw6jIEIe7khHwLsperhC3NXfoLIC1HxEwkOC66xctVUFHhiNfCeEwXXfqkDXYIXJhxp7cCr87LQZkjc4o= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418434; c=relaxed/simple; bh=s+L/CEQLYB8zlS5YovGGFHaf+77ukYZA+DGQM0Wdzuw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=D3uO0erZ5YYAhbcMmihDmZJDHIFJUKnJBdApL+yyNO+fc++CCTQI/WhhTU26pNftkPNcMBvux0/c9lfyRWtG36v8lHOQkjko73UTX4RaASvwQBoNCi7+TZTqQEETaBQV6DudaxtB3OhRpbFAkIXNopCGvuStKE9eGtjurTCMr1w= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=u1ddRi92; arc=none smtp.client-ip=91.218.175.173 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="u1ddRi92" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418429; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=GO/LF+sPicNNy1cE+LoxDvdgC7eo9UT04E6MeiwBez4=; b=u1ddRi924XTVQ+A0lUwrQVJK0OMM4lcoOYsKWqaRhdCNnN7W/M3T4Kx2l81AscnLKseugY CCvH9srNWYy8hdcuU5h7hOHKh53Nd5dlaSriTGLqO6gC+xCp3KntleRh702UaHWe021O+d LCBfutNU8u2v8R92ALMG9M5/6AqUETY= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 09/12] selftests/bpf: Add tests for memcg_bpf_ops Date: Mon, 26 Jan 2026 17:06:28 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a comprehensive selftest suite for the `memcg_bpf_ops` functionality. These tests validate that BPF programs can correctly influence memory cgroup throttling behavior by implementing the new hooks. The test suite is added in `prog_tests/memcg_ops.c` and covers several key scenarios: 1. `test_memcg_ops_over_high`: Verifies that a BPF program can trigger throttling on a low-priority cgroup by returning a delay from the `get_high_delay_ms` hook when a high-priority cgroup is under pressure. 2. `test_memcg_ops_below_low_over_high`: Tests the combination of the `below_low` and `get_high_delay_ms` hooks, ensuring they work together as expected. 3. `test_memcg_ops_below_min_over_high`: Validates the interaction between the `below_min` and `get_high_delay_ms` hooks. The test framework sets up a cgroup hierarchy with high and low priority groups, attaches BPF programs, runs memory-intensive workloads, and asserts that the observed throttling (measured by workload execution time) matches expectations. The BPF program (`progs/memcg_ops.c`) uses a tracepoint on `memcg:count_memcg_events` (specifically PGFAULT) to detect memory pressure and trigger the appropriate hooks in response. This test suite provides essential validation for the new memory control mechanisms. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + .../selftests/bpf/prog_tests/memcg_ops.c | 541 ++++++++++++++++++ tools/testing/selftests/bpf/progs/memcg_ops.c | 129 +++++ 3 files changed, 672 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/MAINTAINERS b/MAINTAINERS index 491d567f7dc8..7e07bb330eae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6471,6 +6471,8 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 000000000000..a019ba2387bc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg_ops.skel.h" + +#define TRIGGER_THRESHOLD 1 +#define OVER_HIGH_MS 2000 +#define FILE_SIZE (64 * 1024 * 1024ul) +#define BUFFER_SIZE (4096) +#define CG_LIMIT (120 * 1024 * 1024ul) + +#define CG_DIR "/memcg_ops_test" +#define CG_HIGH_DIR CG_DIR "/high" +#define CG_LOW_DIR CG_DIR "/low" + +static int +setup_cgroup(int *high_cgroup_id, int *low_cgroup_fd, int *high_cgroup_fd) +{ + int ret; + char limit_buf[20]; + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + ret =3D 
create_and_get_cgroup(CG_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR)) + goto cleanup; + close(ret); + ret =3D enable_controllers(CG_DIR, "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + snprintf(limit_buf, 20, "%ld", CG_LIMIT); + ret =3D write_cgroup_file(CG_DIR, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file memory.max")) + goto cleanup; + ret =3D write_cgroup_file(CG_DIR, "memory.swap.max", "0"); + if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR)) + goto cleanup; + if (high_cgroup_fd) + *high_cgroup_fd =3D ret; + else + close(ret); + ret =3D (int)get_cgroup_id(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "get_cgroup_id")) + goto cleanup; + *high_cgroup_id =3D ret; + + ret =3D create_and_get_cgroup(CG_LOW_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR)) + goto cleanup; + if (low_cgroup_fd) + *low_cgroup_fd =3D ret; + else + close(ret); + + return 0; + +cleanup: + cleanup_cgroup_environment(); + return -1; +} + +int write_file(const char *filename) +{ + int ret =3D -1; + size_t written =3D 0; + char *buffer; + FILE *fp; + + fp =3D fopen(filename, "wb"); + if (!fp) + goto out; + + buffer =3D malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t to_write =3D (FILE_SIZE - written < BUFFER_SIZE) ? 
+ (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) !=3D to_write) + goto cleanup; + written +=3D to_write; + } + + ret =3D 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +int read_file(const char *filename, int iterations) +{ + int ret =3D -1; + long page_size =3D sysconf(_SC_PAGESIZE); + char *p; + char *map; + size_t i; + int fd; + struct stat sb; + + fd =3D open(filename, O_RDONLY); + if (fd =3D=3D -1) + goto out; + + if (fstat(fd, &sb) =3D=3D -1) + goto cleanup_fd; + + if (sb.st_size !=3D FILE_SIZE) { + fprintf(stderr, "File size mismatch: expected %ld, got %ld\n", + FILE_SIZE, sb.st_size); + goto cleanup_fd; + } + + map =3D mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + if (map =3D=3D MAP_FAILED) + goto cleanup_fd; + + for (int iter =3D 0; iter < iterations; iter++) { + for (i =3D 0; i < FILE_SIZE; i +=3D page_size) { + /* access a byte to trigger page fault */ + p =3D &map[i]; + __asm__ __volatile__("" : : "r"(p) : "memory"); + } + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d %d done\n", __func__, getpid(), iter); + } + + if (munmap(map, FILE_SIZE) =3D=3D -1) + goto cleanup_fd; + + ret =3D 0; + +cleanup_fd: + close(fd); +out: + return ret; +} + +static void +real_test_memcg_ops_child_work(const char *cgroup_path, + char *data_filename, + char *time_filename, + int read_times) +{ + struct timeval start, end; + double elapsed; + FILE *fp; + + if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup")) + return; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d begin\n", __func__, getpid()); + + gettimeofday(&start, NULL); + + if (!ASSERT_OK(write_file(data_filename), "write_file")) + return; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d write_file done\n", __func__, getpid()); + + if (!ASSERT_OK(read_file(data_filename, read_times), "read_file")) + return; + + gettimeofday(&end, NULL); + + elapsed =3D (end.tv_sec - start.tv_sec) + + (end.tv_usec 
- start.tv_usec) / 1000000.0; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d end %.6f\n", __func__, getpid(), elapsed); + + fp =3D fopen(time_filename, "w"); + if (!ASSERT_OK_PTR(fp, "fopen")) + return; + fprintf(fp, "%.6f", elapsed); + fclose(fp); +} + +static int get_time(char *time_filename, double *time) +{ + int ret =3D -1; + FILE *fp; + char buf[64]; + + fp =3D fopen(time_filename, "r"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + + if (!ASSERT_OK_PTR(fgets(buf, sizeof(buf), fp), "fgets")) + goto cleanup; + + if (sscanf(buf, "%lf", time) !=3D 1) { + PRINT_FAIL("sscanf %s", buf); + goto cleanup; + } + + ret =3D 0; +cleanup: + fclose(fp); +out: + return ret; +} + +static void real_test_memcg_ops(int read_times) +{ + int ret; + char data_file1[] =3D "/tmp/test_data_XXXXXX"; + char data_file2[] =3D "/tmp/test_data_XXXXXX"; + char time_file1[] =3D "/tmp/test_time_XXXXXX"; + char time_file2[] =3D "/tmp/test_time_XXXXXX"; + pid_t pid1, pid2; + double time1, time2; + + ret =3D mkstemp(data_file1); + if (!ASSERT_GE(ret, 0, "mkstemp")) + return; + close(ret); + ret =3D mkstemp(data_file2); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + ret =3D mkstemp(time_file1); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_data_file2; + close(ret); + ret =3D mkstemp(time_file2); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_time_file1; + close(ret); + + pid1 =3D fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid1 =3D=3D 0) { + real_test_memcg_ops_child_work(CG_LOW_DIR, + data_file1, + time_file1, + read_times); + exit(0); + } + + pid2 =3D fork(); + if (!ASSERT_GE(pid2, 0, "fork")) + goto cleanup; + if (pid2 =3D=3D 0) { + real_test_memcg_ops_child_work(CG_HIGH_DIR, + data_file2, + time_file2, + read_times); + exit(0); + } + + ret =3D waitpid(pid1, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + ret =3D waitpid(pid2, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + if
(get_time(time_file1, &time1)) + goto cleanup; + + if (get_time(time_file2, &time2)) + goto cleanup; + + if (time1 < time2 || time1 - time2 <=3D 1) + PRINT_FAIL("low fast compare time1=3D%f, time2=3D%f", + time1, time2); + +cleanup: + unlink(time_file2); +cleanup_time_file1: + unlink(time_file1); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +} + +void test_memcg_ops_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link2 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, NULL); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if 
(!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + + opts.relative_fd =3D low_cgroup_fd; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(5); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_low_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D true; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); 
+ if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_min_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, 
"bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D true; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/= 
selftests/bpf/progs/memcg_ops.c new file mode 100644 index 000000000000..44087a206a61 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/memcg_ops.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + u64 current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int 
below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + return need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-171.mta0.migadu.com (out-171.mta0.migadu.com [91.218.175.171]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8E205305E19; Mon, 26 Jan 2026 09:07:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.171 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418446; cv=none; b=tXDyF9iejJjrG2f38ukpkmpeGKMTzm7esQzVsuvAXSQRUwzh4KjouwixjdG2SBp3VTJuw8Ki+jvaz28aaOEjcNBXp0y2hKwDtZEQYCPC4LL3CTOKELQwmIElEWFgUxWTk1PRGCk077GLrHXy/P6wG460GQCKMVld0HcX05k8NKg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418446; c=relaxed/simple; bh=QbKrL9O1TZRDadE07+BX10Nasmf/eESTbhoxmYVZSYU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Q4wo6LT6H+t9sG+YHgb4C+R4YmVVjelyUFESWZrlMcZtMl8MNcU7NQaFIPyeaBSIJISj3x6PeCXvi4X/RH3wN4SBhNSgU1yppQveeQJBJ6zRx3LE0BzEqWIT1pfzkVxVzzcawk8mtDKxwRaNcEL7XpDvXqCZNtA3AxP29DMr46o= ARC-Authentication-Results: i=1; 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=lTtXD/LO; arc=none smtp.client-ip=91.218.175.171 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="lTtXD/LO" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418442; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=k0Dc6FifkDwPpG09rd3OftsrldqK8TRlWZhlATE0DkY=; b=lTtXD/LOh3+IIxcd+dM/dMSFWAaGTq5gjHRoiDFFdavMxM7L6EGNrEf0lErtM5ghv2fBme Ip0pPMjC2i51hLQLRC/Dg2BHUOvqmmxQ5KffNQlcavmQn77TLycm+sAIeSV4UoSkeaPraa DrmsMzn84Btk59jaJ2L8npT332it7K4= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, 
linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 10/12] mm/bpf: Add BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops Date: Mon, 26 Jan 2026 17:06:29 +0800 Message-ID: <443511ca5d83a01d9f7f14c9548dea41ea485aab.1769417588.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To allow for more flexible attachment policies in nested cgroup hierarchies, this patch introduces support for the `BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`. When a `memcg_bpf_ops` is attached to a cgroup with this flag, it permits child cgroups to attach their own, different `memcg_bpf_ops`, overriding the parent's inherited program. Without this flag, attaching a BPF program to a cgroup that already has one (either directly or via inheritance) will fail. The implementation involves: - Adding a `bpf_ops_flags` field to `struct mem_cgroup`. - During registration (`bpf_memcg_ops_reg`), checking for existing programs and the `BPF_F_ALLOW_OVERRIDE` flag. - During unregistration (`bpf_memcg_ops_unreg`), correctly restoring the parent's BPF program to the cgroup hierarchy. - Ensuring flags are inherited by child cgroups during online events. This change enables complex, multi-level policy enforcement where different subtrees of the cgroup hierarchy can have distinct memory management BPF programs. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 1 + mm/bpf_memcontrol.c | 83 ++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1083be5d0362..6e15da44ba35 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,6 +354,7 @@ struct mem_cgroup { =20 #ifdef CONFIG_BPF_SYSCALL struct memcg_bpf_ops *bpf_ops; + u32 bpf_ops_flags; #endif =20 struct mem_cgroup_per_node *nodeinfo[]; diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 20c5c3552ce3..756a7d4eb4e3 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg) goto out; =20 WRITE_ONCE(memcg->bpf_ops, ops); + memcg->bpf_ops_flags =3D parent_memcg->bpf_ops_flags; =20 /* * If the BPF program implements it, call the online handler to @@ -340,52 +341,54 @@ static int bpf_memcg_ops_init_member(const struct btf= _type *t, return 0; } =20 -/** - * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy. - * @memcg: The root of the cgroup hierarchy to clean. - * @ops: The specific ops struct to detach. If NULL, detach any ops. - * - * Iterates through all descendant cgroups of @memcg (including itself) - * and clears their bpf_ops pointer. This is used when a BPF program - * is detached or if attachment fails midway. 
- */ -static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, - struct memcg_bpf_ops *ops) -{ - struct mem_cgroup *iter =3D NULL; - - while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { - if (ops) { - if (!WARN_ON(READ_ONCE(iter->bpf_ops) !=3D ops)) - WRITE_ONCE(iter->bpf_ops, NULL); - } else - WRITE_ONCE(iter->bpf_ops, NULL); - } -} - static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_struct_ops_link *ops_link =3D container_of(link, struct bpf_struct_ops_link, link); - struct memcg_bpf_ops *ops =3D kdata; + struct memcg_bpf_ops *ops =3D kdata, *parent_ops =3D NULL; struct mem_cgroup *memcg, *iter =3D NULL; int err =3D 0; =20 + if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) { + pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n"); + return -EOPNOTSUPP; + } + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (IS_ERR_OR_NULL(memcg)) return PTR_ERR(memcg); =20 cgroup_lock(); - while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { - if (READ_ONCE(iter->bpf_ops)) { - mem_cgroup_iter_break(memcg, iter); + + if (READ_ONCE(memcg->bpf_ops)) { + /* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */ + if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) { + iter =3D parent_mem_cgroup(memcg); + if (!iter || READ_ONCE(iter->bpf_ops) !=3D + READ_ONCE(memcg->bpf_ops)) + goto busy_out; + + parent_ops =3D READ_ONCE(memcg->bpf_ops); + } else { +busy_out: err =3D -EBUSY; - break; + goto unlock_out; + } + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + struct memcg_bpf_ops *iter_ops =3D READ_ONCE(iter->bpf_ops); + + if (iter_ops && iter_ops !=3D parent_ops) { + /* cannot override existing bpf_ops of sub-cgroup. 
*/ + continue; } WRITE_ONCE(iter->bpf_ops, ops); + iter->bpf_ops_flags =3D ops_link->flags; } - if (err) - clean_memcg_bpf_ops(memcg, NULL); + +unlock_out: cgroup_unlock(); =20 mem_cgroup_put(memcg); @@ -399,13 +402,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct b= pf_link *link) =3D container_of(link, struct bpf_struct_ops_link, link); struct memcg_bpf_ops *ops =3D kdata; struct mem_cgroup *memcg; + struct mem_cgroup *iter; + struct memcg_bpf_ops *parent_bpf_ops =3D NULL; + u32 parent_bpf_ops_flags =3D 0; =20 memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (IS_ERR_OR_NULL(memcg)) goto out; =20 cgroup_lock(); - clean_memcg_bpf_ops(memcg, ops); + + /* Get the parent bpf_ops and bpf_ops_flags */ + iter =3D parent_mem_cgroup(memcg); + if (iter) { + parent_bpf_ops =3D READ_ONCE(iter->bpf_ops); + parent_bpf_ops_flags =3D iter->bpf_ops_flags; + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops) =3D=3D ops) { + WRITE_ONCE(iter->bpf_ops, parent_bpf_ops); + iter->bpf_ops_flags =3D parent_bpf_ops_flags; + } + } + cgroup_unlock(); =20 mem_cgroup_put(memcg); --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-179.mta0.migadu.com (out-179.mta0.migadu.com [91.218.175.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C6A5B3033EF for ; Mon, 26 Jan 2026 09:07:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418460; cv=none; b=FJrJqFtyqCzOEUgxoCkvCg3L9XGGTVeUvYJAz/mmEFk0CRCreQIV+mEUHUI7YNpEacAIRYAKaN/wqOb45b+8DOxRZ9sWhD3skyIk0CEwp5fa31dZ6qfkP85Z2iM3uExpgpnyA9sy884FtO3NSnavZYuOEHAXxT+OQoXPAU3b+nI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418460; c=relaxed/simple; 
bh=ghcfJmBYHqy6lBA1ot0FJknIHlXvYfjeiaWHDarsR+s=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=RI5+/g/B87Z7o2hnYEjcG9nO0bn3QVVtvAS7DNHbdNQka10s/pqx1nML59qcL54Pm2rmK1VpJ9B5szNTnfDLfYxDQRFkoRY8WZNQPGKWhcirWSfZ6Mn6jiQ+lNGSHELV/LAvskl2KZpP2Hpuu4LAzS5DBTQEYtotEVV89C9du00= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=mFV6/iB9; arc=none smtp.client-ip=91.218.175.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="mFV6/iB9" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418456; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=3K5a8wCTBgErPuuh83LiBeWSXPRqSOI9kf2LngSju4s=; b=mFV6/iB9GZFGAUVD5NXXZ1rRRprzj3ohPE1BvwYPk5ov/lYjaaH+/gOu9fmlLzrA/86Pb/ dDAZ9Php8zxcHhVoiSnlDFzeS3F1mjKEfKXJO53mcSkb1/J34M0lYogM8AN6EmSa5DiPT5 t2hx/+v0iiTD4B6NkwmrUtF4Xnx8iJ8= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , 
Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 11/12] selftests/bpf: Add test for memcg_bpf_ops hierarchies Date: Mon, 26 Jan 2026 17:06:30 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a new selftest, `test_memcg_ops_hierarchies`, to validate the behavior of attaching `memcg_bpf_ops` in a nested cgroup hierarchy, specifically testing the `BPF_F_ALLOW_OVERRIDE` flag. The test case performs the following steps: 1. Creates a three-level deep cgroup hierarchy: `/cg`, `/cg/cg`, and `/cg/cg/cg`. 2. Attaches a BPF struct_ops to the top-level cgroup (`/cg`) with the `BPF_F_ALLOW_OVERRIDE` flag. 3. Successfully attaches a new struct_ops to the middle cgroup (`/cg/cg`) without the flag, overriding the inherited one. 4. Asserts that attaching another struct_ops to the deepest cgroup (`/cg/cg/cg`) fails with -EBUSY, because its parent did not specify `BPF_F_ALLOW_OVERRIDE`. This test ensures that the attachment logic correctly enforces the override rules across a cgroup subtree. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- .../selftests/bpf/prog_tests/memcg_ops.c | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c index a019ba2387bc..08ac97752ac9 100644 --- a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -539,3 +539,73 @@ void test_memcg_ops_below_min_over_high(void) close(low_cgroup_fd); cleanup_cgroup_environment(); } + +void test_memcg_ops_hierarchies(void) +{ + int ret, first =3D -1, second =3D -1, third =3D -1; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct bpf_link *link1 =3D NULL, *link2 =3D NULL, *link3 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + first =3D create_and_get_cgroup("/cg"); + if (!ASSERT_GE(first, 0, "create_and_get_cgroup /cg")) + goto cleanup; + ret =3D enable_controllers("/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + second =3D create_and_get_cgroup("/cg/cg"); + if (!ASSERT_GE(second, 0, "create_and_get_cgroup /cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + third =3D create_and_get_cgroup("/cg/cg/cg"); + if (!ASSERT_GE(third, 0, "create_and_get_cgroup /cg/cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto cleanup; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto cleanup; + + opts.relative_fd =3D first; + opts.flags =3D BPF_F_ALLOW_OVERRIDE; + link1 =3D
bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link1, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D second; + opts.flags =3D 0; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D third; + opts.flags =3D 0; + link3 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_ERR_PTR(link3, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(link3); + memcg_ops__destroy(skel); + close(first); + close(second); + close(third); + cleanup_cgroup_environment(); +} --=20 2.43.0 From nobody Sat Feb 7 15:11:14 2026 Received: from out-188.mta0.migadu.com (out-188.mta0.migadu.com [91.218.175.188]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 492123033F0 for ; Mon, 26 Jan 2026 09:07:54 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.188 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418477; cv=none; b=JFqfi2x0kVjkxDhZUfOxX1umxm9nGRxNirYw+xp4Yzv+rS+Rck4Ki/XVbN3nBLuJlsGm2U401V2JdJxS7X0AEjYzn5IR5hlpDGrTUaMr3zDz/j1u9jQYCTotNbjdLZN/HTzGRKHJd20RJdPCyV43BhqJo9zxK3LJdaG5x7Xosl8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769418477; c=relaxed/simple; bh=AyXJSg3lOrShVN9HheV2o8D32FAZa7ZdE4mnDpX45S0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=puO7CIjl0zuL/pu4GVYSnkGG4qRpvETwiVbYJ/dcB0XHd3V3N1V6t3GMqCWHTr/k0p8mzTJmd4ZZyy7L/mUxmOhGabKeupGuao9qVFp9A6PsImUNYVF9pd7FBYCgSaAEy+F4DiXdCJJtiaLY3lBNeQYBXVtTJ0viPchZYc3spcY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key)
header.d=linux.dev header.i=@linux.dev header.b=copk9EMd; arc=none smtp.client-ip=91.218.175.188 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="copk9EMd" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769418472; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=7g40Scui5AeSn6g7k+CN7+mSNdz8zvlZncZCV88NKxg=; b=copk9EMdoTHKQbMB3bhBYhXGwamypqLcWjDrVVdI7OAt1kcaier/EpCwd0reXZhKMzXwHb TSNpkaVgC5tWZb0CqfTNel02gPK7eyNnRHy9W7mtzwfEGXNExd589AiIzCd+M8oLblM5lA 8lyEvs3htVy/+iHnCqIN2c3ElghCnBI= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v4 12/12] samples/bpf: Add memcg priority control 
example Date: Mon, 26 Jan 2026 17:06:31 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a sample program to demonstrate a practical use case for the `memcg_bpf_ops` feature: priority-based memory throttling. The sample consists of a BPF program and a userspace loader: 1. memcg.bpf.c: A BPF program that monitors PGFAULT events on a high-priority cgroup. When activity exceeds a threshold, it uses the `get_high_delay_ms` hook to apply pressure on a low-priority cgroup, and the `below_low` or `below_min` hooks on the high-priority cgroup itself. 2. memcg.c: A userspace loader that configures and attaches the BPF program. It takes command-line arguments for the high and low priority cgroup paths, a pressure threshold, and the desired throttling delay (`over_high_ms`). This provides a clear, working example of how to implement a dynamic, priority-aware memory management policy. A user can create two cgroups, run workloads of different priorities, and observe the low-priority workload being throttled to protect the high-priority one. 
Example usage: # ./memcg --low_path /sys/fs/cgroup/low \ # --high_path /sys/fs/cgroup/high \ # --threshold 100 --over_high_ms 1024 Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 8 +- samples/bpf/memcg.bpf.c | 129 ++++++++++++++++ samples/bpf/memcg.c | 327 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/memcg.bpf.c create mode 100644 samples/bpf/memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 7e07bb330eae..819ef271e011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6470,6 +6470,8 @@ F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c +F: samples/bpf/memcg.bpf.c +F: samples/bpf/memcg.c F: samples/cgroup/* F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c F: tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0002cd359fb1..0de6569cdefd 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -49,3 +49,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +memcg diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 95a4fa1f1e44..b00698bdc53b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y +=3D xdp_fwd tprogs-y +=3D task_fd_query tprogs-y +=3D ibumad tprogs-y +=3D hbm +tprogs-y +=3D memcg =20 # Libbpf dependencies LIBBPF_SRC =3D $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y +=3D task_fd_query_kern.o always-y +=3D ibumad_kern.o always-y +=3D hbm_out_kern.o always-y +=3D hbm_edt_kern.o +always-y +=3D memcg.bpf.o =20 COMMON_CFLAGS =3D $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS =3D $(TPROGS_USER_LDFLAGS) @@ -289,6 +291,8 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h =20 +memcg: $(obj)/memcg.skel.h + # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # 
TPROGS_CFLAGS causes conflicts XDP_SAMPLE_CFLAGS +=3D -Wall -O2 \ @@ -347,11 +351,13 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src= )/xdp_sample.bpf.h $(src)/x -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ =20 -LINKED_SKELS :=3D xdp_router_ipv4.skel.h +LINKED_SKELS :=3D xdp_router_ipv4.skel.h memcg.skel.h clean-files +=3D $(LINKED_SKELS) =20 xdp_router_ipv4.skel.h-deps :=3D xdp_router_ipv4.bpf.o xdp_sample.bpf.o =20 +memcg.skel.h-deps :=3D memcg.bpf.o + LINKED_BPF_SRCS :=3D $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SK= ELS),$($(skel)-deps))) =20 BPF_SRCS_LINKED :=3D $(notdir $(wildcard $(src)/*.bpf.c)) diff --git a/samples/bpf/memcg.bpf.c b/samples/bpf/memcg.bpf.c new file mode 100644 index 000000000000..44087a206a61 --- /dev/null +++ b/samples/bpf/memcg.bpf.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + 
data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + u64 current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + return need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; diff --git a/samples/bpf/memcg.c b/samples/bpf/memcg.c new file mode 100644 index 000000000000..6d59a722f581 --- /dev/null +++ b/samples/bpf/memcg.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __MEMCG_RSTAT_SIMPLE_BPF_SKEL_H__ +#define u64 uint64_t +#endif + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool 
use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg.skel.h" + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting =3D true; +} + +static void usage(char *name) +{ + fprintf(stderr, + "Usage: %s --low_path=3D --high_path=3D \\\n" + " --threshold=3D [OPTIONS]\n\n", + name); + fprintf(stderr, "Required arguments:\n"); + fprintf(stderr, + " -l, --low_path=3DPATH Low priority memcgroup path\n"); + fprintf(stderr, + " -g, --high_path=3DPATH High priority memcgroup path\n"); + fprintf(stderr, + " -t, --threshold=3DVALUE The sum of 'val' PGSCAN of\n"); + fprintf(stderr, + " high priority memcgroup in\n"); + fprintf(stderr, + " 1 sec to trigger low priority\n"); + fprintf(stderr, + " cgroup over_high\n\n"); + fprintf(stderr, "Optional arguments:\n"); + fprintf(stderr, " -o, --over_high_ms=3DVALUE\n"); + fprintf(stderr, + " Low_path over_high_ms value\n"); + fprintf(stderr, + " (default: 0)\n"); + fprintf(stderr, " -L, --use_below_low Enable use_below_low flag\n"); + fprintf(stderr, " -M, --use_below_min Enable use_below_min flag\n"); + fprintf(stderr, + " -O, --allow_override Enable BPF_F_ALLOW_OVERRIDE\n"); + fprintf(stderr, + " flag\n"); + fprintf(stderr, " -h, --help Show this help message\n\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " # Using long options:\n"); + fprintf(stderr, " %s --low_path=3D/sys/fs/cgroup/low \\\n", name); + fprintf(stderr, " --high_path=3D/sys/fs/cgroup/high \\\n"); + fprintf(stderr, " --threshold=3D1000 --over_high_ms=3D500 \\\n" + " --use_below_low\n\n"); + fprintf(stderr, " # Using short options:\n"); + fprintf(stderr, " %s -l /sys/fs/cgroup/low \\\n" + " -g /sys/fs/cgroup/high \\\n", + name); + fprintf(stderr, " -t 1000 -o 500 -L -M\n"); +} + +static uint64_t get_cgroup_id(const char *cgroup_path) +{ + struct stat st; + + if (cgroup_path =3D=3D NULL) { + fprintf(stderr, "Error: cgroup_path is NULL\n"); + return 0; + } + + if (stat(cgroup_path, &st) < 0) { + 
fprintf(stderr, "Error: stat(%s) failed: %d\n", + cgroup_path, errno); + return 0; + } + + return (uint64_t)st.st_ino; +} + +int main(int argc, char **argv) +{ + int low_cgroup_fd =3D -1, high_cgroup_fd =3D -1; + uint64_t threshold =3D 0, high_cgroup_id; + unsigned int over_high_ms =3D 0; + bool use_below_low =3D false, use_below_min =3D false; + __u32 opts_flags =3D 0; + const char *low_path =3D NULL; + const char *high_path =3D NULL; + const char *bpf_obj_file =3D "memcg.bpf.o"; + struct bpf_object *obj =3D NULL; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_low =3D NULL, *link_high =3D NULL; + struct bpf_map *map; + struct memcg__bss *bss_data; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int err =3D -EINVAL; + int map_fd; + int opt; + int option_index =3D 0; + + static struct option long_options[] =3D { + {"low_path", required_argument, 0, 'l'}, + {"high_path", required_argument, 0, 'g'}, + {"threshold", required_argument, 0, 't'}, + {"over_high_ms", required_argument, 0, 'o'}, + {"use_below_low", no_argument, 0, 'L'}, + {"use_below_min", no_argument, 0, 'M'}, + {"allow_override", no_argument, 0, 'O'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0 } + }; + + while ((opt =3D getopt_long(argc, argv, "l:g:t:o:LMOh", + long_options, &option_index)) !=3D -1) { + switch (opt) { + case 'l': + low_path =3D optarg; + break; + case 'g': + high_path =3D optarg; + break; + case 't': + threshold =3D strtoull(optarg, NULL, 10); + break; + case 'o': + over_high_ms =3D strtoull(optarg, NULL, 10); + break; + case 'L': + use_below_low =3D true; + break; + case 'M': + use_below_min =3D true; + break; + case 'O': + opts_flags =3D BPF_F_ALLOW_OVERRIDE; + break; + case 'h': + usage(argv[0]); + return 0; + default: + usage(argv[0]); + return -EINVAL; + } + } + + if (!low_path || !high_path || !threshold) { + fprintf(stderr, + "ERROR: Missing required arguments\n\n"); + usage(argv[0]); + goto out; + } + + low_cgroup_fd =3D open(low_path, 
O_RDONLY); + if (low_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open low cgroup '%s' failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + + high_cgroup_id =3D get_cgroup_id(high_path); + if (!high_cgroup_id) + goto out; + high_cgroup_fd =3D open(high_path, O_RDONLY); + if (high_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open high cgroup '%s' failed: %d\n", + high_path, errno); + err =3D -errno; + goto out; + } + + obj =3D bpf_object__open_file(bpf_obj_file, NULL); + err =3D libbpf_get_error(obj); + if (err) { + fprintf(stderr, + "ERROR: opening BPF object file '%s' failed: %d\n", + bpf_obj_file, err); + goto out; + } + + map =3D bpf_object__find_map_by_name(obj, ".bss"); + if (!map) { + fprintf(stderr, "ERROR: Failed to find .data map\n"); + err =3D -ESRCH; + goto out; + } + + err =3D bpf_object__load(obj); + if (err) { + fprintf(stderr, + "ERROR: loading BPF object file failed: %d\n", + err); + goto out; + } + + map_fd =3D bpf_map__fd(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (bss_data) { + __u32 key =3D 0; + + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D threshold; + bss_data->local_config.over_high_ms =3D over_high_ms; + bss_data->local_config.use_below_low =3D use_below_low; + bss_data->local_config.use_below_min =3D use_below_min; + + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (err) { + fprintf(stderr, + "ERROR: update config failed: %d\n", + err); + goto out; + } + } else { + fprintf(stderr, + "ERROR: allocate memory failed\n"); + err =3D -ENOMEM; + goto out; + } + + prog =3D bpf_object__find_program_by_name(obj, + "handle_count_memcg_events"); + if (!prog) { + fprintf(stderr, + "ERROR: finding a prog in BPF object file failed\n"); + goto out; + } + + link =3D bpf_program__attach(prog); + err =3D libbpf_get_error(link); + if (err) { + fprintf(stderr, + "ERROR: 
bpf_program__attach failed: %d\n", + err); + goto out; + } + + if (over_high_ms) { + map =3D bpf_object__find_map_by_name(obj, "low_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find low_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D low_cgroup_fd, + ); + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_low) { + fprintf(stderr, + "Failed to attach struct ops low_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + if (use_below_low || use_below_min) { + map =3D bpf_object__find_map_by_name(obj, "high_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find high_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D high_cgroup_fd, + ); + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_high) { + fprintf(stderr, + "Failed to attach struct ops high_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + printf("Successfully attached!\n"); + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + while (!exiting) + pause(); + + printf("Exiting...\n"); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_low); + bpf_link__destroy(link_high); + bpf_object__close(obj); + close(low_cgroup_fd); + close(high_cgroup_fd); + return err; +} --=20 2.43.0