From nobody Mon Jun 8 22:51:09 2026 Received: from out-181.mta1.migadu.com (out-181.mta1.migadu.com [95.215.58.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B6F0E2F7F14; Tue, 26 May 2026 02:21:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.181 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762068; cv=none; b=YIl2B6bQL7XxbRvmh4cFD5DZSerj1R+gpZDyu/vN/h6Q8XQk5qNi+RTP+uKFe/XslNEReDFy/P7qNJrWMbA36gkl9/Cel2mA2mZOQkTm8Jt0T5xtkhNpxHhmDVqBhhSG5Sqzih7R+nNqjbW8TCwEVRUlzBxr1Msxp4pkksHgQDw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762068; c=relaxed/simple; bh=ifV991Dr8YUH+Fv0g1oL+N4buaK7iTgry4TfR3U1QvA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=baj1oe/qeqwLi1rg3dbd1uGAatJu9K+/KPJd21McRVjUr0BCdtE/r4sbkb85IEBfziF+Y75na1suVMQjqzkEbrlf5hWKtzsPVuRn+EuwGJCfJhRoHzuiHpLw8mQ/zBYn+3pkKEWY3Pp/cyzrnnpUrq7SQE5OU5Aufj96yIWwTEk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=nCyySfK8; arc=none smtp.client-ip=95.215.58.181 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="nCyySfK8" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762064; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=N6lb6S73FS1qu+rJyvkj2xvcDjRxUx8n4aBkU1vy4b8=; b=nCyySfK8JImDMjF+2Q0D5xtDEA1316DcKRfpuhoUv7F2U8I48894ytYteI50/nwzQVfPSt zXWSLi4FEYDV4lot4zHA8boMeiiTiUVY0youPBB3dkxT5juQZCcBWQqyoZ5X16KSXwvoCI MnIdZfKhwwPryfLftmuQN5jBLEKdBUU= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Matt Bobrowski , Yafang Shao Subject: [RFC PATCH bpf-next v7 01/11] bpf: move bpf_struct_ops_link into bpf.h Date: Tue, 26 May 2026 10:20:01 +0800 Message-ID: <909faef088325613d895235f9c02993dc51e5ea3.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: 
text/plain; charset="utf-8" From: Roman Gushchin Move struct bpf_struct_ops_link's definition into bpf.h, where the definitions of other custom bpf links are. It's necessary to access its members from outside of the generic bpf_struct_ops implementation, which will be done by the following patches in the series. Signed-off-by: Roman Gushchin Acked-by: Matt Bobrowski Acked-by: Yafang Shao --- include/linux/bpf.h | 6 ++++++ kernel/bpf/bpf_struct_ops.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b28cacc3075..01c0bf5a9cd0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1908,6 +1908,12 @@ struct bpf_raw_tp_link { u64 cookie; }; =20 +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; + wait_queue_head_t wait_hup; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 521cb9d7e8c7..cf3c604d48ef 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -55,12 +55,6 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; =20 -struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; - wait_queue_head_t wait_hup; -}; - static DEFINE_MUTEX(update_mutex); =20 #define VALUE_PREFIX "bpf_struct_ops_" --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-188.mta1.migadu.com (out-188.mta1.migadu.com [95.215.58.188]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97B352F7F14 for ; Tue, 26 May 2026 02:21:21 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.188 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762083; cv=none; 
b=VDcIZ9zJ0z5mUGrCWx5ISyrUUEv988Rh17gRNbOCsRce4au5flHS+KrpEhIYUCh0XXyrAs9Eny8C3ZloTWsP6gDh4PP0jghGqEh3M91lSDFOnYrEuemV1A98d1O1FVB+ElrCJYn7t97lhGEqLQhZq9NYJwK66ftYf7jdZEld2NI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762083; c=relaxed/simple; bh=fgA7u0xlWi4suAXNZnu+tx4g2KqAjMocBcVjgaWION0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=hBSvNeylo7liH/rknNbznZ8UHhT4M8C0nkZlyUzSMRgRl9Exh7CqpyPCjxXti+EovoFUCv0XQFU+aIp/6FxzF2rJf8z18xi89Im2QT/wivfHPmR182ON6ZCnjtuPfzlymq0HsOVxr7sF4zMXowKGqBK44JbSGjd2CqmPuyNFRsI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=wiGodmtV; arc=none smtp.client-ip=95.215.58.188 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="wiGodmtV" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762079; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=br8xrpTcjYEeEq9ZZA1EyJzq2nS+XRXxEmKl1Ist3v4=; b=wiGodmtV5GoWbSCEJi9JPwrvbkx7m30YOeF5uIWsGhhlUbhafnKUtItj/EevaM/NxfT8ja u3AP4vqDOlKk47XLpoM/oHjHn2+klsc/ke30PeE48JVEC6bSlVlKtai1ksbzdLSIyEk2PA mabyDHlxCtpc44Gv01bRz0OcbpO8fXk= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org Subject: [RFC PATCH bpf-next v7 02/11] bpf: allow attaching struct_ops to cgroups Date: Tue, 26 May 2026 10:20:02 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Introduce an ability to attach bpf struct_ops'es to 
cgroups. From the user's standpoint it works in the following way: a user passes a BPF_F_CGROUP_FD flag and specifies the target cgroup fd while creating a struct_ops link. As a result, the bpf struct_ops link will be created and attached to a cgroup. The cgroup.bpf structure maintains a list of attached struct ops links. If the cgroup is deleted, attached struct ops'es are auto-detached and the userspace program gets a notification. This change doesn't answer the question of how bpf programs belonging to these struct ops'es will be executed. It will be done individually for every bpf struct ops which supports this. Please note that unlike "normal" bpf programs, struct ops'es are not propagated to cgroup sub-trees. Signed-off-by: Roman Gushchin --- include/linux/bpf-cgroup-defs.h | 3 ++ include/linux/bpf-cgroup.h | 16 +++++++++ include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 3 ++ kernel/bpf/bpf_struct_ops.c | 59 ++++++++++++++++++++++++++++++--- kernel/bpf/cgroup.c | 46 +++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 127 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-def= s.h index c9e6b26abab6..6c5e37190dad 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -71,6 +71,9 @@ struct cgroup_bpf { /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array *inactive; =20 + /* list of bpf struct ops links */ + struct list_head struct_ops_links; + /* reference counter used to detach bpf programs after cgroup removal */ struct percpu_ref refcnt; =20 diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b2e79c2b41d5..88b643568012 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -423,6 +423,11 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr,= struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user 
*uattr); =20 +int cgroup_bpf_attach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link); +void cgroup_bpf_detach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link); + const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *= prog); #else @@ -451,6 +456,17 @@ static inline int cgroup_bpf_link_attach(const union b= pf_attr *attr, return -EINVAL; } =20 +static inline int cgroup_bpf_attach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link) +{ + return -EINVAL; +} + +static inline void cgroup_bpf_detach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link) +{ +} + static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01c0bf5a9cd0..743b4f0546b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1911,6 +1911,9 @@ struct bpf_raw_tp_link { struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; + struct cgroup *cgroup; + bool cgroup_removed; + struct list_head list; wait_queue_head_t wait_hup; }; =20 diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aec171ccb6ef..f547613986cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1246,6 +1246,7 @@ enum bpf_perf_event_type { #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) #define BPF_F_PREORDER (1U << 6) +#define BPF_F_CGROUP_FD (1U << 7) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ =20 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the @@ -6793,6 +6794,8 @@ struct bpf_link_info { } xdp; struct { __u32 map_id; + __u32 :32; + __u64 cgroup_id; } struct_ops; struct { __u32 pf; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index cf3c604d48ef..5333290957cb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include 
=20 struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -1220,6 +1222,10 @@ static void bpf_struct_ops_map_link_dealloc(struct b= pf_link *link) st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); bpf_map_put(&st_map->map); } + + if (st_link->cgroup) + cgroup_bpf_detach_struct_ops(st_link->cgroup, st_link); + kfree(st_link); } =20 @@ -1228,6 +1234,7 @@ static void bpf_struct_ops_map_link_show_fdinfo(const= struct bpf_link *link, { struct bpf_struct_ops_link *st_link; struct bpf_map *map; + u64 cgrp_id =3D 0; =20 st_link =3D container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); @@ -1235,6 +1242,14 @@ static void bpf_struct_ops_map_link_show_fdinfo(cons= t struct bpf_link *link, if (map) seq_printf(seq, "map_id:\t%d\n", map->id); rcu_read_unlock(); + + cgroup_lock(); + if (st_link->cgroup) + cgrp_id =3D cgroup_id(st_link->cgroup); + cgroup_unlock(); + + if (cgrp_id) + seq_printf(seq, "cgroup_id:\t%llu\n", cgrp_id); } =20 static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *l= ink, @@ -1242,6 +1257,7 @@ static int bpf_struct_ops_map_link_fill_link_info(con= st struct bpf_link *link, { struct bpf_struct_ops_link *st_link; struct bpf_map *map; + u64 cgrp_id =3D 0; =20 st_link =3D container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); @@ -1249,6 +1265,13 @@ static int bpf_struct_ops_map_link_fill_link_info(co= nst struct bpf_link *link, if (map) info->struct_ops.map_id =3D map->id; rcu_read_unlock(); + + cgroup_lock(); + if (st_link->cgroup) + cgrp_id =3D cgroup_id(st_link->cgroup); + cgroup_unlock(); + + info->struct_ops.cgroup_id =3D cgrp_id; return 0; } =20 @@ -1327,6 +1350,9 @@ static int bpf_struct_ops_map_link_detach(struct bpf_= link *link) =20 mutex_unlock(&update_mutex); =20 + if (st_link->cgroup) + cgroup_bpf_detach_struct_ops(st_link->cgroup, st_link); + wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); =20 return 0; @@ -1339,6 +1365,9 @@ static __poll_t 
bpf_struct_ops_map_link_poll(struct f= ile *file, =20 poll_wait(file, &st_link->wait_hup, pts); =20 + if (st_link->cgroup_removed) + return EPOLLHUP; + return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP; } =20 @@ -1357,8 +1386,12 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) struct bpf_link_primer link_primer; struct bpf_struct_ops_map *st_map; struct bpf_map *map; + struct cgroup *cgrp; int err; =20 + if (attr->link_create.flags & ~BPF_F_CGROUP_FD) + return -EINVAL; + map =3D bpf_map_get(attr->link_create.map_fd); if (IS_ERR(map)) return PTR_ERR(map); @@ -1378,11 +1411,26 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_= lops, NULL, attr->link_create.attach_type); =20 + init_waitqueue_head(&link->wait_hup); + + if (attr->link_create.flags & BPF_F_CGROUP_FD) { + cgrp =3D cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) { + err =3D PTR_ERR(cgrp); + goto err_out; + } + link->cgroup =3D cgrp; + err =3D cgroup_bpf_attach_struct_ops(cgrp, link); + if (err) { + cgroup_put(cgrp); + link->cgroup =3D NULL; + goto err_out; + } + } + err =3D bpf_link_prime(&link->link, &link_primer); if (err) - goto err_out; - - init_waitqueue_head(&link->wait_hup); + goto err_put_cgroup; =20 /* Hold the update_mutex such that the subsystem cannot * do link->ops->detach() before the link is fully initialized. 
@@ -1393,13 +1441,16 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); link =3D NULL; - goto err_out; + goto err_put_cgroup; } RCU_INIT_POINTER(link->map, map); mutex_unlock(&update_mutex); =20 return bpf_link_settle(&link_primer); =20 +err_put_cgroup: + if (link && link->cgroup) + cgroup_bpf_detach_struct_ops(link->cgroup, link); err_out: bpf_map_put(map); kfree(link); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 876f6a81a9b6..b593ebb30a4e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include =20 @@ -307,12 +308,23 @@ static void cgroup_bpf_release(struct work_struct *wo= rk) bpf.release_work); struct bpf_prog_array *old_array; struct list_head *storages =3D &cgrp->bpf.storages; + struct bpf_struct_ops_link *st_link, *st_tmp; struct bpf_cgroup_storage *storage, *stmp; + LIST_HEAD(st_links); =20 unsigned int atype; =20 cgroup_lock(); =20 + list_splice_init(&cgrp->bpf.struct_ops_links, &st_links); + list_for_each_entry_safe(st_link, st_tmp, &st_links, list) { + st_link->cgroup =3D NULL; + st_link->cgroup_removed =3D true; + cgroup_put(cgrp); + if (IS_ERR(bpf_link_inc_not_zero(&st_link->link))) + list_del(&st_link->list); + } + for (atype =3D 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { struct hlist_head *progs =3D &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl; @@ -346,6 +358,11 @@ static void cgroup_bpf_release(struct work_struct *wor= k) =20 cgroup_unlock(); =20 + list_for_each_entry_safe(st_link, st_tmp, &st_links, list) { + st_link->link.ops->detach(&st_link->link); + bpf_link_put(&st_link->link); + } + for (p =3D cgroup_parent(cgrp); p; p =3D cgroup_parent(p)) cgroup_bpf_put(p); =20 @@ -525,6 +542,7 @@ static int cgroup_bpf_inherit(struct cgroup *cgrp) INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); =20 INIT_LIST_HEAD(&cgrp->bpf.storages); + INIT_LIST_HEAD(&cgrp->bpf.struct_ops_links); =20 for (i =3D 0; 
i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) @@ -2755,3 +2773,31 @@ cgroup_common_func_proto(enum bpf_func_id func_id, c= onst struct bpf_prog *prog) return NULL; } } + +int cgroup_bpf_attach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link) +{ + int ret =3D 0; + + cgroup_lock(); + if (percpu_ref_is_zero(&cgrp->bpf.refcnt)) { + ret =3D -EBUSY; + goto out; + } + list_add_tail(&link->list, &cgrp->bpf.struct_ops_links); +out: + cgroup_unlock(); + return ret; +} + +void cgroup_bpf_detach_struct_ops(struct cgroup *cgrp, + struct bpf_struct_ops_link *link) +{ + cgroup_lock(); + if (link->cgroup =3D=3D cgrp) { + list_del(&link->list); + link->cgroup =3D NULL; + cgroup_put(cgrp); + } + cgroup_unlock(); +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 37142e6d911a..fa075dc3b7eb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1246,6 +1246,7 @@ enum bpf_perf_event_type { #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) #define BPF_F_PREORDER (1U << 6) +#define BPF_F_CGROUP_FD (1U << 7) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ =20 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-186.mta1.migadu.com (out-186.mta1.migadu.com [95.215.58.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9B7A72F7EF6 for ; Tue, 26 May 2026 02:21:36 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762098; cv=none; b=Xdx9B1WFOW7E6R8vYebjFaVP27KADZj+DLFtaXvlFeH7d/7ND3NV1+dkM/tytNnnaJ0h0zmT+6VtZ3DvjchkLbYzkTLZj7FJYfCCfMUtVfqVSyydDoXxcn0rdOlHECS8cEmnNJPHXBoTO2ECtUGaYXQ/ZLopD+f35+6su/nn/0M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; 
s=arc-20240116; t=1779762098; c=relaxed/simple; bh=ZWuk2qCAxn94jpDeU5QgVfhK2SKt/OvPwcpF6BLBR4c=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=d6cwZMbYATXGPv5PmgtmKFMC/GtDie/ygjNRWvxmFkvETjUznyuUutZarkmAfkQ6omC0EMsZvsJTXmYQ9Or9OTWhe3PzbGKScibl1QjBfVcFDKHRwQ5TxAlwVzwn4Ef8FrchowGiDf/16if/HDCBrkmuzBk/bzWyN9AJ2a9AyBs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=GqLvCluv; arc=none smtp.client-ip=95.215.58.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="GqLvCluv" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762094; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=hNDaNP16Qm74gfq94xdOPMH/X6ONx50Ta9FgPYJmm+k=; b=GqLvCluvmmsKTWVXkXT9WhCYZ0SDbKpXXvM4VH/Q93rW7GfgAp94Oa9v8Wc8YPld0kd28T lp4oBWHvq0K/M96vvb0C/hky2plzX1PDLSTm/K7QgA1u2X03kt8uCk3hnIkbaHVUDXZMrL zD8wvJkpcIWT1xRoyAxB6k3ZpDeHLY0= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Yafang Shao Subject: [RFC PATCH bpf-next v7 03/11] libbpf: fix return value on memory allocation failure Date: Tue, 26 May 2026 10:20:03 +0800 Message-ID: <8bfb7027abcc02c21db565cf52d6af78a6ce5b7d.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: 
text/plain; charset="utf-8" From: Roman Gushchin bpf_map__attach_struct_ops() returns -EINVAL instead of -ENOMEM on the memory allocation failure. Fix it. Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support") Signed-off-by: Roman Gushchin Acked-by: Yafang Shao --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ab2071fdd3e8..1e8688975d16 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13701,7 +13701,7 @@ struct bpf_link *bpf_map__attach_struct_ops(const s= truct bpf_map *map) =20 link =3D calloc(1, sizeof(*link)); if (!link) - return libbpf_err_ptr(-EINVAL); + return libbpf_err_ptr(-ENOMEM); =20 /* kern_vdata should be prepared during the loading phase. */ err =3D bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-170.mta1.migadu.com (out-170.mta1.migadu.com [95.215.58.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2A609233939 for ; Tue, 26 May 2026 02:21:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.170 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762111; cv=none; b=tNIuX0BafwVtmwWrT6D5bWISxsoFtBbI/8/rk/DghqgKWf5mdZiZmGUeAJRtqqgY9lWK/SltoqezW+a85lCZqlJKJnoDlBvVZdhTjYEk4s8ZJeW+q2nHj2g0wKfzv3ErpIEv+2/Y87Cz5O6Yz8s0lyp+p5SlhBiTKSgRt9c0wIk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762111; c=relaxed/simple; bh=uZi3xorIb1H9ikhi5wArkjVayXHTg+sbCnE50nHmo5w=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Ku0ZpthtdiGLiSndAuPhk+F9hGyXCQvG8iENfukW/aPF1P8xrTi3ln9rj2wLEPwZzg4PlJ7ln6nhzM+sL/hsKDCoT+K1qKvSuq/JyX/yG7ShJ3sr2IQKIInpQvlBFWvIHR9flcSv+2X83mpQH+K9UoQlcThv2YIed+KoGGuWElI= ARC-Authentication-Results: 
i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=PZjgjlwc; arc=none smtp.client-ip=95.215.58.170 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="PZjgjlwc" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762107; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=K7Bh9sNsoNgUnqOqmMKX4zg5u4PI6PFUbnydcfDlgqU=; b=PZjgjlwcve5jLdLXC3W81a3ws371QG0/VRoWhYadFvjsVk98w+mAqx47aeLN8a6ph9YnUf wYAyBEJ/yMLAYymhs/N9jK+9yoIBpF574W/zXiDvjjYETKv45kx5umPmOt44LK4shgxJNN OIxwgn8cnfbIglwOOZDZKFLyxXRjcnE= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , 
linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org Subject: [RFC PATCH bpf-next v7 04/11] libbpf: introduce bpf_map__attach_struct_ops_opts() Date: Tue, 26 May 2026 10:20:04 +0800 Message-ID: <20bdaa33cc19364f5f10208c79ef94fe43bd5ac1.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Introduce bpf_map__attach_struct_ops_opts(), an extended version of bpf_map__attach_struct_ops(), which takes an additional struct bpf_struct_ops_opts argument. This allows passing a target_fd argument and the BPF_F_CGROUP_FD flag, and thereby attaching the struct ops to a cgroup. Signed-off-by: Roman Gushchin --- tools/lib/bpf/libbpf.c | 20 +++++++++++++++++--- tools/lib/bpf/libbpf.h | 14 ++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1e8688975d16..a1b54da1ded2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13683,11 +13683,18 @@ static int bpf_link__detach_struct_ops(struct bpf= _link *link) return close(link->fd); } =20 -struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts); struct bpf_link_struct_ops *link; + int err, fd, target_fd; __u32 zero =3D 0; - int err, fd; + + if (!OPTS_VALID(opts, bpf_struct_ops_opts)) { + pr_warn("map '%s': invalid opts\n", map->name); + return libbpf_err_ptr(-EINVAL); + } =20 if (!bpf_map__is_struct_ops(map)) { pr_warn("map '%s': can't 
attach non-struct_ops map\n", map->name); @@ -13724,7 +13731,9 @@ struct bpf_link *bpf_map__attach_struct_ops(const s= truct bpf_map *map) return &link->link; } =20 - fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + link_opts.flags =3D OPTS_GET(opts, flags, 0); + target_fd =3D OPTS_GET(opts, target_fd, 0); + fd =3D bpf_link_create(map->fd, target_fd, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { free(link); return libbpf_err_ptr(fd); @@ -13736,6 +13745,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const = struct bpf_map *map) return &link->link; } =20 +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + return bpf_map__attach_struct_ops_opts(map, NULL); +} + /* * Swap the back struct_ops of a link with a new struct_ops map. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index bba4e8464396..18af178547ad 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -945,6 +945,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_progr= am *prog, int cgroup_fd, struct bpf_map; =20 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_ma= p *map); + +struct bpf_struct_ops_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 target_fd; + __u64 expected_revision; + size_t :0; +}; +#define bpf_struct_ops_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts); LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bp= f_map *map); =20 struct bpf_iter_attach_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index dfed8d60af05..6105619b5ecf 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -454,6 +454,7 @@ LIBBPF_1.7.0 { bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; btf__permute; + bpf_map__attach_struct_ops_opts; } LIBBPF_1.6.0; =20 LIBBPF_1.8.0 { --=20 2.43.0 
From nobody Mon Jun 8 22:51:09 2026 Received: from out-178.mta1.migadu.com (out-178.mta1.migadu.com [95.215.58.178]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9A7FC2F8EA9 for ; Tue, 26 May 2026 02:22:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.178 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762125; cv=none; b=pgtcyOjzipHSxawJwzR4FiCcqgzlktYOpPU7bGzUdui2YyJTvzF5XCt1Hw0pNCroqq9Rxnf6WinAxUF2Yhy8Inui9Z4jjW6FjTH/TmZMUIjx7GfFe2jt4B0s6vfkM7awtlS3TmApFoD3Hz0o1AHxEVG2er43+T/cQg4fHI2rhi4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762125; c=relaxed/simple; bh=SEBbyqm64/gGH282eDZdksbO6ZyqWL7e1qiXAOqBBAU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Bl4GBrfnZVXinB9g9MJ1ZbatqE2YOhoz6BLMyTCe4T+3NByT5EZVBB7Yirz+PxaGCmP6Wq8vNik9Z1Wy1a1TIcwY7KiXzLgrHfkAAsuCZuRMchljSIh3GAYCLtcSuJ8h03XcUaKttmJ/vJc05gh3m/xxyuEpAhIZJNCfsFsnUkc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=c2O+nEAy; arc=none smtp.client-ip=95.215.58.178 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="c2O+nEAy" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762121; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ykiWx+jJ41AIlhwvxbQbM9j1opOnC375H9pPJ40SXbc=; b=c2O+nEAyiCLBgt1zDhG69wQnZW5jG1GOvhIAkS/3ahbNDoa2JXUWTtBHLIpIPmmTsrcTHw E7ET8yOilbOSmQScbO1THS6NERx1Jr5ftkIeTh+p7iVuI0PQRMbIxkf3lCS1NAjYDcGI31 mgH24x8RVaZxRBEWf4/wk7MRvCI/96E= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 05/11] bpf: Pass flags in bpf_link_create for struct_ops Date: Tue, 26 May 2026 10:20:05 +0800 Message-ID: <8dfc661158419f7331cc645368b051a424229fdc.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; 
charset="utf-8" From: Hui Zhu To support features like allowing overrides in cgroup hierarchies, we need a way to pass flags from userspace to the kernel when attaching a struct_ops. Extend `bpf_struct_ops_link` to include a `flags` field. This field is populated from `attr->link_create.flags` during link creation. This will allow struct_ops implementations, such as the upcoming memory controller ops, to interpret these flags and modify their attachment behavior accordingly. The flags validation in bpf_struct_ops_link_create() is updated to explicitly permit BPF_F_ALLOW_OVERRIDE in addition to the already-allowed BPF_F_CGROUP_FD. Any other flag combination will still be rejected with -EINVAL. UAPI Change: This patch updates the comment in include/uapi/linux/bpf.h to reflect that the cgroup-bpf attach flags (such as BPF_F_ALLOW_OVERRIDE) are now applicable to both BPF_PROG_ATTACH and BPF_LINK_CREATE commands. Previously, these flags were only documented for BPF_PROG_ATTACH. The actual flag definitions remain unchanged, so this is a compatible extension of the existing API. Older userspace will continue to work (by not passing flags), and newer userspace can opt-in to the new functionality by setting appropriate flags. 
Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 +- kernel/bpf/bpf_struct_ops.c | 4 +++- tools/include/uapi/linux/bpf.h | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 743b4f0546b5..aae7f9837944 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1915,6 +1915,7 @@ struct bpf_struct_ops_link { bool cgroup_removed; struct list_head list; wait_queue_head_t wait_hup; + u32 flags; }; =20 struct bpf_link_primer { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f547613986cc..85ab5bdf81ac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1194,7 +1194,7 @@ enum bpf_perf_event_type { BPF_PERF_EVENT_EVENT =3D 6, }; =20 -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH and BPF_LINK_CREATE com= mand * * NONE(default): No further bpf programs allowed in the subtree. 
* diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 5333290957cb..1d15c667a300 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1389,7 +1389,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) struct cgroup *cgrp; int err; =20 - if (attr->link_create.flags & ~BPF_F_CGROUP_FD) + if (attr->link_create.flags & ~(BPF_F_CGROUP_FD | + BPF_F_ALLOW_OVERRIDE)) return -EINVAL; =20 map =3D bpf_map_get(attr->link_create.map_fd); @@ -1427,6 +1428,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) goto err_out; } } + link->flags =3D attr->link_create.flags; =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fa075dc3b7eb..8a2b1f865d2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1194,7 +1194,7 @@ enum bpf_perf_event_type { BPF_PERF_EVENT_EVENT =3D 6, }; =20 -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH and BPF_LINK_CREATE com= mand * * NONE(default): No further bpf programs allowed in the subtree. 
* --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-179.mta1.migadu.com (out-179.mta1.migadu.com [95.215.58.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B0EA42F7F07 for ; Tue, 26 May 2026 02:25:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762333; cv=none; b=XVxQR4GjTb1Yb7Wx+EBBnFQGFKqXxqVlis5OBGdllnGNInky8aqh/VzuzLy/7shjQzygdPEUTuizQzGmHrj39SEXoXTfbS6qmI0ENo7wPsvOGF8ZhrHkIgOQlSXVv8K96eidUNCnXt2wS0MYcsFuDiz3L5bm1rMJnAKEIO+BzoA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762333; c=relaxed/simple; bh=LH28HKcvHaSV++/SsY+s9YZTvVPzHG2/17T1tlbS8Hg=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=f7vohBoDQu5O3snwW69TwAixPcuGaethhBdlYMOo9CupJwNbIIFChscc3Qtr+0ZbJhJ8rNxN37OZGP8vgnNf867SJcWu0/2hkLdWcXJxGhFt/IT4gYVpLeLVDo04QtfkmXwFH2GpwFKH2FMhgxwMwtOYu0vjiRslIuV80gj/NlQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=YfiAh1tC; arc=none smtp.client-ip=95.215.58.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="YfiAh1tC" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762328; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=WBXofLS2/VUxF2UYjXjqK/PbDOUsVx2R7QdYyie6cEM=; b=YfiAh1tCl+MvkWkflELb5JhfMqa4GlkKZiB7v/dGb6ZkFPUhd3JnWnzAMAC9u/xxKuP3fH cDqhUBNAnjE25f2K+KrZG85Cf6B2fBNww+eKqzhx3kmH1S5aglrFr+XNJDg4XA/bEp3oB9 68w3TY7e4GWPu+TQBzpK8Z8r+l1p9xA= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 06/11] mm: memcontrol: Add BPF struct_ops for memory controller Date: Tue, 26 May 2026 10:24:54 +0800 Message-ID: <9e081d01a0708dcdd101af7f7bede07cf43ca21d.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" 
Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT From: Hui Zhu Introduce BPF struct_ops support to the memory controller, enabling custom and dynamic control over memory pressure via a new struct_ops type, `memcg_bpf_ops`. The `memcg_bpf_ops` interface exposes the following hooks: - `memcg_charged`: Called on the synchronous blocking charge path after pages have been charged to the cgroup. Returns a custom throttling delay in milliseconds. This value is used as a lower bound for the penalty passed to `__mem_cgroup_handle_over_high()` and applies even when `memory.high` is not breached, allowing BPF programs to impose proactive back-pressure on any charge event. Return 0 for no delay. - `memcg_uncharged`: Called when pages are uncharged from a cgroup, allowing BPF programs to track or react to memory releases. - `below_low`: Overrides the `memory.low` protection check. Receives the effective low threshold (elow) and current usage as arguments. If it returns true, the cgroup is treated as protected regardless of the standard elow >=3D usage comparison. Returning false continues to the normal kernel check. - `below_min`: Same as `below_low`, but for `memory.min` protection. Receives emin and usage as arguments. - `handle_cgroup_online`/`offline`: Callbacks invoked when a cgroup with an attached program comes online or goes offline, allowing BPF programs to manage per-cgroup state. These hooks are integrated into core memory control logic. `memcg_charged` is consulted in `try_charge_memcg` on the synchronous blocking path. To avoid losing the originally charged cgroup pointer as the charge loop walks up the ancestor chain, `orig_memcg` is saved before the loop begins. After the loop, the BPF hook is called with `orig_memcg` and the actual batch size, and its result (converted from milliseconds to jiffies) is stored as `bpf_high_delay`. 
`__mem_cgroup_handle_over_high()` is then invoked when either `bpf_high_delay` is non-zero or `memcg_nr_pages_over_high` exceeds MEMCG_CHARGE_BATCH. Inside the function, the current task's memcg is obtained independently via `get_mem_cgroup_from_mm()`. Reclaim is attempted first; if reclaim makes forward progress or retries remain, the function loops back to reclaim again rather than throttling immediately. `bpf_high_delay` serves as a lower bound for the final penalty via `max(penalty_jiffies, bpf_high_delay)`: when `memcg_nr_pages_over_high` is zero (memory.high not breached), the kernel overage calculation is skipped and `bpf_high_delay` alone sets the penalty. In all cases, throttling only occurs if the resulting penalty exceeds HZ/100; a BPF-requested delay below this threshold causes no sleep. The deferred user-return path (via `mem_cgroup_handle_over_high()`) always passes bpf_high_delay=3D0 since BPF delay is evaluated exactly once, on the synchronous charge path. `below_low` and `below_min` are inserted in their respective inline functions after the unprotected check. The pre-read elow/emin and usage values are forwarded to the BPF hook; on false return the standard kernel comparison (elow >=3D usage) proceeds as normal. Support for `BPF_F_ALLOW_OVERRIDE` is included. When a program is registered with this flag, a descendant cgroup may later attach its own `memcg_bpf_ops` to override the inherited program. Without this flag, attaching to a cgroup that already has a program (whether attached directly or inherited from an ancestor) will fail with -EBUSY. On registration, ops are propagated to the cgroup itself and all its descendants via `mem_cgroup_iter`. A `bpf_ops_flags` field is added to `struct mem_cgroup` to persist the attachment flags, which are inherited during `css_online` and restored to the parent's flags on unregistration. 
On unregistration, rather than unconditionally clearing `bpf_ops` to NULL throughout the subtree, each descendant that still holds the unregistered ops pointer has its `bpf_ops` and `bpf_ops_flags` restored to the values the registering cgroup's parent held at that time. This correctly handles the override case where a descendant had re-attached over an inherited program. Lifecycle management ensures programs are inherited by child cgroups on `css_online` and cleaned up on `css_offline`. SRCU (`memcg_bpf_srcu`) protects concurrent read access to the `memcg->bpf_ops` pointer; all writes are serialized under `cgroup_mutex`. Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 250 ++++++++++++++++++++++++++++++- mm/bpf_memcontrol.c | 298 ++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 43 ++++-- 3 files changed, 574 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc3fa687759b..30b7b8558ccb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -23,6 +23,7 @@ #include #include #include +#include =20 struct mem_cgroup; struct obj_cgroup; @@ -192,6 +193,59 @@ struct obj_cgroup { bool is_root; }; =20 +#ifdef CONFIG_BPF_SYSCALL +/* + * struct memcg_bpf_ops - BPF callbacks for memory cgroup operations + * + * @handle_cgroup_online: Called when a cgroup comes online. May be used + * by a BPF program to initialize per-cgroup state. + * @handle_cgroup_offline: Called when a cgroup goes offline. May be used + * to release per-cgroup state allocated in the + * online callback. + * @below_low: Override the memory.low protection check. + * Receives the effective low threshold @elow and the current + * memory usage @usage (both in pages). If the callback retur= ns + * true, mem_cgroup_below_low() returns true immediately, + * treating the cgroup as protected regardless of the standard + * elow >=3D usage comparison. 
Returning false continues to + * the normal kernel check. + * @below_min: Same as @below_low, but for the memory.min protection chec= k. + * Receives @emin and @usage. Returning true short-circuits t= he + * standard emin >=3D usage comparison. + * @memcg_charged: Called on the synchronous blocking charge path after + * pages have been charged to the cgroup. Returns a cust= om + * throttle delay in milliseconds. This delay is taken as + * a lower bound for the penalty in + * __mem_cgroup_handle_over_high() and applies even when + * memory.high is not breached. Return 0 for no extra de= lay. + * @memcg_uncharged: Called when pages are uncharged from the cgroup. + * Allows BPF programs to track memory releases or update + * accounting state. No return value. + * + * This structure defines the interface for BPF programs to customize + * memory cgroup behavior through struct_ops programs. All callbacks are + * non-sleepable. Concurrent readers are protected by SRCU + * (memcg_bpf_srcu); writers hold cgroup_mutex. + */ +struct memcg_bpf_ops { + void (*handle_cgroup_online)(struct mem_cgroup *memcg); + + void (*handle_cgroup_offline)(struct mem_cgroup *memcg); + + bool (*below_low)(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage); + + bool (*below_min)(struct mem_cgroup *memcg, unsigned long emin, + unsigned long usage); + + unsigned int (*memcg_charged)(struct mem_cgroup *memcg, + unsigned int nr_pages); + + void (*memcg_uncharged)(struct mem_cgroup *memcg, + unsigned int nr_pages); +}; +#endif /* CONFIG_BPF_SYSCALL */ + /* * The memory controller data structure. The memory controller controls bo= th * page cache and RSS per cgroup. 
We would eventually like to provide @@ -323,6 +377,11 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ =20 +#ifdef CONFIG_BPF_SYSCALL + struct memcg_bpf_ops *bpf_ops; + u32 bpf_ops_flags; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; =20 @@ -533,6 +592,165 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } =20 +#ifdef CONFIG_BPF_SYSCALL + +/* SRCU for protecting concurrent access to memcg->bpf_ops */ +extern struct srcu_struct memcg_bpf_srcu; + +/* + * BPF_MEMCG_CALL - Safely invoke a BPF memcg callback with return value + * @memcg: The memory cgroup whose bpf_ops to invoke + * @op: The callback name (struct member of memcg_bpf_ops) + * @default_val: Value to return if no BPF program is attached or the + * specific callback is not implemented + * @...: Additional arguments forwarded to the callback + * + * Uses a two-phase READ_ONCE() pattern: + * 1. An initial lockless READ_ONCE() provides a fast-path check. + * If bpf_ops is NULL the SRCU lock is never taken, keeping the + * common no-BPF path free of synchronization overhead. + * 2. A second READ_ONCE() after srcu_read_lock() ensures a consistent + * view of the pointer under the SRCU read section, guarding against + * a concurrent bpf_memcg_ops_unreg() that may be in progress. + */ +#define BPF_MEMCG_CALL(memcg, op, default_val, ...) 
({ \ + typeof(default_val) __ret =3D (default_val); \ + struct memcg_bpf_ops *__ops; \ + int __idx; \ + \ + if (unlikely(READ_ONCE((memcg)->bpf_ops))) { \ + __idx =3D srcu_read_lock(&memcg_bpf_srcu); \ + __ops =3D READ_ONCE((memcg)->bpf_ops); \ + if (__ops && __ops->op) \ + __ret =3D __ops->op(memcg, ##__VA_ARGS__);\ + srcu_read_unlock(&memcg_bpf_srcu, __idx); \ + } \ + __ret; \ +}) + +/* + * BPF_MEMCG_CALL_VOID - Safely invoke a void BPF memcg callback + * @memcg: The memory cgroup whose bpf_ops to invoke + * @op: The callback name (struct member of memcg_bpf_ops) + * @...: Additional arguments forwarded to the callback + * + * Same SRCU fast-path pattern as BPF_MEMCG_CALL but for callbacks + * that have no return value. + */ +#define BPF_MEMCG_CALL_VOID(memcg, op, ...) do { \ + struct memcg_bpf_ops *__ops; \ + int __idx; \ + \ + if (unlikely(READ_ONCE((memcg)->bpf_ops))) { \ + __idx =3D srcu_read_lock(&memcg_bpf_srcu); \ + __ops =3D READ_ONCE((memcg)->bpf_ops); \ + if (__ops && __ops->op) \ + __ops->op(memcg, ##__VA_ARGS__); \ + srcu_read_unlock(&memcg_bpf_srcu, __idx); \ + } \ +} while (0) + +static inline bool +bpf_memcg_below_low(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + return BPF_MEMCG_CALL(memcg, below_low, false, elow, usage); +} + +static inline bool +bpf_memcg_below_min(struct mem_cgroup *memcg, unsigned long emin, + unsigned long usage) +{ + return BPF_MEMCG_CALL(memcg, below_min, false, emin, usage); +} + +static inline unsigned long +bpf_memcg_charged(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + unsigned int ret; + + /* + * Retrieve the BPF-specified throttle delay in milliseconds and + * convert to jiffies for use in __mem_cgroup_handle_over_high(). 
+ */ + ret =3D BPF_MEMCG_CALL(memcg, memcg_charged, 0U, nr_pages); + return msecs_to_jiffies(ret); +} + +static inline void +bpf_memcg_uncharged(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + BPF_MEMCG_CALL_VOID(memcg, memcg_uncharged, nr_pages); +} + +#undef BPF_MEMCG_CALL +#undef BPF_MEMCG_CALL_VOID + +/* + * memcontrol_bpf_online - Inherit BPF ops for a newly online cgroup. + * @memcg: The memory cgroup coming online. + * + * Called under cgroup_mutex from mem_cgroup_css_online(). Inherits the + * parent's bpf_ops pointer and bpf_ops_flags into @memcg so that + * BPF-based memory control policies propagate down the hierarchy + * automatically. + * + * If the parent has no bpf_ops, this is a no-op. If it does, the ops + * pointer is copied and, if an online handler is implemented, it is + * invoked to allow the BPF program to initialize per-cgroup state for + * the new child. + * + * Locking: cgroup_mutex is held by the caller. Because bpf_memcg_ops_reg() + * and bpf_memcg_ops_unreg() also hold cgroup_mutex when writing + * memcg->bpf_ops, no additional lock on memcg_bpf_srcu is required here. + */ +extern void memcontrol_bpf_online(struct mem_cgroup *memcg); + +/* + * memcontrol_bpf_offline - Run BPF cleanup for a cgroup going offline. + * @memcg: The memory cgroup going offline. + * + * Called under cgroup_mutex from mem_cgroup_css_offline(). If a BPF + * program is attached and implements a handle_cgroup_offline callback, + * it is invoked so the program can release any per-cgroup state before + * the memcg is freed. + * + * Locking: same as memcontrol_bpf_online() =E2=80=94 cgroup_mutex is held. 
+ */ +extern void memcontrol_bpf_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ + +static inline unsigned long +bpf_memcg_charged(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + return 0; +} + +static inline void +bpf_memcg_uncharged(struct mem_cgroup *memcg, unsigned int nr_pages) +{ +} + +static inline bool +bpf_memcg_below_low(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + return false; +} + +static inline bool +bpf_memcg_below_min(struct mem_cgroup *memcg, unsigned long emin, + unsigned long usage) +{ + return false; +} + +static inline void memcontrol_bpf_online(struct mem_cgroup *memcg) { } +static inline void memcontrol_bpf_offline(struct mem_cgroup *memcg) { } + +#endif /* CONFIG_BPF_SYSCALL */ + static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, @@ -603,21 +821,35 @@ static inline bool mem_cgroup_unprotected(struct mem_= cgroup *target, static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { + unsigned long elow, usage; + if (mem_cgroup_unprotected(target, memcg)) return false; =20 - return READ_ONCE(memcg->memory.elow) >=3D - page_counter_read(&memcg->memory); + elow =3D READ_ONCE(memcg->memory.elow); + usage =3D page_counter_read(&memcg->memory); + + if (bpf_memcg_below_low(memcg, elow, usage)) + return true; + + return elow >=3D usage; } =20 static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { + unsigned long emin, usage; + if (mem_cgroup_unprotected(target, memcg)) return false; =20 - return READ_ONCE(memcg->memory.emin) >=3D - page_counter_read(&memcg->memory); + emin =3D READ_ONCE(memcg->memory.emin); + usage =3D page_counter_read(&memcg->memory); + + if (bpf_memcg_below_min(memcg, emin, usage)) + return true; + + return emin >=3D usage; } =20 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t g= fp); @@ -890,12 +1122,18 @@ unsigned 
long mem_cgroup_get_zone_lru_size(struct lr= uvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } =20 -void __mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask, + unsigned long bpf_high_delay); =20 static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) - __mem_cgroup_handle_over_high(gfp_mask); + /* + * Deferred user-return path: no BPF delay lookup here. + * BPF-provided delay is injected from try_charge_memcg() + * on the synchronous blocking charge path. + */ + __mem_cgroup_handle_over_high(gfp_mask, 0); } =20 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 716df49d7647..1f726a7b22e3 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -8,6 +8,9 @@ #include #include =20 +/* Protects memcg->bpf_ops pointer for read and write. */ +DEFINE_SRCU(memcg_bpf_srcu); + __bpf_kfunc_start_defs(); =20 /** @@ -179,15 +182,306 @@ static const struct btf_kfunc_id_set bpf_memcontrol_= kfunc_set =3D { .set =3D &bpf_memcontrol_kfuncs, }; =20 +/** + * memcontrol_bpf_online - Inherit BPF programs for a new online cgroup. + * @memcg: The memory cgroup that is coming online. + * + * When a new memcg is brought online, it inherits the BPF programs + * attached to its parent. This ensures consistent BPF-based memory + * control policies throughout the cgroup hierarchy. + * + * After inheriting, if the BPF program has an online handler, it is + * invoked for the new memcg. + */ +void memcontrol_bpf_online(struct mem_cgroup *memcg) +{ + struct memcg_bpf_ops *ops; + struct mem_cgroup *parent_memcg; + + /* The root cgroup does not inherit from a parent. 
*/ + if (mem_cgroup_is_root(memcg)) + return; + + /* + * Because only functions bpf_memcg_ops_reg and bpf_memcg_ops_unreg + * write to memcg->bpf_ops and memcg->bpf_ops_flags under the + * protection of cgroup_mutex, ensuring that cgroup_mutex is already + * locked here allows safe reading and writing of memcg->bpf_ops and + * memcg->bpf_ops_flags without needing to acquire a lock on + * memcg_bpf_srcu. + */ + lockdep_assert_held(&cgroup_mutex); + + parent_memcg =3D parent_mem_cgroup(memcg); + + /* Inherit the BPF program from the parent cgroup. */ + ops =3D READ_ONCE(parent_memcg->bpf_ops); + if (!ops) + return; + WRITE_ONCE(memcg->bpf_ops, ops); + memcg->bpf_ops_flags =3D parent_memcg->bpf_ops_flags; + + /* + * If the BPF program implements it, call the online handler to + * allow the program to perform setup tasks for the new cgroup. + */ + if (ops->handle_cgroup_online) + ops->handle_cgroup_online(memcg); +} + +/** + * memcontrol_bpf_offline - Run BPF cleanup for an offline cgroup. + * @memcg: The memory cgroup that is going offline. + * + * If a BPF program is attached and implements an offline handler, + * it is invoked to perform cleanup tasks before the memcg goes + * completely offline. + */ +void memcontrol_bpf_offline(struct mem_cgroup *memcg) +{ + struct memcg_bpf_ops *ops; + + /* Same locking rules as memcontrol_bpf_online(). 
*/ + lockdep_assert_held(&cgroup_mutex); + + ops =3D READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->handle_cgroup_offline) + return; + + ops->handle_cgroup_offline(memcg); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return -EACCES; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_t= ype type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops =3D { + .get_func_proto =3D bpf_base_func_proto, + .btf_struct_access =3D memcg_ops_btf_struct_access, + .is_valid_access =3D memcg_ops_is_valid_access, +}; + +static void cfi_handle_cgroup_online(struct mem_cgroup *memcg) +{ +} + +static void cfi_handle_cgroup_offline(struct mem_cgroup *memcg) +{ +} + +static bool +cfi_below_low(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + return false; +} + +static bool +cfi_below_min(struct mem_cgroup *memcg, unsigned long emin, + unsigned long usage) +{ + return false; +} + +static unsigned int cfi_memcg_charged(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + return 0; +} + +static void cfi_memcg_uncharged(struct mem_cgroup *memcg, unsigned int nr_= pages) +{ +} + +static struct memcg_bpf_ops cfi_bpf_memcg_ops =3D { + .handle_cgroup_online =3D cfi_handle_cgroup_online, + .handle_cgroup_offline =3D cfi_handle_cgroup_offline, + .below_low =3D cfi_below_low, + .below_min =3D cfi_below_min, + .memcg_charged =3D cfi_memcg_charged, + .memcg_uncharged =3D cfi_memcg_uncharged, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff =3D __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_bpf_ops, 
handle_cgroup_online): + case offsetof(struct memcg_bpf_ops, handle_cgroup_offline): + case offsetof(struct memcg_bpf_ops, below_low): + case offsetof(struct memcg_bpf_ops, below_min): + case offsetof(struct memcg_bpf_ops, memcg_charged): + case offsetof(struct memcg_bpf_ops, memcg_uncharged): + break; + default: + return -EINVAL; + } + + if (prog->sleepable) + return -EINVAL; + + return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link; + struct memcg_bpf_ops *ops =3D kdata, *old_ops; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg, *iter; + int err =3D 0; + + if (!link) + return -EOPNOTSUPP; + ops_link =3D container_of(link, struct bpf_struct_ops_link, link); + if (!ops_link->cgroup) + return -EINVAL; + + cgroup_lock(); + + css =3D cgroup_e_css(ops_link->cgroup, &memory_cgrp_subsys); + if (!css) { + err =3D -EINVAL; + goto unlock_out; + } + memcg =3D mem_cgroup_from_css(css); + + /* + * Check if memcg has bpf_ops and whether it is inherited from + * parent. + * If inherited and BPF_F_ALLOW_OVERRIDE is set, allow override. + */ + old_ops =3D READ_ONCE(memcg->bpf_ops); + if (old_ops) { + struct mem_cgroup *parent_memcg =3D parent_mem_cgroup(memcg); + + if (!parent_memcg || + !(memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) || + READ_ONCE(parent_memcg->bpf_ops) !=3D old_ops) { + err =3D -EBUSY; + goto unlock_out; + } + } + + /* Check for incompatible bpf_ops in descendants. */ + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + struct memcg_bpf_ops *iter_ops =3D READ_ONCE(iter->bpf_ops); + + if (iter_ops && iter_ops !=3D old_ops) { + /* cannot override existing bpf_ops of sub-cgroup. 
*/ + mem_cgroup_iter_break(memcg, iter); + err =3D -EBUSY; + goto unlock_out; + } + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + WRITE_ONCE(iter->bpf_ops, ops); + iter->bpf_ops_flags =3D ops_link->flags; + } + +unlock_out: + cgroup_unlock(); + return err; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link; + struct memcg_bpf_ops *ops =3D kdata; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + struct mem_cgroup *iter; + struct memcg_bpf_ops *parent_bpf_ops =3D NULL; + u32 parent_bpf_ops_flags =3D 0; + + if (!link) + return; + ops_link =3D container_of(link, struct bpf_struct_ops_link, link); + if (!ops_link->cgroup) + return; + + cgroup_lock(); + + css =3D cgroup_e_css(ops_link->cgroup, &memory_cgrp_subsys); + if (!css) + goto unlock_out; + memcg =3D mem_cgroup_from_css(css); + + /* Get the parent bpf_ops and bpf_ops_flags */ + iter =3D parent_mem_cgroup(memcg); + if (iter) { + parent_bpf_ops =3D READ_ONCE(iter->bpf_ops); + parent_bpf_ops_flags =3D iter->bpf_ops_flags; + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops) =3D=3D ops) { + WRITE_ONCE(iter->bpf_ops, parent_bpf_ops); + iter->bpf_ops_flags =3D parent_bpf_ops_flags; + } + } + +unlock_out: + cgroup_unlock(); + synchronize_srcu(&memcg_bpf_srcu); +} + +static struct bpf_struct_ops bpf_memcg_bpf_ops =3D { + .verifier_ops =3D &bpf_memcg_verifier_ops, + .init =3D bpf_memcg_ops_init, + .check_member =3D bpf_memcg_ops_check_member, + .init_member =3D bpf_memcg_ops_init_member, + .reg =3D bpf_memcg_ops_reg, + .unreg =3D bpf_memcg_ops_unreg, + .name =3D "memcg_bpf_ops", + .owner =3D THIS_MODULE, + .cfi_stubs =3D &cfi_bpf_memcg_ops, +}; + static int __init bpf_memcontrol_init(void) { - int err; + int err, err2; =20 err =3D register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if 
(err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); =20 - return err; + err2 =3D register_bpf_struct_ops(&bpf_memcg_bpf_ops, memcg_bpf_ops); + if (err2) + pr_warn("error while registering memcontrol bpf ops: %d\n", + err2); + + return err ? err : err2; } late_initcall(bpf_memcontrol_init); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c03d4787d466..ec912d19ef87 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2085,6 +2085,8 @@ static void memcg_uncharge(struct mem_cgroup *memcg, = unsigned int nr_pages) page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + + bpf_memcg_uncharged(memcg, nr_pages); } =20 /* @@ -2473,8 +2475,12 @@ static unsigned long calculate_high_delay(struct mem= _cgroup *memcg, * Reclaims memory over the high limit. Called directly from * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. + * + * @bpf_high_delay is caller-provided extra delay. Callers that do + * not evaluate BPF delay (e.g. deferred return-path handling) pass 0. */ -void __mem_cgroup_handle_over_high(gfp_t gfp_mask) +void +__mem_cgroup_handle_over_high(gfp_t gfp_mask, unsigned long bpf_high_delay) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2516,11 +2522,15 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. 
*/ - penalty_jiffies =3D calculate_high_delay(memcg, nr_pages, - mem_find_max_overage(memcg)); + if (nr_pages) { + penalty_jiffies =3D calculate_high_delay( + memcg, nr_pages, mem_find_max_overage(memcg)); =20 - penalty_jiffies +=3D calculate_high_delay(memcg, nr_pages, - swap_find_max_overage(memcg)); + penalty_jiffies +=3D calculate_high_delay( + memcg, nr_pages, swap_find_max_overage(memcg)); + } else + penalty_jiffies =3D 0; + penalty_jiffies =3D max(penalty_jiffies, bpf_high_delay); =20 /* * Clamp the max delay per usermode return so as to still keep the @@ -2578,6 +2588,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, bool raised_max_event =3D false; unsigned long pflags; bool allow_spinning =3D gfpflags_allow_spinning(gfp_mask); + struct mem_cgroup *orig_memcg; + unsigned long bpf_high_delay; =20 retry: if (consume_stock(memcg, nr_pages)) @@ -2704,6 +2716,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); =20 + orig_memcg =3D memcg; /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here @@ -2746,6 +2759,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, } } while ((memcg =3D parent_mem_cgroup(memcg))); =20 + bpf_high_delay =3D bpf_memcg_charged(orig_memcg, batch); + /* * Reclaim is set up above to be called from the userland * return path. But also attempt synchronous reclaim to avoid @@ -2753,10 +2768,17 @@ static int try_charge_memcg(struct mem_cgroup *memc= g, gfp_t gfp_mask, * kernel. If this is successful, the return path will see it * when it rechecks the overage and simply bail out. 
*/ - if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && - !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) - __mem_cgroup_handle_over_high(gfp_mask); + if (!(current->flags & PF_MEMALLOC) && + gfpflags_allow_blocking(gfp_mask)) { + /* + * BPF high-delay is evaluated only on the synchronous + * blocking path. The deferred user-return path calls + * __mem_cgroup_handle_over_high() with bpf_high_delay =3D=3D 0. + */ + if (bpf_high_delay || + current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH) + __mem_cgroup_handle_over_high(gfp_mask, bpf_high_delay); + } return 0; } =20 @@ -4151,6 +4173,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys= _state *css) */ xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); =20 + memcontrol_bpf_online(memcg); + return 0; free_objcg: for_each_node(nid) { @@ -4188,6 +4212,7 @@ static void mem_cgroup_css_offline(struct cgroup_subs= ys_state *css) =20 zswap_memcg_offline_cleanup(memcg); =20 + memcontrol_bpf_offline(memcg); memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); /* --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-186.mta1.migadu.com (out-186.mta1.migadu.com [95.215.58.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7CAE305691 for ; Tue, 26 May 2026 02:25:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762345; cv=none; b=HmY+7LYuqjahCCXCR/J+CCuOEB3VnOw5MCLVCuua/AK1b4PojTwxwjDP/YM2V+B+2h+B5Un2A4RkDY4pw/E/5VkJuH/8aNPQClB6pgOtW9KrojBKMggXcLS16j+EnAhA+SeOqgS6DYXpN4nbCyNDMocCjHY3FTyqEqYO5/iRcNo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762345; c=relaxed/simple; bh=QirGPpr7fVdMCewDL8EqmwoSapxySrhGmmXCGr0oWK0=; 
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=HHyTrkksPc+taGTuh5+PJHRTUhGrpZpLoNoOIjCNfML/JuvYe9vcZdFmz+hZWP39rC2i9UrLoQqwq7wKRaqxmJMCnACNRrlyVoGtBPRwaoXb52DBZfzBDEEtbJsnjMf+qhPHjm8Opx/ybBXtZq1KPJmQU5+FsyDqu6E7LAMf12E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=o/QG/lw4; arc=none smtp.client-ip=95.215.58.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="o/QG/lw4" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762341; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=KPt8FtuCxts2rIT8LYxYlii0QfD3k+uu3aBbt0C8m+w=; b=o/QG/lw4IOwQFCUm2A4fnOGTcp+vWpVJ60NU5X0Gvezz8ozDsWYZrgcuzcv03ZqfCU2qTL M3iUvaqT4i/xAb93I9pkkdTfnXLSapL2BROqTcHinjolD1mvo89anYgM8mLuwe5rZpZAMK 2GXMFd35Gsz3WpGtAk2tGr4XzzK8I4A= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter 
Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 07/11] mm/bpf: Add bpf_try_to_free_mem_cgroup_pages kfunc Date: Tue, 26 May 2026 10:24:55 +0800 Message-ID: <13b10d91aff4307580d1a601f1592efe42a92b05.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Expose the memory cgroup reclaim interface to BPF programs by adding the bpf_try_to_free_mem_cgroup_pages kfunc. This allows BPF to trigger memory reclamation for a specific cgroup. The kfunc wraps try_to_free_mem_cgroup_pages and introduces a swappiness parameter with the following semantics: Values in [MIN_SWAPPINESS, SWAPPINESS_ANON_ONLY] are passed through as an explicit swappiness override. Values below MIN_SWAPPINESS indicate the use of the system default (passed as NULL to the core reclaim path). Values above SWAPPINESS_ANON_ONLY are rejected as invalid (-EINVAL). Note that the swappiness override is only respected by the core reclaim path if the MEMCG_RECLAIM_PROACTIVE flag is set in reclaim_options. Swap usage during reclaim is gated on reclaim_options: swap is considered only when MEMCG_RECLAIM_MAY_SWAP is set. Without this flag, reclaim is restricted to file-backed pages regardless of the swappiness value or the cgroup's swappiness setting. 
Also include <linux/swap.h> for the swappiness macro definitions and register the function with the KF_SLEEPABLE flag. Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- mm/bpf_memcontrol.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 1f726a7b22e3..0353c8736aa5 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -6,6 +6,7 @@ */ =20 #include +#include #include =20 /* Protects memcg->bpf_ops pointer for read and write. */ @@ -162,6 +163,60 @@ __bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem= _cgroup *memcg) mem_cgroup_flush_stats(memcg); } =20 +/** + * bpf_try_to_free_mem_cgroup_pages - attempt to reclaim pages from + * a memory cgroup + * @memcg: the target memory cgroup to reclaim from + * @nr_pages: the number of pages to reclaim + * @gfp_mask: GFP flags controlling the reclaim behavior + * @reclaim_options: bitmask of MEMCG_RECLAIM_* flags to tune + * reclaim strategy + * @swappiness: swappiness override value, or a sentinel to use + * the default + * + * BPF-facing wrapper around try_to_free_mem_cgroup_pages() that + * validates and translates the @swappiness argument before + * delegating to the core reclaim path. + * + * The @swappiness parameter follows these semantics: + * - Values in [MIN_SWAPPINESS, SWAPPINESS_ANON_ONLY] are passed + * through as an explicit swappiness override. + * - Values below MIN_SWAPPINESS are treated as "use the system + * default"; the override pointer is set to NULL and the cgroup's + * own swappiness setting takes effect. + * - Values above SWAPPINESS_ANON_ONLY are rejected as invalid. + * - If @reclaim_options does not include MEMCG_RECLAIM_PROACTIVE, + * the @swappiness override is ignored entirely by the core + * reclaim path and the system default is used regardless. + * + * Swap usage during reclaim is gated on @reclaim_options: swap is + * considered only when MEMCG_RECLAIM_MAY_SWAP is set. 
Without this + * flag, reclaim is restricted to file-backed pages regardless of the + * @swappiness value or the cgroup's swappiness setting. + * + * Return: + * The number of pages actually reclaimed on success, or -%EINVAL + * if @swappiness exceeds SWAPPINESS_ANON_ONLY. + */ +unsigned long bpf_try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int swappiness) +{ + int *swapiness_ptr; + + if (swappiness > SWAPPINESS_ANON_ONLY) + return -EINVAL; + else if (swappiness < MIN_SWAPPINESS) + swapiness_ptr =3D NULL; + else + swapiness_ptr =3D &swappiness; + + return try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, + reclaim_options, swapiness_ptr); +} + __bpf_kfunc_end_defs(); =20 BTF_KFUNCS_START(bpf_memcontrol_kfuncs) @@ -175,6 +230,8 @@ BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) =20 +BTF_ID_FLAGS(func, bpf_try_to_free_mem_cgroup_pages, KF_SLEEPABLE) + BTF_KFUNCS_END(bpf_memcontrol_kfuncs) =20 static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set =3D { --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-177.mta1.migadu.com (out-177.mta1.migadu.com [95.215.58.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3240E292B2E for ; Tue, 26 May 2026 02:25:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.177 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762360; cv=none; b=N3pDrzEOh7q805hR/k6cYoYf2P8a9Fd0J4KMFHmG2Rmh+0mUsDDTphSEmZbFOywGkIYuoDEDHM+Bs8ltZ/caLwkwL1bMzuiSy9DIjybwFyFJehyBC7nIOFgmk6AeztVfhij7cP1FYOtxPgLRzU5kCKepiU7BkUU+IZ2F2W2D8ow= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762360; c=relaxed/simple; 
bh=NZsybeCaQAY2gJXlMxVE7V8Ip8oqeh1zeTnxdRrNEMI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=oEg36PsHaQmDy5i1WN/Vp/kbLL2WK0aPSEskNSj1lLMPJIbDAmKW3xFO+pz/wDht6L7LDp5jkDEQJBREupQcpk+3xa7qj4nTtmpoPHdL4W16uUG9ZFQweMH5blqlE6cJjCxUeMpz46ZAQzypECSspfsq+N787WfoSBGmac9P42s= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=aIqbJCo0; arc=none smtp.client-ip=95.215.58.177 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="aIqbJCo0" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762355; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=33PG31psqtzKKyL9YlsX5pLdBGFtUZt06yU1qeQL7VI=; b=aIqbJCo0XQEKgTA2YA4Re8D8hMf0LPQrvtJE5STgVqdePPrMM/0xYlncquHOhe+FAkuR39 9giU5M3c6dqhw1IC6AqCwezcAbQ63csRN7nitGTrACbCh5vdfNYOqH7yOJaC4odiXOuAbM D8A/ow9PzQSU27qPcVKzgHEglhxdnUQ= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias 
Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 08/11] selftests/bpf: Add tests for memcg_bpf_ops Date: Tue, 26 May 2026 10:24:56 +0800 Message-ID: <722df14b403dfbb123f1a1df3c72fbbfb998a31f.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a comprehensive selftest suite for the `memcg_bpf_ops` functionality. These tests validate that BPF programs can correctly influence memory cgroup throttling behavior by implementing the new hooks. The test suite is added in `prog_tests/memcg_ops.c` and covers several key scenarios: 1. `test_memcg_ops_over_high`: Verifies that a BPF program can trigger throttling on a low-priority cgroup by returning a delay from the `get_high_delay_ms` hook when a high-priority cgroup is under pressure. 2. `test_memcg_ops_below_low_over_high`: Tests the combination of the `below_low` and `get_high_delay_ms` hooks, ensuring they work together as expected. 3. `test_memcg_ops_below_min_over_high`: Validates the interaction between the `below_min` and `get_high_delay_ms` hooks. 
The test framework sets up a cgroup hierarchy with high and low priority groups, attaches BPF programs, runs memory-intensive workloads, and asserts that the observed throttling (measured by workload execution time) matches expectations. The BPF program (`progs/memcg_ops.c`) uses a tracepoint on `memcg:count_memcg_events` (specifically PGFAULT) to detect memory pressure and trigger the appropriate hooks in response. This test suite provides essential validation for the new memory control mechanisms. Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + .../selftests/bpf/prog_tests/memcg_ops.c | 561 ++++++++++++++++++ tools/testing/selftests/bpf/progs/memcg_ops.c | 132 +++++ 3 files changed, 695 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/MAINTAINERS b/MAINTAINERS index dfc621ff629d..1be243e544da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6567,6 +6567,8 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 000000000000..19fd4fde2266 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include 
"memcg_ops.skel.h" + +#define TRIGGER_THRESHOLD 1 +#define OVER_HIGH_MS 2000 +#define FILE_SIZE (64 * 1024 * 1024ul) +#define BUFFER_SIZE (4096) +#define CG_LIMIT (120 * 1024 * 1024ul) + +#define CG_DIR "/memcg_ops_test" +#define CG_HIGH_DIR CG_DIR "/high" +#define CG_LOW_DIR CG_DIR "/low" + +static int +setup_high_low_cgroups(u64 *high_cgroup_id, int *low_cgroup_fd, + int *high_cgroup_fd) +{ + int ret; + char limit_buf[20]; + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR)) + goto cleanup; + close(ret); + ret =3D enable_controllers(CG_DIR, "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + snprintf(limit_buf, 20, "%lu", CG_LIMIT); + ret =3D write_cgroup_file(CG_DIR, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file memory.max")) + goto cleanup; + ret =3D write_cgroup_file(CG_DIR, "memory.swap.max", "0"); + if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR)) + goto cleanup; + if (high_cgroup_fd) + *high_cgroup_fd =3D ret; + else + close(ret); + *high_cgroup_id =3D get_cgroup_id(CG_HIGH_DIR); + if (!ASSERT_GT(*high_cgroup_id, 0, "get_cgroup_id")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_LOW_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR)) + goto cleanup; + if (low_cgroup_fd) + *low_cgroup_fd =3D ret; + else + close(ret); + + return 0; + +cleanup: + cleanup_cgroup_environment(); + return -1; +} + +int write_file(const char *filename) +{ + int ret =3D -1; + size_t written =3D 0; + char *buffer; + FILE *fp; + + fp =3D fopen(filename, "wb"); + if (!fp) + goto out; + + buffer =3D malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t 
to_write =3D (FILE_SIZE - written < BUFFER_SIZE) ? + (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) !=3D to_write) + goto cleanup; + written +=3D to_write; + } + + ret =3D 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +int read_file(const char *filename, int iterations) +{ + int ret =3D -1; + long page_size =3D sysconf(_SC_PAGESIZE); + char *p; + char *map; + size_t i; + int fd; + struct stat sb; + + fd =3D open(filename, O_RDONLY); + if (fd =3D=3D -1) + goto out; + + if (fstat(fd, &sb) =3D=3D -1) + goto cleanup_fd; + + if (sb.st_size !=3D FILE_SIZE) { + fprintf(stderr, "File size mismatch: expected %lu, got %lu\n", + (unsigned long)FILE_SIZE, (unsigned long)sb.st_size); + goto cleanup_fd; + } + + map =3D mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + if (map =3D=3D MAP_FAILED) + goto cleanup_fd; + + for (int iter =3D 0; iter < iterations; iter++) { + for (i =3D 0; i < FILE_SIZE; i +=3D page_size) { + /* access a byte to trigger page fault */ + p =3D &map[i]; + __asm__ __volatile__("" : : "r"(p) : "memory"); + } + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d %d done\n", __func__, getpid(), iter); + } + + if (munmap(map, FILE_SIZE) =3D=3D -1) + goto cleanup_fd; + + ret =3D 0; + +cleanup_fd: + close(fd); +out: + return ret; +} + +static int +real_test_memcg_ops_child_work(const char *cgroup_path, + char *data_filename, + char *time_filename, + int read_times) +{ + struct timeval start, end; + double elapsed; + FILE *fp; + int ret =3D -1; + + if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup")) + goto out; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d begin\n", __func__, getpid()); + + gettimeofday(&start, NULL); + + if (!ASSERT_OK(write_file(data_filename), "write_file")) + goto out; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d write_file done\n", __func__, getpid()); + + if (!ASSERT_OK(read_file(data_filename, read_times), 
"read_file")) + goto out; + + gettimeofday(&end, NULL); + + elapsed =3D (end.tv_sec - start.tv_sec) + + (end.tv_usec - start.tv_usec) / 1000000.0; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d end %.6f\n", __func__, getpid(), elapsed); + + fp =3D fopen(time_filename, "w"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + fprintf(fp, "%.6f", elapsed); + fclose(fp); + + ret =3D 0; +out: + return ret; +} + +static int get_time(char *time_filename, double *time) +{ + int ret =3D -1; + FILE *fp; + char buf[64]; + + fp =3D fopen(time_filename, "r"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + + if (!ASSERT_OK_PTR(fgets(buf, sizeof(buf), fp), "fgets")) + goto cleanup; + + if (sscanf(buf, "%lf", time) !=3D 1) { + PRINT_FAIL("sscanf %s", buf); + goto cleanup; + } + + ret =3D 0; +cleanup: + fclose(fp); +out: + return ret; +} + +static void real_test_memcg_ops(int read_times) +{ + int ret; + char data_file1[] =3D "/tmp/test_data_1_XXXXXX"; + char data_file2[] =3D "/tmp/test_data_2_XXXXXX"; + char time_file1[] =3D "/tmp/test_time_1_XXXXXX"; + char time_file2[] =3D "/tmp/test_time_2_XXXXXX"; + pid_t pid1, pid2; + double time1, time2; + int status; + + ret =3D mkstemp(data_file1); + if (!ASSERT_GE(ret, 0, "mkstemp")) + return; + close(ret); + ret =3D mkstemp(data_file2); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + ret =3D mkstemp(time_file1); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_data_file2; + close(ret); + ret =3D mkstemp(time_file2); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_time_file1; + close(ret); + + pid1 =3D fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid1 =3D=3D 0) { + exit(real_test_memcg_ops_child_work(CG_LOW_DIR, + data_file1, + time_file1, + read_times)); + } + + pid2 =3D fork(); + if (!ASSERT_GE(pid2, 0, "fork")) { + /* Reap first child to avoid a zombie if second fork fails. 
*/ + (void)waitpid(pid1, NULL, 0); + goto cleanup; + } + if (pid2 =3D=3D 0) { + exit(real_test_memcg_ops_child_work(CG_HIGH_DIR, + data_file2, + time_file2, + read_times)); + } + + ret =3D waitpid(pid1, &status, 0); + if (!ASSERT_GT(ret, 0, "child1 waitpid")) + goto cleanup; + if (!ASSERT_TRUE(WIFEXITED(status), "child1 exited normally")) + goto cleanup; + if (!ASSERT_EQ(WEXITSTATUS(status), 0, "child1 exit status")) + goto cleanup; + + ret =3D waitpid(pid2, &status, 0); + if (!ASSERT_GT(ret, 0, "child2 waitpid")) + goto cleanup; + if (!ASSERT_TRUE(WIFEXITED(status), "child2 exited normally")) + goto cleanup; + if (!ASSERT_EQ(WEXITSTATUS(status), 0, "child2 exit status")) + goto cleanup; + + if (get_time(time_file1, &time1)) + goto cleanup; + + if (get_time(time_file2, &time2)) + goto cleanup; + + if (time1 < time2 || time1 - time2 <=3D 1) + PRINT_FAIL("Low priority cgroup not slower: low=3D%f vs high=3D%f", + time1, time2); + +cleanup: + unlink(time_file2); +cleanup_time_file1: + unlink(time_file1); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +} + +void test_memcg_ops_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link2 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + u64 high_cgroup_id; + int low_cgroup_fd =3D -1; + + err =3D setup_high_low_cgroups(&high_cgroup_id, &low_cgroup_fd, NULL); + if (!ASSERT_OK(err, "setup_high_low_cgroups")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_data =3D calloc(1, bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "calloc(1, 
bpf_map__value_size(map))")) + goto out; + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + + opts.flags =3D BPF_F_CGROUP_FD; + opts.target_fd =3D low_cgroup_fd; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(5); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + if (skel) { + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + } + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_low_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + u64 high_cgroup_id; + int high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_high_low_cgroups(&high_cgroup_id, &low_cgroup_fd, + &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_high_low_cgroups")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D 
bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_data =3D calloc(1, bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "calloc(1, bpf_map__value_size(map))")) + goto out; + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D true; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name high_mcg_ops")) + goto out; + opts.flags =3D BPF_F_CGROUP_FD; + opts.target_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + opts.target_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + if (skel) { + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + } + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_min_over_high(void) 
+{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + u64 high_cgroup_id; + int high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_high_low_cgroups(&high_cgroup_id, &low_cgroup_fd, + &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_high_low_cgroups")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_data =3D calloc(1, bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "calloc(1, bpf_map__value_size(map))")) + goto out; + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D true; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name high_mcg_ops")) + goto out; + opts.flags =3D BPF_F_CGROUP_FD; + opts.target_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D 
bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + opts.target_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + if (skel) { + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + } + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/= selftests/bpf/progs/memcg_ops.c new file mode 100644 index 000000000000..4a1d817c1d9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/memcg_ops.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { 
+ data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + u64 current_ts; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +bool below_low_impl(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + if (!local_config.use_below_low) + return false; + + return need_threshold(); +} + +SEC("struct_ops/below_min") +bool below_min_impl(struct mem_cgroup *memcg, unsigned long emin, + unsigned long usage) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/memcg_charged") +unsigned int memcg_charged_impl(struct mem_cgroup *memcg, unsigned int nr_= pages) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .memcg_charged =3D (void *)memcg_charged_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-186.mta1.migadu.com (out-186.mta1.migadu.com [95.215.58.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D4A45305694 for ; Tue, 26 May 2026 02:28:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none 
smtp.client-ip=95.215.58.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762512; cv=none; b=WH1X/XxGpzENsnOODWKYoete9ltsGGvONxoc4oWlmRiNYQUhJGEEqdVhao+SFrajIxi9967EZ1zZIACJ2YvwANCJcOjrqhySSrN487hynUo3lpwGSov1TgSTRWezEUdWqpEjhME/80nuipGEBFg22pBnf8REFZqzKDkUK40VOmY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762512; c=relaxed/simple; bh=PhTbPAXU0oh4RHyr8WZhdjadx4blhl2K4PslbE2HiVE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JOkp9Wu6XGYZgaxMXxUQ6L/QhiyDQ+zGBSxIECEcx6gFmnmPZwmo/sev84hJP4FAspUflThKsTA70AM/ILxMklfxKiXQGXsGsfn2Ue+zL7wj7bSwHkDbxpfetuhtQAwVD+nPsTSTWsEBaX1+4FnrxtHg0xCKfa/FTvAtdbw225Q= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=u+OBhwgH; arc=none smtp.client-ip=95.215.58.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="u+OBhwgH" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762508; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=IvkZWXPTCl+S70lg431XAc/NqFXphJXLE3UltUPGFag=; b=u+OBhwgHyQzfoVd9lAzv0TIz8bmU0/usJ47HXAhTYO7GY84MUfYO57hSO7U40EGgyGQGfH 1BVXw8FwkxIO9cHiYVFTSRyoxyIreuB2LHK+hQBvhJfhDigbmUhuUroRxSu7kP/MzB2590 mI/XNhQ4g7XrJLtdtiSyIut7hdrKAZ0= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 09/11] selftests/bpf: Add test for memcg_bpf_ops hierarchies Date: Tue, 26 May 2026 10:27:54 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a new selftest, 
`test_memcg_ops_hierarchies`, to validate the behavior of attaching `memcg_bpf_ops` in a nested cgroup hierarchy, specifically testing the `BPF_F_ALLOW_OVERRIDE` flag. The test case performs the following steps: 1. Creates a three-level deep cgroup hierarchy: `/cg`, `/cg/cg`, and `/cg/cg/cg`. 2. Attaches a BPF struct_ops to the top-level cgroup (`/cg`) with the `BPF_F_ALLOW_OVERRIDE` flag. 3. Successfully attaches a new struct_ops to the middle cgroup (`/cg/cg`) without the flag, overriding the inherited one. 4. Asserts that attaching another struct_ops to the deepest cgroup (`/cg/cg/cg`) fails with -EBUSY, because its parent did not specify `BPF_F_ALLOW_OVERRIDE`. This test ensures that the attachment logic correctly enforces the override rules across a cgroup subtree. Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- .../selftests/bpf/prog_tests/memcg_ops.c | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c index 19fd4fde2266..b4084e9327eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -559,3 +559,76 @@ void test_memcg_ops_below_min_over_high(void) close(low_cgroup_fd); cleanup_cgroup_environment(); } + +void test_memcg_ops_hierarchies(void) +{ + int ret, first =3D -1, second =3D -1, third =3D -1; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct bpf_link *link1 =3D NULL, *link2 =3D NULL, *link3 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + first =3D create_and_get_cgroup("/cg"); + if (!ASSERT_GE(first, 0, "create_and_get_cgroup /cg")) + goto cleanup; + ret =3D enable_controllers("/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + second =3D 
create_and_get_cgroup("/cg/cg"); + if (!ASSERT_GE(second, 0, "create_and_get_cgroup /cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + third =3D create_and_get_cgroup("/cg/cg/cg"); + if (!ASSERT_GE(third, 0, "create_and_get_cgroup /cg/cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto cleanup; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto cleanup; + + opts.target_fd =3D first; + opts.flags =3D BPF_F_ALLOW_OVERRIDE | BPF_F_CGROUP_FD; + link1 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link1, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.target_fd =3D second; + opts.flags =3D BPF_F_CGROUP_FD; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.target_fd =3D third; + opts.flags =3D BPF_F_CGROUP_FD; + link3 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_ERR_PTR(link3, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(link3); + if (skel) { + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + } + close(first); + close(second); + close(third); + cleanup_cgroup_environment(); +} --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-177.mta1.migadu.com (out-177.mta1.migadu.com [95.215.58.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7232B30569D for ; Tue, 26 May 2026 02:28:54 +0000 (UTC) Authentication-Results: 
smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.177 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762536; cv=none; b=B/lFm110sQpUvVeQXSo5XNoi+Yqgv958Oa4kw+NSTf2++67CaMTZghuwEizH9Qj/01yRKvR93z620G3Y4bsQDH5QZVd6catjF3MPxkzMKWKtOXvPL5Mxxppd8OFfL7aSkTcEVzKJWW2EcAJ1zHi4nRsagdHLeFPYBmz6foo3cBw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762536; c=relaxed/simple; bh=e7X2eVAmkhgw2vNzLVS2uUCOI+Rw18Q1ZeOV/KkQp9A=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Qjr3N+gQ/0+mdBdQDaNV1Nkr+HguNTGCZFsE4N96LY55ioZathvGnNODbLIbtzTr+1UsZIAHrhE8CNo4yF4q7LiUEuC9h20ENZ4vvpFzAIrJ7nkwznOmft7M/sQ+ilMKHV69gdxp71zzTkeu54ZfExgH3KDCqVBWdN1fdvmE1n8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=QGVmMhZL; arc=none smtp.client-ip=95.215.58.177 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="QGVmMhZL" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762522; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=/EWtRGAXaZ/GNJCbWxnQ3W+mtK+0x2mWrk3Fzf8518s=; b=QGVmMhZLX/mU0YjjxSS9k1WBHWgbRtrrHgcswN3R+JFC3uh+kfo4nXYzC9Z+tX2T/2zfYV NXMYHphuYAz4AJSSJ9EN9nJIVtwNxYZinXyhWBd3liHSoOagF0ZAlUFMAXb+hJYQeiTnf/ z4W7JdW6QCFPgyaQjA8JK0JQYMORrW4= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 10/11] selftests/bpf: Add selftest for memcg async reclaim via BPF Date: Tue, 26 May 2026 10:27:55 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a BPF selftest that 
demonstrates and validates asynchronous memory reclaim for a memory cgroup using BPF struct_ops and the BPF workqueue mechanism. The BPF program (progs/memcg_async_reclaim.c) registers struct_ops callbacks for memcg_charged and memcg_uncharged to track the memory charge/uncharge events of a target cgroup. When accumulated memory usage exceeds a configured threshold, the memcg_charged callback enqueues an asynchronous workqueue item via bpf_wq_start(). The workqueue callback then invokes bpf_try_to_free_mem_cgroup_pages() to reclaim pages from the target memcg without blocking the charging context. The test (prog_tests/memcg_async_reclaim.c) verifies the effectiveness of this mechanism by: 1. Running a memory workload (sequential file write + mmap read) without the BPF async reclaim program attached, and asserting that the memcg "max" event counter increases, confirming that the cgroup memory limit is being hit. 2. Repeating the same workload with the BPF async reclaim program active, and asserting that the "max" event counter does NOT increase, confirming that proactive async reclaim successfully kept memory usage below the hard limit. A new helper read_cgroup_file() is added to cgroup_helpers.c to support reading memcg interface files (e.g. memory.events) from within the test infrastructure. The new test files are also registered in MAINTAINERS under the Memory Controller section. 
Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + tools/testing/selftests/bpf/cgroup_helpers.c | 41 +++ tools/testing/selftests/bpf/cgroup_helpers.h | 2 + .../bpf/prog_tests/memcg_async_reclaim.c | 333 ++++++++++++++++++ .../selftests/bpf/progs/memcg_async_reclaim.c | 203 +++++++++++ 5 files changed, 581 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_async_recl= aim.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_async_reclaim.c diff --git a/MAINTAINERS b/MAINTAINERS index 1be243e544da..b2e64ef8c60c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6567,7 +6567,9 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_async_reclaim.c F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_async_reclaim.c F: tools/testing/selftests/bpf/progs/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/s= elftests/bpf/cgroup_helpers.c index 45cd0b479fe3..22420d2f5199 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -167,6 +167,47 @@ int write_cgroup_file(const char *relative_path, const= char *file, return __write_cgroup_file(cgroup_path, file, buf); } =20 +/** + * read_cgroup_file() - Read content from a cgroup file + * @relative_path: The cgroup path, relative to the workdir + * @file: The name of the file in cgroupfs to read from + * @buf: Buffer to store the read data + * @buf_size: Size of the buffer + * + * Read the entire content of a cgroup file into the provided buffer. + * The buffer will be null-terminated on success. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int read_cgroup_file(const char *relative_path, const char *file, + char *buf, size_t buf_size) +{ + char cgroup_path[PATH_MAX - 24]; + char file_path[PATH_MAX + 1]; + int fd; + ssize_t len; + + if (!relative_path || !file || !buf || buf_size =3D=3D 0) + return -EINVAL; + + format_cgroup_path(cgroup_path, relative_path); + snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file); + + fd =3D open(file_path, O_RDONLY); + if (fd < 0) + return -errno; + + len =3D read(fd, buf, buf_size - 1); + if (len < 0) { + close(fd); + return -errno; + } + close(fd); + + buf[len] =3D '\0'; + return 0; +} + /** * write_cgroup_file_parent() - Write to a cgroup file in the parent proce= ss * workdir diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/s= elftests/bpf/cgroup_helpers.h index 3857304be874..d722e8ff8dee 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -13,6 +13,8 @@ int enable_controllers(const char *relative_path, const char *controllers); int write_cgroup_file(const char *relative_path, const char *file, const char *buf); +int read_cgroup_file(const char *relative_path, const char *file, + char *buf, size_t buf_size); int write_cgroup_file_parent(const char *relative_path, const char *file, const char *buf); int cgroup_setup_and_join(const char *relative_path); diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_async_reclaim.c b= /tools/testing/selftests/bpf/prog_tests/memcg_async_reclaim.c new file mode 100644 index 000000000000..bf25967c911c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_async_reclaim.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF async reclaim test + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cgroup_helpers.h" + +struct bpf_args_s { + u64 cgroup_id; + u64 limit_bytes; +}; + +#include 
"memcg_async_reclaim.skel.h" + +#define FILE_SIZE (64 * 1024 * 1024ul) +#define BUFFER_SIZE (4096) +#define CG_LIMIT (32 * 1024 * 1024ul) +#define CG_DIR1 "/memcg_async_reclaim1" +#define CG_DIR2 "/memcg_async_reclaim2" +#define RECLAIM_TRIGGER_SIZE (12 * 1024 * 1024ul) + +static int +setup_max_cgroup(const char *cg_path, u64 cg_max, u64 *cgroup_id, + int *cgroup_fd) +{ + int ret; + char limit_buf[20]; + + *cgroup_fd =3D create_and_get_cgroup(cg_path); + if (!ASSERT_GE(*cgroup_fd, 0, "create_and_get_cgroup")) + goto cleanup; + + *cgroup_id =3D get_cgroup_id(cg_path); + if (!ASSERT_GT(*cgroup_id, 0, "get_cgroup_id")) + goto cleanup; + + snprintf(limit_buf, 20, "%lu", cg_max); + ret =3D write_cgroup_file(cg_path, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file memory.max")) + goto cleanup; + + ret =3D write_cgroup_file(cg_path, "memory.swap.max", "0"); + if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max")) + goto cleanup; + + return ret; + +cleanup: + close(*cgroup_fd); + cleanup_cgroup_environment(); + return -1; +} + +static int +setup_bpf(u64 cg_id, int cg_fd, u64 limit_bytes, + struct memcg_async_reclaim **skel_ptr, struct bpf_link **link_ptr) +{ + struct memcg_async_reclaim *skel; + struct bpf_map *map; + struct bpf_link *link =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + struct bpf_args_s bpf_args =3D { + .limit_bytes =3D limit_bytes, + .cgroup_id =3D cg_id, + }; + LIBBPF_OPTS(bpf_test_run_opts, run_opts, + .ctx_in =3D &bpf_args, + .ctx_size_in =3D sizeof(bpf_args) + ); + int prog_init_fd; + + skel =3D memcg_async_reclaim__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_async_reclaim__open_and_load")) + goto error; + + prog_init_fd =3D bpf_program__fd(skel->progs.prog_init); + if (!ASSERT_GE(prog_init_fd, 0, "bpf_program__fd")) + goto destroy_skel; + if (!ASSERT_OK((bpf_prog_test_run_opts(prog_init_fd, &run_opts) || + run_opts.retval), "bpf_prog_test_run_opts")) + goto destroy_skel; + + map =3D 
bpf_object__find_map_by_name(skel->obj, "mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto destroy_skel; + opts.flags =3D BPF_F_CGROUP_FD; + opts.target_fd =3D cg_fd; + link =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops_opts")) + goto destroy_skel; + + *link_ptr =3D link; + *skel_ptr =3D skel; + return 0; + +destroy_skel: + memcg_async_reclaim__destroy(skel); +error: + return -1; +} + +static int write_file(const char *filename) +{ + int ret =3D -1; + size_t written =3D 0; + char *buffer; + FILE *fp; + + fp =3D fopen(filename, "wb"); + if (!fp) + goto out; + + buffer =3D malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t to_write =3D (FILE_SIZE - written < BUFFER_SIZE) ? + (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) !=3D to_write) + goto cleanup; + written +=3D to_write; + } + + ret =3D 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +static int read_file(const char *filename, int iterations) +{ + int ret =3D -1; + long page_size =3D sysconf(_SC_PAGESIZE); + char *p; + char *map; + size_t i; + int fd; + struct stat sb; + + fd =3D open(filename, O_RDONLY); + if (fd =3D=3D -1) + goto out; + + if (fstat(fd, &sb) =3D=3D -1) + goto cleanup_fd; + + if (sb.st_size !=3D FILE_SIZE) { + fprintf(stderr, "File size mismatch: expected %lu, got %lu\n", + (unsigned long)FILE_SIZE, (unsigned long)sb.st_size); + goto cleanup_fd; + } + + map =3D mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + if (map =3D=3D MAP_FAILED) + goto cleanup_fd; + + for (int iter =3D 0; iter < iterations; iter++) { + for (i =3D 0; i < FILE_SIZE; i +=3D page_size) { + /* access a byte to trigger page fault */ + p =3D &map[i]; + __asm__ __volatile__("" : : "r"(p) : "memory"); + } + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d %d done\n", __func__, 
getpid(), iter); + } + + if (munmap(map, FILE_SIZE) =3D=3D -1) + goto cleanup_fd; + + ret =3D 0; + +cleanup_fd: + close(fd); +out: + return ret; +} + +int get_cgroup_memory_event(const char *relative_path, const char *key, + u64 *value) +{ + char buf[1024]; + char *line, *saveptr1; + char *c, *saveptr2; + char *val_str =3D NULL; + bool found =3D false; + int ret, i; + + if (!key || !value) + return -EINVAL; + + ret =3D read_cgroup_file(relative_path, "memory.events", + buf, sizeof(buf)); + if (ret < 0) + return ret; + + for (line =3D strtok_r(buf, "\n", &saveptr1); line; + line =3D strtok_r(NULL, "\n", &saveptr1)) { + val_str =3D NULL; + i =3D 0; + + for (c =3D strtok_r(line, " ", &saveptr2); c; + c =3D strtok_r(NULL, " ", &saveptr2)) { + if (i =3D=3D 0) { + if (strcmp(c, key) !=3D 0) + break; + } else if (i =3D=3D 1) { + val_str =3D c; + break; + } + i++; + } + + if (val_str) { + char *endptr; + u64 v; + + v =3D strtoull(val_str, &endptr, 10); + if (endptr =3D=3D val_str) + return -EINVAL; + + *value =3D v; + found =3D true; + break; + } + } + + if (!found) + return -ENOENT; + + return 0; +} + +void test_memcg_async_reclaim(void) +{ + u64 cgroup_id, old_max, new_max; + int cgroup_fd, ret; + struct memcg_async_reclaim *skel; + struct bpf_link *link =3D NULL; + char data_file1[] =3D "/tmp/test_data_1_XXXXXX"; + char data_file2[] =3D "/tmp/test_data_2_XXXXXX"; + + if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_environment")) + return; + + // test without async_reclaim + if (!ASSERT_OK(setup_max_cgroup(CG_DIR1, CG_LIMIT, &cgroup_id, + &cgroup_fd), "setup_max_cgroup")) + goto cleanup_cgroup; + if (!ASSERT_OK(join_cgroup(CG_DIR1), "join_cgroup")) + goto close_cgroup_fd; + ret =3D mkstemp(data_file1); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto close_cgroup_fd; + close(ret); + + if (!ASSERT_OK(get_cgroup_memory_event(CG_DIR1, "max", &old_max), + "get_cgroup_memory_event")) + goto cleanup_data_file1; + if (!ASSERT_OK(write_file(data_file1), "write_file")) + goto 
cleanup_data_file1; + if (!ASSERT_OK(read_file(data_file1, 2), "read_file")) + goto cleanup_data_file1; + if (!ASSERT_OK(get_cgroup_memory_event(CG_DIR1, "max", &new_max), + "get_cgroup_memory_event")) + goto cleanup_data_file1; + if (!ASSERT_GT(new_max, old_max, "memcg max event not trigger")) + goto cleanup_data_file1; + + // test with async_reclaim + close(cgroup_fd); + if (!ASSERT_OK(setup_max_cgroup(CG_DIR2, CG_LIMIT, &cgroup_id, + &cgroup_fd), "setup_max_cgroup")) + goto cleanup_data_file1; + if (!ASSERT_OK(join_cgroup(CG_DIR2), "join_cgroup")) + goto cleanup_data_file1; + ret =3D mkstemp(data_file2); + if (!ASSERT_GE(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + + if (!ASSERT_OK(setup_bpf(cgroup_id, cgroup_fd, RECLAIM_TRIGGER_SIZE, + &skel, &link), + "setup_bpf")) + goto cleanup_data_file2; + if (!ASSERT_OK(get_cgroup_memory_event(CG_DIR2, "max", &old_max), + "get_cgroup_memory_event")) + goto cleanup; + if (!ASSERT_OK(write_file(data_file2), "write_file")) + goto cleanup; + if (!ASSERT_OK(read_file(data_file2, 2), "read_file")) + goto cleanup; + if (!ASSERT_OK(get_cgroup_memory_event(CG_DIR2, "max", &new_max), + "get_cgroup_memory_event")) + goto cleanup; + if (!ASSERT_EQ(new_max, old_max, "memcg max event triggered")) + goto cleanup; + +cleanup: + bpf_link__destroy(link); + memcg_async_reclaim__detach(skel); + memcg_async_reclaim__destroy(skel); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +close_cgroup_fd: + close(cgroup_fd); +cleanup_cgroup: + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_async_reclaim.c b/tool= s/testing/selftests/bpf/progs/memcg_async_reclaim.c new file mode 100644 index 000000000000..4e66766eb4a3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/memcg_async_reclaim.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include +#include + +#define BIT(nr) (1UL << (nr)) + +#define ___GFP_IO 
BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) + +#define __GFP_IO ((gfp_t)___GFP_IO) +#define __GFP_FS ((gfp_t)___GFP_FS) +#define __GFP_DIRECT_RECLAIM ((gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can = reclaim */ +#define __GFP_KSWAPD_RECLAIM ((gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can = wake */ +#define __GFP_RECLAIM ((gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM= )) + +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) + +#define ONE_MB_PAGE_COUNT 256 + +struct bpf_args_s { + u64 cgroup_id; + u64 limit_bytes; +} bpf_args; + +struct wq_elem { + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct wq_elem); +} wq_map SEC(".maps"); + +static s64 allocated; +static s64 old_allocated; +static u64 async_free_run; +static u64 initialize_status =3D 1; + +struct cgroup_memcg { + struct cgroup *cgrp; + struct mem_cgroup *memcg; +}; + +static int get_cgroup_memcg_from_id(u64 cgroup_id, struct cgroup_memcg *cm) +{ + cm->cgrp =3D bpf_cgroup_from_id(cgroup_id); + if (!cm->cgrp) + return -1; + + cm->memcg =3D bpf_get_mem_cgroup(&cm->cgrp->self); + if (!cm->memcg) { + bpf_cgroup_release(cm->cgrp); + return -1; + } + + return 0; +} + +static void put_cgroup_memcg(struct cgroup_memcg *cm) +{ + bpf_put_mem_cgroup(cm->memcg); + bpf_cgroup_release(cm->cgrp); +} + +static int async_free(void *map, int *key, void *value) +{ + struct cgroup_memcg cm; + bool started_wq =3D false; + + if (get_cgroup_memcg_from_id(bpf_args.cgroup_id, &cm) !=3D 0) + return 0; + + if (bpf_try_to_free_mem_cgroup_pages(cm.memcg, 32, GFP_KERNEL, + 0, -1) > 0) { + if (bpf_mem_cgroup_usage(cm.memcg) >=3D + bpf_args.limit_bytes - (ONE_MB_PAGE_COUNT * __PAGE_SIZE)) { + __u32 key2 =3D 0; + struct wq_elem *elem; + + elem =3D bpf_map_lookup_elem(&wq_map, &key2); + if (elem) { + 
bpf_wq_start(&elem->work, 0); + started_wq =3D true; + } + } + } + if (!started_wq) + __atomic_exchange_n(&async_free_run, 0, __ATOMIC_RELEASE); + + put_cgroup_memcg(&cm); + return 0; +} + +SEC("syscall") +int prog_init(struct bpf_args_s *ctx) +{ + struct wq_elem *elem; + __u32 key =3D 0; + int ret; + u64 expected =3D 1; + + if (!__atomic_compare_exchange_n(&initialize_status, + &expected, 2, + false, + __ATOMIC_ACQ_REL, + __ATOMIC_RELAXED)) + return -1; + + elem =3D bpf_map_lookup_elem(&wq_map, &key); + if (!elem) + return -1; + ret =3D bpf_wq_init(&elem->work, &wq_map, 0); + if (ret) + goto out; + ret =3D bpf_wq_set_callback(&elem->work, async_free, 0); + if (ret) + goto out; + + allocated =3D 0; + async_free_run =3D 0; + bpf_args.cgroup_id =3D ctx->cgroup_id; + bpf_args.limit_bytes =3D ctx->limit_bytes; + +out: + return ret; +} + +static u64 get_usage(void) +{ + u64 ret =3D 0; + struct cgroup_memcg cm; + + if (get_cgroup_memcg_from_id(bpf_args.cgroup_id, &cm) !=3D 0) + return 0; + + ret =3D bpf_mem_cgroup_usage(cm.memcg); + + put_cgroup_memcg(&cm); + + return ret; +} + +s64 abs_diff(s64 a, s64 b) +{ + return a > b ? 
a - b : b - a; +} + +SEC("struct_ops/memcg_charged") +unsigned int BPF_PROG(memcg_charged_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + struct wq_elem *elem; + __u32 key =3D 0; + u64 expected =3D 0; + s64 cur_allocated; + s64 cur_old_allocated; + + __atomic_add_fetch(&allocated, nr_pages, __ATOMIC_RELAXED); + cur_allocated =3D READ_ONCE(allocated); + cur_old_allocated =3D READ_ONCE(old_allocated); + if (abs_diff(cur_allocated, cur_old_allocated) < ONE_MB_PAGE_COUNT) + goto out; + WRITE_ONCE(old_allocated, cur_allocated); + + if (get_usage() < bpf_args.limit_bytes) + goto out; + + if (__atomic_compare_exchange_n(&async_free_run, + &expected, 1, + false, + __ATOMIC_ACQ_REL, + __ATOMIC_RELAXED)) { + elem =3D bpf_map_lookup_elem(&wq_map, &key); + if (!elem) + return 0; + + bpf_wq_start(&elem->work, 0); + } + +out: + return 0; +} + +SEC("struct_ops/memcg_uncharged") +void BPF_PROG(memcg_uncharged_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + __atomic_sub_fetch(&allocated, nr_pages, __ATOMIC_RELAXED); +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops mcg_ops =3D { + .memcg_charged =3D (void *)memcg_charged_impl, + .memcg_uncharged =3D (void *)memcg_uncharged_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; --=20 2.43.0 From nobody Mon Jun 8 22:51:09 2026 Received: from out-187.mta1.migadu.com (out-187.mta1.migadu.com [95.215.58.187]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9BAEB30C151 for ; Tue, 26 May 2026 02:28:57 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.187 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779762543; cv=none; b=T+WgKGPLA79isPd+JfYFFc4QNzZAt6kBG7FxMUgA5pKtvQBLivdTgd+9T8K1GoSkqP3nOedUjrLO8T4hf6cizHCWwGp5q2q88RX6jK04270Q0qvTm9ESLwPAoheNbx0MLsCgOXorejcRZPki1+vkkDK/7plFM0xzCikOoad7o1w= ARC-Message-Signature: i=1; a=rsa-sha256; 
d=subspace.kernel.org; s=arc-20240116; t=1779762543; c=relaxed/simple; bh=Jf+YLOQx08po9vWNDiKcTwJR5Yc7FK1PYhDSF/N9q9A=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=hXrduf1bjv7WlihBlV5Y5cAuUupVnwit/B5csOPu1G56j0e/dAJqVDBOmR6MYseeJvW+RJypdw7u54B72nLTJZ1VlhNUw9pEYY/4LC3A48q6abjT+wSE2fC83yBEV62Dkh+daJUWMFhxRFl42SUjgcYmBqFpghnYpDAyAMt5iBs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=ooNt4/C7; arc=none smtp.client-ip=95.215.58.187 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="ooNt4/C7" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1779762535; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=KczRhSioI0pWQzSBajeSyCSXDleWNCaV061zhAknM4U=; b=ooNt4/C7NQby/ceJxAVAqpuHhgObK8J1Y5g0K097oPrXuS/JoSwJdo1r5S75Bv1/0qxamv s0trwT96jlB56rB7j/KQAxUnEhvwdrD88dD/sGXsT0Lz2QZ6rB0poQ0nnv+thPr5vZI50W j+Nzz7JR2M03wk+E6p2K7DjMxkpJIS4= From: Hui Zhu To: Alexei Starovoitov , Daniel Borkmann , John Fastabend , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Kumar Kartikeya Dwivedi , Song Liu , Yonghong Song , Jiri Olsa , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , JP Kobryn , Andrew Morton , Shuah Khan , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , Stanislav Fomichev , KP Singh , Tao Chen , Mykyta Yatsenko , Leon Hwang , Anton Protopopov , Amery Hung , Tobias Klauser , Eyal Birger , Rong Tao , Hao Luo , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , Willem de Bruijn , Jason Xing , Paul Chaignon , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, bpf@vger.kernel.org, cgroups@vger.kernel.org, linux-mm@kvack.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: geliang@kernel.org, baohua@kernel.org, Hui Zhu Subject: [RFC PATCH bpf-next v7 11/11] samples/bpf: Add memcg priority control and async reclaim example Date: Tue, 26 May 2026 10:27:56 +0800 Message-ID: <39784d6dba757f4fb82134192419094ce42c5af5.1779760876.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT 
Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a sample program demonstrating two complementary use cases for the `memcg_bpf_ops` feature: priority-based memory throttling and workqueue-driven asynchronous page reclaim. The sample consists of a BPF program and a userspace loader: 1. memcg.bpf.c: A BPF program with the following capabilities: - Monitors PGFAULT events on a high-priority cgroup via a tracepoint. When the per-second PGFAULT sum exceeds a configurable threshold, a trigger timestamp is recorded. - Priority throttling: uses the `below_low` / `below_min` hooks on the high-priority cgroup and the `memcg_charged` hook on the low-priority cgroup to apply a configurable delay (over_high_ms), protecting the high-priority workload. - Async reclaim: uses the `memcg_charged` / `memcg_uncharged` hooks together with a BPF workqueue to trigger background page reclaim on the low-priority cgroup when its memory usage exceeds a configurable byte threshold (async_trigger_bytes), without blocking the charging context. - Six struct_ops variants are exported to allow userspace to attach only the hooks needed for the chosen feature combination: high_mcg_ops, high_mcg_ops_below_low, high_mcg_ops_below_min, low_mcg_ops (combined), low_mcg_ops_high_delay, low_mcg_ops_async. - A `prog_init` syscall program initialises the BPF workqueue and copies the configuration from userspace before struct_ops are attached. 2. memcg.c: A userspace loader that parses command-line arguments, resolves cgroup IDs from filesystem inodes, loads the BPF skeleton, calls prog_init via bpf_prog_test_run_opts(), and selects and attaches the appropriate struct_ops map for the requested feature combination. It supports BPF_F_ALLOW_OVERRIDE for stackable policies. Users can run workloads of different priorities in two cgroups and observe the low-priority workload being throttled or proactively reclaimed to protect the high-priority one. 
Example usage: # Priority throttling only: # ./memcg --low_path /sys/fs/cgroup/low \ # --high_path /sys/fs/cgroup/high \ # --threshold 1000 --over_high_ms 500 --use_below_low # Async reclaim only: # ./memcg --low_path /sys/fs/cgroup/low \ # --threshold 1000 --async_trigger_bytes 33554432 # Both features combined: # ./memcg --low_path /sys/fs/cgroup/low \ # --high_path /sys/fs/cgroup/high \ # --threshold 1000 --over_high_ms 500 \ # --async_trigger_bytes 33554432 Signed-off-by: Barry Song Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 8 +- samples/bpf/memcg.bpf.c | 380 +++++++++++++++++++++++++++++++++++++ samples/bpf/memcg.c | 411 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 801 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/memcg.bpf.c create mode 100644 samples/bpf/memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index b2e64ef8c60c..a3f737a506b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6566,6 +6566,8 @@ F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c +F: samples/bpf/memcg.bpf.c +F: samples/bpf/memcg.c F: samples/cgroup/* F: tools/testing/selftests/bpf/prog_tests/memcg_async_reclaim.c F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0002cd359fb1..0de6569cdefd 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -49,3 +49,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +memcg diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 95a4fa1f1e44..b00698bdc53b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y +=3D xdp_fwd tprogs-y +=3D task_fd_query tprogs-y +=3D ibumad tprogs-y +=3D hbm +tprogs-y +=3D memcg =20 # Libbpf dependencies LIBBPF_SRC =3D $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y +=3D task_fd_query_kern.o always-y +=3D ibumad_kern.o always-y +=3D hbm_out_kern.o always-y +=3D 
hbm_edt_kern.o +always-y +=3D memcg.bpf.o =20 COMMON_CFLAGS =3D $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS =3D $(TPROGS_USER_LDFLAGS) @@ -289,6 +291,8 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h =20 +memcg: $(obj)/memcg.skel.h + # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # TPROGS_CFLAGS causes conflicts XDP_SAMPLE_CFLAGS +=3D -Wall -O2 \ @@ -347,11 +351,13 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src= )/xdp_sample.bpf.h $(src)/x -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ =20 -LINKED_SKELS :=3D xdp_router_ipv4.skel.h +LINKED_SKELS :=3D xdp_router_ipv4.skel.h memcg.skel.h clean-files +=3D $(LINKED_SKELS) =20 xdp_router_ipv4.skel.h-deps :=3D xdp_router_ipv4.bpf.o xdp_sample.bpf.o =20 +memcg.skel.h-deps :=3D memcg.bpf.o + LINKED_BPF_SRCS :=3D $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SK= ELS),$($(skel)-deps))) =20 BPF_SRCS_LINKED :=3D $(notdir $(wildcard $(src)/*.bpf.c)) diff --git a/samples/bpf/memcg.bpf.c b/samples/bpf/memcg.bpf.c new file mode 100644 index 000000000000..0995284794ac --- /dev/null +++ b/samples/bpf/memcg.bpf.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 +#define ONE_MB_PAGE_COUNT 256 + +/* GFP flags needed by bpf_try_to_free_mem_cgroup_pages() */ +#define BIT(nr) (1UL << (nr)) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define __GFP_IO ((gfp_t)___GFP_IO) +#define __GFP_FS ((gfp_t)___GFP_FS) +#define __GFP_DIRECT_RECLAIM ((gfp_t)___GFP_DIRECT_RECLAIM) +#define __GFP_KSWAPD_RECLAIM ((gfp_t)___GFP_KSWAPD_RECLAIM) +#define __GFP_RECLAIM ((gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM= )) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | 
__GFP_FS) + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) + +#define ASYNC_FREE_BATCH 32 +#define ASYNC_FREE_LOOP_MAX 16 + +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) =3D (val)) + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + u64 low_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; + u64 async_trigger_bytes; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +struct wq_elem { + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct wq_elem); +} wq_map SEC(".maps"); + +static s64 allocated; +static s64 old_allocated; +/* + * async_free_run: 0 =3D idle, 1 =3D workqueue item is queued/running. + * Acts as a one-shot guard: only one reclaim task is in-flight at + * a time. Cleared by async_free() once reclaim is complete and + * re-armed by __memcg_charged_impl() on the next trigger. + */ +static u64 async_free_run; + +/* + * wq_initialized: flipped from 0 -> 1 by the first prog_init() call; + * any later prog_init() call is rejected with -1 and changes nothing.
+ */ +static u64 wq_initialized; + +struct cgroup_memcg { + struct cgroup *cgrp; + struct mem_cgroup *memcg; +}; + +static int get_cgroup_memcg_from_id(u64 cgroup_id, struct cgroup_memcg *cm) +{ + cm->cgrp =3D bpf_cgroup_from_id(cgroup_id); + if (!cm->cgrp) + return -1; + + cm->memcg =3D bpf_get_mem_cgroup(&cm->cgrp->self); + if (!cm->memcg) { + bpf_cgroup_release(cm->cgrp); + return -1; + } + return 0; +} + +static void put_cgroup_memcg(struct cgroup_memcg *cm) +{ + bpf_put_mem_cgroup(cm->memcg); + bpf_cgroup_release(cm->cgrp); +} + +static int async_free(void *map, int *key, void *value) +{ + struct cgroup_memcg cm; + bool started_wq =3D false; + int i; + + if (get_cgroup_memcg_from_id(local_config.low_cgroup_id, &cm) !=3D 0) + return 0; + + for (i =3D 0; i < ASYNC_FREE_LOOP_MAX; i++) { + if (bpf_try_to_free_mem_cgroup_pages(cm.memcg, ASYNC_FREE_BATCH, + GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP, + -1) <=3D 0) + break; + + if (bpf_mem_cgroup_usage(cm.memcg) < + local_config.async_trigger_bytes) + break; + } + + if (i =3D=3D ASYNC_FREE_LOOP_MAX) { + __u32 k =3D 0; + struct wq_elem *elem =3D bpf_map_lookup_elem(&wq_map, &k); + + if (elem) { + bpf_wq_start(&elem->work, 0); + started_wq =3D true; + } + } + + put_cgroup_memcg(&cm); + + if (!started_wq) + __atomic_exchange_n(&async_free_run, 0, __ATOMIC_RELEASE); + return 0; +} + +SEC("syscall") +int prog_init(struct local_config *ctx) +{ + struct wq_elem *elem; + __u32 key =3D 0; + u64 expected =3D 0; + int ret =3D -1; + + /* Guard against double-initialisation */ + if (!__atomic_compare_exchange_n(&wq_initialized, &expected, 1, + false, + __ATOMIC_ACQ_REL, + __ATOMIC_RELAXED)) + goto out; + + elem =3D bpf_map_lookup_elem(&wq_map, &key); + if (!elem) + goto out; + ret =3D bpf_wq_init(&elem->work, &wq_map, 0); + if (ret) + goto out; + ret =3D bpf_wq_set_callback(&elem->work, async_free, 0); + if (ret) + goto out; + + allocated =3D 0; + async_free_run =3D 0; + __builtin_memcpy(&local_config, ctx, sizeof(local_config)); + 
+out: + return ret; +} + +SEC("tp/memcg/count_memcg_events") +int handle_count_memcg_events( + struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + ctx->item !=3D PGFAULT) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, &current_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + u64 current_ts; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +bool below_low_impl(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + return need_threshold(); +} + +SEC("struct_ops/below_min") +bool below_min_impl(struct mem_cgroup *memcg, unsigned long elow, + unsigned long usage) +{ + return need_threshold(); +} + +static u64 get_usage(void) +{ + u64 ret =3D 0; + struct cgroup_memcg cm; + + if (get_cgroup_memcg_from_id(local_config.low_cgroup_id, &cm) !=3D 0) + return 0; + + ret =3D bpf_mem_cgroup_usage(cm.memcg); + + put_cgroup_memcg(&cm); + + return ret; +} + +static __always_inline s64 abs_diff(s64 a, s64 b) +{ + return a > b ?
a - b : b - a; +} + +static __always_inline unsigned int +__memcg_charged_impl(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + struct wq_elem *elem; + __u32 key =3D 0; + u64 expected =3D 0; + s64 cur_allocated; + s64 cur_old_allocated; + + __atomic_add_fetch(&allocated, nr_pages, __ATOMIC_RELAXED); + cur_allocated =3D READ_ONCE(allocated); + cur_old_allocated =3D READ_ONCE(old_allocated); + if (abs_diff(cur_allocated, cur_old_allocated) < ONE_MB_PAGE_COUNT) + goto out; + WRITE_ONCE(old_allocated, cur_allocated); + + if (get_usage() < local_config.async_trigger_bytes) + goto out; + + if (__atomic_compare_exchange_n(&async_free_run, + &expected, 1, + false, + __ATOMIC_ACQ_REL, + __ATOMIC_RELAXED)) { + elem =3D bpf_map_lookup_elem(&wq_map, &key); + if (!elem) + return 0; + + bpf_wq_start(&elem->work, 0); + } + +out: + return 0; +} + +SEC("struct_ops/memcg_charged") +unsigned int BPF_PROG(memcg_charged_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + return __memcg_charged_impl(memcg, nr_pages); +} + +SEC("struct_ops/memcg_uncharged") +void BPF_PROG(memcg_uncharged_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + __atomic_sub_fetch(&allocated, nr_pages, __ATOMIC_RELAXED); +} + +unsigned int +__get_high_delay_ms_impl(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC("struct_ops/memcg_charged") +unsigned int BPF_PROG(get_high_delay_ms_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + return __get_high_delay_ms_impl(memcg, nr_pages); +} + +SEC("struct_ops/memcg_charged") +unsigned int BPF_PROG(low_mcg_impl, struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + __memcg_charged_impl(memcg, nr_pages); + + return __get_high_delay_ms_impl(memcg, nr_pages); +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct 
memcg_bpf_ops high_mcg_ops_below_low =3D { + .below_low =3D (void *)below_low_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops_below_min =3D { + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .memcg_charged =3D (void *)low_mcg_impl, + .memcg_uncharged =3D (void *)memcg_uncharged_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops_high_delay =3D { + .memcg_charged =3D (void *)get_high_delay_ms_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops_async =3D { + .memcg_charged =3D (void *)memcg_charged_impl, + .memcg_uncharged =3D (void *)memcg_uncharged_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; diff --git a/samples/bpf/memcg.c b/samples/bpf/memcg.c new file mode 100644 index 000000000000..0929d868e6d8 --- /dev/null +++ b/samples/bpf/memcg.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __MEMCG_RSTAT_SIMPLE_BPF_SKEL_H__ +#define u64 uint64_t +#endif + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + u64 low_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; + u64 async_trigger_bytes; +}; + +#include "memcg.skel.h" + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting =3D true; +} + +static void usage(const char *name) +{ + fprintf(stderr, + "Usage: %s --low_path=3D --high_path=3D\n" + " --threshold=3D [OPTIONS]\n\n", + name); + + fprintf(stderr, "Required arguments:\n"); + fprintf(stderr, + " -l, --low_path=3DPATH Low priority memcgroup path\n"); + fprintf(stderr, + " -g, --high_path=3DPATH High priority memcgroup path\n"); + fprintf(stderr, + " -t, --threshold=3DVALUE Sum of PGFAULT 'val' events from\n" + " the high-priority cgroup per second\n" + " needed to trigger low-priority\n" + " cgroup 
throttling\n\n"); + + fprintf(stderr, "Priority throttling options:\n"); + fprintf(stderr, + " -L, --use_below_low Enable the below_low hook on the\n" + " high-priority cgroup\n"); + fprintf(stderr, + " -M, --use_below_min Enable the below_min hook on the\n" + " high-priority cgroup\n"); + fprintf(stderr, + " -o, --over_high_ms=3DVALUE Delay (ms) returned by memcg_charged\n" + " for the low-priority cgroup while\n" + " throttling is active (default: 0)\n"); + fprintf(stderr, + " -a, --async_trigger_bytes=3DBYTES\n" + " Memory threshold bytes for\n" + " the async-reclaim Low priority\n" + " memcgroup above which background\n" + " page reclaim is triggered.\n" + " 0 or omitted =3D feature disabled.\n"); + fprintf(stderr, + " -O, --allow_override Set BPF_F_ALLOW_OVERRIDE when\n" + " attaching struct_ops\n\n"); + + fprintf(stderr, "Misc:\n"); + fprintf(stderr, " -h, --help Show this help message\n\n"); + + fprintf(stderr, "Examples:\n"); + fprintf(stderr, + " # Priority throttling only:\n" + " %s --low_path=3D/sys/fs/cgroup/low \\\n" + " --high_path=3D/sys/fs/cgroup/high \\\n" + " --threshold=3D1000 --over_high_ms=3D500 --use_below_low\n\n", + name); + fprintf(stderr, + " # Async reclaim only (no throttling):\n" + " %s --low_path=3D/sys/fs/cgroup/low \\\n" + " --threshold=3D1000 \\\n" + " --async_trigger_bytes=3D33554432\n\n", + name); + fprintf(stderr, + " # Both features combined:\n" + " %s --low_path=3D/sys/fs/cgroup/low \\\n" + " --high_path=3D/sys/fs/cgroup/high \\\n" + " --threshold=3D1000 --over_high_ms=3D500 \\\n" + " --async_trigger_bytes=3D33554432\n", + name); +} + +static uint64_t get_cgroup_id(const char *cgroup_path) +{ + struct stat st; + + if (!cgroup_path) { + fprintf(stderr, "Error: cgroup_path is NULL\n"); + return 0; + } + + if (stat(cgroup_path, &st) < 0) { + fprintf(stderr, "Error: stat(%s) failed: %d\n", + cgroup_path, errno); + return 0; + } + + return (uint64_t)st.st_ino; +} + +static uint64_t parse_u64(const char *str, const char *prog) +{ + 
uint64_t value; + + errno =3D 0; + value =3D strtoull(str, NULL, 10); + if (errno !=3D 0) { + fprintf(stderr, "ERROR: strtoull '%s' failed: %d\n", + str, errno); + usage(prog); + exit(-errno); + } + return value; +} + +static int +attach_ops(struct bpf_object *obj, __u32 opts_flags, const char *name, int= fd, + struct bpf_link **link_ptr) +{ + int err; + struct bpf_map *map; + struct bpf_link *link; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts, + .flags =3D opts_flags | BPF_F_CGROUP_FD, + .target_fd =3D fd, + ); + + map =3D bpf_object__find_map_by_name(obj, name); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find %s map\n", name); + err =3D -ESRCH; + goto out; + } + link =3D bpf_map__attach_struct_ops_opts(map, &opts); + err =3D libbpf_get_error(link); + if (err) { + link =3D NULL; + fprintf(stderr, + "Failed to attach struct ops %s: %d\n", + name, err); + goto out; + } + *link_ptr =3D link; + +out: + return err; +} + +int main(int argc, char **argv) +{ + int low_cgroup_fd =3D -1, high_cgroup_fd =3D -1; + struct local_config local_config =3D { + .threshold =3D 1, + .high_cgroup_id =3D 0, + .low_cgroup_id =3D 0, + .use_below_low =3D false, + .use_below_min =3D false, + .over_high_ms =3D 0, + .async_trigger_bytes =3D 0, + }; + LIBBPF_OPTS(bpf_test_run_opts, run_opts, + .ctx_in =3D &local_config, + .ctx_size_in =3D sizeof(local_config) + ); + int prog_init_fd; + __u32 opts_flags =3D 0; + const char *low_path =3D NULL; + const char *high_path =3D NULL; + struct memcg *skel =3D NULL; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_low =3D NULL, *link_high =3D NULL; + int err =3D -EINVAL; + int opt; + int option_index =3D 0; + + static struct option long_options[] =3D { + /* required */ + {"low_path", required_argument, 0, 'l'}, + {"high_path", required_argument, 0, 'g'}, + {"threshold", required_argument, 0, 't'}, + /* priority throttling */ + {"over_high_ms", required_argument, 0, 'o'}, + {"use_below_low", no_argument, 0, 'L'}, + 
{"use_below_min", no_argument, 0, 'M'}, + {"async_trigger_bytes", required_argument, 0, 'a'}, + {"allow_override", no_argument, 0, 'O'}, + /* misc */ + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0 } + }; + + while ((opt =3D getopt_long(argc, argv, "l:g:t:o:LMOa:h", + long_options, &option_index)) !=3D -1) { + switch (opt) { + case 'l': + low_path =3D optarg; + break; + case 'g': + high_path =3D optarg; + break; + case 't': + local_config.threshold =3D parse_u64(optarg, argv[0]); + break; + case 'o': + local_config.over_high_ms + =3D (unsigned int)parse_u64(optarg, argv[0]); + break; + case 'L': + local_config.use_below_low =3D true; + break; + case 'M': + local_config.use_below_min =3D true; + break; + case 'O': + opts_flags =3D BPF_F_ALLOW_OVERRIDE; + break; + case 'a': + local_config.async_trigger_bytes + =3D parse_u64(optarg, argv[0]); + break; + case 'h': + usage(argv[0]); + return 0; + default: + usage(argv[0]); + return -EINVAL; + } + } + + if ((!local_config.use_below_low && + !local_config.use_below_min && + !local_config.async_trigger_bytes && + !local_config.over_high_ms) || + ((local_config.use_below_low || local_config.use_below_min) && + !high_path) || + (local_config.async_trigger_bytes && !low_path) || + (local_config.over_high_ms && (!high_path || !low_path))) { + fprintf(stderr, "ERROR: Missing required arguments\n\n"); + usage(argv[0]); + goto out; + } + + + if (low_path) { + low_cgroup_fd =3D open(low_path, O_RDONLY); + if (low_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open low cgroup '%s' failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + + local_config.low_cgroup_id =3D get_cgroup_id(low_path); + if (!local_config.low_cgroup_id) { + fprintf(stderr, + "ERROR: get low cgroup '%s' id failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + } + + if (high_path) { + high_cgroup_fd =3D open(high_path, O_RDONLY); + if (high_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open high cgroup '%s' failed: %d\n", + 
high_path, errno); + err =3D -errno; + goto out; + } + + local_config.high_cgroup_id =3D get_cgroup_id(high_path); + if (!local_config.high_cgroup_id) { + fprintf(stderr, + "ERROR: get high cgroup '%s' id failed: %d\n", + high_path, errno); + err =3D -errno; + goto out; + } + } + + skel =3D memcg__open_and_load(); + if (!skel) { + err =3D -errno; + fprintf(stderr, + "ERROR: opening and loading BPF skeleton failed: %d\n", + err); + goto out; + } + + prog_init_fd =3D bpf_program__fd(skel->progs.prog_init); + err =3D bpf_prog_test_run_opts(prog_init_fd, &run_opts); + if (err || run_opts.retval) { + fprintf(stderr, + "ERROR: prog_init failed (err=3D%d retval=3D%d)\n", + err, run_opts.retval); + err =3D err ? err : -run_opts.retval; + goto out; + } + + if (local_config.use_below_low && local_config.use_below_min) { + err =3D attach_ops(skel->obj, opts_flags, "high_mcg_ops", + high_cgroup_fd, &link_high); + if (err) + goto out; + } else if (local_config.use_below_low) { + err =3D attach_ops(skel->obj, opts_flags, + "high_mcg_ops_below_low", + high_cgroup_fd, &link_high); + if (err) + goto out; + } else if (local_config.use_below_min) { + err =3D attach_ops(skel->obj, opts_flags, + "high_mcg_ops_below_min", + high_cgroup_fd, &link_high); + if (err) + goto out; + } + + if (local_config.over_high_ms && local_config.async_trigger_bytes) { + err =3D attach_ops(skel->obj, opts_flags, + "low_mcg_ops", + low_cgroup_fd, &link_low); + if (err) + goto out; + } else if (local_config.over_high_ms) { + err =3D attach_ops(skel->obj, opts_flags, + "low_mcg_ops_high_delay", + low_cgroup_fd, &link_low); + if (err) + goto out; + } else if (local_config.async_trigger_bytes) { + err =3D attach_ops(skel->obj, opts_flags, + "low_mcg_ops_async", + low_cgroup_fd, &link_low); + if (err) + goto out; + } + + if (local_config.use_below_low || local_config.use_below_min || + local_config.over_high_ms) { + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if 
(!prog) { + fprintf(stderr, + "ERROR: finding a prog in BPF object file failed\n"); + goto out; + } + + link =3D bpf_program__attach(prog); + err =3D libbpf_get_error(link); + if (err) { + link =3D NULL; + fprintf(stderr, + "ERROR: bpf_program__attach failed: %d\n", + err); + goto out; + } + } + + printf("Successfully attached!\n"); + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + while (!exiting) + pause(); + + printf("Exiting...\n"); + err =3D 0; + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_low); + bpf_link__destroy(link_high); + if (skel) { + memcg__detach(skel); + memcg__destroy(skel); + } + if (low_cgroup_fd >=3D 0) + close(low_cgroup_fd); + if (high_cgroup_fd >=3D 0) + close(high_cgroup_fd); + return err; +} --=20 2.43.0