From nobody Sat Feb 7 11:31:04 2026 Received: from out-186.mta0.migadu.com (out-186.mta0.migadu.com [91.218.175.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 75C9A320393 for ; Fri, 23 Jan 2026 08:56:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158582; cv=none; b=tRetqfRrPovZLfC7CfGrG7T1OAt0Z4gNugtJHSiBWz3OeeFpHmuFlM79+FJlnVWDUGtleGv0N0IPuSCOTo4jHfXpWafSBRLeCq67jZcsk0dYNrR/euH0Fa50sY5cvV0x5daZLyGUCuVI9DT/y0U7JIVNbxnCpysJ+j2GUJUVT7k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158582; c=relaxed/simple; bh=bWju0Pu/q1fcHhJ75vFo+k/1nN5ACrnizuMh4PQ2PyU=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=GYCx5qaSjXx04kU3b0zReVBXTq+QnWkqRDw3d0YqigsiUdqpU0gO5ItbKURi0KSyk1ZWx/py+HB3W0kw/AKspfj54c1CSD3a41PMfLSTZQBeJAX7+TO2E5U4GQNf5qpJWNxXQEcrodZTL3rrNrYEF1YsA74a/o+XIz0//jaNVUA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=I2mHUI2A; arc=none smtp.client-ip=91.218.175.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="I2mHUI2A" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158578; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=4IZwwlsklIfyYIeeJQqajpok/bifebBH8KgsHF7gR+Y=; b=I2mHUI2AYbqOhP0GZVQ7d5GjtaXLD2bXmUkzdEJnIsbrYIYnXnORHCehEq1EhcFDRTLZe4 lJuKqiCRjeZj8O4b1t5Th3DoiD+cse/Y69dRz5iANYxn/eayBcnQplrZ4hCExPD/EjK81m Q5XMWVUkr84piWYFfOv54D77MdMEa+A= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v3 01/12] bpf: move bpf_struct_ops_link into bpf.h Date: Fri, 23 Jan 2026 16:55:19 +0800 Message-ID: <6ddaa3949346e2b048b7bd6714d182c96a586d4c.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Move struct bpf_struct_ops_link's definition into bpf.h, where other custom bpf links definitions are. 
It's necessary to access its members from outside of generic bpf_struct_ops implementation, which will be done by following patches in the series. Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 6 ++++++ kernel/bpf/bpf_struct_ops.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..031682f1ad39 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1881,6 +1881,12 @@ struct bpf_raw_tp_link { u64 cookie; }; =20 +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; + wait_queue_head_t wait_hup; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c43346cb3d76..de01cf3025b3 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -55,12 +55,6 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; =20 -struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; - wait_queue_head_t wait_hup; -}; - static DEFINE_MUTEX(update_mutex); =20 #define VALUE_PREFIX "bpf_struct_ops_" --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C47E62FD685 for ; Fri, 23 Jan 2026 08:56:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.181 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158593; cv=none; b=sBK78EdeDeRdSbGu3okyGNtaS9HRxlT9gAuBnzS6ihww56Yw8XHOK9me+4xkjw0ffTlEAbSdNns5B3+pVdNhyE9N95P5CftrYbSARLcsmf96FATWRH0jiUWhfFheGO4pT1AcT2tX3jFtPXEn5mRHaMIczlDp4ZHCOQA1Ukv7SfI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158593; c=relaxed/simple; 
bh=VMWSjdZBbwCROxtM45L3N2YVJItH/AEKdFdBA0R/Kg8=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Gef5U08QWjEXRZL1AxQRu3Mwk+D0+oRkr+Xr6ys6I3AJUaYF4rtjmCr77HO+XHdZ4E7TMrQ7/eB/vQlKUYyvSZ0au5Rs4IoyxP+eHlEdT5bBjs2kfXk6YaLysRQbRLPjzw88h2favPmmsrlhqz3Prw5EFrvpx2tV2ePnxIlIp7U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=NMD50k4b; arc=none smtp.client-ip=91.218.175.181 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="NMD50k4b" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158589; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=mRzQHCKarCYBi7G1xIo0Ho9ytROzsvuCqz+L2Dk75mk=; b=NMD50k4bUm77bwDcYhOB0EpujUCmId6LC0LnmOQ+cN7HRcsMqcCVMBNhz7FEmicI88cTzs fCiKD/e7ZUORx7uTYhwqmUndQiS7GsgwVJdfUZ4G1Wtzv8/QnHJOyc0nCpr8f/K4LOtplX e4UW3EsGa7P7cP1zRrhbOsTQC9UwH8w= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro 
Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v3 02/12] bpf: initial support for attaching struct ops to cgroups Date: Fri, 23 Jan 2026 16:55:20 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin When a struct ops is being attached and a bpf link is created, allow a cgroup fd to be passed via bpf attr, so that the struct ops can be attached to a cgroup instead of globally. The attached struct ops doesn't hold a reference to the cgroup; it only preserves the cgroup id. 
Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 031682f1ad39..7c15bac782fc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1885,6 +1885,7 @@ struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; wait_queue_head_t wait_hup; + u64 cgroup_id; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index de01cf3025b3..f575c5cd0dc8 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,6 +13,7 @@ #include #include #include +#include =20 struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -1377,6 +1378,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_= lops, NULL, attr->link_create.attach_type); +#ifdef CONFIG_CGROUPS + if (attr->link_create.cgroup.relative_fd) { + struct cgroup *cgrp; + + cgrp =3D cgroup_get_from_fd(attr->link_create.cgroup.relative_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link->cgroup_id =3D cgroup_id(cgrp); + cgroup_put(cgrp); + } +#endif /* CONFIG_CGROUPS */ =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-189.mta0.migadu.com (out-189.mta0.migadu.com [91.218.175.189]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 40BFF31A571 for ; Fri, 23 Jan 2026 08:56:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158604; cv=none; 
b=Si3xOzI6kwzlpt7M9ZntXwBZpNjGYqvzPzdjN/ye7Crl2cJEmD2sazLcaU1csUBRkPWlpJTo0QVcS2PP0qHtcj7zTP7VOGcsF8XIZd09zK658a8KJp+xDWuZHz1zCw/PVX+GkyuHOkGLaJPd49GfKeSPXdknVlPLoe7CmcZcrJg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158604; c=relaxed/simple; bh=1UvSBdgGuBgiQRwX3e7nYHkdgZ7JTX6rdeF6qDJwPQo=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=ginzYY+b+kj7zWYmAIr3VeFVyRoFj/Fyb3MH5XLlMIfZBulW0xxLW/0RznBGF0ABYQVRA6qSTyrw4jpzf+7mUWcbikOfgiIOCnuWnhzwJ6RMnU8JkShDrJZAK/I3xZoK+AWNzjnJb1QBhI7hxObM2+nIzqNnNe/JuQUsxUqdqnU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=G/KkPluM; arc=none smtp.client-ip=91.218.175.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="G/KkPluM" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158600; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ev1nwp2jqQHVpe3TXk2x9AVEGB6Nj6mx2yP0IpcUZW4=; b=G/KkPluM8KiVv/KlXXvon+NjDg2D+OVXWkHzALnwohFtZK4prPRR6w8tcl7XhnGO9L1O5L yoLCuV9BE9e/bPYYrRXc1CFx1c+BQkJAnwkDsEou9XN8/GXO2/DN8T+qUPqi4V4YqtegaR vdecvuwC7H3OEHkwZGZukpN6hW5pqn8= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Kumar Kartikeya Dwivedi Subject: [RFC PATCH bpf-next v3 03/12] bpf: mark struct oom_control's memcg field as TRUSTED_OR_NULL Date: Fri, 23 Jan 2026 16:55:21 +0800 Message-ID: <27af33f679846a2783cc8d82111ebeac170b004c.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Struct oom_control is used to describe the OOM context. 
Its memcg field defines the scope of OOM: it's NULL for global OOMs and a valid memcg pointer for memcg-scoped OOMs. Teach bpf verifier to recognize it as a trusted or NULL pointer. It will provide the bpf OOM handler a trusted memcg pointer, which for example is required for iterating the memcg's subtree. Signed-off-by: Roman Gushchin Acked-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7a375f608263..e59acdbb5062 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7152,6 +7152,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)= { struct file *vm_file; }; =20 +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control) { + struct mem_cgroup *memcg; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7194,6 +7198,7 @@ static bool type_is_trusted_or_null(struct bpf_verifi= er_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control)); =20 return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-174.mta1.migadu.com (out-174.mta1.migadu.com [95.215.58.174]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 13B8D313274; Fri, 23 Jan 2026 08:58:42 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.174 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158727; cv=none; 
b=F3q3u1qqjogDrUwUmVrD06U7LEur8jMCGVtqS9scjnhvKTNygOS2v2bWeQRFw1sxpCG/Y+YG/CcDXC6eTWbRDzSiQreCxSaMqBNX9nntsPlHtZyOEOcV/h13yvYnQ4Ax5ndbERUrOZa9jR3itpRNe3ahGQm10TRuHd3he4Dexss= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158727; c=relaxed/simple; bh=2k9LiI+M//qM4q+K2IyZ0CNDJt0ZnwZ2pxw6XNWmxxQ=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=jWj3xpV8782eIPub/Da5j5uQwlnfXmXD+YmHQAs0eKxehpwP2fld7lYAepWKGjOLnQrYDriZrkwXhp88Hz4b/28TG58/qWvGZjdeVq+a/XdPmpiMPgXcM+iAWmWexqi3iw8xgmG8SfwBnsOoP9gPWSbkqSr/dyqhXyBuShuQkdM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=s1F4hAcN; arc=none smtp.client-ip=95.215.58.174 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="s1F4hAcN" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158710; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=MZDHd5qG1jM6uT5/JnPwA06VDLOpkfHtWx6qC88p08g=; b=s1F4hAcNIr7FO7zIIhXOc52f/ar923r2DlaIfnkq9t9LXEKwLC/zJvy/Jwl45AvVrjQ7V1 oM0I6+NJkSSr0xpaHsAfkzNd+jZZOEZBJTWEjJ9HsKumn8IOWsUJTw6yBQ9aPRHuFekepj Sqw7JE/M2z8LvOE2rpvDvCmlnRkH+nQ= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v3 04/12] mm: define mem_cgroup_get_from_ino() outside of CONFIG_SHRINKER_DEBUG Date: Fri, 23 Jan 2026 16:57:58 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin mem_cgroup_get_from_ino() can be reused by the BPF OOM implementation, but currently depends on CONFIG_SHRINKER_DEBUG. Remove this dependency. 
Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 229ac9835adb..f3b8c71870d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -833,9 +833,9 @@ static inline unsigned long mem_cgroup_ino(struct mem_c= group *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } +#endif =20 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1298,12 +1298,12 @@ static inline unsigned long mem_cgroup_ino(struct m= em_cgroup *memcg) { return 0; } +#endif =20 static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3808845bc8cc..1f74fce27677 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3658,7 +3658,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short = id) return xa_load(&mem_cgroup_ids, id); } =20 -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3679,7 +3678,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned l= ong ino) =20 return memcg; } -#endif =20 static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-189.mta1.migadu.com (out-189.mta1.migadu.com [95.215.58.189]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9045A33A010 for ; Fri, 23 Jan 2026 08:58:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1769158728; cv=none; b=NqPC5WehX9X1jjCGN1ki1GF4u4Bm1jo87yrKRqBIGNv0qKeLu8ffBCAsFoYvYi7vEk+U2Gjb9ZdVn2RMf9QWuyFv+RxzwdqoANBRLWqOzvqD/igR/PbIKQmEC7eVClSdL2W3mXAOrtAiA7OFzluC+E0yBcppLNryTkJavgmPv1A= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158728; c=relaxed/simple; bh=W2z9ag7GJdOo5y4sTtuZS1rUggdzi+NIWFP9U5JSzBw=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=VfJhB9uOr+M3zlJC0vNRVrPRa1LFAaccXFT5BqVyTsbNGfa2W+X7QJl4QZorQuU17rvzLTeQnpUmsuLX3KW8lhK7bWb8rTyOUvxmsyk6CcEB6i8rhYrBMPS+iZxFtL39+lhOmru2q1HvJYP3/JKBJGRBaqq3vJwAqzEnrPOUQdE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=fTSVavEG; arc=none smtp.client-ip=95.215.58.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="fTSVavEG" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158721; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=yMDUAyLhkyqoXu0AnX5yMDVCDjfmzYBfHTdL76REPRM=; b=fTSVavEGC9/BVpguN5QVYm6QgZJEp24s+u5liPxQawZxkjy2xNt2bcSuWIp+ppId6ZUqs0 JRlAOIGdjfyU2rNCDJGZVi2xCMUGSQ/OE35BXqXa+evHCC4pJld5lS+KTcbVbGzQ0QUo1n FPxiGBbhQtm8eBP+7QW/eolDZuX+f1g= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v3 05/12] libbpf: introduce bpf_map__attach_struct_ops_opts() Date: Fri, 23 Jan 2026 16:57:59 +0800 Message-ID: <03bf6aafe30def690c63e454d76e733f27dff4b3.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Introduce bpf_map__attach_struct_ops_opts(), an extended version of bpf_map__attach_struct_ops(), which takes additional 
struct bpf_struct_ops_opts argument. struct bpf_struct_ops_opts has the relative_fd member, which allows to pass an additional file descriptor argument. It can be used to attach struct ops maps to cgroups. Signed-off-by: Roman Gushchin --- tools/lib/bpf/bpf.c | 8 ++++++++ tools/lib/bpf/libbpf.c | 18 ++++++++++++++++-- tools/lib/bpf/libbpf.h | 14 ++++++++++++++ tools/lib/bpf/libbpf.map | 2 +- 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 21b57a629916..a2833a50a509 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -883,6 +883,14 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, cgroup)) return libbpf_err(-EINVAL); break; + case BPF_STRUCT_OPS: + relative_fd =3D OPTS_GET(opts, cgroup.relative_fd, 0); + attr.link_create.cgroup.relative_fd =3D relative_fd; + attr.link_create.cgroup.expected_revision =3D + OPTS_GET(opts, cgroup.expected_revision, 0); + if (!OPTS_ZEROED(opts, cgroup)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index bbcfd72b07d5..37eb4f96b28e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13459,12 +13459,19 @@ static int bpf_link__detach_struct_ops(struct bpf= _link *link) return close(link->fd); } =20 -struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts); struct bpf_link_struct_ops *link; __u32 zero =3D 0; int err, fd; =20 + if (!OPTS_VALID(opts, bpf_struct_ops_opts)) { + pr_warn("map '%s': invalid opts\n", map->name); + return libbpf_err_ptr(-EINVAL); + } + if (!bpf_map__is_struct_ops(map)) { pr_warn("map '%s': can't attach non-struct_ops map\n", map->name); return libbpf_err_ptr(-EINVAL); @@ -13500,7 +13507,9 @@ struct 
bpf_link *bpf_map__attach_struct_ops(const s= truct bpf_map *map) return &link->link; } =20 - fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + + fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { free(link); return libbpf_err_ptr(fd); @@ -13512,6 +13521,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const = struct bpf_map *map) return &link->link; } =20 +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + return bpf_map__attach_struct_ops_opts(map, NULL); +} + /* * Swap the back struct_ops of a link with a new struct_ops map. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index dfc37a615578..5aef44bcfcc2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -920,6 +920,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_progr= am *prog, int cgroup_fd, struct bpf_map; =20 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_ma= p *map); + +struct bpf_struct_ops_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u64 expected_revision; + size_t :0; +}; +#define bpf_struct_ops_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts); LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bp= f_map *map); =20 struct bpf_iter_attach_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d18fbcea7578..2bf514462045 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -453,5 +453,5 @@ LIBBPF_1.7.0 { bpf_map__exclusive_program; bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; - btf__permute; + bpf_map__attach_struct_ops_opts; } LIBBPF_1.6.0; --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-173.mta1.migadu.com 
(out-173.mta1.migadu.com [95.215.58.173]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BB2D034C145; Fri, 23 Jan 2026 08:58:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.173 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158740; cv=none; b=TOqzMfikxfhp7P31PBxPg7fx4njVpe5mKul1ttTVlmEQf+NYL0BlQlgsOcRv2N1rvc76a1MY7jMaH+VLreV+tMwHRULbFxKf0EaSy3/mNZXL8PljjWi+DEzshTSOE7L2YNjlMZcRIdcVratDmNzpW8Fe0S5vaF5RsII1OS364+o= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158740; c=relaxed/simple; bh=W33/TQ8Cn+9VYTNLkRzhBtnypxwCG7M1ZaFpZizuCzU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=cBvLFUW9gRlEZdgiT/Q1Ti4rlKNszdEJgBIN1ppSFdxuTHbU9SYPJ6bcMmw1GONOLJnTicM6qz2jo4HH97/cXSG2Pq+eAmQ7TZdT6XHjj3d6/HdXEQjabtjkCaGUxigeWNdj8h1rvtx5zHmPV3H/6msSnuGlS8B90hQrfL0Hzws= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=e6LwxY5N; arc=none smtp.client-ip=95.215.58.173 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="e6LwxY5N" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158734; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=dF3N7wfqonOz2JeVOK2D6Fq8XUWqRSBOclHYLh6Qqp8=; b=e6LwxY5NkLu/Ogam+UhCiUBdJX+tXDML3QQGaCMjx/GTLIvuPNCLqgkIQBpsnEysmISM28 GtPQ7AaT/cv06QB9YrunhGFVJPXzZVEYL+0R86DafDY/b032z6nS+0vw0ZYygcIqz1B8rk BxgDFTkJVddmfDtf64LslrldN1LMxx0= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 06/12] bpf: Pass flags in bpf_link_create for struct_ops Date: Fri, 23 Jan 2026 16:58:00 +0800 Message-ID: <04f5673e4b9c992a6e42ef3ea36db1df1418815f.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To support features like allowing overrides in cgroup hierarchies, we need a way to pass flags from 
userspace to the kernel when attaching a struct_ops. Extend `bpf_struct_ops_link` to include a `flags` field. This field is populated from `attr->link_create.flags` during link creation. This will allow struct_ops implementations, such as the upcoming memory controller ops, to interpret these flags and modify their attachment behavior accordingly. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 1 + tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c15bac782fc..b8fde3bf4b91 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1886,6 +1886,7 @@ struct bpf_struct_ops_link { struct bpf_map __rcu *map; wait_queue_head_t wait_hup; u64 cgroup_id; + u32 flags; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index f575c5cd0dc8..18042751f1eb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1390,6 +1390,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) cgroup_put(cgrp); } #endif /* CONFIG_CGROUPS */ + link->flags =3D attr->link_create.flags; =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..a58fdb4484a4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1184,7 +1184,7 @@ enum bpf_perf_event_type { BPF_PERF_EVENT_EVENT =3D 6, }; =20 -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH and BPF_LINK_CREATE com= mand * * NONE(default): No further bpf programs allowed in the subtree. 
* --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-176.mta1.migadu.com (out-176.mta1.migadu.com [95.215.58.176]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4C66E350A3B; Fri, 23 Jan 2026 08:59:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.176 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158750; cv=none; b=XYdyePQm9JuTIuMRpG8ezgAbUbmd5P8dTMoVKMViUDCnidDkRUjnwCDLXoMzg/8fP1AolYIX0T0+7K66MVG0VhqV/QQEZK6YriXaJt9opeKsX4FuXQxBl1yY4m944RWDeZlnNh8X1FhS+ffSFavXyjG7gV15cyDkEA2BtnhqDZE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158750; c=relaxed/simple; bh=FOZgACDD42oP7q3Vatpm8aL6PwAkwL4upLYGHYnoYiY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=r/EmRMdck9cQLu01LbQ+KPxe8wRVkdoHMPkTpv3F1HK3AmmgMQy9bJdahbEYPTUxFUvyUxln67jzu1Ohf7DJ1KxpXNbIV8D2gLivujumc+mWTjGrQPYvWL0+nBGy8eYJrxXdZ1EpPIlpZ1tzUgKRgWPD1eoFcPLed2w5/zM2SQQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=a7IIP/Es; arc=none smtp.client-ip=95.215.58.176 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="a7IIP/Es" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158746; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=uLmLgIc/WysrcWe9//I7t11dZt4RG+riPvAd1pdlrBg=; b=a7IIP/EsFUJnD3r7OnHIdd2/eiTbPkC3e6yHe7BggslKBJxB4oQ47RRY0b98uf/B/WTkxM OQmGL1/JCv/S9EGgOwUTCr3OVVhK/BTpgiHN8sZ8WSgXM0gPOvPGTUX26n9gi3LG9mgpSZ nAR6Ug/78D3j1Ymvcl56/aRR5+vBNjw= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 07/12] libbpf: Support passing user-defined flags for struct_ops Date: Fri, 23 Jan 2026 16:58:01 +0800 Message-ID: <006635a13eeae0fe8da56447bc5131f6a80a48bf.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Building on the previous change that added flags to the kernel's link creation path, this patch 
exposes this functionality through libbpf. The `bpf_struct_ops_opts` struct is extended with a `flags` member, which is then passed to the `bpf_link_create` syscall within `bpf_map__attach_struct_ops_opts`. This enables userspace applications to pass flags, such as `BPF_F_ALLOW_OVERRIDE`, when attaching struct_ops to cgroups, providing more control over the attachment behavior in nested hierarchies. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 37eb4f96b28e..089bd1325468 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13508,6 +13508,7 @@ struct bpf_link *bpf_map__attach_struct_ops_opts(co= nst struct bpf_map *map, } =20 link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + link_opts.flags =3D OPTS_GET(opts, flags, 0); =20 fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-170.mta1.migadu.com (out-170.mta1.migadu.com [95.215.58.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id EFE922EFDA4 for ; Fri, 23 Jan 2026 09:00:39 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.170 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158842; cv=none; b=bnEjwPCeX96+VuBOQ1R6WuUb/iPgNO3zetGDjI4LJRFB93po+E8nYEbVW+9u0melyGJuB6HBSO3w8ffOD7RKMzdFxAxANV+vMy3KRP1U79KUpju2smLLMMQ66H8FyNQX+j0sdCX+vu3ihBJ6xs51bQIWGeVigeU1c23ua2XTxng= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158842; c=relaxed/simple; bh=qW0cSGcqYpP6ESlgCJ0TVzGTDgVyXrW1i/CCakXzy/M=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=NxZl1FoIR39dHb28YRy/8WAn7Se2t2BGZml52iybKrQfwCM+m99YFq5cuV1ps6XJJrnBTCka/KXN2hcRmCHqE3QTZLG5kl0f8UtKAEeqIr9sr+TrOFFGBS6XJHo9pt/uN2SnjHsM7CAD8HYl93O5OHzhzqfrIee3ef1LenPCsz0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=oH9p83jf; arc=none smtp.client-ip=95.215.58.170 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="oH9p83jf" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158837; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=YmdpaV9B6tw25LP5uotyZ1lCEdDuASwhRWt8MPVMavA=; b=oH9p83jfG7qjP1gOLGi1H99wE8zFGPkm6ZDTBXb0xT5J94HC68j/e5uBeNFGsMFrcbGNfP E4/DvnXs75aP4rJeLvmGofelgf57Cq+HUL0vJHWwHyXbzAofrHRUeErHUYjbS6BEPiEZ/+ gVKTBwRZwnXgyPXcyMShc7eatNqV1Gk= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 08/12] mm: memcontrol: Add BPF struct_ops for memory controller Date: Fri, 23 Jan 2026 17:00:13 +0800 Message-ID: <863b91cb85097f137bc741e7cb686ff78d9bb95e.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Introduce BPF struct_ops support to the memory controller, enabling custom and dynamic control over memory pressure. This is achieved through a new struct_ops type, `memcg_bpf_ops`. This new interface allows a BPF program to implement hooks that influence a memory cgroup's behavior. The `memcg_bpf_ops` struct provides the following hooks: - `get_high_delay_ms`: Returns a custom throttling delay in milliseconds for a cgroup that has breached its `memory.high` limit. This is the primary mechanism for BPF-driven throttling. - `below_low`: Overrides the `memory.low` protection check. If this hook returns true, the cgroup is considered to be protected by its `memory.low` setting, regardless of its actual usage. - `below_min`: Similar to `below_low`, this overrides the `memory.min` protection check. - `handle_cgroup_online`/`offline`: Callbacks invoked when a cgroup with an attached program comes online or goes offline, allowing for state management. This patch integrates these hooks into the core memory control logic. The `get_high_delay_ms` value is incorporated into charge paths like `try_charge_memcg` and the high-limit handler `__mem_cgroup_handle_over_high`. 
The `below_low` and `below_min` hooks are checked within their respective protection functions. Lifecycle management is handled to ensure BPF programs are correctly inherited by child cgroups and cleaned up on detachment. SRCU is used to protect concurrent access to the `memcg->bpf_ops` pointer. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 106 ++++++++++++++- mm/bpf_memcontrol.c | 255 ++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 32 +++-- 3 files changed, 380 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f3b8c71870d8..d71e86b85ba7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,6 +181,37 @@ struct obj_cgroup { }; }; =20 +#ifdef CONFIG_BPF_SYSCALL +/** + * struct memcg_bpf_ops - BPF callbacks for memory cgroup operations + * @handle_cgroup_online: Called when a cgroup comes online + * @handle_cgroup_offline: Called when a cgroup goes offline + * @below_low: Override memory.low protection check. If this callback retu= rns + * true, mem_cgroup_below_low() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.low threshold comparison will proceed norma= lly. + * @below_min: Override memory.min protection check. If this callback retu= rns + * true, mem_cgroup_below_min() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.min threshold comparison will proceed norma= lly. + * @get_high_delay_ms: Return custom throttle delay in milliseconds + * + * This structure defines the interface for BPF programs to customize + * memory cgroup behavior through struct_ops programs. 
+ */ +struct memcg_bpf_ops { + void (*handle_cgroup_online)(struct mem_cgroup *memcg); + + void (*handle_cgroup_offline)(struct mem_cgroup *memcg); + + bool (*below_low)(struct mem_cgroup *memcg); + + bool (*below_min)(struct mem_cgroup *memcg); + + unsigned int (*get_high_delay_ms)(struct mem_cgroup *memcg); +}; +#endif /* CONFIG_BPF_SYSCALL */ + /* * The memory controller data structure. The memory controller controls bo= th * page cache and RSS per cgroup. We would eventually like to provide @@ -321,6 +352,10 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ =20 +#ifdef CONFIG_BPF_SYSCALL + struct memcg_bpf_ops *bpf_ops; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; =20 @@ -554,6 +589,66 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } =20 +#ifdef CONFIG_BPF_SYSCALL + +/* SRCU for protecting concurrent access to memcg->bpf_ops */ +extern struct srcu_struct memcg_bpf_srcu; + +/** + * BPF_MEMCG_CALL - Safely invoke a BPF memcg callback + * @memcg: The memory cgroup + * @op: The operation name (struct member) + * @default_val: Default return value if no BPF program attached + * + * This macro safely calls a BPF callback under SRCU protection. 
+ */ +#define BPF_MEMCG_CALL(memcg, op, default_val) ({ \ + typeof(default_val) __ret =3D (default_val); \ + struct memcg_bpf_ops *__ops; \ + int __idx; \ + \ + __idx =3D srcu_read_lock(&memcg_bpf_srcu); \ + __ops =3D READ_ONCE((memcg)->bpf_ops); \ + if (__ops && __ops->op) \ + __ret =3D __ops->op(memcg); \ + srcu_read_unlock(&memcg_bpf_srcu, __idx); \ + __ret; \ +}) + +static inline bool bpf_memcg_below_low(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_low, false); +} + +static inline bool bpf_memcg_below_min(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_min, false); +} + +static inline unsigned long bpf_memcg_get_high_delay(struct mem_cgroup *me= mcg) +{ + unsigned int ret; + + ret =3D BPF_MEMCG_CALL(memcg, get_high_delay_ms, 0U); + return msecs_to_jiffies(ret); +} + +#undef BPF_MEMCG_CALL + +extern void memcontrol_bpf_online(struct mem_cgroup *memcg); +extern void memcontrol_bpf_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ + +static inline unsigned long +bpf_memcg_get_high_delay(struct mem_cgroup *memcg) { return 0 } +static inline bpf_memcg_below_low(struct mem_cgroup *memcg) { return false= } +static inline bpf_memcg_below_min(struct mem_cgroup *memcg) { return false= } +static inline void memcontrol_bpf_online(struct mem_cgroup *memcg) { } +static inline void memcontrol_bpf_offline(struct mem_cgroup *memcg) { } + +#endif /* CONFIG_BPF_SYSCALL */ + static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, @@ -625,6 +720,9 @@ static inline bool mem_cgroup_below_low(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if (bpf_memcg_below_low(memcg)) + return true; + return READ_ONCE(memcg->memory.elow) >=3D page_counter_read(&memcg->memory); } @@ -635,6 +733,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if 
(bpf_memcg_below_min(memcg)) + return true; + return READ_ONCE(memcg->memory.emin) >=3D page_counter_read(&memcg->memory); } @@ -909,12 +1010,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lr= uvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } =20 -void __mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask, + unsigned long bpf_high_delay); =20 static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) - __mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask, 0); } =20 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index f95cd5d16f4c..3eae1af49519 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -8,6 +8,9 @@ #include #include =20 +/* Protects memcg->bpf_ops pointer for read and write. */ +DEFINE_SRCU(memcg_bpf_srcu); + __bpf_kfunc_start_defs(); =20 /** @@ -179,15 +182,263 @@ static const struct btf_kfunc_id_set bpf_memcontrol_= kfunc_set =3D { .set =3D &bpf_memcontrol_kfuncs, }; =20 +/** + * memcontrol_bpf_online - Inherit BPF programs for a new online cgroup. + * @memcg: The memory cgroup that is coming online. + * + * When a new memcg is brought online, it inherits the BPF programs + * attached to its parent. This ensures consistent BPF-based memory + * control policies throughout the cgroup hierarchy. + * + * After inheriting, if the BPF program has an online handler, it is + * invoked for the new memcg. + */ +void memcontrol_bpf_online(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + struct mem_cgroup *parent_memcg; + + /* The root cgroup does not inherit from a parent. */ + if (mem_cgroup_is_root(memcg)) + return; + + parent_memcg =3D parent_mem_cgroup(memcg); + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + /* Inherit the BPF program from the parent cgroup. 
*/ + ops =3D READ_ONCE(parent_memcg->bpf_ops); + if (!ops) + goto out; + + WRITE_ONCE(memcg->bpf_ops, ops); + + /* + * If the BPF program implements it, call the online handler to + * allow the program to perform setup tasks for the new cgroup. + */ + if (!ops->handle_cgroup_online) + goto out; + + ops->handle_cgroup_online(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +/** + * memcontrol_bpf_offline - Run BPF cleanup for an offline cgroup. + * @memcg: The memory cgroup that is going offline. + * + * If a BPF program is attached and implements an offline handler, + * it is invoked to perform cleanup tasks before the memcg goes + * completely offline. + */ +void memcontrol_bpf_offline(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + ops =3D READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->handle_cgroup_offline) + goto out; + + ops->handle_cgroup_offline(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return -EACCES; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_t= ype type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops =3D { + .get_func_proto =3D bpf_base_func_proto, + .btf_struct_access =3D memcg_ops_btf_struct_access, + .is_valid_access =3D memcg_ops_is_valid_access, +}; + +static void cfi_handle_cgroup_online(struct mem_cgroup *memcg) +{ +} + +static void cfi_handle_cgroup_offline(struct mem_cgroup *memcg) +{ +} + +static bool cfi_below_low(struct mem_cgroup *memcg) +{ + return false; +} + +static bool cfi_below_min(struct mem_cgroup *memcg) +{ + return false; +} + +static unsigned int cfi_get_high_delay_ms(struct mem_cgroup *memcg) +{ + return 0; +} + 
+static struct memcg_bpf_ops cfi_bpf_memcg_ops =3D { + .handle_cgroup_online =3D cfi_handle_cgroup_online, + .handle_cgroup_offline =3D cfi_handle_cgroup_offline, + .below_low =3D cfi_below_low, + .below_min =3D cfi_below_min, + .get_high_delay_ms =3D cfi_get_high_delay_ms, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff =3D __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_bpf_ops, handle_cgroup_online): + break; + case offsetof(struct memcg_bpf_ops, handle_cgroup_offline): + break; + case offsetof(struct memcg_bpf_ops, below_low): + break; + case offsetof(struct memcg_bpf_ops, below_min): + break; + case offsetof(struct memcg_bpf_ops, get_high_delay_ms): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +/** + * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy. + * @memcg: The root of the cgroup hierarchy to clean. + * @ops: The specific ops struct to detach. If NULL, detach any ops. + * + * Iterates through all descendant cgroups of @memcg (including itself) + * and clears their bpf_ops pointer. This is used when a BPF program + * is detached or if attachment fails midway. 
+ */ +static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, + struct memcg_bpf_ops *ops) +{ + struct mem_cgroup *iter =3D NULL; + + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (ops) { + if (!WARN_ON(READ_ONCE(memcg->bpf_ops) !=3D ops)) + WRITE_ONCE(memcg->bpf_ops, NULL); + } else + WRITE_ONCE(iter->bpf_ops, NULL); + } +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg, *iter =3D NULL; + int err =3D 0; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + return PTR_ERR(memcg); + + cgroup_lock(); + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops)) { + mem_cgroup_iter_break(memcg, iter); + err =3D -EBUSY; + break; + } + WRITE_ONCE(iter->bpf_ops, ops); + } + if (err) + clean_memcg_bpf_ops(memcg, NULL); + cgroup_unlock(); + + mem_cgroup_put(memcg); + return err; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto out; + + cgroup_lock(); + clean_memcg_bpf_ops(memcg, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + +out: + synchronize_srcu(&memcg_bpf_srcu); +} + +static struct bpf_struct_ops bpf_memcg_bpf_ops =3D { + .verifier_ops =3D &bpf_memcg_verifier_ops, + .init =3D bpf_memcg_ops_init, + .check_member =3D bpf_memcg_ops_check_member, + .init_member =3D bpf_memcg_ops_init_member, + .reg =3D bpf_memcg_ops_reg, + .unreg =3D bpf_memcg_ops_unreg, + .name =3D "memcg_bpf_ops", + .owner =3D THIS_MODULE, + .cfi_stubs =3D &cfi_bpf_memcg_ops, +}; + static int __init 
bpf_memcontrol_init(void) { - int err; + int err, err2; =20 err =3D register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if (err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); =20 - return err; + err2 =3D register_bpf_struct_ops(&bpf_memcg_bpf_ops, memcg_bpf_ops); + if (err) + pr_warn("error while registering memcontrol bpf ops: %d", err2); + + return err ? err : err2; } late_initcall(bpf_memcontrol_init); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f74fce27677..8d90575aa77d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2252,7 +2252,8 @@ static unsigned long calculate_high_delay(struct mem_= cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void __mem_cgroup_handle_over_high(gfp_t gfp_mask) +void +__mem_cgroup_handle_over_high(gfp_t gfp_mask, unsigned long bpf_high_delay) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2294,11 +2295,15 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. 
*/ - penalty_jiffies =3D calculate_high_delay(memcg, nr_pages, - mem_find_max_overage(memcg)); + if (nr_pages) { + penalty_jiffies =3D calculate_high_delay( + memcg, nr_pages, mem_find_max_overage(memcg)); =20 - penalty_jiffies +=3D calculate_high_delay(memcg, nr_pages, - swap_find_max_overage(memcg)); + penalty_jiffies +=3D calculate_high_delay( + memcg, nr_pages, swap_find_max_overage(memcg)); + } else + penalty_jiffies =3D 0; + penalty_jiffies =3D max(penalty_jiffies, bpf_high_delay); =20 /* * Clamp the max delay per usermode return so as to still keep the @@ -2356,6 +2361,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, bool raised_max_event =3D false; unsigned long pflags; bool allow_spinning =3D gfpflags_allow_spinning(gfp_mask); + struct mem_cgroup *orig_memcg; =20 retry: if (consume_stock(memcg, nr_pages)) @@ -2481,6 +2487,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); =20 + orig_memcg =3D memcg; /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here @@ -2530,10 +2537,14 @@ static int try_charge_memcg(struct mem_cgroup *memc= g, gfp_t gfp_mask, * kernel. If this is successful, the return path will see it * when it rechecks the overage and simply bail out. 
*/ - if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && - !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) - __mem_cgroup_handle_over_high(gfp_mask); + if (gfpflags_allow_blocking(gfp_mask)) { + unsigned long bpf_high_delay; + + bpf_high_delay =3D bpf_memcg_get_high_delay(orig_memcg); + if (bpf_high_delay || + current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH) + __mem_cgroup_handle_over_high(gfp_mask, bpf_high_delay); + } return 0; } =20 @@ -3906,6 +3917,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys= _state *css) */ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); =20 + memcontrol_bpf_online(memcg); + return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -3925,6 +3938,7 @@ static void mem_cgroup_css_offline(struct cgroup_subs= ys_state *css) =20 zswap_memcg_offline_cleanup(memcg); =20 + memcontrol_bpf_offline(memcg); memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-179.mta1.migadu.com (out-179.mta1.migadu.com [95.215.58.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB32E30F7E0 for ; Fri, 23 Jan 2026 09:00:51 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158854; cv=none; b=SW7JftG/wWnPNn0fjYaO53rpLSpjmMnhFGhRvL0kF38kErqsTIlk7/fn7HkB4i0fAPetzLHrCpNS0YNtTKGcTqrD8NON6+pb6U/FpZ3ZXuwOFWM7en91Am4wYos14g+6hTJPW3/CVX10+XoyiN+x4DjBiOKvwO1iPJOWW5lXGf4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158854; c=relaxed/simple; bh=G3/CchmEfAbQ3wS5rPAhxH6qSbCofSfrretztB+S6hY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=uV8TOE3Uf2nYd/MKNqVJGaZFS47RzKK01kv1X82SsGXLAVMVvUVCPDoegfRq0Cy7mCZzueNqH1qkqZJUD5AImP3E0p4+v7lpQz39JiTuIM28wQdvirfh4aFXjMDIYWPSok/SVb27evCnR8msQPrHx0jzLVz1pKyz59mWNNX6Jg8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=ZLi8EJfr; arc=none smtp.client-ip=95.215.58.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="ZLi8EJfr" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158849; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=3nAcR+voSiNUE364uzehfbuCNqbLCP6WtNqAUJso1wI=; b=ZLi8EJfrLfzlkJ0sO6oMlNdPYjwA4FvmylEjIjK7JVDNwCyv3zMzaAhS/nSUatezlgExjQ X1PZYExO2Ua9JOQtUM7Oabf7K03qS1ZT8bsS49bZZtwgWizy2PeYdbFYruZRT3usyNJnqq 9fnCuTgtoR7RX42ez7zGv4HVGTeB8ic= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 09/12] selftests/bpf: Add tests for memcg_bpf_ops Date: Fri, 23 Jan 2026 17:00:14 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a comprehensive selftest suite for the `memcg_bpf_ops` functionality. These tests validate that BPF programs can correctly influence memory cgroup throttling behavior by implementing the new hooks. The test suite is added in `prog_tests/memcg_ops.c` and covers several key scenarios: 1. `test_memcg_ops_over_high`: Verifies that a BPF program can trigger throttling on a low-priority cgroup by returning a delay from the `get_high_delay_ms` hook when a high-priority cgroup is under pressure. 2. `test_memcg_ops_below_low_over_high`: Tests the combination of the `below_low` and `get_high_delay_ms` hooks, ensuring they work together as expected. 3. `test_memcg_ops_below_min_over_high`: Validates the interaction between the `below_min` and `get_high_delay_ms` hooks. The test framework sets up a cgroup hierarchy with high and low priority groups, attaches BPF programs, runs memory-intensive workloads, and asserts that the observed throttling (measured by workload execution time) matches expectations. The BPF program (`progs/memcg_ops.c`) uses a tracepoint on `memcg:count_memcg_events` (specifically PGFAULT) to detect memory pressure and trigger the appropriate hooks in response. This test suite provides essential validation for the new memory control mechanisms. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + .../selftests/bpf/prog_tests/memcg_ops.c | 537 ++++++++++++++++++ tools/testing/selftests/bpf/progs/memcg_ops.c | 129 +++++ 3 files changed, 668 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/MAINTAINERS b/MAINTAINERS index 491d567f7dc8..7e07bb330eae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6471,6 +6471,8 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 000000000000..9a8d16296f2d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg_ops.skel.h" + +#define TRIGGER_THRESHOLD 1 +#define OVER_HIGH_MS 2000 +#define FILE_SIZE (64 * 1024 * 1024ul) +#define BUFFER_SIZE (4096) +#define CG_LIMIT (120 * 1024 * 1024ul) + +#define CG_DIR "/memcg_ops_test" +#define CG_HIGH_DIR CG_DIR "/high" +#define CG_LOW_DIR CG_DIR "/low" + +static int +setup_cgroup(int *high_cgroup_id, int *low_cgroup_fd, int *high_cgroup_fd) +{ + int ret; + char limit_buf[20]; + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + ret =3D 
create_and_get_cgroup(CG_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR)) + goto cleanup; + close(ret); + ret =3D enable_controllers(CG_DIR, "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + snprintf(limit_buf, 20, "%ld", CG_LIMIT); + ret =3D write_cgroup_file(CG_DIR, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file memory.max")) + goto cleanup; + ret =3D write_cgroup_file(CG_DIR, "memory.swap.max", "0"); + if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR)) + goto cleanup; + if (high_cgroup_fd) + *high_cgroup_fd =3D ret; + else + close(ret); + ret =3D (int)get_cgroup_id(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "get_cgroup_id")) + goto cleanup; + *high_cgroup_id =3D ret; + + ret =3D create_and_get_cgroup(CG_LOW_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR)) + goto cleanup; + if (low_cgroup_fd) + *low_cgroup_fd =3D ret; + else + close(ret); + + return 0; + +cleanup: + cleanup_cgroup_environment(); + return -1; +} + +int write_file(const char *filename) +{ + int ret =3D -1; + size_t written =3D 0; + char *buffer; + FILE *fp; + + fp =3D fopen(filename, "wb"); + if (!fp) + goto out; + + buffer =3D malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t to_write =3D (FILE_SIZE - written < BUFFER_SIZE) ? 
+ (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) !=3D to_write) + goto cleanup; + written +=3D to_write; + } + + ret =3D 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +int read_file(const char *filename, int iterations) +{ + int ret =3D -1; + long page_size =3D sysconf(_SC_PAGESIZE); + char *p; + char *map; + size_t i; + int fd; + struct stat sb; + + fd =3D open(filename, O_RDONLY); + if (fd =3D=3D -1) + goto out; + + if (fstat(fd, &sb) =3D=3D -1) + goto cleanup_fd; + + if (sb.st_size !=3D FILE_SIZE) { + fprintf(stderr, "File size mismatch: expected %ld, got %ld\n", + FILE_SIZE, sb.st_size); + goto cleanup_fd; + } + + map =3D mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + if (map =3D=3D MAP_FAILED) + goto cleanup_fd; + + for (int iter =3D 0; iter < iterations; iter++) { + for (i =3D 0; i < FILE_SIZE; i +=3D page_size) { + /* access a byte to trigger page fault */ + p =3D &map[i]; + __asm__ __volatile__("" : : "r"(p) : "memory"); + } + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d %d done\n", __func__, getpid(), iter); + } + + if (munmap(map, FILE_SIZE) =3D=3D -1) + goto cleanup_fd; + + ret =3D 0; + +cleanup_fd: + close(fd); +out: + return ret; +} + +static void +real_test_memcg_ops_child_work(const char *cgroup_path, + char *data_filename, + char *time_filename, + int read_times) +{ + struct timeval start, end; + double elapsed; + FILE *fp; + + if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup")) + goto out; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d begin\n", __func__, getpid()); + + gettimeofday(&start, NULL); + + if (!ASSERT_OK(write_file(data_filename), "write_file")) + goto out; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d write_file done\n", __func__, getpid()); + + if (!ASSERT_OK(read_file(data_filename, read_times), "read_file")) + goto out; + + gettimeofday(&end, NULL); + + elapsed =3D (end.tv_sec - start.tv_sec) + + 
(end.tv_usec - start.tv_usec) / 1000000.0; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d end %.6f\n", __func__, getpid(), elapsed); + + fp =3D fopen(time_filename, "w"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + fprintf(fp, "%.6f", elapsed); + fclose(fp); + +out: + exit(0); +} + +static int get_time(char *time_filename, double *time) +{ + int ret =3D -1; + FILE *fp; + char buf[64]; + + fp =3D fopen(time_filename, "r"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + + if (!ASSERT_OK_PTR(fgets(buf, sizeof(buf), fp), "fgets")) + goto cleanup; + + if (sscanf(buf, "%lf", time) < 0) { + PRINT_FAIL("sscanf %s", buf); + goto cleanup; + } + + ret =3D 0; +cleanup: + fclose(fp); +out: + return ret; +} + +static void real_test_memcg_ops(int read_times) +{ + int ret; + char data_file1[] =3D "/tmp/test_data_XXXXXX"; + char data_file2[] =3D "/tmp/test_data_XXXXXX"; + char time_file1[] =3D "/tmp/test_time_XXXXXX"; + char time_file2[] =3D "/tmp/test_time_XXXXXX"; + pid_t pid1, pid2; + double time1, time2; + + ret =3D mkstemp(data_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + return; + close(ret); + ret =3D mkstemp(data_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + ret =3D mkstemp(time_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file2; + close(ret); + ret =3D mkstemp(time_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_time_file1; + close(ret); + + pid1 =3D fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid1 =3D=3D 0) + real_test_memcg_ops_child_work(CG_LOW_DIR, + data_file1, + time_file1, + read_times); + + pid2 =3D fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid2 =3D=3D 0) + real_test_memcg_ops_child_work(CG_HIGH_DIR, + data_file2, + time_file2, + read_times); + + ret =3D waitpid(pid1, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + ret =3D waitpid(pid2, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + if 
(get_time(time_file1, &time1)) + goto cleanup; + + if (get_time(time_file2, &time2)) + goto cleanup; + + if (time1 < time2 || time1 - time2 <=3D 1) + PRINT_FAIL("low fast compare time1=3D%f, time2=3D%f", + time1, time2); + +cleanup: + unlink(time_file2); +cleanup_time_file1: + unlink(time_file1); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +} + +void test_memcg_ops_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link2 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, NULL); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if 
(!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + + opts.relative_fd =3D low_cgroup_fd; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(5); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + memcg_ops__detach(skel); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_low_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D true; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, 
"bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_min_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd 
=3D bpf_map__fd(map); + bss_sz =3D bpf_map__value_size(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D true; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/= selftests/bpf/progs/memcg_ops.c new file mode 100644 index 000000000000..44087a206a61 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/memcg_ops.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, &current_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + u64 current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + return
need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; --=20 2.43.0 From nobody Sat Feb 7 11:31:04 2026 Received: from out-183.mta1.migadu.com (out-183.mta1.migadu.com [95.215.58.183]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D240F296BA9 for ; Fri, 23 Jan 2026 09:01:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.183 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158865; cv=none; b=P/jSTSAWxN/dSAu0h8vNGe+3icZFfXM7HGicx1gU6odvOXbxbG9QBlOSjxT0U91HmeWQDXWZlpdo+1FYxA/BoUeYGvIV/1XK2iTUrGBrWc5pX/7FTpl/FqijT2L9M9ILv54+VS4yFI1R7MbKApGKeAY6sAOJrtY4OjZAlsHcitg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158865; c=relaxed/simple; bh=TokOtLPJOxzGUdwmcKHQxoHBN5+zFmo6iC5kHiyYT0Y=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JT41PSOB/4b4G++p1LcxsKd86urE4ITDk2a/XlTbGzKT3l2qhv4qQleNicVJJY2iEQUYtJyo9f9FyPj5uqyIAW1mhwAfUOqkTs3EBENf17IRHC9pYggQ1Qi2RYtvEc8B0Jyfc78CViTjfp7kWBCEIstCkpZJRawP3Z0+vwGNSD0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass 
(1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=s16CndXY; arc=none smtp.client-ip=95.215.58.183 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="s16CndXY" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158861; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=jSd5dZjazXtYLKnDww1Cz0qa7pIOMhPfOmKjHC0vcEI=; b=s16CndXYTk+3Ju3wcxjkAVQN+AwotBloPZ2BQeqwtwTfv+LXFsf84cvqam5EaIE7lkaEd8 ofv8AdNIzdVXwHkmNupiHKj8MoRrlfwN7Pew8/Tj2oC4qNitnirZER/RXQGEqLMwsS7W3y xC1vcB3c1zIqaov6OqWQX1lZ5ATaK0w= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 10/12] mm/bpf: Add 
BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops Date: Fri, 23 Jan 2026 17:00:15 +0800 Message-ID: <9f072e53f79ceaea43e3730476494517e453530a.1769157382.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To allow for more flexible attachment policies in nested cgroup hierarchies, this patch introduces support for the `BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`. When a `memcg_bpf_ops` is attached to a cgroup with this flag, it permits child cgroups to attach their own, different `memcg_bpf_ops`, overriding the parent's inherited program. Without this flag, attaching a BPF program to a cgroup that already has one (either directly or via inheritance) will fail. The implementation involves: - Adding a `bpf_ops_flags` field to `struct mem_cgroup`. - During registration (`bpf_memcg_ops_reg`), checking for existing programs and the `BPF_F_ALLOW_OVERRIDE` flag. - During unregistration (`bpf_memcg_ops_unreg`), correctly restoring the parent's BPF program to the cgroup hierarchy. - Ensuring flags are inherited by child cgroups during online events. This change enables complex, multi-level policy enforcement where different subtrees of the cgroup hierarchy can have distinct memory management BPF programs. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 1 + mm/bpf_memcontrol.c | 77 ++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d71e86b85ba7..a37b78d3853d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,6 +354,7 @@ struct mem_cgroup { =20 #ifdef CONFIG_BPF_SYSCALL struct memcg_bpf_ops *bpf_ops; + u32 bpf_ops_flags; #endif =20 struct mem_cgroup_per_node *nodeinfo[]; diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 3eae1af49519..d6126b94f521 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg) goto out; =20 WRITE_ONCE(memcg->bpf_ops, ops); + memcg->bpf_ops_flags =3D parent_memcg->bpf_ops_flags; =20 /* * If the BPF program implements it, call the online handler to @@ -340,29 +341,6 @@ static int bpf_memcg_ops_init_member(const struct btf_= type *t, return 0; } =20 -/** - * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy. - * @memcg: The root of the cgroup hierarchy to clean. - * @ops: The specific ops struct to detach. If NULL, detach any ops. - * - * Iterates through all descendant cgroups of @memcg (including itself) - * and clears their bpf_ops pointer. This is used when a BPF program - * is detached or if attachment fails midway. 
- */ -static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, - struct memcg_bpf_ops *ops) -{ - struct mem_cgroup *iter =3D NULL; - - while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { - if (ops) { - if (!WARN_ON(READ_ONCE(memcg->bpf_ops) !=3D ops)) - WRITE_ONCE(memcg->bpf_ops, NULL); - } else - WRITE_ONCE(iter->bpf_ops, NULL); - } -} - static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_struct_ops_link *ops_link @@ -371,21 +349,44 @@ static int bpf_memcg_ops_reg(void *kdata, struct bpf_= link *link) struct mem_cgroup *memcg, *iter =3D NULL; int err =3D 0; =20 + if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) { + pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n"); + return -EOPNOTSUPP; + } + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (IS_ERR_OR_NULL(memcg)) return PTR_ERR(memcg); =20 cgroup_lock(); + + if (READ_ONCE(memcg->bpf_ops)) { + /* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */ + if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) { + iter =3D parent_mem_cgroup(memcg); + + if (!iter) + goto busy_out; + if (READ_ONCE(iter->bpf_ops) !=3D + READ_ONCE(memcg->bpf_ops)) + goto busy_out; + } else { +busy_out: + err =3D -EBUSY; + goto unlock_out; + } + } + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { if (READ_ONCE(iter->bpf_ops)) { - mem_cgroup_iter_break(memcg, iter); - err =3D -EBUSY; - break; + /* cannot override existing bpf_ops of sub-cgroup. 
*/ + continue; } WRITE_ONCE(iter->bpf_ops, ops); + iter->bpf_ops_flags =3D ops_link->flags; } - if (err) - clean_memcg_bpf_ops(memcg, NULL); + +unlock_out: cgroup_unlock(); =20 mem_cgroup_put(memcg); @@ -399,13 +400,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct b= pf_link *link) =3D container_of(link, struct bpf_struct_ops_link, link); struct memcg_bpf_ops *ops =3D kdata; struct mem_cgroup *memcg; + struct mem_cgroup *iter; + struct memcg_bpf_ops *parent_bpf_ops =3D NULL; + u32 parent_bpf_ops_flags =3D 0; =20 memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (IS_ERR_OR_NULL(memcg)) goto out; =20 cgroup_lock(); - clean_memcg_bpf_ops(memcg, ops); + + /* Get the parent bpf_ops and bpf_ops_flags */ + iter =3D parent_mem_cgroup(memcg); + if (iter) { + parent_bpf_ops =3D READ_ONCE(iter->bpf_ops); + parent_bpf_ops_flags =3D iter->bpf_ops_flags; + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops) =3D=3D ops) { + WRITE_ONCE(iter->bpf_ops, parent_bpf_ops); + iter->bpf_ops_flags =3D parent_bpf_ops_flags; + } + } + cgroup_unlock(); =20 mem_cgroup_put(memcg); --=20 2.43.0 From nobody Sat Feb 7 11:31:05 2026 Received: from out-172.mta1.migadu.com (out-172.mta1.migadu.com [95.215.58.172]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4193324B1D for ; Fri, 23 Jan 2026 09:02:42 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158964; cv=none; b=DISuTmkqXj8nTGP3lrzN6O7ERE0ARQtv+/mqMKqsMmMXN9LDC5vGobK27VyL/AvKbelDSKV6K5NGTpxyMvVqxuUUDw2v9bXYTFHi1O/ss7CFg1e3/0o9WauxA9FYTh24LQmjPLHkf71wOJDS2N6TvTNy1iNj0lYvQRpcmJnmRhk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158964; c=relaxed/simple; 
bh=cHHyeHa84VKYKc+fClfF1pIYHBp7EHYVUi+qkwVe0cE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=SI7jkSG0YgnpGTGnTEaLc585dwHe1jaJhSbs9+AwRqWooZ9ELsURsvFBGlFfCuT7iPoWDUA61SXuOst8MWu4wLxblCUKAYuDrdOYnN8gB1pYgpYku4sr64+ES5eyyTrEYk3CHDrop5iiwvEIxAfTNuvnfjyBDeOOQl+CZbVCBfE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=wJHq11Pr; arc=none smtp.client-ip=95.215.58.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="wJHq11Pr" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158950; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=3iLrjiv0gburAFEhS5XqHyT8faYM4NGG2Wz0oK9Qdf4=; b=wJHq11Pr0GtVVpulWuS2H7wQCNNdItz0NY2nkgb6vSa3RZXwQxkbhqsUeYiPsYfKmvchNj 8rb/XPQ/7gZHndK/RYsb4oM88oS1AN7JHx/F/2CE0WDV3dM+lYhsd1TvOWQaVrrDDvXsqB KOmnqEtcE6XYQWOHeU9x//WaEErpY50= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , 
Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 11/12] selftests/bpf: Add test for memcg_bpf_ops hierarchies Date: Fri, 23 Jan 2026 17:01:54 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a new selftest, `test_memcg_ops_hierarchies`, to validate the behavior of attaching `memcg_bpf_ops` in a nested cgroup hierarchy, specifically testing the `BPF_F_ALLOW_OVERRIDE` flag. The test case performs the following steps: 1. Creates a three-level deep cgroup hierarchy: `/cg`, `/cg/cg`, and `/cg/cg/cg`. 2. Attaches a BPF struct_ops to the top-level cgroup (`/cg`) with the `BPF_F_ALLOW_OVERRIDE` flag. 3. Successfully attaches a new struct_ops to the middle cgroup (`/cg/cg`) without the flag, overriding the inherited one. 4. Asserts that attaching another struct_ops to the deepest cgroup (`/cg/cg/cg`) fails with -EBUSY, because its parent did not specify `BPF_F_ALLOW_OVERRIDE`. This test ensures that the attachment logic correctly enforces the override rules across a cgroup subtree. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- .../selftests/bpf/prog_tests/memcg_ops.c | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c index 9a8d16296f2d..bab9f3094f59 100644 --- a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -535,3 +535,72 @@ void test_memcg_ops_below_min_over_high(void) close(low_cgroup_fd); cleanup_cgroup_environment(); } + +void test_memcg_ops_hierarchies(void) +{ + int ret, first =3D -1, second =3D -1, third =3D -1; + struct memcg_ops *skel; + struct bpf_map *map; + struct bpf_link *link1 =3D NULL, *link2 =3D NULL, *link3 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + first =3D create_and_get_cgroup("/cg"); + if (!ASSERT_GE(first, 0, "create_and_get_cgroup /cg")) + goto cleanup; + ret =3D enable_controllers("/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + second =3D create_and_get_cgroup("/cg/cg"); + if (!ASSERT_GE(second, 0, "create_and_get_cgroup /cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + third =3D create_and_get_cgroup("/cg/cg/cg"); + if (!ASSERT_GE(third, 0, "create_and_get_cgroup /cg/cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto cleanup; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto cleanup; + + opts.relative_fd =3D first; + opts.flags =3D BPF_F_ALLOW_OVERRIDE; + link1 =3D 
bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link1, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D second; + opts.flags =3D 0; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D third; + opts.flags =3D 0; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_ERR_PTR(link3, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + memcg_ops__detach(skel); + close(first); + close(second); + close(third); + cleanup_cgroup_environment(); +} --=20 2.43.0 From nobody Sat Feb 7 11:31:05 2026 Received: from out-187.mta1.migadu.com (out-187.mta1.migadu.com [95.215.58.187]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7DF66318EC0 for ; Fri, 23 Jan 2026 09:02:53 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.187 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158976; cv=none; b=a6IRguRTVwC+AewWt9PZv4JqXxN37URdh2dQxGS+M0T6Y+nJ9W/m3MNeu91Dk9iAR7j/CWXH+AHMOBFi0sbOYDt0aEtExYegN+EU6jqOL11If4jzPH529xnBG+SKr9Nn0/6hddId6UimLXw/sgCjfMxx+XKBu20erwFZYdkcS5U= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769158976; c=relaxed/simple; bh=fGvW/hCKB6PaS/tBEchyX1uCFbPBbjV7sVdBWPNCHBw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=uPaOWH764KLdzwd9UsqSU2ra/vMXW+W8V/6W9eGQljLGA2FSX38o73iedLRqEp+YEoYqHjddELisgmOLYnCoVst37RQi/hTrfPUDhdLUSI2fDrzXWFva/TkQjUuydlmdi7XcDL6OjZWS2/HEwYyMqdh+30Io30zEOGgwX274gEw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev 
header.i=@linux.dev header.b=lPytnziE; arc=none smtp.client-ip=95.215.58.187 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="lPytnziE" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769158971; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=CVJK2posKcAxYsFSIvqHPLbeQQmW9K5Q2TVuhlRBOeQ=; b=lPytnziE/5rsaoUg4eVwJ/vMFOUBPW1i1YDXPbqOM0nmwFnY9FLl60SUPM8dfgIufdg0ay QoukGi+cUPmUfYk6cz12HRpNGOtqqZp+5KY6S01JbU9oMPEgxPYl/8SHnd6YjVI7bd2ysP S7nV+ZA/GBMhA8+w1nDwHKX37Qa0SiM= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v3 12/12] samples/bpf: Add memcg priority control example Date: Fri, 23 
Jan 2026 17:01:55 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a sample program to demonstrate a practical use case for the `memcg_bpf_ops` feature: priority-based memory throttling. The sample consists of a BPF program and a userspace loader: 1. memcg.bpf.c: A BPF program that monitors PGFAULT events on a high-priority cgroup. When activity exceeds a threshold, it uses the `get_high_delay_ms`, `below_low`, or `below_min` hooks to apply pressure on a low-priority cgroup. 2. memcg.c: A userspace loader that configures and attaches the BPF program. It takes command-line arguments for the high and low priority cgroup paths, a pressure threshold, and the desired throttling delay (`over_high_ms`). This provides a clear, working example of how to implement a dynamic, priority-aware memory management policy. A user can create two cgroups, run workloads of different priorities, and observe the low-priority workload being throttled to protect the high-priority one. 
Example usage: # ./memcg --low_path /sys/fs/cgroup/low \ # --high_path /sys/fs/cgroup/high \ # --threshold 100 --over_high_ms 1024 Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 9 +- samples/bpf/memcg.bpf.c | 129 ++++++++++++++++ samples/bpf/memcg.c | 327 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 467 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/memcg.bpf.c create mode 100644 samples/bpf/memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 7e07bb330eae..819ef271e011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6470,6 +6470,8 @@ F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c +F: samples/bpf/memcg.bpf.c +F: samples/bpf/memcg.c F: samples/cgroup/* F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c F: tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0002cd359fb1..0de6569cdefd 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -49,3 +49,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +memcg diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 95a4fa1f1e44..6416c8aa3034 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y +=3D xdp_fwd tprogs-y +=3D task_fd_query tprogs-y +=3D ibumad tprogs-y +=3D hbm +tprogs-y +=3D memcg =20 # Libbpf dependencies LIBBPF_SRC =3D $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y +=3D task_fd_query_kern.o always-y +=3D ibumad_kern.o always-y +=3D hbm_out_kern.o always-y +=3D hbm_edt_kern.o +always-y +=3D memcg.bpf.o =20 COMMON_CFLAGS =3D $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS =3D $(TPROGS_USER_LDFLAGS) @@ -289,6 +291,8 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h =20 +memcg: $(obj)/memcg.skel.h + # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # 
TPROGS_CFLAGS causes conflicts XDP_SAMPLE_CFLAGS +=3D -Wall -O2 \ @@ -347,11 +351,13 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src= )/xdp_sample.bpf.h $(src)/x -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ =20 -LINKED_SKELS :=3D xdp_router_ipv4.skel.h +LINKED_SKELS :=3D xdp_router_ipv4.skel.h memcg.skel.h clean-files +=3D $(LINKED_SKELS) =20 xdp_router_ipv4.skel.h-deps :=3D xdp_router_ipv4.bpf.o xdp_sample.bpf.o =20 +memcg.skel.h-deps :=3D memcg.bpf.o + LINKED_BPF_SRCS :=3D $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SK= ELS),$($(skel)-deps))) =20 BPF_SRCS_LINKED :=3D $(notdir $(wildcard $(src)/*.bpf.c)) @@ -360,6 +366,7 @@ BPF_SKELS_LINKED :=3D $(addprefix $(obj)/,$(LINKED_SKEL= S)) =20 $(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL) @echo " BPF GEN-OBJ " $(@:.skel.h=3D) + echo $(Q)$(BPFTOOL) gen object $(@:.skel.h=3D.lbpf.o) $(addprefix $(obj)/= ,$($(@F)-deps)) $(Q)$(BPFTOOL) gen object $(@:.skel.h=3D.lbpf.o) $(addprefix $(obj)/,$($(= @F)-deps)) @echo " BPF GEN-SKEL" $(@:.skel.h=3D) $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=3D.lbpf.o) name $(notdir $(@:.ske= l.h=3D)) > $@ diff --git a/samples/bpf/memcg.bpf.c b/samples/bpf/memcg.bpf.c new file mode 100644 index 000000000000..44087a206a61 --- /dev/null +++ b/samples/bpf/memcg.bpf.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + 
+SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + u64 current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + return need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; 
diff --git a/samples/bpf/memcg.c b/samples/bpf/memcg.c new file mode 100644 index 000000000000..85432cb01c27 --- /dev/null +++ b/samples/bpf/memcg.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __MEMCG_RSTAT_SIMPLE_BPF_SKEL_H__ +#define u64 uint64_t +#endif + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg.skel.h" + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting =3D true; +} + +static void usage(char *name) +{ + fprintf(stderr, + "Usage: %s --low_path=3D --high_path=3D \\\n" + " --threshold=3D [OPTIONS]\n\n", + name); + fprintf(stderr, "Required arguments:\n"); + fprintf(stderr, + " -l, --low_path=3DPATH Low priority memcgroup path\n"); + fprintf(stderr, + " -g, --high_path=3DPATH High priority memcgroup path\n"); + fprintf(stderr, + " -t, --threshold=3DVALUE The sum of 'val' PGSCAN of\n"); + fprintf(stderr, + " high priority memcgroup in\n"); + fprintf(stderr, + " 1 sec to trigger low priority\n"); + fprintf(stderr, + " cgroup over_high\n\n"); + fprintf(stderr, "Optional arguments:\n"); + fprintf(stderr, " -o, --over_high_ms=3DVALUE\n"); + fprintf(stderr, + " Low_path over_high_ms value\n"); + fprintf(stderr, + " (default: 0)\n"); + fprintf(stderr, " -L, --use_below_low Enable use_below_low flag\n"); + fprintf(stderr, " -M, --use_below_min Enable use_below_min flag\n"); + fprintf(stderr, + " -O, --allow_override Enable BPF_F_ALLOW_OVERRIDE\n"); + fprintf(stderr, + " flag\n"); + fprintf(stderr, " -h, --help Show this help message\n\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " # Using long options:\n"); + fprintf(stderr, " %s --low_path=3D/sys/fs/cgroup/low \\\n", name); + fprintf(stderr, " --high_path=3D/sys/fs/cgroup/high \\\n"); + 
fprintf(stderr, " --threshold=3D1000 --over_high_ms=3D500 \\\n" + " --use_below_low\n\n"); + fprintf(stderr, " # Using short options:\n"); + fprintf(stderr, " %s -l /sys/fs/cgroup/low \\\n" + " -g /sys/fs/cgroup/high \\\n", + name); + fprintf(stderr, " -t 1000 -o 500 -L -M\n"); +} + +static uint64_t get_cgroup_id(const char *cgroup_path) +{ + struct stat st; + + if (cgroup_path =3D=3D NULL) { + fprintf(stderr, "Error: cgroup_path is NULL\n"); + return 0; + } + + if (stat(cgroup_path, &st) < 0) { + fprintf(stderr, "Error: stat(%s) failed: %d\n", + cgroup_path, errno); + return 0; + } + + return (uint64_t)st.st_ino; +} + +int main(int argc, char **argv) +{ + int low_cgroup_fd =3D -1, high_cgroup_fd =3D -1; + uint64_t threshold =3D 0, high_cgroup_id; + unsigned int over_high_ms =3D 0; + bool use_below_low =3D false, use_below_min =3D false; + __u32 opts_flags =3D 0; + const char *low_path =3D NULL; + const char *high_path =3D NULL; + const char *bpf_obj_file =3D "memcg.bpf.o"; + struct bpf_object *obj =3D NULL; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_low =3D NULL, *link_high =3D NULL; + struct bpf_map *map; + struct memcg__bss *bss_data; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int err =3D -EINVAL; + int map_fd; + int opt; + int option_index =3D 0; + + static struct option long_options[] =3D { + {"low_path", required_argument, 0, 'l'}, + {"high_path", required_argument, 0, 'g'}, + {"threshold", required_argument, 0, 't'}, + {"over_high_ms", required_argument, 0, 'o'}, + {"use_below_low", no_argument, 0, 'L'}, + {"use_below_min", no_argument, 0, 'M'}, + {"allow_override", no_argument, 0, 'O'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0 } + }; + + while ((opt =3D getopt_long(argc, argv, "l:g:t:o:LMOh", + long_options, &option_index)) !=3D -1) { + switch (opt) { + case 'l': + low_path =3D optarg; + break; + case 'g': + high_path =3D optarg; + break; + case 't': + threshold =3D strtoull(optarg, NULL, 10); + break; + case 
'o': + over_high_ms =3D strtoull(optarg, NULL, 10); + break; + case 'L': + use_below_low =3D true; + break; + case 'M': + use_below_min =3D true; + break; + case 'O': + opts_flags =3D BPF_F_ALLOW_OVERRIDE; + break; + case 'h': + usage(argv[0]); + return 0; + default: + usage(argv[0]); + return -EINVAL; + } + } + + if (!low_path || !high_path || !threshold) { + fprintf(stderr, + "ERROR: Missing required arguments\n\n"); + usage(argv[0]); + goto out; + } + + low_cgroup_fd =3D open(low_path, O_RDONLY); + if (low_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open low cgroup '%s' failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + + high_cgroup_id =3D get_cgroup_id(high_path); + if (!high_cgroup_id) + goto out; + high_cgroup_fd =3D open(high_path, O_RDONLY); + if (high_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open high cgroup '%s' failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + + obj =3D bpf_object__open_file(bpf_obj_file, NULL); + err =3D libbpf_get_error(obj); + if (err) { + fprintf(stderr, + "ERROR: opening BPF object file '%s' failed: %d\n", + bpf_obj_file, err); + goto out; + } + + map =3D bpf_object__find_map_by_name(obj, ".bss"); + if (!map) { + fprintf(stderr, "ERROR: Failed to find .data map\n"); + err =3D -ESRCH; + goto out; + } + + err =3D bpf_object__load(obj); + if (err) { + fprintf(stderr, + "ERROR: loading BPF object file failed: %d\n", + err); + goto out; + } + + map_fd =3D bpf_map__fd(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (bss_data) { + __u32 key =3D 0; + + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D threshold; + bss_data->local_config.over_high_ms =3D over_high_ms; + bss_data->local_config.use_below_low =3D use_below_low; + bss_data->local_config.use_below_min =3D use_below_min; + + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (err) { + 
fprintf(stderr, + "ERROR: update config failed: %d\n", + err); + goto out; + } + } else { + fprintf(stderr, + "ERROR: allocate memory failed\n"); + err =3D -ENOMEM; + goto out; + } + + prog =3D bpf_object__find_program_by_name(obj, + "handle_count_memcg_events"); + if (!prog) { + fprintf(stderr, + "ERROR: finding a prog in BPF object file failed\n"); + goto out; + } + + link =3D bpf_program__attach(prog); + err =3D libbpf_get_error(link); + if (err) { + fprintf(stderr, + "ERROR: bpf_program__attach failed: %d\n", + err); + goto out; + } + + if (over_high_ms) { + map =3D bpf_object__find_map_by_name(obj, "low_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find low_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D low_cgroup_fd, + ); + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_low) { + fprintf(stderr, + "Failed to attach struct ops low_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + if (use_below_low || use_below_min) { + map =3D bpf_object__find_map_by_name(obj, "high_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find high_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D high_cgroup_fd, + ); + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_low) { + fprintf(stderr, + "Failed to attach struct ops high_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + printf("Successfully attached!\n"); + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + while (!exiting) + pause(); + + printf("Exiting...\n"); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_low); + bpf_link__destroy(link_high); + bpf_object__close(obj); + close(low_cgroup_fd); + close(high_cgroup_fd); + return err; +} --=20 2.43.0