From nobody Sat Feb 7 18:15:48 2026 Received: from out-188.mta1.migadu.com (out-188.mta1.migadu.com [95.215.58.188]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7885232C932 for ; Tue, 27 Jan 2026 09:43:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.188 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507015; cv=none; b=AeE+1CLmPmhZOa24hYIexZYBqY7t/bzAiAAlEWVl9dnAcETyD30mTF4kn5GHryMJiPORAloYlJE46mgmZUeOwoGb4gWb5CsCJxREC1e/yhJvlQhk1pWZiKpoXQsd/ZSTrIap7UspoftTRceuA011Y75NtZsq2j28bNk3f9GeKsI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507015; c=relaxed/simple; bh=ICuHoMZgwyeQHMZZvgMtyo6q66wG5W6MOEIDq951aKE=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=mk93ME1SS+TNsTkrkysqk2e4/Nve+GX7FshwDeuqVmCqvAaNMK37ZbEX9w8/6zHIYIkniKffOHb+aIZPDtbTHj4hTOcYRssy3IUaDdgmjwVcrQ6Kw2KpDMm1fAW79bjvDWI513IjhDgEDk7APj8ttawZt5YCPXLUzNILs9Y2Roc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=vUqg2TQG; arc=none smtp.client-ip=95.215.58.188 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="vUqg2TQG" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507010; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=RkVeShES4B2VgCiXTgzhTlETaaAj+olyI2gU3ClsRew=; b=vUqg2TQGgleAvGPxg6XpPWj0hk+VX3LBsQJb5GxaEMooMNP/ieKchFW14UMkjxRPh0Q6a+ r1cWOTF9Zw4grH9/KRuvlSn+G1E4enTMa1Zz7X/x7mCHk+mujGKmLYodR2iDlRzqHsXUHV e2hnU95tejeSh9IVMOfF/K+2v8LQ8k0= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v5 01/12] bpf: move bpf_struct_ops_link into bpf.h Date: Tue, 27 Jan 2026 17:42:38 +0800 Message-ID: <3a6694566eedbf17f84dbe5ffafe9aa0aa32108c.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Move struct bpf_struct_ops_link's definition into bpf.h, where other custom bpf links definitions are. 
It's necessary to access its members from outside of generic bpf_struct_ops implementation, which will be done by following patches in the series. Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 6 ++++++ kernel/bpf/bpf_struct_ops.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..899dd911dc82 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1891,6 +1891,12 @@ struct bpf_raw_tp_link { u64 cookie; }; =20 +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; + wait_queue_head_t wait_hup; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c43346cb3d76..de01cf3025b3 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -55,12 +55,6 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; =20 -struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; - wait_queue_head_t wait_hup; -}; - static DEFINE_MUTEX(update_mutex); =20 #define VALUE_PREFIX "bpf_struct_ops_" --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-178.mta1.migadu.com (out-178.mta1.migadu.com [95.215.58.178]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id EF0E0334C08 for ; Tue, 27 Jan 2026 09:43:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.178 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507027; cv=none; b=QqKqjIkrRamxo+FRdTtoFMd3uhlfHfAVt9rqyuNErbbjYh+JkVCTM+00oyErQrIwZUf/7x2zpG2L+c0HqA5mlq+CelwTeYj1jbNyjSrfslbkVPUudHZzwNi9FWloeAc2UquZVAIC1V3uHPPGnPIPscmzB7ggV7Yujxrn8RkfOTs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507027; c=relaxed/simple; 
bh=1QNC2bfHfbW9SQUgfvnKGdBr0YOvFpgcnrQNCW/0a+Y=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Rl6r9oKmLLqZrRkS4jssdOCUKi+Q8TJ0i1QgQetT13nWp2kvNxQFAKCShTZkq4uXw0HiaUxpL41mZa4MQdEU/89nK946eRgC044JPydiFRQw0T653mXfotaBZkuiq754bWyhAShvAchlRuSPl4f72k2rOsbs+vO1tgmwfWv3YIE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=YQXTc1qw; arc=none smtp.client-ip=95.215.58.178 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="YQXTc1qw" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507022; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ycrFyw6aXaYmQPmQruEJsfLSWjPHoV8ar+xnlNHnd9w=; b=YQXTc1qwSK8YUcIlgs8w3Xp+DkzqZVrK9w7JA7p4wJtHvmD4cMt1T5d62d/hZOzzrTSVBH tUdc78LPAuaxmK47I0TuUtkE7/SOXlNqNq3qwXbzCw/sb/tzsdCY85bnStZpiMVXlLQBKB KxpyEQ0JelY7vPIP2LXhZyj+susJMGg= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro 
Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v5 02/12] bpf: initial support for attaching struct ops to cgroups Date: Tue, 27 Jan 2026 17:42:39 +0800 Message-ID: <1c5845208d235e5deb37807f3be93af325033ba5.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin When a struct ops is being attached and a bpf link is created, allow a cgroup fd to be passed via bpf attr, so that the struct ops can be attached to a cgroup instead of globally. An attached struct ops doesn't hold a reference to the cgroup; it only preserves the cgroup id. 
Signed-off-by: Roman Gushchin --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 899dd911dc82..720055d1dbce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1895,6 +1895,7 @@ struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; wait_queue_head_t wait_hup; + u64 cgroup_id; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index de01cf3025b3..c807793e7633 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,6 +13,7 @@ #include #include #include +#include =20 struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -1377,6 +1378,20 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_= lops, NULL, attr->link_create.attach_type); +#ifdef CONFIG_CGROUPS + if (attr->link_create.cgroup.relative_fd) { + struct cgroup *cgrp; + + cgrp =3D cgroup_get_from_fd(attr->link_create.cgroup.relative_fd); + if (IS_ERR(cgrp)) { + err =3D PTR_ERR(cgrp); + goto err_out; + } + + link->cgroup_id =3D cgroup_id(cgrp); + cgroup_put(cgrp); + } +#endif /* CONFIG_CGROUPS */ =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-172.mta1.migadu.com (out-172.mta1.migadu.com [95.215.58.172]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5DE9F335067 for ; Tue, 27 Jan 2026 09:43:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507037; cv=none; 
b=lFmE4fL5cKq5fs3ZLg6q2mzOzlyXWcgjIp8mB6ekr585Ha7TdAWADU8pOpL3XxWuGreW9oF1sQzRES90POfTKGEbWugX8BnwhaXNApSlhzz3MeHBHU9+x/KSrNuyjt5qAxd6lTzVR20iz9M1wkytPHcpKPyP6jNoumXQ0jTJfp4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507037; c=relaxed/simple; bh=N8uerXNrtYXAXsXSofx5ACpCyBePZpfRxoCGMF6fYG0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=X5JaueezEhbq9jVvlp/R9WVPHDhF9KLZ24HKPTCOanrs+KG2WrUcWfGXGx6bclI0ty31UORi8PWlgOF1HQBCKakO6+EfVCiL1A+Kdh1jfb81s7flXWcqCtgfOA8vfYOmcgsI3usHM29KycsVasZ1pMKl/c6yZqf94R1Y+dJOgvs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=V6EVuU79; arc=none smtp.client-ip=95.215.58.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="V6EVuU79" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507034; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=R9sBuFUW725uaUgAoHBNUzbcXyyILwwKmBSUQhglIbE=; b=V6EVuU79vNgGvQDc4EYR6qMx3zOUY5408Nxax8gbmUwfgPseSVeEcTfzDsDZA+SxLgBG6J ycTWpG/K3bX4xhq8fUTlLKDdUJxgXGL6eIAo8yRqRAlHFGOXBPt92DhC+LCgp+5XL20gxh 89thHaGUOr0HUEDMOXYU5iX25cLsrI8= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Kumar Kartikeya Dwivedi Subject: [RFC PATCH bpf-next v5 03/12] bpf: mark struct oom_control's memcg field as TRUSTED_OR_NULL Date: Tue, 27 Jan 2026 17:42:40 +0800 Message-ID: <8c6d6d4751f1ad12582d3d9eabd549d5fd8925b0.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Struct oom_control is used to describe the OOM context. 
Its memcg field defines the scope of OOM: it's NULL for global OOMs and a valid memcg pointer for memcg-scoped OOMs. Teach bpf verifier to recognize it as a trusted or NULL pointer. It will provide the bpf OOM handler a trusted memcg pointer, which for example is required for iterating the memcg's subtree. Signed-off-by: Roman Gushchin Acked-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2f2650db9fd..cca36edb460d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7242,6 +7242,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)= { struct file *vm_file; }; =20 +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control) { + struct mem_cgroup *memcg; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7284,6 +7288,7 @@ static bool type_is_trusted_or_null(struct bpf_verifi= er_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control)); =20 return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-186.mta1.migadu.com (out-186.mta1.migadu.com [95.215.58.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 755152DC765 for ; Tue, 27 Jan 2026 09:44:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507048; cv=none; 
b=lyvIWLTQ/K+sT28YZryKSbqYaCk+VSIi+dfz0C2/rgSZum0OVX91dkm0EIEiXxo2unciRgAgRAulU2rI07oZJebTG4xxcqVk0MxYqYnC/mLQXjo9dKhAxu15X/nhp2NW8yEJbPYdBxZbuVc0yIhD7rfSMc8GECer3AKCaoINPdU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507048; c=relaxed/simple; bh=2k9LiI+M//qM4q+K2IyZ0CNDJt0ZnwZ2pxw6XNWmxxQ=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=VItv+Th9xvk7B17lNd3W8/Bpi5cEj7ClzIPkn4ymoZXzOEdZTpecKDMF+WMm7LKYPa79fgJXrp2OPLRR1Ce9gRUY0rwbyiptQePcCQUU/cSIltErk8lAYTgCxFe5WU5jK0i/ZwPUmizdfKAOd3rYqkOfYa3rr7AhzF5Z2hKSMtw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=pCggPyhq; arc=none smtp.client-ip=95.215.58.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="pCggPyhq" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507045; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=MZDHd5qG1jM6uT5/JnPwA06VDLOpkfHtWx6qC88p08g=; b=pCggPyhqpd99immxb6zujzAvZczggMve7iRzC9xJBttxhuOfxADBoYXpUlm00tpzzWQwP6 wYQTe7O9rSNR0Q22G5GpJOnPbsyFLUYrNH3wmnLmIBORZ/zKW4lTXISzBudp9cQCKnmp6n 7IosG+lkEdDwR7eRHrhJH/2v03dH0Vo= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v5 04/12] mm: define mem_cgroup_get_from_ino() outside of CONFIG_SHRINKER_DEBUG Date: Tue, 27 Jan 2026 17:42:41 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin mem_cgroup_get_from_ino() can be reused by the BPF OOM implementation, but currently depends on CONFIG_SHRINKER_DEBUG. Remove this dependency. 
Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 229ac9835adb..f3b8c71870d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -833,9 +833,9 @@ static inline unsigned long mem_cgroup_ino(struct mem_c= group *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } +#endif =20 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1298,12 +1298,12 @@ static inline unsigned long mem_cgroup_ino(struct m= em_cgroup *memcg) { return 0; } +#endif =20 static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif =20 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3808845bc8cc..1f74fce27677 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3658,7 +3658,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short = id) return xa_load(&mem_cgroup_ids, id); } =20 -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3679,7 +3678,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned l= ong ino) =20 return memcg; } -#endif =20 static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-187.mta1.migadu.com (out-187.mta1.migadu.com [95.215.58.187]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 55B6E32BF43 for ; Tue, 27 Jan 2026 09:44:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.187 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1769507060; cv=none; b=llbPIGAx8ppj25jwDXlLP0tXk+T7tgM6/xm+5JRC3p9HMMGo+12gQ0qTelWobcLMFuwgT1wdcGc1Ih5hwBlrVx7xeQklwiInRgNRPIba/fehZd+mr9ep5m69BxG3r3U6pBat9y7PiFdlFTSG1NHfNFQY0nnmKgohdZImhKg5M58= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507060; c=relaxed/simple; bh=OGLI1S2l0PmNr2S4bsYnKqfqSXB+2A+DsfQrV5CIDWU=; h=From:To:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=bzvkkckQwmFzOZ5SUZRJiyEu1knblyd4X+CYK/lorFTqwN+4W/Qtkp8LmD4yw/Hv8erN04PWkDXXY0lX6tQf2kwVNW5W29zj5P8RefscmTs2Wk5UaFKP54Hn4EsUVZ9CCqXcz3z4HefgVjWr2qIXgyvxsuswn7UjS/tTLvbssfM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=czTeJJpy; arc=none smtp.client-ip=95.215.58.187 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="czTeJJpy" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507056; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=FOthmRQx1S9k8S+lynuRz8PEGmhPgopf4vkxZNGup6E=; b=czTeJJpyv2okdcCiyzy152EVyslqIqInTXlSCpdBL+Ji64e4WZostd/vUnqvbh34LRNCGw yQwVnul1ze0+FrDsiegO0TmgTEDAMS8+i05KcRILZlHoRpR7NbsTo+61aPQwJ0OMIQbKZk Efh/gHG+N5wYHpYLmwdCCpSA3ukuqA0= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH bpf-next v5 05/12] libbpf: introduce bpf_map__attach_struct_ops_opts() Date: Tue, 27 Jan 2026 17:42:42 +0800 Message-ID: <635923ceadf1899672e4f7727ddc52554c11a3ac.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Roman Gushchin Introduce bpf_map__attach_struct_ops_opts(), an extended version of bpf_map__attach_struct_ops(), which takes additional 
struct bpf_struct_ops_opts argument. struct bpf_struct_ops_opts has the relative_fd member, which allows to pass an additional file descriptor argument. It can be used to attach struct ops maps to cgroups. Signed-off-by: Roman Gushchin --- tools/lib/bpf/bpf.c | 8 ++++++++ tools/lib/bpf/libbpf.c | 18 ++++++++++++++++-- tools/lib/bpf/libbpf.h | 14 ++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5846de364209..84a53c594f48 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -884,6 +884,14 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, cgroup)) return libbpf_err(-EINVAL); break; + case BPF_STRUCT_OPS: + relative_fd =3D OPTS_GET(opts, cgroup.relative_fd, 0); + attr.link_create.cgroup.relative_fd =3D relative_fd; + attr.link_create.cgroup.expected_revision =3D + OPTS_GET(opts, cgroup.expected_revision, 0); + if (!OPTS_ZEROED(opts, cgroup)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0c8bf0b5cce4..70a00da54ff5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13462,12 +13462,19 @@ static int bpf_link__detach_struct_ops(struct bpf= _link *link) return close(link->fd); } =20 -struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts); struct bpf_link_struct_ops *link; __u32 zero =3D 0; int err, fd; =20 + if (!OPTS_VALID(opts, bpf_struct_ops_opts)) { + pr_warn("map '%s': invalid opts\n", map->name); + return libbpf_err_ptr(-EINVAL); + } + if (!bpf_map__is_struct_ops(map)) { pr_warn("map '%s': can't attach non-struct_ops map\n", map->name); return libbpf_err_ptr(-EINVAL); @@ -13503,7 +13510,9 @@ struct 
bpf_link *bpf_map__attach_struct_ops(const s= truct bpf_map *map) return &link->link; } =20 - fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + + fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { free(link); return libbpf_err_ptr(fd); @@ -13515,6 +13524,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const = struct bpf_map *map) return &link->link; } =20 +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + return bpf_map__attach_struct_ops_opts(map, NULL); +} + /* * Swap the back struct_ops of a link with a new struct_ops map. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index dfc37a615578..5aef44bcfcc2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -920,6 +920,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_progr= am *prog, int cgroup_fd, struct bpf_map; =20 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_ma= p *map); + +struct bpf_struct_ops_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u64 expected_revision; + size_t :0; +}; +#define bpf_struct_ops_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts); LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bp= f_map *map); =20 struct bpf_iter_attach_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d18fbcea7578..4779190c97b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -454,4 +454,5 @@ LIBBPF_1.7.0 { bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; btf__permute; + bpf_map__attach_struct_ops_opts; } LIBBPF_1.6.0; --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-182.mta1.migadu.com (out-182.mta1.migadu.com [95.215.58.182]) 
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2D5172D9EE4 for ; Tue, 27 Jan 2026 09:46:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.182 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507182; cv=none; b=Jc6AkP7y5JmZj37vhyArO8ZUTP7vFxw0eY67urqw8Mz+HrejVCpnKU/Y8Yisa2h3/QSMjQRztjaYi9be2WkFwrc4hSUQwkdLGa4nmQJcKRz+A4vH2hq5ACQarJBZcOHD7pjoYkIlHCAAAB0diSC5fREhNc3qCfgMT2nG0UCOvhs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507182; c=relaxed/simple; bh=FST42ieyWdVzaafbTT/pjnBN12hvGiGjV8DHVrhn4G0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=UZN/ef6JZ+3DvdlgvA7/utRKd03tvEccVLF/KLCJj9SBWxMiZuD7HsxBjecNCR9PlyrP30hHMRJorRwp6Cx4xLFFc2oa1T27mK6oDDj4B4Rl1XnzmxL4Dx7fPjp8zTBVqKHhBiTo224jAv4pq+Xw6JN+0QP6TtkCXPy+HBp6l4U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=P7Htg7Dy; arc=none smtp.client-ip=95.215.58.182 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="P7Htg7Dy" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507179; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=qJzfS1JLGvEK+FxWqw8ITpq79hNz1KKD0GGXddU0FBc=; b=P7Htg7Dy6LcFd8R8VdXfIlRFf756CP3aQQqE0Z/k6TdgZY0LGPPqD7tubppy+ucyIppTJs XszAcPfj7QDhMZd//Q4CaJNlXtS7x69b7nbUy3/UywCdzCv5CntwBtC4qKsYL3g4YWzCCM 7Ul2OpmeuFE7OHLo8g03BDqHNO0XCr4= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 06/12] bpf: Pass flags in bpf_link_create for struct_ops Date: Tue, 27 Jan 2026 17:45:50 +0800 Message-ID: <6b2d4fa8e5209d363f553d7851d5a1156137d9fb.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To support features like allowing overrides in cgroup hierarchies, we need a way to pass flags from 
userspace to the kernel when attaching a struct_ops. Extend `bpf_struct_ops_link` to include a `flags` field. This field is populated from `attr->link_create.flags` during link creation. This will allow struct_ops implementations, such as the upcoming memory controller ops, to interpret these flags and modify their attachment behavior accordingly. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 1 + tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 720055d1dbce..13c933cfc614 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1896,6 +1896,7 @@ struct bpf_struct_ops_link { struct bpf_map __rcu *map; wait_queue_head_t wait_hup; u64 cgroup_id; + u32 flags; }; =20 struct bpf_link_primer { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c807793e7633..0df608c88403 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1392,6 +1392,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) cgroup_put(cgrp); } #endif /* CONFIG_CGROUPS */ + link->flags =3D attr->link_create.flags; =20 err =3D bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..4e1c5d6d91ae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1185,7 +1185,7 @@ enum bpf_perf_event_type { BPF_PERF_EVENT_EVENT =3D 6, }; =20 -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH and BPF_LINK_CREATE com= mand * * NONE(default): No further bpf programs allowed in the subtree. 
* --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-179.mta1.migadu.com (out-179.mta1.migadu.com [95.215.58.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD08A2E62A9 for ; Tue, 27 Jan 2026 09:46:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507205; cv=none; b=mvbYXVttLC9eJ9COoqnjyGdZ1Yv05m40BmWeNm0aG5J9763PS0M4rT7P5/9h+Z0oQNS0sm9K4i28Wy4UKpS7u5vXkRtAVkz4OEhQjFrTsfqgmc5LqlF44kNxPrnGsqOEWUeZkK6YMlySEYATEk9TEXNMTqPjKmFH5jI9S2aEw6w= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507205; c=relaxed/simple; bh=9PcymUN+ZCg2e50rgIRHOfTXaJvfmhQQChu88MTol/Q=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=VJvXjVPmyqch7EUnK3GpC6GpbR2IaIvQOOO4Ihs/wAZsMyFQ2RuBz4CDjXob7FaJm0i77ISDRMdzIRotHrEmshCKf7g0reLMJjTwM3b/fQ19gA48UBWdSrI7u1WlIr6SiOki75WB2eqi4JtFKu2E8oiUluyJ36gRxyI8Isysc3E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=GybHeuHi; arc=none smtp.client-ip=95.215.58.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="GybHeuHi" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507191; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=li2p2tnFJJjJ5INNx6uZKzhFQ5sqAQSEauna4Y+cr5o=; b=GybHeuHiFgFrkJ7AI5shcktmPnd9xH64DeeRqlQBhSV0NblYrsh06r9lIa9PvDUqRdVvH8 G3IBrJjSqgSsbon2DrgvqkcEKLa3thyT+3AQ0GiRVk7tXtDpinuyKvLRIKacq4yEPFwPrZ d2++W8afsihpFC/CQhgMyILVg9H3cic= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 07/12] libbpf: Support passing user-defined flags for struct_ops Date: Tue, 27 Jan 2026 17:45:51 +0800 Message-ID: <63d543aec9ade803afcd95461e3089e3d44caca6.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Building on the previous change that added flags to the kernel's link creation path, this patch 
exposes this functionality through libbpf. The `flags` member of `bpf_struct_ops_opts` is read with `OPTS_GET()` and passed to `bpf_link_create()` within `bpf_map__attach_struct_ops_opts`. This enables userspace applications to pass flags, such as `BPF_F_ALLOW_OVERRIDE`, when attaching struct_ops to cgroups, providing more control over the attachment behavior in nested hierarchies. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 70a00da54ff5..06c936bad211 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13511,6 +13511,7 @@ struct bpf_link *bpf_map__attach_struct_ops_opts(co= nst struct bpf_map *map, } =20 link_opts.cgroup.relative_fd =3D OPTS_GET(opts, relative_fd, 0); + link_opts.flags =3D OPTS_GET(opts, flags, 0); =20 fd =3D bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-182.mta1.migadu.com (out-182.mta1.migadu.com [95.215.58.182]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C8A9F33556D; Tue, 27 Jan 2026 09:46:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.182 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507209; cv=none; b=YjC6DklHOvUnXFaKACW49zB94l9aAeUzrXOmA6eYg9KRNs9I+7rGFXvuiDczE8Rus809QW8mUkbUsBbHGowuTSmDD+PwFL6I+DY/RQGuAS525E03PfLsuM3U8KtZdXIQEVrB1xqWceOUlsNEn5LXHKXvDfjtAqu8nKSCezCw9vE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507209; c=relaxed/simple; bh=J6e6CxIAwVkvPRs52IAfsJlCCwOtkA/tG2GinoRREUI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=P98ra1YGVDOyKAAKh+kYRMmSdUEH5DhlHARFG+UM2uPVsZRPPqwVwUAz00ZDbHUKYibCkD/UGmVxT8Zbl2zjIqdCfCNLVumuGw1dJ+TqqaAsGMS5seU9gfbYhk1KuiZLQ516DNrKe1QHTbQHuUWUa/Drpn8xh7LrZyCBtWmiKfc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=Ye3jWQBK; arc=none smtp.client-ip=95.215.58.182 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="Ye3jWQBK" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507204; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=TlOrsGmE89AzNqFkY/W+yKrUCs36fwQ3YYkQ5KYGK94=; b=Ye3jWQBKsWmroCQ6+jdiYjHI3u5VE2K6ZT/XHil364iLGKjePoXqtlLro69f6mlIkm8zsE L7r9ErMNQBZloJAO3OF0GrZfRkDShw3BQmpd6+uuQobQip2oyLCcF6NtXNtA8aw6MLUr/5 rfxsVZ6fSjHAIexpXtIjzEumuSClMxo= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 08/12] mm: memcontrol: Add BPF struct_ops for memory controller Date: Tue, 27 Jan 2026 17:45:52 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Introduce BPF struct_ops support to the memory controller, enabling custom and dynamic control over memory pressure. This is achieved through a new struct_ops type, `memcg_bpf_ops`. This new interface allows a BPF program to implement hooks that influence a memory cgroup's behavior. The `memcg_bpf_ops` struct provides the following hooks: - `get_high_delay_ms`: Returns a custom throttling delay in milliseconds for a cgroup that has breached its `memory.high` limit. This is the primary mechanism for BPF-driven throttling. - `below_low`: Overrides the `memory.low` protection check. If this hook returns true, the cgroup is considered to be protected by its `memory.low` setting, regardless of its actual usage. - `below_min`: Similar to `below_low`, this overrides the `memory.min` protection check. - `handle_cgroup_online`/`offline`: Callbacks invoked when a cgroup with an attached program comes online or goes offline, allowing for state management. This patch integrates these hooks into the core memory control logic. The `get_high_delay_ms` value is incorporated into charge paths like `try_charge_memcg` and the high-limit handler `__mem_cgroup_handle_over_high`. The `below_low` and `below_min` hooks are checked within their respective protection functions. 
Lifecycle management is handled to ensure BPF programs are correctly inherited by child cgroups and cleaned up on detachment. SRCU is used to protect concurrent access to the `memcg->bpf_ops` pointer. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 108 +++++++++++++++- mm/bpf_memcontrol.c | 251 ++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 32 +++-- 3 files changed, 378 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f3b8c71870d8..24c4df864401 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,6 +181,37 @@ struct obj_cgroup { }; }; =20 +#ifdef CONFIG_BPF_SYSCALL +/** + * struct memcg_bpf_ops - BPF callbacks for memory cgroup operations + * @handle_cgroup_online: Called when a cgroup comes online + * @handle_cgroup_offline: Called when a cgroup goes offline + * @below_low: Override memory.low protection check. If this callback retu= rns + * true, mem_cgroup_below_low() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.low threshold comparison will proceed norma= lly. + * @below_min: Override memory.min protection check. If this callback retu= rns + * true, mem_cgroup_below_min() will return true immediately w= ithout + * performing the standard comparison. If it returns false, the + * original memory.min threshold comparison will proceed norma= lly. + * @get_high_delay_ms: Return custom throttle delay in milliseconds + * + * This structure defines the interface for BPF programs to customize + * memory cgroup behavior through struct_ops programs. 
+ */ +struct memcg_bpf_ops { + void (*handle_cgroup_online)(struct mem_cgroup *memcg); + + void (*handle_cgroup_offline)(struct mem_cgroup *memcg); + + bool (*below_low)(struct mem_cgroup *memcg); + + bool (*below_min)(struct mem_cgroup *memcg); + + unsigned int (*get_high_delay_ms)(struct mem_cgroup *memcg); +}; +#endif /* CONFIG_BPF_SYSCALL */ + /* * The memory controller data structure. The memory controller controls bo= th * page cache and RSS per cgroup. We would eventually like to provide @@ -321,6 +352,10 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ =20 +#ifdef CONFIG_BPF_SYSCALL + struct memcg_bpf_ops *bpf_ops; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; =20 @@ -554,6 +589,68 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } =20 +#ifdef CONFIG_BPF_SYSCALL + +/* SRCU for protecting concurrent access to memcg->bpf_ops */ +extern struct srcu_struct memcg_bpf_srcu; + +/** + * BPF_MEMCG_CALL - Safely invoke a BPF memcg callback + * @memcg: The memory cgroup + * @op: The operation name (struct member) + * @default_val: Default return value if no BPF program attached + * + * This macro safely calls a BPF callback under SRCU protection. 
+ */ +#define BPF_MEMCG_CALL(memcg, op, default_val) ({ \ + typeof(default_val) __ret =3D (default_val); \ + struct memcg_bpf_ops *__ops; \ + int __idx; \ + \ + __idx =3D srcu_read_lock(&memcg_bpf_srcu); \ + __ops =3D READ_ONCE((memcg)->bpf_ops); \ + if (__ops && __ops->op) \ + __ret =3D __ops->op(memcg); \ + srcu_read_unlock(&memcg_bpf_srcu, __idx); \ + __ret; \ +}) + +static inline bool bpf_memcg_below_low(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_low, false); +} + +static inline bool bpf_memcg_below_min(struct mem_cgroup *memcg) +{ + return BPF_MEMCG_CALL(memcg, below_min, false); +} + +static inline unsigned long bpf_memcg_get_high_delay(struct mem_cgroup *me= mcg) +{ + unsigned int ret; + + ret =3D BPF_MEMCG_CALL(memcg, get_high_delay_ms, 0U); + return msecs_to_jiffies(ret); +} + +#undef BPF_MEMCG_CALL + +extern void memcontrol_bpf_online(struct mem_cgroup *memcg); +extern void memcontrol_bpf_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ + +static inline unsigned long +bpf_memcg_get_high_delay(struct mem_cgroup *memcg) { return 0; } +static inline bool +bpf_memcg_below_low(struct mem_cgroup *memcg) { return false; } +static inline bool +bpf_memcg_below_min(struct mem_cgroup *memcg) { return false; } +static inline void memcontrol_bpf_online(struct mem_cgroup *memcg) { } +static inline void memcontrol_bpf_offline(struct mem_cgroup *memcg) { } + +#endif /* CONFIG_BPF_SYSCALL */ + static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, @@ -625,6 +722,9 @@ static inline bool mem_cgroup_below_low(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if (bpf_memcg_below_low(memcg)) + return true; + return READ_ONCE(memcg->memory.elow) >=3D page_counter_read(&memcg->memory); } @@ -635,6 +735,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgro= up *target, if (mem_cgroup_unprotected(target, memcg)) return false; =20 + if 
(bpf_memcg_below_min(memcg)) + return true; + return READ_ONCE(memcg->memory.emin) >=3D page_counter_read(&memcg->memory); } @@ -909,12 +1012,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lr= uvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } =20 -void __mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask, + unsigned long bpf_high_delay); =20 static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) - __mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask, 0); } =20 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 716df49d7647..e746eb9cbd56 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -8,6 +8,9 @@ #include #include =20 +/* Read-side protection for memcg->bpf_ops; unreg calls synchronize_srcu(). */ +DEFINE_SRCU(memcg_bpf_srcu); + __bpf_kfunc_start_defs(); =20 /** @@ -179,15 +182,259 @@ static const struct btf_kfunc_id_set bpf_memcontrol_= kfunc_set =3D { .set =3D &bpf_memcontrol_kfuncs, }; =20 +/** + * memcontrol_bpf_online - Inherit BPF programs for a new online cgroup. + * @memcg: The memory cgroup that is coming online. + * + * When a new memcg is brought online, it inherits the BPF programs + * attached to its parent. This ensures consistent BPF-based memory + * control policies throughout the cgroup hierarchy. + * + * After inheriting, if the BPF program has an online handler, it is + * invoked for the new memcg. + */ +void memcontrol_bpf_online(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + struct mem_cgroup *parent_memcg; + + /* The root cgroup does not inherit from a parent. */ + if (mem_cgroup_is_root(memcg)) + return; + + parent_memcg =3D parent_mem_cgroup(memcg); + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + /* Inherit the BPF program from the parent cgroup. 
*/ + ops =3D READ_ONCE(parent_memcg->bpf_ops); + if (!ops) + goto out; + + WRITE_ONCE(memcg->bpf_ops, ops); + + /* + * If the BPF program implements it, call the online handler to + * allow the program to perform setup tasks for the new cgroup. + */ + if (!ops->handle_cgroup_online) + goto out; + + ops->handle_cgroup_online(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +/** + * memcontrol_bpf_offline - Run BPF cleanup for an offline cgroup. + * @memcg: The memory cgroup that is going offline. + * + * If a BPF program is attached and implements an offline handler, + * it is invoked to perform cleanup tasks before the memcg goes + * completely offline. + */ +void memcontrol_bpf_offline(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + + idx =3D srcu_read_lock(&memcg_bpf_srcu); + + ops =3D READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->handle_cgroup_offline) + goto out; + + ops->handle_cgroup_offline(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return -EACCES; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_t= ype type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops =3D { + .get_func_proto =3D bpf_base_func_proto, + .btf_struct_access =3D memcg_ops_btf_struct_access, + .is_valid_access =3D memcg_ops_is_valid_access, +}; + +static void cfi_handle_cgroup_online(struct mem_cgroup *memcg) +{ +} + +static void cfi_handle_cgroup_offline(struct mem_cgroup *memcg) +{ +} + +static bool cfi_below_low(struct mem_cgroup *memcg) +{ + return false; +} + +static bool cfi_below_min(struct mem_cgroup *memcg) +{ + return false; +} + +static unsigned int cfi_get_high_delay_ms(struct mem_cgroup *memcg) +{ + return 0; +} + 
+static struct memcg_bpf_ops cfi_bpf_memcg_ops =3D { + .handle_cgroup_online =3D cfi_handle_cgroup_online, + .handle_cgroup_offline =3D cfi_handle_cgroup_offline, + .below_low =3D cfi_below_low, + .below_min =3D cfi_below_min, + .get_high_delay_ms =3D cfi_get_high_delay_ms, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff =3D __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_bpf_ops, handle_cgroup_online): + case offsetof(struct memcg_bpf_ops, handle_cgroup_offline): + case offsetof(struct memcg_bpf_ops, below_low): + case offsetof(struct memcg_bpf_ops, below_min): + case offsetof(struct memcg_bpf_ops, get_high_delay_ms): + break; + default: + return -EINVAL; + } + + if (prog->sleepable) + return -EINVAL; + + return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +/** + * clean_memcg_bpf_ops - Clear BPF ops from a memory cgroup hierarchy + * @memcg: Root memory cgroup to start from + * @ops: The specific BPF ops to remove + * + * Walks the cgroup hierarchy and clears bpf_ops for any cgroup that + * matches @ops. 
+ */ +static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, + struct memcg_bpf_ops *ops) +{ + struct mem_cgroup *iter =3D NULL; + + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops) =3D=3D ops) + WRITE_ONCE(iter->bpf_ops, NULL); + } +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg, *iter =3D NULL; + int err =3D 0; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (!memcg) + return -ENOENT; + if (IS_ERR(memcg)) + return PTR_ERR(memcg); + + cgroup_lock(); + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops)) { + mem_cgroup_iter_break(memcg, iter); + err =3D -EBUSY; + break; + } + WRITE_ONCE(iter->bpf_ops, ops); + } + if (err) + clean_memcg_bpf_ops(memcg, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + return err; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + =3D container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops =3D kdata; + struct mem_cgroup *memcg; + + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto out; + + cgroup_lock(); + clean_memcg_bpf_ops(memcg, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + +out: + synchronize_srcu(&memcg_bpf_srcu); +} + +static struct bpf_struct_ops bpf_memcg_bpf_ops =3D { + .verifier_ops =3D &bpf_memcg_verifier_ops, + .init =3D bpf_memcg_ops_init, + .check_member =3D bpf_memcg_ops_check_member, + .init_member =3D bpf_memcg_ops_init_member, + .reg =3D bpf_memcg_ops_reg, + .unreg =3D bpf_memcg_ops_unreg, + .name =3D "memcg_bpf_ops", + .owner =3D THIS_MODULE, + .cfi_stubs =3D &cfi_bpf_memcg_ops, +}; + static int __init bpf_memcontrol_init(void) { - int err; + int err, 
err2; =20 err =3D register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if (err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); =20 - return err; + err2 =3D register_bpf_struct_ops(&bpf_memcg_bpf_ops, memcg_bpf_ops); + if (err2) + pr_warn("error while registering memcontrol bpf ops: %d", err2); + + return err ? err : err2; } late_initcall(bpf_memcontrol_init); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f74fce27677..8d90575aa77d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2252,7 +2252,8 @@ static unsigned long calculate_high_delay(struct mem_= cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void __mem_cgroup_handle_over_high(gfp_t gfp_mask) +void +__mem_cgroup_handle_over_high(gfp_t gfp_mask, unsigned long bpf_high_delay) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2294,11 +2295,15 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. 
*/ - penalty_jiffies =3D calculate_high_delay(memcg, nr_pages, - mem_find_max_overage(memcg)); + if (nr_pages) { + penalty_jiffies =3D calculate_high_delay( + memcg, nr_pages, mem_find_max_overage(memcg)); =20 - penalty_jiffies +=3D calculate_high_delay(memcg, nr_pages, - swap_find_max_overage(memcg)); + penalty_jiffies +=3D calculate_high_delay( + memcg, nr_pages, swap_find_max_overage(memcg)); + } else + penalty_jiffies =3D 0; + penalty_jiffies =3D max(penalty_jiffies, bpf_high_delay); =20 /* * Clamp the max delay per usermode return so as to still keep the @@ -2356,6 +2361,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, bool raised_max_event =3D false; unsigned long pflags; bool allow_spinning =3D gfpflags_allow_spinning(gfp_mask); + struct mem_cgroup *orig_memcg; =20 retry: if (consume_stock(memcg, nr_pages)) @@ -2481,6 +2487,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg,= gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); =20 + orig_memcg =3D memcg; /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here @@ -2530,10 +2537,14 @@ static int try_charge_memcg(struct mem_cgroup *memc= g, gfp_t gfp_mask, * kernel. If this is successful, the return path will see it * when it rechecks the overage and simply bail out. 
*/ - if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && - !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) - __mem_cgroup_handle_over_high(gfp_mask); + if (gfpflags_allow_blocking(gfp_mask)) { + unsigned long bpf_high_delay; + + bpf_high_delay =3D bpf_memcg_get_high_delay(orig_memcg); + if (bpf_high_delay || + current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH) + __mem_cgroup_handle_over_high(gfp_mask, bpf_high_delay); + } return 0; } =20 @@ -3906,6 +3917,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys= _state *css) */ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); =20 + memcontrol_bpf_online(memcg); + return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -3925,6 +3938,7 @@ static void mem_cgroup_css_offline(struct cgroup_subs= ys_state *css) =20 zswap_memcg_offline_cleanup(memcg); =20 + memcontrol_bpf_offline(memcg); memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-173.mta1.migadu.com (out-173.mta1.migadu.com [95.215.58.173]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5DEF0331A4D for ; Tue, 27 Jan 2026 09:48:01 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.173 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507284; cv=none; b=hHnGXQeeOkfRhp9sdTX6oZfhqtUEFXaxdbABjyMQvEf7EH4sc5v8hpVAh54q1dhWF/tEunkQrSTFFTgxIw9DhV84pVB4X3rUmgpf+rMJShkC303OpP7KenAJYHzR7FIuF8FJcvhiwi0CvT8FyrRA9CsGlcFLrKIgaHz2L3737pk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507284; c=relaxed/simple; bh=cFgyCB+Xo928vdCQ2YHyjKAyociQPoqpJIjXgNroi4I=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=gmWy+M7+SHZA/okwiTpDn5j7y6OKqXAF74ju2he6qzeni2w6LHGQmjAT49N09Mytf4mdqGf7FRWiwgJpjOOAmUwU1eSXLQ8wovxwgAzcdWi/A7XVEie7ngIokPCiSbX1AbQGdodoy7Dgum2f4dIEjcvUxBY9TFAVVzoFt34yJIc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=m8nCQ29z; arc=none smtp.client-ip=95.215.58.173 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="m8nCQ29z" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507279; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=O+67HLQQ2B8tTBN7esBDatZbpYLzuXbZR8dRSH8sAe4=; b=m8nCQ29z+k3FxE0Ie3sgjRqB16ot4orj7QVzotYaYqHd/djkOWxNDCfe34QL0tfM515m8F s9dL0XenMv71F+Sk9rqiYgJlSwqEWySw27FqgW0T2d/Sroy8QYP2mejcUHQMBo/pWpi0Qr /sO02T2mQ7ehuBYQxAU6FIdQ2PXRXQc= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul 
Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 09/12] selftests/bpf: Add tests for memcg_bpf_ops Date: Tue, 27 Jan 2026 17:47:34 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a comprehensive selftest suite for the `memcg_bpf_ops` functionality. These tests validate that BPF programs can correctly influence memory cgroup throttling behavior by implementing the new hooks. The test suite is added in `prog_tests/memcg_ops.c` and covers several key scenarios: 1. `test_memcg_ops_over_high`: Verifies that a BPF program can trigger throttling on a low-priority cgroup by returning a delay from the `get_high_delay_ms` hook when a high-priority cgroup is under pressure. 2. `test_memcg_ops_below_low_over_high`: Tests the combination of the `below_low` and `get_high_delay_ms` hooks, ensuring they work together as expected. 3. `test_memcg_ops_below_min_over_high`: Validates the interaction between the `below_min` and `get_high_delay_ms` hooks. The test framework sets up a cgroup hierarchy with high and low priority groups, attaches BPF programs, runs memory-intensive workloads, and asserts that the observed throttling (measured by workload execution time) matches expectations. The BPF program (`progs/memcg_ops.c`) uses a tracepoint on `memcg:count_memcg_events` (specifically PGFAULT) to detect memory pressure and trigger the appropriate hooks in response. This test suite provides essential validation for the new memory control mechanisms. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + .../selftests/bpf/prog_tests/memcg_ops.c | 535 ++++++++++++++++++ tools/testing/selftests/bpf/progs/memcg_ops.c | 130 +++++ 3 files changed, 667 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/MAINTAINERS b/MAINTAINERS index 491d567f7dc8..7e07bb330eae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6471,6 +6471,8 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 000000000000..a596926ea233 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,535 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg_ops.skel.h" + +#define TRIGGER_THRESHOLD 1 +#define OVER_HIGH_MS 2000 +#define FILE_SIZE (64 * 1024 * 1024ul) +#define BUFFER_SIZE (4096) +#define CG_LIMIT (120 * 1024 * 1024ul) + +#define CG_DIR "/memcg_ops_test" +#define CG_HIGH_DIR CG_DIR "/high" +#define CG_LOW_DIR CG_DIR "/low" + +static int +setup_cgroup(int *high_cgroup_id, int *low_cgroup_fd, int *high_cgroup_fd) +{ + int ret; + char limit_buf[20]; + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + ret =3D 
create_and_get_cgroup(CG_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR)) + goto cleanup; + close(ret); + ret =3D enable_controllers(CG_DIR, "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + snprintf(limit_buf, 20, "%ld", CG_LIMIT); + ret =3D write_cgroup_file(CG_DIR, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file memory.max")) + goto cleanup; + ret =3D write_cgroup_file(CG_DIR, "memory.swap.max", "0"); + if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max")) + goto cleanup; + + ret =3D create_and_get_cgroup(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR)) + goto cleanup; + if (high_cgroup_fd) + *high_cgroup_fd =3D ret; + else + close(ret); + ret =3D (int)get_cgroup_id(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "get_cgroup_id")) + goto cleanup; + *high_cgroup_id =3D ret; + + ret =3D create_and_get_cgroup(CG_LOW_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR)) + goto cleanup; + if (low_cgroup_fd) + *low_cgroup_fd =3D ret; + else + close(ret); + + return 0; + +cleanup: + cleanup_cgroup_environment(); + return -1; +} + +int write_file(const char *filename) +{ + int ret =3D -1; + size_t written =3D 0; + char *buffer; + FILE *fp; + + fp =3D fopen(filename, "wb"); + if (!fp) + goto out; + + buffer =3D malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t to_write =3D (FILE_SIZE - written < BUFFER_SIZE) ? 
+ (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) !=3D to_write) + goto cleanup; + written +=3D to_write; + } + + ret =3D 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +int read_file(const char *filename, int iterations) +{ + int ret =3D -1; + long page_size =3D sysconf(_SC_PAGESIZE); + char *p; + char *map; + size_t i; + int fd; + struct stat sb; + + fd =3D open(filename, O_RDONLY); + if (fd =3D=3D -1) + goto out; + + if (fstat(fd, &sb) =3D=3D -1) + goto cleanup_fd; + + if (sb.st_size !=3D FILE_SIZE) { + fprintf(stderr, "File size mismatch: expected %ld, got %ld\n", + FILE_SIZE, sb.st_size); + goto cleanup_fd; + } + + map =3D mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + if (map =3D=3D MAP_FAILED) + goto cleanup_fd; + + for (int iter =3D 0; iter < iterations; iter++) { + for (i =3D 0; i < FILE_SIZE; i +=3D page_size) { + /* access a byte to trigger page fault */ + p =3D &map[i]; + __asm__ __volatile__("" : : "r"(p) : "memory"); + } + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d %d done\n", __func__, getpid(), iter); + } + + if (munmap(map, FILE_SIZE) =3D=3D -1) + goto cleanup_fd; + + ret =3D 0; + +cleanup_fd: + close(fd); +out: + return ret; +} + +static void +real_test_memcg_ops_child_work(const char *cgroup_path, + char *data_filename, + char *time_filename, + int read_times) +{ + struct timeval start, end; + double elapsed; + FILE *fp; + + if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup")) + return; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d begin\n", __func__, getpid()); + + gettimeofday(&start, NULL); + + if (!ASSERT_OK(write_file(data_filename), "write_file")) + return; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d write_file done\n", __func__, getpid()); + + if (!ASSERT_OK(read_file(data_filename, read_times), "read_file")) + return; + + gettimeofday(&end, NULL); + + elapsed =3D (end.tv_sec - start.tv_sec) + + (end.tv_usec 
- start.tv_usec) / 1000000.0; + + if (env.verbosity >=3D VERBOSE_NORMAL) + printf("%s %d end %.6f\n", __func__, getpid(), elapsed); + + fp =3D fopen(time_filename, "w"); + if (!ASSERT_OK_PTR(fp, "fopen")) + return; + fprintf(fp, "%.6f", elapsed); + fclose(fp); +} + +static int get_time(char *time_filename, double *time) +{ + int ret =3D -1; + FILE *fp; + char buf[64]; + + fp =3D fopen(time_filename, "r"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + + if (!ASSERT_OK_PTR(fgets(buf, sizeof(buf), fp), "fgets")) + goto cleanup; + + if (sscanf(buf, "%lf", time) !=3D 1) { + PRINT_FAIL("sscanf %s", buf); + goto cleanup; + } + + ret =3D 0; +cleanup: + fclose(fp); +out: + return ret; +} + +static void real_test_memcg_ops(int read_times) +{ + int ret; + char data_file1[] =3D "/tmp/test_data_XXXXXX"; + char data_file2[] =3D "/tmp/test_data_XXXXXX"; + char time_file1[] =3D "/tmp/test_time_XXXXXX"; + char time_file2[] =3D "/tmp/test_time_XXXXXX"; + pid_t pid1, pid2; + double time1, time2; + + ret =3D mkstemp(data_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + return; + close(ret); + ret =3D mkstemp(data_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + ret =3D mkstemp(time_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file2; + close(ret); + ret =3D mkstemp(time_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_time_file1; + close(ret); + + pid1 =3D fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid1 =3D=3D 0) { + real_test_memcg_ops_child_work(CG_LOW_DIR, + data_file1, + time_file1, + read_times); + exit(0); + } + + pid2 =3D fork(); + if (!ASSERT_GE(pid2, 0, "fork")) + goto cleanup; + if (pid2 =3D=3D 0) { + real_test_memcg_ops_child_work(CG_HIGH_DIR, + data_file2, + time_file2, + read_times); + exit(0); + } + + ret =3D waitpid(pid1, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + ret =3D waitpid(pid2, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + if 
(get_time(time_file1, &time1)) + goto cleanup; + + if (get_time(time_file2, &time2)) + goto cleanup; + + if (time1 < time2 || time1 - time2 <=3D 1) + PRINT_FAIL("low fast compare time1=3D%f, time2=3D%f", + time1, time2); + +cleanup: + unlink(time_file2); +cleanup_time_file1: + unlink(time_file1); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +} + +void test_memcg_ops_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link2 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, NULL); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto 
out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + + opts.relative_fd =3D low_cgroup_fd; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(5); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_low_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D true; + bss_data->local_config.use_below_min =3D false; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D 
bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name high_mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} + +void test_memcg_ops_below_min_over_high(void) +{ + int err, map_fd; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct memcg_ops__bss *bss_data; + __u32 key =3D 0; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_high =3D NULL, *link_low =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, high_cgroup_fd =3D -1, low_cgroup_fd =3D -1; + + err =3D setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd =3D bpf_map__fd(map); 
+ bss_data =3D malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D TRIGGER_THRESHOLD; + bss_data->local_config.use_below_low =3D false; + bss_data->local_config.use_below_min =3D true; + bss_data->local_config.over_high_ms =3D OVER_HIGH_MS; + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog =3D bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link =3D bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "high_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name high_mcg_ops")) + goto out; + opts.relative_fd =3D high_cgroup_fd; + link_high =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts")) + goto out; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto out; + opts.relative_fd =3D low_cgroup_fd; + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops(50); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_high); + bpf_link__destroy(link_low); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(high_cgroup_fd); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/= selftests/bpf/progs/memcg_ops.c new file mode 100644 index 000000000000..e611ac0e641a --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/memcg_ops.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, &current_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + u64 current_ts; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + 
return need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-177.mta1.migadu.com (out-177.mta1.migadu.com [95.215.58.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B52F5335547 for ; Tue, 27 Jan 2026 09:48:12 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.177 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507294; cv=none; b=XeiWiIUTgNf350oc0DpJbDJO6G5/9CwLRHptcsOLddsP+KSfJ+LylVogG46k6aAPeyp9s/nvlLsxcnkhJWSsqwcQTpEKpPMluNG4vd89GX9LRo0y18A+kHM3E2l0E5rXdZRtaLIQfc5qgw0x3B/mE0wF/+bCCjBQtSB+s+Q0xDg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507294; c=relaxed/simple; bh=FCFy1j/Ma2OsiV9AvUNgxQIAPG/9M6RdjHZidkT7GR8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gsUbHL2epqZqcbKrY68q1oUUrLzzOVwZaKoYKknKDQaW+Ac4cauwnzkBewhjqXFmIo0kxZTUfpVsHNRGs53zRrskfIgz9+q6BGpn8P1WwriB65bVZy2dkx53LNWeff/qczRxig/No7HmUKKDHQu5g+1apTDMjwQVdqaSSBLjkfg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; 
dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=KimfuEVO; arc=none smtp.client-ip=95.215.58.177 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="KimfuEVO" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507290; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ZDAnxBm956aOF5ZEWN5PM3SfgIASeIOgGwSIT/GymGw=; b=KimfuEVO0LaSFIx1vLjDfszO3hna5+Nma5FK1NDslPhcUrH/gP7YCHp6FkQNCIp4Rrzvxh HcXQNqhZRXxjjgOraT/91ord6ZcUnFisKaRTUxDxWmtCQ3F8UykYDH28W8zxPQ9101jV/z YcM2Z+FHQD7uvJ4j8I97lOIPgKGAtUo= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 10/12] mm/bpf: Add 
BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops Date: Tue, 27 Jan 2026 17:47:35 +0800 Message-ID: <4a1ad099a2db96b92318cb924d2a7b76ee530209.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu To allow for more flexible attachment policies in nested cgroup hierarchies, this patch introduces support for the `BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`. When a `memcg_bpf_ops` is attached to a cgroup with this flag, it permits child cgroups to attach their own, different `memcg_bpf_ops`, overriding the parent's inherited program. Without this flag, attaching a BPF program to a cgroup that already has one (either directly or via inheritance) will fail. The implementation involves: - Adding a `bpf_ops_flags` field to `struct mem_cgroup`. - During registration (`bpf_memcg_ops_reg`), checking for existing programs and the `BPF_F_ALLOW_OVERRIDE` flag. - During unregistration (`bpf_memcg_ops_unreg`), correctly restoring the parent's BPF program to the cgroup hierarchy. - Ensuring flags are inherited by child cgroups during online events. This change enables complex, multi-level policy enforcement where different subtrees of the cgroup hierarchy can have distinct memory management BPF programs. 
Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- include/linux/memcontrol.h | 1 + mm/bpf_memcontrol.c | 82 ++++++++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24c4df864401..98c16e8dcd5b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,6 +354,7 @@ struct mem_cgroup { =20 #ifdef CONFIG_BPF_SYSCALL struct memcg_bpf_ops *bpf_ops; + u32 bpf_ops_flags; #endif =20 struct mem_cgroup_per_node *nodeinfo[]; diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e746eb9cbd56..7cd983e350d7 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg) goto out; =20 WRITE_ONCE(memcg->bpf_ops, ops); + memcg->bpf_ops_flags =3D parent_memcg->bpf_ops_flags; =20 /* * If the BPF program implements it, call the online handler to @@ -338,33 +339,19 @@ static int bpf_memcg_ops_init_member(const struct btf= _type *t, return 0; } =20 -/** - * clean_memcg_bpf_ops - Clear BPF ops from a memory cgroup hierarchy - * @memcg: Root memory cgroup to start from - * @ops: The specific BPF ops to remove - * - * Walks the cgroup hierarchy and clears bpf_ops for any cgroup that - * matches @ops. 
- */ -static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, - struct memcg_bpf_ops *ops) -{ - struct mem_cgroup *iter =3D NULL; - - while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { - if (READ_ONCE(iter->bpf_ops) =3D=3D ops) - WRITE_ONCE(iter->bpf_ops, NULL); - } -} - static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_struct_ops_link *ops_link =3D container_of(link, struct bpf_struct_ops_link, link); - struct memcg_bpf_ops *ops =3D kdata; + struct memcg_bpf_ops *ops =3D kdata, *old_ops; struct mem_cgroup *memcg, *iter =3D NULL; int err =3D 0; =20 + if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) { + pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n"); + return -EOPNOTSUPP; + } + memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (!memcg) return -ENOENT; @@ -372,16 +359,41 @@ static int bpf_memcg_ops_reg(void *kdata, struct bpf_= link *link) return PTR_ERR(memcg); =20 cgroup_lock(); + + /* + * Check if memcg has bpf_ops and whether it is inherited from + * parent. + * If inherited and BPF_F_ALLOW_OVERRIDE is set, allow override. + */ + old_ops =3D READ_ONCE(memcg->bpf_ops); + if (old_ops) { + struct mem_cgroup *parent_memcg =3D parent_mem_cgroup(memcg); + + if (!parent_memcg || + !(memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) || + READ_ONCE(parent_memcg->bpf_ops) !=3D old_ops) { + err =3D -EBUSY; + goto unlock_out; + } + } + + /* Check for incompatible bpf_ops in descendants. */ while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { - if (READ_ONCE(iter->bpf_ops)) { - mem_cgroup_iter_break(memcg, iter); + struct memcg_bpf_ops *iter_ops =3D READ_ONCE(iter->bpf_ops); + + if (iter_ops && iter_ops !=3D old_ops) { + /* cannot override existing bpf_ops of sub-cgroup. 
*/ err =3D -EBUSY; - break; + goto unlock_out; } + } + + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { WRITE_ONCE(iter->bpf_ops, ops); + iter->bpf_ops_flags =3D ops_link->flags; } - if (err) - clean_memcg_bpf_ops(memcg, ops); + +unlock_out: cgroup_unlock(); =20 mem_cgroup_put(memcg); @@ -395,13 +407,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct b= pf_link *link) =3D container_of(link, struct bpf_struct_ops_link, link); struct memcg_bpf_ops *ops =3D kdata; struct mem_cgroup *memcg; + struct mem_cgroup *iter; + struct memcg_bpf_ops *parent_bpf_ops =3D NULL; + u32 parent_bpf_ops_flags =3D 0; =20 memcg =3D mem_cgroup_get_from_ino(ops_link->cgroup_id); if (IS_ERR_OR_NULL(memcg)) goto out; =20 cgroup_lock(); - clean_memcg_bpf_ops(memcg, ops); + + /* Get the parent bpf_ops and bpf_ops_flags */ + iter =3D parent_mem_cgroup(memcg); + if (iter) { + parent_bpf_ops =3D READ_ONCE(iter->bpf_ops); + parent_bpf_ops_flags =3D iter->bpf_ops_flags; + } + + iter =3D NULL; + while ((iter =3D mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops) =3D=3D ops) { + WRITE_ONCE(iter->bpf_ops, parent_bpf_ops); + iter->bpf_ops_flags =3D parent_bpf_ops_flags; + } + } + cgroup_unlock(); =20 mem_cgroup_put(memcg); --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-187.mta1.migadu.com (out-187.mta1.migadu.com [95.215.58.187]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 171B9335067 for ; Tue, 27 Jan 2026 09:48:23 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=95.215.58.187 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507307; cv=none; b=fcDlfWiSOp8EaZGQk20QB+J24HtmIyFAun5ABPKa1BJP8xBOGw71imQn9kq0j/HHLGSRRSkazG421eKLut+o4m8aWhZQ7ft7JTZO+PASh6mlF8AuojBxr0xbiyrd6SwiyUPJycfLhQBnk2PZyh4WsgNeqmfUJ4FiUQqIKSZkeHw= ARC-Message-Signature: i=1; a=rsa-sha256; 
d=subspace.kernel.org; s=arc-20240116; t=1769507307; c=relaxed/simple; bh=aDizkIIrISoiSOmQOCh/qH2aJrcVckHeAXd0H3mi6To=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=lEaASao/NwycQWTC4fhTgZvt/HmArZoTv/QpgMXA7A+tooMjh+tVnvkIDj6ZjrxPtVSSy/7Zf34KN/dCpVLO6ebNt3VORz69eAYOWvaV4P/k5DUGEOuCaQE7oy8I1r9lMeVH13H3OdXQBSeJy+8DIXxQ+2O98pzHDWs/8NcnNNA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=O6bFglpo; arc=none smtp.client-ip=95.215.58.187 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="O6bFglpo" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507302; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=38FTcR+0kk8dvZzN/w/J+BLbrBHVmHWXDv78hCgH2EM=; b=O6bFglpozchYmTk8MyWF7zSH8QHQ/5EZph8UjcURHY/I6G6u9aXp2ss64fZKkFnKeTpNvs tqZkyPA0egpf98Jjir8s3OFqJBVHPe/YdMWqAaDUaIlv3XWSiPw7BNJnNlyC8/LXNOvDMD WP6OVk0NbAaZUqMMsVcEI9DpEPS0RyQ= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 11/12] selftests/bpf: Add test for memcg_bpf_ops hierarchies Date: Tue, 27 Jan 2026 17:47:36 +0800 Message-ID: <0b2c894a6544aa44db25d01def2c6c3e23a8bcb4.1769506741.git.zhuhui@kylinos.cn> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a new selftest, `test_memcg_ops_hierarchies`, to validate the behavior of attaching 
`memcg_bpf_ops` in a nested cgroup hierarchy, specifically testing the `BPF_F_ALLOW_OVERRIDE` flag. The test case performs the following steps: 1. Creates a three-level deep cgroup hierarchy: `/cg`, `/cg/cg`, and `/cg/cg/cg`. 2. Attaches a BPF struct_ops to the top-level cgroup (`/cg`) with the `BPF_F_ALLOW_OVERRIDE` flag. 3. Successfully attaches a new struct_ops to the middle cgroup (`/cg/cg`) without the flag, overriding the inherited one. 4. Asserts that attaching another struct_ops to the deepest cgroup (`/cg/cg/cg`) fails with -EBUSY, because its parent did not specify `BPF_F_ALLOW_OVERRIDE`. This test ensures that the attachment logic correctly enforces the override rules across a cgroup subtree. Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- .../selftests/bpf/prog_tests/memcg_ops.c | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/tes= ting/selftests/bpf/prog_tests/memcg_ops.c index a596926ea233..91084e8acc32 100644 --- a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -533,3 +533,74 @@ void test_memcg_ops_below_min_over_high(void) close(low_cgroup_fd); cleanup_cgroup_environment(); } + +void test_memcg_ops_hierarchies(void) +{ + int ret, first =3D -1, second =3D -1, third =3D -1; + struct memcg_ops *skel =3D NULL; + struct bpf_map *map; + struct bpf_link *link1 =3D NULL, *link2 =3D NULL, *link3 =3D NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + + ret =3D setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + first =3D create_and_get_cgroup("/cg"); + if (!ASSERT_GE(first, 0, "create_and_get_cgroup /cg")) + goto cleanup; + ret =3D enable_controllers("/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + second =3D create_and_get_cgroup("/cg/cg"); + if (!ASSERT_GE(second, 0, "create_and_get_cgroup /cg/cg")) + goto cleanup; + ret 
=3D enable_controllers("/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + third =3D create_and_get_cgroup("/cg/cg/cg"); + if (!ASSERT_GE(third, 0, "create_and_get_cgroup /cg/cg/cg")) + goto cleanup; + ret =3D enable_controllers("/cg/cg/cg", "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + + skel =3D memcg_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load")) + goto cleanup; + + map =3D bpf_object__find_map_by_name(skel->obj, "low_mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops")) + goto cleanup; + + opts.relative_fd =3D first; + opts.flags =3D BPF_F_ALLOW_OVERRIDE; + link1 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link1, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D second; + opts.flags =3D 0; + link2 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + + opts.relative_fd =3D third; + opts.flags =3D 0; + link3 =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_ERR_PTR(link3, "bpf_map__attach_struct_ops_opts")) + goto cleanup; + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(link3); + memcg_ops__detach(skel); + memcg_ops__destroy(skel); + close(first); + close(second); + close(third); + cleanup_cgroup_environment(); +} --=20 2.43.0 From nobody Sat Feb 7 18:15:48 2026 Received: from out-174.mta0.migadu.com (out-174.mta0.migadu.com [91.218.175.174]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2E2883346BF for ; Tue, 27 Jan 2026 09:49:28 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.174 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507370; cv=none; 
b=nvvTmKF1ggz1jgF5MvATbT3X2BO/HxkchLPoiIJgUuLpejpRc4Aw6XSheNm1fmWsXcsmYYr7XUOBGMvaZ/mq1QolWJrWU4XuKYSxB9lBzJTYkMsH10zsqWBI0CoYhH8FTVLQkIQ1VU9YWMln0zg9TUN1Ng7f7CAkpvI9s13bczo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769507370; c=relaxed/simple; bh=S7Ug2ka6mO/w/YVzx9oNA2JjUvWPG/sbKL1Icf/GMUM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=V38aPChsLhjGvnzJShHd/11WlGGvidKthzGzW/FaZC++XC2CS/YQbKQFzrEbNrUAUF+wnOeb3hj9XiteXbpUQHoI1BHYqfsbyyI1zBYoFL7CFrfDn9R8ciBg6ODn+KKUeZrN7SjBh3jTG8sajJ8oPSjbrBKPD/TuI4eLZbTac4Q= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=AJiyh1rH; arc=none smtp.client-ip=91.218.175.174 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="AJiyh1rH" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1769507355; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=LNlKokYgxgtWwpk4PxEDaqPinFDBpx6E9i1VtbSdG98=; b=AJiyh1rHunCQlyZlUOqMxiENPEkvpYgXw/LOtImSreONfHFhLDLbSC4yfSWbB/rBc7wyIa zijHvy9PDzD6HHx+pBZwCDV7HrG/SUlK0hMkG/+SeVzgpgp7pSfOvFkLp2gE5p740/W/H0 D90oadXR/BrIjHGqeCD/EqdQf/KlCq0= From: Hui Zhu To: Andrew Morton , Johannes Weiner , Michal Hocko , Roman Gushchin , Shakeel Butt , Muchun Song , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Peter Zijlstra , Miguel Ojeda , Nathan Chancellor , Kees Cook , Tejun Heo , Jeff Xu , mkoutny@suse.com, Jan Hendrik Farr , Christian Brauner , Randy Dunlap , Brian Gerst , Masahiro Yamada , davem@davemloft.net, Jakub Kicinski , Jesper Dangaard Brouer , JP Kobryn , Willem de Bruijn , Jason Xing , Paul Chaignon , Anton Protopopov , Amery Hung , Chen Ridong , Lance Yang , Jiayuan Chen , linux-kernel@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org, bpf@vger.kernel.org, netdev@vger.kernel.org, linux-kselftest@vger.kernel.org Cc: Hui Zhu , Geliang Tang Subject: [RFC PATCH bpf-next v5 12/12] samples/bpf: Add memcg priority control example Date: Tue, 27 Jan 2026 17:48:55 +0800 Message-ID: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Hui Zhu Add a sample program to demonstrate a practical use case for the `memcg_bpf_ops` feature: priority-based memory throttling. 
The sample consists of a BPF program and a userspace loader: 1. memcg.bpf.c: A BPF program that monitors PGFAULT events on a high-priority cgroup. When activity exceeds a threshold, it uses the `get_high_delay_ms`, `below_low`, or `below_min` hooks to apply pressure on a low-priority cgroup. 2. memcg.c: A userspace loader that configures and attaches the BPF program. It takes command-line arguments for the high and low priority cgroup paths, a pressure threshold, and the desired throttling delay (`over_high_ms`). This provides a clear, working example of how to implement a dynamic, priority-aware memory management policy. A user can create two cgroups, run workloads of different priorities, and observe the low-priority workload being throttled to protect the high-priority one. Example usage: # ./memcg --low_path /sys/fs/cgroup/low \ # --high_path /sys/fs/cgroup/high \ # --threshold 100 --over_high_ms 1024 Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 8 +- samples/bpf/memcg.bpf.c | 130 +++++++++++++++ samples/bpf/memcg.c | 345 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 485 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/memcg.bpf.c create mode 100644 samples/bpf/memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 7e07bb330eae..819ef271e011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6470,6 +6470,8 @@ F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c +F: samples/bpf/memcg.bpf.c +F: samples/bpf/memcg.c F: samples/cgroup/* F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c F: tools/testing/selftests/bpf/progs/memcg_ops.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0002cd359fb1..0de6569cdefd 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -49,3 +49,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +memcg diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 
95a4fa1f1e44..b00698bdc53b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y +=3D xdp_fwd tprogs-y +=3D task_fd_query tprogs-y +=3D ibumad tprogs-y +=3D hbm +tprogs-y +=3D memcg =20 # Libbpf dependencies LIBBPF_SRC =3D $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y +=3D task_fd_query_kern.o always-y +=3D ibumad_kern.o always-y +=3D hbm_out_kern.o always-y +=3D hbm_edt_kern.o +always-y +=3D memcg.bpf.o =20 COMMON_CFLAGS =3D $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS =3D $(TPROGS_USER_LDFLAGS) @@ -289,6 +291,8 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h =20 +memcg: $(obj)/memcg.skel.h + # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # TPROGS_CFLAGS causes conflicts XDP_SAMPLE_CFLAGS +=3D -Wall -O2 \ @@ -347,11 +351,13 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src= )/xdp_sample.bpf.h $(src)/x -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ =20 -LINKED_SKELS :=3D xdp_router_ipv4.skel.h +LINKED_SKELS :=3D xdp_router_ipv4.skel.h memcg.skel.h clean-files +=3D $(LINKED_SKELS) =20 xdp_router_ipv4.skel.h-deps :=3D xdp_router_ipv4.bpf.o xdp_sample.bpf.o =20 +memcg.skel.h-deps :=3D memcg.bpf.o + LINKED_BPF_SRCS :=3D $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SK= ELS),$($(skel)-deps))) =20 BPF_SRCS_LINKED :=3D $(notdir $(wildcard $(src)/*.bpf.c)) diff --git a/samples/bpf/memcg.bpf.c b/samples/bpf/memcg.bpf.c new file mode 100644 index 000000000000..e611ac0e641a --- /dev/null +++ b/samples/bpf/memcg.bpf.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + 
__uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key =3D 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id !=3D local_config.high_cgroup_id || + (ctx->item !=3D PGFAULT)) + goto out; + + data =3D bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum +=3D ctx->val; + } else { + data->window_start_ts =3D current_ts; + data->sum =3D ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, &current_ts, + BPF_ANY); + data->sum =3D 0; + data->window_start_ts =3D current_ts; + } + +out: + return 0; +} + +static bool need_threshold(void) +{ + u32 key =3D 0; + u64 *trigger_ts; + bool ret =3D false; + u64 current_ts; + + trigger_ts =3D bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts =3D=3D 0) + goto out; + + current_ts =3D bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret =3D true; + +out: + return ret; +} + +SEC("struct_ops/below_low") +unsigned int below_low_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_low) + return false; + + return need_threshold(); +} + +SEC("struct_ops/below_min") +unsigned int below_min_impl(struct mem_cgroup *memcg) +{ + if (!local_config.use_below_min) + return false; + + return need_threshold(); +} + +SEC("struct_ops/get_high_delay_ms") +unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg) +{ + if (local_config.over_high_ms && need_threshold()) + return local_config.over_high_ms; + + return 0; +} + 
+SEC(".struct_ops.link") +struct memcg_bpf_ops high_mcg_ops =3D { + .below_low =3D (void *)below_low_impl, + .below_min =3D (void *)below_min_impl, +}; + +SEC(".struct_ops.link") +struct memcg_bpf_ops low_mcg_ops =3D { + .get_high_delay_ms =3D (void *)get_high_delay_ms_impl, +}; + +char LICENSE[] SEC("license") =3D "GPL"; diff --git a/samples/bpf/memcg.c b/samples/bpf/memcg.c new file mode 100644 index 000000000000..0c47ed53f6ae --- /dev/null +++ b/samples/bpf/memcg.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __MEMCG_RSTAT_SIMPLE_BPF_SKEL_H__ +#define u64 uint64_t +#endif + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + bool use_below_low; + bool use_below_min; + unsigned int over_high_ms; +} local_config; + +#include "memcg.skel.h" + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting =3D true; +} + +static void usage(char *name) +{ + fprintf(stderr, + "Usage: %s --low_path=3D --high_path=3D \\\n" + " --threshold=3D [OPTIONS]\n\n", + name); + fprintf(stderr, "Required arguments:\n"); + fprintf(stderr, + " -l, --low_path=3DPATH Low priority memcgroup path\n"); + fprintf(stderr, + " -g, --high_path=3DPATH High priority memcgroup path\n"); + fprintf(stderr, + " -t, --threshold=3DVALUE The sum of 'val' PGSCAN of\n"); + fprintf(stderr, + " high priority memcgroup in\n"); + fprintf(stderr, + " 1 sec to trigger low priority\n"); + fprintf(stderr, + " cgroup over_high\n\n"); + fprintf(stderr, "Optional arguments:\n"); + fprintf(stderr, " -o, --over_high_ms=3DVALUE\n"); + fprintf(stderr, + " Low_path over_high_ms value\n"); + fprintf(stderr, + " (default: 0)\n"); + fprintf(stderr, " -L, --use_below_low Enable use_below_low flag\n"); + fprintf(stderr, " -M, --use_below_min Enable use_below_min flag\n"); + fprintf(stderr, + " -O, --allow_override Enable 
BPF_F_ALLOW_OVERRIDE\n"); + fprintf(stderr, + " flag\n"); + fprintf(stderr, " -h, --help Show this help message\n\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " # Using long options:\n"); + fprintf(stderr, " %s --low_path=3D/sys/fs/cgroup/low \\\n", name); + fprintf(stderr, " --high_path=3D/sys/fs/cgroup/high \\\n"); + fprintf(stderr, " --threshold=3D1000 --over_high_ms=3D500 \\\n" + " --use_below_low\n\n"); + fprintf(stderr, " # Using short options:\n"); + fprintf(stderr, " %s -l /sys/fs/cgroup/low \\\n" + " -g /sys/fs/cgroup/high \\\n", + name); + fprintf(stderr, " -t 1000 -o 500 -L -M\n"); +} + +static uint64_t get_cgroup_id(const char *cgroup_path) +{ + struct stat st; + + if (cgroup_path =3D=3D NULL) { + fprintf(stderr, "Error: cgroup_path is NULL\n"); + return 0; + } + + if (stat(cgroup_path, &st) < 0) { + fprintf(stderr, "Error: stat(%s) failed: %d\n", + cgroup_path, errno); + return 0; + } + + return (uint64_t)st.st_ino; +} + +static uint64_t parse_u64(const char *str, const char *name) +{ + uint64_t value; + + errno =3D 0; + value =3D strtoull(str, NULL, 10); + + if (errno !=3D 0) { + fprintf(stderr, + "ERROR: strtoull '%s' failed: %d\n", + str, errno); + usage(name); + exit(-errno); + } + + return value; +} + +int main(int argc, char **argv) +{ + int low_cgroup_fd =3D -1, high_cgroup_fd =3D -1; + uint64_t threshold =3D 0, high_cgroup_id; + unsigned int over_high_ms =3D 0; + bool use_below_low =3D false, use_below_min =3D false; + __u32 opts_flags =3D 0; + const char *low_path =3D NULL; + const char *high_path =3D NULL; + const char *bpf_obj_file =3D "memcg.bpf.o"; + struct bpf_object *obj =3D NULL; + struct bpf_program *prog =3D NULL; + struct bpf_link *link =3D NULL, *link_low =3D NULL, *link_high =3D NULL; + struct bpf_map *map; + struct memcg__bss *bss_data; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int err =3D -EINVAL; + int map_fd; + int opt; + int option_index =3D 0; + + static struct option long_options[] =3D { + {"low_path", 
required_argument, 0, 'l'}, + {"high_path", required_argument, 0, 'g'}, + {"threshold", required_argument, 0, 't'}, + {"over_high_ms", required_argument, 0, 'o'}, + {"use_below_low", no_argument, 0, 'L'}, + {"use_below_min", no_argument, 0, 'M'}, + {"allow_override", no_argument, 0, 'O'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0 } + }; + + while ((opt =3D getopt_long(argc, argv, "l:g:t:o:LMOh", + long_options, &option_index)) !=3D -1) { + switch (opt) { + case 'l': + low_path =3D optarg; + break; + case 'g': + high_path =3D optarg; + break; + case 't': + threshold =3D parse_u64(optarg, argv[0]); + break; + case 'o': + over_high_ms =3D (unsigned int)parse_u64(optarg, argv[0]); + break; + case 'L': + use_below_low =3D true; + break; + case 'M': + use_below_min =3D true; + break; + case 'O': + opts_flags =3D BPF_F_ALLOW_OVERRIDE; + break; + case 'h': + usage(argv[0]); + return 0; + default: + usage(argv[0]); + return -EINVAL; + } + } + + if (!low_path || !high_path || !threshold) { + fprintf(stderr, + "ERROR: Missing required arguments\n\n"); + usage(argv[0]); + goto out; + } + + low_cgroup_fd =3D open(low_path, O_RDONLY); + if (low_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open low cgroup '%s' failed: %d\n", + low_path, errno); + err =3D -errno; + goto out; + } + + high_cgroup_id =3D get_cgroup_id(high_path); + if (!high_cgroup_id) + goto out; + high_cgroup_fd =3D open(high_path, O_RDONLY); + if (high_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open high cgroup '%s' failed: %d\n", + high_path, errno); + err =3D -errno; + goto out; + } + + obj =3D bpf_object__open_file(bpf_obj_file, NULL); + err =3D libbpf_get_error(obj); + if (err) { + fprintf(stderr, + "ERROR: opening BPF object file '%s' failed: %d\n", + bpf_obj_file, err); + goto out; + } + + map =3D bpf_object__find_map_by_name(obj, ".bss"); + if (!map) { + fprintf(stderr, "ERROR: Failed to find .bss map\n"); + err =3D -ESRCH; + goto out; + } + + err =3D bpf_object__load(obj); + if (err) { + 
fprintf(stderr, + "ERROR: loading BPF object file failed: %d\n", + err); + goto out; + } + + map_fd =3D bpf_map__fd(map); + bss_data =3D malloc(bpf_map__value_size(map)); + if (bss_data) { + __u32 key =3D 0; + + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id =3D high_cgroup_id; + bss_data->local_config.threshold =3D threshold; + bss_data->local_config.over_high_ms =3D over_high_ms; + bss_data->local_config.use_below_low =3D use_below_low; + bss_data->local_config.use_below_min =3D use_below_min; + + err =3D bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (err) { + fprintf(stderr, + "ERROR: update config failed: %d\n", + err); + goto out; + } + } else { + fprintf(stderr, + "ERROR: allocate memory failed\n"); + err =3D -ENOMEM; + goto out; + } + + prog =3D bpf_object__find_program_by_name(obj, + "handle_count_memcg_events"); + if (!prog) { + fprintf(stderr, + "ERROR: finding a prog in BPF object file failed\n"); + goto out; + } + + link =3D bpf_program__attach(prog); + err =3D libbpf_get_error(link); + if (err) { + fprintf(stderr, + "ERROR: bpf_program__attach failed: %d\n", + err); + goto out; + } + + if (over_high_ms) { + map =3D bpf_object__find_map_by_name(obj, "low_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find low_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D low_cgroup_fd, + ); + link_low =3D bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_low) { + fprintf(stderr, + "Failed to attach struct ops low_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + if (use_below_low || use_below_min) { + map =3D bpf_object__find_map_by_name(obj, "high_mcg_ops"); + if (!map) { + fprintf(stderr, + "ERROR: Failed to find high_mcg_ops map\n"); + err =3D -ESRCH; + goto out; + } + LIBBPF_OPTS_RESET(opts, + .flags =3D opts_flags, + .relative_fd =3D high_cgroup_fd, + ); + link_high =3D 
bpf_map__attach_struct_ops_opts(map, &opts); + if (!link_high) { + fprintf(stderr, + "Failed to attach struct ops high_mcg_ops: %d\n", + errno); + err =3D -errno; + goto out; + } + } + + printf("Successfully attached!\n"); + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + while (!exiting) + pause(); + + printf("Exiting...\n"); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link_low); + bpf_link__destroy(link_high); + bpf_object__close(obj); + close(low_cgroup_fd); + close(high_cgroup_fd); + return err; +} --=20 2.43.0