Implement a new bpf_psi_create_trigger() bpf kfunc, which allows
creating new psi triggers that are either attached to a cgroup or
system-wide.
Created triggers will exist as long as the struct ops is loaded and,
if they are attached to a cgroup, as long as the cgroup exists.
Due to a limitation of 5 arguments, the resource type and the "full"
bit are squeezed into a single u32.
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
index 2ea9d7276b21..94b684221708 100644
--- a/kernel/sched/bpf_psi.c
+++ b/kernel/sched/bpf_psi.c
@@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
.is_valid_access = bpf_psi_ops_is_valid_access,
};
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_psi_create_trigger - Create a PSI trigger
+ * @bpf_psi: bpf_psi struct to attach the trigger to
+ * @cgroup_id: ID of the cgroup to attach the trigger to; 0 for system-wide scope
+ * @resource: PSI resource (PSI_MEM, PSI_IO, etc), optionally | BPF_PSI_FULL
+ * @threshold_us: threshold in us
+ * @window_us: window in us
+ *
+ * Creates a PSI trigger and attaches it to @bpf_psi. The trigger stays
+ * active until the bpf struct ops is unloaded or the corresponding cgroup
+ * is deleted.
+ *
+ * The BPF_PSI_FULL bit of @resource selects whether the "full" (bit set)
+ * or the "some" (bit clear) PSI state is tracked.
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
+				       u64 cgroup_id, u32 resource,
+				       u32 threshold_us, u32 window_us)
+{
+	enum psi_res res = resource & ~BPF_PSI_FULL;
+	bool full = resource & BPF_PSI_FULL;
+	struct psi_trigger_params params;
+	struct cgroup *cgroup __maybe_unused = NULL;
+	struct psi_group *group = &psi_system;
+	struct psi_trigger *t;
+	int ret = 0;
+
+	if (res >= NR_PSI_RESOURCES)
+		return -EINVAL;
+
+#ifdef CONFIG_CGROUPS
+	if (cgroup_id) {
+		cgroup = cgroup_get_from_id(cgroup_id);
+		/* PTR_ERR(NULL) is 0: never report a failed lookup as success */
+		if (IS_ERR_OR_NULL(cgroup))
+			return cgroup ? PTR_ERR(cgroup) : -ENOENT;
+
+		group = cgroup_psi(cgroup);
+	}
+#endif
+
+	params.type = PSI_BPF;
+	params.bpf_psi = bpf_psi;
+	params.privileged = capable(CAP_SYS_RESOURCE);
+	params.res = res;
+	params.full = full;
+	params.threshold_us = threshold_us;
+	params.window_us = window_us;
+
+	t = psi_trigger_create(group, &params);
+	if (IS_ERR(t))
+		ret = PTR_ERR(t);
+	else
+		t->cgroup_id = cgroup_id;
+
+#ifdef CONFIG_CGROUPS
+	if (cgroup)
+		cgroup_put(cgroup);
+#endif
+
+	return ret;
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_psi_kfuncs) /* BTF ID set of the bpf psi kfuncs */
+BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS) /* caller must pass trusted args */
+BTF_KFUNCS_END(bpf_psi_kfuncs)
+
+static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_psi_kfuncs, /* IDs listed between BTF_KFUNCS_START/END */
+}; /* registered for BPF_PROG_TYPE_STRUCT_OPS in bpf_psi_struct_ops_init() */
+
static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
{
struct bpf_psi_ops *ops = kdata;
@@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void)
if (!bpf_psi_wq)
return -ENOMEM;
+ err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &bpf_psi_kfunc_set);
+ if (err) {
+ pr_warn("error while registering bpf psi kfuncs: %d", err);
+ goto err;
+ }
+
err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops);
if (err) {
pr_warn("error while registering bpf psi struct ops: %d", err);
--
2.50.1
On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Implement a new bpf_psi_create_trigger() bpf kfunc, which allows > to create new psi triggers and attach them to cgroups or be > system-wide. > > Created triggers will exist until the struct ops is loaded and > if they are attached to a cgroup until the cgroup exists. > > Due to a limitation of 5 arguments, the resource type and the "full" > bit are squeezed into a single u32. > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > --- > kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 84 insertions(+) > > diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c > index 2ea9d7276b21..94b684221708 100644 > --- a/kernel/sched/bpf_psi.c > +++ b/kernel/sched/bpf_psi.c > @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = { > .is_valid_access = bpf_psi_ops_is_valid_access, > }; > > +__bpf_kfunc_start_defs(); > + > +/** > + * bpf_psi_create_trigger - Create a PSI trigger > + * @bpf_psi: bpf_psi struct to attach the trigger to > + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope > + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit. > + * @threshold_us: threshold in us > + * @window_us: window in us > + * > + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be > + * active unless bpf struct ops is unloaded or the corresponding cgroup > + * is deleted. > + * > + * Resource's most significant bit encodes whether "some" or "full" > + * PSI state should be tracked. > + * > + * Returns 0 on success and the error code on failure. 
> + */ > +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi, > + u64 cgroup_id, u32 resource, > + u32 threshold_us, u32 window_us) > +{ > + enum psi_res res = resource & ~BPF_PSI_FULL; > + bool full = resource & BPF_PSI_FULL; > + struct psi_trigger_params params; > + struct cgroup *cgroup __maybe_unused = NULL; > + struct psi_group *group; > + struct psi_trigger *t; > + int ret = 0; > + > + if (res >= NR_PSI_RESOURCES) > + return -EINVAL; > + > +#ifdef CONFIG_CGROUPS > + if (cgroup_id) { > + cgroup = cgroup_get_from_id(cgroup_id); > + if (IS_ERR_OR_NULL(cgroup)) > + return PTR_ERR(cgroup); > + > + group = cgroup_psi(cgroup); > + } else > +#endif > + group = &psi_system; just a drive-by comment while skimming through the patch set: can't you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with proper {} ? > + > + params.type = PSI_BPF; > + params.bpf_psi = bpf_psi; > + params.privileged = capable(CAP_SYS_RESOURCE); > + params.res = res; > + params.full = full; > + params.threshold_us = threshold_us; > + params.window_us = window_us; > + > + t = psi_trigger_create(group, ¶ms); > + if (IS_ERR(t)) > + ret = PTR_ERR(t); > + else > + t->cgroup_id = cgroup_id; > + > +#ifdef CONFIG_CGROUPS > + if (cgroup) > + cgroup_put(cgroup); > +#endif > + > + return ret; > +} > +__bpf_kfunc_end_defs(); > + > +BTF_KFUNCS_START(bpf_psi_kfuncs) > +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS) > +BTF_KFUNCS_END(bpf_psi_kfuncs) > + > +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = { > + .owner = THIS_MODULE, > + .set = &bpf_psi_kfuncs, > +}; > + > static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link) > { > struct bpf_psi_ops *ops = kdata; > @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void) > if (!bpf_psi_wq) > return -ENOMEM; > > + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, > + &bpf_psi_kfunc_set); would this make kfunc callable from any struct_ops, not just this psi one? 
> + if (err) { > + pr_warn("error while registering bpf psi kfuncs: %d", err); > + goto err; > + } > + > err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops); > if (err) { > pr_warn("error while registering bpf psi struct ops: %d", err); > -- > 2.50.1 > >
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin > <roman.gushchin@linux.dev> wrote: >> >> Implement a new bpf_psi_create_trigger() bpf kfunc, which allows >> to create new psi triggers and attach them to cgroups or be >> system-wide. >> >> Created triggers will exist until the struct ops is loaded and >> if they are attached to a cgroup until the cgroup exists. >> >> Due to a limitation of 5 arguments, the resource type and the "full" >> bit are squeezed into a single u32. >> >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> >> --- >> kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 84 insertions(+) >> >> diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c >> index 2ea9d7276b21..94b684221708 100644 >> --- a/kernel/sched/bpf_psi.c >> +++ b/kernel/sched/bpf_psi.c >> @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = { >> .is_valid_access = bpf_psi_ops_is_valid_access, >> }; >> >> +__bpf_kfunc_start_defs(); >> + >> +/** >> + * bpf_psi_create_trigger - Create a PSI trigger >> + * @bpf_psi: bpf_psi struct to attach the trigger to >> + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope >> + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit. >> + * @threshold_us: threshold in us >> + * @window_us: window in us >> + * >> + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be >> + * active unless bpf struct ops is unloaded or the corresponding cgroup >> + * is deleted. >> + * >> + * Resource's most significant bit encodes whether "some" or "full" >> + * PSI state should be tracked. >> + * >> + * Returns 0 on success and the error code on failure. 
>> + */ >> +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi, >> + u64 cgroup_id, u32 resource, >> + u32 threshold_us, u32 window_us) >> +{ >> + enum psi_res res = resource & ~BPF_PSI_FULL; >> + bool full = resource & BPF_PSI_FULL; >> + struct psi_trigger_params params; >> + struct cgroup *cgroup __maybe_unused = NULL; >> + struct psi_group *group; >> + struct psi_trigger *t; >> + int ret = 0; >> + >> + if (res >= NR_PSI_RESOURCES) >> + return -EINVAL; >> + >> +#ifdef CONFIG_CGROUPS >> + if (cgroup_id) { >> + cgroup = cgroup_get_from_id(cgroup_id); >> + if (IS_ERR_OR_NULL(cgroup)) >> + return PTR_ERR(cgroup); >> + >> + group = cgroup_psi(cgroup); >> + } else >> +#endif >> + group = &psi_system; > > just a drive-by comment while skimming through the patch set: can't > you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with > proper {} ? Fixed. It required defining cgroup_get_from_id() and cgroup_psi() for !CONFIG_CGROUPS, but I agree, it's much better. Thanks > >> + >> + params.type = PSI_BPF; >> + params.bpf_psi = bpf_psi; >> + params.privileged = capable(CAP_SYS_RESOURCE); >> + params.res = res; >> + params.full = full; >> + params.threshold_us = threshold_us; >> + params.window_us = window_us; >> + >> + t = psi_trigger_create(group, ¶ms); >> + if (IS_ERR(t)) >> + ret = PTR_ERR(t); >> + else >> + t->cgroup_id = cgroup_id; >> + >> +#ifdef CONFIG_CGROUPS >> + if (cgroup) >> + cgroup_put(cgroup); >> +#endif >> + >> + return ret; >> +} >> +__bpf_kfunc_end_defs(); >> + >> +BTF_KFUNCS_START(bpf_psi_kfuncs) >> +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS) >> +BTF_KFUNCS_END(bpf_psi_kfuncs) >> + >> +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = { >> + .owner = THIS_MODULE, >> + .set = &bpf_psi_kfuncs, >> +}; >> + >> static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link) >> { >> struct bpf_psi_ops *ops = kdata; >> @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void) >> if (!bpf_psi_wq) >> return 
-ENOMEM; >> >> + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, >> + &bpf_psi_kfunc_set); > > would this make kfunc callable from any struct_ops, not just this psi > one? It will. Idk how big of a problem it is, given that the caller needs a trusted reference to bpf_psi. Also, is there a simple way to constrain it? Wdyt?
On 8/20/25 5:36 PM, Roman Gushchin wrote: > It will. Idk how big of a problem it is, given that the caller needs > a trusted reference to bpf_psi. Also, is there a simple way to constrain > it? Wdyt? The bpf qdisc has the kfunc filtering. Take a look at the bpf_qdisc_kfunc_filter in bpf_qdisc.c.
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 8/20/25 5:36 PM, Roman Gushchin wrote: >> It will. Idk how big of a problem it is, given that the caller needs >> a trusted reference to bpf_psi. Also, is there a simple way to constrain >> it? Wdyt? > > The bpf qdisc has the kfunc filtering. Take a look at the > bpf_qdisc_kfunc_filter in bpf_qdisc.c. Thanks! I'll take a look.
On Wed, Aug 20, 2025 at 5:36 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > > > On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin > > <roman.gushchin@linux.dev> wrote: > >> > >> Implement a new bpf_psi_create_trigger() bpf kfunc, which allows > >> to create new psi triggers and attach them to cgroups or be > >> system-wide. > >> > >> Created triggers will exist until the struct ops is loaded and > >> if they are attached to a cgroup until the cgroup exists. > >> > >> Due to a limitation of 5 arguments, the resource type and the "full" > >> bit are squeezed into a single u32. > >> > >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > >> --- > >> kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++ > >> 1 file changed, 84 insertions(+) > >> > >> diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c > >> index 2ea9d7276b21..94b684221708 100644 > >> --- a/kernel/sched/bpf_psi.c > >> +++ b/kernel/sched/bpf_psi.c > >> @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = { > >> .is_valid_access = bpf_psi_ops_is_valid_access, > >> }; > >> > >> +__bpf_kfunc_start_defs(); > >> + > >> +/** > >> + * bpf_psi_create_trigger - Create a PSI trigger > >> + * @bpf_psi: bpf_psi struct to attach the trigger to > >> + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope > >> + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit. > >> + * @threshold_us: threshold in us > >> + * @window_us: window in us > >> + * > >> + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be > >> + * active unless bpf struct ops is unloaded or the corresponding cgroup > >> + * is deleted. > >> + * > >> + * Resource's most significant bit encodes whether "some" or "full" > >> + * PSI state should be tracked. > >> + * > >> + * Returns 0 on success and the error code on failure. 
> >> + */ > >> +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi, > >> + u64 cgroup_id, u32 resource, > >> + u32 threshold_us, u32 window_us) > >> +{ > >> + enum psi_res res = resource & ~BPF_PSI_FULL; > >> + bool full = resource & BPF_PSI_FULL; > >> + struct psi_trigger_params params; > >> + struct cgroup *cgroup __maybe_unused = NULL; > >> + struct psi_group *group; > >> + struct psi_trigger *t; > >> + int ret = 0; > >> + > >> + if (res >= NR_PSI_RESOURCES) > >> + return -EINVAL; > >> + > >> +#ifdef CONFIG_CGROUPS > >> + if (cgroup_id) { > >> + cgroup = cgroup_get_from_id(cgroup_id); > >> + if (IS_ERR_OR_NULL(cgroup)) > >> + return PTR_ERR(cgroup); > >> + > >> + group = cgroup_psi(cgroup); > >> + } else > >> +#endif > >> + group = &psi_system; > > > > just a drive-by comment while skimming through the patch set: can't > > you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with > > proper {} ? > > Fixed. > It required defining cgroup_get_from_id() and cgroup_psi() > for !CONFIG_CGROUPS, but I agree, it's much better. 
> Thanks > > > > >> + > >> + params.type = PSI_BPF; > >> + params.bpf_psi = bpf_psi; > >> + params.privileged = capable(CAP_SYS_RESOURCE); > >> + params.res = res; > >> + params.full = full; > >> + params.threshold_us = threshold_us; > >> + params.window_us = window_us; > >> + > >> + t = psi_trigger_create(group, ¶ms); > >> + if (IS_ERR(t)) > >> + ret = PTR_ERR(t); > >> + else > >> + t->cgroup_id = cgroup_id; > >> + > >> +#ifdef CONFIG_CGROUPS > >> + if (cgroup) > >> + cgroup_put(cgroup); > >> +#endif > >> + > >> + return ret; > >> +} > >> +__bpf_kfunc_end_defs(); > >> + > >> +BTF_KFUNCS_START(bpf_psi_kfuncs) > >> +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS) > >> +BTF_KFUNCS_END(bpf_psi_kfuncs) > >> + > >> +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = { > >> + .owner = THIS_MODULE, > >> + .set = &bpf_psi_kfuncs, > >> +}; > >> + > >> static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link) > >> { > >> struct bpf_psi_ops *ops = kdata; > >> @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void) > >> if (!bpf_psi_wq) > >> return -ENOMEM; > >> > >> + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, > >> + &bpf_psi_kfunc_set); > > > > would this make kfunc callable from any struct_ops, not just this psi > > one? > > It will. Idk how big of a problem it is, given that the caller needs > a trusted reference to bpf_psi. Yes, I agree, probably not a big deal. > Also, is there a simple way to constrain it? Wdyt? We've talked about having the ability to restrict kfuncs to specific struct_ops types, but I don't think we've ever made much progress on this. So no, I don't think there is a simple way.
© 2016 - 2025 Red Hat, Inc.