Implement a new bpf_psi_create_trigger() bpf kfunc, which allows
creating new psi triggers and either attaching them to cgroups or
making them system-wide.
Created triggers exist for as long as the struct ops is loaded and,
if they are attached to a cgroup, for as long as the cgroup exists.
Due to a limitation of 5 arguments, the resource type and the "full"
bit are squeezed into a single u32.
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
index 2ea9d7276b21..94b684221708 100644
--- a/kernel/sched/bpf_psi.c
+++ b/kernel/sched/bpf_psi.c
@@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
.is_valid_access = bpf_psi_ops_is_valid_access,
};
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_psi_create_trigger - Create a PSI trigger
+ * @bpf_psi: bpf_psi struct to attach the trigger to
+ * @cgroup_id: ID of the cgroup to attach the trigger to; 0 for system-wide scope
+ * @resource: resource to monitor (PSI_MEM, PSI_IO, etc), optionally ORed
+ *	      with BPF_PSI_FULL
+ * @threshold_us: threshold in us
+ * @window_us: window in us
+ *
+ * Creates a PSI trigger and attaches it to @bpf_psi. The trigger will be
+ * active until the bpf struct ops is unloaded or the corresponding cgroup
+ * is deleted.
+ *
+ * The BPF_PSI_FULL bit of @resource encodes whether the "full" (bit set)
+ * or the "some" (bit clear) PSI state should be tracked; the remaining
+ * bits select the resource.
+ *
+ * Return: 0 on success, a negative error code on failure (-EINVAL for an
+ * unknown resource, otherwise the error from the cgroup lookup or from
+ * psi_trigger_create()).
+ */
+__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
+				       u64 cgroup_id, u32 resource,
+				       u32 threshold_us, u32 window_us)
+{
+	enum psi_res res = resource & ~BPF_PSI_FULL;
+	bool full = resource & BPF_PSI_FULL;
+	/* Zero-initialize so no member is ever passed down uninitialized. */
+	struct psi_trigger_params params = {};
+	struct cgroup *cgroup __maybe_unused = NULL;
+	/* System-wide scope unless a cgroup is looked up below. */
+	struct psi_group *group = &psi_system;
+	struct psi_trigger *t;
+	int ret = 0;
+
+	if (res >= NR_PSI_RESOURCES)
+		return -EINVAL;
+
+#ifdef CONFIG_CGROUPS
+	if (cgroup_id) {
+		cgroup = cgroup_get_from_id(cgroup_id);
+		/*
+		 * PTR_ERR(NULL) is 0, which would report success without
+		 * creating any trigger, so map a NULL result to -ENOENT.
+		 */
+		if (IS_ERR_OR_NULL(cgroup))
+			return cgroup ? PTR_ERR(cgroup) : -ENOENT;
+
+		group = cgroup_psi(cgroup);
+	}
+#endif
+
+	params.type = PSI_BPF;
+	params.bpf_psi = bpf_psi;
+	params.privileged = capable(CAP_SYS_RESOURCE);
+	params.res = res;
+	params.full = full;
+	params.threshold_us = threshold_us;
+	params.window_us = window_us;
+
+	t = psi_trigger_create(group, &params);
+	if (IS_ERR(t))
+		ret = PTR_ERR(t);
+	else
+		t->cgroup_id = cgroup_id;
+
+#ifdef CONFIG_CGROUPS
+	/* Drop the reference taken by cgroup_get_from_id(). */
+	if (cgroup)
+		cgroup_put(cgroup);
+#endif
+
+	return ret;
+}
+__bpf_kfunc_end_defs();
+
+/*
+ * BTF ID set listing the bpf psi kfuncs. bpf_psi_create_trigger() is
+ * flagged KF_TRUSTED_ARGS.
+ */
+BTF_KFUNCS_START(bpf_psi_kfuncs)
+BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_psi_kfuncs)
+
+/*
+ * kfunc ID set wrapping bpf_psi_kfuncs, owned by this module.
+ *
+ * NOTE(review): no kfunc filter is configured here. If this set is
+ * registered for BPF_PROG_TYPE_STRUCT_OPS, the kfunc is presumably callable
+ * from any struct_ops program, not only bpf psi ones -- TODO confirm and
+ * decide whether a filter (cf. bpf_qdisc_kfunc_filter in bpf_qdisc.c) is
+ * wanted.
+ */
+static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_psi_kfuncs,
+};
+
static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
{
struct bpf_psi_ops *ops = kdata;
@@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void)
if (!bpf_psi_wq)
return -ENOMEM;
+ err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &bpf_psi_kfunc_set);
+ if (err) {
+ pr_warn("error while registering bpf psi kfuncs: %d", err);
+ goto err;
+ }
+
err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops);
if (err) {
pr_warn("error while registering bpf psi struct ops: %d", err);
--
2.50.1
On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Implement a new bpf_psi_create_trigger() bpf kfunc, which allows
> to create new psi triggers and attach them to cgroups or be
> system-wide.
>
> Created triggers will exist until the struct ops is loaded and
> if they are attached to a cgroup until the cgroup exists.
>
> Due to a limitation of 5 arguments, the resource type and the "full"
> bit are squeezed into a single u32.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
> kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 84 insertions(+)
>
> diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
> index 2ea9d7276b21..94b684221708 100644
> --- a/kernel/sched/bpf_psi.c
> +++ b/kernel/sched/bpf_psi.c
> @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
> .is_valid_access = bpf_psi_ops_is_valid_access,
> };
>
> +__bpf_kfunc_start_defs();
> +
> +/**
> + * bpf_psi_create_trigger - Create a PSI trigger
> + * @bpf_psi: bpf_psi struct to attach the trigger to
> + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope
> + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit.
> + * @threshold_us: threshold in us
> + * @window_us: window in us
> + *
> + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be
> + * active unless bpf struct ops is unloaded or the corresponding cgroup
> + * is deleted.
> + *
> + * Resource's most significant bit encodes whether "some" or "full"
> + * PSI state should be tracked.
> + *
> + * Returns 0 on success and the error code on failure.
> + */
> +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
> + u64 cgroup_id, u32 resource,
> + u32 threshold_us, u32 window_us)
> +{
> + enum psi_res res = resource & ~BPF_PSI_FULL;
> + bool full = resource & BPF_PSI_FULL;
> + struct psi_trigger_params params;
> + struct cgroup *cgroup __maybe_unused = NULL;
> + struct psi_group *group;
> + struct psi_trigger *t;
> + int ret = 0;
> +
> + if (res >= NR_PSI_RESOURCES)
> + return -EINVAL;
> +
> +#ifdef CONFIG_CGROUPS
> + if (cgroup_id) {
> + cgroup = cgroup_get_from_id(cgroup_id);
> + if (IS_ERR_OR_NULL(cgroup))
> + return PTR_ERR(cgroup);
> +
> + group = cgroup_psi(cgroup);
> + } else
> +#endif
> + group = &psi_system;
just a drive-by comment while skimming through the patch set: can't
you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with
proper {} ?
> +
> + params.type = PSI_BPF;
> + params.bpf_psi = bpf_psi;
> + params.privileged = capable(CAP_SYS_RESOURCE);
> + params.res = res;
> + params.full = full;
> + params.threshold_us = threshold_us;
> + params.window_us = window_us;
> +
> + t = psi_trigger_create(group, &params);
> + if (IS_ERR(t))
> + ret = PTR_ERR(t);
> + else
> + t->cgroup_id = cgroup_id;
> +
> +#ifdef CONFIG_CGROUPS
> + if (cgroup)
> + cgroup_put(cgroup);
> +#endif
> +
> + return ret;
> +}
> +__bpf_kfunc_end_defs();
> +
> +BTF_KFUNCS_START(bpf_psi_kfuncs)
> +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS)
> +BTF_KFUNCS_END(bpf_psi_kfuncs)
> +
> +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
> + .owner = THIS_MODULE,
> + .set = &bpf_psi_kfuncs,
> +};
> +
> static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
> {
> struct bpf_psi_ops *ops = kdata;
> @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void)
> if (!bpf_psi_wq)
> return -ENOMEM;
>
> + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
> + &bpf_psi_kfunc_set);
would this make kfunc callable from any struct_ops, not just this psi one?
> + if (err) {
> + pr_warn("error while registering bpf psi kfuncs: %d", err);
> + goto err;
> + }
> +
> err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops);
> if (err) {
> pr_warn("error while registering bpf psi struct ops: %d", err);
> --
> 2.50.1
>
>
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
> On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
>>
>> Implement a new bpf_psi_create_trigger() bpf kfunc, which allows
>> to create new psi triggers and attach them to cgroups or be
>> system-wide.
>>
>> Created triggers will exist until the struct ops is loaded and
>> if they are attached to a cgroup until the cgroup exists.
>>
>> Due to a limitation of 5 arguments, the resource type and the "full"
>> bit are squeezed into a single u32.
>>
>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
>> ---
>> kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 84 insertions(+)
>>
>> diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
>> index 2ea9d7276b21..94b684221708 100644
>> --- a/kernel/sched/bpf_psi.c
>> +++ b/kernel/sched/bpf_psi.c
>> @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
>> .is_valid_access = bpf_psi_ops_is_valid_access,
>> };
>>
>> +__bpf_kfunc_start_defs();
>> +
>> +/**
>> + * bpf_psi_create_trigger - Create a PSI trigger
>> + * @bpf_psi: bpf_psi struct to attach the trigger to
>> + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope
>> + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit.
>> + * @threshold_us: threshold in us
>> + * @window_us: window in us
>> + *
>> + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be
>> + * active unless bpf struct ops is unloaded or the corresponding cgroup
>> + * is deleted.
>> + *
>> + * Resource's most significant bit encodes whether "some" or "full"
>> + * PSI state should be tracked.
>> + *
>> + * Returns 0 on success and the error code on failure.
>> + */
>> +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
>> + u64 cgroup_id, u32 resource,
>> + u32 threshold_us, u32 window_us)
>> +{
>> + enum psi_res res = resource & ~BPF_PSI_FULL;
>> + bool full = resource & BPF_PSI_FULL;
>> + struct psi_trigger_params params;
>> + struct cgroup *cgroup __maybe_unused = NULL;
>> + struct psi_group *group;
>> + struct psi_trigger *t;
>> + int ret = 0;
>> +
>> + if (res >= NR_PSI_RESOURCES)
>> + return -EINVAL;
>> +
>> +#ifdef CONFIG_CGROUPS
>> + if (cgroup_id) {
>> + cgroup = cgroup_get_from_id(cgroup_id);
>> + if (IS_ERR_OR_NULL(cgroup))
>> + return PTR_ERR(cgroup);
>> +
>> + group = cgroup_psi(cgroup);
>> + } else
>> +#endif
>> + group = &psi_system;
>
> just a drive-by comment while skimming through the patch set: can't
> you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with
> proper {} ?
Fixed.
It required defining cgroup_get_from_id() and cgroup_psi()
for !CONFIG_CGROUPS, but I agree, it's much better.
Thanks
>
>> +
>> + params.type = PSI_BPF;
>> + params.bpf_psi = bpf_psi;
>> + params.privileged = capable(CAP_SYS_RESOURCE);
>> + params.res = res;
>> + params.full = full;
>> + params.threshold_us = threshold_us;
>> + params.window_us = window_us;
>> +
>> + t = psi_trigger_create(group, &params);
>> + if (IS_ERR(t))
>> + ret = PTR_ERR(t);
>> + else
>> + t->cgroup_id = cgroup_id;
>> +
>> +#ifdef CONFIG_CGROUPS
>> + if (cgroup)
>> + cgroup_put(cgroup);
>> +#endif
>> +
>> + return ret;
>> +}
>> +__bpf_kfunc_end_defs();
>> +
>> +BTF_KFUNCS_START(bpf_psi_kfuncs)
>> +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS)
>> +BTF_KFUNCS_END(bpf_psi_kfuncs)
>> +
>> +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
>> + .owner = THIS_MODULE,
>> + .set = &bpf_psi_kfuncs,
>> +};
>> +
>> static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
>> {
>> struct bpf_psi_ops *ops = kdata;
>> @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void)
>> if (!bpf_psi_wq)
>> return -ENOMEM;
>>
>> + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
>> + &bpf_psi_kfunc_set);
>
> would this make kfunc callable from any struct_ops, not just this psi
> one?
It will. Idk how big of a problem it is, given that the caller needs
a trusted reference to bpf_psi. Also, is there a simple way to constrain
it? Wdyt?
On 8/20/25 5:36 PM, Roman Gushchin wrote:
> It will. Idk how big of a problem it is, given that the caller needs
> a trusted reference to bpf_psi. Also, is there a simple way to constrain
> it? Wdyt?
The bpf qdisc has the kfunc filtering. Take a look at the
bpf_qdisc_kfunc_filter in bpf_qdisc.c.
Martin KaFai Lau <martin.lau@linux.dev> writes:
> On 8/20/25 5:36 PM, Roman Gushchin wrote:
>> It will. Idk how big of a problem it is, given that the caller needs
>> a trusted reference to bpf_psi. Also, is there a simple way to constrain
>> it? Wdyt?
>
> The bpf qdisc has the kfunc filtering. Take a look at the
> bpf_qdisc_kfunc_filter in bpf_qdisc.c.
Thanks! I'll take a look.
On Wed, Aug 20, 2025 at 5:36 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Mon, Aug 18, 2025 at 10:06 AM Roman Gushchin
> > <roman.gushchin@linux.dev> wrote:
> >>
> >> Implement a new bpf_psi_create_trigger() bpf kfunc, which allows
> >> to create new psi triggers and attach them to cgroups or be
> >> system-wide.
> >>
> >> Created triggers will exist until the struct ops is loaded and
> >> if they are attached to a cgroup until the cgroup exists.
> >>
> >> Due to a limitation of 5 arguments, the resource type and the "full"
> >> bit are squeezed into a single u32.
> >>
> >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> >> ---
> >> kernel/sched/bpf_psi.c | 84 ++++++++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 84 insertions(+)
> >>
> >> diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
> >> index 2ea9d7276b21..94b684221708 100644
> >> --- a/kernel/sched/bpf_psi.c
> >> +++ b/kernel/sched/bpf_psi.c
> >> @@ -156,6 +156,83 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
> >> .is_valid_access = bpf_psi_ops_is_valid_access,
> >> };
> >>
> >> +__bpf_kfunc_start_defs();
> >> +
> >> +/**
> >> + * bpf_psi_create_trigger - Create a PSI trigger
> >> + * @bpf_psi: bpf_psi struct to attach the trigger to
> >> + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope
> >> + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit.
> >> + * @threshold_us: threshold in us
> >> + * @window_us: window in us
> >> + *
> >> + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be
> >> + * active unless bpf struct ops is unloaded or the corresponding cgroup
> >> + * is deleted.
> >> + *
> >> + * Resource's most significant bit encodes whether "some" or "full"
> >> + * PSI state should be tracked.
> >> + *
> >> + * Returns 0 on success and the error code on failure.
> >> + */
> >> +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
> >> + u64 cgroup_id, u32 resource,
> >> + u32 threshold_us, u32 window_us)
> >> +{
> >> + enum psi_res res = resource & ~BPF_PSI_FULL;
> >> + bool full = resource & BPF_PSI_FULL;
> >> + struct psi_trigger_params params;
> >> + struct cgroup *cgroup __maybe_unused = NULL;
> >> + struct psi_group *group;
> >> + struct psi_trigger *t;
> >> + int ret = 0;
> >> +
> >> + if (res >= NR_PSI_RESOURCES)
> >> + return -EINVAL;
> >> +
> >> +#ifdef CONFIG_CGROUPS
> >> + if (cgroup_id) {
> >> + cgroup = cgroup_get_from_id(cgroup_id);
> >> + if (IS_ERR_OR_NULL(cgroup))
> >> + return PTR_ERR(cgroup);
> >> +
> >> + group = cgroup_psi(cgroup);
> >> + } else
> >> +#endif
> >> + group = &psi_system;
> >
> > just a drive-by comment while skimming through the patch set: can't
> > you use IS_ENABLED(CONFIG_CGROUPS) and have a proper if/else with
> > proper {} ?
>
> Fixed.
> It required defining cgroup_get_from_id() and cgroup_psi()
> for !CONFIG_CGROUPS, but I agree, it's much better.
> Thanks
>
> >
> >> +
> >> + params.type = PSI_BPF;
> >> + params.bpf_psi = bpf_psi;
> >> + params.privileged = capable(CAP_SYS_RESOURCE);
> >> + params.res = res;
> >> + params.full = full;
> >> + params.threshold_us = threshold_us;
> >> + params.window_us = window_us;
> >> +
> >> + t = psi_trigger_create(group, &params);
> >> + if (IS_ERR(t))
> >> + ret = PTR_ERR(t);
> >> + else
> >> + t->cgroup_id = cgroup_id;
> >> +
> >> +#ifdef CONFIG_CGROUPS
> >> + if (cgroup)
> >> + cgroup_put(cgroup);
> >> +#endif
> >> +
> >> + return ret;
> >> +}
> >> +__bpf_kfunc_end_defs();
> >> +
> >> +BTF_KFUNCS_START(bpf_psi_kfuncs)
> >> +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS)
> >> +BTF_KFUNCS_END(bpf_psi_kfuncs)
> >> +
> >> +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
> >> + .owner = THIS_MODULE,
> >> + .set = &bpf_psi_kfuncs,
> >> +};
> >> +
> >> static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
> >> {
> >> struct bpf_psi_ops *ops = kdata;
> >> @@ -238,6 +315,13 @@ static int __init bpf_psi_struct_ops_init(void)
> >> if (!bpf_psi_wq)
> >> return -ENOMEM;
> >>
> >> + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
> >> + &bpf_psi_kfunc_set);
> >
> > would this make kfunc callable from any struct_ops, not just this psi
> > one?
>
> It will. Idk how big of a problem it is, given that the caller needs
> a trusted reference to bpf_psi.
Yes, I agree, probably not a big deal.
> Also, is there a simple way to constrain it? Wdyt?
We've talked about having the ability to restrict kfuncs to specific
struct_ops types, but I don't think we've ever made much progress on
this. So no, I don't think there is a simple way.
© 2016 - 2026 Red Hat, Inc.