[RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies

Chuyi Zhou posted 5 patches 2 years, 6 months ago
[RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Chuyi Zhou 2 years, 6 months ago
This patch adds a new metadata policy_name in oom_control and report it
in dump_header(), so we can know what has been the selection policy. In
BPF program, we can call kfunc set_oom_policy_name to set the current
user-defined policy name. The in-kernel policy_name is "default".

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 include/linux/oom.h |  7 +++++++
 mm/oom_kill.c       | 42 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7d0c9c48a0c5..69d0f2ec6ea6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,10 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+enum {
+	POLICY_NAME_LEN = 16,
+};
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
@@ -52,6 +56,9 @@ struct oom_control {
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
+
+	/* Used to report the policy info. */
+	char policy_name[POLICY_NAME_LEN];
 };
 
 extern struct mutex oom_lock;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 255c9ef1d808..3239dcdba4d7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,6 +443,35 @@ static int dump_task(struct task_struct *p, void *arg)
 	return 0;
 }
 
+__bpf_kfunc void set_oom_policy_name(struct oom_control *oc, const char *src, size_t sz)
+{
+	memset(oc->policy_name, 0, sizeof(oc->policy_name));
+
+	if (sz > POLICY_NAME_LEN)
+		sz = POLICY_NAME_LEN;
+
+	memcpy(oc->policy_name, src, sz);
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_set_policy_name(struct oom_control *oc)
+{
+}
+
+__diag_pop();
+
+BTF_SET8_START(bpf_oom_policy_kfunc_ids)
+BTF_ID_FLAGS(func, set_oom_policy_name)
+BTF_SET8_END(bpf_oom_policy_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_oom_policy_kfunc_set = {
+	.owner          = THIS_MODULE,
+	.set            = &bpf_oom_policy_kfunc_ids,
+};
+
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @oc: pointer to struct oom_control
@@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
 
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
 			current->signal->oom_score_adj);
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
@@ -775,8 +804,11 @@ static int __init oom_init(void)
 	err = register_btf_fmodret_id_set(&oom_bpf_fmodret_set);
 	if (err)
 		pr_warn("error while registering oom fmodret entrypoints: %d", err);
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					&bpf_oom_policy_kfunc_set);
+	if (err)
+		pr_warn("error while registering oom kfunc entrypoints: %d", err);
 #endif
-
 	return 0;
 }
 subsys_initcall(oom_init)
@@ -1196,6 +1228,10 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	set_oom_policy_name(oc, "default", sizeof("default"));
+
+	bpf_set_policy_name(oc);
+
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
-- 
2.20.1
Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Jonathan Corbet 2 years, 5 months ago
Chuyi Zhou <zhouchuyi@bytedance.com> writes:

> This patch adds a new metadata policy_name in oom_control and report it
> in dump_header(), so we can know what has been the selection policy. In
> BPF program, we can call kfunc set_oom_policy_name to set the current
> user-defined policy name. The in-kernel policy_name is "default".
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  include/linux/oom.h |  7 +++++++
>  mm/oom_kill.c       | 42 +++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 46 insertions(+), 3 deletions(-)

So I have a possibly dumb question here...

> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 7d0c9c48a0c5..69d0f2ec6ea6 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -22,6 +22,10 @@ enum oom_constraint {
>  	CONSTRAINT_MEMCG,
>  };
>  
> +enum {
> +	POLICY_NAME_LEN = 16,
> +};

We've defined our name length, fine...

>  /*
>   * Details of the page allocation that triggered the oom killer that are used to
>   * determine what should be killed.
> @@ -52,6 +56,9 @@ struct oom_control {
>  
>  	/* Used to print the constraint info. */
>  	enum oom_constraint constraint;
> +
> +	/* Used to report the policy info. */
> +	char policy_name[POLICY_NAME_LEN];
>  };

...that is the length of the array, appended to the structure...

>  extern struct mutex oom_lock;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 255c9ef1d808..3239dcdba4d7 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -443,6 +443,35 @@ static int dump_task(struct task_struct *p, void *arg)
>  	return 0;
>  }
>  
> +__bpf_kfunc void set_oom_policy_name(struct oom_control *oc, const char *src, size_t sz)
> +{
> +	memset(oc->policy_name, 0, sizeof(oc->policy_name));
> +
> +	if (sz > POLICY_NAME_LEN)
> +		sz = POLICY_NAME_LEN;
> +
> +	memcpy(oc->policy_name, src, sz);
> +}

This truncates the name, possibly leaving it without a terminating NUL
character, right?

> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "kfuncs which will be used in BPF programs");
> +
> +__weak noinline void bpf_set_policy_name(struct oom_control *oc)
> +{
> +}
> +
> +__diag_pop();
> +
> +BTF_SET8_START(bpf_oom_policy_kfunc_ids)
> +BTF_ID_FLAGS(func, set_oom_policy_name)
> +BTF_SET8_END(bpf_oom_policy_kfunc_ids)
> +
> +static const struct btf_kfunc_id_set bpf_oom_policy_kfunc_set = {
> +	.owner          = THIS_MODULE,
> +	.set            = &bpf_oom_policy_kfunc_ids,
> +};
> +
>  /**
>   * dump_tasks - dump current memory state of all system tasks
>   * @oc: pointer to struct oom_control
> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>  
>  static void dump_header(struct oom_control *oc, struct task_struct *p)
>  {
> -	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
> -		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
> +	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
> +		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,

...and if the policy name is unterminated, this print will run off the
end of the structure.

Am I missing something here?

Thanks,

jon
Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Bixuan Cui 2 years, 4 months ago

在 2023/8/15 4:51, Jonathan Corbet 写道:
>>   /**
>>    * dump_tasks - dump current memory state of all system tasks
>>    * @oc: pointer to struct oom_control
>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>>   
>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>   {
>> -	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
>> -		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>> +	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
>> +		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
> ...and if the policy name is unterminated, this print will run off the
> end of the structure.
> 
> Am I missing something here?
Perhaps it is inaccurate to use policy name in this way. For example, 
some one use BPF_PROG(bpf_oom_evaluate_task, ...) but do not set the 
policy name through bpf_set_policy_name. In this way, the result is 
still policy name=default, which ultimately leads to error print in the 
dump_header.
I think a better way:

+static const char *const policy_select[] = {
+    "OOM_DEFAULT";
+    "BPF_ABORT",
+    "BPF_NEXT",
+    "BPF_SELECT",
+};

struct oom_control {

  	/* Used to print the constraint info. */
  	enum oom_constraint constraint;
+
+	/* Used to report the policy select. */
+	int policy_select;
  };

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
...

+	switch (bpf_oom_evaluate_task(task, oc)) {
+	case BPF_EVAL_ABORT:
+              oc->policy_select = BPF_EVAL_ABORT;
+		goto abort; /* abort search process */
+	case BPF_EVAL_NEXT:
+              oc->policy_select = BPF_EVAL_NEXT;
+		goto next; /* ignore the task */
+	case BPF_EVAL_SELECT:
+             oc->policy_select = BPF_EVAL_SELECT;
+		goto select; /* select the task */
+	default:
+		break; /* No BPF policy */
+	}

  static void dump_header(struct oom_control *oc, struct task_struct *p)
  {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
policy_select[oc->policy_select],
  			current->signal->oom_score_adj);


And all definitions of oc should be added
struct oom_control oc = {
      .select = NO_BPF_POLICY,
}

Delete set_oom_policy_name, it makes no sense for users to set policy 
names. :-)

Thanks
Bixuan Cui





Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Bixuan Cui 2 years, 4 months ago

在 2023/8/15 4:51, Jonathan Corbet 写道:
>>   /**
>>    * dump_tasks - dump current memory state of all system tasks
>>    * @oc: pointer to struct oom_control
>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>>   
>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>   {
>> -	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
>> -		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>> +	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
>> +		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
> ...and if the policy name is unterminated, this print will run off the
> end of the structure.
> 
> Am I missing something here?
Perhaps it is inaccurate to use policy name in this way. For example, 
some one use BPF_PROG(bpf_oom_evaluate_task, ...) but do not set the 
policy name through bpf_set_policy_name. In this way, the result is 
still policy name=default, which ultimately leads to error print in the 
dump_header.
I think a better way:

+static const char *const policy_select[] = {
+    "OOM_DEFAULT";
+    "BPF_ABORT",
+    "BPF_NEXT",
+    "BPF_SELECT",
+};

struct oom_control {

  	/* Used to print the constraint info. */
  	enum oom_constraint constraint;
+
+	/* Used to report the policy select. */
+	int policy_select;
  };

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
...

+	switch (bpf_oom_evaluate_task(task, oc)) {
+	case BPF_EVAL_ABORT:
+              oc->policy_select = BPF_EVAL_ABORT;
+		goto abort; /* abort search process */
+	case BPF_EVAL_NEXT:
+              oc->policy_select = BPF_EVAL_NEXT;
+		goto next; /* ignore the task */
+	case BPF_EVAL_SELECT:
+             oc->policy_select = BPF_EVAL_SELECT;
+		goto select; /* select the task */
+	default:
+		break; /* No BPF policy */
+	}

  static void dump_header(struct oom_control *oc, struct task_struct *p)
  {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
policy_select[oc->policy_select],
  			current->signal->oom_score_adj);


And all definitions of oc should be added
struct oom_control oc = {
      .select = NO_BPF_POLICY,
}

Delete set_oom_policy_name, it makes no sense for users to set policy 
names. :-)

Thanks
Bixuan Cui





Re: [External] Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Chuyi Zhou 2 years, 4 months ago
Hello,

在 2023/9/14 20:02, Bixuan Cui 写道:
> 
> 
> 在 2023/8/15 4:51, Jonathan Corbet 写道:
>>>   /**
>>>    * dump_tasks - dump current memory state of all system tasks
>>>    * @oc: pointer to struct oom_control
>>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control 
>>> *oc, struct task_struct *victim)
>>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>>   {
>>> -    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
>>> oom_score_adj=%hd\n",
>>> -        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>>> +    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
>>> policy_name=%s, oom_score_adj=%hd\n",
>>> +        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
>>> oc->policy_name,
>> ...and if the policy name is unterminated, this print will run off the
>> end of the structure.
>>
>> Am I missing something here?
> Perhaps it is inaccurate to use policy name in this way. For example, 
> some one use BPF_PROG(bpf_oom_evaluate_task, ...) but do not set the 
> policy name through bpf_set_policy_name. In this way, the result is 
> still policy name=default, which ultimately leads to error print in the 
> dump_header.
> I think a better way:
> 
> +static const char *const policy_select[] = {
> +    "OOM_DEFAULT";
> +    "BPF_ABORT",
> +    "BPF_NEXT",
> +    "BPF_SELECT",
> +};
> 
> struct oom_control {
> 
>       /* Used to print the constraint info. */
>       enum oom_constraint constraint;
> +
> +    /* Used to report the policy select. */
> +    int policy_select;
>   };
> 
> static int oom_evaluate_task(struct task_struct *task, void *arg)
> {
> ...
> 
> +    switch (bpf_oom_evaluate_task(task, oc)) {
> +    case BPF_EVAL_ABORT:
> +              oc->policy_select = BPF_EVAL_ABORT;
> +        goto abort; /* abort search process */
> +    case BPF_EVAL_NEXT:
> +              oc->policy_select = BPF_EVAL_NEXT;
> +        goto next; /* ignore the task */
> +    case BPF_EVAL_SELECT:
> +             oc->policy_select = BPF_EVAL_SELECT;
> +        goto select; /* select the task */
> +    default:
> +        break; /* No BPF policy */
> +    }
> 
>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>   {
> -    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
> oom_score_adj=%hd\n",
> -        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
> +    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
> policy_name=%s, oom_score_adj=%hd\n",
> +        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
> policy_select[oc->policy_select],
>               current->signal->oom_score_adj);
> 
> 

The policy_name may be different from the previous OOM reporting, even 
though they are using the same policy.

> And all definitions of oc should be added
> struct oom_control oc = {
>       .select = NO_BPF_POLICY,
> }
> 
> Delete set_oom_policy_name, it makes no sense for users to set policy 
> names. :-)
> 

There can be multiple OOM policy in the system at the same time.

If we need to apply different OOM policies to different memcgs based on 
different scenarios, we can use this hook(set_oom_policy_name) to set 
name to identify which policy in invoked at that time.

Just some thoughts.

Thanks.

> Thanks
> Bixuan Cui
> 
> 
> 
> 
> 
Re: [External] Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Bixuan Cui 2 years, 4 months ago

在 2023/9/14 20:50, Chuyi Zhou 写道:
>>
>> Delete set_oom_policy_name, it makes no sense for users to set policy 
>> names. 🙂
>>
> 
> There can be multiple OOM policy in the system at the same time.
> 
> If we need to apply different OOM policies to different memcgs based on 
> different scenarios, we can use this hook(set_oom_policy_name) to set 
> name to identify which policy in invoked at that time.
> 
> Just some thoughts.
Well, I thought the system would only load one OOM policy(set one policy 
name), which would set the prio of all memcgs.

What you mean is that multiple bpf.c may be loaded at the system, and 
each bpf.c sets the different policy for different memcgs?
Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Chuyi Zhou 2 years, 4 months ago

在 2023/9/15 10:28, Bixuan Cui 写道:
> 
> 
> 在 2023/9/14 20:50, Chuyi Zhou 写道:
>>>
>>> Delete set_oom_policy_name, it makes no sense for users to set policy 
>>> names. 🙂
>>>
>>
>> There can be multiple OOM policy in the system at the same time.
>>
>> If we need to apply different OOM policies to different memcgs based 
>> on different scenarios, we can use this hook(set_oom_policy_name) to 
>> set name to identify which policy in invoked at that time.
>>
>> Just some thoughts.
> Well, I thought the system would only load one OOM policy(set one policy 
> name), which would set the prio of all memcgs.
> 
> What you mean is that multiple bpf.c may be loaded at the system, and 
> each bpf.c sets the different policy for different memcgs?

No. Not multiple bpf_oompolicy.c but multiple OOM Policy in one BPF Program.
Re: [RFC PATCH v2 2/5] mm: Add policy_name to identify OOM policies
Posted by Chuyi Zhou 2 years, 5 months ago
Hello,

在 2023/8/15 04:51, Jonathan Corbet 写道:
> Chuyi Zhou <zhouchuyi@bytedance.com> writes:
> 
>> This patch adds a new metadata policy_name in oom_control and report it
>> in dump_header(), so we can know what has been the selection policy. In
>> BPF program, we can call kfunc set_oom_policy_name to set the current
>> user-defined policy name. The in-kernel policy_name is "default".
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   include/linux/oom.h |  7 +++++++
>>   mm/oom_kill.c       | 42 +++++++++++++++++++++++++++++++++++++++---
>>   2 files changed, 46 insertions(+), 3 deletions(-)
> 
> So I have a possibly dumb question here...
> 
>> diff --git a/include/linux/oom.h b/include/linux/oom.h
>> index 7d0c9c48a0c5..69d0f2ec6ea6 100644
>> --- a/include/linux/oom.h
>> +++ b/include/linux/oom.h
>> @@ -22,6 +22,10 @@ enum oom_constraint {
>>   	CONSTRAINT_MEMCG,
>>   };
>>   
>> +enum {
>> +	POLICY_NAME_LEN = 16,
>> +};
> 
> We've defined our name length, fine...
> 
>>   /*
>>    * Details of the page allocation that triggered the oom killer that are used to
>>    * determine what should be killed.
>> @@ -52,6 +56,9 @@ struct oom_control {
>>   
>>   	/* Used to print the constraint info. */
>>   	enum oom_constraint constraint;
>> +
>> +	/* Used to report the policy info. */
>> +	char policy_name[POLICY_NAME_LEN];
>>   };
> 
> ...that is the length of the array, appended to the structure...
> 
>>   extern struct mutex oom_lock;
>> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
>> index 255c9ef1d808..3239dcdba4d7 100644
>> --- a/mm/oom_kill.c
>> +++ b/mm/oom_kill.c
>> @@ -443,6 +443,35 @@ static int dump_task(struct task_struct *p, void *arg)
>>   	return 0;
>>   }
>>   
>> +__bpf_kfunc void set_oom_policy_name(struct oom_control *oc, const char *src, size_t sz)
>> +{
>> +	memset(oc->policy_name, 0, sizeof(oc->policy_name));
>> +
>> +	if (sz > POLICY_NAME_LEN)
>> +		sz = POLICY_NAME_LEN;
>> +
>> +	memcpy(oc->policy_name, src, sz);
>> +}
> 
> This truncates the name, possibly leaving it without a terminating NUL
> character, right?
> 

Yes, indeed. We should guarantee that the policy_name is terminated with 
a NULL character to avoid some undefined behavior.

Thanks for your helpful review.

------
Chuyi Zhou