There is one struct cfs_rq and one struct sched_entity per CPU for each
task group, and when the allocation of tg->cfs_rq[X] fails, the already
allocated tg->cfs_rq[0]..tg->cfs_rq[X-1] should be freed. The same goes
for tg->se.
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
kernel/sched/fair.c | 23 ++++++++++++++++-------
1 file changed, 16 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a80a73909dc2..0f913487928d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12443,10 +12443,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
 	if (!tg->cfs_rq)
-		goto err;
+		return 0;
 	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
 	if (!tg->se)
-		goto err;
+		goto err_free_rq_pointer;
 
 	tg->shares = NICE_0_LOAD;
 
@@ -12456,12 +12456,12 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
-			goto err;
+			goto err_free;
 
 		se = kzalloc_node(sizeof(struct sched_entity_stats),
 				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
-			goto err_free_rq;
+			goto err_free;
 
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
@@ -12470,9 +12470,18 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
-err_free_rq:
-	kfree(cfs_rq);
-err:
+err_free:
+	for_each_possible_cpu(i) {
+		kfree(tg->cfs_rq[i]);
+		kfree(tg->se[i]);
+
+		if (!tg->cfs_rq[i] && !tg->se[i])
+			break;
+	}
+	kfree(tg->se);
+err_free_rq_pointer:
+	kfree(tg->cfs_rq);
+
 	return 0;
 }
--
2.41.0
On 2023-07-18 at 21:41:17 +0800, Aaron Lu wrote:
> [...]
Not sure if I overlooked something: if alloc_fair_sched_group() fails in
sched_create_group(), would sched_free_group()->free_fair_sched_group() do
the cleanup?
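
For reference, here is roughly what free_fair_sched_group() looks like from
my reading of kernel/sched/fair.c (a simplified sketch, bandwidth teardown
omitted, details may differ across versions):

void free_fair_sched_group(struct task_group *tg)
{
	int i;

	/*
	 * kcalloc() zeroed both pointer arrays, so slots that were never
	 * filled by init_tg_cfs_entry() are NULL and kfree(NULL) is a no-op.
	 */
	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}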
thanks,
Chenyu
On Tue, Jul 18, 2023 at 11:13:12PM +0800, Chen Yu wrote:
> On 2023-07-18 at 21:41:17 +0800, Aaron Lu wrote:
> > [...]
> Not sure if I overlooked something: if alloc_fair_sched_group() fails in
> sched_create_group(), would sched_free_group()->free_fair_sched_group() do
> the cleanup?
You are right, I overlooked... Thanks for pointing this out.
On Wed, Jul 19, 2023 at 10:13:55AM +0800, Aaron Lu wrote:
> On Tue, Jul 18, 2023 at 11:13:12PM +0800, Chen Yu wrote:
> > On 2023-07-18 at 21:41:17 +0800, Aaron Lu wrote:
> > > [...]
> > Not sure if I overlooked something: if alloc_fair_sched_group() fails in
> > sched_create_group(), would sched_free_group()->free_fair_sched_group() do
> > the cleanup?
>
> You are right, I overlooked... Thanks for pointing this out.
While preparing v2, one thing still looks strange in the existing code:
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	... ...

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity_stats),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;
			~~~~~~~~~~~~~~~~

Since free_fair_sched_group() will take care of freeing these memories on
the error path, it looks unnecessary to free this one cfs_rq here. Or did I
miss something again...?

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
		init_entity_runnable_average(se);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}
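
For context, the caller already funnels any failure here into a full
teardown; a rough sketch of sched_create_group() from my reading of
kernel/sched/core.c (simplified, details may vary by version):

struct task_group *sched_create_group(struct task_group *parent)
{
	struct task_group *tg;

	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
	if (!tg)
		return ERR_PTR(-ENOMEM);

	if (!alloc_fair_sched_group(tg, parent))
		goto err;

	if (!alloc_rt_sched_group(tg, parent))
		goto err;

	... ...

	return tg;

err:
	/* calls free_fair_sched_group() and friends on the partial tg */
	sched_free_group(tg);
	return ERR_PTR(-ENOMEM);
}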
I plan to change it to this:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a80a73909dc2..ef2618ad26eb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12461,7 +12461,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		se = kzalloc_node(sizeof(struct sched_entity_stats),
 				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
-			goto err_free_rq;
+			goto err;
 
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
@@ -12470,8 +12470,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
-err_free_rq:
-	kfree(cfs_rq);
 err:
 	return 0;
 }
On 2023-08-02 at 15:01:05 +0800, Aaron Lu wrote:
> [...]
>
> While preparing v2, one thing still looks strange in the existing code:
>
> [...]
>
> Since free_fair_sched_group() will take care of freeing these memories on
> the error path, it looks unnecessary to free this one cfs_rq here. Or did
> I miss something again...?
>
> [...]
>
> I plan to change it to this:
>
> [...]
It seems that err_free_rq was introduced in
commit dfc12eb26a28 ("sched: Fix memory leak in two error corner cases").
The memory leak was detected by a static analysis tool, but that tool is
unaware of free_fair_sched_group(), which will take care of this. Not sure
whether a self-contained error path in this function is preferred, or
whether the other function should take care of it. For now it seems to be a
duplicated free. And alloc_rt_sched_group() has the same issue.
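
For reference, the corresponding error path in alloc_rt_sched_group(), from
my reading of kernel/sched/rt.c (trimmed sketch, details may differ across
versions):

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		... ...
	}

	return 1;

err_free_rq:
	/* same kfree-on-error pattern as in alloc_fair_sched_group() */
	kfree(rt_rq);
err:
	return 0;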
thanks,
Chenyu