[PATCH 2/3] xen/sched: carve out memory allocation and freeing from schedule_cpu_rm()

Posted by Juergen Gross 3 years, 6 months ago
In order to prepare for not allocating or freeing memory from
schedule_cpu_rm(), move this functionality to dedicated functions.

For now call those functions from schedule_cpu_rm().

No change of behavior expected.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/common/sched/core.c    | 133 +++++++++++++++++++++----------------
 xen/common/sched/private.h |   8 +++
 2 files changed, 85 insertions(+), 56 deletions(-)

diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index c8d1034d3d..d6ff4f4921 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -3190,6 +3190,66 @@ out:
     return ret;
 }
 
+static struct cpu_rm_data *schedule_cpu_rm_alloc(unsigned int cpu)
+{
+    struct cpu_rm_data *data;
+    struct sched_resource *sr;
+    int idx;
+
+    rcu_read_lock(&sched_res_rculock);
+
+    sr = get_sched_res(cpu);
+    data = xzalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);
+    if ( !data )
+        goto out;
+
+    data->old_ops = sr->scheduler;
+    data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
+    data->ppriv_old = sr->sched_priv;
+
+    for ( idx = 0; idx < sr->granularity - 1; idx++ )
+    {
+        data->sr[idx] = sched_alloc_res();
+        if ( data->sr[idx] )
+        {
+            data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem();
+            if ( !data->sr[idx]->sched_unit_idle )
+            {
+                sched_res_free(&data->sr[idx]->rcu);
+                data->sr[idx] = NULL;
+            }
+        }
+        if ( !data->sr[idx] )
+        {
+            for ( idx--; idx >= 0; idx-- )
+                sched_res_free(&data->sr[idx]->rcu);
+            xfree(data);
+            data = NULL;
+            goto out;
+        }
+
+        data->sr[idx]->curr = data->sr[idx]->sched_unit_idle;
+        data->sr[idx]->scheduler = &sched_idle_ops;
+        data->sr[idx]->granularity = 1;
+
+        /* We want the lock not to change when replacing the resource. */
+        data->sr[idx]->schedule_lock = sr->schedule_lock;
+    }
+
+ out:
+    rcu_read_unlock(&sched_res_rculock);
+
+    return data;
+}
+
+static void schedule_cpu_rm_free(struct cpu_rm_data *mem, unsigned int cpu)
+{
+    sched_free_udata(mem->old_ops, mem->vpriv_old);
+    sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu);
+
+    xfree(mem);
+}
+
 /*
  * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
  * (the idle scheduler).
@@ -3198,53 +3258,22 @@ out:
  */
 int schedule_cpu_rm(unsigned int cpu)
 {
-    void *ppriv_old, *vpriv_old;
-    struct sched_resource *sr, **sr_new = NULL;
+    struct sched_resource *sr;
+    struct cpu_rm_data *data;
     struct sched_unit *unit;
-    struct scheduler *old_ops;
     spinlock_t *old_lock;
     unsigned long flags;
-    int idx, ret = -ENOMEM;
+    int idx = 0;
     unsigned int cpu_iter;
 
+    data = schedule_cpu_rm_alloc(cpu);
+    if ( !data )
+        return -ENOMEM;
+
     rcu_read_lock(&sched_res_rculock);
 
     sr = get_sched_res(cpu);
-    old_ops = sr->scheduler;
 
-    if ( sr->granularity > 1 )
-    {
-        sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
-        if ( !sr_new )
-            goto out;
-        for ( idx = 0; idx < sr->granularity - 1; idx++ )
-        {
-            sr_new[idx] = sched_alloc_res();
-            if ( sr_new[idx] )
-            {
-                sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
-                if ( !sr_new[idx]->sched_unit_idle )
-                {
-                    sched_res_free(&sr_new[idx]->rcu);
-                    sr_new[idx] = NULL;
-                }
-            }
-            if ( !sr_new[idx] )
-            {
-                for ( idx--; idx >= 0; idx-- )
-                    sched_res_free(&sr_new[idx]->rcu);
-                goto out;
-            }
-            sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
-            sr_new[idx]->scheduler = &sched_idle_ops;
-            sr_new[idx]->granularity = 1;
-
-            /* We want the lock not to change when replacing the resource. */
-            sr_new[idx]->schedule_lock = sr->schedule_lock;
-        }
-    }
-
-    ret = 0;
     ASSERT(sr->cpupool != NULL);
     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
     ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
@@ -3252,10 +3281,6 @@ int schedule_cpu_rm(unsigned int cpu)
     /* See comment in schedule_cpu_add() regarding lock switching. */
     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
-    vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
-    ppriv_old = sr->sched_priv;
-
-    idx = 0;
     for_each_cpu ( cpu_iter, sr->cpus )
     {
         per_cpu(sched_res_idx, cpu_iter) = 0;
@@ -3269,27 +3294,27 @@ int schedule_cpu_rm(unsigned int cpu)
         else
         {
             /* Initialize unit. */
-            unit = sr_new[idx]->sched_unit_idle;
-            unit->res = sr_new[idx];
+            unit = data->sr[idx]->sched_unit_idle;
+            unit->res = data->sr[idx];
             unit->is_running = true;
             sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
             sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
 
             /* Adjust cpu masks of resources (old and new). */
             cpumask_clear_cpu(cpu_iter, sr->cpus);
-            cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
+            cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus);
             cpumask_set_cpu(cpu_iter, &sched_res_mask);
 
             /* Init timer. */
-            init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
+            init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
 
             /* Last resource initializations and insert resource pointer. */
-            sr_new[idx]->master_cpu = cpu_iter;
-            set_sched_res(cpu_iter, sr_new[idx]);
+            data->sr[idx]->master_cpu = cpu_iter;
+            set_sched_res(cpu_iter, data->sr[idx]);
 
             /* Last action: set the new lock pointer. */
             smp_mb();
-            sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
+            data->sr[idx]->schedule_lock = &sched_free_cpu_lock;
 
             idx++;
         }
@@ -3305,16 +3330,12 @@ int schedule_cpu_rm(unsigned int cpu)
     /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
     spin_unlock_irqrestore(old_lock, flags);
 
-    sched_deinit_pdata(old_ops, ppriv_old, cpu);
-
-    sched_free_udata(old_ops, vpriv_old);
-    sched_free_pdata(old_ops, ppriv_old, cpu);
+    sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu);
 
-out:
     rcu_read_unlock(&sched_res_rculock);
-    xfree(sr_new);
+    schedule_cpu_rm_free(data, cpu);
 
-    return ret;
+    return 0;
 }
 
 struct scheduler *scheduler_get_default(void)
diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
index de0cf63ce8..c626ad4907 100644
--- a/xen/common/sched/private.h
+++ b/xen/common/sched/private.h
@@ -598,6 +598,14 @@ struct affinity_masks {
     cpumask_var_t soft;
 };
 
+/* Memory allocation related data for schedule_cpu_rm(). */
+struct cpu_rm_data {
+    struct scheduler *old_ops;
+    void *ppriv_old;
+    void *vpriv_old;
+    struct sched_resource *sr[];
+};
+
 void domain_update_node_affinity_noalloc(struct domain *d,
                                          const cpumask_t *online,
                                          struct affinity_masks *affinity);
-- 
2.35.3
Re: [PATCH 2/3] xen/sched: carve out memory allocation and freeing from schedule_cpu_rm()
Posted by Jan Beulich 3 years, 6 months ago
On 02.08.2022 15:27, Juergen Gross wrote:
> --- a/xen/common/sched/core.c
> +++ b/xen/common/sched/core.c
> @@ -3190,6 +3190,66 @@ out:
>      return ret;
>  }
>  
> +static struct cpu_rm_data *schedule_cpu_rm_alloc(unsigned int cpu)
> +{
> +    struct cpu_rm_data *data;
> +    struct sched_resource *sr;

const?

> +    int idx;

While code is supposedly only being moved, I still question this not
being "unsigned int", the more that sr->granularity is "unsigned int"
as well. (Same then for the retained instance of the variable in the
original function.) Of course the loop in the error path then needs
writing differently.
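
E.g. (just a sketch of one way to write the unwind with an unsigned
index; not meaning to dictate style, and keeping the rest of the error
path as in the patch):

    unsigned int idx;
    ...
        if ( !data->sr[idx] )
        {
            while ( idx > 0 )
                sched_res_free(&data->sr[--idx]->rcu);
            xfree(data);
            data = NULL;
            goto out;
        }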

> +    rcu_read_lock(&sched_res_rculock);
> +
> +    sr = get_sched_res(cpu);
> +    data = xzalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);

Afaict xmalloc_flex_struct() would do here, as you fill all fields.

> +    if ( !data )
> +        goto out;
> +
> +    data->old_ops = sr->scheduler;
> +    data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
> +    data->ppriv_old = sr->sched_priv;

At least from an abstract perspective, doesn't reading fields from
sr require the RCU lock to be held continuously (i.e. not dropping
it at the end of this function and re-acquiring it in the caller)?

> +    for ( idx = 0; idx < sr->granularity - 1; idx++ )
> +    {
> +        data->sr[idx] = sched_alloc_res();
> +        if ( data->sr[idx] )
> +        {
> +            data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem();
> +            if ( !data->sr[idx]->sched_unit_idle )
> +            {
> +                sched_res_free(&data->sr[idx]->rcu);
> +                data->sr[idx] = NULL;
> +            }
> +        }
> +        if ( !data->sr[idx] )
> +        {
> +            for ( idx--; idx >= 0; idx-- )
> +                sched_res_free(&data->sr[idx]->rcu);
> +            xfree(data);
> +            data = NULL;

XFREE()?
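
I.e. simply

    XFREE(data);

which, iirc, also takes care of setting the pointer back to NULL,
instead of open-coding xfree() plus the NULL assignment.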

> @@ -3198,53 +3258,22 @@ out:
>   */
>  int schedule_cpu_rm(unsigned int cpu)
>  {
> -    void *ppriv_old, *vpriv_old;
> -    struct sched_resource *sr, **sr_new = NULL;
> +    struct sched_resource *sr;
> +    struct cpu_rm_data *data;
>      struct sched_unit *unit;
> -    struct scheduler *old_ops;
>      spinlock_t *old_lock;
>      unsigned long flags;
> -    int idx, ret = -ENOMEM;
> +    int idx = 0;
>      unsigned int cpu_iter;
>  
> +    data = schedule_cpu_rm_alloc(cpu);
> +    if ( !data )
> +        return -ENOMEM;
> +
>      rcu_read_lock(&sched_res_rculock);
>  
>      sr = get_sched_res(cpu);
> -    old_ops = sr->scheduler;
>  
> -    if ( sr->granularity > 1 )
> -    {

This conditional is lost afaict, resulting in potentially wrong behavior
in the new helper. Considering its purpose I expect there's a guarantee
that the field's value can never be zero, but then I guess an ASSERT()
would be nice next to the potentially problematic uses in the helper.
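
E.g. (just a sketch, assuming granularity can never legitimately be
zero here):

    sr = get_sched_res(cpu);
    ASSERT(sr->granularity);
    data = xzalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);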

> --- a/xen/common/sched/private.h
> +++ b/xen/common/sched/private.h
> @@ -598,6 +598,14 @@ struct affinity_masks {
>      cpumask_var_t soft;
>  };
>  
> +/* Memory allocation related data for schedule_cpu_rm(). */
> +struct cpu_rm_data {
> +    struct scheduler *old_ops;

const?

Jan
Re: [PATCH 2/3] xen/sched: carve out memory allocation and freeing from schedule_cpu_rm()
Posted by Juergen Gross 3 years, 6 months ago
On 03.08.22 11:25, Jan Beulich wrote:
> On 02.08.2022 15:27, Juergen Gross wrote:
>> --- a/xen/common/sched/core.c
>> +++ b/xen/common/sched/core.c
>> @@ -3190,6 +3190,66 @@ out:
>>       return ret;
>>   }
>>   
>> +static struct cpu_rm_data *schedule_cpu_rm_alloc(unsigned int cpu)
>> +{
>> +    struct cpu_rm_data *data;
>> +    struct sched_resource *sr;
> 
> const?

Yes.

> 
>> +    int idx;
> 
> While code is supposedly only being moved, I still question this not
> being "unsigned int", the more that sr->granularity is "unsigned int"
> as well. (Same then for the retained instance of the variable in the
> original function.) Of course the loop in the error path then needs
> writing differently.

I considered that and didn't want to change the loop. OTOH this seems
to be rather trivial, so I can do the switch.

> 
>> +    rcu_read_lock(&sched_res_rculock);
>> +
>> +    sr = get_sched_res(cpu);
>> +    data = xzalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);
> 
> Afaict xmalloc_flex_struct() would do here, as you fill all fields.

Okay.

> 
>> +    if ( !data )
>> +        goto out;
>> +
>> +    data->old_ops = sr->scheduler;
>> +    data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
>> +    data->ppriv_old = sr->sched_priv;
> 
> At least from an abstract perspective, doesn't reading fields from
> sr require the RCU lock to be held continuously (i.e. not dropping
> it at the end of this function and re-acquiring it in the caller)?
> 
>> +    for ( idx = 0; idx < sr->granularity - 1; idx++ )
>> +    {
>> +        data->sr[idx] = sched_alloc_res();
>> +        if ( data->sr[idx] )
>> +        {
>> +            data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem();
>> +            if ( !data->sr[idx]->sched_unit_idle )
>> +            {
>> +                sched_res_free(&data->sr[idx]->rcu);
>> +                data->sr[idx] = NULL;
>> +            }
>> +        }
>> +        if ( !data->sr[idx] )
>> +        {
>> +            for ( idx--; idx >= 0; idx-- )
>> +                sched_res_free(&data->sr[idx]->rcu);
>> +            xfree(data);
>> +            data = NULL;
> 
> XFREE()?

Oh, right. Forgot about that possibility.

> 
>> @@ -3198,53 +3258,22 @@ out:
>>    */
>>   int schedule_cpu_rm(unsigned int cpu)
>>   {
>> -    void *ppriv_old, *vpriv_old;
>> -    struct sched_resource *sr, **sr_new = NULL;
>> +    struct sched_resource *sr;
>> +    struct cpu_rm_data *data;
>>       struct sched_unit *unit;
>> -    struct scheduler *old_ops;
>>       spinlock_t *old_lock;
>>       unsigned long flags;
>> -    int idx, ret = -ENOMEM;
>> +    int idx = 0;
>>       unsigned int cpu_iter;
>>   
>> +    data = schedule_cpu_rm_alloc(cpu);
>> +    if ( !data )
>> +        return -ENOMEM;
>> +
>>       rcu_read_lock(&sched_res_rculock);
>>   
>>       sr = get_sched_res(cpu);
>> -    old_ops = sr->scheduler;
>>   
>> -    if ( sr->granularity > 1 )
>> -    {
> 
> This conditional is lost afaict, resulting in potentially wrong behavior
> in the new helper. Considering its purpose I expect there's a guarantee
> that the field's value can never be zero, but then I guess an ASSERT()
> would be nice next to the potentially problematic uses in the helper.

I'll add the ASSERT().

> 
>> --- a/xen/common/sched/private.h
>> +++ b/xen/common/sched/private.h
>> @@ -598,6 +598,14 @@ struct affinity_masks {
>>       cpumask_var_t soft;
>>   };
>>   
>> +/* Memory allocation related data for schedule_cpu_rm(). */
>> +struct cpu_rm_data {
>> +    struct scheduler *old_ops;
> 
> const?

Yes.


Juergen