[PATCH 22/33] kthread: Include unbound kthreads in the managed affinity list

Posted by Frederic Weisbecker 5 months, 3 weeks ago
The managed affinity list currently contains only unbound kthreads that
have affinity preferences. Unbound kthreads that are globally affine by
default are not on the list because their affinity is automatically
managed by the scheduler (through the fallback housekeeping mask) and by
cpuset.

However, in order to preserve the preferred affinity of kthreads, cpuset
will delegate the propagation of isolated partition updates to the
housekeeping and kthread code.

Prepare for that by including all unbound kthreads in the managed
affinity list.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/kthread.c | 59 ++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index c4dd967e9e9c..cba3d297f267 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -365,9 +365,10 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
 	if (kthread->preferred_affinity) {
 		pref = kthread->preferred_affinity;
 	} else {
-		if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
-			return;
-		pref = cpumask_of_node(kthread->node);
+		if (kthread->node == NUMA_NO_NODE)
+			pref = housekeeping_cpumask(HK_TYPE_KTHREAD);
+		else
+			pref = cpumask_of_node(kthread->node);
 	}
 
 	cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
@@ -380,32 +381,29 @@ static void kthread_affine_node(void)
 	struct kthread *kthread = to_kthread(current);
 	cpumask_var_t affinity;
 
-	WARN_ON_ONCE(kthread_is_per_cpu(current));
+	if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
+		return;
 
-	if (kthread->node == NUMA_NO_NODE) {
-		housekeeping_affine(current, HK_TYPE_KTHREAD);
-	} else {
-		if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
-			WARN_ON_ONCE(1);
-			return;
-		}
-
-		mutex_lock(&kthread_affinity_lock);
-		WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
-		list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
-		/*
-		 * The node cpumask is racy when read from kthread() but:
-		 * - a racing CPU going down will either fail on the subsequent
-		 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
-		 *   afterwards by the scheduler.
-		 * - a racing CPU going up will be handled by kthreads_online_cpu()
-		 */
-		kthread_fetch_affinity(kthread, affinity);
-		set_cpus_allowed_ptr(current, affinity);
-		mutex_unlock(&kthread_affinity_lock);
-
-		free_cpumask_var(affinity);
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+		WARN_ON_ONCE(1);
+		return;
 	}
+
+	mutex_lock(&kthread_affinity_lock);
+	WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
+	list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
+	/*
+	 * The node cpumask is racy when read from kthread() but:
+	 * - a racing CPU going down will either fail on the subsequent
+	 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
+	 *   afterwards by the scheduler.
+	 * - a racing CPU going up will be handled by kthreads_online_cpu()
+	 */
+	kthread_fetch_affinity(kthread, affinity);
+	set_cpus_allowed_ptr(current, affinity);
+	mutex_unlock(&kthread_affinity_lock);
+
+	free_cpumask_var(affinity);
 }
 
 static int kthread(void *_create)
@@ -924,8 +922,11 @@ static int kthreads_online_cpu(unsigned int cpu)
 			ret = -EINVAL;
 			continue;
 		}
-		kthread_fetch_affinity(k, affinity);
-		set_cpus_allowed_ptr(k->task, affinity);
+
+		if (k->preferred_affinity || k->node != NUMA_NO_NODE) {
+			kthread_fetch_affinity(k, affinity);
+			set_cpus_allowed_ptr(k->task, affinity);
+		}
 	}
 
 	free_cpumask_var(affinity);
-- 
2.51.0
Re: [PATCH 22/33] kthread: Include unbound kthreads in the managed affinity list
Posted by Waiman Long 5 months, 2 weeks ago
On 10/13/25 4:31 PM, Frederic Weisbecker wrote:
> The managed affinity list currently contains only unbound kthreads that
> have affinity preferences. Unbound kthreads that are globally affine by
> default are not on the list because their affinity is automatically
> managed by the scheduler (through the fallback housekeeping mask) and by
> cpuset.
>
> However, in order to preserve the preferred affinity of kthreads, cpuset
> will delegate the propagation of isolated partition updates to the
> housekeeping and kthread code.
>
> Prepare for that by including all unbound kthreads in the managed
> affinity list.
>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>   kernel/kthread.c | 59 ++++++++++++++++++++++++------------------------
>   1 file changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index c4dd967e9e9c..cba3d297f267 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -365,9 +365,10 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
>   	if (kthread->preferred_affinity) {
>   		pref = kthread->preferred_affinity;
>   	} else {
> -		if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
> -			return;
> -		pref = cpumask_of_node(kthread->node);
> +		if (kthread->node == NUMA_NO_NODE)
> +			pref = housekeeping_cpumask(HK_TYPE_KTHREAD);
> +		else
> +			pref = cpumask_of_node(kthread->node);
>   	}
>   
>   	cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
> @@ -380,32 +381,29 @@ static void kthread_affine_node(void)
>   	struct kthread *kthread = to_kthread(current);
>   	cpumask_var_t affinity;
>   
> -	WARN_ON_ONCE(kthread_is_per_cpu(current));
> +	if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
> +		return;
>   
> -	if (kthread->node == NUMA_NO_NODE) {
> -		housekeeping_affine(current, HK_TYPE_KTHREAD);
> -	} else {
> -		if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
> -			WARN_ON_ONCE(1);
> -			return;
> -		}
> -
> -		mutex_lock(&kthread_affinity_lock);
> -		WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
> -		list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
> -		/*
> -		 * The node cpumask is racy when read from kthread() but:
> -		 * - a racing CPU going down will either fail on the subsequent
> -		 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
> -		 *   afterwards by the scheduler.
> -		 * - a racing CPU going up will be handled by kthreads_online_cpu()
> -		 */
> -		kthread_fetch_affinity(kthread, affinity);
> -		set_cpus_allowed_ptr(current, affinity);
> -		mutex_unlock(&kthread_affinity_lock);
> -
> -		free_cpumask_var(affinity);
> +	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
> +		WARN_ON_ONCE(1);
> +		return;
>   	}
> +
> +	mutex_lock(&kthread_affinity_lock);
> +	WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
> +	list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
> +	/*
> +	 * The node cpumask is racy when read from kthread() but:
> +	 * - a racing CPU going down will either fail on the subsequent
> +	 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
> +	 *   afterwards by the scheduler.
> +	 * - a racing CPU going up will be handled by kthreads_online_cpu()
> +	 */
> +	kthread_fetch_affinity(kthread, affinity);
> +	set_cpus_allowed_ptr(current, affinity);
> +	mutex_unlock(&kthread_affinity_lock);
> +
> +	free_cpumask_var(affinity);
>   }
>   
>   static int kthread(void *_create)
> @@ -924,8 +922,11 @@ static int kthreads_online_cpu(unsigned int cpu)
>   			ret = -EINVAL;
>   			continue;
>   		}
> -		kthread_fetch_affinity(k, affinity);
> -		set_cpus_allowed_ptr(k->task, affinity);
> +
> +		if (k->preferred_affinity || k->node != NUMA_NO_NODE) {
> +			kthread_fetch_affinity(k, affinity);
> +			set_cpus_allowed_ptr(k->task, affinity);
> +		}
>   	}

My understanding of kthreads_online_cpu() is that hotplug won't affect
the affinity returned from kthread_fetch_affinity(). However,
set_cpus_allowed_ptr() will mask out all the offline CPUs. So if the
given "cpu" to be brought online is in the returned affinity, we should
call set_cpus_allowed_ptr() to add this cpu into its affinity mask,
though the current code will call it even if that is not strictly
necessary. This change will not do this update for a NUMA_NO_NODE
kthread with no preferred_affinity; is this a problem?

Cheers,
Longman
Re: [PATCH 22/33] kthread: Include unbound kthreads in the managed affinity list
Posted by Frederic Weisbecker 5 months ago
On Tue, Oct 21, 2025 at 06:42:59PM -0400, Waiman Long wrote:
> 
> On 10/13/25 4:31 PM, Frederic Weisbecker wrote:
> > The managed affinity list currently contains only unbound kthreads that
> > have affinity preferences. Unbound kthreads that are globally affine by
> > default are not on the list because their affinity is automatically
> > managed by the scheduler (through the fallback housekeeping mask) and by
> > cpuset.
> > 
> > However, in order to preserve the preferred affinity of kthreads, cpuset
> > will delegate the propagation of isolated partition updates to the
> > housekeeping and kthread code.
> > 
> > Prepare for that by including all unbound kthreads in the managed
> > affinity list.
> > 
> > Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> > ---
> >   kernel/kthread.c | 59 ++++++++++++++++++++++++------------------------
> >   1 file changed, 30 insertions(+), 29 deletions(-)
> > 
> > diff --git a/kernel/kthread.c b/kernel/kthread.c
> > index c4dd967e9e9c..cba3d297f267 100644
> > --- a/kernel/kthread.c
> > +++ b/kernel/kthread.c
> > @@ -365,9 +365,10 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
> >   	if (kthread->preferred_affinity) {
> >   		pref = kthread->preferred_affinity;
> >   	} else {
> > -		if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
> > -			return;
> > -		pref = cpumask_of_node(kthread->node);
> > +		if (kthread->node == NUMA_NO_NODE)
> > +			pref = housekeeping_cpumask(HK_TYPE_KTHREAD);
> > +		else
> > +			pref = cpumask_of_node(kthread->node);
> >   	}
> >   	cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
> > @@ -380,32 +381,29 @@ static void kthread_affine_node(void)
> >   	struct kthread *kthread = to_kthread(current);
> >   	cpumask_var_t affinity;
> > -	WARN_ON_ONCE(kthread_is_per_cpu(current));
> > +	if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
> > +		return;
> > -	if (kthread->node == NUMA_NO_NODE) {
> > -		housekeeping_affine(current, HK_TYPE_KTHREAD);
> > -	} else {
> > -		if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
> > -			WARN_ON_ONCE(1);
> > -			return;
> > -		}
> > -
> > -		mutex_lock(&kthread_affinity_lock);
> > -		WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
> > -		list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
> > -		/*
> > -		 * The node cpumask is racy when read from kthread() but:
> > -		 * - a racing CPU going down will either fail on the subsequent
> > -		 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
> > -		 *   afterwards by the scheduler.
> > -		 * - a racing CPU going up will be handled by kthreads_online_cpu()
> > -		 */
> > -		kthread_fetch_affinity(kthread, affinity);
> > -		set_cpus_allowed_ptr(current, affinity);
> > -		mutex_unlock(&kthread_affinity_lock);
> > -
> > -		free_cpumask_var(affinity);
> > +	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
> > +		WARN_ON_ONCE(1);
> > +		return;
> >   	}
> > +
> > +	mutex_lock(&kthread_affinity_lock);
> > +	WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
> > +	list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
> > +	/*
> > +	 * The node cpumask is racy when read from kthread() but:
> > +	 * - a racing CPU going down will either fail on the subsequent
> > +	 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
> > +	 *   afterwards by the scheduler.
> > +	 * - a racing CPU going up will be handled by kthreads_online_cpu()
> > +	 */
> > +	kthread_fetch_affinity(kthread, affinity);
> > +	set_cpus_allowed_ptr(current, affinity);
> > +	mutex_unlock(&kthread_affinity_lock);
> > +
> > +	free_cpumask_var(affinity);
> >   }
> >   static int kthread(void *_create)
> > @@ -924,8 +922,11 @@ static int kthreads_online_cpu(unsigned int cpu)
> >   			ret = -EINVAL;
> >   			continue;
> >   		}
> > -		kthread_fetch_affinity(k, affinity);
> > -		set_cpus_allowed_ptr(k->task, affinity);
> > +
> > +		if (k->preferred_affinity || k->node != NUMA_NO_NODE) {
> > +			kthread_fetch_affinity(k, affinity);
> > +			set_cpus_allowed_ptr(k->task, affinity);
> > +		}
> >   	}
> 
> My understanding of kthreads_online_cpu() is that hotplug won't affect the
> affinity returned from kthread_fetch_affinity().

It should. The onlining CPU is considered online at this point and might be
part of the mask returned by kthread_fetch_affinity().

> However, set_cpus_allowed_ptr() will mask out all the offline CPUs. So if the
> given "cpu" to be brought online is in the returned affinity, we should call
> set_cpus_allowed_ptr() to add this cpu into its affinity mask, though the
> current code will call it even if that is not strictly necessary.

I'm not sure I understand what you mean.

> This change will not do this update for a NUMA_NO_NODE kthread with no
> preferred_affinity; is this a problem?

Ah, so unbound kthreads without preferred affinity are already affine to all
possible CPUs (or housekeeping), whether those CPUs are online or not. So we
don't need to add newly online CPUs to them.

Kthreads with a preferred affinity or node are different because if none of
their preferred CPUs are online, they must be affine to housekeeping. But as
soon as one of their preferred CPUs becomes online, they must be made affine
to it.

Hence the different treatment. I'm adding a big comment to explain that.
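
For illustration only, here is a rough sketch of how the kthreads_online_cpu()
hunk above might read once such a comment is added. This is not the actual
follow-up patch: the -EINVAL bookkeeping from the existing loop is omitted,
and the comment wording is only a guess at what that "big comment" could say.

	list_for_each_entry(k, &kthread_affinity_list, affinity_node) {
		/*
		 * Kthreads with neither a preferred cpumask nor a NUMA node
		 * are already affine to every possible housekeeping CPU,
		 * online or not, so a newly onlined CPU changes nothing for
		 * them.  Kthreads with a preferred cpumask or node may have
		 * fallen back to the housekeeping CPUs while none of their
		 * preferred CPUs were online: re-apply their preference now
		 * that @cpu is up.
		 */
		if (k->preferred_affinity || k->node != NUMA_NO_NODE) {
			kthread_fetch_affinity(k, affinity);
			set_cpus_allowed_ptr(k->task, affinity);
		}
	}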

Thanks!

-- 
Frederic Weisbecker
SUSE Labs