[PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping

Qiliang Yuan posted 15 patches 1 week ago
[PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping
Posted by Qiliang Yuan 1 week ago
Subsystem housekeeping masks are currently static and can only be set
via boot-time parameters (isolcpus, nohz_full, etc.). There is no
userspace interface to reconfigure these boundaries at runtime.

Implement the DHEI sysfs interface under /sys/kernel/housekeeping.

This enables userspace to independently reconfigure different kernel
services' affinities without a reboot.

Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
 kernel/sched/isolation.c | 89 ++++++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 685cc0df1bd9f..1c867784d155b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,7 +8,12 @@
  *
  */
 #include <linux/sched/isolation.h>
+#include <linux/capability.h>
 #include <linux/mutex.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
 #include "sched.h"
@@ -16,9 +21,17 @@
 enum hk_flags {
 	HK_FLAG_DOMAIN		= BIT(HK_TYPE_DOMAIN),
 	HK_FLAG_MANAGED_IRQ	= BIT(HK_TYPE_MANAGED_IRQ),
-	HK_FLAG_KERNEL_NOISE	= BIT(HK_TYPE_KERNEL_NOISE),
+	HK_FLAG_TICK		= BIT(HK_TYPE_TICK),
+	HK_FLAG_TIMER		= BIT(HK_TYPE_TIMER),
+	HK_FLAG_RCU		= BIT(HK_TYPE_RCU),
+	HK_FLAG_MISC		= BIT(HK_TYPE_MISC),
+	HK_FLAG_WQ		= BIT(HK_TYPE_WQ),
+	HK_FLAG_KTHREAD		= BIT(HK_TYPE_KTHREAD),
 };
 
+#define HK_FLAG_KERNEL_NOISE (HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | \
+			      HK_FLAG_MISC | HK_FLAG_WQ | HK_FLAG_KTHREAD)
+
 static DEFINE_MUTEX(housekeeping_mutex);
 static BLOCKING_NOTIFIER_HEAD(housekeeping_notifier_list);
 DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -44,6 +57,9 @@ static ssize_t smt_aware_store(struct kobject *kobj,
 {
 	bool val;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (kstrtobool(buf, &val))
 		return -EINVAL;
 
@@ -53,7 +69,7 @@ static ssize_t smt_aware_store(struct kobject *kobj,
 }
 
 static struct kobj_attribute smt_aware_attr =
-	__ATTR(smt_aware_mode, 0644, smt_aware_show, smt_aware_store);
+	__ATTR(smt_aware_mode, 0600, smt_aware_show, smt_aware_store);
 
 bool housekeeping_enabled(enum hk_type type)
 {
@@ -171,6 +187,9 @@ static ssize_t housekeeping_store(struct kobject *kobject,
 	cpumask_var_t new_mask;
 	int err;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
 		return -ENOMEM;
 
@@ -178,42 +197,26 @@ static ssize_t housekeeping_store(struct kobject *kobject,
 	if (err)
 		goto out_free;
 
-	/* Safety check: must have at least one online CPU for housekeeping */
-	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+	if (cpumask_empty(new_mask) ||
+	    !cpumask_intersects(new_mask, cpu_online_mask)) {
 		err = -EINVAL;
 		goto out_free;
 	}
 
-	if (housekeeping_smt_aware) {
-		int cpu, sibling;
-		cpumask_var_t tmp_mask;
+	mutex_lock(&housekeeping_mutex);
 
-		if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) {
-			err = -ENOMEM;
-			goto out_free;
-		}
+	if (housekeeping_smt_aware) {
+		int cpu;
 
-		cpumask_copy(tmp_mask, new_mask);
-		for_each_cpu(cpu, tmp_mask) {
-			for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
-				if (!cpumask_test_cpu(sibling, tmp_mask)) {
-					/* SMT sibling should stay grouped */
-					cpumask_clear_cpu(cpu, new_mask);
-					break;
-				}
+		for_each_cpu(cpu, new_mask) {
+			if (!cpumask_subset(topology_sibling_cpumask(cpu),
+					    new_mask)) {
+				err = -EINVAL;
+				goto out_unlock;
 			}
 		}
-		free_cpumask_var(tmp_mask);
-
-		/* Re-check after SMT sync */
-		if (!cpumask_intersects(new_mask, cpu_online_mask)) {
-			err = -EINVAL;
-			goto out_free;
-		}
 	}
 
-	mutex_lock(&housekeeping_mutex);
-
 	if (!housekeeping.cpumasks[type]) {
 		if (!alloc_cpumask_var(&housekeeping.cpumasks[type], GFP_KERNEL)) {
 			err = -ENOMEM;
@@ -242,7 +245,7 @@ static ssize_t housekeeping_store(struct kobject *kobject,
 }
 
 static struct hk_attribute housekeeping_attrs[HK_TYPE_MAX];
-static struct attribute *housekeeping_attr_ptr[HK_TYPE_MAX + 1];
+static struct attribute *housekeeping_attr_ptr[HK_TYPE_MAX + 2];
 
 static const struct attribute_group housekeeping_attr_group = {
 	.attrs = housekeeping_attr_ptr,
@@ -265,28 +268,22 @@ static int __init housekeeping_sysfs_init(void)
 		housekeeping_attrs[i].type = i;
 		sysfs_attr_init(&housekeeping_attrs[i].kattr.attr);
 		housekeeping_attrs[i].kattr.attr.name = hk_type_names[i];
-		housekeeping_attrs[i].kattr.attr.mode = 0644;
+		housekeeping_attrs[i].kattr.attr.mode = 0600;
 		housekeeping_attrs[i].kattr.show = housekeeping_show;
 		housekeeping_attrs[i].kattr.store = housekeeping_store;
 		housekeeping_attr_ptr[j++] = &housekeeping_attrs[i].kattr.attr;
 	}
+
+	housekeeping_attr_ptr[j++] = &smt_aware_attr.attr;
 	housekeeping_attr_ptr[j] = NULL;
 
 	ret = sysfs_create_group(housekeeping_kobj, &housekeeping_attr_group);
-	if (ret)
-		goto err_group;
-
-	ret = sysfs_create_file(housekeeping_kobj, &smt_aware_attr.attr);
-	if (ret)
-		goto err_file;
+	if (ret) {
+		kobject_put(housekeeping_kobj);
+		return ret;
+	}
 
 	return 0;
-
-err_file:
-	sysfs_remove_group(housekeeping_kobj, &housekeeping_attr_group);
-err_group:
-	kobject_put(housekeeping_kobj);
-	return ret;
 }
 late_initcall(housekeeping_sysfs_init);
 
@@ -313,8 +310,12 @@ static void __init housekeeping_setup_type(enum hk_type type,
 	if (!slab_is_available())
 		gfp = GFP_NOWAIT;
 
-	if (!housekeeping.cpumasks[type])
-		alloc_cpumask_var(&housekeeping.cpumasks[type], gfp);
+	if (!housekeeping.cpumasks[type]) {
+		if (!alloc_cpumask_var(&housekeeping.cpumasks[type], gfp)) {
+			pr_err("housekeeping: failed to allocate cpumask for type %d\n", type);
+			return;
+		}
+	}
 
 	cpumask_copy(housekeeping.cpumasks[type],
 		     housekeeping_staging);

-- 
2.43.0
Re: [PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping
Posted by Peter Zijlstra 1 week ago
On Wed, Mar 25, 2026 at 05:09:44PM +0800, Qiliang Yuan wrote:
> Subsystem housekeeping masks are currently static and can only be set
> via boot-time parameters (isolcpus, nohz_full, etc.). There is no
> userspace interface to reconfigure these boundaries at runtime.
> 
> Implement the DHEI sysfs interface under /sys/kernel/housekeeping.
> 

Why? What was wrong with cpusets?
Re: [PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping
Posted by Qiliang Yuan 2 days, 14 hours ago
On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote:
> Why? What was wrong with cpusets?

This is the central point of the architecture. The distinction I was 
trying to address is:

1. Task Isolation (Current CPUSets):
   The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`) 
   is excellent at managing task placement and load balancing. It 
   ensures no user tasks are pushed to isolated CPUs.

2. Kernel Overhead Isolation (Housekeeping):
   Currently, `cpusets` do not manage kernel-internal overhead like RCU 
   callbacks, timers, or unbound workqueues. These are managed by the 
   global `housekeeping_cpumask`, which is settled at boot via 
   `isolcpus`/`nohz_full` and is static.

DHEI fills this second gap by making the housekeeping mask dynamic. 
However, I agree that a parallel sysfs interface is redundant.

In V13, I will move the control interface to `cpuset`. The root cpuset 
will serve as the primary interface, allowing changes in the cpuset 
partition state to automatically trigger the migration of kernel 
housekeeping overhead. This achieves "Full Dynamic Isolation" (both tasks 
and kernel overhead) through a single, unified interface.

Best regards,
Qiliang
Re: [PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping
Posted by Waiman Long 1 day, 11 hours ago
On 3/30/26 7:46 AM, Qiliang Yuan wrote:
> On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote:
>> Why? What was wrong with cpusets?
> This is the central point of the architecture. The distinction I was
> trying to address is:
>
> 1. Task Isolation (Current CPUSets):
>     The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`)
>     is excellent at managing task placement and load balancing. It
>     ensures no user tasks are pushed to isolated CPUs.
>
> 2. Kernel Overhead Isolation (Housekeeping):
>     Currently, `cpusets` do not manage kernel-internal overhead like RCU
>     callbacks, timers, or unbound workqueues. These are managed by the
>     global `housekeeping_cpumask`, which is settled at boot via
>     `isolcpus`/`nohz_full` and is static.

My plan is to extend the cpuset isolated partition mechanism to isolate 
other kernel noise currently covered by the nohz_full and managed_irq 
boot command line parameters. That will make the HK_TYPE_KERNEL_NOISE cpumask 
modifiable at run time. It is not a direct modification of the HK 
cpumasks as advocated by this patch series but an indirect one via the 
creation of the appropriate isolated cpuset partitions.

Cheers,
Longman
Re: [PATCH 13/15] sched/isolation: Implement sysfs interface for dynamic housekeeping
Posted by Tejun Heo 2 days, 7 hours ago
cc'ing Waiman.

On Mon, Mar 30, 2026 at 07:46:20PM +0800, Qiliang Yuan wrote:
> On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote:
> > Why? What was wrong with cpusets?
> 
> This is the central point of the architecture. The distinction I was 
> trying to address is:
> 
> 1. Task Isolation (Current CPUSets):
>    The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`) 
>    is excellent at managing task placement and load balancing. It 
>    ensures no user tasks are pushed to isolated CPUs.
> 
> 2. Kernel Overhead Isolation (Housekeeping):
>    Currently, `cpusets` do not manage kernel-internal overhead like RCU 
>    callbacks, timers, or unbound workqueues. These are managed by the 
>    global `housekeeping_cpumask`, which is settled at boot via 
>    `isolcpus`/`nohz_full` and is static.
> 
> DHEI fills this second gap by making the housekeeping mask dynamic. 
> However, I agree that a parallel sysfs interface is redundant.
> 
> In V13, I will move the control interface to `cpuset`. The root cpuset 
> will serve as the primary interface, allowing changes in the cpuset 
> partition state to automatically trigger the migration of kernel 
> housekeeping overhead. This achieves "Full Dynamic Isolation" (both tasks 
> and kernel overhead) through a single, unified interface.

Please discuss with Frederic and Waiman first because they have been working
towards making cpuset cover what cpuisol does. I don't think we want two
separate mechanisms for the same thing and don't see why this would need to
be its own thing when it has to be coupled with task isolation anyway.

Thanks.

-- 
tejun