Subsystem housekeeping masks are currently static and can only be set
via boot-time parameters (isolcpus, nohz_full, etc.). There is no
userspace interface to reconfigure these boundaries at runtime.
Implement the DHEI sysfs interface under /sys/kernel/housekeeping.
This enables userspace to independently reconfigure different kernel
services' affinities without a reboot.
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
kernel/sched/isolation.c | 89 ++++++++++++++++++++++++------------------------
1 file changed, 45 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 685cc0df1bd9f..1c867784d155b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,7 +8,12 @@
*
*/
#include <linux/sched/isolation.h>
+#include <linux/capability.h>
#include <linux/mutex.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include "sched.h"
@@ -16,9 +21,17 @@
enum hk_flags {
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
+ HK_FLAG_TICK = BIT(HK_TYPE_TICK),
+ HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
+ HK_FLAG_RCU = BIT(HK_TYPE_RCU),
+ HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_WQ = BIT(HK_TYPE_WQ),
+ HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
};
+#define HK_FLAG_KERNEL_NOISE (HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | \
+ HK_FLAG_MISC | HK_FLAG_WQ | HK_FLAG_KTHREAD)
+
static DEFINE_MUTEX(housekeeping_mutex);
static BLOCKING_NOTIFIER_HEAD(housekeeping_notifier_list);
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -44,6 +57,9 @@ static ssize_t smt_aware_store(struct kobject *kobj,
{
bool val;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (kstrtobool(buf, &val))
return -EINVAL;
@@ -53,7 +69,7 @@ static ssize_t smt_aware_store(struct kobject *kobj,
}
static struct kobj_attribute smt_aware_attr =
- __ATTR(smt_aware_mode, 0644, smt_aware_show, smt_aware_store);
+ __ATTR(smt_aware_mode, 0600, smt_aware_show, smt_aware_store);
bool housekeeping_enabled(enum hk_type type)
{
@@ -171,6 +187,9 @@ static ssize_t housekeeping_store(struct kobject *kobject,
cpumask_var_t new_mask;
int err;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
@@ -178,42 +197,26 @@ static ssize_t housekeeping_store(struct kobject *kobject,
if (err)
goto out_free;
- /* Safety check: must have at least one online CPU for housekeeping */
- if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+ if (cpumask_empty(new_mask) ||
+ !cpumask_intersects(new_mask, cpu_online_mask)) {
err = -EINVAL;
goto out_free;
}
- if (housekeeping_smt_aware) {
- int cpu, sibling;
- cpumask_var_t tmp_mask;
+ mutex_lock(&housekeeping_mutex);
- if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) {
- err = -ENOMEM;
- goto out_free;
- }
+ if (housekeeping_smt_aware) {
+ int cpu;
- cpumask_copy(tmp_mask, new_mask);
- for_each_cpu(cpu, tmp_mask) {
- for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
- if (!cpumask_test_cpu(sibling, tmp_mask)) {
- /* SMT sibling should stay grouped */
- cpumask_clear_cpu(cpu, new_mask);
- break;
- }
+ for_each_cpu(cpu, new_mask) {
+ if (!cpumask_subset(topology_sibling_cpumask(cpu),
+ new_mask)) {
+ err = -EINVAL;
+ goto out_unlock;
}
}
- free_cpumask_var(tmp_mask);
-
- /* Re-check after SMT sync */
- if (!cpumask_intersects(new_mask, cpu_online_mask)) {
- err = -EINVAL;
- goto out_free;
- }
}
- mutex_lock(&housekeeping_mutex);
-
if (!housekeeping.cpumasks[type]) {
if (!alloc_cpumask_var(&housekeeping.cpumasks[type], GFP_KERNEL)) {
err = -ENOMEM;
@@ -242,7 +245,7 @@ static ssize_t housekeeping_store(struct kobject *kobject,
}
static struct hk_attribute housekeeping_attrs[HK_TYPE_MAX];
-static struct attribute *housekeeping_attr_ptr[HK_TYPE_MAX + 1];
+static struct attribute *housekeeping_attr_ptr[HK_TYPE_MAX + 2];
static const struct attribute_group housekeeping_attr_group = {
.attrs = housekeeping_attr_ptr,
@@ -265,28 +268,22 @@ static int __init housekeeping_sysfs_init(void)
housekeeping_attrs[i].type = i;
sysfs_attr_init(&housekeeping_attrs[i].kattr.attr);
housekeeping_attrs[i].kattr.attr.name = hk_type_names[i];
- housekeeping_attrs[i].kattr.attr.mode = 0644;
+ housekeeping_attrs[i].kattr.attr.mode = 0600;
housekeeping_attrs[i].kattr.show = housekeeping_show;
housekeeping_attrs[i].kattr.store = housekeeping_store;
housekeeping_attr_ptr[j++] = &housekeeping_attrs[i].kattr.attr;
}
+
+ housekeeping_attr_ptr[j++] = &smt_aware_attr.attr;
housekeeping_attr_ptr[j] = NULL;
ret = sysfs_create_group(housekeeping_kobj, &housekeeping_attr_group);
- if (ret)
- goto err_group;
-
- ret = sysfs_create_file(housekeeping_kobj, &smt_aware_attr.attr);
- if (ret)
- goto err_file;
+ if (ret) {
+ kobject_put(housekeeping_kobj);
+ return ret;
+ }
return 0;
-
-err_file:
- sysfs_remove_group(housekeeping_kobj, &housekeeping_attr_group);
-err_group:
- kobject_put(housekeeping_kobj);
- return ret;
}
late_initcall(housekeeping_sysfs_init);
@@ -313,8 +310,12 @@ static void __init housekeeping_setup_type(enum hk_type type,
if (!slab_is_available())
gfp = GFP_NOWAIT;
- if (!housekeeping.cpumasks[type])
- alloc_cpumask_var(&housekeeping.cpumasks[type], gfp);
+ if (!housekeeping.cpumasks[type]) {
+ if (!alloc_cpumask_var(&housekeeping.cpumasks[type], gfp)) {
+ pr_err("housekeeping: failed to allocate cpumask for type %d\n", type);
+ return;
+ }
+ }
cpumask_copy(housekeeping.cpumasks[type],
housekeeping_staging);
--
2.43.0
On Wed, Mar 25, 2026 at 05:09:44PM +0800, Qiliang Yuan wrote: > Subsystem housekeeping masks are currently static and can only be set > via boot-time parameters (isolcpus, nohz_full, etc.). There is no > userspace interface to reconfigure these boundaries at runtime. > > Implement the DHEI sysfs interface under /sys/kernel/housekeeping. > Why? What was wrong with cpusets?
On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote: > Why? What was wrong with cpusets? This is the central point of the architecture. The distinction I was trying to address is: 1. Task Isolation (Current CPUSets): The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`) is excellent at managing task placement and load balancing. It ensures no user tasks are pushed to isolated CPUs. 2. Kernel Overhead Isolation (Housekeeping): Currently, `cpusets` do not manage kernel-internal overhead like RCU callbacks, timers, or unbound workqueues. These are managed by the global `housekeeping_cpumask`, which is settled at boot via `isolcpus`/`nohz_full` and is static. DHEI fills this second gap by making the housekeeping mask dynamic. However, I agree that a parallel sysfs interface is redundant. In V13, I will move the control interface to `cpuset`. The root cpuset will serve as the primary interface, allowing changes in the cpuset partition state to automatically trigger the migration of kernel housekeeping overhead. This achieves "Full Dynamic Isolation" (both tasks and kernel overhead) through a single, unified interface. Best regards, Qiliang
On 3/30/26 7:46 AM, Qiliang Yuan wrote: > On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote: >> Why? What was wrong with cpusets? > This is the central point of the architecture. The distinction I was > trying to address is: > > 1. Task Isolation (Current CPUSets): > The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`) > is excellent at managing task placement and load balancing. It > ensures no user tasks are pushed to isolated CPUs. > > 2. Kernel Overhead Isolation (Housekeeping): > Currently, `cpusets` do not manage kernel-internal overhead like RCU > callbacks, timers, or unbound workqueues. These are managed by the > global `housekeeping_cpumask`, which is settled at boot via > `isolcpus`/`nohz_full` and is static. My plan is to extend the cpuset isolated partition mechanism to isolate other kernel noise currently covered by the nohz_full and managed_irq boot command line parameters. That will make the HK_TYPE_KERNEL_NOISE cpumask modifiable at run time. It is not a direct modification of the HK cpumasks as advocated by this patch series but an indirect one via the creation of the appropriate isolated cpuset partitions. Cheers, Longman
cc'ing Waiman. On Mon, Mar 30, 2026 at 07:46:20PM +0800, Qiliang Yuan wrote: > On Wed, Mar 25, 2026 at 03:04:32PM +0100, Peter Zijlstra wrote: > > Why? What was wrong with cpusets? > > This is the central point of the architecture. The distinction I was > trying to address is: > > 1. Task Isolation (Current CPUSets): > The `cpuset` subsystem (especially `cpuset.cpus.partition = isolated`) > is excellent at managing task placement and load balancing. It > ensures no user tasks are pushed to isolated CPUs. > > 2. Kernel Overhead Isolation (Housekeeping): > Currently, `cpusets` do not manage kernel-internal overhead like RCU > callbacks, timers, or unbound workqueues. These are managed by the > global `housekeeping_cpumask`, which is settled at boot via > `isolcpus`/`nohz_full` and is static. > > DHEI fills this second gap by making the housekeeping mask dynamic. > However, I agree that a parallel sysfs interface is redundant. > > In V13, I will move the control interface to `cpuset`. The root cpuset > will serve as the primary interface, allowing changes in the cpuset > partition state to automatically trigger the migration of kernel > housekeeping overhead. This achieves "Full Dynamic Isolation" (both tasks > and kernel overhead) through a single, unified interface. Please discuss with Frederic and Waiman first because they have been working towards making cpuset cover what cpuisol does. I don't think we want two separate mechanisms for the same thing and don't see why this would need to be its own thing when it has to be coupled with task isolation anyway. Thanks. -- tejun
© 2016 - 2026 Red Hat, Inc.