The housekeeping CPU masks, set up by the "isolcpus" and "nohz_full"
boot command line options, are used at boot time to exclude selected CPUs
from running some kernel background processes to minimize disturbance
to latency sensitive userspace applications. Some of housekeeping CPU
masks are also checked at run time to avoid using those isolated CPUs.
The cpuset subsystem is now able to dynamically create a set of isolated
CPUs to be used in isolated cpuset partitions. The long term goal is
to make the degree of isolation as close as possible to what can be
done statically using those boot command line options.
This patch is a step in that direction by making the housekeeping CPU
mask APIs exclude the dynamically isolated CPUs when they are called
at run time. The housekeeping CPU masks will fall back to the bootup
default when all the dynamically isolated CPUs are released.
A new housekeeping_exlude_isolcpus() function is added which is to be
called by the cpuset subsystem to provide a list of isolated CPUs to
be excluded.
Signed-off-by: Waiman Long <longman@redhat.com>
---
include/linux/sched/isolation.h | 8 +++
kernel/sched/isolation.c | 112 +++++++++++++++++++++++++++++++-
2 files changed, 119 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 2b461129d1fa..d64fa4e60138 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -27,6 +27,8 @@ extern bool housekeeping_enabled(enum hk_type type);
extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
extern void __init housekeeping_init(void);
+extern int housekeeping_exlude_isolcpus(const struct cpumask *isolcpus,
+ unsigned long flags);
#else
@@ -54,6 +56,12 @@ static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
}
static inline void housekeeping_init(void) { }
+
+static inline int housekeeping_exlude_isolcpus(struct cpumask *isolcpus,
+ unsigned long flags)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_CPU_ISOLATION */
static inline bool housekeeping_cpu(int cpu, enum hk_type type)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 5891e715f00d..3018ba81eb65 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -28,7 +28,16 @@ struct housekeeping {
unsigned long flags;
};
-static struct housekeeping housekeeping;
+static struct housekeeping housekeeping __read_mostly;
+
+/*
+ * Boot time housekeeping cpumask and flags
+ *
+ * If more than one of nohz_full or isolcpus are specified, the cpumask must
+ * be the same or the setup will fail.
+ */
+static cpumask_var_t boot_hk_cpumask;
+static unsigned long boot_hk_flags;
bool housekeeping_enabled(enum hk_type type)
{
@@ -253,3 +262,104 @@ static int __init housekeeping_isolcpus_setup(char *str)
return housekeeping_setup(str, flags);
}
__setup("isolcpus=", housekeeping_isolcpus_setup);
+
+/*
+ * Save bootup housekeeping cpumask and flags
+ */
+static int housekeeping_save(void)
+{
+ enum hk_type type;
+
+ boot_hk_flags = housekeeping.flags;
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ if (!alloc_cpumask_var(&boot_hk_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_copy(boot_hk_cpumask, housekeeping.cpumasks[type]);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Exclude the given dynamically isolated CPUs from the housekeeping CPUs
+ * External synchronization is required to make sure that concurrent call to
+ * this function will not happen.
+ *
+ * [TODO] The housekeeping cpumasks and flags at bootup time are currently
+ * preserved as cpuset dynamic CPU isolation isn't as good as boot time CPU
+ * isolation yet. Once dynamic CPU isolation is close to boot time isolation,
+ * we will not need to save the bootup values and will allow them to be
+ * overridden.
+ *
+ * Return: 0 if successful, an error code if not
+ */
+int housekeeping_exlude_isolcpus(const struct cpumask *isolcpus, unsigned long flags)
+{
+ static unsigned long alloc_flags;
+ static cpumask_var_t tmp_mask;
+ static bool excluded; /* @true if some CPUs have been excluded */
+ static bool inited; /* @true if called before */
+
+ bool isolate_none = !isolcpus || cpumask_empty(isolcpus);
+ enum hk_type type;
+
+ lockdep_assert_cpus_held();
+
+ if (isolate_none && (!inited || !excluded))
+ return 0;
+
+ if (unlikely(!inited)) {
+ if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
+ return -ENOMEM;
+ if (housekeeping.flags) {
+ int err = housekeeping_save();
+
+ if (err)
+ return err;
+ }
+ alloc_flags = housekeeping.flags;
+ inited = true;
+ }
+
+ if (isolate_none) {
+ excluded = false;
+
+ /*
+ * Reset housekeeping to bootup default
+ */
+ for_each_set_bit(type, &boot_hk_flags, HK_TYPE_MAX)
+ cpumask_copy(housekeeping.cpumasks[type], boot_hk_cpumask);
+
+ WRITE_ONCE(housekeeping.flags, boot_hk_flags);
+ if (!boot_hk_flags && static_key_enabled(&housekeeping_overridden))
+ static_key_disable_cpuslocked(&housekeeping_overridden.key);
+ return 0;
+ }
+
+ /*
+ * Setting up the new housekeeping cpumasks
+ */
+ for_each_set_bit(type, &flags, HK_TYPE_MAX) {
+ const struct cpumask *src_mask;
+
+ if (!(BIT(type) & alloc_flags)) {
+ if (!alloc_cpumask_var(&housekeeping.cpumasks[type], GFP_KERNEL))
+ return -ENOMEM;
+ alloc_flags |= BIT(type);
+ }
+ src_mask = (BIT(type) & boot_hk_flags)
+ ? boot_hk_cpumask : cpu_possible_mask;
+ /*
+ * Make sure there is at least one online housekeeping CPU
+ */
+ cpumask_andnot(tmp_mask, src_mask, isolcpus);
+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
+ return -EINVAL; /* Invalid isolated CPUs */
+ cpumask_copy(housekeeping.cpumasks[type], tmp_mask);
+ }
+ WRITE_ONCE(housekeeping.flags, boot_hk_flags | flags);
+ excluded = true;
+ if (!static_key_enabled(&housekeeping_overridden))
+ static_key_enable_cpuslocked(&housekeeping_overridden.key);
+ return 0;
+}
--
2.43.5
Le Wed, Aug 21, 2024 at 10:23:11AM -0400, Waiman Long a écrit :
> The housekeeping CPU masks, set up by the "isolcpus" and "nohz_full"
> boot command line options, are used at boot time to exclude selected CPUs
> from running some kernel background processes to minimize disturbance
> to latency sensitive userspace applications. Some of housekeeping CPU
> masks are also checked at run time to avoid using those isolated CPUs.
>
> The cpuset subsystem is now able to dynamically create a set of isolated
> CPUs to be used in isolated cpuset partitions. The long term goal is
> to make the degree of isolation as close as possible to what can be
> done statically using those boot command line options.
>
> This patch is a step in that direction by making the housekeeping CPU
> mask APIs exclude the dynamically isolated CPUs when they are called
> at run time. The housekeeping CPU masks will fall back to the bootup
> default when all the dynamically isolated CPUs are released.
>
> A new housekeeping_exlude_isolcpus() function is added which is to be
> called by the cpuset subsystem to provide a list of isolated CPUs to
> be excluded.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
It's a bit hard to review this for several reasons:
* first, because I'm doing it three months late, sorry about that
* We need to get the HK_TYPE_KERNEL_NOISE patchset in because the
gazillions types don't help. Let's ping again scheduler people
once -rc1 is released. I'm setting an alarm!
* It's hard to forecast what kind of synchronization will be needed
against housekeeping cpumask updates. I need to audit all the users.
But since all target CPUs are offline, there are just a few things left
to consider. One of them is kthreads affinity and that should be at
least partially solved by the kthread affinity patchset
(https://lore.kernel.org/lkml/20241112142248.20503-1-frederic@kernel.org/)
Hopefully I'll manage to get that in for the upcoming merge window.
Some more thoughts:
> ---
> include/linux/sched/isolation.h | 8 +++
> kernel/sched/isolation.c | 112 +++++++++++++++++++++++++++++++-
> 2 files changed, 119 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
> index 2b461129d1fa..d64fa4e60138 100644
> --- a/include/linux/sched/isolation.h
> +++ b/include/linux/sched/isolation.h
> @@ -27,6 +27,8 @@ extern bool housekeeping_enabled(enum hk_type type);
> extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
> extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
> extern void __init housekeeping_init(void);
> +extern int housekeeping_exlude_isolcpus(const struct cpumask *isolcpus,
> + unsigned long flags);
>
> #else
>
> @@ -54,6 +56,12 @@ static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
> }
>
> static inline void housekeeping_init(void) { }
> +
> +static inline int housekeeping_exlude_isolcpus(struct cpumask *isolcpus,
> + unsigned long flags)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif /* CONFIG_CPU_ISOLATION */
>
> static inline bool housekeeping_cpu(int cpu, enum hk_type type)
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index 5891e715f00d..3018ba81eb65 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -28,7 +28,16 @@ struct housekeeping {
> unsigned long flags;
> };
>
> -static struct housekeeping housekeeping;
> +static struct housekeeping housekeeping __read_mostly;
> +
> +/*
> + * Boot time housekeeping cpumask and flags
> + *
> + * If more than one of nohz_full or isolcpus are specified, the cpumask must
> + * be the same or the setup will fail.
> + */
> +static cpumask_var_t boot_hk_cpumask;
> +static unsigned long boot_hk_flags;
>
> bool housekeeping_enabled(enum hk_type type)
> {
> @@ -253,3 +262,104 @@ static int __init housekeeping_isolcpus_setup(char *str)
> return housekeeping_setup(str, flags);
> }
> __setup("isolcpus=", housekeeping_isolcpus_setup);
> +
> +/*
> + * Save bootup housekeeping cpumask and flags
> + */
> +static int housekeeping_save(void)
> +{
> + enum hk_type type;
> +
> + boot_hk_flags = housekeeping.flags;
> + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> + if (!alloc_cpumask_var(&boot_hk_cpumask, GFP_KERNEL))
> + return -ENOMEM;
So this leaks and overwrites the mask for each flags?
Also only HK_TYPE_KERNEL_NOISE will be interesting.
> + cpumask_copy(boot_hk_cpumask, housekeeping.cpumasks[type]);
> + break;
> + }
> + return 0;
> +}
Should it be done on boot when housekeeping is allocated?
Thanks.
Hello.
I recently liked the idea of considering isolated CPUs a static (boot
time) resource and only use cpusets to place (or remove) sensitive
workload from those selected CPUs depending on current needs. (Yes, this
may not efficiently utilize the isolated CPUs when reserve them based on
maximum needs of a node.)
On Wed, Aug 21, 2024 at 10:23:11AM GMT, Waiman Long <longman@redhat.com> wrote:
> This patch is a step in that direction by making the housekeeping CPU
> mask APIs exclude the dynamically isolated CPUs when they are called
> at run time. The housekeeping CPU masks will fall back to the bootup
> default when all the dynamically isolated CPUs are released.
But when I look at it with the dynamism in mind, I would expect that
some API like housekeeping_setup_type(), i.e. modify the set of isolated
CPUs are requested and leave it up to the isolation implementation to
propagate any changes to respective subsystems. And return an error of
type contains a flag for which dynamism isn't implemented yet or not
possible.
The boot time value would only be the initial default, but the value
would be mutated by this API. There's IMO no need to revert to that.
(Also someone mentioned that this could share lots of code with CPU
offlining/onlining.)
> A new housekeeping_exlude_isolcpus() function is added which is to be
exclude
> called by the cpuset subsystem to provide a list of isolated CPUs to
> be excluded.
HTH,
Michal
(-Cc: lizefan.x@bytedance.com)
On 11/15/24 10:45 AM, Michal Koutný wrote: > (Also someone mentioned that this could share lots of code with CPU > offlining/onlining.) Yes, that is true. The simplest way to do that is to offline the CPUs to be isolated, change the housekeeping masks and then online those CPUs again. That is good for managing a single isolated partition. However, Daniel had told me that CPU hotplug code could cause latency spike in existing isolated CPUs. That could be a problem if we have more than one isolated partitions to manage. So more investigation will be needed in this. This is still the direction we are going initially, but first we need to enable dynamic changes to the housekeeping masks first. Cheers, Longman
On 11/15/24 10:45 AM, Michal Koutný wrote: > Hello. > > I recently liked the idea of considering isolated CPUs a static (boot > time) resource and only use cpusets to place (or remove) sensitive > workload from those selected CPUs depending on current needs. (Yes, this > may not efficiently utilize the isolated CPUs when reserve them based on > maximum needs of a node.) Thanks for taking a look at this. Yes, I am moving in this direction too. Boot time statically isolated CPUs have better isolation than is currently possible if we do it dynamically at run time, though we are trying to close the gap. > > > On Wed, Aug 21, 2024 at 10:23:11AM GMT, Waiman Long <longman@redhat.com> wrote: >> This patch is a step in that direction by making the housekeeping CPU >> mask APIs exclude the dynamically isolated CPUs when they are called >> at run time. The housekeeping CPU masks will fall back to the bootup >> default when all the dynamically isolated CPUs are released. > But when I look at it with the dynamism in mind, I would expect that > some API like housekeeping_setup_type(), i.e. modify the set of isolated > CPUs are requested and leave it up to the isolation implementation to > propagate any changes to respective subsystems. And return an error of > type contains a flag for which dynamism isn't implemented yet or not > possible. There are currently 9 different hk_type's defined in include/linux/sched/isolation.h. We are now trying to reduce their number as some of them cannot be set independently. See [1]. I am thinking about doing dynamism in the best effort basis. Of course, we could expose some information about what aspect of dynamic isolation can be enabled at the moment, if necessary. Cheers, Longman
On 11/15/24 2:32 PM, Waiman Long wrote: > On 11/15/24 10:45 AM, Michal Koutný wrote: >> Hello. >> >> I recently liked the idea of considering isolated CPUs a static (boot >> time) resource and only use cpusets to place (or remove) sensitive >> workload from those selected CPUs depending on current needs. (Yes, this >> may not efficiently utilize the isolated CPUs when reserve them based on >> maximum needs of a node.) > > Thanks for taking a look at this. > > Yes, I am moving in this direction too. Boot time statically isolated > CPUs have better isolation than is currently possible if we do it > dynamically at run time, though we are trying to close the gap. > >> >> >> On Wed, Aug 21, 2024 at 10:23:11AM GMT, Waiman Long >> <longman@redhat.com> wrote: >>> This patch is a step in that direction by making the housekeeping CPU >>> mask APIs exclude the dynamically isolated CPUs when they are called >>> at run time. The housekeeping CPU masks will fall back to the bootup >>> default when all the dynamically isolated CPUs are released. >> But when I look at it with the dynamism in mind, I would expect that >> some API like housekeeping_setup_type(), i.e. modify the set of isolated >> CPUs are requested and leave it up to the isolation implementation to >> propagate any changes to respective subsystems. And return an error of >> type contains a flag for which dynamism isn't implemented yet or not >> possible. > > There are currently 9 different hk_type's defined in > include/linux/sched/isolation.h. We are now trying to reduce their > number as some of them cannot be set independently. See [1]. I am > thinking about doing dynamism in the best effort basis. Of course, we > could expose some information about what aspect of dynamic isolation > can be enabled at the moment, if necessary. Forgot to put the link. [1] https://lore.kernel.org/lkml/20240921190720.106195-1-longman@redhat.com/ > > Cheers, > Longman >
On 8/21/24 10:23, Waiman Long wrote:
> The housekeeping CPU masks, set up by the "isolcpus" and "nohz_full"
> boot command line options, are used at boot time to exclude selected CPUs
> from running some kernel background processes to minimize disturbance
> to latency sensitive userspace applications. Some of housekeeping CPU
> masks are also checked at run time to avoid using those isolated CPUs.
>
> The cpuset subsystem is now able to dynamically create a set of isolated
> CPUs to be used in isolated cpuset partitions. The long term goal is
> to make the degree of isolation as close as possible to what can be
> done statically using those boot command line options.
>
> This patch is a step in that direction by making the housekeeping CPU
> mask APIs exclude the dynamically isolated CPUs when they are called
> at run time. The housekeeping CPU masks will fall back to the bootup
> default when all the dynamically isolated CPUs are released.
>
> A new housekeeping_exlude_isolcpus() function is added which is to be
> called by the cpuset subsystem to provide a list of isolated CPUs to
> be excluded.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
> include/linux/sched/isolation.h | 8 +++
> kernel/sched/isolation.c | 112 +++++++++++++++++++++++++++++++-
> 2 files changed, 119 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
> index 2b461129d1fa..d64fa4e60138 100644
> --- a/include/linux/sched/isolation.h
> +++ b/include/linux/sched/isolation.h
> @@ -27,6 +27,8 @@ extern bool housekeeping_enabled(enum hk_type type);
> extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
> extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
> extern void __init housekeeping_init(void);
> +extern int housekeeping_exlude_isolcpus(const struct cpumask *isolcpus,
> + unsigned long flags);
>
> #else
>
> @@ -54,6 +56,12 @@ static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
> }
>
> static inline void housekeeping_init(void) { }
> +
> +static inline int housekeeping_exlude_isolcpus(struct cpumask *isolcpus,
> + unsigned long flags)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif /* CONFIG_CPU_ISOLATION */
>
> static inline bool housekeeping_cpu(int cpu, enum hk_type type)
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index 5891e715f00d..3018ba81eb65 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -28,7 +28,16 @@ struct housekeeping {
> unsigned long flags;
> };
>
> -static struct housekeeping housekeeping;
> +static struct housekeeping housekeeping __read_mostly;
> +
> +/*
> + * Boot time housekeeping cpumask and flags
> + *
> + * If more than one of nohz_full or isolcpus are specified, the cpumask must
> + * be the same or the setup will fail.
> + */
> +static cpumask_var_t boot_hk_cpumask;
> +static unsigned long boot_hk_flags;
>
> bool housekeeping_enabled(enum hk_type type)
> {
> @@ -253,3 +262,104 @@ static int __init housekeeping_isolcpus_setup(char *str)
> return housekeeping_setup(str, flags);
> }
> __setup("isolcpus=", housekeeping_isolcpus_setup);
> +
> +/*
> + * Save bootup housekeeping cpumask and flags
> + */
> +static int housekeeping_save(void)
> +{
> + enum hk_type type;
> +
> + boot_hk_flags = housekeeping.flags;
> + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> + if (!alloc_cpumask_var(&boot_hk_cpumask, GFP_KERNEL))
> + return -ENOMEM;
> + cpumask_copy(boot_hk_cpumask, housekeeping.cpumasks[type]);
> + break;
> + }
> + return 0;
> +}
> +
> +/*
> + * Exclude the given dynamically isolated CPUs from the housekeeping CPUs
> + * External synchronization is required to make sure that concurrent call to
> + * this function will not happen.
> + *
> + * [TODO] The housekeeping cpumasks and flags at bootup time are currently
> + * preserved as cpuset dynamic CPU isolation isn't as good as boot time CPU
> + * isolation yet. Once dynamic CPU isolation is close to boot time isolation,
> + * we will not need to save the bootup values and will allow them to be
> + * overridden.
> + *
> + * Return: 0 if successful, an error code if not
> + */
> +int housekeeping_exlude_isolcpus(const struct cpumask *isolcpus, unsigned long flags)
> +{
> + static unsigned long alloc_flags;
> + static cpumask_var_t tmp_mask;
> + static bool excluded; /* @true if some CPUs have been excluded */
> + static bool inited; /* @true if called before */
> +
> + bool isolate_none = !isolcpus || cpumask_empty(isolcpus);
> + enum hk_type type;
> +
> + lockdep_assert_cpus_held();
> +
> + if (isolate_none && (!inited || !excluded))
> + return 0;
> +
> + if (unlikely(!inited)) {
> + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
> + return -ENOMEM;
> + if (housekeeping.flags) {
> + int err = housekeeping_save();
> +
> + if (err)
> + return err;
> + }
> + alloc_flags = housekeeping.flags;
> + inited = true;
> + }
> +
> + if (isolate_none) {
> + excluded = false;
> +
> + /*
> + * Reset housekeeping to bootup default
> + */
> + for_each_set_bit(type, &boot_hk_flags, HK_TYPE_MAX)
> + cpumask_copy(housekeeping.cpumasks[type], boot_hk_cpumask);
> +
> + WRITE_ONCE(housekeeping.flags, boot_hk_flags);
> + if (!boot_hk_flags && static_key_enabled(&housekeeping_overridden))
> + static_key_disable_cpuslocked(&housekeeping_overridden.key);
> + return 0;
> + }
> +
> + /*
> + * Setting up the new housekeeping cpumasks
> + */
> + for_each_set_bit(type, &flags, HK_TYPE_MAX) {
> + const struct cpumask *src_mask;
> +
> + if (!(BIT(type) & alloc_flags)) {
> + if (!alloc_cpumask_var(&housekeeping.cpumasks[type], GFP_KERNEL))
> + return -ENOMEM;
> + alloc_flags |= BIT(type);
> + }
> + src_mask = (BIT(type) & boot_hk_flags)
> + ? boot_hk_cpumask : cpu_possible_mask;
> + /*
> + * Make sure there is at least one online housekeeping CPU
> + */
> + cpumask_andnot(tmp_mask, src_mask, isolcpus);
> + if (!cpumask_intersects(tmp_mask, cpu_online_mask))
> + return -EINVAL; /* Invalid isolated CPUs */
> + cpumask_copy(housekeeping.cpumasks[type], tmp_mask);
> + }
> + WRITE_ONCE(housekeeping.flags, boot_hk_flags | flags);
> + excluded = true;
> + if (!static_key_enabled(&housekeeping_overridden))
> + static_key_enable_cpuslocked(&housekeeping_overridden.key);
> + return 0;
> +}
Any comment or suggestion about this patch?
Thanks,
Longman
© 2016 - 2026 Red Hat, Inc.