[PATCH 2/2] timers/nohz: Avoid /proc/stat idle/iowait fluctuation when cpu hotplug

Xin Zhao posted 2 patches 2 days, 8 hours ago
[PATCH 2/2] timers/nohz: Avoid /proc/stat idle/iowait fluctuation when cpu hotplug
Posted by Xin Zhao 2 days, 8 hours ago
The idle and iowait statistics in /proc/stat are obtained through
get_idle_time and get_iowait_time. Assuming CONFIG_NO_HZ_COMMON is
enabled, when CPU is online, the idle and iowait values use the
idle_sleeptime and iowait_sleeptime statistics from tick_cpu_sched, but
use CPUTIME_IDLE and CPUTIME_IOWAIT items from kernel_cpustat when CPU
is offline. Although /proc/stat do not print statistics of offline CPU,
it still print aggregated statistics for all possible CPUs.
tick_cpu_sched and kernel_cpustat are maintained by different logic,
leading to a significant gap. The first line of the data below shows the
/proc/stat output when only one CPU remains after CPU offline, the second
line shows the /proc/stat output after all CPUs are brought back online:

cpu  2408558 2 916619 4275883 5403 123758 64685 0 0 0
cpu  2408588 2 916693 4200737 4184 123762 64686 0 0 0

Obviously, other values do not experience significant fluctuations, while
idle/iowait statistics show a substantial decrease, which make system CPU
monitoring troublesome.
Introduce get_cpu_idle_time_us_raw and get_cpu_iowait_time_us_raw, so that
/proc/stat logic can use them to get the last raw value of idle_sleeptime
and iowait_sleeptime from tick_cpu_sched without any calculation when CPU
is offline. It avoids /proc/stat idle/iowait fluctuation when cpu hotplug.

Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
 fs/proc/stat.c           |  4 ++++
 include/linux/tick.h     |  4 ++++
 kernel/time/tick-sched.c | 46 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 8b444e862..de13a2e1c 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -28,6 +28,8 @@ u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 
 	if (cpu_online(cpu))
 		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
+	else
+		idle_usecs = get_cpu_idle_time_us_raw(cpu);
 
 	if (idle_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
@@ -44,6 +46,8 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 
 	if (cpu_online(cpu))
 		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+	else
+		iowait_usecs = get_cpu_iowait_time_us_raw(cpu);
 
 	if (iowait_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index ac76ae9fa..bdd38d270 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -139,7 +139,9 @@ extern ktime_t tick_nohz_get_next_hrtimer(void);
 extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_idle_time_us_raw(int cpu);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us_raw(int cpu);
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
@@ -161,7 +163,9 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 	return *delta_next;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_idle_time_us_raw(int cpu) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_iowait_time_us_raw(int cpu) { return -1; }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4d089b290..607fc22d4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -829,6 +829,29 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
+/**
+ * get_cpu_idle_time_us_raw - get the raw total idle time of a CPU
+ * @cpu: CPU number to query
+ *
+ * Return the raw idle time (since boot) for a given CPU, in
+ * microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_idle_time_us_raw(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	if (!tick_nohz_active)
+		return -1;
+
+	return ktime_to_us(ts->idle_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us_raw);
+
 /**
  * get_cpu_iowait_time_us - get the total iowait time of a CPU
  * @cpu: CPU number to query
@@ -855,6 +878,29 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
+/**
+ * get_cpu_iowait_time_us_raw - get the raw total iowait time of a CPU
+ * @cpu: CPU number to query
+ *
+ * Return the raw iowait time (since boot) for a given CPU, in
+ * microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us_raw(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	if (!tick_nohz_active)
+		return -1;
+
+	return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us_raw);
+
 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 {
 	hrtimer_cancel(&ts->sched_timer);
-- 
2.34.1
Re: [PATCH 2/2] timers/nohz: Avoid /proc/stat idle/iowait fluctuation when cpu hotplug
Posted by Ingo Molnar an hour ago
* Xin Zhao <jackzxcui1989@163.com> wrote:

> The idle and iowait statistics in /proc/stat are obtained through
> get_idle_time and get_iowait_time. Assuming CONFIG_NO_HZ_COMMON is
> enabled, when CPU is online, the idle and iowait values use the
> idle_sleeptime and iowait_sleeptime statistics from tick_cpu_sched, but
> use CPUTIME_IDLE and CPUTIME_IOWAIT items from kernel_cpustat when CPU
> is offline. Although /proc/stat do not print statistics of offline CPU,
> it still print aggregated statistics for all possible CPUs.
> tick_cpu_sched and kernel_cpustat are maintained by different logic,
> leading to a significant gap. The first line of the data below shows the
> /proc/stat output when only one CPU remains after CPU offline, the second
> line shows the /proc/stat output after all CPUs are brought back online:
> 
> cpu  2408558 2 916619 4275883 5403 123758 64685 0 0 0
> cpu  2408588 2 916693 4200737 4184 123762 64686 0 0 0

Yeah, that outlier indeed looks suboptimal, and there's 
very little user-space tooling can do to detect it. I 
think your suggestion, to use the 'frozen' values of an 
offline CPU, might as well be the right approach.

What value is printed if the CPU was never online, is 
it properly initialized to zero?


> Obviously, other values do not experience significant fluctuations, while
> idle/iowait statistics show a substantial decrease, which make system CPU
> monitoring troublesome.
> Introduce get_cpu_idle_time_us_raw and get_cpu_iowait_time_us_raw, so that
> /proc/stat logic can use them to get the last raw value of idle_sleeptime
> and iowait_sleeptime from tick_cpu_sched without any calculation when CPU
> is offline. It avoids /proc/stat idle/iowait fluctuation when cpu hotplug.
> 
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> ---
>  fs/proc/stat.c           |  4 ++++
>  include/linux/tick.h     |  4 ++++
>  kernel/time/tick-sched.c | 46 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 54 insertions(+)
> 
> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 8b444e862..de13a2e1c 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -28,6 +28,8 @@ u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
>  
>  	if (cpu_online(cpu))
>  		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> +	else
> +		idle_usecs = get_cpu_idle_time_us_raw(cpu);
>  
>  	if (idle_usecs == -1ULL)
>  		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> @@ -44,6 +46,8 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
>  
>  	if (cpu_online(cpu))
>  		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
> +	else
> +		iowait_usecs = get_cpu_iowait_time_us_raw(cpu);

So why not just use the get_cpu_idle_time_us() and 
get_cpu_iowait_time_us() values unconditionally, for 
all possible_cpus?

The raw/non-raw distinction makes very little sense in 
this context, the read_seqlock_retry loop will always 
succeed after a single step (because there are no 
writers), so the behavior of the full get_cpu_idle/iowait_time_us()
functions should be close to the _raw() variants.

Patch would be much simpler that way.

Thanks,

	Ingo