kernel/sched/psi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
Pavan reported a problem that PSI avgs_work idle shutoff is not
working at all, because the PSI_NONIDLE condition would be observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
only the kworker running avgs_work is on the CPU.
Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker sleeps, psi_avgs_work()
will still always re-arm the avgs_work, so shutoff is not working.
This patch changes to consider current CPU groupc as IDLE if the
kworker running avgs_work is the only task running and no IOWAIT
or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
if other CPUs' groupc are also IDLE.
One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will
be stranded in the per-cpu buckets until avgs_work run next time.
The buckets can hold 4s worth of time, and future activity will wake
the avgs_work with a 2s delay, giving us 2s worth of data we can leave
behind when shutting off the avgs_work. If the kworker runs other work after
avgs_work is shut off and doesn't have any scheduler activity for 2s,
this may be a problem.
Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
kernel/sched/psi.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee2ecc081422..f4cdf6f184ba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ bool only_avgs_work = false;
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ /*
+ * This CPU has only avgs_work kworker running, snapshot the
+ * newest times then don't need to re-arm for this groupc.
+ * Normally this kworker will sleep soon and won't wake
+ * avgs_work back up in psi_group_change().
+ */
+ if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
+ !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
+ only_avgs_work = true;
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
+ if (only_avgs_work)
+ *pchanged_states &= ~(1 << PSI_NONIDLE);
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
--
2.37.2
On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Pavan reported a problem that PSI avgs_work idle shutoff is not
> working at all. Because PSI_NONIDLE condition would be observed in
> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
> only the kworker running avgs_work on the CPU.
>
> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
> still will always re-arm the avgs_work, so shutoff is not working.
>
> This patch changes to consider current CPU groupc as IDLE if the
> kworker running avgs_work is the only task running and no IOWAIT
> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
> if other CPUs' groupc are also IDLE.
>
> One potential problem is that the brief period of non-idle time
> incurred between the aggregation run and the kworker's dequeue will
> be stranded in the per-cpu buckets until avgs_work run next time.
> The buckets can hold 4s worth of time, and future activity will wake
> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
> behind when shut off the avgs_work. If the kworker run other works after
> avgs_work shut off and doesn't have any scheduler activities for 2s,
> this maybe a problem.
>
> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Copying my comments from
https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
in case you want to continue the discussion here...
> ---
> kernel/sched/psi.c | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index ee2ecc081422..f4cdf6f184ba 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> u32 *pchanged_states)
> {
> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> + int current_cpu = raw_smp_processor_id();
> + bool only_avgs_work = false;
> u64 now, state_start;
> enum psi_states s;
> unsigned int seq;
> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + /*
> + * This CPU has only avgs_work kworker running, snapshot the
> + * newest times then don't need to re-arm for this groupc.
> + * Normally this kworker will sleep soon and won't wake
> + * avgs_work back up in psi_group_change().
> + */
> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> + only_avgs_work = true;
Why do you determine only_avgs_work while taking a snapshot? The
read_seqcount_retry() might fail and the loop gets retried, which
might lead to a wrong only_avgs_work value if the state changes
between retries. I think it's safer to do this after the snapshot was
taken and to use tasks[NR_RUNNING] instead of groupc->tasks.
> } while (read_seqcount_retry(&groupc->seq, seq));
>
> /* Calculate state time deltas against the previous snapshot */
> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> if (delta)
> *pchanged_states |= (1 << s);
> }
> +
> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
> + if (only_avgs_work)
> + *pchanged_states &= ~(1 << PSI_NONIDLE);
This seems to be safe because changed_states & (1 << PSI_NONIDLE) is used
only for re-arming psi_avgs_work, however semantically this is
incorrect. The CPU was not idle when it was executing psi_avgs_work.
IMO a separate flag would avoid this confusion.
> }
>
> static void calc_avgs(unsigned long avg[3], int missed_periods,
> --
> 2.37.2
>
Hello,
On 2022/10/11 05:21, Suren Baghdasaryan wrote:
> On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Pavan reported a problem that PSI avgs_work idle shutoff is not
>> working at all. Because PSI_NONIDLE condition would be observed in
>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
>> only the kworker running avgs_work on the CPU.
>>
>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
>> still will always re-arm the avgs_work, so shutoff is not working.
>>
>> This patch changes to consider current CPU groupc as IDLE if the
>> kworker running avgs_work is the only task running and no IOWAIT
>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
>> if other CPUs' groupc are also IDLE.
>>
>> One potential problem is that the brief period of non-idle time
>> incurred between the aggregation run and the kworker's dequeue will
>> be stranded in the per-cpu buckets until avgs_work run next time.
>> The buckets can hold 4s worth of time, and future activity will wake
>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
>> behind when shut off the avgs_work. If the kworker run other works after
>> avgs_work shut off and doesn't have any scheduler activities for 2s,
>> this maybe a problem.
>>
>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>
> Copying my comments from
> https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
> in case you want to continue the discussion here...
>
>> ---
>> kernel/sched/psi.c | 15 +++++++++++++++
>> 1 file changed, 15 insertions(+)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index ee2ecc081422..f4cdf6f184ba 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> u32 *pchanged_states)
>> {
>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>> + int current_cpu = raw_smp_processor_id();
>> + bool only_avgs_work = false;
>> u64 now, state_start;
>> enum psi_states s;
>> unsigned int seq;
>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> memcpy(times, groupc->times, sizeof(groupc->times));
>> state_mask = groupc->state_mask;
>> state_start = groupc->state_start;
>> + /*
>> + * This CPU has only avgs_work kworker running, snapshot the
>> + * newest times then don't need to re-arm for this groupc.
>> + * Normally this kworker will sleep soon and won't wake
>> + * avgs_work back up in psi_group_change().
>> + */
>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>> + only_avgs_work = true;
>
> Why do you determine only_avgs_work while taking a snapshot? The
> read_seqcount_retry() might fail and the loop gets retried, which
> might lead to a wrong only_avgs_work value if the state changes
> between retries. I think it's safer to do this after the snapshot was
> taken and to use tasks[NR_RUNNING] instead of roupc->tasks.
Ah, you are right, copying groupc->tasks[NR_RUNNING], tasks[NR_IOWAIT] and tasks[NR_MEMSTALL]
is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
Another way is to add an else branch:
if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
!groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
only_avgs_work = true;
else
only_avgs_work = false;
Both are ok for me.
>
>> } while (read_seqcount_retry(&groupc->seq, seq));
>>
>> /* Calculate state time deltas against the previous snapshot */
>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> if (delta)
>> *pchanged_states |= (1 << s);
>> }
>> +
>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
>> + if (only_avgs_work)
>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
>
> This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
> only for re-arming psi_avgs_work, however semantically this is
> incorrect. The CPU was not idle when it was executing psi_avgs_work.
> IMO a separate flag would avoid this confusion.
Yes, it's safe, but has this confusion. Using a separate flag looks better, like PSI_ONLY_AVGS_WORK.
But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
has been set.
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
u32 nonidle;
u32 cpu_changed_states;
get_recent_times(group, cpu, aggregator, times,
&cpu_changed_states);
changed_states |= cpu_changed_states;
cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
I'm not sure if it's better to clear here, maybe we can add a comment in get_recent_times()
when do PSI_NONIDLE clear?
Thanks!
On Mon, Oct 10, 2022 at 5:07 PM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Hello,
>
> On 2022/10/11 05:21, Suren Baghdasaryan wrote:
> > On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> Pavan reported a problem that PSI avgs_work idle shutoff is not
> >> working at all. Because PSI_NONIDLE condition would be observed in
> >> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
> >> only the kworker running avgs_work on the CPU.
> >>
> >> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
> >> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
> >> still will always re-arm the avgs_work, so shutoff is not working.
> >>
> >> This patch changes to consider current CPU groupc as IDLE if the
> >> kworker running avgs_work is the only task running and no IOWAIT
> >> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
> >> if other CPUs' groupc are also IDLE.
> >>
> >> One potential problem is that the brief period of non-idle time
> >> incurred between the aggregation run and the kworker's dequeue will
> >> be stranded in the per-cpu buckets until avgs_work run next time.
> >> The buckets can hold 4s worth of time, and future activity will wake
> >> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
> >> behind when shut off the avgs_work. If the kworker run other works after
> >> avgs_work shut off and doesn't have any scheduler activities for 2s,
> >> this maybe a problem.
> >>
> >> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> >
> > Copying my comments from
> > https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
> > in case you want to continue the discussion here...
> >
> >> ---
> >> kernel/sched/psi.c | 15 +++++++++++++++
> >> 1 file changed, 15 insertions(+)
> >>
> >> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> >> index ee2ecc081422..f4cdf6f184ba 100644
> >> --- a/kernel/sched/psi.c
> >> +++ b/kernel/sched/psi.c
> >> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >> u32 *pchanged_states)
> >> {
> >> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> >> + int current_cpu = raw_smp_processor_id();
> >> + bool only_avgs_work = false;
> >> u64 now, state_start;
> >> enum psi_states s;
> >> unsigned int seq;
> >> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >> memcpy(times, groupc->times, sizeof(groupc->times));
> >> state_mask = groupc->state_mask;
> >> state_start = groupc->state_start;
> >> + /*
> >> + * This CPU has only avgs_work kworker running, snapshot the
> >> + * newest times then don't need to re-arm for this groupc.
> >> + * Normally this kworker will sleep soon and won't wake
> >> + * avgs_work back up in psi_group_change().
> >> + */
> >> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> >> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> >> + only_avgs_work = true;
> >
> > Why do you determine only_avgs_work while taking a snapshot? The
> > read_seqcount_retry() might fail and the loop gets retried, which
> > might lead to a wrong only_avgs_work value if the state changes
> > between retries. I think it's safer to do this after the snapshot was
> > taken and to use tasks[NR_RUNNING] instead of roupc->tasks.
>
> Ah, you are right, coping groupc->tasks[NR_RUNNING] and tasks[NR_IOWAIT], tasks[NR_MEMSTALL]
> is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
>
> Another way is to add an else branch:
>
> if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> only_avgs_work = true;
> else
> only_avgs_work = false;
>
> Both are ok for me.
Let's use the simple way and use the tasks[] after the snapshot is taken.
>
> >
> >> } while (read_seqcount_retry(&groupc->seq, seq));
> >>
> >> /* Calculate state time deltas against the previous snapshot */
> >> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >> if (delta)
> >> *pchanged_states |= (1 << s);
> >> }
> >> +
> >> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
> >> + if (only_avgs_work)
> >> + *pchanged_states &= ~(1 << PSI_NONIDLE);
> >
> > This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
> > only for re-arming psi_avgs_work, however semantically this is
> > incorrect. The CPU was not idle when it was executing psi_avgs_work.
> > IMO a separate flag would avoid this confusion.
>
> Yes, it's safe, but has this confusion. Use a separate flag looks better, like PSI_ONLY_AVGS_WORK.
> But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
> has been set.
>
> for_each_possible_cpu(cpu) {
> u32 times[NR_PSI_STATES];
> u32 nonidle;
> u32 cpu_changed_states;
>
> get_recent_times(group, cpu, aggregator, times,
> &cpu_changed_states);
> changed_states |= cpu_changed_states;
>
> cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
No, PSI_NONIDLE should not be affected by PSI_ONLY_AVGS_WORK. These
flags should be independent and aggregated into changed_states without
affecting each other. Something similar to how I suggested with
PSI_STATE_WAKE_CLOCK in
https://lore.kernel.org/all/CAJuCfpFr3JfwkWbDqkU=NUJbCYuCWGySwNusMCdmS3z95WD2AQ@mail.gmail.com/#t.
Thanks,
Suren.
>
> I'm not sure if it's better to clear here, maybe we can add a comment in get_recent_times()
> when do PSI_NONIDLE clear?
>
> Thanks!
>
On 2022/10/12 01:00, Suren Baghdasaryan wrote:
> On Mon, Oct 10, 2022 at 5:07 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Hello,
>>
>> On 2022/10/11 05:21, Suren Baghdasaryan wrote:
>>> On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> Pavan reported a problem that PSI avgs_work idle shutoff is not
>>>> working at all. Because PSI_NONIDLE condition would be observed in
>>>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
>>>> only the kworker running avgs_work on the CPU.
>>>>
>>>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
>>>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
>>>> still will always re-arm the avgs_work, so shutoff is not working.
>>>>
>>>> This patch changes to consider current CPU groupc as IDLE if the
>>>> kworker running avgs_work is the only task running and no IOWAIT
>>>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
>>>> if other CPUs' groupc are also IDLE.
>>>>
>>>> One potential problem is that the brief period of non-idle time
>>>> incurred between the aggregation run and the kworker's dequeue will
>>>> be stranded in the per-cpu buckets until avgs_work run next time.
>>>> The buckets can hold 4s worth of time, and future activity will wake
>>>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
>>>> behind when shut off the avgs_work. If the kworker run other works after
>>>> avgs_work shut off and doesn't have any scheduler activities for 2s,
>>>> this maybe a problem.
>>>>
>>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>
>>> Copying my comments from
>>> https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
>>> in case you want to continue the discussion here...
>>>
>>>> ---
>>>> kernel/sched/psi.c | 15 +++++++++++++++
>>>> 1 file changed, 15 insertions(+)
>>>>
>>>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>>>> index ee2ecc081422..f4cdf6f184ba 100644
>>>> --- a/kernel/sched/psi.c
>>>> +++ b/kernel/sched/psi.c
>>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>> u32 *pchanged_states)
>>>> {
>>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>>>> + int current_cpu = raw_smp_processor_id();
>>>> + bool only_avgs_work = false;
>>>> u64 now, state_start;
>>>> enum psi_states s;
>>>> unsigned int seq;
>>>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>> memcpy(times, groupc->times, sizeof(groupc->times));
>>>> state_mask = groupc->state_mask;
>>>> state_start = groupc->state_start;
>>>> + /*
>>>> + * This CPU has only avgs_work kworker running, snapshot the
>>>> + * newest times then don't need to re-arm for this groupc.
>>>> + * Normally this kworker will sleep soon and won't wake
>>>> + * avgs_work back up in psi_group_change().
>>>> + */
>>>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>>> + only_avgs_work = true;
>>>
>>> Why do you determine only_avgs_work while taking a snapshot? The
>>> read_seqcount_retry() might fail and the loop gets retried, which
>>> might lead to a wrong only_avgs_work value if the state changes
>>> between retries. I think it's safer to do this after the snapshot was
>>> taken and to use tasks[NR_RUNNING] instead of roupc->tasks.
>>
>> Ah, you are right, coping groupc->tasks[NR_RUNNING] and tasks[NR_IOWAIT], tasks[NR_MEMSTALL]
>> is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
>>
>> Another way is to add an else branch:
>>
>> if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>> !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>> only_avgs_work = true;
>> else
>> only_avgs_work = false;
>>
>> Both are ok for me.
>
> Let's use the simple way and use the tasks[] after the snapshot is taken.
Ok, I changed like this:
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (read_seqcount_retry(&groupc->seq, seq));
>
>>
>>>
>>>> } while (read_seqcount_retry(&groupc->seq, seq));
>>>>
>>>> /* Calculate state time deltas against the previous snapshot */
>>>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>> if (delta)
>>>> *pchanged_states |= (1 << s);
>>>> }
>>>> +
>>>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
>>>> + if (only_avgs_work)
>>>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
>>>
>>> This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
>>> only for re-arming psi_avgs_work, however semantically this is
>>> incorrect. The CPU was not idle when it was executing psi_avgs_work.
>>> IMO a separate flag would avoid this confusion.
>>
>> Yes, it's safe, but has this confusion. Use a separate flag looks better, like PSI_ONLY_AVGS_WORK.
>> But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
>> has been set.
>>
>> for_each_possible_cpu(cpu) {
>> u32 times[NR_PSI_STATES];
>> u32 nonidle;
>> u32 cpu_changed_states;
>>
>> get_recent_times(group, cpu, aggregator, times,
>> &cpu_changed_states);
>> changed_states |= cpu_changed_states;
>>
>> cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
>
> No, PSI_NONIDLE should not be affected by PSI_ONLY_AVGS_WORK. These
> flags should be independent and aggregated into changed_states without
> affecting each other. Something similar to how I suggested with
> PSI_STATE_WAKE_CLOCK in
> https://lore.kernel.org/all/CAJuCfpFr3JfwkWbDqkU=NUJbCYuCWGySwNusMCdmS3z95WD2AQ@mail.gmail.com/#t.
If we don't clear PSI_NONIDLE of this CPU, changed_states |= cpu_changed_states;
so changed_states has PSI_NONIDLE, and we won't know if other CPUs are IDLE or not
in psi_avgs_work().
Thanks.
On Tue, Oct 11, 2022 at 7:11 PM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> On 2022/10/12 01:00, Suren Baghdasaryan wrote:
> > On Mon, Oct 10, 2022 at 5:07 PM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> Hello,
> >>
> >> On 2022/10/11 05:21, Suren Baghdasaryan wrote:
> >>> On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
> >>> <zhouchengming@bytedance.com> wrote:
> >>>>
> >>>> Pavan reported a problem that PSI avgs_work idle shutoff is not
> >>>> working at all. Because PSI_NONIDLE condition would be observed in
> >>>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
> >>>> only the kworker running avgs_work on the CPU.
> >>>>
> >>>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
> >>>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
> >>>> still will always re-arm the avgs_work, so shutoff is not working.
> >>>>
> >>>> This patch changes to consider current CPU groupc as IDLE if the
> >>>> kworker running avgs_work is the only task running and no IOWAIT
> >>>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
> >>>> if other CPUs' groupc are also IDLE.
> >>>>
> >>>> One potential problem is that the brief period of non-idle time
> >>>> incurred between the aggregation run and the kworker's dequeue will
> >>>> be stranded in the per-cpu buckets until avgs_work run next time.
> >>>> The buckets can hold 4s worth of time, and future activity will wake
> >>>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
> >>>> behind when shut off the avgs_work. If the kworker run other works after
> >>>> avgs_work shut off and doesn't have any scheduler activities for 2s,
> >>>> this maybe a problem.
> >>>>
> >>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> >>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> >>>
> >>> Copying my comments from
> >>> https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
> >>> in case you want to continue the discussion here...
> >>>
> >>>> ---
> >>>> kernel/sched/psi.c | 15 +++++++++++++++
> >>>> 1 file changed, 15 insertions(+)
> >>>>
> >>>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> >>>> index ee2ecc081422..f4cdf6f184ba 100644
> >>>> --- a/kernel/sched/psi.c
> >>>> +++ b/kernel/sched/psi.c
> >>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >>>> u32 *pchanged_states)
> >>>> {
> >>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> >>>> + int current_cpu = raw_smp_processor_id();
> >>>> + bool only_avgs_work = false;
> >>>> u64 now, state_start;
> >>>> enum psi_states s;
> >>>> unsigned int seq;
> >>>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >>>> memcpy(times, groupc->times, sizeof(groupc->times));
> >>>> state_mask = groupc->state_mask;
> >>>> state_start = groupc->state_start;
> >>>> + /*
> >>>> + * This CPU has only avgs_work kworker running, snapshot the
> >>>> + * newest times then don't need to re-arm for this groupc.
> >>>> + * Normally this kworker will sleep soon and won't wake
> >>>> + * avgs_work back up in psi_group_change().
> >>>> + */
> >>>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> >>>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> >>>> + only_avgs_work = true;
> >>>
> >>> Why do you determine only_avgs_work while taking a snapshot? The
> >>> read_seqcount_retry() might fail and the loop gets retried, which
> >>> might lead to a wrong only_avgs_work value if the state changes
> >>> between retries. I think it's safer to do this after the snapshot was
> >>> taken and to use tasks[NR_RUNNING] instead of roupc->tasks.
> >>
> >> Ah, you are right, coping groupc->tasks[NR_RUNNING] and tasks[NR_IOWAIT], tasks[NR_MEMSTALL]
> >> is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
> >>
> >> Another way is to add an else branch:
> >>
> >> if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> >> !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> >> only_avgs_work = true;
> >> else
> >> only_avgs_work = false;
> >>
> >> Both are ok for me.
> >
> > Let's use the simple way and use the tasks[] after the snapshot is taken.
>
> Ok, I changed like this:
>
> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> + int current_cpu = raw_smp_processor_id();
> + unsigned int tasks[NR_PSI_TASK_COUNTS];
> u64 now, state_start;
> enum psi_states s;
> unsigned int seq;
> @@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + if (cpu == current_cpu)
> + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
> } while (read_seqcount_retry(&groupc->seq, seq));
>
> >
> >>
> >>>
> >>>> } while (read_seqcount_retry(&groupc->seq, seq));
> >>>>
> >>>> /* Calculate state time deltas against the previous snapshot */
> >>>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> >>>> if (delta)
> >>>> *pchanged_states |= (1 << s);
> >>>> }
> >>>> +
> >>>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
> >>>> + if (only_avgs_work)
> >>>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
> >>>
> >>> This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
> >>> only for re-arming psi_avgs_work, however semantically this is
> >>> incorrect. The CPU was not idle when it was executing psi_avgs_work.
> >>> IMO a separate flag would avoid this confusion.
> >>
> >> Yes, it's safe, but has this confusion. Use a separate flag looks better, like PSI_ONLY_AVGS_WORK.
> >> But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
> >> has been set.
> >>
> >> for_each_possible_cpu(cpu) {
> >> u32 times[NR_PSI_STATES];
> >> u32 nonidle;
> >> u32 cpu_changed_states;
> >>
> >> get_recent_times(group, cpu, aggregator, times,
> >> &cpu_changed_states);
> >> changed_states |= cpu_changed_states;
> >>
> >> cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
> >
> > No, PSI_NONIDLE should not be affected by PSI_ONLY_AVGS_WORK. These
> > flags should be independent and aggregated into changed_states without
> > affecting each other. Something similar to how I suggested with
> > PSI_STATE_WAKE_CLOCK in
> > https://lore.kernel.org/all/CAJuCfpFr3JfwkWbDqkU=NUJbCYuCWGySwNusMCdmS3z95WD2AQ@mail.gmail.com/#t.
>
> If we don't clear PSI_NONIDLE of this CPU, changed_states |= cpu_changed_states;
> so changed_states has PSI_NONIDLE, and we won't know if other CPUs are IDLE or not
> in psi_avgs_work().
I was thinking something like this:
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -244,6 +244,7 @@ static void get_recent_times(struct psi_group
*group, int cpu,
enum psi_states s;
unsigned int seq;
u32 state_mask;
+ bool reschedule;
*pchanged_states = 0;
@@ -254,6 +255,14 @@ static void get_recent_times(struct psi_group
*group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (current_cpu == cpu)
+ reschedule = groupc->tasks[NR_RUNNING] +
+ groupc->tasks[NR_IOWAIT] +
+ groupc->tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = times[PSI_NONIDLE] >
+ groupc->times_prev[aggregator][PSI_NONIDLE];
+
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -278,6 +287,8 @@ static void get_recent_times(struct psi_group
*group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+ if (reschedule)
+ *pchanged_states |= (1 << PSI_STATE_RESCHEDULE);
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -413,7 +424,7 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool nonidle;
+ bool reschedule;
u64 now;
dwork = to_delayed_work(work);
@@ -424,7 +435,7 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
collect_percpu_times(group, PSI_AVGS, &changed_states);
- nonidle = changed_states & (1 << PSI_NONIDLE);
+ reschedule = changed_states & (1 << PSI_STATE_RESCHEDULE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -435,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
if (now >= group->avg_next_update)
group->avg_next_update = update_averages(group, now);
- if (nonidle) {
+ if (reschedule) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
group->avg_next_update - now) + 1);
}
Does that address your concern?
>
> Thanks.
On 2022/10/13 02:24, Suren Baghdasaryan wrote:
> On Tue, Oct 11, 2022 at 7:11 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> On 2022/10/12 01:00, Suren Baghdasaryan wrote:
>>> On Mon, Oct 10, 2022 at 5:07 PM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> Hello,
>>>>
>>>> On 2022/10/11 05:21, Suren Baghdasaryan wrote:
>>>>> On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
>>>>> <zhouchengming@bytedance.com> wrote:
>>>>>>
>>>>>> Pavan reported a problem that PSI avgs_work idle shutoff is not
>>>>>> working at all. Because PSI_NONIDLE condition would be observed in
>>>>>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
>>>>>> only the kworker running avgs_work on the CPU.
>>>>>>
>>>>>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
>>>>>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
>>>>>> still will always re-arm the avgs_work, so shutoff is not working.
>>>>>>
>>>>>> This patch changes to consider current CPU groupc as IDLE if the
>>>>>> kworker running avgs_work is the only task running and no IOWAIT
>>>>>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
>>>>>> if other CPUs' groupc are also IDLE.
>>>>>>
>>>>>> One potential problem is that the brief period of non-idle time
>>>>>> incurred between the aggregation run and the kworker's dequeue will
>>>>>> be stranded in the per-cpu buckets until avgs_work run next time.
>>>>>> The buckets can hold 4s worth of time, and future activity will wake
>>>>>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
>>>>>> behind when shut off the avgs_work. If the kworker run other works after
>>>>>> avgs_work shut off and doesn't have any scheduler activities for 2s,
>>>>>> this may be a problem.
>>>>>>
>>>>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>>>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>>>
>>>>> Copying my comments from
>>>>> https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
>>>>> in case you want to continue the discussion here...
>>>>>
>>>>>> ---
>>>>>> kernel/sched/psi.c | 15 +++++++++++++++
>>>>>> 1 file changed, 15 insertions(+)
>>>>>>
>>>>>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>>>>>> index ee2ecc081422..f4cdf6f184ba 100644
>>>>>> --- a/kernel/sched/psi.c
>>>>>> +++ b/kernel/sched/psi.c
>>>>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> u32 *pchanged_states)
>>>>>> {
>>>>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>>>>>> + int current_cpu = raw_smp_processor_id();
>>>>>> + bool only_avgs_work = false;
>>>>>> u64 now, state_start;
>>>>>> enum psi_states s;
>>>>>> unsigned int seq;
>>>>>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> memcpy(times, groupc->times, sizeof(groupc->times));
>>>>>> state_mask = groupc->state_mask;
>>>>>> state_start = groupc->state_start;
>>>>>> + /*
>>>>>> + * This CPU has only avgs_work kworker running, snapshot the
>>>>>> + * newest times then don't need to re-arm for this groupc.
>>>>>> + * Normally this kworker will sleep soon and won't wake
>>>>>> + * avgs_work back up in psi_group_change().
>>>>>> + */
>>>>>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>>>>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>>>>> + only_avgs_work = true;
>>>>>
>>>>> Why do you determine only_avgs_work while taking a snapshot? The
>>>>> read_seqcount_retry() might fail and the loop gets retried, which
>>>>> might lead to a wrong only_avgs_work value if the state changes
>>>>> between retries. I think it's safer to do this after the snapshot was
>>>>> taken and to use tasks[NR_RUNNING] instead of groupc->tasks.
>>>>
>>>> Ah, you are right, copying groupc->tasks[NR_RUNNING] and tasks[NR_IOWAIT], tasks[NR_MEMSTALL]
>>>> is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
>>>>
>>>> Another way is to add an else branch:
>>>>
>>>> if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>>> !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>>> only_avgs_work = true;
>>>> else
>>>> only_avgs_work = false;
>>>>
>>>> Both are ok for me.
>>>
>>> Let's use the simple way and use the tasks[] after the snapshot is taken.
>>
>> Ok, I changed like this:
>>
>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>> + int current_cpu = raw_smp_processor_id();
>> + unsigned int tasks[NR_PSI_TASK_COUNTS];
>> u64 now, state_start;
>> enum psi_states s;
>> unsigned int seq;
>> @@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> memcpy(times, groupc->times, sizeof(groupc->times));
>> state_mask = groupc->state_mask;
>> state_start = groupc->state_start;
>> + if (cpu == current_cpu)
>> + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
>> } while (read_seqcount_retry(&groupc->seq, seq));
>>
>>>
>>>>
>>>>>
>>>>>> } while (read_seqcount_retry(&groupc->seq, seq));
>>>>>>
>>>>>> /* Calculate state time deltas against the previous snapshot */
>>>>>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> if (delta)
>>>>>> *pchanged_states |= (1 << s);
>>>>>> }
>>>>>> +
>>>>>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
>>>>>> + if (only_avgs_work)
>>>>>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
>>>>>
>>>>> This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
>>>>> only for re-arming psi_avgs_work, however semantically this is
>>>>> incorrect. The CPU was not idle when it was executing psi_avgs_work.
>>>>> IMO a separate flag would avoid this confusion.
>>>>
>>>> Yes, it's safe, but has this confusion. Use a separate flag looks better, like PSI_ONLY_AVGS_WORK.
>>>> But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
>>>> has been set.
>>>>
>>>> for_each_possible_cpu(cpu) {
>>>> u32 times[NR_PSI_STATES];
>>>> u32 nonidle;
>>>> u32 cpu_changed_states;
>>>>
>>>> get_recent_times(group, cpu, aggregator, times,
>>>> &cpu_changed_states);
>>>> changed_states |= cpu_changed_states;
>>>>
>>>> cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
>>>
>>> No, PSI_NONIDLE should not be affected by PSI_ONLY_AVGS_WORK. These
>>> flags should be independent and aggregated into changed_states without
>>> affecting each other. Something similar to how I suggested with
>>> PSI_STATE_WAKE_CLOCK in
>>> https://lore.kernel.org/all/CAJuCfpFr3JfwkWbDqkU=NUJbCYuCWGySwNusMCdmS3z95WD2AQ@mail.gmail.com/#t.
>>
>> If we don't clear PSI_NONIDLE of this CPU, changed_states |= cpu_changed_states;
>> so changed_states has PSI_NONIDLE, and we won't know if other CPUs are IDLE or not
>> in psi_avgs_work().
>
> I was thinking something like this:
>
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -244,6 +244,7 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> enum psi_states s;
> unsigned int seq;
> u32 state_mask;
> + bool reschedule;
>
> *pchanged_states = 0;
>
> @@ -254,6 +255,14 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + if (current_cpu == cpu)
> + reschedule = groupc->tasks[NR_RUNNING] +
> + groupc->tasks[NR_IOWAIT] +
> + groupc->tasks[NR_MEMSTALL] > 1;
> + else
> + reschedule = times[PSI_NONIDLE] >
> + groupc->times_prev[aggregator][PSI_NONIDLE];
> +
> } while (read_seqcount_retry(&groupc->seq, seq));
We can't use times[] here because it will incorporate currently active states
on the CPU.
Should I still need to copy groupc->tasks[] out for the current_cpu as you
suggested before?
>
> /* Calculate state time deltas against the previous snapshot */
> @@ -278,6 +287,8 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> if (delta)
> *pchanged_states |= (1 << s);
> }
> + if (reschedule)
> + *pchanged_states |= PSI_STATE_RESCHEDULE;
Is this ok?
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 4bd3913dd176..0cbcbf89a934 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ bool reschedule;
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -256,6 +258,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ reschedule = groupc->tasks[NR_RUNNING] +
+ groupc->tasks[NR_IOWAIT] +
+ groupc->tasks[NR_MEMSTALL] > 1;
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +286,12 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ if (cpu != current_cpu)
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= (1 << PSI_STATE_RESCHEDULE);
}
On Thu, Oct 13, 2022 at 07:06:55PM +0800, Chengming Zhou wrote:
> Should I still need to copy groupc->tasks[] out for the current_cpu as you
> suggested before?
It'd be my preference as well. This way the resched logic can be
consolidated into a single block of comment + code at the end of the
function.
> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> u32 *pchanged_states)
> {
> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> + int current_cpu = raw_smp_processor_id();
> + bool reschedule;
> u64 now, state_start;
> enum psi_states s;
> unsigned int seq;
> @@ -256,6 +258,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + if (cpu == current_cpu)
> + reschedule = groupc->tasks[NR_RUNNING] +
> + groupc->tasks[NR_IOWAIT] +
> + groupc->tasks[NR_MEMSTALL] > 1;
> } while (read_seqcount_retry(&groupc->seq, seq));
This also matches psi_show() and the poll worker. They don't currently
use the flag, but it's somewhat fragile and confusing. Add a test for
current_work() == &group->avgs_work?
On 2022/10/13 23:52, Johannes Weiner wrote:
> On Thu, Oct 13, 2022 at 07:06:55PM +0800, Chengming Zhou wrote:
>> Should I still need to copy groupc->tasks[] out for the current_cpu as you
>> suggested before?
>
> It'd be my preference as well. This way the resched logic can be
> consolidated into a single block of comment + code at the end of the
> function.
Ok, will move these resched logic to the end of get_recent_times().
>
>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> u32 *pchanged_states)
>> {
>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>> + int current_cpu = raw_smp_processor_id();
>> + bool reschedule;
>> u64 now, state_start;
>> enum psi_states s;
>> unsigned int seq;
>> @@ -256,6 +258,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> memcpy(times, groupc->times, sizeof(groupc->times));
>> state_mask = groupc->state_mask;
>> state_start = groupc->state_start;
>> + if (cpu == current_cpu)
>> + reschedule = groupc->tasks[NR_RUNNING] +
>> + groupc->tasks[NR_IOWAIT] +
>> + groupc->tasks[NR_MEMSTALL] > 1;
>> } while (read_seqcount_retry(&groupc->seq, seq));
>
> This also matches psi_show() and the poll worker. They don't currently
> use the flag, but it's somewhat fragile and confusing. Add a test for
> current_work() == &group->avgs_work?
Yes, only psi_avgs_work() use this to re-arm now, I will add this check next version.
Thanks.
On Thu, Oct 13, 2022 at 8:52 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Oct 13, 2022 at 07:06:55PM +0800, Chengming Zhou wrote:
> > Should I still need to copy groupc->tasks[] out for the current_cpu as you
> > suggested before?
>
> It'd be my preference as well. This way the resched logic can be
> consolidated into a single block of comment + code at the end of the
> function.
Sounds good to me. If we are copying times in the retry loop then
let's move the `reschedule =` decision out of that loop completely. At
the end of get_recent_times we can do:
if (cpu == current_cpu)
reschedule = tasks[NR_RUNNING] +
tasks[NR_IOWAIT] +
tasks[NR_MEMSTALL] > 1;
else
reschedule = *pchanged_states & (1 << PSI_NONIDLE);
>
> > @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> > u32 *pchanged_states)
> > {
> > struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> > + int current_cpu = raw_smp_processor_id();
> > + bool reschedule;
> > u64 now, state_start;
> > enum psi_states s;
> > unsigned int seq;
> > @@ -256,6 +258,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> > memcpy(times, groupc->times, sizeof(groupc->times));
> > state_mask = groupc->state_mask;
> > state_start = groupc->state_start;
> > + if (cpu == current_cpu)
> > + reschedule = groupc->tasks[NR_RUNNING] +
> > + groupc->tasks[NR_IOWAIT] +
> > + groupc->tasks[NR_MEMSTALL] > 1;
> > } while (read_seqcount_retry(&groupc->seq, seq));
>
> This also matches psi_show() and the poll worker. They don't currently
> use the flag, but it's somewhat fragile and confusing. Add a test for
> current_work() == &group->avgs_work?
Good point. (tasks[NR_RUNNING] + tasks[NR_IOWAIT] + tasks[NR_MEMSTALL]
> 1) condition should also contain this check.
On 2022/10/14 00:10, Suren Baghdasaryan wrote:
> On Thu, Oct 13, 2022 at 8:52 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>>
>> On Thu, Oct 13, 2022 at 07:06:55PM +0800, Chengming Zhou wrote:
>>> Should I still need to copy groupc->tasks[] out for the current_cpu as you
>>> suggested before?
>>
>> It'd be my preference as well. This way the resched logic can be
>> consolidated into a single block of comment + code at the end of the
>> function.
>
> Sounds good to me. If we are copying times in the retry loop then
> let's move the `reschedule =` decision out of that loop completely. At
> the end of get_recent_times we can do:
>
> if (cpu == current_cpu)
> reschedule = tasks[NR_RUNNING] +
> tasks[NR_IOWAIT] +
> tasks[NR_MEMSTALL] > 1;
> else
> reschedule = *pchanged_states & (1 << PSI_NONIDLE);
>
Ok, I will send an updated patch later.
Thanks!
>
>>
>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>> u32 *pchanged_states)
>>> {
>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>>> + int current_cpu = raw_smp_processor_id();
>>> + bool reschedule;
>>> u64 now, state_start;
>>> enum psi_states s;
>>> unsigned int seq;
>>> @@ -256,6 +258,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>> memcpy(times, groupc->times, sizeof(groupc->times));
>>> state_mask = groupc->state_mask;
>>> state_start = groupc->state_start;
>>> + if (cpu == current_cpu)
>>> + reschedule = groupc->tasks[NR_RUNNING] +
>>> + groupc->tasks[NR_IOWAIT] +
>>> + groupc->tasks[NR_MEMSTALL] > 1;
>>> } while (read_seqcount_retry(&groupc->seq, seq));
>>
>> This also matches psi_show() and the poll worker. They don't currently
>> use the flag, but it's somewhat fragile and confusing. Add a test for
>> current_work() == &group->avgs_work?
>
> Good point. (tasks[NR_RUNNING] + tasks[NR_IOWAIT] + tasks[NR_MEMSTALL]
>> 1) condition should also contain this check.
On 2022/10/13 02:24, Suren Baghdasaryan wrote:
> On Tue, Oct 11, 2022 at 7:11 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> On 2022/10/12 01:00, Suren Baghdasaryan wrote:
>>> On Mon, Oct 10, 2022 at 5:07 PM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> Hello,
>>>>
>>>> On 2022/10/11 05:21, Suren Baghdasaryan wrote:
>>>>> On Mon, Oct 10, 2022 at 3:42 AM Chengming Zhou
>>>>> <zhouchengming@bytedance.com> wrote:
>>>>>>
>>>>>> Pavan reported a problem that PSI avgs_work idle shutoff is not
>>>>>> working at all. Because PSI_NONIDLE condition would be observed in
>>>>>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
>>>>>> only the kworker running avgs_work on the CPU.
>>>>>>
>>>>>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
>>>>>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
>>>>>> still will always re-arm the avgs_work, so shutoff is not working.
>>>>>>
>>>>>> This patch changes to consider current CPU groupc as IDLE if the
>>>>>> kworker running avgs_work is the only task running and no IOWAIT
>>>>>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
>>>>>> if other CPUs' groupc are also IDLE.
>>>>>>
>>>>>> One potential problem is that the brief period of non-idle time
>>>>>> incurred between the aggregation run and the kworker's dequeue will
>>>>>> be stranded in the per-cpu buckets until avgs_work run next time.
>>>>>> The buckets can hold 4s worth of time, and future activity will wake
>>>>>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
>>>>>> behind when shut off the avgs_work. If the kworker run other works after
>>>>>> avgs_work shut off and doesn't have any scheduler activities for 2s,
>>>>>> this may be a problem.
>>>>>>
>>>>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>>>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>>>
>>>>> Copying my comments from
>>>>> https://lore.kernel.org/all/CAJuCfpHyYMak-mfVmtEN9Z-hGYQ6Wko57TLjukz9HaN26EDAuA@mail.gmail.com/
>>>>> in case you want to continue the discussion here...
>>>>>
>>>>>> ---
>>>>>> kernel/sched/psi.c | 15 +++++++++++++++
>>>>>> 1 file changed, 15 insertions(+)
>>>>>>
>>>>>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>>>>>> index ee2ecc081422..f4cdf6f184ba 100644
>>>>>> --- a/kernel/sched/psi.c
>>>>>> +++ b/kernel/sched/psi.c
>>>>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> u32 *pchanged_states)
>>>>>> {
>>>>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>>>>>> + int current_cpu = raw_smp_processor_id();
>>>>>> + bool only_avgs_work = false;
>>>>>> u64 now, state_start;
>>>>>> enum psi_states s;
>>>>>> unsigned int seq;
>>>>>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> memcpy(times, groupc->times, sizeof(groupc->times));
>>>>>> state_mask = groupc->state_mask;
>>>>>> state_start = groupc->state_start;
>>>>>> + /*
>>>>>> + * This CPU has only avgs_work kworker running, snapshot the
>>>>>> + * newest times then don't need to re-arm for this groupc.
>>>>>> + * Normally this kworker will sleep soon and won't wake
>>>>>> + * avgs_work back up in psi_group_change().
>>>>>> + */
>>>>>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>>>>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>>>>> + only_avgs_work = true;
>>>>>
>>>>> Why do you determine only_avgs_work while taking a snapshot? The
>>>>> read_seqcount_retry() might fail and the loop gets retried, which
>>>>> might lead to a wrong only_avgs_work value if the state changes
>>>>> between retries. I think it's safer to do this after the snapshot was
>>>>> taken and to use tasks[NR_RUNNING] instead of groupc->tasks.
>>>>
>>>> Ah, you are right, copying groupc->tasks[NR_RUNNING] and tasks[NR_IOWAIT], tasks[NR_MEMSTALL]
>>>> is ok for me. (Maybe we only need to copy groupc->tasks[NR_RUNNING]?)
>>>>
>>>> Another way is to add an else branch:
>>>>
>>>> if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>>> !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>>> only_avgs_work = true;
>>>> else
>>>> only_avgs_work = false;
>>>>
>>>> Both are ok for me.
>>>
>>> Let's use the simple way and use the tasks[] after the snapshot is taken.
>>
>> Ok, I changed like this:
>>
>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>> + int current_cpu = raw_smp_processor_id();
>> + unsigned int tasks[NR_PSI_TASK_COUNTS];
>> u64 now, state_start;
>> enum psi_states s;
>> unsigned int seq;
>> @@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>> memcpy(times, groupc->times, sizeof(groupc->times));
>> state_mask = groupc->state_mask;
>> state_start = groupc->state_start;
>> + if (cpu == current_cpu)
>> + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
>> } while (read_seqcount_retry(&groupc->seq, seq));
>>
>>>
>>>>
>>>>>
>>>>>> } while (read_seqcount_retry(&groupc->seq, seq));
>>>>>>
>>>>>> /* Calculate state time deltas against the previous snapshot */
>>>>>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>>>>> if (delta)
>>>>>> *pchanged_states |= (1 << s);
>>>>>> }
>>>>>> +
>>>>>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
>>>>>> + if (only_avgs_work)
>>>>>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
>>>>>
>>>>> This seems to be safe because changed_states&(1<< PSI_NONIDLE) is used
>>>>> only for re-arming psi_avgs_work, however semantically this is
>>>>> incorrect. The CPU was not idle when it was executing psi_avgs_work.
>>>>> IMO a separate flag would avoid this confusion.
>>>>
>>>> Yes, it's safe, but has this confusion. Use a separate flag looks better, like PSI_ONLY_AVGS_WORK.
>>>> But then in collect_percpu_times() we still have to clear PSI_NONIDLE of this CPU if PSI_ONLY_AVGS_WORK
>>>> has been set.
>>>>
>>>> for_each_possible_cpu(cpu) {
>>>> u32 times[NR_PSI_STATES];
>>>> u32 nonidle;
>>>> u32 cpu_changed_states;
>>>>
>>>> get_recent_times(group, cpu, aggregator, times,
>>>> &cpu_changed_states);
>>>> changed_states |= cpu_changed_states;
>>>>
>>>> cpu_changed_states should clear PSI_NONIDLE if PSI_ONLY_AVGS_WORK already set.
>>>
>>> No, PSI_NONIDLE should not be affected by PSI_ONLY_AVGS_WORK. These
>>> flags should be independent and aggregated into changed_states without
>>> affecting each other. Something similar to how I suggested with
>>> PSI_STATE_WAKE_CLOCK in
>>> https://lore.kernel.org/all/CAJuCfpFr3JfwkWbDqkU=NUJbCYuCWGySwNusMCdmS3z95WD2AQ@mail.gmail.com/#t.
>>
>> If we don't clear PSI_NONIDLE of this CPU, changed_states |= cpu_changed_states;
>> so changed_states has PSI_NONIDLE, and we won't know if other CPUs are IDLE or not
>> in psi_avgs_work().
>
> I was thinking something like this:
>
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -244,6 +244,7 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> enum psi_states s;
> unsigned int seq;
> u32 state_mask;
> + bool reschedule;
>
> *pchanged_states = 0;
>
> @@ -254,6 +255,14 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + if (current_cpu == cpu)
> + reschedule = groupc->tasks[NR_RUNNING] +
> + groupc->tasks[NR_IOWAIT] +
> + groupc->tasks[NR_MEMSTALL] > 1;
> + else
> + reschedule = times[PSI_NONIDLE] >
> + groupc->times_prev[aggregator][PSI_NONIDLE];
> +
> } while (read_seqcount_retry(&groupc->seq, seq));
Ok, I get what you mean: for other CPUs, reschedule == PSI_NONIDLE,
for this current CPU, reschedule is calculated from groupc->tasks[].
Then we use PSI_STATE_RESCHEDULE in psi_avgs_work(), don't use PSI_NONIDLE,
so PSI_NONIDLE become useless in changed_states. Right?
Well, I'm not sure this is easier than just using PSI_NONIDLE. But anyway,
I will try this way in the next revision.
Thanks.
>
> /* Calculate state time deltas against the previous snapshot */
> @@ -278,6 +287,8 @@ static void get_recent_times(struct psi_group
> *group, int cpu,
> if (delta)
> *pchanged_states |= (1 << s);
> }
> + if (reschedule)
> + *pchanged_states |= PSI_STATE_RESCHEDULE;
> }
>
> static void calc_avgs(unsigned long avg[3], int missed_periods,
> @@ -413,7 +424,7 @@ static void psi_avgs_work(struct work_struct *work)
> struct delayed_work *dwork;
> struct psi_group *group;
> u32 changed_states;
> - bool nonidle;
> + bool reschedule;
> u64 now;
>
> dwork = to_delayed_work(work);
> @@ -424,7 +435,7 @@ static void psi_avgs_work(struct work_struct *work)
> now = sched_clock();
>
> collect_percpu_times(group, PSI_AVGS, &changed_states);
> - nonidle = changed_states & (1 << PSI_NONIDLE);
> + reschedule = changed_states & (1 << PSI_STATE_RESCHEDULE);
> /*
> * If there is task activity, periodically fold the per-cpu
> * times and feed samples into the running averages. If things
> @@ -435,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
> if (now >= group->avg_next_update)
> group->avg_next_update = update_averages(group, now);
>
> - if (nonidle) {
> + if (reschedule) {
> schedule_delayed_work(dwork, nsecs_to_jiffies(
> group->avg_next_update - now) + 1);
> }
>
> Does that address your concern?
>
>>
>> Thanks.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 7d89d7bb921c5ae5a428df282e64ee5692e26fe0
Gitweb: https://git.kernel.org/tip/7d89d7bb921c5ae5a428df282e64ee5692e26fe0
Author: Chengming Zhou <zhouchengming@bytedance.com>
AuthorDate: Mon, 10 Oct 2022 18:42:06 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 27 Oct 2022 11:01:23 +02:00
sched/psi: Fix avgs_work re-arm in psi_avgs_work()
Pavan reported a problem that PSI avgs_work idle shutoff is not
working at all. Because PSI_NONIDLE condition would be observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
only the kworker running avgs_work on the CPU.
Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
still will always re-arm the avgs_work, so shutoff is not working.
This patch changes to consider current CPU groupc as IDLE if the
kworker running avgs_work is the only task running and no IOWAIT
or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
if other CPUs' groupc are also IDLE.
One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will
be stranded in the per-cpu buckets until avgs_work run next time.
The buckets can hold 4s worth of time, and future activity will wake
the avgs_work with a 2s delay, giving us 2s worth of data we can leave
behind when shut off the avgs_work. If the kworker run other works after
avgs_work shut off and doesn't have any scheduler activities for 2s,
this may be a problem.
Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221010104206.12184-1-zhouchengming@bytedance.com
---
kernel/sched/psi.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee2ecc0..f4cdf6f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ bool only_avgs_work = false;
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ /*
+ * This CPU has only avgs_work kworker running, snapshot the
+ * newest times then don't need to re-arm for this groupc.
+ * Normally this kworker will sleep soon and won't wake
+ * avgs_work back up in psi_group_change().
+ */
+ if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
+ !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
+ only_avgs_work = true;
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
+ if (only_avgs_work)
+ *pchanged_states &= ~(1 << PSI_NONIDLE);
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
Hello,
Thanks for picking this up. There is a newer version which has been acked:
https://lore.kernel.org/all/20221014110551.22695-1-zhouchengming@bytedance.com/
As well another PSI patch that has been acked by Johannes:
https://lore.kernel.org/all/20220926081931.45420-1-zhouchengming@bytedance.com/
Thanks!
On 2022/10/28 14:42, tip-bot2 for Chengming Zhou wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: 7d89d7bb921c5ae5a428df282e64ee5692e26fe0
> Gitweb: https://git.kernel.org/tip/7d89d7bb921c5ae5a428df282e64ee5692e26fe0
> Author: Chengming Zhou <zhouchengming@bytedance.com>
> AuthorDate: Mon, 10 Oct 2022 18:42:06 +08:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Thu, 27 Oct 2022 11:01:23 +02:00
>
> sched/psi: Fix avgs_work re-arm in psi_avgs_work()
>
> Pavan reported a problem that PSI avgs_work idle shutoff is not
> working at all. Because PSI_NONIDLE condition would be observed in
> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
> only the kworker running avgs_work on the CPU.
>
> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
> still will always re-arm the avgs_work, so shutoff is not working.
>
> This patch changes to consider current CPU groupc as IDLE if the
> kworker running avgs_work is the only task running and no IOWAIT
> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
> if other CPUs' groupc are also IDLE.
>
> One potential problem is that the brief period of non-idle time
> incurred between the aggregation run and the kworker's dequeue will
> be stranded in the per-cpu buckets until avgs_work run next time.
> The buckets can hold 4s worth of time, and future activity will wake
> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
> behind when shut off the avgs_work. If the kworker run other works after
> avgs_work shut off and doesn't have any scheduler activities for 2s,
> this maybe a problem.
>
> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lore.kernel.org/r/20221010104206.12184-1-zhouchengming@bytedance.com
> ---
> kernel/sched/psi.c | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index ee2ecc0..f4cdf6f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> u32 *pchanged_states)
> {
> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> + int current_cpu = raw_smp_processor_id();
> + bool only_avgs_work = false;
> u64 now, state_start;
> enum psi_states s;
> unsigned int seq;
> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> + /*
> + * This CPU has only avgs_work kworker running, snapshot the
> + * newest times then don't need to re-arm for this groupc.
> + * Normally this kworker will sleep soon and won't wake
> + * avgs_work back up in psi_group_change().
> + */
> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> + only_avgs_work = true;
> } while (read_seqcount_retry(&groupc->seq, seq));
>
> /* Calculate state time deltas against the previous snapshot */
> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> if (delta)
> *pchanged_states |= (1 << s);
> }
> +
> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
> + if (only_avgs_work)
> + *pchanged_states &= ~(1 << PSI_NONIDLE);
> }
>
> static void calc_avgs(unsigned long avg[3], int missed_periods,
On Thu, Oct 27, 2022 at 11:50 PM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Hello,
>
> Thanks for picking this up. There is a newer version which has been acked:
> https://lore.kernel.org/all/20221014110551.22695-1-zhouchengming@bytedance.com/
Hmm. Indeed this seems to be an older version and not the one I asked
Peter to pick up in
https://lore.kernel.org/all/CAJuCfpHeJuZBbv-q+WXjgNHwt_caMomFPL3L9rxosXOrZz3fBw@mail.gmail.com/.
Not sure what went wrong. Peter, could you please replace this one
with https://lore.kernel.org/all/20221014110551.22695-1-zhouchengming@bytedance.com/?
Chengming, please do not top-post next time. Would be better if you
posted your note under the "Link:" field in this email.
Thanks!
>
> As well another PSI patch that has been acked by Johannes:
> https://lore.kernel.org/all/20220926081931.45420-1-zhouchengming@bytedance.com/
>
> Thanks!
>
>
> On 2022/10/28 14:42, tip-bot2 for Chengming Zhou wrote:
> > The following commit has been merged into the sched/core branch of tip:
> >
> > Commit-ID: 7d89d7bb921c5ae5a428df282e64ee5692e26fe0
> > Gitweb: https://git.kernel.org/tip/7d89d7bb921c5ae5a428df282e64ee5692e26fe0
> > Author: Chengming Zhou <zhouchengming@bytedance.com>
> > AuthorDate: Mon, 10 Oct 2022 18:42:06 +08:00
> > Committer: Peter Zijlstra <peterz@infradead.org>
> > CommitterDate: Thu, 27 Oct 2022 11:01:23 +02:00
> >
> > sched/psi: Fix avgs_work re-arm in psi_avgs_work()
> >
> > Pavan reported a problem that PSI avgs_work idle shutoff is not
> > working at all. Because PSI_NONIDLE condition would be observed in
> > psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
> > only the kworker running avgs_work on the CPU.
> >
> > Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
> > avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
> > still will always re-arm the avgs_work, so shutoff is not working.
> >
> > This patch changes to consider current CPU groupc as IDLE if the
> > kworker running avgs_work is the only task running and no IOWAIT
> > or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
> > if other CPUs' groupc are also IDLE.
> >
> > One potential problem is that the brief period of non-idle time
> > incurred between the aggregation run and the kworker's dequeue will
> > be stranded in the per-cpu buckets until avgs_work run next time.
> > The buckets can hold 4s worth of time, and future activity will wake
> > the avgs_work with a 2s delay, giving us 2s worth of data we can leave
> > behind when shut off the avgs_work. If the kworker run other works after
> > avgs_work shut off and doesn't have any scheduler activities for 2s,
> > this maybe a problem.
> >
> > Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> > Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Link: https://lore.kernel.org/r/20221010104206.12184-1-zhouchengming@bytedance.com
> > ---
> > kernel/sched/psi.c | 15 +++++++++++++++
> > 1 file changed, 15 insertions(+)
> >
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index ee2ecc0..f4cdf6f 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
> > u32 *pchanged_states)
> > {
> > struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> > + int current_cpu = raw_smp_processor_id();
> > + bool only_avgs_work = false;
> > u64 now, state_start;
> > enum psi_states s;
> > unsigned int seq;
> > @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
> > memcpy(times, groupc->times, sizeof(groupc->times));
> > state_mask = groupc->state_mask;
> > state_start = groupc->state_start;
> > + /*
> > + * This CPU has only avgs_work kworker running, snapshot the
> > + * newest times then don't need to re-arm for this groupc.
> > + * Normally this kworker will sleep soon and won't wake
> > + * avgs_work back up in psi_group_change().
> > + */
> > + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
> > + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
> > + only_avgs_work = true;
> > } while (read_seqcount_retry(&groupc->seq, seq));
> >
> > /* Calculate state time deltas against the previous snapshot */
> > @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
> > if (delta)
> > *pchanged_states |= (1 << s);
> > }
> > +
> > + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
> > + if (only_avgs_work)
> > + *pchanged_states &= ~(1 << PSI_NONIDLE);
> > }
> >
> > static void calc_avgs(unsigned long avg[3], int missed_periods,
On Fri, Oct 28, 2022 at 08:58:03AM -0700, Suren Baghdasaryan wrote: > Not sure what went wrong. Peter, could you please replace this one Probably me being an idiot and searching on subject instead of msgid :/ I'll go fix up -- tomorrow though, it's late and I'm likely to mess it up again.
On Fri, Oct 28, 2022 at 09:53:57PM +0200, Peter Zijlstra wrote: > On Fri, Oct 28, 2022 at 08:58:03AM -0700, Suren Baghdasaryan wrote: > > > Not sure what went wrong. Peter, could you please replace this one > > Probably me being an idiot and searching on subject instead of msgid :/ > > I'll go fix up -- tomorrow though, it's late and I'm likely to mess it > up again. Can you please check queue.git/sched/core ; did I get it right this time?
On 2022/10/29 19:55, Peter Zijlstra wrote: > On Fri, Oct 28, 2022 at 09:53:57PM +0200, Peter Zijlstra wrote: >> On Fri, Oct 28, 2022 at 08:58:03AM -0700, Suren Baghdasaryan wrote: >> >>> Not sure what went wrong. Peter, could you please replace this one >> >> Probably me being an idiot and searching on subject instead of msgid :/ >> >> I'll go fix up -- tomorrow though, it's late and I'm likely to mess it >> up again. > > Can you please check queue.git/sched/core ; did I get it right this > time? I just checked that three patches, LGTM. And would you mind picking up this, by the way? https://lore.kernel.org/all/20220926081931.45420-1-zhouchengming@bytedance.com/ Thanks!
On Sat, Oct 29, 2022 at 5:42 AM Chengming Zhou <zhouchengming@bytedance.com> wrote: > > On 2022/10/29 19:55, Peter Zijlstra wrote: > > On Fri, Oct 28, 2022 at 09:53:57PM +0200, Peter Zijlstra wrote: > >> On Fri, Oct 28, 2022 at 08:58:03AM -0700, Suren Baghdasaryan wrote: > >> > >>> Not sure what went wrong. Peter, could you please replace this one > >> > >> Probably me being an idiot and searching on subject instead of msgid :/ > >> > >> I'll go fix up -- tomorrow though, it's late and I'm likely to mess it > >> up again. > > > > Can you please check queue.git/sched/core ; did I get it right this > > time? > > I just checked that three patches, LGTM. Yep, all three patches are correct. Thanks! > > And would you mind picking up this, by the way? > > https://lore.kernel.org/all/20220926081931.45420-1-zhouchengming@bytedance.com/ > > Thanks!
On 2022/10/28 23:58, Suren Baghdasaryan wrote:
> On Thu, Oct 27, 2022 at 11:50 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Hello,
>>
>> Thanks for picking this up. There is a newer version which has been acked:
>> https://lore.kernel.org/all/20221014110551.22695-1-zhouchengming@bytedance.com/
>
> Hmm. Indeed this seems to be an older version and not the one I asked
> Peter to pick up in
> https://lore.kernel.org/all/CAJuCfpHeJuZBbv-q+WXjgNHwt_caMomFPL3L9rxosXOrZz3fBw@mail.gmail.com/.
> Not sure what went wrong. Peter, could you please replace this one
> with https://lore.kernel.org/all/20221014110551.22695-1-zhouchengming@bytedance.com/?
Oh, I didn't notice that email.
>
> Chengming, please do not top-post next time. Would be better if you
> posted your note under the "Link:" field in this email.
Got it, I will do next time.
Thanks!
> Thanks!
>
>>
>> As well another PSI patch that has been acked by Johannes:
>> https://lore.kernel.org/all/20220926081931.45420-1-zhouchengming@bytedance.com/
>>
>> Thanks!
>>
>>
>> On 2022/10/28 14:42, tip-bot2 for Chengming Zhou wrote:
>>> The following commit has been merged into the sched/core branch of tip:
>>>
>>> Commit-ID: 7d89d7bb921c5ae5a428df282e64ee5692e26fe0
>>> Gitweb: https://git.kernel.org/tip/7d89d7bb921c5ae5a428df282e64ee5692e26fe0
>>> Author: Chengming Zhou <zhouchengming@bytedance.com>
>>> AuthorDate: Mon, 10 Oct 2022 18:42:06 +08:00
>>> Committer: Peter Zijlstra <peterz@infradead.org>
>>> CommitterDate: Thu, 27 Oct 2022 11:01:23 +02:00
>>>
>>> sched/psi: Fix avgs_work re-arm in psi_avgs_work()
>>>
>>> Pavan reported a problem that PSI avgs_work idle shutoff is not
>>> working at all. Because PSI_NONIDLE condition would be observed in
>>> psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
>>> only the kworker running avgs_work on the CPU.
>>>
>>> Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
>>> avoided the ping-pong wake problem when the worker sleep, psi_avgs_work()
>>> still will always re-arm the avgs_work, so shutoff is not working.
>>>
>>> This patch changes to consider current CPU groupc as IDLE if the
>>> kworker running avgs_work is the only task running and no IOWAIT
>>> or MEMSTALL sleep tasks, in which case we will shut off the avgs_work
>>> if other CPUs' groupc are also IDLE.
>>>
>>> One potential problem is that the brief period of non-idle time
>>> incurred between the aggregation run and the kworker's dequeue will
>>> be stranded in the per-cpu buckets until avgs_work run next time.
>>> The buckets can hold 4s worth of time, and future activity will wake
>>> the avgs_work with a 2s delay, giving us 2s worth of data we can leave
>>> behind when shut off the avgs_work. If the kworker run other works after
>>> avgs_work shut off and doesn't have any scheduler activities for 2s,
>>> this maybe a problem.
>>>
>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>>> Link: https://lore.kernel.org/r/20221010104206.12184-1-zhouchengming@bytedance.com
>>> ---
>>> kernel/sched/psi.c | 15 +++++++++++++++
>>> 1 file changed, 15 insertions(+)
>>>
>>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>>> index ee2ecc0..f4cdf6f 100644
>>> --- a/kernel/sched/psi.c
>>> +++ b/kernel/sched/psi.c
>>> @@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>> u32 *pchanged_states)
>>> {
>>> struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
>>> + int current_cpu = raw_smp_processor_id();
>>> + bool only_avgs_work = false;
>>> u64 now, state_start;
>>> enum psi_states s;
>>> unsigned int seq;
>>> @@ -256,6 +258,15 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>> memcpy(times, groupc->times, sizeof(groupc->times));
>>> state_mask = groupc->state_mask;
>>> state_start = groupc->state_start;
>>> + /*
>>> + * This CPU has only avgs_work kworker running, snapshot the
>>> + * newest times then don't need to re-arm for this groupc.
>>> + * Normally this kworker will sleep soon and won't wake
>>> + * avgs_work back up in psi_group_change().
>>> + */
>>> + if (current_cpu == cpu && groupc->tasks[NR_RUNNING] == 1 &&
>>> + !groupc->tasks[NR_IOWAIT] && !groupc->tasks[NR_MEMSTALL])
>>> + only_avgs_work = true;
>>> } while (read_seqcount_retry(&groupc->seq, seq));
>>>
>>> /* Calculate state time deltas against the previous snapshot */
>>> @@ -280,6 +291,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
>>> if (delta)
>>> *pchanged_states |= (1 << s);
>>> }
>>> +
>>> + /* Clear PSI_NONIDLE so avgs_work won't be re-armed for this groupc */
>>> + if (only_avgs_work)
>>> + *pchanged_states &= ~(1 << PSI_NONIDLE);
>>> }
>>>
>>> static void calc_avgs(unsigned long avg[3], int missed_periods,
© 2016 - 2026 Red Hat, Inc.