From nobody Mon Apr 27 01:53:16 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 19C95C43334 for ; Mon, 20 Jun 2022 02:11:58 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230024AbiFTCLz (ORCPT ); Sun, 19 Jun 2022 22:11:55 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55976 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229774AbiFTCLx (ORCPT ); Sun, 19 Jun 2022 22:11:53 -0400 Received: from mail-pj1-x102d.google.com (mail-pj1-x102d.google.com [IPv6:2607:f8b0:4864:20::102d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 748AAB871 for ; Sun, 19 Jun 2022 19:11:52 -0700 (PDT) Received: by mail-pj1-x102d.google.com with SMTP id go6so4108118pjb.0 for ; Sun, 19 Jun 2022 19:11:52 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=from:to:cc:subject:date:message-id:mime-version :content-transfer-encoding; bh=hhN9Eeyd98g/WZSwmGVpVtmeYBzF+xaNE9o0DjrSVoY=; b=Grmhq4ImiNyhow8zp+4IVr/E3h/bG3Tk+XoVIqKA8meP9Vin6U3PYOk/wBFt/8zMpm KyHU9TxXB3s3YiYNad0j4NhFHOUl8cPf1wiCuoGc2AHJSCYEcexmuaF/aQb+/K2Iz+UG lMDvOVwLCY7TAdrRHwiQuPaXfsIIU7zLGkAt2Fq1SijFyFDQZ2DxJHKZtIJRPcrRdCQ+ 2j/gwJ5qNBt+Gq5W2/TQ8P4OHfqShB5s9e9KygEZ3AfdJDfaGsMOI1SmOVyR1qSG8P1a bqbl53aeB38C8bDnaz/YixupPXGe6s7sVqrDzyLSUFfUa6oPF+QAh2UVCJsiBOIz6RAg S6Vw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:mime-version :content-transfer-encoding; bh=hhN9Eeyd98g/WZSwmGVpVtmeYBzF+xaNE9o0DjrSVoY=; b=oPlRV7WjtcBwpqZqcjscgLPobNxKw+blKsQuJ3fKR2XJL+2Tk815MUqSdNstmWXBjh NkSzwFL5pi2nYbdNaefRxdoyYf6U3YXr0ns7tTPcx+oYWcKv9uH9YCw5p5hrKZbA2XuW EFJmVHKDQ/qiTANdM2AsNy0NQ0h9oyshwCcadsViiVHWAEu27VpT7COsZVRYCEXYtOPV 
G549Q8yoNXeTugdyLPYaTnBGJ3Nsji2YpIpv/aZf89KLrPevY88RZgLhejFL2lGXvsmG r70uwlx3O5LzZtPfIVCs3wmNKU25/sWa9IoY0rpW5DtBqfM2Z38QhhHhshEO9DpJELNS thyw== X-Gm-Message-State: AJIora81+HoOR+vW4IoDe/ELJkNCWnDnq0Wm8320pde93oj2sqx2LeY3 5G1+iJCPq/PrMD/XOnperP5yMOV3Oco= X-Google-Smtp-Source: AGRyM1vqV9YPfyW2ak9jAy+RGRSOQ6R+NF8PO2z7k/cplOIPjik/QQEKFtW4XN/iSdIRJ+pzbKDOdA== X-Received: by 2002:a17:903:2303:b0:166:313f:a85f with SMTP id d3-20020a170903230300b00166313fa85fmr21865669plh.57.1655691111749; Sun, 19 Jun 2022 19:11:51 -0700 (PDT) Received: from localhost.localdomain ([193.203.214.57]) by smtp.gmail.com with ESMTPSA id a30-20020a62d41e000000b00518c8397e7esm7580157pfh.211.2022.06.19.19.11.50 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Sun, 19 Jun 2022 19:11:51 -0700 (PDT) From: CGEL X-Google-Original-From: CGEL To: hannes@cmpxchg.org Cc: ran.xiaokai@zte.com.cn, linux-kernel@vger.kernel.org, cgel Subject: [PATCH] [RFC patch] psi: introduce memory.pressure.stat Date: Mon, 20 Jun 2022 02:11:46 +0000 Message-Id: <20220620021146.936566-1-ran.xiaokai@zte.com.cn> X-Mailer: git-send-email 2.25.1 MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: cgel PSI memory pressure accounts for all memory stalls at the system level, but does not provide detailed information about why the stalls happen. This patch introduces a cgroup knob, memory.pressure.stat, which reports the detailed stall information for each class of memory stall event; its format, and the corresponding proc interface, are shown below.
kswapd: avg10=3D0.00 avg60=3D0.00 avg300=3D0.00 total=3D0 direct reclaim: avg10=3D0.00 avg60=3D0.00 avg300=3D0.12 total=3D42356 kcompacted: avg10=3D0.00 avg60=3D0.00 avg300=3D0.00 total=3D0 direct compact: avg10=3D0.00 avg60=3D0.00 avg300=3D0.00 total=3D0 cgroup reclaim: avg10=3D0.00 avg60=3D0.00 avg300=3D0.00 total=3D0 workingset thrashing: avg10=3D0.00 avg60=3D0.00 avg300=3D0.00 total=3D0 Signed-off-by: cgel --- include/linux/psi.h | 7 +-- include/linux/psi_types.h | 34 +++++++++++++ kernel/cgroup/cgroup.c | 11 ++++ kernel/sched/psi.c | 126 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 168 insertions(+), 10 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 7b3de73..163da43 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -19,10 +19,11 @@ void psi_init(void); void psi_task_change(struct task_struct *task, int clear, int set); =20 void psi_memstall_tick(struct task_struct *task, int cpu); -void psi_memstall_enter(unsigned long *flags); -void psi_memstall_leave(unsigned long *flags); +void psi_memstall_enter(unsigned long *flags, int mem_state); +void psi_memstall_leave(unsigned long *flags, int mem_state); =20 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); +int psi_mem_pressure_stat_show(struct seq_file *m, void *v); =20 #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgrp); @@ -41,7 +42,7 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, =20 static inline void psi_init(void) {} =20 -static inline void psi_memstall_enter(unsigned long *flags) {} -static inline void psi_memstall_leave(unsigned long *flags) {} +static inline void psi_memstall_enter(unsigned long *flags, int mem_state) {} +static inline void psi_memstall_leave(unsigned long *flags, int mem_state) {} =20 #ifdef CONFIG_CGROUPS diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 07aaf9b..8200623 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -9,6 +9,8 @@ =20 #ifdef CONFIG_PSI =20 +#define PSI_MASK(x) ((1UL << (x))-1) + /*
Tracked task states */ enum psi_task_count { NR_IOWAIT, @@ -22,6 +24,10 @@ enum psi_task_count { #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) =20 +#define TSK_COUNT_MASK PSI_MASK(NR_PSI_TASK_COUNTS) +#define TSK_COUNT_SHIFT 8 + + /* Resources that workloads could be stalled on */ enum psi_res { PSI_IO, @@ -53,6 +59,27 @@ enum psi_aggregators { NR_PSI_AGGREGATORS, }; =20 +/* Causes for mem pressure */ +enum psi_memstall_states { + PSI_MEM_KSWAPD, + PSI_MEM_DRECALAIM, + PSI_MEM_KCOMPACTED, + PSI_MEM_DCOMPACT, + PSI_MEM_CGROUP, + PSI_MEM_SWAP, + PSI_MEM_WORKINGSET, + PSI_MEM_STATES, +}; + +#define TSK_MEMSTALL_SHIFT 8 +#define TSK_MEMSTALL_KSWAPD (1 << (PSI_MEM_KSWAPD + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_DRECLAIM (1 << (PSI_MEM_DRECALAIM + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_KCOMPACTED (1 << (PSI_MEM_KCOMPACTED + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_DCOMPACT (1 << (PSI_MEM_DCOMPACT + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_CGROUP (1 << (PSI_MEM_CGROUP + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_WORKINGSET (1 << (PSI_MEM_WORKINGSET + TSK_MEMSTALL_SHIFT)) +#define TSK_MEMSTALL_MASK (PSI_MASK(TSK_MEMSTALL_SHIFT) << TSK_COUNT_SHIFT) + struct psi_group_cpu { /* 1st cacheline updated by the scheduler */ =20 @@ -64,9 +91,11 @@ struct psi_group_cpu { =20 /* Aggregate pressure state derived from the tasks */ u32 state_mask; + u32 state_memstall; =20 /* Period time sampling buckets for each state of interest (ns) * */ u32 times[NR_PSI_STATES]; + u32 times_mem[PSI_MEM_STATES]; =20 /* Time of last task change in this group (rq_clock) */ u64 state_start; @@ -76,6 +105,7 @@ struct psi_group_cpu { /* Delta detection against the sampling buckets */ u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] ____cacheline_aligned_in_smp; + u32 times_mem_prev[PSI_MEM_STATES]; }; =20 /* PSI growth tracking window */ @@ -144,6 +174,10 @@ struct psi_group { u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; unsigned long avg[NR_PSI_STATES -
1][3]; =20 + u64 total_mems[PSI_MEM_STATES]; + unsigned long avg_mems[PSI_MEM_STATES][3]; + u64 avg_total_mems[PSI_MEM_STATES]; + /* Monitor work control */ atomic_t poll_scheduled; struct kthread_worker __rcu *poll_kworker; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 806fc9d..b50ab92 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3613,6 +3613,13 @@ static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) =20 return psi_show(seq, psi, PSI_MEM); } +static int cgroup_memory_pressure_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup =3D seq_css(seq)->cgroup; + struct psi_group *psi =3D cgroup->id =3D=3D 1 ? &psi_system : &cgroup->psi; + + return psi_mem_pressure_stat_show(seq, psi); +} static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgroup =3D seq_css(seq)->cgroup; @@ -4930,6 +4937,10 @@ static struct cftype cgroup_base_files[] =3D { .poll =3D cgroup_pressure_poll, .release =3D cgroup_pressure_release, }, + { + .name =3D "memory.pressure.stat", + .seq_show =3D cgroup_memory_pressure_stat_show, + }, { .name =3D "cpu.pressure", .seq_show =3D cgroup_cpu_pressure_show, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9154e74..072d535 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -279,6 +279,35 @@ static void get_recent_times(struct psi_group *group, int cpu, } } =20 +static void get_recent_mem_times(struct psi_group *group, int cpu, u32 *times_mem) +{ + struct psi_group_cpu *groupc =3D per_cpu_ptr(group->pcpu, cpu); + u64 now, state_start; + enum psi_memstall_states s; + unsigned int seq; + u32 state_mask; + + do { + seq =3D read_seqcount_begin(&groupc->seq); + now =3D cpu_clock(cpu); + memcpy(times_mem, groupc->times_mem, sizeof(groupc->times_mem)); + state_mask =3D groupc->state_mask; + state_start =3D groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + for (s =3D 0; s < PSI_MEM_STATES; s++) { + u32 delta; + + 
if (state_mask & (1 << s)) + times_mem[s] +=3D now - state_start; + + delta =3D times_mem[s] - groupc->times_mem_prev[s]; + groupc->times_mem_prev[s] =3D times_mem[s]; + + times_mem[s] =3D delta; + } +} + static void calc_avgs(unsigned long avg[3], int missed_periods, u64 time, u64 period) { @@ -304,6 +333,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] =3D { 0, }; + u64 delta_mems[PSI_MEM_STATES - 1] =3D { 0, }; unsigned long nonidle_total =3D 0; u32 changed_states =3D 0; int cpu; @@ -319,11 +349,16 @@ static void collect_percpu_times(struct psi_group *group, */ for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; + u32 times_mem[PSI_MEM_STATES]; + u32 nonidle; u32 cpu_changed_states; =20 get_recent_times(group, cpu, aggregator, times, &cpu_changed_states); + if (times[PSI_MEM_SOME]) + get_recent_mem_times(group, cpu, times_mem); + changed_states |=3D cpu_changed_states; =20 nonidle =3D nsecs_to_jiffies(times[PSI_NONIDLE]); @@ -350,6 +385,10 @@ static void collect_percpu_times(struct psi_group *group, group->total[aggregator][s] +=3D div_u64(deltas[s], max(nonidle_total, 1UL)); =20 + for (s =3D 0; s < PSI_MEM_STATES - 1; s++) + group->total_mems[s] +=3D + div_u64(delta_mems[s], max(nonidle_total, 1UL)); + if (pchanged_states) *pchanged_states =3D changed_states; } @@ -404,6 +443,16 @@ static u64 update_averages(struct psi_group *group, u64 now) calc_avgs(group->avg[s], missed_periods, sample, period); } =20 + for (s =3D 0; s < PSI_MEM_STATES - 1; s++) { + u32 sample; + + sample =3D group->total_mems[s] - group->avg_total_mems[s]; + if (sample > period) + sample =3D period; + group->avg_total_mems[s] +=3D sample; + calc_avgs(group->avg_mems[s], missed_periods, sample, period); + } + return avg_next_update; } =20 @@ -628,6 +677,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, { u32 delta; u64 now; + int state_memstall =3D groupc->state_memstall; =20 now =3D cpu_clock(cpu); 
delta =3D now - groupc->state_start; @@ -641,6 +691,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, =20 if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] +=3D delta; + groupc->times_mem[state_memstall] +=3D delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] +=3D delta; else if (memstall_tick) { @@ -676,7 +727,12 @@ static u32 psi_group_change(struct psi_group *group, int cpu, unsigned int t, m; enum psi_states s; u32 state_mask =3D 0; + u32 state_memstall =3D 0; =20 + if (set & TSK_MEMSTALL) { + state_memstall =3D set & TSK_MEMSTALL_MASK; + set &=3D TSK_COUNT_MASK; + } groupc =3D per_cpu_ptr(group->pcpu, cpu); =20 /* @@ -714,7 +770,7 @@ static u32 psi_group_change(struct psi_group *group, int cpu, state_mask |=3D (1 << s); } groupc->state_mask =3D state_mask; - + groupc->state_memstall =3D state_memstall; write_seqcount_end(&groupc->seq); =20 return state_mask; @@ -810,7 +866,7 @@ void psi_memstall_tick(struct task_struct *task, int cpu) * Marks the calling task as being stalled due to a lack of memory, * such as waiting for a refault or performing reclaim. */ -void psi_memstall_enter(unsigned long *flags) +void psi_memstall_enter(unsigned long *flags, int mem_state) { struct rq_flags rf; struct rq *rq; @@ -829,7 +885,7 @@ void psi_memstall_enter(unsigned long *flags) rq =3D this_rq_lock_irq(&rf); =20 current->flags |=3D PF_MEMSTALL; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | mem_state); =20 rq_unlock_irq(rq, &rf); } @@ -840,7 +896,7 @@ void psi_memstall_enter(unsigned long *flags) * * Marks the calling task as no longer stalled due to lack of memory. 
*/ -void psi_memstall_leave(unsigned long *flags) +void psi_memstall_leave(unsigned long *flags, int mem_state) { struct rq_flags rf; struct rq *rq; @@ -858,7 +914,7 @@ rq =3D this_rq_lock_irq(&rf); =20 current->flags &=3D ~PF_MEMSTALL; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | mem_state, 0); =20 rq_unlock_irq(rq, &rf); } @@ -974,6 +1030,53 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } =20 +const char * const memstall_text[] =3D { + "kswapd", + "direct reclaim", + "kcompacted", + "direct compact", + "cgroup reclaim", + "swap", + "workingset", +}; + +int psi_mem_pressure_stat_show(struct seq_file *m, void *v) +{ + int s; + u64 now; + struct psi_group *group =3D &psi_system; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + mutex_lock(&group->avgs_lock); + now =3D sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >=3D group->avg_next_update) + group->avg_next_update =3D update_averages(group, now); + mutex_unlock(&group->avgs_lock); + + for (s =3D 0; s < PSI_MEM_STATES; s++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w =3D 0; w < 3; w++) + avg[w] =3D group->avg_mems[s][w]; + + total =3D div_u64(group->total_mems[s], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=3D%lu.%02lu avg60=3D%lu.%02lu avg300=3D%lu.%02lu total=3D%llu\n", + memstall_text[s], + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + static int psi_io_show(struct seq_file *m, void *v) { return psi_show(m, &psi_system, PSI_IO); @@ -998,7 +1101,10 @@ static int psi_memory_open(struct inode *inode, struct file *file) { return single_open(file, psi_memory_show, NULL); } - +static int psi_memory_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_mem_pressure_stat_show, NULL); +} static
int psi_cpu_open(struct inode *inode, struct file *file) { return single_open(file, psi_cpu_show, NULL); @@ -1271,7 +1377,12 @@ static const struct file_operations psi_memory_fops =3D { .poll =3D psi_fop_poll, .release =3D psi_fop_release, }; - +static const struct file_operations psi_memory_stat_fops =3D { + .open =3D psi_memory_stat_open, + .read =3D seq_read, + .llseek =3D seq_lseek, + .release =3D psi_fop_release, +}; static const struct file_operations psi_cpu_fops =3D { .open =3D psi_cpu_open, .read =3D seq_read, @@ -1286,6 +1397,7 @@ static int __init psi_proc_init(void) proc_mkdir("pressure", NULL); proc_create("pressure/io", 0, NULL, &psi_io_fops); proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/memory_stat", 0, NULL, &psi_memory_stat_fops); proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); return 0; } --=20 2.15.2