From nobody Mon Jun 8 09:51:03 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8A38F25B0BF for ; Sat, 30 May 2026 09:03:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.35 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131800; cv=none; b=ahN/0bf6I2J8qzkNIwVKg5qcPzrA3xQ1DAn4cDpzBLLnkqzPwCswjnJzuyls/YyDOXdUziY9ikmPi7YufrOTp1GG3R0L2UhBFifMlEsdv/btpbfkizRjviXKL3VKImvNkx/zhoq//MrIGFWWL8eXONeJwGL6/mGbDWGH+9GF3ko= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131800; c=relaxed/simple; bh=r305FnM0O5UDSQXdVGtRToGKkwIg585DoKKyHIzJH+Y=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=hklAJ1C1QhNndnAryJ7z87H9YjmKwhS6FkOGJ2l3BisoUUfzadJYR8sXXrg1zNOegU3jsWlF6bkhwk9RIqhGPVlRTS/yXCCah3NKXLHoiqUQnXxiH9ZcxG33vgHj5bztzFxDIk/b/4Wegc/abCc287FocnQiE2g3a43spJ//qGk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.35 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gSDlm2pSlz8Xrnw; Sat, 30 May 2026 17:03:16 +0800 (CST) Received: from xaxapp01.zte.com.cn ([10.88.99.176]) by mse-fl2.zte.com.cn with SMTP id 64U937c1049996; Sat, 30 May 2026 17:03:07 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi 
(xaxapp01[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:03:09 +0800 (CST) X-Zmail-TransId: 2af96a1aa7cdd7b-1a488 X-Mailer: Zmail v1.0 Message-ID: <202605301703094695zmVgcSC27BNR0rH0N8_x@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:03:09 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDEvNl0gbW0vcm1hcDogYWRkIHRyYWNlcG9pbnQgZm9yIHJtYXBfd2Fsaw==?= X-MAIL: mse-fl2.zte.com.cn 64U937c1049996 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.133 unknown Sat, 30 May 2026 17:03:16 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA7D4.000/4gSDlm2pSlz8Xrnw Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin Add trace_rmap_walk_start() and trace_rmap_walk_end() to bracket reverse mapping walks. Unlike manual clock sampling, these tracepoints record no timestamp; latency can be computed offline by tools (e.g., perf, trace-cmd) using the event timestamps. When tracepoints are disabled, the only cost is a static branch check (no clock read, no duration calculation), making them suitable for production use. The information (folio type, locked state) helps diagnose performance issues in KSM, anonymous, and file-backed rmap walks. 
Signed-off-by: xu xin --- include/trace/events/rmap.h | 67 +++++++++++++++++++++++++++++++++++++ mm/rmap.c | 9 +++++ 2 files changed, 76 insertions(+) create mode 100644 include/trace/events/rmap.h diff --git a/include/trace/events/rmap.h b/include/trace/events/rmap.h new file mode 100644 index 000000000000..55a319ba6235 --- /dev/null +++ b/include/trace/events/rmap.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmap + +#if !defined(_TRACE_RMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMAP_H + +#include +#include + +#define GET_RMAP_PAGE_TYPE(folio) (folio_test_ksm(folio) ? "ksm" : \ + (folio_test_anon(folio) ? "anon" : "file")) + +/** + * rmap_walk_template - called for start / stop of rmap_walk. + */ +DECLARE_EVENT_CLASS(rmap_walk_template, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked), + + TP_STRUCT__entry( + __field(unsigned long, folio_addr) + __field(unsigned long, rwc_addr) + __string(page_type, GET_RMAP_PAGE_TYPE(folio)) + __field(bool, locked) + ), + + TP_fast_assign( + __entry->folio_addr =3D (unsigned long)folio; + __entry->rwc_addr =3D (unsigned long)rwc; + __assign_str(page_type); + __entry->locked =3D locked; + ), + + TP_printk("folio=3D%p rwc=3D%p page_type=3D%s locked=3D%s", + (void *)(unsigned long)__entry->folio_addr, + (void *)(unsigned long)__entry->rwc_addr, + __get_str(page_type), + __entry->locked ? "true" : "false") + +); + +/** + * rmap_walk_start - called before a folio is rmapped. 
+ */ +DEFINE_EVENT(rmap_walk_template, rmap_walk_start, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked) +); + +DEFINE_EVENT(rmap_walk_template, rmap_walk_end, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked) +); + + +#endif /* _TRACE_RMAP_H */ + +/* This part must be outside protection */ +#include + diff --git a/mm/rmap.c b/mm/rmap.c index 78b7fb5f367c..52f795f768e1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,6 +80,7 @@ #define CREATE_TRACE_POINTS #include +#include #include "internal.h" #include "swap.h" @@ -3098,23 +3099,31 @@ static void rmap_walk_file(struct folio *folio, void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, false); + if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); + + trace_rmap_walk_end(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, true); + /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); + + trace_rmap_walk_end(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE --=20 2.25.1 From nobody Mon Jun 8 09:51:03 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [58.251.27.85]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F40EE30E85C for ; Sat, 30 May 2026 09:04:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=58.251.27.85 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131872; cv=none; 
b=KTOFijMG2VV1aDtPjPh7Br6eYFMkmw8cmUSsLatB/dsGrF1YS0w9zrskh/ebjxZlfzWiaK+nDK8wlv6YCZqIvNG5GnNu8Zda77FiBMMJhHd0zA0J+2ReUL1vtw6nBfmTngehDCTLgGzPFsR9wqvjgqBWjrxS5z/CZWXr8S/nmk4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131872; c=relaxed/simple; bh=0yaUm7Hx4JSuGkQ1oWD3j2lSrfg8fSgfuMapvuD7lNg=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=PPI5FKnf1X1DxMvarq0JN5u0OxNn05kme0YpVtwXL/fvB4VJBxvc/uNnwrEQFQL8fsvqiCH2EHFvraUgaD2AX8HfryV3HKbXnf8JGPMmT5lhfkuf1Su1nDvqPZdHjdLLERMmeObb8AJVErMUfyuxFpt/+1fdhOGRWS5SHQyA9q4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=58.251.27.85 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mxde.zte.com.cn (unknown [10.35.20.165]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gSDn05NBMzKj3 for ; Sat, 30 May 2026 17:04:20 +0800 (CST) Received: from mxhk.zte.com.cn (unknown [192.168.250.138]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxde.zte.com.cn (FangMail) with ESMTPS id 4gSDmp59kGz5TCG9 for ; Sat, 30 May 2026 17:04:10 +0800 (CST) Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gSDml29vyz5BNRf; Sat, 30 May 2026 17:04:07 +0800 (CST) Received: 
from xaxapp05.zte.com.cn ([10.99.98.109]) by mse-fl1.zte.com.cn with SMTP id 64U943Yw034760; Sat, 30 May 2026 17:04:03 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp01[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:04:04 +0800 (CST) X-Zmail-TransId: 2af96a1aa804bb2-1b2be X-Mailer: Zmail v1.0 Message-ID: <20260530170404509QpJmBtpSjn3uQHeVKA2iA@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:04:04 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDIvNl0gdG9vbHMvdGVzdGluZzogYWRkIHJtYXAgd2FsayBsYXRlbmN5IGJlbmNobWFyaw==?= Content-Type: text/plain; charset="utf-8" X-MAIL: mse-fl1.zte.com.cn 64U943Yw034760 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.35.20.165 unknown Sat, 30 May 2026 17:04:21 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA813.000/4gSDn05NBMzKj3 Content-Transfer-Encoding: quoted-printable From: xu xin Add a benchmark to measure rmap_walk latency for KSM, anonymous, and file-backed pages under high sharing. For KSM, the sysfs file "max_page_sharing" is set as 256 to make sure that every KSM page can be shared by 256 original pages. A large region is split into 20000 VMAs via mprotect then merged by KSM, whose purpose is to construct a lot of unrelated VMAs sharing its anon_vma to reproduce the issue proposed at https://lore.kernel.org/all/20260503205013850sym7UO0jvKJzyRiTYmO8V@zte.com.= cn/. For anon and file pages, 256 (equal to KSM's max_page_sharing) child processes are forked to share a single physical page (COW not broken). The benchmark uses move_pages() to trigger migration, collects rmap_walk_start/end trace events, and reports max/average latency. 
The tool also saves/restores KSM config and filters trace events by PID to avoid noise. For each test, the program prints the number of captured events and the maximum / average latency in milliseconds. This benchmark helps developers evaluate optimizations in the reverse mapping code, such as limiting max_page_sharing or improving tree traversal efficiency. Usage (must be run as root): '# cd tools/testing/rmap/ && make '# sudo ./rmap_benchmark KSM rmap_walk latency (Shared by 256 VMAs via mprotect and KSM merge): Max: 839.00 ms (838998 us) Avg: 605.80 ms (605799 us) Count: 4 events Anonymous page rmap_walk latency (Shared by 256 VMAs via fork, COW not brok= en): Max: 4.05 ms (4055 us) Avg: 2.86 ms (2858 us) Count: 2 events File page rmap_walk latency (Shared by 256 VMAs via fork, MAP_SHARED): Max: 2.57 ms (2572 us) Avg: 1.25 ms (1250 us) Count: 4 events Signed-off-by: xu xin --- tools/testing/rmap/Makefile | 11 + tools/testing/rmap/rmap_benchmark.c | 674 ++++++++++++++++++++++++++++ 2 files changed, 685 insertions(+) create mode 100644 tools/testing/rmap/Makefile create mode 100644 tools/testing/rmap/rmap_benchmark.c diff --git a/tools/testing/rmap/Makefile b/tools/testing/rmap/Makefile new file mode 100644 index 000000000000..200bd364cafb --- /dev/null +++ b/tools/testing/rmap/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +CC :=3D $(CROSS_COMPILE)gcc + +PROGS :=3D rmap_benchmark + +all: $(PROGS) + +rmap_benchmark: LDLIBS =3D -lnuma + +clean: + rm -fr $(PROGS) diff --git a/tools/testing/rmap/rmap_benchmark.c b/tools/testing/rmap/rmap_= benchmark.c new file mode 100644 index 000000000000..fdbd29cfa6f6 --- /dev/null +++ b/tools/testing/rmap/rmap_benchmark.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Reverse mapping latency test for KSM, anonymous and file pages + * + * This program creates a large number of pages (KSM merged, normal anonym= ous, + * or file mapped), splits the VMA into many small VMAs via mprotect, + * 
triggers rmap_walk by move_pages(), and collects latency data from the + * tracepoints 'rmap_walk_start' and 'rmap_walk_end' (offline timestamp di= ff). + * + * Usage: must be run as root (to access tracefs and KSM sysfs). + * + * Copyright 2026, ZTE Corp. + * + * Author(s): Xu Xin + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int page_size; + +/* Used for test_anon and test_file */ +#define NR_SHARERS 256 /* Number of child processes sharing a single page= */ + +/* KSM sysfs paths */ +#define TEST_PATTERN 0xaa +#define KSM_RUN_PATH "/sys/kernel/mm/ksm/run" +#define KSM_SLEEP_MS_PATH "/sys/kernel/mm/ksm/sleep_millisecs" +#define KSM_PAGES_TO_SCAN "/sys/kernel/mm/ksm/pages_to_scan" +#define KSM_FULL_SCANS_PATH "/sys/kernel/mm/ksm/full_scans" +#define KSM_MAX_SHARING_PATH "/sys/kernel/mm/ksm/max_page_sharing" + +/* Tracepoint control paths - enable all events under rmap */ +#define TRACE_ENABLE "/sys/kernel/tracing/events/rmap/enable" +#define TRACE_FILE "/sys/kernel/tracing/trace" + +/* + * Kernel TASK_COMM_LEN is 16. We use a slightly larger buffer + * to safely accommodate the string plus null terminator without + * depending on internal kernel headers. 
+ */ +#define COMM_BUF_SIZE 32 + +#define MAX_TRACING_PENDING 128 + +enum page_type { + PAGE_TYPE_KSM, + PAGE_TYPE_ANON, + PAGE_TYPE_FILE, +}; + +static const char *page_type_str(enum page_type type) +{ + switch (type) { + case PAGE_TYPE_KSM: return "ksm"; + case PAGE_TYPE_ANON: return "anon"; + case PAGE_TYPE_FILE: return "file"; + default: return "unknown"; + } +} + +/* Helper: read/write sysfs */ +static int write_sys(const char *path, int value) +{ + char buf[32]; + int fd; + ssize_t ret; + + snprintf(buf, sizeof(buf), "%d", value); + fd =3D open(path, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open %s failed: %s\n", path, strerror(errno)); + return -1; + } + ret =3D write(fd, buf, strlen(buf)); + close(fd); + if (ret !=3D (ssize_t)strlen(buf)) { + fprintf(stderr, "write %s failed: %s\n", path, strerror(errno)); + return -1; + } + return 0; +} + +static int read_sys_int(const char *path, int *val) +{ + FILE *fp =3D fopen(path, "r"); + + if (!fp) + return -1; + if (fscanf(fp, "%d", val) !=3D 1) { + fclose(fp); + return -1; + } + fclose(fp); + return 0; +} + +/* KSM full scan count */ +static int ksm_get_full_scans(void) +{ + int val; + + if (read_sys_int(KSM_FULL_SCANS_PATH, &val)) + return -1; + + return val; +} + +/* Wait for KSM full scans */ +static void wait_ksm_merge(void) +{ + int start_scans, end_scans; + int max_wait =3D 60, waited =3D 0; + + start_scans =3D ksm_get_full_scans(); + if (start_scans < 0) { + fprintf(stderr, "Failed to read initial full_scans\n"); + return; + } + if (write_sys(KSM_RUN_PATH, 1) < 0) { + fprintf(stderr, "Failed to start KSM\n"); + return; + } + do { + sleep(1); + end_scans =3D ksm_get_full_scans(); + if (end_scans < 0) + return; + waited++; + if (waited > max_wait) { + fprintf(stderr, "Warning: KSM full_scans not increased after %ds\n", ma= x_wait); + break; + } + } while (end_scans < start_scans + 2); +} + +static void disable_tracepoint(void) +{ + write_sys(TRACE_ENABLE, 0); +} + +/* Tracepoint enable/disable */ +static 
int enable_tracepoint(void) +{ + struct stat st; + int fd; + + if (stat("/sys/kernel/tracing/trace", &st) !=3D 0) { + if (mount("tracefs", "/sys/kernel/tracing", "tracefs", 0, NULL) !=3D 0) + fprintf(stderr, "Warning: mount tracefs failed: %s\n", strerror(errno)); + } + + if (write_sys(TRACE_ENABLE, 1) < 0) + return -1; + + fd =3D open(TRACE_FILE, O_WRONLY | O_TRUNC); + if (fd < 0) { + perror("open " TRACE_FILE); + disable_tracepoint(); + return -1; + } + close(fd); + + return 0; +} + +/* + * Get current process comm (task_struct->comm). + * Returns 0 on success, -1 on failure. + */ +static int get_self_comm(char *buf, size_t size) +{ + FILE *fp =3D fopen("/proc/self/comm", "r"); + size_t len; + + if (!fp) + return -1; + + if (!fgets(buf, size, fp)) { + fclose(fp); + return -1; + } + fclose(fp); + + /* Strip trailing newline */ + len =3D strlen(buf); + if (len > 0 && buf[len - 1] =3D=3D '\n') + buf[len - 1] =3D '\0'; + + return 0; +} + +/* Timestamp extraction (us) */ +static unsigned long long extract_timestamp_us(const char *line) +{ + char time_str[32]; + double ts_sec =3D 0.0; + + if (sscanf(line, "%*s %*s %*s %31s", time_str) =3D=3D 1) { + char *colon =3D strchr(time_str, ':'); + + if (colon) + *colon =3D '\0'; + ts_sec =3D strtod(time_str, NULL); + } + return (unsigned long long)(ts_sec * 1e6); +} + +/* Safe start/end pairing using folio and rwc addresses, plus PID filter */ +struct pending_start { + unsigned long long ts; + unsigned long folio; + unsigned long rwc; +}; + +static int parse_trace_and_print(enum page_type type, unsigned long long *= max_us, + unsigned long long *avg_us, int *count) +{ + FILE *fp =3D fopen(TRACE_FILE, "r"); + + if (!fp) { + perror("fopen " TRACE_FILE); + return -1; + } + + char line[1024]; + struct pending_start pending[MAX_TRACING_PENDING]; + int pending_cnt =3D 0; + unsigned long long sum =3D 0, max_val =3D 0; + int pairs =3D 0; + const char *type_str =3D page_type_str(type); + char type_pattern[64]; + char 
my_comm[COMM_BUF_SIZE]; + bool overflow_warned =3D false; + /* COMM(32) + '-' + PID(max 7 digits), using 16 for pid is safe */ + char match_pattern[COMM_BUF_SIZE + 16]; + + if (get_self_comm(my_comm, sizeof(my_comm)) < 0) { + fprintf(stderr, "Failed to read /proc/self/comm\n"); + fclose(fp); + return -1; + } + + snprintf(type_pattern, sizeof(type_pattern), "page_type=3D%s", type_str); + /* Get current PID for filtering */ + pid_t mypid =3D getpid(); + + snprintf(match_pattern, sizeof(match_pattern), "%s-%d ", my_comm, mypid); + + while (fgets(line, sizeof(line), fp)) { + /* Filter by COMM-PID: line should start with "COMM-PID" */ + if (!strstr(line, match_pattern)) + continue; + if (!strstr(line, type_pattern)) + continue; + + /* Extract folio and rwc addresses */ + unsigned long folio =3D 0, rwc =3D 0; + char *folio_str =3D strstr(line, "folio=3D"); + char *rwc_str =3D strstr(line, "rwc=3D"); + + if (folio_str && rwc_str) { + folio =3D strtoul(folio_str + 6, NULL, 16); + rwc =3D strtoul(rwc_str + 4, NULL, 16); + } else { + continue; + } + + if (strstr(line, "rmap_walk_start:")) { + if (pending_cnt < MAX_TRACING_PENDING) { + pending[pending_cnt].ts =3D extract_timestamp_us(line); + pending[pending_cnt].folio =3D folio; + pending[pending_cnt].rwc =3D rwc; + pending_cnt++; + } else if (!overflow_warned) { + fprintf(stderr, "Warning: pending_start overflow, some events may be l= ost\n"); + overflow_warned =3D true; /* Only warn once */ + } + } else if (strstr(line, "rmap_walk_end:")) { + unsigned long long end_ts =3D extract_timestamp_us(line); + /* Find matching start event */ + for (int i =3D 0; i < pending_cnt; i++) { + if (pending[i].folio =3D=3D folio && pending[i].rwc =3D=3D rwc) { + unsigned long long delta =3D end_ts - pending[i].ts; + + if (delta > max_val) + max_val =3D delta; + sum +=3D delta; + pairs++; + /* Remove this pending entry */ + pending[i] =3D pending[--pending_cnt]; + break; + } + } + } + } + fclose(fp); + + if (pairs =3D=3D 0) { + printf("No 
rmap_walk events with page_type=3D%s found.\n", type_str); + return -1; + } + + *max_us =3D max_val; + *avg_us =3D sum / pairs; + *count =3D pairs; + return 0; +} + +/* + * Trigger rmap_walk by moving a single page. + * Returns 0 on success, -1 on failure (e.g., single NUMA node). + */ +static int trigger_rmap_walk(void *region) +{ + int ret, status, cur_node, target_node; + void *pages[1]; + int nodes[1]; + + ret =3D move_pages(0, 1, (void **)&region, NULL, &status, MPOL_MF_MOVE_AL= L); + if (ret !=3D 0) { + perror("Failed to get original numa"); + return -1; + } + cur_node =3D status; + + for (target_node =3D 0; target_node <=3D numa_max_node(); target_node++) { + if (numa_bitmask_isbitset(numa_all_nodes_ptr, target_node) && target_nod= e !=3D cur_node) + break; + } + if (target_node > numa_max_node()) { + fprintf(stderr, "No other NUMA node available, cannot trigger migration.= \n"); + return -1; + } + + pages[0] =3D region; + nodes[0] =3D target_node; + ret =3D move_pages(0, 1, pages, nodes, &status, MPOL_MF_MOVE_ALL); + if (ret < 0) { + perror("move_pages"); + return -1; + } + return 0; +} + +/* + * Fork nr_forks child processes sharing the same memory region (COW not b= roken). + * The parent and all children will have a VMA mapping the same physical p= age. + * + * On success, returns nr_forks and stores the child PIDs in *out_pids. + * The caller is responsible for freeing *out_pids and killing/waiting chi= ldren. + * On failure, returns -1 and cleans up any partially forked children inte= rnally. 
+ */ +static int fork_sharers(int nr_forks, pid_t **out_pids) +{ + pid_t *pids =3D malloc(sizeof(pid_t) * nr_forks); + + if (!pids) { + perror("malloc pids"); + return -1; + } + + int i, forked =3D 0; + + for (i =3D 0; i < nr_forks; i++) { + pid_t pid =3D fork(); + + if (pid =3D=3D 0) { + /* Child: just wait for signal to exit */ + free(pids); + pause(); + exit(0); + } else if (pid > 0) { + pids[i] =3D pid; + forked++; + } else { + perror("fork"); + break; + } + } + + if (forked < nr_forks) { + /* Fork failed: kill already forked children */ + for (int j =3D 0; j < forked; j++) { + kill(pids[j], SIGTERM); + waitpid(pids[j], NULL, 0); + } + free(pids); + return -1; + } + + /* Give children a moment to settle */ + usleep(100000); + + *out_pids =3D pids; + return forked; +} + +/* Helper: kill and reap all child sharers */ +static void cleanup_children(pid_t *pids, int nr_children) +{ + for (int i =3D 0; i < nr_children; i++) { + kill(pids[i], SIGTERM); + waitpid(pids[i], NULL, 0); + } + free(pids); +} + + +/* + * Split VMA with mprotect (used only for KSM test). + * Returns number of successful mprotects (or -1 on error). 
+ */ +static int split_vma_with_mprotect(void *addr, size_t size) +{ + int splits =3D 0; + size_t pages =3D size / page_size; + + for (size_t i =3D 0; i < pages; i++) { + if (i % 2 =3D=3D 0) { + if (mprotect(addr + i * page_size, page_size, PROT_READ) < 0) { + perror("mprotect"); + return -1; + } + splits++; + } + } + + return splits; +} + +/* KSM configuration save/restore */ +static struct ksm_config { + int run; + int sleep_ms; + int pages_to_scan; + int max_page_sharing; +} orig_ksm; + +static int save_ksm_config(void) +{ + if (read_sys_int(KSM_RUN_PATH, &orig_ksm.run) || + read_sys_int(KSM_SLEEP_MS_PATH, &orig_ksm.sleep_ms) || + read_sys_int(KSM_PAGES_TO_SCAN, &orig_ksm.pages_to_scan) || + read_sys_int(KSM_MAX_SHARING_PATH, &orig_ksm.max_page_sharing)) { + fprintf(stderr, "Failed to read KSM config\n"); + return -1; + } + return 0; +} + +static void restore_ksm_config(void) +{ + write_sys(KSM_RUN_PATH, orig_ksm.run); + write_sys(KSM_SLEEP_MS_PATH, orig_ksm.sleep_ms); + write_sys(KSM_PAGES_TO_SCAN, orig_ksm.pages_to_scan); + write_sys(KSM_MAX_SHARING_PATH, orig_ksm.max_page_sharing); +} + +/* + * KSM test (shared by many VMAs via mprotect splitting). + * the total memory area is 20000 pages, and split into 20000 VMAs. + * Restricted by KSM config (max_page_sharing =3D 256), so any one of + * KSM pages is shared by 256 VMAs at maximum. 
+ */ +static void test_ksm(void) +{ + int nr_ksm_pages =3D 20000; + int config_max_page_sharing =3D 256; + size_t size =3D nr_ksm_pages * page_size; + unsigned long long max_us, avg_us; + int count; + + if (save_ksm_config() < 0) { + printf("KSM not available, skip KSM test.\n"); + return; + } + + if (write_sys(KSM_RUN_PATH, 2) < 0 || + write_sys(KSM_SLEEP_MS_PATH, 0) < 0 || + write_sys(KSM_MAX_SHARING_PATH, config_max_page_sharing) < 0 || + write_sys(KSM_PAGES_TO_SCAN, 10000) < 0) { + fprintf(stderr, "Failed to configure KSM\n"); + goto restore_out; + } + + void *region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | M= AP_ANONYMOUS, -1, 0); + + if (region =3D=3D MAP_FAILED) { + perror("mmap for KSM"); + goto restore_out; + } + + memset(region, TEST_PATTERN, size); + if (madvise(region, size, MADV_MERGEABLE) !=3D 0) { + perror("madvise MADV_MERGEABLE"); + goto unmap_out; + } + + if (write_sys(KSM_RUN_PATH, 1) < 0) { + perror("Start KSM"); + goto unmap_out; + } + + if (split_vma_with_mprotect(region, size) =3D=3D -1) + goto unmap_out; + + wait_ksm_merge(); + + if (enable_tracepoint() !=3D 0) + goto unmap_out; + + if (trigger_rmap_walk(region + page_size) !=3D 0) { + disable_tracepoint(); + goto unmap_out; + } + usleep(100000); + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_KSM, &max_us, &avg_us, &count) =3D=3D= 0) { + printf("KSM rmap_walk latency (Shared by %d VMAs via mprotect and KSM me= rge):\n", + config_max_page_sharing); + printf(" Max: %.2f ms (%.0f us)\n", max_us/1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us/1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } +unmap_out: + munmap(region, size); +restore_out: + restore_ksm_config(); +} + +/* Anonymous test: fork many child processes to share a single physical pa= ge */ +static void test_anon(void) +{ + unsigned long long max_us, avg_us; + int count; + int nr_forks =3D NR_SHARERS - 1; + pid_t *pids =3D NULL; + + void *region =3D mmap(NULL, 
page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap anon for sharing"); + return; + } + memset(region, TEST_PATTERN, page_size); + + int forked =3D fork_sharers(nr_forks, &pids); + + if (forked < 0) { + printf("Failed to fork enough children, abort anonymous test.\n"); + goto munmap_out; + } + + if (enable_tracepoint() !=3D 0) + goto cleanup_child_out; + + if (trigger_rmap_walk(region) !=3D 0) { + disable_tracepoint(); + goto cleanup_child_out; + } + usleep(100000); + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_ANON, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("Anonymous page rmap_walk latency (Shared by %d VMAs via fork, CO= W not broken):\n", forked + 1); + printf(" Max: %.2f ms (%.0f us)\n", max_us / 1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us / 1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } + +cleanup_child_out: + cleanup_children(pids, forked); +munmap_out: + munmap(region, page_size); +} + +/* File-backed test: similar to anonymous but using MAP_SHARED file mappin= g */ +static void test_file(void) +{ + unsigned long long max_us, avg_us; + int count; + int nr_forks =3D NR_SHARERS - 1; + pid_t *pids =3D NULL; + char filename[] =3D "/tmp/rmap_test_file_XXXXXX"; + int fd =3D mkstemp(filename); + + if (fd < 0) { + perror("mkstemp"); + return; + } + unlink(filename); + if (ftruncate(fd, page_size) < 0) { + perror("ftruncate"); + goto close_fd_out; + } + + void *region =3D mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED= , fd, 0); + + if (region =3D=3D MAP_FAILED) { + perror("mmap file for sharing"); + goto close_fd_out; + } + memset(region, TEST_PATTERN, page_size); + + int forked =3D fork_sharers(nr_forks, &pids); + + if (forked < 0) { + printf("Failed to fork enough children, abort file test.\n"); + goto munmap_out; + } + + if (enable_tracepoint() !=3D 0) + goto cleanup_child_out; + + if (trigger_rmap_walk(region) 
!=3D 0) { + disable_tracepoint(); + goto cleanup_child_out; + } + + usleep(100000); + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_FILE, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("File page rmap_walk latency (Shared by %d VMAs via fork, MAP_SHA= RED):\n", forked + 1); + printf(" Max: %.2f ms (%.0f us)\n", max_us / 1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us / 1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } + +cleanup_child_out: + cleanup_children(pids, forked); +munmap_out: + munmap(region, page_size); +close_fd_out: + close(fd); +} + +int main(void) +{ + page_size =3D getpagesize(); + + if (geteuid() !=3D 0) { + fprintf(stderr, "Must be run as root.\n"); + return 1; + } + if (numa_available() < 0) { + fprintf(stderr, "NUMA not available.\n"); + return 1; + } + + test_ksm(); + test_anon(); + test_file(); + return 0; +} + --=20 2.25.1 From nobody Mon Jun 8 09:51:03 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C74AD35B63B for ; Sat, 30 May 2026 09:06:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131993; cv=none; b=QD1j4BS/DfP9RnI8Yjk7+XfTI6ghiwxkc0lh8FgcBs6EWqOCTu7ILLq1UCvypuYrsLar4jYGp1ZRC3b4AvSv6+84q7B3WJxxwNkb4G3WQRGlZyHLYARAM1FuTEulv0H0ZPCLc7joz7OUj+pffVvrEb9FM/oyKIkq4TzHKwHzyDk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780131993; c=relaxed/simple; bh=dKNSkm6b5MO8d5Wl3ECAMi/t39rDd81bBXOFHxb0Y8s=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; 
b=ZUQdjVBPlGHhaHPnUn5te4bWOUUND3TcWLjkvpYu8M+mEqH8xRvU7LOEBPTIF93sq5MqA0du/SqWPBBIq/5yj6AAf+RLZuHkzsjQ8yAXkhIzGAuvq6ldkcExu6LgJJwFfCxIRqrEaIbursA8v9QDmw7TpIFkKo2NKvILCXgHv9M= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gSDqT52tsz5BNS0; Sat, 30 May 2026 17:06:29 +0800 (CST) Received: from xaxapp01.zte.com.cn ([10.88.99.176]) by mse-fl2.zte.com.cn with SMTP id 64U96MFO051181; Sat, 30 May 2026 17:06:22 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp04[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:06:24 +0800 (CST) X-Zmail-TransId: 2afb6a1aa89046c-19b79 X-Mailer: Zmail v1.0 Message-ID: <20260530170624912AfpqJafVKV2Xmz1EPfHH2@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:06:24 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDMvNl0gTUFJTlRBSU5FUlM6IGFkZCBteXNlbGYgYXMgcmV2aWV3ZXIgZm9yIHJtYXAgc2VjdGlvbg==?= X-MAIL: mse-fl2.zte.com.cn 64U96MFO051181 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.133 unknown Sat, 30 May 2026 17:06:29 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA895.000/4gSDqT52tsz5BNS0 
Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin To help review future changes related to rmap tracing and testing, add myself as a reviewer (R:) for the rmap entry, and also update the file patterns to include: - include/trace/events/rmap.h - tools/testing/rmap/rmap_benchmark.c Signed-off-by: Xu Xin --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8e7268d2f6ec..01cc34cc83a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17006,11 +17006,14 @@ R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn +R: Xu Xin L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: include/trace/events/rmap.h F: mm/page_vma_mapped.c F: mm/rmap.c +F: tools/testing/rmap/rmap_benchmark.c F: tools/testing/selftests/mm/rmap.c MEMORY MANAGEMENT - SECRETMEM --=20 2.25.1 From nobody Mon Jun 8 09:51:03 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D510A30E85C for ; Sat, 30 May 2026 09:07:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.35 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132038; cv=none; b=q9depzAAcB682JT9AnV9cGzUSa+tD6rTiE8qa+d6XA27SMZIVuG+/CxsJyJasRq4EaNDgIe2dAhcOQELPHCUeta/4cRXl57uiCxHvSKW0evVBdTRtQ3BCqSCDM77688+bFBrQ/0Yjzfb3QItYutDemFXNyEbQg1Di9/AQv/Tr0g= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132038; c=relaxed/simple; bh=al8X5naXJExnaDVynwWF5LVPfMA9TNOVBrbx0uSb1I8=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=KIqwshINHjaLkmRHrolhZQzehI0S/q/h1H9rm970lT3jBk/lOmLXiZ5IgFGLVmnLRucUcwN8A2l8+0BvQoRGJTtkLVOCvxoHx8+td/52v6PebcY0noyDdVAlS+nbIMPNWGfLn4KYjIjGLr3wPRzDcwH5v+VgsAnhB6GNA/zHGjo= ARC-Authentication-Results: i=1; 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.35 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gSDrM1dx3z8Xrr9; Sat, 30 May 2026 17:07:15 +0800 (CST) Received: from xaxapp05.zte.com.cn ([10.99.98.109]) by mse-fl2.zte.com.cn with SMTP id 64U9794S051514; Sat, 30 May 2026 17:07:09 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp02[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:07:11 +0800 (CST) X-Zmail-TransId: 2afa6a1aa8bfa14-28113 X-Mailer: Zmail v1.0 Message-ID: <20260530170711362Lxt422zijBMtOvPlJD-0-@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:07:11 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDQvNl0ga3NtOiBhZGQgcGdvZmYgaW50byBrc21fcm1hcF9pdGVt?= X-MAIL: mse-fl2.zte.com.cn 64U9794S051514 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.133 unknown Sat, 30 May 2026 17:07:15 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA8C3.000/4gSDrM1dx3z8Xrr9 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The reason for adding pgoff to ksm_rmap_item has been discussed in previous mailing list threads [1][2]. 
The main purpose is to allow the KSM reverse mapping to obtain the original page's linear page index, so that during anon_vma_tree traversal it can selectively locate the VMAs and avoid scanning the entire address space [0, ULONG_MAX]. To minimize the size impact of adding pgoff to ksm_rmap_item as much as possible, a trick that David suggested is to use a union that groups the members related to the unstable tree together with the newly added linear page index. The members that are valid only while the rmap_item is in the unstable tree are oldchecksum and the age information. However, the function should_skip_rmap_item() in the smart scanning needs a slight modification, since this function still uses the age information even when the rmap_item is in a stable state (the page is not KSM), a situation that occurs during COW faults. With the union, the size of the structure is still 64 bytes, with no increase. We store pgoff in the same way as rmap_item->anon_vma: it is set when the page is merged and becomes a KSM page in try_to_merge_with_ksm_page(), and it is reset in remove_rmap_item_from_tree(), remove_node_from_stable_tree() and break_cow(). To clarify specifically, the reason for resetting pgoff in break_cow() is: - When a page successfully becomes a KSM page (i.e., after stable_tree_append() sets STABLE_FLAG), both anon_vma and pgoff are stored and remain valid. - However, during the merging process there are several failure paths where a page that was temporarily treated as a KSM page must be reverted back to an anonymous page. Examples include: * The second call to try_to_merge_with_ksm_page() fails in try_to_merge_two_pages(). * stable_tree_insert() fails in cmp_and_merge_page(). In such cases, break_cow() is invoked to break the COW mapping and discard the KSM state. Currently, break_cow() already contains a put_anon_vma(rmap_item->anon_vma) to release the reference taken during the aborted merge.
Because 'pgoff' is logically paired with anon_vma (both are only meaningful when the rmap_item is in a stable state), it must also be cleared (or reset) in break_cow() to avoid leaving stale pgoff values that could confuse subsequent rmap walks or scanning logic. [1] https://lore.kernel.org/all/adTPQSb-qSSHviJN@lucifer/ [2] https://lore.kernel.org/all/202604091806051535BJWZ_FTtdIm3Snk24ei_@zte.= com.cn/ Suggested-by: David Hildenbrand (Arm) Signed-off-by: xu xin --- mm/ksm.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7d5b76478f0b..4761ca3fa984 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -195,22 +195,28 @@ struct ksm_stable_node { * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node - * @age: number of scan iterations since creation - * @remaining_skips: how many scans to skip + * @age: number of scan iterations since creation (unstable node) + * @remaining_skips: how many scans to skip (unstable node) + * @pgoff: pgoff into @anon_vma where the page is mapped (stable tree) */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { - struct anon_vma *anon_vma; /* when stable */ + struct anon_vma *anon_vma; /* for reverse mapping, when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ - unsigned int oldchecksum; /* when unstable */ - rmap_age_t age; - rmap_age_t remaining_skips; + union { + struct { + unsigned int oldchecksum; + rmap_age_t age; + rmap_age_t remaining_skips; + }; /* when unstable */ + unsigned long pgoff; /* for reverse mapping, when stable */ + }; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -776,6 +782,10 @@ static struct vm_area_struct 
*find_mergeable_vma(struc= t mm_struct *mm, return vma; } +/* + * break_cow: actively break the write-protect of the VMA. This is called = when + * rmap_item has not yet become stable, but page has been merged. + */ static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm =3D rmap_item->mm; @@ -787,6 +797,8 @@ static void break_cow(struct ksm_rmap_item *rmap_item) * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. (still unstabl= e) */ + rmap_item->pgoff =3D 0; mmap_read_lock(mm); vma =3D find_mergeable_vma(mm, addr); @@ -899,6 +911,8 @@ static void remove_node_from_stable_tree(struct ksm_sta= ble_node *stable_node) VM_BUG_ON(stable_node->rmap_hlist_len <=3D 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. */ + rmap_item->pgoff =3D 0; rmap_item->address &=3D PAGE_MASK; cond_resched(); } @@ -1052,6 +1066,8 @@ static void remove_rmap_item_from_tree(struct ksm_rma= p_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. */ + rmap_item->pgoff =3D 0; rmap_item->head =3D NULL; rmap_item->address &=3D PAGE_MASK; @@ -1598,8 +1614,15 @@ static int try_to_merge_with_ksm_page(struct ksm_rma= p_item *rmap_item, /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); - /* Must get reference to anon_vma while still holding mmap_lock */ + /* + * Must get reference to anon_vma while still holding mmap_lock, + * We set these two members of stable node here instead of + * stable_tree_append(), maybe because we don't want to hold + * mmap_read_lock again. Here mmap_read_lock is already held to + * find_mergeable_vma before merging. 
+ */ rmap_item->anon_vma =3D vma->anon_vma; + rmap_item->pgoff =3D linear_page_index(vma, rmap_item->address); get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); @@ -2458,6 +2481,10 @@ static bool should_skip_rmap_item(struct folio *foli= o, if (folio_test_ksm(folio)) return false; + /* There is no age information in stable-tree nodes. */ + if (rmap_item->address & STABLE_FLAG) + return false; + age =3D rmap_item->age; if (age !=3D U8_MAX) rmap_item->age++; --=20 2.25.1 From nobody Mon Jun 8 09:51:03 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7E4D435B63B for ; Sat, 30 May 2026 09:09:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132185; cv=none; b=lSLpTh2PnS39zwDY9KxrXlgGOoMpyKXN88O4M2owqYVBcJn7uGrwlxkCcOWYW8ppIqnmaZAQEauS2ZQdrojxn2rW6WwAQOzceGrDNty5Goqm28unLJycO3Mp5wAwZkVJTi9WI7yfM5d8KAK9h/XLFbkgJAL4lzEQRhx7NpSmjUg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132185; c=relaxed/simple; bh=6JiZYS6Nw1zIsBjcNNHGo2UasrnXCLSghyY8mrWBi/w=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=m5wOMr44V0lve1tAYprFc7Hol44lGKgpnA2kxnSyjMbc8Qa3uQzfpX2GgHC0Du6hUyGql+6GthadVd59JPoHPjI9doVI/kS4VSd9JxIcwAns1ezP/f4lUtk/TXbyCadUOFGeGM6oqXZzey6iIiHeQqPQS6RKMkzcj+Ho6RI8jLM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown 
[10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gSDv94Rk8z5BNRf; Sat, 30 May 2026 17:09:41 +0800 (CST) Received: from xaxapp02.zte.com.cn ([10.88.97.241]) by mse-fl1.zte.com.cn with SMTP id 64U99Xrc036532; Sat, 30 May 2026 17:09:33 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp02[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:09:34 +0800 (CST) X-Zmail-TransId: 2afa6a1aa94ec63-2a360 X-Mailer: Zmail v1.0 Message-ID: <20260530170934343YM7yiZrS2RbZYpLrAJWHN@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:09:34 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDUvNl0ga3NtOiBPcHRpbWl6ZSBybWFwX3dhbGtfa3NtIGJ5IHBhc3NpbmcgYSBzdWl0YWJsZSBwZ29mZg==?= Content-Type: text/plain; charset="utf-8" X-MAIL: mse-fl1.zte.com.cn 64U99Xrc036532 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Sat, 30 May 2026 17:09:41 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA955.000/4gSDv94Rk8z5BNRf Content-Transfer-Encoding: quoted-printable From: xu xin User impact / Why this matters to Linux users =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D When a system runs with KSM enabled and memory becomes tight, KSM pages may be swapped out or migrated. The kernel then performs a reverse map walk by rmap_walk_ksm to locate all page table entries that reference these pages. 
If a large number of unrelated VMAs are attached to the single anon_vma associated with this KSM page, then rmap_walk can become a severe performance bottleneck. In our embedded test environment, we observed ~20,000 VMAs sharing one anon_vma without any fork – purely from VMA splits – which causes rmap_walk_ksm to take 200~700 ms. When one of those VMAs maps a KSM page, the reverse mapping walk of that KSM page becomes a bottleneck because it holds the anon_vma lock for a long time. The anon_vma lock is not only used by KSM; it is a core lock protecting the VMA interval tree and is acquired by many critical memory operations: • Page faults: do_anonymous_page(), do_wp_page() (especially during COW) • Memory reclaim: try_to_unmap() • Page migration & compaction: migrate_pages(), compact_zone() • mlock / munlock: mlock_fixup() • Process exit: exit_mmap() (tearing down VMAs) • Cgroup memory accounting: mem_cgroup_move_charge() If one thread holds the anon_vma lock for hundreds of milliseconds because of an inefficient KSM rmap walk, any other thread that tries to acquire the same lock (e.g., an application taking a page fault, kswapd reclaiming pages, or a migration thread) will block. This leads to stalled application threads, increased latency spikes, and in extreme cases container timeouts or watchdog triggers. This patch reduces the worst-case anon_vma lock hold time during KSM rmap walk from >500 ms to less than 2 ms, thereby almost eliminating this source of lock contention and improving system responsiveness under memory pressure. Real-world examples: ==================== - JVM / Go runtime: These use mmap for heap regions and later call mprotect(PROT_NONE) for garbage collection barriers or guard pages, splitting the original VMA into thousands of small pieces over time.
- Database engines (MySQL, PostgreSQL): Large shared memory buffers or anonymous mappings are managed with madvise(MADV_DONTNEED) to release specific pages, which also splits VMAs. * Why the benchmark numbers are realistic: We observed ~20,000 VMAs sharing one anon_vma on a production system running a Java application with KSM enabled. The lock hold time before the patch was measured at 228 ms (max) during rmap walks triggered by memory compaction and page migration. The benchmark reproduces that VMA count and lock-hold behavior in a controlled environment. Root Cause ========== Through local trace analysis, we found that most of the latency of rmap_walk_ksm occurs within anon_vma_interval_tree_foreach, leading to an excessively long hold time on the anon_vma lock (even reaching 500 ms or more), which in turn causes upper-layer applications (waiting for the anon_vma lock) to be blocked for extended periods. Further investigation revealed that 99.9% of iterations inside the anon_vma_interval_tree_foreach loop are skipped due to the first check "if (addr < vma->vm_start || addr >= vma->vm_end)", indicating that a large number of loop iterations are ineffective. This inefficiency arises because the pgoff_start and pgoff_end parameters passed to anon_vma_interval_tree_foreach span the entire address space from 0 to ULONG_MAX, resulting in very poor loop efficiency. Solution ======== We cannot rely solely on anon_vma to locate all PTEs mapping this page; we also need the original page's pgoff. This follows from how anon_vma_interval_tree_foreach is implemented: it essentially iterates to find a suitable VMA such that the provided pgoff falls within the candidate's vm_pgoff range:
vm_pgoff <= pgoff (original linear page offset) <= (vm_pgoff + vma_pages(vma) - 1) Fortunately, the previous patch in this series already added pgoff to ksm_rmap_item, so we use it to narrow the search. Test results ============ We provide an rmap benchmark: tools/testing/rmap/rmap_benchmark.c The test results in QEMU are as follows: KSM rmapping Maximum duration Average duration Before: 705.12 ms (705119858 ns) 532.04 ms (532041586 ns) After: 1.67 ms (1665917 ns) 1.44 ms (1443784 ns) Co-developed-by: Wang Yaxin Signed-off-by: Wang Yaxin Signed-off-by: xu xin --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 4761ca3fa984..7fe1a8753309 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3200,6 +3200,7 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_w= alk_control *rwc) hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { /* Ignore the stable/unstable/sqnr flags */ const unsigned long addr =3D rmap_item->address & PAGE_MASK; + const unsigned long pgoff =3D rmap_item->pgoff; struct anon_vma *anon_vma =3D rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -3213,8 +3214,12 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_= walk_control *rwc) anon_vma_lock_read(anon_vma); } + /* + * Currently KSM folios are order-0 normal pages, so pgoff_end + * should be the same as pgoff_start. 
+ */ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { + pgoff, pgoff) { cond_resched(); vma =3D vmac->vma; --=20 2.25.1 From nobody Mon Jun 8 09:51:03 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [183.62.165.209]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB302396562 for ; Sat, 30 May 2026 09:10:28 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=183.62.165.209 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132230; cv=none; b=VXGEylbw6/Y/9+4+P/US87FvpAQ20TRRpGTW2lSsTQMPZrSI2Yyc26oeTbmYA415J74XDVrOj/lac/4xMStUZZRSmRHTuCg+K0B9o8Rg9UogGb7Nph+oO0Osyz0OaVbEBv4vywHpbKu6wvXQKBkAlMpVCos5Yld1Ml0z/juvtg4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1780132230; c=relaxed/simple; bh=j2ZUFVF4rEaKncu3OJqXdsl8N2BoXApBdZVsaVkoGyI=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=QS0c37CQucuGELYNBGwrMN18gMkfT4/7Gf32Dda925Amg0T3s3X8ZrBbQ8LbfnvNCCHvoNEy0hTE0D7ali7n+ePp+bfq4petYl9fFNIrlugdQTaajPMztOXCODjrIZ7NhR527M5MIOoCzCzKm0Cf/L/XLwxdEZ9v2MZtnAuyRys= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=183.62.165.209 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gSDw00J9Xz52RpX; Sat, 30 May 2026 17:10:24 +0800 (CST) Received: from xaxapp02.zte.com.cn 
([10.88.97.241]) by mse-fl2.zte.com.cn with SMTP id 64U9AJOT052872; Sat, 30 May 2026 17:10:19 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp01[null]) by mapi (Zmail) with MAPI id mid32; Sat, 30 May 2026 17:10:21 +0800 (CST) X-Zmail-TransId: 2af96a1aa97d777-20e7d X-Mailer: Zmail v1.0 Message-ID: <20260530171021329670dwn6uVqrmEL6L1A8ib@zte.com.cn> In-Reply-To: <20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn> References: 20260530165907829ZSDzDdMc110MnOflRzf9P@zte.com.cn Date: Sat, 30 May 2026 17:10:21 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY3IDYvNl0ga3NtOiBhZGQgbXJlbWFwIHNlbGZ0ZXN0cyBmb3Iga3NtX3JtYXBfd2Fsaw==?= X-MAIL: mse-fl2.zte.com.cn 64U9AJOT052872 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.133 unknown Sat, 30 May 2026 17:10:24 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A1AA980.000/4gSDw00J9Xz52RpX Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The existing tools/testing/selftests/mm/rmap.c already has one test case for ksm_rmap_walk, TEST_F(migrate, ksm), which makes use of migrating a page from one NUMA node to another NUMA node. However, it lacks the scenario of mremapped VMAs. We add a call to mremap() and then trigger KSM to merge pages before migrating, specifically to test the optimization introduced by an earlier patch in this series ("ksm: Optimize rmap_walk_ksm by passing a suitable pgoff"). 
This test can reproduce the issue that Hugh points out at https://lore.kernel.org/all/02e1b8df-d568-8cbb-b8f6-46d5476d9d75@google.com/ Signed-off-by: xu xin --- tools/testing/selftests/mm/rmap.c | 97 ++++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 47 ++++++++++++++ tools/testing/selftests/mm/vm_util.h | 2 + 3 files changed, 146 insertions(+) diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm= /rmap.c index 53f2058b0ef2..e548d00762b3 100644 --- a/tools/testing/selftests/mm/rmap.c +++ b/tools/testing/selftests/mm/rmap.c @@ -430,4 +430,101 @@ TEST_F(migrate, ksm) propagate_children(_metadata, data); } +static void prepare_pages(struct global_data *data, int nr_pages) +{ + /* Allocate exactly pages for the test */ + data->mapsize =3D nr_pages * getpagesize(); + data->region =3D mmap(NULL, data->mapsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (data->region =3D=3D MAP_FAILED) + ksft_exit_fail_perror("mmap failed"); + + /* Fill all pages with identical content to encourage KSM merging */ + memset(data->region, 0x77, data->mapsize); +} + +static int mremap_merge_and_migrate(struct global_data *data) +{ + int ret; + void *old_region; + void *new_region; + int nr_pages =3D 32; + long base_shared, base_sharing; + long shared, sharing; + + /* Take baseline before creating our pages */ + base_shared =3D ksm_get_pages_shared(); + base_sharing =3D ksm_get_pages_sharing(); + if (base_shared < 0 || base_sharing < 0) + return FAIL_ON_CHECK; + + prepare_pages(data, nr_pages); + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + old_region =3D data->region; + /* + * Mremap the second half region to the first half location (FIXED). 
+ */ + new_region =3D mremap(old_region + data->mapsize / 2, data->mapsize / 2, + data->mapsize / 2, MREMAP_MAYMOVE | MREMAP_FIXED, + old_region); + if (new_region =3D=3D MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + return FAIL_ON_CHECK; + } + data->region =3D new_region; + data->mapsize /=3D 2; /* mapping is now half of original */ + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + /* Attempt to migrate the merged KSM page */ + ret =3D try_to_move_page(data->region); + if (ret !=3D 0) { + ksft_print_msg("migration of KSM page after mremap failed\n"); + return FAIL_ON_CHECK; + } + + /* Ensure ksmd scan two turns at least to update ksm counters */ + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + shared =3D ksm_get_pages_shared(); + sharing =3D ksm_get_pages_sharing(); + if (shared < 0 || sharing < 0) + return FAIL_ON_CHECK; + + if (shared - base_shared !=3D 1 || + sharing - base_sharing !=3D nr_pages / 2 - 1) { + ksft_print_msg("Unexpected KSM counters: shared delta=3D%ld, sharing del= ta=3D%ld\n", + shared - base_shared, sharing - base_sharing); + return FAIL_ON_CHECK; + } + + return 0; +} + + +TEST_F(migrate, ksm_and_mremap) +{ + struct global_data *data =3D &self->data; + int ret; + + /* Skip if KSM is not available */ + if (ksm_stop() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/run\" failed"); + if (ksm_get_full_scans() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/full_scan\" failed"); + + ret =3D prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno =3D=3D EINVAL) + SKIP(return, "PR_SET_MEMORY_MERGE not supported"); + else if (ret) + ksft_exit_fail_perror("PR_SET_MEMORY_MERGE=3D1 failed"); + + ASSERT_EQ(mremap_merge_and_migrate(data), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests= /mm/vm_util.c index db94564f4431..bfa88937485e 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -648,6 +648,53 @@ long 
ksm_get_self_merging_pages(void) return strtol(buf, NULL, 10); } +long ksm_get_pages_shared(void) +{ + int fd; + char buf[10]; + ssize_t ret; + int saved_errno; + + fd =3D open("/sys/kernel/mm/ksm/pages_shared", O_RDONLY); + if (fd < 0) + return -errno; + + ret =3D pread(fd, buf, sizeof(buf) - 1, 0); + saved_errno =3D errno; + close(fd); + if (ret <=3D 0) { + if (ret =3D=3D 0) + return -ENODATA; /* unexpected EOF */ + return -saved_errno; + } + buf[ret] =3D 0; + return strtol(buf, NULL, 10); +} + +long ksm_get_pages_sharing(void) +{ + int fd; + char buf[10]; + ssize_t ret; + int saved_errno; + + fd =3D open("/sys/kernel/mm/ksm/pages_sharing", O_RDONLY); + if (fd < 0) + return -errno; + + ret =3D pread(fd, buf, sizeof(buf) - 1, 0); + saved_errno =3D errno; + close(fd); + if (ret <=3D 0) { + if (ret =3D=3D 0) + return -ENODATA; + return -saved_errno; + } + buf[ret] =3D 0; + return strtol(buf, NULL, 10); +} + + long ksm_get_full_scans(void) { int ksm_full_scans_fd; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests= /mm/vm_util.h index 1a07305ceff4..3b40727c3f1f 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -151,6 +151,8 @@ void *sys_mremap(void *old_address, unsigned long old_s= ize, long ksm_get_self_zero_pages(void); long ksm_get_self_merging_pages(void); +long ksm_get_pages_shared(void); +long ksm_get_pages_sharing(void); long ksm_get_full_scans(void); int ksm_use_zero_pages(void); int ksm_start(void); --=20 2.25.1