From nobody Mon May 25 02:57:20 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 430AB27E049 for ; Tue, 19 May 2026 14:09:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.35 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199768; cv=none; b=a9hgQGgFK2ygw62TqVzPoG9X799DwhHrWuxMZQgmM3MXG/KDf+peW0k+0ckP/lYd3H7D2ivuzXRaNMl2LklABPdsqPA5EniftJKenloOlk3Ln0C7teZl716hbkmOFwaYWiSdmYple8SiKxvA00damdd2/aRoCuwoUP4mNXK11TI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199768; c=relaxed/simple; bh=b0b4bgurnFDiklxzNbq/L+LkWtXi27qpk9tZG7NfXmE=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=ePoFERCaqXjWUOyX4VpXWckJKSw8TyDncfmvqKYbuuPkTfr4BiSLhRLr6l84v7FJS2XwWKunr8UdaMqpCec4MBlJGTABp3HlMO7BMcweqemMMYjIGmj7vqhRIWvI6f2kSFj4FSgsM1nqqt7MPcy5tg2HEQl8uKL4wHZ3qd3j0OU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.35 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gKc3w6sNvz8Xrkn; Tue, 19 May 2026 22:09:16 +0800 (CST) Received: from xaxapp02.zte.com.cn ([10.88.97.241]) by mse-fl1.zte.com.cn with SMTP id 64JE97kl056524; Tue, 19 May 2026 22:09:07 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from 
mapi (xaxapp05[null]) by mapi (Zmail) with MAPI id mid32; Tue, 19 May 2026 22:09:09 +0800 (CST) X-Zmail-TransId: 2afc6a0c6f055d7-efb5a X-Mailer: Zmail v1.0 Message-ID: <20260519220909868loJhTdEdfN7gGPtr_wH-y@zte.com.cn> In-Reply-To: <20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn> References: 20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn Date: Tue, 19 May 2026 22:09:09 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY1IDEvNV0gbW0vcm1hcDogYWRkIHRyYWNlcG9pbnQgZm9yIHJtYXBfd2Fsaw==?= X-MAIL: mse-fl1.zte.com.cn 64JE97kl056524 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Tue, 19 May 2026 22:09:16 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0C6F0C.000/4gKc3w6sNvz8Xrkn Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin Add trace_rmap_walk_start() and trace_rmap_walk_end() to bracket reverse mapping walks. Unlike manual clock sampling, these tracepoints record no timestamp; latency can be computed offline by tools (e.g., perf, trace-cmd) using the event timestamps. When tracepoints are disabled, the only cost is a static branch check (no clock read, no duration calculation), making them suitable for production use. The information (folio type, locked state) helps diagnose performance issues in KSM, anonymous, and file-backed rmap walks. '# cat /sys/kernel/tracing/trace '# tracer: nop '# '# entries-in-buffer/entries-written: 408/408 #P:4 '# '# _-----=3D> irqs-off/BH-disabled '# / _----=3D> need-resched '# | / _---=3D> hardirq/softirq '# || / _--=3D> preempt-depth '# ||| / _-=3D> migrate-disable '# |||| / delay '#TASK-PID CPU# ||||| TIMESTAMP FUNCTION '# | | | ||||| | | rmap-215 [002] ..... 
163.530676: rmap_walk_start: folio=3D00000000ae4= eb941 rwc=3D0000000030d943e6 page_type=3Dksm locked=3Dfalse rmap-215 [002] ..... 163.532446: rmap_walk_end: folio=3D00000000ae4eb= 941 rwc=3D0000000030d943e6 page_type=3Dksm locked=3Dfalse rmap-215 [002] ..... 163.533201: rmap_walk_start: folio=3D00000000ae4= eb941 rwc=3D000000003c9e8513 page_type=3Dksm locked=3Dfalse rmap-215 [002] ..... 163.533597: rmap_walk_end: folio=3D00000000ae4eb= 941 rwc=3D000000003c9e8513 page_type=3Dksm locked=3Dfalse rmap-215 [002] ..... 163.533899: rmap_walk_start: folio=3D00000000ae4= eb941 rwc=3D0000000030d943e6 page_type=3Dksm locked=3Dfalse rmap-215 [002] ..... 163.534053: rmap_walk_end: folio=3D00000000ae4eb= 941 rwc=3D0000000030d943e6 page_type=3Dksm locked=3Dfalse Signed-off-by: xu xin --- include/trace/events/rmap.h | 73 +++++++++++++++++++++++++++++++++++++ mm/rmap.c | 9 +++++ 2 files changed, 82 insertions(+) create mode 100644 include/trace/events/rmap.h diff --git a/include/trace/events/rmap.h b/include/trace/events/rmap.h new file mode 100644 index 000000000000..275e77849698 --- /dev/null +++ b/include/trace/events/rmap.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmap + +#if !defined(_TRACE_RMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMAP_H + +#include +#include + +#define GET_RMAP_PAGE_TYPE(folio) (folio_test_ksm(folio) ? "ksm" : \ + (folio_test_anon(folio) ? 
"anon" : "file")) + +TRACE_EVENT(rmap_walk_start, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked), + + TP_STRUCT__entry( + __field(unsigned long, folio_addr) + __field(unsigned long, rwc_addr) + __string(page_type, GET_RMAP_PAGE_TYPE(folio)) + __field(bool, locked) + ), + + TP_fast_assign( + __entry->folio_addr =3D (unsigned long)folio; + __entry->rwc_addr =3D (unsigned long)rwc; + __assign_str(page_type); + __entry->locked =3D locked; + ), + + TP_printk("folio=3D%p rwc=3D%p page_type=3D%s locked=3D%s", + (void *)(unsigned long)__entry->folio_addr, + (void *)(unsigned long)__entry->rwc_addr, + __get_str(page_type), + __entry->locked ? "true" : "false") +); + +TRACE_EVENT(rmap_walk_end, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked), + + TP_STRUCT__entry( + __field(unsigned long, folio_addr) + __field(unsigned long, rwc_addr) + __string(page_type, GET_RMAP_PAGE_TYPE(folio)) + __field(bool, locked) + ), + + TP_fast_assign( + __entry->folio_addr =3D (unsigned long)folio; + __entry->rwc_addr =3D (unsigned long)rwc; + __assign_str(page_type); + __entry->locked =3D locked; + ), + + TP_printk("folio=3D%p rwc=3D%p page_type=3D%s locked=3D%s", + (void *)(unsigned long)__entry->folio_addr, + (void *)(unsigned long)__entry->rwc_addr, + __get_str(page_type), + __entry->locked ? 
"true" : "false") +); + + +#endif /* _TRACE_RMAP_H */ + +/* This part must be outside protection */ +#include + diff --git a/mm/rmap.c b/mm/rmap.c index 78b7fb5f367c..52f795f768e1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,6 +80,7 @@ #define CREATE_TRACE_POINTS #include +#include #include "internal.h" #include "swap.h" @@ -3098,23 +3099,31 @@ static void rmap_walk_file(struct folio *folio, void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, false); + if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); + + trace_rmap_walk_end(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, true); + /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); + + trace_rmap_walk_end(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE --=20 2.25.1 From nobody Mon May 25 02:57:20 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 071B34028F3 for ; Tue, 19 May 2026 14:11:32 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.35 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199895; cv=none; b=CJfG2i5F+XaDJcYU3iI3BlihaaSDpGC1My0paGMEk1zaS9cJAFtPBNwsrxyoFDuUv06bZpXRdAHX2I5OE+sgGH2eb533OGaK63WCYdTnOaxw8Fl0dh24DcIyFJflnDAQI3d1zuHXKynyJPp+VKonqdpVNYEjcP4N3pRHYGUS6oM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199895; c=relaxed/simple; 
bh=9GYQ74gli+8jGO3cWRdfmww07J9fIu2Wr33A/XwHvm0=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=L8w+Wov2B6QcEo7ylps40GhDz+l4vS82mIWpBRLsDcKqfFaHfoMBhlYPZt7cLQLycOny+TS0Zpbv8yQbhLbpfwriU3KOS+s6NhMNGvcEV53K93VOmURGCiUAK5ihUEMKbAWnOguQ8q+pljMDQpVu8u6PSwq71h1PPqS4W4Bk5I4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.35 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gKc6V6rGjz8Xrrc; Tue, 19 May 2026 22:11:30 +0800 (CST) Received: from xaxapp01.zte.com.cn ([10.88.99.176]) by mse-fl2.zte.com.cn with SMTP id 64JEBLBK098928; Tue, 19 May 2026 22:11:21 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp01[null]) by mapi (Zmail) with MAPI id mid32; Tue, 19 May 2026 22:11:24 +0800 (CST) X-Zmail-TransId: 2af96a0c6f8c669-fabe1 X-Mailer: Zmail v1.0 Message-ID: <20260519221124806bCFNrWzKpCZlGtL4guMKZ@zte.com.cn> In-Reply-To: <20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn> References: 20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn Date: Tue, 19 May 2026 22:11:24 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY1IDIvNV0gdG9vbHMvdGVzdGluZzogYWRkIHJtYXAgd2FsayBsYXRlbmN5IGJlbmNobWFyaw==?= X-MAIL: mse-fl2.zte.com.cn 64JEBLBK098928 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.133 unknown Tue, 19 
May 2026 22:11:30 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0C6F92.001/4gKc6V6rGjz8Xrrc Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin Add a new benchmark that measures rmap_walk latency under controlled conditions. The test creates a large region (20,000 pages by default), optionally splits the VMA into many small VMAs by mprotect(PROT_READ) on every other page, then triggers rmap_walk via move_pages(). The existing rmap_walk tracepoints (events/rmap/rmap_walk_start and events/rmap/rmap_walk_end) are used to collect duration for events with page_type=3Dksm, page_type=3Danon, and page_type=3Dfile. Three separate test cases are run: - KSM pages: allocate an anonymous region, fill with identical data, mark MADV_MERGEABLE, wait for KSM to merge all pages (by polling /sys/kernel/mm/ksm/full_scans), then trigger migration. - Anonymous pages: similar but without KSM merging. - File pages: mmap a temporary file with shared mapping and fill with identical data. For each test, the program prints the number of captured events and the maximum / average latency in milliseconds. This benchmark helps developers evaluate optimizations in the reverse mapping code, such as limiting max_page_sharing or improving tree traversal efficiency. Usage (must be run as root): cd tools/testing/rmap/ && make sudo ./rmap_bench =3D=3D=3D Testing KSM pages =3D=3D=3D Triggering rmap_walk via move_pages... KSM rmap_walk latency: Maximum duration: 705.12 ms (705119 us) Average duration: 532.04 ms (532041 us) Count: 4 events =3D=3D=3D Testing anonymous pages =3D=3D=3D Triggering rmap_walk via move_pages... Anonymous page rmap_walk latency: Maximum duration: 0.07 ms (69 us) Average duration: 0.05 ms (48 us) Count: 2 events =3D=3D=3D Testing file pages =3D=3D=3D Triggering rmap_walk via move_pages... 
File page rmap_walk latency: Maximum duration: 0.07 ms (67 us) Average duration: 0.03 ms (30 us) Count: 4 events By the way, update the section of REVERSE MAPPING in MAINTAINERS. Signed-off-by: xu xin --- MAINTAINERS | 3 + tools/testing/rmap/Makefile | 11 + tools/testing/rmap/rmap_benchmark.c | 529 ++++++++++++++++++++++++++++ 3 files changed, 543 insertions(+) create mode 100644 tools/testing/rmap/Makefile create mode 100644 tools/testing/rmap/rmap_benchmark.c diff --git a/MAINTAINERS b/MAINTAINERS index 8e7268d2f6ec..01cc34cc83a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17006,11 +17006,14 @@ R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn +R: Xu Xin L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: include/trace/events/rmap.h F: mm/page_vma_mapped.c F: mm/rmap.c +F: tools/testing/rmap/rmap_benchmark.c F: tools/testing/selftests/mm/rmap.c MEMORY MANAGEMENT - SECRETMEM diff --git a/tools/testing/rmap/Makefile b/tools/testing/rmap/Makefile new file mode 100644 index 000000000000..200bd364cafb --- /dev/null +++ b/tools/testing/rmap/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +CC :=3D $(CROSS_COMPILE)gcc + +PROGS :=3D rmap_benchmark + +all: $(PROGS) + +rmap_benchmark: LDLIBS =3D -lnuma + +clean: + rm -fr $(PROGS) diff --git a/tools/testing/rmap/rmap_benchmark.c b/tools/testing/rmap/rmap_= benchmark.c new file mode 100644 index 000000000000..77dac93c794d --- /dev/null +++ b/tools/testing/rmap/rmap_benchmark.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Reverse mapping latency test for KSM, anonymous and file pages + * + * This program creates a large number of pages (KSM merged, normal anonym= ous, + * or file mapped), splits the VMA into many small VMAs via mprotect, + * triggers rmap_walk by move_pages(), and collects latency data from the + * tracepoints 'rmap_walk_start' and 'rmap_walk_end' (offline timestamp di= ff). + * + * Usage: must be run as root (to access tracefs and KSM sysfs). 
+ * + * Copyright 2026, ZTE Corp. + * + * Author(s): Xu Xin + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Page size and test parameters */ +int page_size; +#define NR_PAGES 20000 /* Number of virtual pages */ +#define TEST_PATTERN 0xaa + +/* KSM sysfs paths */ +#define KSM_RUN_PATH "/sys/kernel/mm/ksm/run" +#define KSM_SLEEP_MS_PATH "/sys/kernel/mm/ksm/sleep_millisecs" +#define KSM_PAGES_TO_SCAN "/sys/kernel/mm/ksm/pages_to_scan" +#define KSM_FULL_SCANS_PATH "/sys/kernel/mm/ksm/full_scans" + +/* Tracepoint control paths - enable all events under rmap */ +#define TRACE_ENABLE "/sys/kernel/tracing/events/rmap/enable" +#define TRACE_FILE "/sys/kernel/tracing/trace" + +/* + * Page types for rmap_walk tracepoint filtering + */ +enum page_type { + PAGE_TYPE_KSM, + PAGE_TYPE_ANON, + PAGE_TYPE_FILE, +}; + +static const char *page_type_str(enum page_type type) +{ + switch (type) { + case PAGE_TYPE_KSM: return "ksm"; + case PAGE_TYPE_ANON: return "anon"; + case PAGE_TYPE_FILE: return "file"; + default: return "unknown"; + } +} + +/* + * Write a string to a sysfs file. + */ +static int write_sys(const char *path, const char *value) +{ + int fd; + ssize_t ret; + + fd =3D open(path, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open %s failed: %s\n", path, strerror(errno)); + return -1; + } + ret =3D write(fd, value, strlen(value)); + close(fd); + if (ret !=3D (ssize_t)strlen(value)) { + fprintf(stderr, "write %s failed: %s\n", path, strerror(errno)); + return -1; + } + return 0; +} + +/* + * Read an integer from a sysfs file. 
+ */ +static int read_sys_int(const char *path) +{ + FILE *fp; + int val; + + fp =3D fopen(path, "r"); + if (!fp) { + fprintf(stderr, "fopen %s failed: %s\n", path, strerror(errno)); + return -1; + } + if (fscanf(fp, "%d", &val) !=3D 1) { + fprintf(stderr, "fscanf %s failed\n", path); + fclose(fp); + return -1; + } + fclose(fp); + return val; +} + +/* + * Get KSM full scan count. + */ +static int ksm_get_full_scans(void) +{ + return read_sys_int(KSM_FULL_SCANS_PATH); +} + +/* + * Wait for KSM to complete at least two full scans, which ensures that + * merging has had a chance to happen. + */ +static void wait_ksm_merge(void) +{ + int start_scans, end_scans; + int max_wait =3D 60; + int waited =3D 0; + + start_scans =3D ksm_get_full_scans(); + if (start_scans < 0) { + fprintf(stderr, "Failed to read initial full_scans\n"); + return; + } + + /* Make sure KSM is running */ + if (write_sys(KSM_RUN_PATH, "1") < 0) { + fprintf(stderr, "Failed to start KSM\n"); + return; + } + + do { + sleep(1); + end_scans =3D ksm_get_full_scans(); + if (end_scans < 0) { + fprintf(stderr, "Failed to read full_scans\n"); + return; + } + waited++; + if (waited > max_wait) { + fprintf(stderr, "Warning: KSM full_scans not increased " + "after %d seconds\n", max_wait); + break; + } + } while (end_scans < start_scans + 2); +} + +/* + * Enable the rmap tracepoints and clear the trace buffer. 
+ */ +static void enable_tracepoint(void) +{ + int fd; + struct stat st; + + /* Check if tracefs is already accessible */ + if (stat("/sys/kernel/tracing/trace", &st) !=3D 0) { + /* Try to mount tracefs */ + if (mount("tracefs", "/sys/kernel/tracing", "tracefs", 0, NULL) !=3D 0) { + fprintf(stderr, "Warning: Failed to mount tracefs: %s\n", + strerror(errno)); + /* Continue anyway, maybe it's already mounted elsewhere */ + } + } + + if (write_sys(TRACE_ENABLE, "1") < 0) + exit(1); + /* Truncate the trace file to clear old data */ + fd =3D open(TRACE_FILE, O_WRONLY | O_TRUNC); + if (fd < 0) { + perror("open " TRACE_FILE); + exit(1); + } + close(fd); +} + +/* + * Disable the rmap tracepoints. + */ +static void disable_tracepoint(void) +{ + write_sys(TRACE_ENABLE, "0"); +} + +/* + * Extract timestamp from a trace line (the 5th field, before colon). + * Returns timestamp in microseconds. + */ +static unsigned long long extract_timestamp_us(const char *line) +{ + char time_str[32]; + double ts_sec =3D 0.0; + + /* %*s skip first field (task-name), second field (PID in brackets? Actua= lly "[CPU]"? + * The format is: - [CPU] FLAGS TIMESTAMP: ... + * Example: "rmap_benchmark_-467 [003] ..... 17258.057760: rmap_walk_= start: ..." + * Split by whitespace: + * 0: "rmap_benchmark_-467" + * 1: "[003]" + * 2: "....." + * 3: "17258.057760:" <-- timestamp with colon + * Therefore timestamp is the 4th field (index 3). But careful: there mig= ht be extra spaces. + * So we need to skip three fields: %*s %*s %*s, then read the timestamp = string. + */ + if (sscanf(line, "%*s %*s %*s %31s", time_str) =3D=3D 1) { + /* Remove trailing colon if present */ + char *colon =3D strchr(time_str, ':'); + + if (colon) + *colon =3D '\0'; + ts_sec =3D strtod(time_str, NULL); + } + return (unsigned long long)(ts_sec * 1e6); +} + +/* + * Parse the trace file and compute latency from start/end events for a gi= ven page_type. + * Returns 0 on success, -1 if no matching events. 
+ */ +static int parse_trace_and_print(enum page_type type, unsigned long long *= max_us, + unsigned long long *avg_us, int *count) +{ + FILE *fp; + char line[1024]; + unsigned long long start_ts =3D 0; + unsigned long long delta; + unsigned long long sum =3D 0, max_val =3D 0; + int pairs =3D 0; + const char *type_str =3D page_type_str(type); + char type_pattern[64]; + + snprintf(type_pattern, sizeof(type_pattern), "page_type=3D%s", type_str); + + fp =3D fopen(TRACE_FILE, "r"); + if (!fp) { + perror("fopen " TRACE_FILE); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + /* Check if line contains our page_type */ + if (!strstr(line, type_pattern)) + continue; + + if (strstr(line, "rmap_walk_start:")) { + start_ts =3D extract_timestamp_us(line); + } else if (strstr(line, "rmap_walk_end:")) { + if (start_ts) { + unsigned long long end_ts =3D extract_timestamp_us(line); + + if (end_ts > start_ts) { + delta =3D end_ts - start_ts; + sum +=3D delta; + if (delta > max_val) + max_val =3D delta; + pairs++; + } + start_ts =3D 0; /* reset after pairing */ + } + } + } + fclose(fp); + + if (pairs =3D=3D 0) { + printf("No rmap_walk events with page_type=3D%s found.\n", type_str); + return -1; + } + + *max_us =3D max_val; + *avg_us =3D sum / pairs; + *count =3D pairs; + return 0; +} + +/* + * Trigger rmap_walk by moving a single page. + * region: pointer to the page (any page in the mapped region). + * The function will try to move that page to a different NUMA node. 
+ */ +static void trigger_rmap_walk(void *region) +{ + int ret, status, cur_node, target_node; + void *pages[1]; + int nodes[1]; + + printf("Triggering rmap_walk via move_pages...\n"); + + ret =3D move_pages(0, 1, (void **)®ion, NULL, &status, MPOL_MF_MOVE_AL= L); + if (ret !=3D 0) { + perror("Failed to get original numa"); + exit(1); + } + cur_node =3D status; + + for (target_node =3D 0; target_node <=3D numa_max_node(); target_node++) { + if (numa_bitmask_isbitset(numa_all_nodes_ptr, target_node) && + target_node !=3D cur_node) + break; + } + if (target_node > numa_max_node()) { + printf("Couldn't find available numa node for testing\n"); + exit(1); + } + + pages[0] =3D region; + nodes[0] =3D target_node; + + /* + * Note: We ignore the return value when ret >=3D 0, since there's probab= ility + * that ksmd's ksm_get_folio collides with do_move_page(), which cause + * __migrate_folio failed due to the check "folio_ref_count(src) !=3D + * expected_count". + */ + ret =3D move_pages(0, 1, pages, nodes, &status, MPOL_MF_MOVE_ALL); + if (ret < 0) + perror("move_pages"); +} + +/* + * Split a VMA into many small VMAs by changing protection on every other = page. + * This increases the number of anon_vma_chain entries and makes rmap_walk= slower. + */ +static void split_vma_with_mprotect(void *addr, size_t size) +{ + for (size_t i =3D 0; i < size / page_size; i++) { + if (i % 2 =3D=3D 0) { + if (mprotect(addr + i * page_size, page_size, PROT_READ) < 0) { + if (errno !=3D EACCES) + perror("mprotect"); + } + } + } +} + +/* + * Test for KSM pages. 
+ */ +static void test_ksm(void) +{ + void *region; + size_t size =3D NR_PAGES * page_size; + unsigned long long max_us, avg_us; + int count; + + printf("\n=3D=3D=3D Testing KSM pages =3D=3D=3D\n"); + + /* Stop KSM and set aggressive scan parameters */ + if (write_sys(KSM_RUN_PATH, "2") < 0) { + printf("Warning: CONFIG_KSM might be not build, skip testing.\n"); + return; + } + if (write_sys(KSM_SLEEP_MS_PATH, "0") < 0 || + write_sys(KSM_PAGES_TO_SCAN, "10000") < 0) + exit(1); + + region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap for KSM"); + exit(1); + } + memset(region, TEST_PATTERN, size); + + if (madvise(region, size, MADV_MERGEABLE) !=3D 0) { + perror("madvise MADV_MERGEABLE"); + munmap(region, size); + exit(1); + } + + /* Start KSM scanner */ + if (write_sys(KSM_RUN_PATH, "1") < 0) { + munmap(region, size); + exit(1); + } + + split_vma_with_mprotect(region, size); + + /* Wait full merging */ + wait_ksm_merge(); + + enable_tracepoint(); + /* Move the page at offset page_size (any page is fine) */ + trigger_rmap_walk(region + page_size); + usleep(100000); /* allow trace to be written */ + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_KSM, &max_us, &avg_us, &count) =3D=3D= 0) { + printf("KSM rmap_walk latency:\n"); + printf(" Maximum duration: %.2f ms (%.0f us)\n", + max_us / 1000.0, (double)max_us); + printf(" Average duration: %.2f ms (%.0f us)\n", + avg_us / 1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } + + munmap(region, size); + write_sys(KSM_RUN_PATH, "2"); /* stop KSM */ +} + +/* + * Test for normal anonymous pages. 
+ */ +static void test_anon(void) +{ + void *region; + size_t size =3D NR_PAGES * page_size; + unsigned long long max_us, avg_us; + int count; + + printf("\n=3D=3D=3D Testing anonymous pages =3D=3D=3D\n"); + + region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap for anonymous"); + exit(1); + } + memset(region, TEST_PATTERN, size); + + split_vma_with_mprotect(region, size); + + enable_tracepoint(); + trigger_rmap_walk(region + page_size); + usleep(100000); + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_ANON, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("Anonymous page rmap_walk latency:\n"); + printf(" Maximum duration: %.2f ms (%.0f us)\n", + max_us / 1000.0, (double)max_us); + printf(" Average duration: %.2f ms (%.0f us)\n", + avg_us / 1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } + + munmap(region, size); +} + +/* + * Test for file-backed pages (mmap of a temporary file). 
+ */ +static void test_file(void) +{ + void *region; + size_t size =3D NR_PAGES * page_size; + int fd; + char filename[] =3D "/tmp/rmap_test_file_XXXXXX"; + + printf("\n=3D=3D=3D Testing file pages =3D=3D=3D\n"); + + fd =3D mkstemp(filename); + if (fd < 0) { + perror("mkstemp"); + exit(1); + } + if (ftruncate(fd, size) < 0) { + perror("ftruncate"); + unlink(filename); + close(fd); + exit(1); + } + + region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap for file"); + unlink(filename); + close(fd); + exit(1); + } + memset(region, TEST_PATTERN, size); + + split_vma_with_mprotect(region, size); + + enable_tracepoint(); + trigger_rmap_walk(region + page_size); + usleep(100000); + disable_tracepoint(); + + unsigned long long max_us, avg_us; + int count; + + if (parse_trace_and_print(PAGE_TYPE_FILE, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("File page rmap_walk latency:\n"); + printf(" Maximum duration: %.2f ms (%.0f us)\n", + max_us / 1000.0, (double)max_us); + printf(" Average duration: %.2f ms (%.0f us)\n", + avg_us / 1000.0, (double)avg_us); + printf(" Count: %d events\n", count); + } + + munmap(region, size); + unlink(filename); + close(fd); +} + +int main(void) +{ + page_size =3D getpagesize(); + + /* Need root for tracefs and KSM sysfs */ + if (geteuid() !=3D 0) { + fprintf(stderr, "This program must be run as root.\n"); + exit(1); + } + + if (numa_available() < 0) { + printf("Warning: NUMA not available, move_pages may not work.\n"); + exit(1); + } + + /* Run three tests */ + test_ksm(); + test_anon(); + test_file(); + + return 0; +} --=20 2.25. 
From nobody Mon May 25 02:57:20 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 006DE340419 for ; Tue, 19 May 2026 14:12:51 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.35 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199973; cv=none; b=uHFfm9x1E8eZ7YqKjCKUZSXn8DgeqyJLguQTqzH1T1TunwzHBoYJfTN1j1ll099tHB/z2fTRbL5Lov4vLwu9QwD+6k2HKR5I+G2utYy5fnTOG5UtXJ0RiNYZYdm+kTDSwrR5Y92z4yWe+EaP1oNIobeCX/iMMltEvqef1imRo9w= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779199973; c=relaxed/simple; bh=Gyw8mlo9GD1ur4vyw4aZVGqN9utTDb41CRUSY+6FzZU=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=utnNWw2Bkrsl20+0XSVo5qXYpMrU+54n06rP/oZTwZwblg+8Y293V87FPb0H3CE/eSv++n1C64N4Lm4SfBOQmEguawhpXMgepu+NkVqb7rvHm704Qp+ll09PH71YrzZZq+6eia+Hxf+4UCnBSz13KJI/akkgKy5818vTKOOx21g= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.35 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gKc815Wshz8XrrJ; Tue, 19 May 2026 22:12:49 +0800 (CST) Received: from xaxapp05.zte.com.cn ([10.99.98.109]) by mse-fl1.zte.com.cn with SMTP id 64JECenP060106; Tue, 19 May 2026 22:12:40 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from 
mapi (xaxapp04[null]) by mapi (Zmail) with MAPI id mid32; Tue, 19 May 2026 22:12:44 +0800 (CST) X-Zmail-TransId: 2afb6a0c6fdc718-f0c9a X-Mailer: Zmail v1.0 Message-ID: <20260519221244272O8Chn7CcMIG3xDLxJf_aQ@zte.com.cn> In-Reply-To: <20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn> References: 20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn Date: Tue, 19 May 2026 22:12:44 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY1IDMvNV0ga3NtOiBhZGQgcGdvZmYgaW50byBrc21fcm1hcF9pdGVt?= X-MAIL: mse-fl1.zte.com.cn 64JECenP060106 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Tue, 19 May 2026 22:12:49 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0C6FE1.000/4gKc815Wshz8XrrJ Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The reason for adding pgoff to ksm_rmap_item has been discussed in previous mailing list threads [1][2]. The main purpose is to allow the KSM reverse mapping to obtain the original page's linear page index, so that during anon_vma_tree traversal, it can conditionally locate the VMAs and avoid scanning the entire address space [0, ULONG_MAX]. To minimize the size impact of adding pgoff to ksm_rmap_item as much as possible, a trick that David suggested is to use a union that groups the members related to the unstable tree together with the newly added linear page index. The members that are valid only in the unstable tree include oldchecksum and the age information. However, the function should_skip_rmap_item() in the smart scanning needs a slight modification, since this function still uses the age information even when the rmap_item is in a stable state (the page is not KSM), a situation that occurs during COW faults. 
After using the union, the size is still 64 bytes, without increasing. We store pgoff in the same way as rmap_item->anon_vma: it is set when the page is merged and becomes a KSM page in try_to_merge_with_ksm_page(), and it is reset in remove_rmap_item_from_tree(), in remove_node_from_stable_tree() and in break_cow(). To clarify, the reason for resetting pgoff in break_cow() is: - When a page successfully becomes a KSM page (i.e., after stable_tree_append() sets STABLE_FLAG), both anon_vma and pgoff are stored and remain valid. - However, during the merging process there are several failure paths where a page that was temporarily treated as a KSM page must be reverted back to an anonymous page. Examples include: * The second call to try_to_merge_with_ksm_page() fails in try_to_merge_two_pages(). * stable_tree_insert() fails in cmp_and_merge_page(). In such cases, break_cow() is invoked to break the COW mapping and discard the KSM state. Currently, break_cow() already contains a put_anon_vma(rmap_item->anon_vma) to release the reference taken during the aborted merge. Because 'pgoff' is logically paired with anon_vma (both are only meaningful when the rmap_item is in a stable state), it must also be cleared (or reset) in break_cow() to avoid leaving stale pgoff values that could confuse subsequent rmap walks or scanning logic. 
[1] https://lore.kernel.org/all/adTPQSb-qSSHviJN@lucifer/ [2] https://lore.kernel.org/all/202604091806051535BJWZ_FTtdIm3Snk24ei_@zte.= com.cn/ Suggested-by: David Hildenbrand (Arm) Signed-off-by: xu xin --- mm/ksm.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7d5b76478f0b..4761ca3fa984 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -195,22 +195,28 @@ struct ksm_stable_node { * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node - * @age: number of scan iterations since creation - * @remaining_skips: how many scans to skip + * @age: number of scan iterations since creation (unstable node) + * @remaining_skips: how many scans to skip (unstable node) + * @pgoff: pgoff into @anon_vma where the page is mapped (stable tree) */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { - struct anon_vma *anon_vma; /* when stable */ + struct anon_vma *anon_vma; /* for reverse mapping, when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ - unsigned int oldchecksum; /* when unstable */ - rmap_age_t age; - rmap_age_t remaining_skips; + union { + struct { + unsigned int oldchecksum; + rmap_age_t age; + rmap_age_t remaining_skips; + }; /* when unstable */ + unsigned long pgoff; /* for reverse mapping, when stable */ + }; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -776,6 +782,10 @@ static struct vm_area_struct *find_mergeable_vma(struc= t mm_struct *mm, return vma; } +/* + * break_cow: actively break the write-protect of the VMA. This is called = when + * rmap_item has not yet become stable, but page has been merged. 
+ */ static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm =3D rmap_item->mm; @@ -787,6 +797,8 @@ static void break_cow(struct ksm_rmap_item *rmap_item) * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. (still unstabl= e) */ + rmap_item->pgoff =3D 0; mmap_read_lock(mm); vma =3D find_mergeable_vma(mm, addr); @@ -899,6 +911,8 @@ static void remove_node_from_stable_tree(struct ksm_sta= ble_node *stable_node) VM_BUG_ON(stable_node->rmap_hlist_len <=3D 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. */ + rmap_item->pgoff =3D 0; rmap_item->address &=3D PAGE_MASK; cond_resched(); } @@ -1052,6 +1066,8 @@ static void remove_rmap_item_from_tree(struct ksm_rma= p_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. */ + rmap_item->pgoff =3D 0; rmap_item->head =3D NULL; rmap_item->address &=3D PAGE_MASK; @@ -1598,8 +1614,15 @@ static int try_to_merge_with_ksm_page(struct ksm_rma= p_item *rmap_item, /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); - /* Must get reference to anon_vma while still holding mmap_lock */ + /* + * Must get reference to anon_vma while still holding mmap_lock, + * We set these two members of stable node here instead of + * stable_tree_append(), maybe because we don't want to hold + * mmap_read_lock again. Here mmap_read_lock is already held to + * find_mergeable_vma before merging. 
+ */ rmap_item->anon_vma =3D vma->anon_vma; + rmap_item->pgoff =3D linear_page_index(vma, rmap_item->address); get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); @@ -2458,6 +2481,10 @@ static bool should_skip_rmap_item(struct folio *foli= o, if (folio_test_ksm(folio)) return false; + /* There is no age information in stable-tree nodes. */ + if (rmap_item->address & STABLE_FLAG) + return false; + age =3D rmap_item->age; if (age !=3D U8_MAX) rmap_item->age++; --=20 2.25.1 From nobody Mon May 25 02:57:20 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 916F340960D for ; Tue, 19 May 2026 14:14:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779200076; cv=none; b=BkmCNP9LxZQHzIDf24Q+Xlsno3XDXYSpIKooiU4IMnP5pk+xeFcZ8G2QhFQLmLU7uEeDc4XIlTPrgnM2BHD4+OyhV/loyfkD/jJ8ZyRs2lXVI+p4Ih7Tu1V/IXAOhBS+SlalMyvquaiUdWM+mVyVWcTNt+v6hBXVxvYiXHPAAjY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779200076; c=relaxed/simple; bh=v9px9uZlDH8SMqGFCJPPfe2jy9iTse9LAC7uKk+0od8=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=tKHvB/kinzxDNFagN1fI7+1iXzYNWIs2ch8eKXC6b30a3GjAZoXHhxD4cvj8ulm/qyET0XSUmMviG7Npy80P13O4L/naqwCOGjZqa2YtK4GoHzEJ9+yMOS4T4axOAJ8+RPPDfH3BN0rXgIq5D68RSGF7+jNNL139p20plU9TBfc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown 
[10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gKc9z57cfz57DCd; Tue, 19 May 2026 22:14:31 +0800 (CST) Received: from xaxapp04.zte.com.cn ([10.99.98.157]) by mse-fl1.zte.com.cn with SMTP id 64JEENPi060712; Tue, 19 May 2026 22:14:23 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp05[null]) by mapi (Zmail) with MAPI id mid32; Tue, 19 May 2026 22:14:26 +0800 (CST) X-Zmail-TransId: 2afc6a0c70424a6-f4a24 X-Mailer: Zmail v1.0 Message-ID: <20260519221426579UmohTh1ABCHAeJWQ2gA7t@zte.com.cn> In-Reply-To: <20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn> References: 20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn Date: Tue, 19 May 2026 22:14:26 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY1IDQvNV0ga3NtOiBPcHRpbWl6ZSBybWFwX3dhbGtfa3NtIGJ5IHBhc3NpbmcgYSBzdWl0YWJsZSBwZ29mZg==?= Content-Type: text/plain; charset="utf-8" X-MAIL: mse-fl1.zte.com.cn 64JEENPi060712 X-TLS: YES X-SPF-DOMAIN: zte.com.cn X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SPF: None X-SOURCE-IP: 10.5.228.132 unknown Tue, 19 May 2026 22:14:31 +0800 X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0C7047.001/4gKc9z57cfz57DCd Content-Transfer-Encoding: quoted-printable From: xu xin User impact / Why this matters to Linux users =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D When a system runs with KSM enabled and memory becomes tight, KSM pages may be swapped out or migrated. The kernel then performs a reverse map walk to locate all page table entries that reference these pages. This walk must traverse every VMA that shares the same anon_vma. 
A large number of VMAs can attach to a single anon_vma not only due to fork(2) (COW sharing), but also due to **VMA splitting** – for example, applications frequently calling mprotect(2) on different parts of a large mapping, or memory allocators that use madvise(MADV_DONTNEED) to release sub-ranges. Over time, a single anonymous memory region can become thousands of small VMAs, all still linked to the original anon_vma. In our embedded test environment, we observed ~20,000 VMAs sharing one anon_vma without any fork – purely from VMA splits. When one of those VMAs maps a KSM page, the rmap walk for that KSM page becomes a bottleneck, because it holds the anon_vma lock for a long time. The anon_vma lock is not only used by KSM; it is a core lock protecting the VMA interval tree and is acquired by many critical memory operations: • Page faults: do_anonymous_page(), do_wp_page() (especially during COW) • Memory reclaim: try_to_unmap() • Page migration & compaction: migrate_pages(), compact_zone() • mlock / munlock: mlock_fixup() • Process exit: exit_mmap() (tearing down VMAs) • Cgroup memory accounting: mem_cgroup_move_charge() If one thread holds the anon_vma lock for hundreds of milliseconds because of an inefficient KSM rmap walk, any other thread that tries to acquire the same lock (e.g., an application taking a page fault, kswapd reclaiming pages, or a migration thread) will block. This leads to stalled application threads, increased latency spikes, and in extreme cases container timeouts or watchdog triggers. This patch reduces the worst-case anon_vma lock hold time during KSM rmap walk from >500 ms to <1 ms, thereby almost eliminating this source of lock contention and improving system responsiveness under memory pressure. 
Problem ======= When available memory is extremely tight, causing KSM pages to be swapped out, or when there is significant memory fragmentation and THP triggers memory compaction, the system will invoke the rmap_walk_ksm function to perform reverse mapping. However, we observed that this function becomes particularly time-consuming when a large number of VMAs (e.g., 20,000) share the same anon_vma. Through debug trace analysis, we found that most of the latency occurs within anon_vma_interval_tree_foreach, leading to an excessively long hold time on the anon_vma lock (even reaching 500ms or more), which in turn causes upper-layer applications (waiting for the anon_vma lock) to be blocked for extended periods. Root Cause ========== Further investigation revealed that 99.9% of iterations inside the anon_vma_interval_tree_foreach loop are skipped due to the first check, "if (addr < vma->vm_start || addr >= vma->vm_end)", indicating that a large number of loop iterations are ineffective. This inefficiency arises because the pgoff_start and pgoff_end parameters passed to anon_vma_interval_tree_foreach span the entire address space from 0 to ULONG_MAX, resulting in very poor loop efficiency. Solution ======== We cannot rely solely on anon_vma to locate all PTEs mapping this page; we also need the original page's pgoff. This follows from how anon_vma_interval_tree_foreach is implemented: it essentially iterates to find the suitable VMAs, i.e. those for which the provided pgoff falls within the candidate's vm_pgoff range: vm_pgoff <= pgoff (original linear page offset) <= (vm_pgoff + vma_pages(vma) - 1) Fortunately, pgoff was already added to ksm_rmap_item by the previous patch in this series, so we use it to narrow the range and accelerate the search. 
Test results =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D We provide a rmap testbench: tools/testing/rmap/rmap_benchmark.c The testing result in QEMU is shown as follows: KSM rmapping Maximum duration Average duration Before: 705.12 ms (705119858 ns) 532.04 ms (532041586 ns) After: 1.67 ms (1665917 ns) 1.44 ms (1443784 ns) Co-developed-by: Wang Yaxin Signed-off-by: Wang Yaxin Signed-off-by: xu xin --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 4761ca3fa984..7fe1a8753309 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3200,6 +3200,7 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_w= alk_control *rwc) hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { /* Ignore the stable/unstable/sqnr flags */ const unsigned long addr =3D rmap_item->address & PAGE_MASK; + const unsigned long pgoff =3D rmap_item->pgoff; struct anon_vma *anon_vma =3D rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -3213,8 +3214,12 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_= walk_control *rwc) anon_vma_lock_read(anon_vma); } + /* + * Currently KSM folios are order-0 normal pages, so pgoff_end + * should be the same as pgoff_start. 
+ */ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { + pgoff, pgoff) { cond_resched(); vma =3D vmac->vma; --=20 2.25.1 From nobody Mon May 25 02:57:20 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [183.62.165.209]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 755DE4028E1 for ; Tue, 19 May 2026 14:18:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=183.62.165.209 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779200294; cv=none; b=bRtUZfrabpJzK/glVku//RZx6mZ1uuhbODd+IBEjCcoorZXT8NlrCTv6/9VHTwTJBtpPcyiOBtqF61zOYRrhY4D2HpmeEwp9n5/dSSgoF1cR9K2ooIef7hYkWnX/K5RG+SIbDVwUqMUPmeK5+YoYe/1e4UIMsNCLgJJjmj0kGvA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779200294; c=relaxed/simple; bh=RtCjUDX7GD0wUDHCDGAVP4Mgq0zrv2Yl04aenfZZQj8=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=dJZAKVUh8YH+p/o+twhCK3z4swXsjZfvs+knTqXXgWu78OPoILEZeAXdyeysMcKy45ZMKiRQCXwxzYUlYq7g1QZnQYXSxVN/hxzwFwbCl1+WA7UQPAKZEs+EijngsGSPd2h5whF11CeyU7ghyXiiyIHm+o1n18nmq+Yr12voVn0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=183.62.165.209 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gKcG4614Yz4x6Cw; Tue, 19 May 2026 22:18:04 +0800 (CST) Received: from xaxapp05.zte.com.cn 
([10.99.98.109]) by mse-fl1.zte.com.cn with SMTP id 64JEI1ch062919; Tue, 19 May 2026 22:18:01 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp04[null]) by mapi (Zmail) with MAPI id mid32; Tue, 19 May 2026 22:18:05 +0800 (CST) X-Zmail-TransId: 2afb6a0c711d7d0-f5d4d X-Mailer: Zmail v1.0 Message-ID: <20260519221805076ac4m_iQchFDe37JqE1wW1@zte.com.cn> In-Reply-To: <20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn> References: 20260519220536792dMIKRMurt3vZ5lXC5pwh8@zte.com.cn Date: Tue, 19 May 2026 22:18:05 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , Cc: , , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY1IDUvNV0ga3NtOiBhZGQgbXJlbWFwIHNlbGZ0ZXN0cyBmb3Iga3NtX3JtYXBfd2Fsaw==?= X-MAIL: mse-fl1.zte.com.cn 64JEI1ch062919 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Tue, 19 May 2026 22:18:04 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0C711C.000/4gKcG4614Yz4x6Cw Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The existing tools/testing/selftests/mm/rmap.c already has one testcase for ksm_rmap_walk in TEST_F(migrate, ksm), which makes use of migrating a page from one NUMA node to another NUMA node. However, it lacks the scenario of mremapped VMAs. We add a call to mremap() and then trigger KSM to merge pages before migrating, specifically to test the optimization introduced by the patch ("ksm: Optimize rmap_walk_ksm by passing a suitable pgoff"). 
This test can reproduce the issue that Hugh points out at https://lore.kernel.org/all/02e1b8df-d568-8cbb-b8f6-46d5476d9d75@google.com/ Signed-off-by: xu xin --- tools/testing/selftests/mm/rmap.c | 76 ++++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 38 ++++++++++++++ tools/testing/selftests/mm/vm_util.h | 2 + 3 files changed, 116 insertions(+) diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm= /rmap.c index 53f2058b0ef2..f3eb693872ac 100644 --- a/tools/testing/selftests/mm/rmap.c +++ b/tools/testing/selftests/mm/rmap.c @@ -430,4 +430,80 @@ TEST_F(migrate, ksm) propagate_children(_metadata, data); } +static void prepare_pages(struct global_data *data, int nr_pages) +{ + /* Allocate exactly pages for the test */ + data->mapsize =3D nr_pages * getpagesize(); + data->region =3D mmap(NULL, data->mapsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (data->region =3D=3D MAP_FAILED) + ksft_exit_fail_perror("mmap failed"); + + /* Fill all pages with identical content to encourage KSM merging */ + memset(data->region, 0x77, data->mapsize); +} + +static int mremap_merge_and_migrate(struct global_data *data) +{ + int ret; + void *old_region; + int nr_pages =3D 32; + + prepare_pages(data, nr_pages); + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + old_region =3D data->region; + /* + * Mremap the second harf region to the first harf location (FIXED). 
+ */ + data->region =3D mremap(old_region + data->mapsize / 2, data->mapsize / 2, + data->mapsize / 2, MREMAP_MAYMOVE | MREMAP_FIXED, old_region); + if (data->region =3D=3D MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + return FAIL_ON_CHECK; + } + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + /* Attempt to migrate the merged KSM page */ + ret =3D try_to_move_page(data->region); + if (ret !=3D 0) { + ksft_print_msg("migration of KSM page after mremap failed\n"); + return FAIL_ON_CHECK; + } + + /* Ensure ksmd scan two turns at least to update ksm counters */ + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + if (ksm_get_pages_shared() !=3D 1 || + ksm_get_pages_sharing() !=3D nr_pages / 2 - 1) + return FAIL_ON_CHECK; + + return 0; +} + +TEST_F(migrate, ksm_and_mremap) +{ + struct global_data *data =3D &self->data; + int ret; + + /* Skip if KSM is not available */ + if (ksm_stop() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/run\" failed"); + if (ksm_get_full_scans() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/full_scan\" failed"); + + ret =3D prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno =3D=3D EINVAL) + SKIP(return, "PR_SET_MEMORY_MERGE not supported"); + else if (ret) + ksft_exit_fail_perror("PR_SET_MEMORY_MERGE=3D1 failed"); + + ASSERT_EQ(mremap_merge_and_migrate(data), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests= /mm/vm_util.c index db94564f4431..a33a4069de7c 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -648,6 +648,44 @@ long ksm_get_self_merging_pages(void) return strtol(buf, NULL, 10); } +long ksm_get_pages_shared(void) +{ + int ksm_pages_shared_fd; + char buf[10]; + ssize_t ret; + + ksm_pages_shared_fd =3D open("/sys/kernel/mm/ksm/pages_shared", O_RDONLY); + if (ksm_pages_shared_fd < 0) + return -errno; + + ret =3D pread(ksm_pages_shared_fd, buf, sizeof(buf) - 1, 0); + 
close(ksm_pages_shared_fd); + if (ret <=3D 0) + return -errno; + buf[ret] =3D 0; + + return strtol(buf, NULL, 10); +} + +long ksm_get_pages_sharing(void) +{ + int ksm_pages_sharing_fd; + char buf[10]; + ssize_t ret; + + ksm_pages_sharing_fd =3D open("/sys/kernel/mm/ksm/pages_sharing", O_RDONL= Y); + if (ksm_pages_sharing_fd < 0) + return -errno; + + ret =3D pread(ksm_pages_sharing_fd, buf, sizeof(buf) - 1, 0); + close(ksm_pages_sharing_fd); + if (ret <=3D 0) + return -errno; + buf[ret] =3D 0; + + return strtol(buf, NULL, 10); +} + long ksm_get_full_scans(void) { int ksm_full_scans_fd; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests= /mm/vm_util.h index 1a07305ceff4..3b40727c3f1f 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -151,6 +151,8 @@ void *sys_mremap(void *old_address, unsigned long old_s= ize, long ksm_get_self_zero_pages(void); long ksm_get_self_merging_pages(void); +long ksm_get_pages_shared(void); +long ksm_get_pages_sharing(void); long ksm_get_full_scans(void); int ksm_use_zero_pages(void); int ksm_start(void); --=20 2.25.1