From nobody Sun May 24 19:35:53 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4875830F52A for ; Fri, 22 May 2026 02:54:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418473; cv=none; b=e3z8eVOLDj+g1wUhhJiTOoFBP5u3IW+nOJpbV2Awegj2ekJoH5ikchGKt6gLJg/141l2krls9WF5k9ErBlcAYoX6iQXvIwrqkWm709u/Bx4c/8nD4Bh7eFyQs5iOQPbVX7v0YVui5thRHNpMcCWv7aykjkh8xDd272Xeem9wilk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418473; c=relaxed/simple; bh=r305FnM0O5UDSQXdVGtRToGKkwIg585DoKKyHIzJH+Y=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=dn7H1x+aaex1XxODPo6StSzYFgWxDhab++nsFBW5FaCVZgw06ciJ0c59kwxzjXmnpgvThRfqAnJQ95lQed9OaI4r5SzYaMRN3nKrrkuh+AhHtz5yFLfORdNzcUB92hrARqAOFWnD/FdBulEKq8SP7v8q+42y/wNhuf9tgk5ulHo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gM8xw5kSkz57DCf; Fri, 22 May 2026 10:54:28 +0800 (CST) Received: from xaxapp05.zte.com.cn ([10.99.98.109]) by mse-fl2.zte.com.cn with SMTP id 64M2sMOA002602; Fri, 22 May 2026 10:54:22 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from 
mapi (xaxapp02[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 10:54:23 +0800 (CST) X-Zmail-TransId: 2afa6a0fc55f23f-2a797 X-Mailer: Zmail v1.0 Message-ID: <202605221054232385LsDu1aTeJ-0fwQCqR-ro@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 10:54:23 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDEvNl0gbW0vcm1hcDogYWRkIHRyYWNlcG9pbnQgZm9yIHJtYXBfd2Fsa8KgwqA=?= X-MAIL: mse-fl2.zte.com.cn 64M2sMOA002602 X-TLS: YES X-SPF-DOMAIN: zte.com.cn X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SPF: None X-SOURCE-IP: 10.5.228.133 unknown Fri, 22 May 2026 10:54:28 +0800 X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC564.001/4gM8xw5kSkz57DCf Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin Add trace_rmap_walk_start() and trace_rmap_walk_end() to bracket reverse mapping walks. Unlike manual clock sampling, these tracepoints record no timestamp; latency can be computed offline by tools (e.g., perf, trace-cmd) using the event timestamps. When tracepoints are disabled, the only cost is a static branch check (no clock read, no duration calculation), making them suitable for production use. The information (folio type, locked state) helps diagnose performance issues in KSM, anonymous, and file-backed rmap walks. 
Signed-off-by: xu xin --- include/trace/events/rmap.h | 67 +++++++++++++++++++++++++++++++++++++ mm/rmap.c | 9 +++++ 2 files changed, 76 insertions(+) create mode 100644 include/trace/events/rmap.h diff --git a/include/trace/events/rmap.h b/include/trace/events/rmap.h new file mode 100644 index 000000000000..55a319ba6235 --- /dev/null +++ b/include/trace/events/rmap.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmap + +#if !defined(_TRACE_RMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMAP_H + +#include +#include + +#define GET_RMAP_PAGE_TYPE(folio) (folio_test_ksm(folio) ? "ksm" : \ + (folio_test_anon(folio) ? "anon" : "file")) + +/** + * rmap_walk_template - called for start / stop of rmap_walk. + */ +DECLARE_EVENT_CLASS(rmap_walk_template, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked), + + TP_STRUCT__entry( + __field(unsigned long, folio_addr) + __field(unsigned long, rwc_addr) + __string(page_type, GET_RMAP_PAGE_TYPE(folio)) + __field(bool, locked) + ), + + TP_fast_assign( + __entry->folio_addr =3D (unsigned long)folio; + __entry->rwc_addr =3D (unsigned long)rwc; + __assign_str(page_type); + __entry->locked =3D locked; + ), + + TP_printk("folio=3D%p rwc=3D%p page_type=3D%s locked=3D%s", + (void *)(unsigned long)__entry->folio_addr, + (void *)(unsigned long)__entry->rwc_addr, + __get_str(page_type), + __entry->locked ? "true" : "false") + +); + +/** + * rmap_walk_start - called before a folio is rmapped. 
+ */ +DEFINE_EVENT(rmap_walk_template, rmap_walk_start, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked) +); + +DEFINE_EVENT(rmap_walk_template, rmap_walk_end, + + TP_PROTO(struct folio *folio, struct rmap_walk_control *rwc, bool locked), + + TP_ARGS(folio, rwc, locked) +); + + +#endif /* _TRACE_RMAP_H */ + +/* This part must be outside protection */ +#include + diff --git a/mm/rmap.c b/mm/rmap.c index 78b7fb5f367c..52f795f768e1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,6 +80,7 @@ #define CREATE_TRACE_POINTS #include +#include #include "internal.h" #include "swap.h" @@ -3098,23 +3099,31 @@ static void rmap_walk_file(struct folio *folio, void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, false); + if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); + + trace_rmap_walk_end(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { + trace_rmap_walk_start(folio, rwc, true); + /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); + + trace_rmap_walk_end(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE --=20 2.25.1 From nobody Sun May 24 19:35:53 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [183.62.165.209]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 724251DF980 for ; Fri, 22 May 2026 02:57:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=183.62.165.209 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418625; cv=none; 
b=e3sbM9Xucg2CWy9pud9z1fmKkQsohbT6JXgcqecb1SpywzHvG/dhIeW6NwHj9SwbtZJl/NlhVNrbNovj8YYovVWoMgSRDiUYyoBXLMd3q+SKZdnJoEc5+n8oQFeF4q7ojXI/jc+Z/T1nhA/O43+TJqSkehdLUMyYo4owSP5DsA8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418625; c=relaxed/simple; bh=IoQKGPn7ihKK2ifnusV+doub63KldgoRTuG6cC7tNwQ=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=TDqhOSzhjY/w15A3VsyocU+nFGIyYW3xSge6P8+8kkbVkHY+ZrpfTX6ipGQpWdfBXP49YIkv9gh3WlE292ImYiJjhb5aLAgYz1YHY30S/klVQHCFmrmKIEh7pHV9AtqCGZpkORepw8q9ESfG1oD8PwTRN9T4rDkmqSjTMJeaFQs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=183.62.165.209 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gM90q6KHkz51Srq; Fri, 22 May 2026 10:56:59 +0800 (CST) Received: from xaxapp01.zte.com.cn ([10.88.99.176]) by mse-fl1.zte.com.cn with SMTP id 64M2un9r020614; Fri, 22 May 2026 10:56:49 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp05[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 10:56:50 +0800 (CST) X-Zmail-TransId: 2afc6a0fc5f225e-0b7d5 X-Mailer: Zmail v1.0 Message-ID: <20260522105650750eZ1hZ7TTa353dYy_uXG3i@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 10:56:50 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: 
Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDIvNl0gdG9vbHMvdGVzdGluZzogYWRkIHJtYXAgd2FsayBsYXRlbmN5IGJlbmNobWFyayBmb3LCoEtTTSwgYW5vbnltb3VzIGFuZCBmaWxlIHBhZ2Vz?= X-MAIL: mse-fl1.zte.com.cn 64M2un9r020614 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Fri, 22 May 2026 10:56:59 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC5FB.001/4gM90q6KHkz51Srq Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin Add a new benchmark that measures rmap_walk latency under controlled conditions. The test creates a large region (20,000 pages by default), splits the VMA into many small VMAs by mprotect(PROT_READ) on every other page, then triggers rmap_walk via move_pages(). The existing rmap_walk tracepoints (events/rmap/rmap_walk_start and events/rmap/rmap_walk_end) are used to collect duration for events with page_type=3Dksm, page_type=3Danon, and page_type=3Dfile. Three separate test cases are run: - KSM pages: allocate an anonymous region, fill with identical data, mark MADV_MERGEABLE, wait for KSM to merge all pages (by polling /sys/kernel/mm/ksm/full_scans), then trigger migration. - Anonymous pages: similar but without KSM merging. - File pages: mmap a temporary file with shared mapping and fill with identical data. For each test, the program prints the number of captured events and the maximum / average latency in milliseconds. This benchmark helps developers evaluate optimizations in the reverse mapping code, such as limiting max_page_sharing or improving tree traversal efficiency. Usage (must be run as root): cd tools/testing/rmap/ && make sudo ./rmap_benchmark =3D=3D=3D Testing KSM pages =3D=3D=3D Triggering rmap_walk via move_pages... 
KSM rmap_walk latency: Maximum duration: 705.12 ms (705119 us) Average duration: 532.04 ms (532041 us) Count: 4 events =3D=3D=3D Testing anonymous pages =3D=3D=3D Triggering rmap_walk via move_pages... Anonymous page rmap_walk latency: Maximum duration: 0.07 ms (69 us) Average duration: 0.05 ms (48 us) Count: 2 events =3D=3D=3D Testing file pages =3D=3D=3D Triggering rmap_walk via move_pages... File page rmap_walk latency: Maximum duration: 0.07 ms (67 us) Average duration: 0.03 ms (30 us) Count: 4 events Signed-off-by: xu xin --- tools/testing/rmap/Makefile | 11 + tools/testing/rmap/rmap_benchmark.c | 461 ++++++++++++++++++++++++++++ 2 files changed, 472 insertions(+) create mode 100644 tools/testing/rmap/Makefile create mode 100644 tools/testing/rmap/rmap_benchmark.c diff --git a/tools/testing/rmap/Makefile b/tools/testing/rmap/Makefile new file mode 100644 index 000000000000..200bd364cafb --- /dev/null +++ b/tools/testing/rmap/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +CC :=3D $(CROSS_COMPILE)gcc + +PROGS :=3D rmap_benchmark + +all: $(PROGS) + +rmap_benchmark: LDLIBS =3D -lnuma + +clean: + rm -fr $(PROGS) diff --git a/tools/testing/rmap/rmap_benchmark.c b/tools/testing/rmap/rmap_= benchmark.c new file mode 100644 index 000000000000..b163f4d6aec3 --- /dev/null +++ b/tools/testing/rmap/rmap_benchmark.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Reverse mapping latency test for KSM, anonymous and file pages + * + * This program creates a large number of pages (KSM merged, normal anonym= ous, + * or file mapped), splits the VMA into many small VMAs via mprotect, + * triggers rmap_walk by move_pages(), and collects latency data from the + * tracepoints 'rmap_walk_start' and 'rmap_walk_end' (offline timestamp di= ff). + * + * Usage: must be run as root (to access tracefs and KSM sysfs). + * + * Copyright 2026, ZTE Corp. 
+ * + * Author(s): Xu Xin + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Page size and test parameters */ +int page_size; +#define NR_PAGES 20000 /* Number of virtual pages */ +#define TEST_PATTERN 0xaa + +/* KSM sysfs paths */ +#define KSM_RUN_PATH "/sys/kernel/mm/ksm/run" +#define KSM_SLEEP_MS_PATH "/sys/kernel/mm/ksm/sleep_millisecs" +#define KSM_PAGES_TO_SCAN "/sys/kernel/mm/ksm/pages_to_scan" +#define KSM_FULL_SCANS_PATH "/sys/kernel/mm/ksm/full_scans" + +/* Tracepoint control paths - enable all events under rmap */ +#define TRACE_ENABLE "/sys/kernel/tracing/events/rmap/enable" +#define TRACE_FILE "/sys/kernel/tracing/trace" + +enum page_type { + PAGE_TYPE_KSM, + PAGE_TYPE_ANON, + PAGE_TYPE_FILE, +}; + +static const char *page_type_str(enum page_type type) +{ + switch (type) { + case PAGE_TYPE_KSM: return "ksm"; + case PAGE_TYPE_ANON: return "anon"; + case PAGE_TYPE_FILE: return "file"; + default: return "unknown"; + } +} + +/* Helper: read/write sysfs */ +static int write_sys(const char *path, const char *value) +{ + int fd =3D open(path, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open %s failed: %s\n", path, strerror(errno)); + return -1; + } + ssize_t ret =3D write(fd, value, strlen(value)); + close(fd); + if (ret !=3D (ssize_t)strlen(value)) { + fprintf(stderr, "write %s failed: %s\n", path, strerror(errno)); + return -1; + } + return 0; +} + +static int read_sys_int(const char *path, int *val) +{ + FILE *fp =3D fopen(path, "r"); + if (!fp) + return -1; + if (fscanf(fp, "%d", val) !=3D 1) { + fclose(fp); + return -1; + } + fclose(fp); + return 0; +} + +/* KSM full scan count */ +static int ksm_get_full_scans(void) +{ + int val; + if (read_sys_int(KSM_FULL_SCANS_PATH, &val)) + return -1; + return val; +} + +/* Wait for KSM full scans */ +static void wait_ksm_merge(void) +{ + int start_scans, end_scans; + int 
max_wait =3D 60, waited =3D 0; + + start_scans =3D ksm_get_full_scans(); + if (start_scans < 0) { + fprintf(stderr, "Failed to read initial full_scans\n"); + return; + } + if (write_sys(KSM_RUN_PATH, "1") < 0) { + fprintf(stderr, "Failed to start KSM\n"); + return; + } + do { + sleep(1); + end_scans =3D ksm_get_full_scans(); + if (end_scans < 0) + return; + waited++; + if (waited > max_wait) { + fprintf(stderr, "Warning: KSM full_scans not increased after %ds\n", ma= x_wait); + break; + } + } while (end_scans < start_scans + 2); +} + +/* Tracepoint enable/disable */ +static void enable_tracepoint(void) +{ + struct stat st; + if (stat("/sys/kernel/tracing/trace", &st) !=3D 0) { + if (mount("tracefs", "/sys/kernel/tracing", "tracefs", 0, NULL) !=3D 0) + fprintf(stderr, "Warning: mount tracefs failed: %s\n", strerror(errno)); + } + if (write_sys(TRACE_ENABLE, "1") < 0) + exit(1); + int fd =3D open(TRACE_FILE, O_WRONLY | O_TRUNC); + if (fd < 0) { + perror("open " TRACE_FILE); + exit(1); + } + close(fd); +} + +static void disable_tracepoint(void) +{ + write_sys(TRACE_ENABLE, "0"); +} + +/* Timestamp extraction (us) */ +static unsigned long long extract_timestamp_us(const char *line) +{ + char time_str[32]; + double ts_sec =3D 0.0; + if (sscanf(line, "%*s %*s %*s %31s", time_str) =3D=3D 1) { + char *colon =3D strchr(time_str, ':'); + if (colon) *colon =3D '\0'; + ts_sec =3D strtod(time_str, NULL); + } + return (unsigned long long)(ts_sec * 1e6); +} + +/* Safe start/end pairing using folio and rwc addresses */ +struct pending_start { + unsigned long long ts; + unsigned long folio; + unsigned long rwc; +}; + +static int parse_trace_and_print(enum page_type type, unsigned long long *= max_us, + unsigned long long *avg_us, int *count) +{ + FILE *fp =3D fopen(TRACE_FILE, "r"); + if (!fp) { + perror("fopen " TRACE_FILE); + return -1; + } + + char line[1024]; + struct pending_start pending[128]; + int pending_cnt =3D 0; + unsigned long long sum =3D 0, max_val =3D 0; + int pairs 
=3D 0; + const char *type_str =3D page_type_str(type); + char type_pattern[64]; + snprintf(type_pattern, sizeof(type_pattern), "page_type=3D%s", type_str); + + while (fgets(line, sizeof(line), fp)) { + if (!strstr(line, type_pattern)) + continue; + + /* Extract folio and rwc addresses */ + unsigned long folio =3D 0, rwc =3D 0; + char *folio_str =3D strstr(line, "folio=3D"); + char *rwc_str =3D strstr(line, "rwc=3D"); + if (folio_str && rwc_str) { + folio =3D strtoul(folio_str + 6, NULL, 16); + rwc =3D strtoul(rwc_str + 4, NULL, 16); + } else { + continue; + } + + if (strstr(line, "rmap_walk_start:")) { + if (pending_cnt < 128) { + pending[pending_cnt].ts =3D extract_timestamp_us(line); + pending[pending_cnt].folio =3D folio; + pending[pending_cnt].rwc =3D rwc; + pending_cnt++; + } + } else if (strstr(line, "rmap_walk_end:")) { + unsigned long long end_ts =3D extract_timestamp_us(line); + /* Find matching start event */ + for (int i =3D 0; i < pending_cnt; i++) { + if (pending[i].folio =3D=3D folio && pending[i].rwc =3D=3D rwc) { + unsigned long long delta =3D end_ts - pending[i].ts; + if (delta > max_val) max_val =3D delta; + sum +=3D delta; + pairs++; + /* Remove this pending entry */ + pending[i] =3D pending[--pending_cnt]; + break; + } + } + } + } + fclose(fp); + + if (pairs =3D=3D 0) { + printf("No rmap_walk events with page_type=3D%s found.\n", type_str); + return -1; + } + + *max_us =3D max_val; + *avg_us =3D sum / pairs; + *count =3D pairs; + return 0; +} + +/* Trigger rmap_walk via move_pages */ +static void trigger_rmap_walk(void *region) +{ + int ret, status, cur_node, target_node; + void *pages[1]; + int nodes[1]; + + ret =3D move_pages(0, 1, (void **)®ion, NULL, &status, MPOL_MF_MOVE_AL= L); + if (ret !=3D 0) { + perror("Failed to get original numa"); + exit(1); + } + cur_node =3D status; + + for (target_node =3D 0; target_node <=3D numa_max_node(); target_node++) { + if (numa_bitmask_isbitset(numa_all_nodes_ptr, target_node) && target_nod= e !=3D 
cur_node) + break; + } + if (target_node > numa_max_node()) { + fprintf(stderr, "No other NUMA node\n"); + exit(1); + } + + pages[0] =3D region; + nodes[0] =3D target_node; + ret =3D move_pages(0, 1, pages, nodes, &status, MPOL_MF_MOVE_ALL); + if (ret < 0) + perror("move_pages"); +} + +/* Split VMA with mprotect */ +static void split_vma_with_mprotect(void *addr, size_t size) +{ + for (size_t i =3D 0; i < size / page_size; i++) { + if (i % 2 =3D=3D 0) { + if (mprotect(addr + i * page_size, page_size, PROT_READ) < 0 && errno != =3D EACCES) + perror("mprotect"); + } + } +} + +/* KSM configuration save/restore */ +static struct ksm_config { + int run; + int sleep_ms; + int pages_to_scan; +} orig_ksm; + +static int save_ksm_config(void) +{ + if (read_sys_int(KSM_RUN_PATH, &orig_ksm.run) || + read_sys_int(KSM_SLEEP_MS_PATH, &orig_ksm.sleep_ms) || + read_sys_int(KSM_PAGES_TO_SCAN, &orig_ksm.pages_to_scan)) { + fprintf(stderr, "Failed to read KSM config\n"); + return -1; + } + return 0; +} + +static void restore_ksm_config(void) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%d", orig_ksm.run); + write_sys(KSM_RUN_PATH, buf); + snprintf(buf, sizeof(buf), "%d", orig_ksm.sleep_ms); + write_sys(KSM_SLEEP_MS_PATH, buf); + snprintf(buf, sizeof(buf), "%d", orig_ksm.pages_to_scan); + write_sys(KSM_PAGES_TO_SCAN, buf); +} + +/* KSM test */ +static void test_ksm(void) +{ + size_t size =3D NR_PAGES * page_size; + unsigned long long max_us, avg_us; + int count; + + if (save_ksm_config() < 0) { + printf("KSM not available, skip KSM test.\n"); + return; + } + + if (write_sys(KSM_RUN_PATH, "2") < 0 || + write_sys(KSM_SLEEP_MS_PATH, "0") < 0 || + write_sys(KSM_PAGES_TO_SCAN, "10000") < 0) { + fprintf(stderr, "Failed to configure KSM\n"); + restore_ksm_config(); + return; + } + + void *region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | M= AP_ANONYMOUS, -1, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap for KSM"); + restore_ksm_config(); + return; + } + + 
memset(region, TEST_PATTERN, size); + if (madvise(region, size, MADV_MERGEABLE) !=3D 0) { + perror("madvise MADV_MERGEABLE"); + munmap(region, size); + restore_ksm_config(); + return; + } + + if (write_sys(KSM_RUN_PATH, "1") < 0) { + perror("Start KSM"); + munmap(region, size); + restore_ksm_config(); + return; + } + + /* Construct a anon_vma shared by a number of unrelated VMAs */ + split_vma_with_mprotect(region, size); + wait_ksm_merge(); + + /* Trigger one page to be rmapped */ + enable_tracepoint(); + trigger_rmap_walk(region + page_size); + usleep(100000); + disable_tracepoint(); + + if (parse_trace_and_print(PAGE_TYPE_KSM, &max_us, &avg_us, &count) =3D=3D= 0) { + printf("KSM rmap_walk latency:\n"); + printf(" Max: %.2f ms (%.0f us)\n", max_us/1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us/1000.0, (double)avg_us); + printf(" Count: %d\n", count); + } + munmap(region, size); + restore_ksm_config(); +} + +/* Anonymous test */ +static void test_anon(void) +{ + size_t size =3D NR_PAGES * page_size; + unsigned long long max_us, avg_us; + int count; + void *region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | M= AP_ANONYMOUS, -1, 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap anon"); + return; + } + memset(region, TEST_PATTERN, size); + split_vma_with_mprotect(region, size); + enable_tracepoint(); + trigger_rmap_walk(region + page_size); + usleep(100000); + disable_tracepoint(); + if (parse_trace_and_print(PAGE_TYPE_ANON, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("Anonymous page rmap_walk latency:\n"); + printf(" Max: %.2f ms (%.0f us)\n", max_us/1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us/1000.0, (double)avg_us); + printf(" Count: %d\n", count); + } + munmap(region, size); +} + +/* File-backed test (with early unlink) */ +static void test_file(void) +{ + size_t size =3D NR_PAGES * page_size; + char filename[] =3D "/tmp/rmap_test_file_XXXXXX"; + int fd =3D mkstemp(filename); + if (fd < 0) { 
+ perror("mkstemp"); + return; + } + unlink(filename); /* file will vanish when fd closed, even on crash */ + if (ftruncate(fd, size) < 0) { + perror("ftruncate"); + close(fd); + return; + } + void *region =3D mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,= 0); + if (region =3D=3D MAP_FAILED) { + perror("mmap file"); + close(fd); + return; + } + memset(region, TEST_PATTERN, size); + split_vma_with_mprotect(region, size); + enable_tracepoint(); + trigger_rmap_walk(region + page_size); + usleep(100000); + disable_tracepoint(); + + unsigned long long max_us, avg_us; + int count; + if (parse_trace_and_print(PAGE_TYPE_FILE, &max_us, &avg_us, &count) =3D= =3D 0) { + printf("File page rmap_walk latency:\n"); + printf(" Max: %.2f ms (%.0f us)\n", max_us/1000.0, (double)max_us); + printf(" Avg: %.2f ms (%.0f us)\n", avg_us/1000.0, (double)avg_us); + printf(" Count: %d\n", count); + } + munmap(region, size); + close(fd); +} + +int main(void) +{ + page_size =3D getpagesize(); + + if (geteuid() !=3D 0) { + fprintf(stderr, "Must be run as root.\n"); + return 1; + } + if (numa_available() < 0) { + fprintf(stderr, "NUMA not available.\n"); + return 1; + } + + test_ksm(); + test_anon(); + test_file(); + return 0; +} --=20 2.25.1 From nobody Sun May 24 19:35:53 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [183.62.165.209]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 240672C375E for ; Fri, 22 May 2026 02:57:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=183.62.165.209 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418681; cv=none; b=ftWTtQlE4GLjExcsvVv9uzXwSfTm7FwYoz/h5doshbpmO8bLl8vjJ+CvbJScs221L5CwY4VdSkrlj72o9peESzbB7RTA7GdREFLy4NuKFW4kZeXXYs4z2lfaGyBTU79UvGtWMDB9CKnvigtEInL3CW17iBYpQBh4I3rlF81K5uo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1779418681; c=relaxed/simple; bh=dKNSkm6b5MO8d5Wl3ECAMi/t39rDd81bBXOFHxb0Y8s=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=sF3VGgJGcxNOECIGOkiy+sQrAdhBrx5T9mGaxB73TSN2GMBR/CEID2al/VuuIeBvPxVlDoPQWeqSuprnhn93KQJqyZ8swnibcWFsjVi2bsg/rhv0BGwnqsbBsa6j/YOLLCMIdX74PGUa4bi0KswjlOGaihYevRxu49iU7d0GYig= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=183.62.165.209 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gM91w53lNz51Srk; Fri, 22 May 2026 10:57:56 +0800 (CST) Received: from xaxapp05.zte.com.cn ([10.99.98.109]) by mse-fl1.zte.com.cn with SMTP id 64M2vru5021644; Fri, 22 May 2026 10:57:53 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp05[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 10:57:54 +0800 (CST) X-Zmail-TransId: 2afc6a0fc632e7e-0d3f4 X-Mailer: Zmail v1.0 Message-ID: <20260522105754477r3fHG2ryVBxFDxUNA895E@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 10:57:54 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDMvNl0gTUFJTlRBSU5FUlM6IGFkZCBteXNlbGYgYXMgcmV2aWV3ZXIgZm9yIHJtYXAgc2VjdGlvbg==?= X-MAIL: mse-fl1.zte.com.cn 64M2vru5021644 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn 
X-SOURCE-IP: 10.5.228.132 unknown Fri, 22 May 2026 10:57:56 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC634.000/4gM91w53lNz51Srk Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin To help review future changes related to rmap tracing and testing, add myself as a reviewer (R:) for the rmap entry, and also update the file patterns to include: - include/trace/events/rmap.h - tools/testing/rmap/rmap_benchmark.c Signed-off-by: Xu Xin --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8e7268d2f6ec..01cc34cc83a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17006,11 +17006,14 @@ R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn +R: Xu Xin L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: include/trace/events/rmap.h F: mm/page_vma_mapped.c F: mm/rmap.c +F: tools/testing/rmap/rmap_benchmark.c F: tools/testing/selftests/mm/rmap.c MEMORY MANAGEMENT - SECRETMEM --=20 2.25.1 From nobody Sun May 24 19:35:53 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2839039FD4 for ; Fri, 22 May 2026 03:00:14 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418816; cv=none; b=RbyqcasaL5kSr7NUf16r5wkwNhWXOmz53Z1dLINnVZkY+zzgLLKnVDa05KzpPkJ7IyoBPcDqJEDNjot9ZJXD6xVZRt0wReN8442G5tpdfb9FwMgRenPPCz8ft1eWDZbdwsHDAwwJRuH6gpqwP7fNtxBZi+hgtEnVpRIm2yY1/hg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418816; c=relaxed/simple; bh=al8X5naXJExnaDVynwWF5LVPfMA9TNOVBrbx0uSb1I8=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; 
b=pdo9M3vOUU6jRHJkZEUGa6CQF700RzOrotEGMrO6G1O+Go5weKYOvkVkQOPjLwyAcnFKgPyZiVY2P995B8Ob4DFIJls4kR6P4ZXi7LsbXCss7G3ta6C1qybIYM71K1ha8RtMmeIgyqa3pSHS0y2O1nyV5CHbuVJWIsl/4xtjKN0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gM94X2kvPz57DCh; Fri, 22 May 2026 11:00:12 +0800 (CST) Received: from xaxapp04.zte.com.cn ([10.99.98.157]) by mse-fl2.zte.com.cn with SMTP id 64M301FL013137; Fri, 22 May 2026 11:00:01 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp01[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 11:00:02 +0800 (CST) X-Zmail-TransId: 2af96a0fc6b2973-1fed6 X-Mailer: Zmail v1.0 Message-ID: <202605221100025182H60-ttQ2fjhaBEFVOBRm@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 11:00:02 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDQvNl0ga3NtOiBhZGQgcGdvZmYgaW50byBrc21fcm1hcF9pdGVt?= X-MAIL: mse-fl2.zte.com.cn 64M301FL013137 X-TLS: YES X-SPF-DOMAIN: zte.com.cn X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SPF: None X-SOURCE-IP: 10.5.228.133 unknown Fri, 22 May 2026 11:00:12 +0800 X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC6BC.001/4gM94X2kvPz57DCh 
Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The reason for adding pgoff to ksm_rmap_item has been discussed in previous mailing list threads [1][2]. The main purpose is to allow the KSM reverse mapping to obtain the original page's linear page index, so that during anon_vma_tree traversal it can conditionally locate the VMAs and avoid scanning the entire address space [0, ULONG_MAX]. To minimize the size impact of adding pgoff to ksm_rmap_item as much as possible, a trick that David suggested is to use a union that groups the members related to the unstable tree together with the newly added linear page index. The members that are valid only while in the unstable tree include oldchecksum and the age information. However, the function should_skip_rmap_item() in the smart scanning needs a slight modification, since this function still uses the age information even when the rmap_item is in a stable state (the page is not KSM), a situation that occurs during COW faults. With the union, the size remains 64 bytes, with no increase. We store pgoff the same way as rmap_item->anon_vma: it is set when the page is merged and becomes a KSM page in try_to_merge_with_ksm_page(), and it is reset in remove_rmap_item_from_tree(), in remove_node_from_stable_tree(), and in break_cow(). To clarify specifically, the reason for resetting pgoff in break_cow() is: - When a page successfully becomes a KSM page (i.e., after stable_tree_append() sets STABLE_FLAG), both anon_vma and pgoff are stored and remain valid. - However, during the merging process there are several failure paths where a page that was temporarily treated as a KSM page must be reverted back to an anonymous page. Examples include: * The second call to try_to_merge_with_ksm_page() fails in try_to_merge_two_pages(). * stable_tree_insert() fails in cmp_and_merge_page(). 
In such cases, break_cow() is invoked to break the COW mapping and discard the KSM state. Currently, break_cow() already contains a put_anon_vma(rmap_item->anon_vma) to release the reference taken during the aborted merge. Because 'pgoff' is logically paired with anon_vma (both are only meaningful when the rmap_item is in a stable state), it must also be cleared (or reset) in break_cow() to avoid leaving stale pgoff values that could confuse subsequent rmap walks or scanning logic. [1] https://lore.kernel.org/all/adTPQSb-qSSHviJN@lucifer/ [2] https://lore.kernel.org/all/202604091806051535BJWZ_FTtdIm3Snk24ei_@zte.= com.cn/ Suggested-by: David Hildenbrand (Arm) Signed-off-by: xu xin --- mm/ksm.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7d5b76478f0b..4761ca3fa984 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -195,22 +195,28 @@ struct ksm_stable_node { * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node - * @age: number of scan iterations since creation - * @remaining_skips: how many scans to skip + * @age: number of scan iterations since creation (unstable node) + * @remaining_skips: how many scans to skip (unstable node) + * @pgoff: pgoff into @anon_vma where the page is mapped (stable tree) */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { - struct anon_vma *anon_vma; /* when stable */ + struct anon_vma *anon_vma; /* for reverse mapping, when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ - unsigned int oldchecksum; /* when unstable */ - rmap_age_t age; - rmap_age_t remaining_skips; + union { + struct { + unsigned int oldchecksum; + rmap_age_t age; + rmap_age_t remaining_skips; + }; /* when unstable */ + 
unsigned long pgoff; /* for reverse mapping, when stable */ + }; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -776,6 +782,10 @@ static struct vm_area_struct *find_mergeable_vma(struc= t mm_struct *mm, return vma; } +/* + * break_cow: actively break the write-protect of the VMA. This is called = when + * rmap_item has not yet become stable, but page has been merged. + */ static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm =3D rmap_item->mm; @@ -787,6 +797,8 @@ static void break_cow(struct ksm_rmap_item *rmap_item) * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. (still unstabl= e) */ + rmap_item->pgoff =3D 0; mmap_read_lock(mm); vma =3D find_mergeable_vma(mm, addr); @@ -899,6 +911,8 @@ static void remove_node_from_stable_tree(struct ksm_sta= ble_node *stable_node) VM_BUG_ON(stable_node->rmap_hlist_len <=3D 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. */ + rmap_item->pgoff =3D 0; rmap_item->address &=3D PAGE_MASK; cond_resched(); } @@ -1052,6 +1066,8 @@ static void remove_rmap_item_from_tree(struct ksm_rma= p_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + /* Reset pgoff that might overlay age-related information. 
*/ + rmap_item->pgoff =3D 0; rmap_item->head =3D NULL; rmap_item->address &=3D PAGE_MASK; @@ -1598,8 +1614,15 @@ static int try_to_merge_with_ksm_page(struct ksm_rma= p_item *rmap_item, /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); - /* Must get reference to anon_vma while still holding mmap_lock */ + /* + * Must get reference to anon_vma while still holding mmap_lock, + * We set these two members of stable node here instead of + * stable_tree_append(), maybe because we don't want to hold + * mmap_read_lock again. Here mmap_read_lock is already held to + * find_mergeable_vma before merging. + */ rmap_item->anon_vma =3D vma->anon_vma; + rmap_item->pgoff =3D linear_page_index(vma, rmap_item->address); get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); @@ -2458,6 +2481,10 @@ static bool should_skip_rmap_item(struct folio *foli= o, if (folio_test_ksm(folio)) return false; + /* There is no age information in stable-tree nodes. */ + if (rmap_item->address & STABLE_FLAG) + return false; + age =3D rmap_item->age; if (age !=3D U8_MAX) rmap_item->age++; --=20 2.25.1 From nobody Sun May 24 19:35:53 2026 Received: from mxhk.zte.com.cn (mxhk.zte.com.cn [160.30.148.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8F7713128AB for ; Fri, 22 May 2026 03:01:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=160.30.148.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418877; cv=none; b=uVwvhwQEooVhwlvLL21/36wLkXIY1Pki30UWmjlC4/54dfVSki0+YQQhSRxhxySp/kTlI6mBikH9J9FOX/ScxH4yoeZAeJzIOTopaNwWC/y/fV0J21ESfSS1q/8uFpqFQNHiamkzIodeG41JFlLCfyurqd9YR4h4s0U3RuRHuWw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418877; c=relaxed/simple; bh=6JiZYS6Nw1zIsBjcNNHGo2UasrnXCLSghyY8mrWBi/w=; 
h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=PJBLE5GWCRPOxRKqBK94Bip1LrIgPnIcqt0Ckp5GCmwbBVeTVC+IB3V8qxsI8GspRW+2Bv+J8S92jHJU+h4Hu1+gVx+y+8fzjeeY0eYSUvFvTZpXxRYh3jOw05iVLSMvntikyDG8pXcyb/8wr1fonpXDhWvYnYAkkVVJNYOaXkE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=160.30.148.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl2.zte.com.cn (unknown [10.5.228.133]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxhk.zte.com.cn (FangMail) with ESMTPS id 4gM95j631Dz57DCl; Fri, 22 May 2026 11:01:13 +0800 (CST) Received: from xaxapp02.zte.com.cn ([10.88.97.241]) by mse-fl2.zte.com.cn with SMTP id 64M318fY023988; Fri, 22 May 2026 11:01:08 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp04[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 11:01:09 +0800 (CST) X-Zmail-TransId: 2afb6a0fc6f5df8-1136a X-Mailer: Zmail v1.0 Message-ID: <20260522110109231BdqordSWkUPmdr47-ghXo@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 11:01:09 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDUvNl0ga3NtOiBPcHRpbWl6ZSBybWFwX3dhbGtfa3NtIGJ5IHBhc3NpbmcgYSBzdWl0YWJsZSBwZ29mZg==?= Content-Type: text/plain; charset="utf-8" X-MAIL: mse-fl2.zte.com.cn 64M318fY023988 X-TLS: YES X-SPF-DOMAIN: zte.com.cn X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SPF: None 
X-SOURCE-IP: 10.5.228.133 unknown Fri, 22 May 2026 11:01:13 +0800 X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC6F9.000/4gM95j631Dz57DCl Content-Transfer-Encoding: quoted-printable From: xu xin User impact / Why this matters to Linux users =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D When a system runs with KSM enabled and memory becomes tight, KSM pages may be swapped out or migrated. The kernel then performs a reverse map walk by rmap_walk_ksm to locate all page table entries that reference these pages. If a large number of unrelated VMAs are attached to the single anon_vma associated with this KSM page, then rmap_walk can become a severe performance bottleneck. In our embedded test environment, we observed ~20,000 VMAs sharing one anon_vma without any fork =E2=80=93 purely from VMA splits, which cause rmap_walk_ksm to take 200~700ms. When one of those VMAs maps a KSM page, the rmap walk of this KSM page becomes a bottleneck because it holds the anon_vma lock for a long time. The anon_vma lock is not only used by KSM; it is a core lock protecting the VMA interval tree and is acquired by many critical memory operations: =E2=80=A2 Page faults: do_anonymous_page(), do_wp_page() (especially duri= ng COW) =E2=80=A2 Memory reclaim: try_to_unmap() =E2=80=A2 Page migration & compaction: migrate_pages(), compact_zone() =E2=80=A2 mlock / munlock: mlock_fixup() =E2=80=A2 Process exit: exit_mmap() (tearing down VMAs) =E2=80=A2 Cgroup memory accounting: mem_cgroup_move_charge() If one thread holds the anon_vma lock for hundreds of milliseconds because of an inefficient KSM rmap walk, any other thread that tries to acquire the same lock (e.g., an application taking a page fault, kswapd reclaiming pages, or a migration thread) will block. This leads to stalled application threads, increased latency spikes, and in extreme cases container timeouts or watchdog triggers.
This patch reduces the worst-case anon_vma lock hold time during KSM rmap walk from >500 ms to <1 ms, thereby almost eliminating this source of lock contention and improving system responsiveness under memory pressure. Real-world examples: =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D - JVM / Go runtime: These use mmap for heap regions and later call mprotect(PROT_NONE) for garbage collection barriers or guard pages, splitting the original VMA into thousands of small pieces over time. - Database engines (MySQL, PostgreSQL): Large shared memory buffers or anonymous mappings are managed with madvise(MADV_DONTNEED) to release specific pages, which also splits VMAs. * Why the benchmark numbers are realistic: We observed ~20,000 VMAs sharing one anon_vma on a production system running a Java application with KSM enabled. The lock hold time before the patch was measured at 228=E2=80=AFms= (max) during rmap walks triggered by memory compaction and page migration. The benchmark reproduces that VMA count and lock=E2=80=91hold behavior in a controlled environment. Root Cause =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D Through local debugging trace analysis, we found that most of the latency of rmap_walk_ksm occurs within anon_vma_interval_tree_foreach, leading to an excessively long hold time on the anon_vma lock (even reaching 500ms or mor= e), which in turn causes upper-layer applications (waiting for the anon_vma loc= k) to be blocked for extended periods. Further investigation revealed that 99.9% of iterations inside the anon_vma_interval_tree_foreach loop are skipped due to the first check "if (addr < vma->vm_start || addr >=3D vma->vm_end)", indicating that a lar= ge number of loop iterations are ineffective. This inefficiency arises because the pgoff_start and pgoff_end parameters passed to anon_vma_interval_tree_foreach span the entire address space from 0 to ULONG_MAX, resulting in very poor loop efficiency.
Solution =3D=3D=3D=3D=3D=3D=3D=3D We cannot rely solely on anon_vma to locate all PTEs mapping this page but also need to have the original page's pgoff. This follows from the implementation of anon_vma_interval_tree_foreach =E2=80=94 it essentially iterates to find a = suitable VMA such that the provided pgoff falls within the candidate's vm_pgoff rang= e. vm_pgoff <=3D pgoff (original linear page offset) <=3D (vm_pgoff + vma_page= s(v) - 1) Fortunately, ksm_rmap_item already carries pgoff from the previous patch of this series, so we use it to accelerate the search. Test results =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D We provide an rmap testbench: tools/testing/rmap/rmap_benchmark.c The testing result in QEMU is shown as follows: KSM rmapping Maximum duration Average duration Before: 705.12 ms (705119858 ns) 532.04 ms (532041586 ns) After: 1.67 ms (1665917 ns) 1.44 ms (1443784 ns) Co-developed-by: Wang Yaxin Signed-off-by: Wang Yaxin Signed-off-by: xu xin --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 4761ca3fa984..7fe1a8753309 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3200,6 +3200,7 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_w= alk_control *rwc) hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { /* Ignore the stable/unstable/sqnr flags */ const unsigned long addr =3D rmap_item->address & PAGE_MASK; + const unsigned long pgoff =3D rmap_item->pgoff; struct anon_vma *anon_vma =3D rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -3213,8 +3214,12 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_= walk_control *rwc) anon_vma_lock_read(anon_vma); } + /* + * Currently KSM folios are order-0 normal pages, so pgoff_end + * should be the same as pgoff_start.
+ */ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { + pgoff, pgoff) { cond_resched(); vma =3D vmac->vma; --=20 2.25.1 From nobody Sun May 24 19:35:53 2026 Received: from mxct.zte.com.cn (mxct.zte.com.cn [183.62.165.209]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C053430BF4F for ; Fri, 22 May 2026 03:02:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=183.62.165.209 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418959; cv=none; b=li5aH52BxkJEVU6y0nnpXZpfBnzV405fXPbVbgFOpJX50fH1tz9UiIK03gzQZWpFHMvjwZpVwiguFQaeIrvGctGbOqSVrtYxwfJ0X6yNaOXuX8BW3Ra1Zd/POnM8XcgylzbX6Lmv97mQ0Kgst9nCS7+TLxC0uC88HohuEDLGATA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779418959; c=relaxed/simple; bh=RtCjUDX7GD0wUDHCDGAVP4Mgq0zrv2Yl04aenfZZQj8=; h=Message-ID:In-Reply-To:References:Date:Mime-Version:From:To:Cc: Subject:Content-Type; b=rAaPgGVpU51dllckX/NQVguIHqnOHgkPRt5vqNeK4GO8vtACm2x0mAkesdkVcP1Xip3jCSCuH6pNywq9bv6zafJToEnzLOAZJs06k8SUVwzPkuxvgVqjbDJqfMiVZAcUxqOnYi0A6oqUmLDY/0okJR/ysUrM5uKqoL0uxXu5ShA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn; spf=pass smtp.mailfrom=zte.com.cn; arc=none smtp.client-ip=183.62.165.209 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=zte.com.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=zte.com.cn Received: from mse-fl1.zte.com.cn (unknown [10.5.228.132]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange x25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mxct.zte.com.cn (FangMail) with ESMTPS id 4gM97G26m8z4xNtZ; Fri, 22 May 2026 11:02:34 +0800 (CST) Received: from xaxapp01.zte.com.cn 
([10.88.99.176]) by mse-fl1.zte.com.cn with SMTP id 64M32NX7039363; Fri, 22 May 2026 11:02:24 +0800 (+08) (envelope-from xu.xin16@zte.com.cn) Received: from mapi (xaxapp02[null]) by mapi (Zmail) with MAPI id mid32; Fri, 22 May 2026 11:02:25 +0800 (CST) X-Zmail-TransId: 2afa6a0fc741c42-3818d X-Mailer: Zmail v1.0 Message-ID: <202605221102251615sSZpzKPatH1ieTk7pInS@zte.com.cn> In-Reply-To: <20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn> References: 20260522105234715fKI7KSsjC5XpEVMwoV6rI@zte.com.cn Date: Fri, 22 May 2026 11:02:25 +0800 (CST) Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Mime-Version: 1.0 From: To: , , Cc: , , , , , , Subject: =?UTF-8?B?W1BBVENIIHY2IDYvNl0ga3NtOiBhZGQgbXJlbWFwIHNlbGZ0ZXN0cyBmb3Iga3NtX3JtYXBfd2Fsaw==?= X-MAIL: mse-fl1.zte.com.cn 64M32NX7039363 X-TLS: YES X-ENVELOPE-SENDER: xu.xin16@zte.com.cn X-SOURCE-IP: 10.5.228.132 unknown Fri, 22 May 2026 11:02:34 +0800 X-CLEAN: YES X-Fangmail-Anti-Spam-Filtered: true X-Fangmail-MID-QID: 6A0FC74A.000/4gM97G26m8z4xNtZ Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: xu xin The existing tools/testing/selftests/mm/rmap.c has already one testcase for ksm_rmap_walk in TEST_F(migrate, ksm), which makes use of migration of page from one NUMA node to another NUMA node. However, it just lacks the scenario of mremapped VMAs. We add the calling of mremap() and then trigger KSM to merge pages before migrating, which is specifically intended to test the optimization introduced by the patch ("ksm: Optimize rmap_walk_ksm by passing a suitable pgoff").
This test can reproduce the issue that Hugh points out at https://lore.kernel.org/all/02e1b8df-d568-8cbb-b8f6-46d5476d9d75@google.com/ Signed-off-by: xu xin --- tools/testing/selftests/mm/rmap.c | 76 ++++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 38 ++++++++++++++ tools/testing/selftests/mm/vm_util.h | 2 + 3 files changed, 116 insertions(+) diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm= /rmap.c index 53f2058b0ef2..f3eb693872ac 100644 --- a/tools/testing/selftests/mm/rmap.c +++ b/tools/testing/selftests/mm/rmap.c @@ -430,4 +430,80 @@ TEST_F(migrate, ksm) propagate_children(_metadata, data); } +static void prepare_pages(struct global_data *data, int nr_pages) +{ + /* Allocate exactly nr_pages pages for the test */ + data->mapsize =3D nr_pages * getpagesize(); + data->region =3D mmap(NULL, data->mapsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (data->region =3D=3D MAP_FAILED) + ksft_exit_fail_perror("mmap failed"); + + /* Fill all pages with identical content to encourage KSM merging */ + memset(data->region, 0x77, data->mapsize); +} + +static int mremap_merge_and_migrate(struct global_data *data) +{ + int ret; + void *old_region; + int nr_pages =3D 32; + + prepare_pages(data, nr_pages); + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + old_region =3D data->region; + /* + * Mremap the second half of the region onto the first half's location (FIXED).
+ */ + data->region =3D mremap(old_region + data->mapsize / 2, data->mapsize / 2, + data->mapsize / 2, MREMAP_MAYMOVE | MREMAP_FIXED, old_region); + if (data->region =3D=3D MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + return FAIL_ON_CHECK; + } + + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + /* Attempt to migrate the merged KSM page */ + ret =3D try_to_move_page(data->region); + if (ret !=3D 0) { + ksft_print_msg("migration of KSM page after mremap failed\n"); + return FAIL_ON_CHECK; + } + + /* Ensure ksmd scan two turns at least to update ksm counters */ + if (ksm_start() < 0) + return FAIL_ON_CHECK; + + if (ksm_get_pages_shared() !=3D 1 || + ksm_get_pages_sharing() !=3D nr_pages / 2 - 1) + return FAIL_ON_CHECK; + + return 0; +} + +TEST_F(migrate, ksm_and_mremap) +{ + struct global_data *data =3D &self->data; + int ret; + + /* Skip if KSM is not available */ + if (ksm_stop() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/run\" failed"); + if (ksm_get_full_scans() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/full_scan\" failed"); + + ret =3D prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno =3D=3D EINVAL) + SKIP(return, "PR_SET_MEMORY_MERGE not supported"); + else if (ret) + ksft_exit_fail_perror("PR_SET_MEMORY_MERGE=3D1 failed"); + + ASSERT_EQ(mremap_merge_and_migrate(data), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests= /mm/vm_util.c index db94564f4431..a33a4069de7c 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -648,6 +648,44 @@ long ksm_get_self_merging_pages(void) return strtol(buf, NULL, 10); } +long ksm_get_pages_shared(void) +{ + int ksm_pages_shared_fd; + char buf[10]; + ssize_t ret; + + ksm_pages_shared_fd =3D open("/sys/kernel/mm/ksm/pages_shared", O_RDONLY); + if (ksm_pages_shared_fd < 0) + return -errno; + + ret =3D pread(ksm_pages_shared_fd, buf, sizeof(buf) - 1, 0); + 
close(ksm_pages_shared_fd); + if (ret <=3D 0) + return -errno; + buf[ret] =3D 0; + + return strtol(buf, NULL, 10); +} + +long ksm_get_pages_sharing(void) +{ + int ksm_pages_sharing_fd; + char buf[10]; + ssize_t ret; + + ksm_pages_sharing_fd =3D open("/sys/kernel/mm/ksm/pages_sharing", O_RDONL= Y); + if (ksm_pages_sharing_fd < 0) + return -errno; + + ret =3D pread(ksm_pages_sharing_fd, buf, sizeof(buf) - 1, 0); + close(ksm_pages_sharing_fd); + if (ret <=3D 0) + return -errno; + buf[ret] =3D 0; + + return strtol(buf, NULL, 10); +} + long ksm_get_full_scans(void) { int ksm_full_scans_fd; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests= /mm/vm_util.h index 1a07305ceff4..3b40727c3f1f 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -151,6 +151,8 @@ void *sys_mremap(void *old_address, unsigned long old_s= ize, long ksm_get_self_zero_pages(void); long ksm_get_self_merging_pages(void); +long ksm_get_pages_shared(void); +long ksm_get_pages_sharing(void); long ksm_get_full_scans(void); int ksm_use_zero_pages(void); int ksm_start(void); --=20 2.25.1