From: Byungchul Park
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com,
    vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com,
    willy@infradead.org, david@redhat.com, peterz@infradead.org,
    luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
    dave.hansen@linux.intel.com, rjgolo@gmail.com
Subject: [RFC PATCH v12 01/26] x86/tlb: add APIs manipulating tlb batch's arch data
Date: Thu, 20 Feb 2025 14:20:02 +0900
Message-Id: <20250220052027.58847-2-byungchul@sk.com>
In-Reply-To: <20250220052027.58847-1-byungchul@sk.com>

A new mechanism, LUF (Lazy Unmap Flush), defers the tlb flush for folios
that have been unmapped and freed until they eventually get allocated
again.  This is safe for folios that had been mapped read-only and were
then unmapped, since the contents of such folios do not change while
they stay in pcp or buddy, so the data can still be read correctly
through the stale tlb entries.

This is a preparation for that mechanism, which needs to recognize
read-only tlb entries by separating the tlb batch's arch data into two
parts, one for read-only entries and the other for writable ones, and
merging the two when needed.  It also optimizes tlb shootdown by
skipping CPUs that have already performed the required tlb flush since
then.  To support this, add APIs manipulating the arch data for x86.

Signed-off-by: Byungchul Park
---
 arch/x86/include/asm/tlbflush.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 69e79fff41b80..0ae9564c7301e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -293,6 +294,29 @@ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)

 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

+static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *batch)
+{
+        cpumask_clear(&batch->cpumask);
+}
+
+static inline void arch_tlbbatch_fold(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        cpumask_or(&bdst->cpumask, &bdst->cpumask, &bsrc->cpumask);
+}
+
+static inline bool arch_tlbbatch_need_fold(struct arch_tlbflush_unmap_batch *batch,
+                                           struct mm_struct *mm)
+{
+        return !cpumask_subset(mm_cpumask(mm), &batch->cpumask);
+}
+
+static inline bool arch_tlbbatch_done(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        return !cpumask_andnot(&bdst->cpumask, &bdst->cpumask, &bsrc->cpumask);
+}
+
 static inline bool pte_flags_need_flush(unsigned long oldflags,
                                         unsigned long newflags,
                                         bool ignore_access)
--
2.17.1
From: Byungchul Park
Subject: [RFC PATCH v12 02/26] arm64/tlbflush: add APIs manipulating tlb batch's arch data
Date: Thu, 20 Feb 2025 14:20:03 +0900
Message-Id: <20250220052027.58847-3-byungchul@sk.com>

A new mechanism, LUF (Lazy Unmap Flush), defers the tlb flush for folios
that have been unmapped and freed until they eventually get allocated
again.  This is safe for folios that had been mapped read-only and were
then unmapped, since the contents of such folios do not change while
they stay in pcp or buddy, so the data can still be read correctly
through the stale tlb entries.

This is a preparation for that mechanism, which requires manipulating
the tlb batch's arch data.  Even though arm64 needs to do nothing here,
any arch with CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH should provide
the APIs.

Signed-off-by: Byungchul Park
---
 arch/arm64/include/asm/tlbflush.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c056079..a62e1ea61e4af 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -354,6 +354,33 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
         dsb(ish);
 }

+static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *batch)
+{
+        /* nothing to do */
+}
+
+static inline void arch_tlbbatch_fold(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        /* nothing to do */
+}
+
+static inline bool arch_tlbbatch_need_fold(struct arch_tlbflush_unmap_batch *batch,
+                                           struct mm_struct *mm)
+{
+        /*
+         * Nothing is needed in this architecture.
+         */
+        return false;
+}
+
+static inline bool arch_tlbbatch_done(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        /* Kernel can consider tlb batch always has been done. */
+        return true;
+}
+
 /*
  * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
  * necessarily a performance improvement.
--
2.17.1
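A minimal sketch of what the arm64 stubs above mean for generic callers
(illustrative only; luf_need_more_work() is a made-up helper and is not
part of the series): generic code can call the helpers unconditionally,
and on arm64 they degenerate to constants, so a batch never appears to
miss a CPU and is always treated as done.

/*
 * Illustration only, not part of the patch: on arm64 both branches
 * below are constant-folded away, since arch_tlbbatch_need_fold()
 * always returns false and arch_tlbbatch_done() always returns true.
 */
static bool luf_need_more_work(struct arch_tlbflush_unmap_batch *pending,
                               struct arch_tlbflush_unmap_batch *flushed,
                               struct mm_struct *mm)
{
        if (arch_tlbbatch_need_fold(pending, mm))       /* always false here */
                return true;

        return !arch_tlbbatch_done(pending, flushed);   /* always false here */
}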
From: Byungchul Park
Subject: [RFC PATCH v12 03/26] riscv/tlb: add APIs manipulating tlb batch's arch data
Date: Thu, 20 Feb 2025 14:20:04 +0900
Message-Id: <20250220052027.58847-4-byungchul@sk.com>

A new mechanism, LUF (Lazy Unmap Flush), defers the tlb flush for folios
that have been unmapped and freed until they eventually get allocated
again.  This is safe for folios that had been mapped read-only and were
then unmapped, since the contents of such folios do not change while
they stay in pcp or buddy, so the data can still be read correctly
through the stale tlb entries.

This is a preparation for that mechanism, which needs to recognize
read-only tlb entries by separating the tlb batch's arch data into two
parts, one for read-only entries and the other for writable ones, and
merging the two when needed.  It also optimizes tlb shootdown by
skipping CPUs that have already performed the required tlb flush since
then.  To support this, add APIs manipulating the arch data for riscv.

Signed-off-by: Byungchul Park
---
 arch/riscv/include/asm/tlbflush.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 72e5599349529..1dc7d30273d59 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -8,6 +8,7 @@
 #define _ASM_RISCV_TLBFLUSH_H

 #include
+#include
 #include
 #include

@@ -65,6 +66,33 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
 void arch_flush_tlb_batched_pending(struct mm_struct *mm);
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

+static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *batch)
+{
+        cpumask_clear(&batch->cpumask);
+
+}
+
+static inline void arch_tlbbatch_fold(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        cpumask_or(&bdst->cpumask, &bdst->cpumask, &bsrc->cpumask);
+
+}
+
+static inline bool arch_tlbbatch_need_fold(struct arch_tlbflush_unmap_batch *batch,
+                                           struct mm_struct *mm)
+{
+        return !cpumask_subset(mm_cpumask(mm), &batch->cpumask);
+
+}
+
+static inline bool arch_tlbbatch_done(struct arch_tlbflush_unmap_batch *bdst,
+                                      struct arch_tlbflush_unmap_batch *bsrc)
+{
+        return !cpumask_andnot(&bdst->cpumask, &bdst->cpumask, &bsrc->cpumask);
+
+}
+
 extern unsigned long tlb_flush_all_threshold;
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()                   do { } while (0)
--
2.17.1
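As a reading aid, here is a minimal usage sketch of the semantics the
four helpers are expected to provide on the cpumask-tracking
architectures above (x86 in patch 01, riscv in patch 03).  The function
arch_tlbbatch_sketch() and its flow are made up for illustration and are
not part of the series; only the arch_tlbbatch_*() calls themselves come
from these patches.

/* Illustration only: expected semantics of the new helpers. */
static void arch_tlbbatch_sketch(struct arch_tlbflush_unmap_batch *a,
                                 struct arch_tlbflush_unmap_batch *b,
                                 struct mm_struct *mm)
{
        /* Accumulate batch 'b' into batch 'a': a->cpumask |= b->cpumask. */
        arch_tlbbatch_fold(a, b);

        /* Does 'mm' run on any CPU that batch 'a' does not cover yet? */
        if (arch_tlbbatch_need_fold(a, mm))
                pr_info("batch 'a' would miss some of mm's CPUs\n");

        /*
         * Subtract the CPUs that batch 'b' has already flushed from 'a'.
         * A true return means nothing is left pending in 'a', so it can
         * be reset cheaply instead of being flushed again.
         */
        if (arch_tlbbatch_done(a, b))
                arch_tlbbatch_clear(a);
}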
From: Byungchul Park
Subject: [RFC PATCH v12 04/26] x86/tlb, riscv/tlb, mm/rmap: separate arch_tlbbatch_clear() out of arch_tlbbatch_flush()
Date: Thu, 20 Feb 2025 14:20:05 +0900
Message-Id: <20250220052027.58847-5-byungchul@sk.com>

A new mechanism, LUF (Lazy Unmap Flush), defers the tlb flush for folios
that have been unmapped and freed until they eventually get allocated
again.  This is safe for folios that had been mapped read-only and were
then unmapped, since the contents of such folios do not change while
they stay in pcp or buddy, so the data can still be read correctly
through the stale tlb entries.

This is a preparation for that mechanism, which needs to avoid redundant
tlb flushes by manipulating the tlb batch's arch data.  To achieve that,
separate the part that clears the tlb batch's arch data out of
arch_tlbbatch_flush(), and let the caller clear it explicitly.

Signed-off-by: Byungchul Park
---
 arch/riscv/mm/tlbflush.c | 1 -
 arch/x86/mm/tlb.c        | 2 --
 mm/rmap.c                | 1 +
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 9b6e86ce38674..36f996af6256c 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -201,5 +201,4 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
         __flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0,
                           FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
-        cpumask_clear(&batch->cpumask);
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 86593d1b787d8..860e49b223fd7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1262,8 +1262,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
                 local_irq_enable();
         }

-        cpumask_clear(&batch->cpumask);
-
         put_flush_tlb_info();
         put_cpu();
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index c6c4d4ea29a7e..2de01de164ef0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -648,6 +648,7 @@ void try_to_unmap_flush(void)
                 return;

         arch_tlbbatch_flush(&tlb_ubc->arch);
+        arch_tlbbatch_clear(&tlb_ubc->arch);
         tlb_ubc->flush_required = false;
         tlb_ubc->writable = false;
 }
--
2.17.1
From: Byungchul Park
Subject: [RFC PATCH v12 05/26] mm/buddy: make room for a new variable, luf_key, in struct page
Date: Thu, 20 Feb 2025 14:20:06 +0900
Message-Id: <20250220052027.58847-6-byungchul@sk.com>

Functionally, no change.  This is a preparation for the luf mechanism,
which tracks the need of a tlb flush for each page residing in buddy.

Since the private field in struct page is used only to store the page
order while in buddy, ranging from 0 to MAX_PAGE_ORDER, it can be
covered with an unsigned short.  So split it into two smaller fields,
order and luf_key, so that both can be used in buddy at the same time.

Signed-off-by: Byungchul Park
---
 include/linux/mm_types.h | 42 +++++++++++++++++++++++++++++++++-------
 mm/internal.h            |  4 ++--
 mm/page_alloc.c          |  2 +-
 3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 80fef38d9d645..20d85c4e609de 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -106,13 +106,27 @@ struct page {
                                 pgoff_t index;          /* Our offset within mapping. */
                                 unsigned long share;    /* share count for fsdax */
                         };
-                        /**
-                         * @private: Mapping-private opaque data.
-                         * Usually used for buffer_heads if PagePrivate.
-                         * Used for swp_entry_t if swapcache flag set.
-                         * Indicates order in the buddy system if PageBuddy.
-                         */
-                        unsigned long private;
+                        union {
+                                /**
+                                 * @private: Mapping-private opaque data.
+                                 * Usually used for buffer_heads if PagePrivate.
+                                 * Used for swp_entry_t if swapcache flag set.
+                                 * Indicates order in the buddy system if PageBuddy.
+                                 */
+                                unsigned long private;
+                                struct {
+                                        /*
+                                         * Indicates order in the buddy system if PageBuddy.
+                                         */
+                                        unsigned short order;

+                                        /*
+                                         * For tracking need of tlb flush,
+                                         * by luf(lazy unmap flush).
+                                         */
+                                        unsigned short luf_key;
+                                };
+                        };
                 };
                 struct {        /* page_pool used by netstack */
                         /**
@@ -537,6 +551,20 @@ static inline void set_page_private(struct page *page, unsigned long private)
         page->private = private;
 }

+#define page_buddy_order(page)          ((page)->order)
+
+static inline void set_page_buddy_order(struct page *page, unsigned int order)
+{
+        page->order = (unsigned short)order;
+}
+
+#define page_luf_key(page)              ((page)->luf_key)
+
+static inline void set_page_luf_key(struct page *page, unsigned short luf_key)
+{
+        page->luf_key = luf_key;
+}
+
 static inline void *folio_get_private(struct folio *folio)
 {
         return folio->private;
diff --git a/mm/internal.h b/mm/internal.h
index 5a7302baeed7c..754f1dd763448 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -541,7 +541,7 @@ struct alloc_context {
 static inline unsigned int buddy_order(struct page *page)
 {
         /* PageBuddy() must be checked by the caller */
-        return page_private(page);
+        return page_buddy_order(page);
 }

 /*
@@ -555,7 +555,7 @@ static inline unsigned int buddy_order(struct page *page)
  * times, potentially observing different values in the tests and the actual
  * use of the result.
  */
-#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))
+#define buddy_order_unsafe(page)        READ_ONCE(page_buddy_order(page))

 /*
  * This function checks whether a page is free && is the buddy
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 839708353cb77..59c26f59db3d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -576,7 +576,7 @@ void prep_compound_page(struct page *page, unsigned int order)

 static inline void set_buddy_order(struct page *page, unsigned int order)
 {
-        set_page_private(page, order);
+        set_page_buddy_order(page, order);
         __SetPageBuddy(page);
 }

--
2.17.1
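To make the size argument concrete, here is a small compile-time sketch
(illustrative only; page_order_luf_key_sketch() and the checks are not
part of the patch): buddy orders are bounded by MAX_PAGE_ORDER, which is
far below what an unsigned short can hold, so order and luf_key can
share the slot that used to hold private.

/* Illustration only: why 16 bits are enough for a buddy order. */
static void page_order_luf_key_sketch(struct page *page)
{
        BUILD_BUG_ON(MAX_PAGE_ORDER > USHRT_MAX);

        set_page_buddy_order(page, MAX_PAGE_ORDER);     /* fits in ->order */
        set_page_luf_key(page, 1);                      /* coexists in ->luf_key */

        WARN_ON(page_buddy_order(page) != MAX_PAGE_ORDER);
        WARN_ON(page_luf_key(page) != 1);
}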
From: Byungchul Park
Subject: [RFC PATCH v12 06/26] mm: move should_skip_kasan_poison() to mm/internal.h
Date: Thu, 20 Feb 2025 14:20:07 +0900
Message-Id: <20250220052027.58847-7-byungchul@sk.com>

Functionally, no change.  This is a preparation for the luf mechanism,
which needs to be able to call should_skip_kasan_poison() from
mm/internal.h.

Signed-off-by: Byungchul Park
---
 mm/internal.h   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c | 47 -----------------------------------------------
 2 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 754f1dd763448..e3084d32272e3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1038,8 +1038,55 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 DECLARE_STATIC_KEY_TRUE(deferred_pages);

 bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+
+static inline bool deferred_pages_enabled(void)
+{
+        return static_branch_unlikely(&deferred_pages);
+}
+#else
+static inline bool deferred_pages_enabled(void)
+{
+        return false;
+}
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. For generic KASAN: deferred memory initialization has not yet completed.
+ *    Tag-based KASAN modes skip pages freed via deferred memory initialization
+ *    using page tags instead (see below).
+ * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
+ *    that error detection is disabled for accesses via the page address.
+ *
+ * Pages will have match-all tags in the following circumstances:
+ *
+ * 1. Pages are being initialized for the first time, including during deferred
+ *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
+ * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
+ *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
+ * 3. The allocation was excluded from being checked due to sampling,
+ *    see the call to kasan_unpoison_pages.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problem in large memory systems as the deferred pages
+ * initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page)
+{
+        if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+                return deferred_pages_enabled();
+
+        return page_kasan_tag(page) == KASAN_TAG_KERNEL;
+}
+
 enum mminit_level {
         MMINIT_WARNING,
         MMINIT_VERIFY,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59c26f59db3d6..244cb30496be5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -299,11 +299,6 @@ int page_group_by_mobility_disabled __read_mostly;
  */
 DEFINE_STATIC_KEY_TRUE(deferred_pages);

-static inline bool deferred_pages_enabled(void)
-{
-        return static_branch_unlikely(&deferred_pages);
-}
-
 /*
  * deferred_grow_zone() is __init, but it is called from
  * get_page_from_freelist() during early boot until deferred_pages permanently
@@ -316,11 +311,6 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
         return deferred_grow_zone(zone, order);
 }
 #else
-static inline bool deferred_pages_enabled(void)
-{
-        return false;
-}
-
 static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
 {
         return false;
@@ -993,43 +983,6 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
         return ret;
 }

-/*
- * Skip KASAN memory poisoning when either:
- *
- * 1. For generic KASAN: deferred memory initialization has not yet completed.
- *    Tag-based KASAN modes skip pages freed via deferred memory initialization
- *    using page tags instead (see below).
- * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
- *    that error detection is disabled for accesses via the page address.
- *
- * Pages will have match-all tags in the following circumstances:
- *
- * 1. Pages are being initialized for the first time, including during deferred
- *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
- * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
- *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
- * 3. The allocation was excluded from being checked due to sampling,
- *    see the call to kasan_unpoison_pages.
- *
- * Poisoning pages during deferred memory init will greatly lengthen the
- * process and cause problem in large memory systems as the deferred pages
- * initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline bool should_skip_kasan_poison(struct page *page)
-{
-        if (IS_ENABLED(CONFIG_KASAN_GENERIC))
-                return deferred_pages_enabled();
-
-        return page_kasan_tag(page) == KASAN_TAG_KERNEL;
-}
-
 static void kernel_init_pages(struct page *page, int numpages)
 {
         int i;
--
2.17.1
From: Byungchul Park
Subject: [RFC PATCH v12 07/26] mm: introduce luf_ugen to be used as a global timestamp
Date: Thu, 20 Feb 2025 14:20:08 +0900
Message-Id: <20250220052027.58847-8-byungchul@sk.com>

Functionally, no change.  This is a preparation for the luf mechanism,
which needs to evaluate the temporal order of events to determine
whether the required tlb flush has already been done on each CPU.

To achieve that, this patch introduces a generation number, luf_ugen, to
be used as a global timestamp, along with a few APIs manipulating it.
It is worth noting that the number is designed to wrap around, so care
must be taken when comparing two values.

Signed-off-by: Byungchul Park
---
 include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++
 mm/rmap.c          | 22 ++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fecd47239fa99..53a5f1cb21e0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4161,4 +4161,38 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla
 }
 #endif

+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH)
+/*
+ * luf_ugen will start with 2 so that 1 can be regarded as a passed one.
+ */
+#define LUF_UGEN_INIT   2
+
+static inline bool ugen_before(unsigned long a, unsigned long b)
+{
+        /*
+         * Consider wraparound.
+         */
+        return (long)(a - b) < 0;
+}
+
+static inline unsigned long next_ugen(unsigned long ugen)
+{
+        if (ugen + 1)
+                return ugen + 1;
+        /*
+         * Avoid invalid ugen, zero.
+         */
+        return ugen + 2;
+}
+
+static inline unsigned long prev_ugen(unsigned long ugen)
+{
+        if (ugen - 1)
+                return ugen - 1;
+        /*
+         * Avoid invalid ugen, zero.
+         */
+        return ugen - 2;
+}
+#endif
 #endif /* _LINUX_MM_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 2de01de164ef0..ed345503e4f88 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -634,6 +634,28 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 }

 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+
+/*
+ * This generation number is primarily used as a global timestamp to
+ * determine whether tlb flush required has been done on each CPU. The
+ * function, ugen_before(), should be used to evaluate the temporal
+ * sequence of events because the number is designed to wraparound.
+ */
+static atomic_long_t __maybe_unused luf_ugen = ATOMIC_LONG_INIT(LUF_UGEN_INIT);
+
+/*
+ * Don't return invalid luf_ugen, zero.
+ */
+static unsigned long __maybe_unused new_luf_ugen(void)
+{
+        unsigned long ugen = atomic_long_inc_return(&luf_ugen);
+
+        if (!ugen)
+                ugen = atomic_long_inc_return(&luf_ugen);
+
+        return ugen;
+}
+
 /*
  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
  * important if a PTE was dirty when it was unmapped that it's flushed
--
2.17.1

From: Byungchul Park
Subject: [RFC PATCH v12 08/26] mm: introduce luf_batch to be used as hash table to store luf meta data
Date: Thu, 20 Feb 2025 14:20:09 +0900
Message-Id: <20250220052027.58847-9-byungchul@sk.com>

Functionally, no change.  This is a preparation for the luf mechanism,
which needs to keep luf meta data for each page while the page stays in
the pcp or buddy allocator.  The meta data includes the cpumask for the
tlb shootdown and luf's request generation number.

Since struct page does not have enough room to store the luf meta data,
this patch introduces a hash table to store it and makes each page keep
its hash key instead.  Since all the pages in pcp or buddy share the
hash table, collisions are inevitable, so care must be taken when
reading or updating an entry.
Signed-off-by: Byungchul Park
---
 include/linux/mm_types.h |  10 ++++
 mm/internal.h            |   8 +++
 mm/rmap.c                | 122 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 20d85c4e609de..39a6b5124b01f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -32,6 +32,16 @@
 struct address_space;
 struct mem_cgroup;

+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+struct luf_batch {
+        struct tlbflush_unmap_batch batch;
+        unsigned long ugen;
+        rwlock_t lock;
+};
+#else
+struct luf_batch {};
+#endif
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
diff --git a/mm/internal.h b/mm/internal.h
index e3084d32272e3..b38a9ae9d6993 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1240,6 +1240,8 @@ extern struct workqueue_struct *mm_percpu_wq;
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
 void flush_tlb_batched_pending(struct mm_struct *mm);
+void fold_batch(struct tlbflush_unmap_batch *dst, struct tlbflush_unmap_batch *src, bool reset);
+void fold_luf_batch(struct luf_batch *dst, struct luf_batch *src);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -1250,6 +1252,12 @@ static inline void try_to_unmap_flush_dirty(void)
 static inline void flush_tlb_batched_pending(struct mm_struct *mm)
 {
 }
+static inline void fold_batch(struct tlbflush_unmap_batch *dst, struct tlbflush_unmap_batch *src, bool reset)
+{
+}
+static inline void fold_luf_batch(struct luf_batch *dst, struct luf_batch *src)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

 extern const struct trace_print_flags pageflag_names[];
diff --git a/mm/rmap.c b/mm/rmap.c
index ed345503e4f88..74fbf6c2fb3a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -641,7 +641,7 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
  * function, ugen_before(), should be used to evaluate the temporal
  * sequence of events because the number is designed to wraparound.
  */
-static atomic_long_t __maybe_unused luf_ugen = ATOMIC_LONG_INIT(LUF_UGEN_INIT);
+static atomic_long_t luf_ugen = ATOMIC_LONG_INIT(LUF_UGEN_INIT);

 /*
  * Don't return invalid luf_ugen, zero.
@@ -656,6 +656,122 @@ static unsigned long __maybe_unused new_luf_ugen(void)
         return ugen;
 }

+static void reset_batch(struct tlbflush_unmap_batch *batch)
+{
+        arch_tlbbatch_clear(&batch->arch);
+        batch->flush_required = false;
+        batch->writable = false;
+}
+
+void fold_batch(struct tlbflush_unmap_batch *dst,
+                struct tlbflush_unmap_batch *src, bool reset)
+{
+        if (!src->flush_required)
+                return;
+
+        /*
+         * Fold src to dst.
+         */
+        arch_tlbbatch_fold(&dst->arch, &src->arch);
+        dst->writable = dst->writable || src->writable;
+        dst->flush_required = true;
+
+        if (!reset)
+                return;
+
+        /*
+         * Reset src.
+         */
+        reset_batch(src);
+}
+
+/*
+ * The range that luf_key covers, which is 'unsigned short' type.
+ */
+#define NR_LUF_BATCH    (1 << (sizeof(short) * 8))
+
+/*
+ * Use 0th entry as accumulated batch.
+ */
+static struct luf_batch luf_batch[NR_LUF_BATCH];
+
+static void luf_batch_init(struct luf_batch *lb)
+{
+        rwlock_init(&lb->lock);
+        reset_batch(&lb->batch);
+        lb->ugen = atomic_long_read(&luf_ugen) - 1;
+}
+
+static int __init luf_init(void)
+{
+        int i;
+
+        for (i = 0; i < NR_LUF_BATCH; i++)
+                luf_batch_init(&luf_batch[i]);
+
+        return 0;
+}
+early_initcall(luf_init);
+
+/*
+ * key to point an entry of the luf_batch array
+ *
+ * note: zero means invalid key
+ */
+static atomic_t luf_kgen = ATOMIC_INIT(1);
+
+/*
+ * Don't return invalid luf_key, zero.
+ */
+static unsigned short __maybe_unused new_luf_key(void)
+{
+        unsigned short luf_key = atomic_inc_return(&luf_kgen);
+
+        if (!luf_key)
+                luf_key = atomic_inc_return(&luf_kgen);
+
+        return luf_key;
+}
+
+static void __fold_luf_batch(struct luf_batch *dst_lb,
+                             struct tlbflush_unmap_batch *src_batch,
+                             unsigned long src_ugen)
+{
+        /*
+         * dst_lb->ugen represents one that requires tlb shootdown for
+         * it, that is, sort of request number. The newer it is, the
+         * more tlb shootdown might be needed to fulfill the newer
+         * request. Conservatively keep the newer one.
+         */
+        if (!dst_lb->ugen || ugen_before(dst_lb->ugen, src_ugen))
+                dst_lb->ugen = src_ugen;
+        fold_batch(&dst_lb->batch, src_batch, false);
+}
+
+void fold_luf_batch(struct luf_batch *dst, struct luf_batch *src)
+{
+        unsigned long flags;
+
+        /*
+         * Exactly same. Nothing to fold.
+         */
+        if (dst == src)
+                return;
+
+        if (&src->lock < &dst->lock) {
+                read_lock_irqsave(&src->lock, flags);
+                write_lock(&dst->lock);
+        } else {
+                write_lock_irqsave(&dst->lock, flags);
+                read_lock(&src->lock);
+        }
+
+        __fold_luf_batch(dst, &src->batch, src->ugen);
+
+        write_unlock(&dst->lock);
+        read_unlock_irqrestore(&src->lock, flags);
+}
+
 /*
  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
  * important if a PTE was dirty when it was unmapped that it's flushed
@@ -670,9 +786,7 @@ void try_to_unmap_flush(void)
                 return;

         arch_tlbbatch_flush(&tlb_ubc->arch);
-        arch_tlbbatch_clear(&tlb_ubc->arch);
-        tlb_ubc->flush_required = false;
-        tlb_ubc->writable = false;
+        reset_batch(tlb_ubc);
 }

 /* Flush iff there are potentially writable TLB entries that can race with IO */
--
2.17.1
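For orientation, here is a sketch of how a page's luf_key (from patch
05) is expected to select and read an entry of the shared table, and of
how the wraparound-safe generation comparison from patch 07 is meant to
be used.  Both helpers, luf_batch_pending() and luf_ugen_passed(), are
made up for illustration and are not introduced by this series; the real
lookups appear in later patches.  Because NR_LUF_BATCH covers the whole
unsigned short range, any key indexes a valid slot, but unrelated pages
may collide on the same slot, which is why every access takes lb->lock.

/* Illustration only: a collision-prone lookup into the shared table. */
static bool luf_batch_pending(unsigned short luf_key)
{
        struct luf_batch *lb = &luf_batch[luf_key];
        unsigned long flags;
        bool pending;

        read_lock_irqsave(&lb->lock, flags);
        pending = lb->batch.flush_required;
        read_unlock_irqrestore(&lb->lock, flags);

        return pending;
}

/* Illustration only: wraparound-safe comparison of generation numbers. */
static bool luf_ugen_passed(unsigned long ugen, unsigned long done_ugen)
{
        /* true if the request 'ugen' is no newer than 'done_ugen' */
        return !ugen_before(done_ugen, ugen);
}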
It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -670,9 +786,7 @@ void try_to_unmap_flush(void) return; =20 arch_tlbbatch_flush(&tlb_ubc->arch); - arch_tlbbatch_clear(&tlb_ubc->arch); - tlb_ubc->flush_required =3D false; - tlb_ubc->writable =3D false; + reset_batch(tlb_ubc); } =20 /* Flush iff there are potentially writable TLB entries that can race with= IO */ --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id A48241E3DD8 for ; Thu, 20 Feb 2025 05:36:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029765; cv=none; b=L7wZUyYahfNR0Mg19rqSv9stflZaNJpP+H3DI0qPZbM/RHa+EqIxzCass4csPWWB8Q1bv2FluKwf/kx7FJXNZTrpgyTgNtXLIn18owJ9IynDxSc3MKav+Jbpn9SnHkdlO8EVUohi15n4UeCNDc+J786liQMOtnzvcGbGML6oscE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029765; c=relaxed/simple; bh=O+xLeL/yU205IDwGfwDxMSn+hZr/1tNXQbKe4WQie4c=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=u7VlFWu0x16wtsHiAPZlOAvQ3euiyUr+wT4+svTbiAmVZTnrP6bfVNciu3Ecr2ZIkegD2vzgIYcMZ0GvUsmJi+0L6E4ObvVcU3TW/gGuIRwbp/LZivmtUmfjCdybGb2RNq0BB2jnsXp7pAxgdNIxDXiomE1FmsWO6UTB2/h3UtE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-db-67b6bba6bdc3 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 09/26] mm: introduce API to perform tlb shootdown on exit from page allocator Date: Thu, 20 Feb 2025 14:20:10 +0900 Message-Id: <20250220052027.58847-10-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrKLMWRmVeSWpSXmKPExsXC9ZZnoe6y3dvSDZqvaFjMWb+GzeLzhn9s Fi82tDNafF3/i9ni6ac+FovLu+awWdxb85/V4vyutawWO5buY7K4dGABk8Xx3gNMFvPvfWaz 2LxpKrPF8SlTGS1+/wAqPjlrMouDgMf31j4Wj52z7rJ7LNhU6rF5hZbH4j0vmTw2repk89j0 aRK7x7tz59g9Tsz4zeIx72Sgx/t9V9k8tv6y82iceo3N4/MmuQC+KC6blNSczLLUIn27BK6M +yvPMxc8Eq24MTG2gXG2UBcjJ4eEgInEjvZmRhj70fdrbCA2m4C6xI0bP5lBbBEBM4mDrX/Y QWxmgbtMEgf6wWqEBVIl9v3azwpiswioSnxZshKshheofsmViSwQM+UlVm84ADaHEyj+Y0Yv WK+QgKnEuwWXmLoYuYBq3rNJvNo0jQ2iQVLi4IobLBMYeRcwMqxiFMrMK8tNzMwx0cuozMus 0EvOz93ECAz8ZbV/oncwfroQfIhRgINRiYd3Ruu2dCHWxLLiytxDjBIczEoivG31W9KFeFMS K6tSi/Lji0pzUosPMUpzsCiJ8xp9K08REkhPLEnNTk0tSC2CyTJxcEo1MBaz89d1iDzqnOTv Wsv6OfT3ye8lpr67Dr3OvbT1yZ+oO7P1LnBV+f8qrBOOfT/Z5L92Z8dHsZRdZz4sVGV0u+Gl X9T0Up0zMH7hpd+Z50wTlAsjhWffMMx4L8zztVr4T8OtaC3lmTa/X7I5G16xjRdaffaukLwR b5auDzMT19VjPxg/mszrUmIpzkg01GIuKk4EAEubtG54AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrDLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g5UXZS3mrF/DZvF5wz82 
ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlXF/5XnmgkeiFTcmxjYwzhbqYuTkkBAwkXj0/RobiM0moC5x48ZPZhBbRMBM 4mDrH3YQm1ngLpPEgX6wGmGBVIl9v/azgtgsAqoSX5asBKvhBapfcmUiC8RMeYnVGw6AzeEE iv+Y0QvWKyRgKvFuwSWmCYxcCxgZVjGKZOaV5SZm5pjqFWdnVOZlVugl5+duYgSG8bLaPxN3 MH657H6IUYCDUYmH98HjrelCrIllxZW5hxglOJiVRHjb6rekC/GmJFZWpRblxxeV5qQWH2KU 5mBREuf1Ck9NEBJITyxJzU5NLUgtgskycXBKNTDufxh/WJi78Pbx/Q1Xyk4sWdB0wM5nl/fC WMbUo0+0dp+P49vOF9PKni/QUx93dW+8g63bNjeNJY/2/gvKbbkn5phUrT5RwKph5vfZ6T9j L4r4r9i/8bejsbnO569m9v7sB2tPuZhv2Nfy+Whv56JpMbfX/ZvQYuhtfuSggv/lWyxPbCTX Ln2jxFKckWioxVxUnAgA3SuVa18CAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that performs tlb shootdown required on exit from page allocator. This patch introduced a new API rather than making use of existing try_to_unmap_flush() to avoid repeated and redundant tlb shootdown due to frequent page allocations during a session of batched unmap flush. Signed-off-by: Byungchul Park --- include/linux/sched.h | 1 + mm/internal.h | 4 ++++ mm/rmap.c | 20 ++++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index bb343136ddd05..8e6e7a83332cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1375,6 +1375,7 @@ struct task_struct { #endif =20 struct tlbflush_unmap_batch tlb_ubc; + struct tlbflush_unmap_batch tlb_ubc_takeoff; =20 /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/mm/internal.h b/mm/internal.h index b38a9ae9d6993..cbdebf8a02437 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1239,6 +1239,7 @@ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); +void try_to_unmap_flush_takeoff(void); void flush_tlb_batched_pending(struct mm_struct *mm); void fold_batch(struct tlbflush_unmap_batch *dst, struct tlbflush_unmap_ba= tch *src, bool reset); void fold_luf_batch(struct luf_batch *dst, struct luf_batch *src); @@ -1249,6 +1250,9 @@ static inline void try_to_unmap_flush(void) static inline void try_to_unmap_flush_dirty(void) { } +static inline void try_to_unmap_flush_takeoff(void) +{ +} static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } diff --git a/mm/rmap.c b/mm/rmap.c index 74fbf6c2fb3a7..72c5e665e59a4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -772,6 +772,26 @@ void fold_luf_batch(struct luf_batch *dst, struct luf_= batch *src) read_unlock_irqrestore(&src->lock, flags); } =20 +void try_to_unmap_flush_takeoff(void) +{ + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; + + if (!tlb_ubc_takeoff->flush_required) + return; + + arch_tlbbatch_flush(&tlb_ubc_takeoff->arch); + + /* + * Now that tlb shootdown of tlb_ubc_takeoff has been performed, + * it's good chance to shrink tlb_ubc if possible. + */ + if (arch_tlbbatch_done(&tlb_ubc->arch, &tlb_ubc_takeoff->arch)) + reset_batch(tlb_ubc); + + reset_batch(tlb_ubc_takeoff); +} + /* * Flush TLB entries for recently unmapped pages from remote CPUs. 
It is * important if a PTE was dirty when it was unmapped that it's flushed --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id A29161E3DD6 for ; Thu, 20 Feb 2025 05:36:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029766; cv=none; b=GawVxYK+zR7j+pE8yhWCoOd8TAw+Ow5AFaliT3CPVvEyDAc9G9mG3FK4EcXHk8qagkngujv8XNV79ve2m9NeIHcc80aOaoZF9o30OdkamsDGjWGBBbLV6nbEA/nKHTfVdOn3zsxWOSmairFPrV4b7w+6aMoGRYPWCtHe6MtCh4Q= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029766; c=relaxed/simple; bh=7z4c/azJKKPHTuQ4rKrS5+vLrWnw7FZpUNSGCSasrMM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=PvJeLuD+SOU+9QJo4GCd8I7ygqMKB1omfzF+u6rHE45ASRWkfOFeEIURHmcv2BOgZRo7EoEbrtLdf93L5OeJON7WNFTVEOM6XER8MMaU/GkeEw5oXusEdb2q8mFDa79LxHgGF3EzNjVAuWYtt0KJQL8fJQK64+21tYQXIQp9DaQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-e0-67b6bba66069 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 10/26] mm: introduce APIs to check if the page allocation is tlb shootdownable Date: Thu, 20 Feb 2025 14:20:11 +0900 Message-Id: <20250220052027.58847-11-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrGLMWRmVeSWpSXmKPExsXC9ZZnke6y3dvSDWZ+0bGYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ N57sYin4aVyx6vhV1gbG41pdjJwcEgImEls7G5lg7G2vrjCD2GwC6hI3bvwEs0UEzCQOtv5h B7GZBe4ySRzoZwOxhQXSJH7PmMQCYrMIqEq0bJoAZvMC1f9+soAVYqa8xOoNB8DmcALFf8zo BesVEjCVeLfgEtBeLqCa92wSs+92Qh0hKXFwxQ2WCYy8CxgZVjEKZeaV5SZm5pjoZVTmZVbo JefnbmIEhv6y2j/ROxg/XQg+xCjAwajEwzujdVu6EGtiWXFl7iFGCQ5mJRHetvot6UK8KYmV ValF+fFFpTmpxYcYpTlYlMR5jb6VpwgJpCeWpGanphakFsFkmTg4pRoYp9RLvdDJNF78MEbk HLP+r1fMKenVZxlEuVUvH3u2XHUG053ERflFFfdlf7Ku+DZPZGrpSecC/n1TA3i0b8V0F8xy uKpR7vg5QGLFmrifb+QSz7bPTvG98Gpmcmhd2oOps+IeSQnt7t3rPN2N/d7lgqf3L2v6XL03 /9fUCPZ7sa6rn5QeE6pVVmIpzkg01GIuKk4EAEnlmJR5AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrLLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g0VvlCzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlXHjyS6Wgp/GFauOX2VtYDyu1cXIySEhYCKx7dUVZhCbTUBd4saNn2C2iICZ xMHWP+wgNrPAXSaJA/1sILawQJrE7xmTWEBsFgFViZZNE8BsXqD6308WsELMlJdYveEA2BxO 
oPiPGb1gvUICphLvFlximsDItYCRYRWjSGZeWW5iZo6pXnF2RmVeZoVecn7uJkZgIC+r/TNx B+OXy+6HGAU4GJV4eB883pouxJpYVlyZe4hRgoNZSYS3rX5LuhBvSmJlVWpRfnxRaU5q8SFG aQ4WJXFer/DUBCGB9MSS1OzU1ILUIpgsEwenVAOj1Knmcj6lN6yen68f2Rr47sFtX+ubN1pz zE0WrGzM0DbZemdqp5+7vuonVyfp+vX8p32yN/1apSYS8kHNzyJY4qd8kv+nTb0pTTJlPBei ktwCJzrnXpU4elP9RvCdk6dmSj+72c67naFhdk5QYfvez0++JC/tudS6yfOa+b1bW412lE5R f7hMiaU4I9FQi7moOBEAyKhgsmACAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that should indentify if tlb shootdown can be performed on page allocation. In a context with irq disabled or non-task, tlb shootdown cannot be performed because of deadlock issue. Thus, page allocator should work being aware of whether tlb shootdown can be performed on returning page. This patch introduced APIs that pcp or buddy page allocator can use to delimit the critical sections taking off pages and indentify whether tlb shootdown can be performed. Signed-off-by: Byungchul Park --- include/linux/sched.h | 5 ++ mm/internal.h | 14 ++++ mm/page_alloc.c | 159 ++++++++++++++++++++++++++++++++++++++++++ mm/rmap.c | 2 +- 4 files changed, 179 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8e6e7a83332cf..c4ff83e1d5953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,6 +1374,11 @@ struct task_struct { struct callback_head cid_work; #endif =20 +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) + int luf_no_shootdown; + int luf_takeoff_started; +#endif + struct tlbflush_unmap_batch tlb_ubc; struct tlbflush_unmap_batch tlb_ubc_takeoff; =20 diff --git a/mm/internal.h b/mm/internal.h index cbdebf8a02437..55bc8ca0d6118 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1583,6 +1583,20 @@ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) +extern struct luf_batch luf_batch[]; +bool luf_takeoff_start(void); +void luf_takeoff_end(void); +bool luf_takeoff_no_shootdown(void); +bool luf_takeoff_check(struct page *page); +bool luf_takeoff_check_and_fold(struct page *page); +#else +static inline bool luf_takeoff_start(void) { return false; } +static inline void luf_takeoff_end(void) {} +static inline bool luf_takeoff_no_shootdown(void) { return true; } +static inline bool luf_takeoff_check(struct page *page) { return true; } +static inline bool luf_takeoff_check_and_fold(struct page *page) { return = true; } +#endif =20 /* pagewalk.c */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 244cb30496be5..cac2c95ca2430 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -622,6 +622,165 @@ compaction_capture(struct capture_control *capc, stru= ct page *page, } #endif /* CONFIG_COMPACTION */ =20 +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) +static bool no_shootdown_context(void) +{ + /* + * If it performs with irq disabled, that might cause a deadlock. + * Avoid tlb shootdown in this case. + */ + return !(!irqs_disabled() && in_task()); +} + +/* + * Can be called with zone lock released and irq enabled. 
+ */ +bool luf_takeoff_start(void) +{ + unsigned long flags; + bool no_shootdown =3D no_shootdown_context(); + + local_irq_save(flags); + + /* + * It's the outmost luf_takeoff_start(). + */ + if (!current->luf_takeoff_started) + VM_WARN_ON(current->luf_no_shootdown); + + /* + * current->luf_no_shootdown > 0 doesn't mean tlb shootdown is + * not allowed at all. However, it guarantees tlb shootdown is + * possible once current->luf_no_shootdown =3D=3D 0. It might look + * too conservative but for now do this way for simplity. + */ + if (no_shootdown || current->luf_no_shootdown) + current->luf_no_shootdown++; + + current->luf_takeoff_started++; + local_irq_restore(flags); + + return !no_shootdown; +} + +/* + * Should be called within the same context of luf_takeoff_start(). + */ +void luf_takeoff_end(void) +{ + unsigned long flags; + bool no_shootdown; + bool outmost =3D false; + + local_irq_save(flags); + VM_WARN_ON(!current->luf_takeoff_started); + + /* + * Assume the context and irq flags are same as those at + * luf_takeoff_start(). + */ + if (current->luf_no_shootdown) + current->luf_no_shootdown--; + + no_shootdown =3D !!current->luf_no_shootdown; + + current->luf_takeoff_started--; + + /* + * It's the outmost luf_takeoff_end(). + */ + if (!current->luf_takeoff_started) + outmost =3D true; + + local_irq_restore(flags); + + if (no_shootdown) + goto out; + + try_to_unmap_flush_takeoff(); +out: + if (outmost) + VM_WARN_ON(current->luf_no_shootdown); +} + +/* + * Can be called with zone lock released and irq enabled. + */ +bool luf_takeoff_no_shootdown(void) +{ + bool no_shootdown =3D true; + unsigned long flags; + + local_irq_save(flags); + + /* + * No way. Delimit using luf_takeoff_{start,end}(). + */ + if (unlikely(!current->luf_takeoff_started)) { + VM_WARN_ON(1); + goto out; + } + no_shootdown =3D current->luf_no_shootdown; +out: + local_irq_restore(flags); + return no_shootdown; +} + +/* + * Should be called with either zone lock held and irq disabled or pcp + * lock held. + */ +bool luf_takeoff_check(struct page *page) +{ + unsigned short luf_key =3D page_luf_key(page); + + /* + * No way. Delimit using luf_takeoff_{start,end}(). + */ + if (unlikely(!current->luf_takeoff_started)) { + VM_WARN_ON(1); + return false; + } + + if (!luf_key) + return true; + + return !current->luf_no_shootdown; +} + +/* + * Should be called with either zone lock held and irq disabled or pcp + * lock held. + */ +bool luf_takeoff_check_and_fold(struct page *page) +{ + struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; + unsigned short luf_key =3D page_luf_key(page); + struct luf_batch *lb; + unsigned long flags; + + /* + * No way. Delimit using luf_takeoff_{start,end}(). + */ + if (unlikely(!current->luf_takeoff_started)) { + VM_WARN_ON(1); + return false; + } + + if (!luf_key) + return true; + + if (current->luf_no_shootdown) + return false; + + lb =3D &luf_batch[luf_key]; + read_lock_irqsave(&lb->lock, flags); + fold_batch(tlb_ubc_takeoff, &lb->batch, false); + read_unlock_irqrestore(&lb->lock, flags); + return true; +} +#endif + static inline void account_freepages(struct zone *zone, int nr_pages, int migratetype) { diff --git a/mm/rmap.c b/mm/rmap.c index 72c5e665e59a4..1581b1a00f974 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -693,7 +693,7 @@ void fold_batch(struct tlbflush_unmap_batch *dst, /* * Use 0th entry as accumulated batch. 
*/ -static struct luf_batch luf_batch[NR_LUF_BATCH]; +struct luf_batch luf_batch[NR_LUF_BATCH]; =20 static void luf_batch_init(struct luf_batch *lb) { --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 3B2BB1E411C for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029767; cv=none; b=e3Lg35TSzWnVEAYEmFhzqy8KI/sbkiNxfmUKQXWSfdLu1fDbBacnUWTsh/2KKHvsW+DSDrT2Uqd6eS3UbhsVEsXOtGPMbSYKzZ5zOcLk/+ZprtGcA2MM9HM/taEMaU9opQZb8/8jcjRnRIy3BcT8UhwJzdiYFhr5ii6lLczVb4g= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029767; c=relaxed/simple; bh=xsHN83I7sYhT0LId4D+/9zqGo2XyB2ZkUuHcLOADkN4=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=BT2vXQfralWyjwvaopuNDPHt3p1IFJe8AM9uuGAoBYVv2OGwcNGNmfotld8XDIBwygXR5xldcfJ6vXx7IqAPqP0P07AS86IbymqCg758K8BrwV3YZF/b7d5lX1P+5E4Fz93LqoduNLL+lq9T11szf935xTNz6ObxcYyBW9K12jY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-e5-67b6bba63df4 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 11/26] mm: deliver luf_key to pcp or buddy on free after unmapping Date: Thu, 20 Feb 2025 14:20:12 +0900 Message-Id: <20250220052027.58847-12-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrGLMWRmVeSWpSXmKPExsXC9ZZnoe6y3dvSDe5tMbCYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ vd82shUsb2esmNrdwtbAuDS3i5GTQ0LARGLi79NMMPaKpmuMIDabgLrEjRs/mUFsEQEziYOt f9hBbGaBu0wSB/rZQGxhgSiJZ18/sYDYLAKqElN+tQDVcHDwAtX3rMuAGCkvsXrDAbAxnEDh HzN6wVqFBEwl3i24BLSWC6jmM5vElo19zBANkhIHV9xgmcDIu4CRYRWjUGZeWW5iZo6JXkZl XmaFXnJ+7iZGYOgvq/0TvYPx04XgQ4wCHIxKPLwzWrelC7EmlhVX5h5ilOBgVhLhbavfki7E m5JYWZValB9fVJqTWnyIUZqDRUmc1+hbeYqQQHpiSWp2ampBahFMlomDU6qBMXnmpRM6aZlm j5xl2bZV/gtaMClEdGbDi4ePT3x6/qDtoNzhgzuvvNXVrvvxTCm6XbRea3LlrR8iQjc3/Fmb UOr/bM3aiXUr8nJvz0+cuG/V/jOMz7Zs61nxbGvHlV0Lkp9L3LTOaXDNk7l06rqcW/7FXFPv Z/v5VZc+y5i2WqS+ivdV2MHZdYpKLMUZiYZazEXFiQCY1sPSeQIAAA== X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrLLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g8ZLqhZz1q9hs/i84R+b xYsN7YwWX9f/YrZ4+qmPxeLw3JOsFpd3zWGzuLfmP6vF+V1rWS12LN3HZHHpwAImi+O9B5gs 5t/7zGaxedNUZovjU6YyWvz+AVR8ctZkFgdBj++tfSweO2fdZfdYsKnUY/MKLY/Fe14yeWxa 1cnmsenTJHaPd+fOsXucmPGbxWPeyUCP9/uusnksfvGByWPrLzuPxqnX2Dw+b5IL4I/isklJ zcksSy3St0vgyuj9tpGtYHk7Y8XU7ha2BsaluV2MnBwSAiYSK5quMYLYbALqEjdu/GQGsUUE 
zCQOtv5hB7GZBe4ySRzoZwOxhQWiJJ59/cQCYrMIqEpM+dUCVMPBwQtU37MuA2KkvMTqDQfA xnAChX/M6AVrFRIwlXi34BLTBEauBYwMqxhFMvPKchMzc0z1irMzKvMyK/SS83M3MQIDeVnt n4k7GL9cdj/EKMDBqMTD++Dx1nQh1sSy4srcQ4wSHMxKIrxt9VvShXhTEiurUovy44tKc1KL DzFKc7AoifN6hacmCAmkJ5akZqemFqQWwWSZODilGhjdRPmd/t9OuqkS7a9ccOAc09F/Hy4E HvtpzrVyvfKdBJddIp63Qg/MnHJNm/H00bun4lduuX4yMu+7SkXb1v11M0o26tfc/3Jmj8e7 2uYsr6AD257sXsne+XR3SvfvsHf9Lh63eTaln1jzdf46CeuNjnK1KlUXfsSrtWRMj/ok2XaX Me/aFq1SJZbijERDLeai4kQAPKqb5WACAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that needs to pass luf_key to pcp or buddy allocator on free after unmapping e.g. during page reclaim or page migration. The luf_key will be used to track need of tlb shootdown and which cpus need to perform tlb flush, per page residing in pcp or buddy, and should be handed over properly when pages travel between pcp and buddy. Signed-off-by: Byungchul Park --- mm/internal.h | 4 +- mm/page_alloc.c | 120 ++++++++++++++++++++++++++++++++------------ mm/page_isolation.c | 6 +++ mm/page_reporting.c | 6 +++ mm/swap.c | 4 +- mm/vmscan.c | 8 +-- 6 files changed, 109 insertions(+), 39 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 55bc8ca0d6118..2bb54bc04260b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -741,8 +741,8 @@ extern bool free_pages_prepare(struct page *page, unsig= ned int order); =20 extern int user_min_free_kbytes; =20 -void free_unref_page(struct page *page, unsigned int order); -void free_unref_folios(struct folio_batch *fbatch); +void free_unref_page(struct page *page, unsigned int order, unsigned short= luf_key); +void free_unref_folios(struct folio_batch *fbatch, unsigned short luf_key); =20 extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cac2c95ca2430..05a1098f8c61f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -212,7 +212,7 @@ unsigned int pageblock_order __read_mostly; #endif =20 static void __free_pages_ok(struct page *page, unsigned int order, - fpi_t fpi_flags); + fpi_t fpi_flags, unsigned short luf_key); =20 /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -850,8 +850,13 @@ static inline void __del_page_from_free_list(struct pa= ge *page, struct zone *zon =20 list_del(&page->buddy_list); __ClearPageBuddy(page); - set_page_private(page, 0); zone->free_area[order].nr_free--; + + /* + * Keep head page's private until post_alloc_hook(). + * + * XXX: Tail pages' private doesn't get cleared. + */ } =20 static inline void del_page_from_free_list(struct page *page, struct zone = *zone, @@ -920,7 +925,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long bud= dy_pfn, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype, fpi_t fpi_flags) + int migratetype, fpi_t fpi_flags, unsigned short luf_key) { struct capture_control *capc =3D task_capc(zone); unsigned long buddy_pfn =3D 0; @@ -937,10 +942,21 @@ static inline void __free_one_page(struct page *page, =20 account_freepages(zone, 1 << order, migratetype); =20 + /* + * Use the page's luf_key unchanged if luf_key =3D=3D 0. Worth + * noting that page_luf_key() will be 0 in most cases since it's + * initialized at free_pages_prepare(). 
+ */ + if (luf_key) + set_page_luf_key(page, luf_key); + else + luf_key =3D page_luf_key(page); + while (order < MAX_PAGE_ORDER) { int buddy_mt =3D migratetype; + unsigned short buddy_luf_key; =20 - if (compaction_capture(capc, page, order, migratetype)) { + if (!luf_key && compaction_capture(capc, page, order, migratetype)) { account_freepages(zone, -(1 << order), migratetype); return; } @@ -973,6 +989,18 @@ static inline void __free_one_page(struct page *page, else __del_page_from_free_list(buddy, zone, order, buddy_mt); =20 + /* + * !buddy_luf_key && !luf_key : do nothing + * buddy_luf_key && !luf_key : luf_key =3D buddy_luf_key + * !buddy_luf_key && luf_key : do nothing + * buddy_luf_key && luf_key : merge two into luf_key + */ + buddy_luf_key =3D page_luf_key(buddy); + if (buddy_luf_key && !luf_key) + luf_key =3D buddy_luf_key; + else if (buddy_luf_key && luf_key) + fold_luf_batch(&luf_batch[luf_key], &luf_batch[buddy_luf_key]); + if (unlikely(buddy_mt !=3D migratetype)) { /* * Match buddy type. This ensures that an @@ -984,6 +1012,7 @@ static inline void __free_one_page(struct page *page, =20 combined_pfn =3D buddy_pfn & pfn; page =3D page + (combined_pfn - pfn); + set_page_luf_key(page, luf_key); pfn =3D combined_pfn; order++; } @@ -1164,6 +1193,11 @@ __always_inline bool free_pages_prepare(struct page = *page, =20 VM_BUG_ON_PAGE(PageTail(page), page); =20 + /* + * Ensure private is zero before using it inside allocator. + */ + set_page_private(page, 0); + trace_mm_page_free(page, order); kmsan_free_page(page, order); =20 @@ -1329,7 +1363,8 @@ static void free_pcppages_bulk(struct zone *zone, int= count, count -=3D nr_pages; pcp->count -=3D nr_pages; =20 - __free_one_page(page, pfn, zone, order, mt, FPI_NONE); + __free_one_page(page, pfn, zone, order, mt, FPI_NONE, 0); + trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } @@ -1353,7 +1388,7 @@ static void split_large_buddy(struct zone *zone, stru= ct page *page, while (pfn !=3D end) { int mt =3D get_pfnblock_migratetype(page, pfn); =20 - __free_one_page(page, pfn, zone, order, mt, fpi); + __free_one_page(page, pfn, zone, order, mt, fpi, 0); pfn +=3D 1 << order; page =3D pfn_to_page(pfn); } @@ -1361,11 +1396,18 @@ static void split_large_buddy(struct zone *zone, st= ruct page *page, =20 static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, - fpi_t fpi_flags) + fpi_t fpi_flags, unsigned short luf_key) { unsigned long flags; =20 spin_lock_irqsave(&zone->lock, flags); + + /* + * valid luf_key can be passed only if order =3D=3D 0. + */ + VM_WARN_ON(luf_key && order); + set_page_luf_key(page, luf_key); + split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); =20 @@ -1373,13 +1415,13 @@ static void free_one_page(struct zone *zone, struct= page *page, } =20 static void __free_pages_ok(struct page *page, unsigned int order, - fpi_t fpi_flags) + fpi_t fpi_flags, unsigned short luf_key) { unsigned long pfn =3D page_to_pfn(page); struct zone *zone =3D page_zone(page); =20 if (free_pages_prepare(page, order)) - free_one_page(zone, page, pfn, order, fpi_flags); + free_one_page(zone, page, pfn, order, fpi_flags, luf_key); } =20 void __meminit __free_pages_core(struct page *page, unsigned int order, @@ -1433,7 +1475,7 @@ void __meminit __free_pages_core(struct page *page, u= nsigned int order, * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. 
*/ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL, 0); } =20 /* @@ -2459,6 +2501,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned = int order, if (unlikely(page =3D=3D NULL)) break; =20 + /* + * Keep the page's luf_key. + */ + /* * Split buddy pages returned by expand() are received here in * physical page order. The page is added to the tail of @@ -2740,12 +2786,14 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, s= truct zone *zone, =20 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages= *pcp, struct page *page, int migratetype, - unsigned int order) + unsigned int order, unsigned short luf_key) { int high, batch; int pindex; bool free_high =3D false; =20 + set_page_luf_key(page, luf_key); + /* * On freeing, reduce the number of pages that are batch allocated. * See nr_pcp_alloc() where alloc_factor is increased for subsequent @@ -2754,7 +2802,16 @@ static void free_unref_page_commit(struct zone *zone= , struct per_cpu_pages *pcp, pcp->alloc_factor >>=3D 1; __count_vm_events(PGFREE, 1 << order); pindex =3D order_to_pindex(migratetype, order); - list_add(&page->pcp_list, &pcp->lists[pindex]); + + /* + * Defer tlb shootdown as much as possible by putting luf'd + * pages to the tail. + */ + if (luf_key) + list_add_tail(&page->pcp_list, &pcp->lists[pindex]); + else + list_add(&page->pcp_list, &pcp->lists[pindex]); + pcp->count +=3D 1 << order; =20 batch =3D READ_ONCE(pcp->batch); @@ -2789,7 +2846,8 @@ static void free_unref_page_commit(struct zone *zone,= struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_unref_page(struct page *page, unsigned int order, + unsigned short luf_key) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2798,7 +2856,7 @@ void free_unref_page(struct page *page, unsigned int = order) int migratetype; =20 if (!pcp_allowed_order(order)) { - __free_pages_ok(page, order, FPI_NONE); + __free_pages_ok(page, order, FPI_NONE, luf_key); return; } =20 @@ -2815,7 +2873,7 @@ void free_unref_page(struct page *page, unsigned int = order) migratetype =3D get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >=3D MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE, luf_key); return; } migratetype =3D MIGRATE_MOVABLE; @@ -2825,10 +2883,10 @@ void free_unref_page(struct page *page, unsigned in= t order) pcp_trylock_prepare(UP_flags); pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_unref_page_commit(zone, pcp, page, migratetype, order, luf_key); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE, luf_key); } pcp_trylock_finish(UP_flags); } @@ -2836,7 +2894,7 @@ void free_unref_page(struct page *page, unsigned int = order) /* * Free a batch of folios */ -void free_unref_folios(struct folio_batch *folios) +void free_unref_folios(struct folio_batch *folios, unsigned short luf_key) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp =3D NULL; @@ -2857,7 +2915,7 @@ void free_unref_folios(struct folio_batch *folios) */ if (!pcp_allowed_order(order)) { free_one_page(folio_zone(folio), &folio->page, - pfn, order, FPI_NONE); + pfn, order, FPI_NONE, luf_key); continue; } folio->private =3D (void *)(unsigned 
long)order; @@ -2893,7 +2951,7 @@ void free_unref_folios(struct folio_batch *folios) */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, - order, FPI_NONE); + order, FPI_NONE, luf_key); continue; } =20 @@ -2906,7 +2964,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, pfn, - order, FPI_NONE); + order, FPI_NONE, luf_key); continue; } locked_zone =3D zone; @@ -2921,7 +2979,7 @@ void free_unref_folios(struct folio_batch *folios) =20 trace_mm_page_free_batched(&folio->page); free_unref_page_commit(zone, pcp, &folio->page, migratetype, - order); + order, luf_key); } =20 if (pcp) { @@ -3013,7 +3071,7 @@ void __putback_isolated_page(struct page *page, unsig= ned int order, int mt) =20 /* Return isolated page to tail of freelist. */ __free_one_page(page, page_to_pfn(page), zone, order, mt, - FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL, 0); } =20 /* @@ -4983,11 +5041,11 @@ void __free_pages(struct page *page, unsigned int o= rder) struct alloc_tag *tag =3D pgalloc_tag_get(page); =20 if (put_page_testzero(page)) - free_unref_page(page, order); + free_unref_page(page, order, 0); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order, 0); } } EXPORT_SYMBOL(__free_pages); @@ -5049,7 +5107,7 @@ void __page_frag_cache_drain(struct page *page, unsig= ned int count) VM_BUG_ON_PAGE(page_ref_count(page) =3D=3D 0, page); =20 if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_unref_page(page, compound_order(page), 0); } EXPORT_SYMBOL(__page_frag_cache_drain); =20 @@ -5090,7 +5148,7 @@ void *__page_frag_alloc_align(struct page_frag_cache = *nc, goto refill; =20 if (unlikely(nc->pfmemalloc)) { - free_unref_page(page, compound_order(page)); + free_unref_page(page, compound_order(page), 0); goto refill; } =20 @@ -5134,7 +5192,7 @@ void page_frag_free(void *addr) struct page *page =3D virt_to_head_page(addr); =20 if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_unref_page(page, compound_order(page), 0); } EXPORT_SYMBOL(page_frag_free); =20 @@ -5154,7 +5212,7 @@ static void *make_alloc_exact(unsigned long addr, uns= igned int order, =20 last =3D page + (1UL << order); for (page +=3D nr; page < last; page++) - __free_pages_ok(page, 0, FPI_TO_TAIL); + __free_pages_ok(page, 0, FPI_TO_TAIL, 0); } return (void *)addr; } @@ -7124,7 +7182,7 @@ bool put_page_back_buddy(struct page *page) int migratetype =3D get_pfnblock_migratetype(page, pfn); =20 ClearPageHWPoisonTakenOff(page); - __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE, 0); if (TestClearPageHWPoison(page)) { ret =3D true; } @@ -7193,7 +7251,7 @@ static void __accept_page(struct zone *zone, unsigned= long *flags, =20 accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); =20 - __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL, 0); =20 if (last) static_branch_dec(&zones_with_unaccepted_pages); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7e04047977cfe..8467838d4dbc8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -258,6 +258,12 @@ static void unset_migratetype_isolate(struct page *pag= e, int migratetype) 
WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); } else { set_pageblock_migratetype(page, migratetype); + + /* + * Do not clear the page's private to keep its luf_key + * unchanged. + */ + __putback_isolated_page(page, order, migratetype); } zone->nr_isolate_pageblock--; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index e4c428e61d8c1..c05afb7a395f1 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -116,6 +116,12 @@ page_reporting_drain(struct page_reporting_dev_info *p= rdev, int mt =3D get_pageblock_migratetype(page); unsigned int order =3D get_order(sg->length); =20 + /* + * Ensure private is zero before putting into the + * allocator. + */ + set_page_private(page, 0); + __putback_isolated_page(page, order, mt); =20 /* If the pages were not reported due to error skip flagging */ diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa17..54b0ba10dbb86 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_unref_page(&folio->page, folio_order(folio), 0); } EXPORT_SYMBOL(__folio_put); =20 @@ -959,7 +959,7 @@ void folios_put_refs(struct folio_batch *folios, unsign= ed int *refs) =20 folios->nr =3D j; mem_cgroup_uncharge_folios(folios); - free_unref_folios(folios); + free_unref_folios(folios, 0); } EXPORT_SYMBOL(folios_put_refs); =20 diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e38..2970a8f35d3d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1480,7 +1480,7 @@ static unsigned int shrink_folio_list(struct list_hea= d *folio_list, if (folio_batch_add(&free_folios, folio) =3D=3D 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_folios(&free_folios); + free_unref_folios(&free_folios, 0); } continue; =20 @@ -1548,7 +1548,7 @@ static unsigned int shrink_folio_list(struct list_hea= d *folio_list, =20 mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_folios(&free_folios); + free_unref_folios(&free_folios, 0); =20 list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1868,7 +1868,7 @@ static unsigned int move_folios_to_lru(struct lruvec = *lruvec, if (folio_batch_add(&free_folios, folio) =3D=3D 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); - free_unref_folios(&free_folios); + free_unref_folios(&free_folios, 0); spin_lock_irq(&lruvec->lru_lock); } =20 @@ -1890,7 +1890,7 @@ static unsigned int move_folios_to_lru(struct lruvec = *lruvec, if (free_folios.nr) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); - free_unref_folios(&free_folios); + free_unref_folios(&free_folios, 0); spin_lock_irq(&lruvec->lru_lock); } =20 --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 4BB571E47A9 for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; cv=none; b=NFKC/iYTacEKZ150jhWX8xKPL45Dtk2Oww/twYoxfpE5pmwP0r5Pl2IIgLB0c2b6FugwQKKM53DEj/ptci4Nhwiq9Fl9bUlUYt24Bodz/v/aGqIycNoXhNtKv+SlH4vUdiS6JnyRowmog6cRUVue3Da1QVIA7E1WRMu9OHQDJzo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; c=relaxed/simple; 
bh=KdT8AtGK+qYpDffSacrYcymV9M+2Gfbhm4h0ypoJVUU=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=KipQiXNiVObNvKKld8daUQuL28ytufRxzg4MzcEOK/snn9i6R2fdq6ro42FolO3VRNVH1vAr7f/4tgkK89u+SPTVY2xgvj5hZK3+RbDc45Qm5p+w6GdzHqUeYeupmNUT9KbFshk+eK2cpxbYfx+lVBaDhYz+ZB8EUSE6RqIY4B4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-ea-67b6bba63180 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 12/26] mm: delimit critical sections to take off pages from pcp or buddy alloctor Date: Thu, 20 Feb 2025 14:20:13 +0900 Message-Id: <20250220052027.58847-13-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrGLMWRmVeSWpSXmKPExsXC9ZZnoe6y3dvSDR4sMrKYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ x5f/YyrY2cxYsXzPO5YGxslZXYycHBICJhJ/eu+wwthfzixhB7HZBNQlbtz4yQxiiwiYSRxs /QMWZxa4yyRxoJ8NxBYWyJTYtriHEcRmEVCVaD3xEqyeF6j+SEcHG8RMeYnVGw6AxTmB4j9m 9ILFhQRMJd4tuMQEUfOeTWLbzlAIW1Li4IobLBMYeRcwMqxiFMrMK8tNzMwx0cuozMus0EvO z93ECAz9ZbV/oncwfroQfIhRgINRiYd3Ruu2dCHWxLLiytxDjBIczEoivG31W9KFeFMSK6tS i/Lji0pzUosPMUpzsCiJ8xp9K08REkhPLEnNTk0tSC2CyTJxcEo1MDrdv2l+UXvJvPWyoksb XniXPWXyC4rILqhYlH+G0+P/2vIp4txqi74X+f2yNWKdkN30+mzZ9y9HGvKaZp4zEOvuOV8Z JL8vcFUX8/ZpTR8mW5648e+9ZCtH6YN/B613FbJHKn1lM/r+TF7vpPLUP0+NazZ2ddmy+tc1 GfgZsgevYltUJBD9R4mlOCPRUIu5qDgRABKViE55AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrHLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g4NbNSzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlXF8+T+mgp3NjBXL97xjaWCcnNXFyMkhIWAi8eXMEnYQm01AXeLGjZ/MILaI gJnEwdY/YHFmgbtMEgf62UBsYYFMiW2LexhBbBYBVYnWEy/B6nmB6o90dLBBzJSXWL3hAFic Eyj+Y0YvWFxIwFTi3YJLTBMYuRYwMqxiFMnMK8tNzMwx1SvOzqjMy6zQS87P3cQIDOVltX8m 7mD8ctn9EKMAB6MSD++Dx1vThVgTy4orcw8xSnAwK4nwttVvSRfiTUmsrEotyo8vKs1JLT7E KM3BoiTO6xWemiAkkJ5YkpqdmlqQWgSTZeLglGpgnHt7wvJP0y7922u/c6misKL3zsNL+HUC PuXz19zbxBT/7IlNSuW05S33xPas1/n3Vj7QX65Qg2UhM+epsqDD7Eud3mjEqarOWTrrWLGZ /zqR7UlqAoL9sw2aUkLUs3qkWdPqkt23/Pl6/8mkE8e/GCpcWvFj+kTLCzvn2S1R2P1E9xzb HoGH0kosxRmJhlrMRcWJAIisZQVhAgAA X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Now that luf mechanism has been introduced, tlb shootdown might be necessary when luf'd pages exit from pcp or buddy allocator. 
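As a rough usage sketch only, not part of the patch itself, the callers
converted below are expected to bracket the take-off of a page with the
helpers introduced earlier in this series (surrounding declarations and
error paths omitted for brevity):

	luf_takeoff_start();
	spin_lock_irqsave(&zone->lock, flags);

	/* Is this context allowed to take a luf'd page off? */
	if (luf_takeoff_check(page)) {
		del_page_from_free_list(page, zone, order, mt);

		/* Fold the page's pending shootdown into tlb_ubc_takeoff. */
		if (!luf_takeoff_check_and_fold(page))
			VM_WARN_ON(1);
	}

	spin_unlock_irqrestore(&zone->lock, flags);

	/* Performs try_to_unmap_flush_takeoff() when allowed. */
	luf_takeoff_end();

The page taken off must not be used until luf_takeoff_end() has
returned, since that is where the deferred tlb shootdown is actually
performed.
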
Check if it's okay to take off pages and can perform for luf'd pages before use. Signed-off-by: Byungchul Park --- mm/compaction.c | 32 ++++++++++++++++-- mm/internal.h | 2 +- mm/page_alloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++-- mm/page_isolation.c | 4 ++- mm/page_reporting.c | 20 +++++++++++- 5 files changed, 129 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6009f5d1021a6..90f5c34f333db 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -605,6 +605,7 @@ static unsigned long isolate_freepages_block(struct com= pact_control *cc, =20 page =3D pfn_to_page(blockpfn); =20 + luf_takeoff_start(); /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn +=3D stride, page +=3D stride) { int isolated; @@ -652,9 +653,12 @@ static unsigned long isolate_freepages_block(struct co= mpact_control *cc, goto isolate_fail; } =20 + if (!luf_takeoff_check(page)) + goto isolate_fail; + /* Found a free page, will break it into order-0 pages */ order =3D buddy_order(page); - isolated =3D __isolate_free_page(page, order); + isolated =3D __isolate_free_page(page, order, false); if (!isolated) break; set_page_private(page, order); @@ -682,6 +686,11 @@ static unsigned long isolate_freepages_block(struct co= mpact_control *cc, if (locked) spin_unlock_irqrestore(&cc->zone->lock, flags); =20 + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); + /* * Be careful to not go outside of the pageblock. */ @@ -1589,6 +1598,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) if (!area->nr_free) continue; =20 + luf_takeoff_start(); spin_lock_irqsave(&cc->zone->lock, flags); freelist =3D &area->free_list[MIGRATE_MOVABLE]; list_for_each_entry_reverse(freepage, freelist, buddy_list) { @@ -1596,6 +1606,10 @@ static void fast_isolate_freepages(struct compact_co= ntrol *cc) =20 order_scanned++; nr_scanned++; + + if (!luf_takeoff_check(freepage)) + goto scan_next; + pfn =3D page_to_pfn(freepage); =20 if (pfn >=3D highest) @@ -1615,7 +1629,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) /* Shorten the scan if a candidate is found */ limit >>=3D 1; } - +scan_next: if (order_scanned >=3D limit) break; } @@ -1633,7 +1647,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) =20 /* Isolate the page if available */ if (page) { - if (__isolate_free_page(page, order)) { + if (__isolate_free_page(page, order, false)) { set_page_private(page, order); nr_isolated =3D 1 << order; nr_scanned +=3D nr_isolated - 1; @@ -1650,6 +1664,11 @@ static void fast_isolate_freepages(struct compact_co= ntrol *cc) =20 spin_unlock_irqrestore(&cc->zone->lock, flags); =20 + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); + /* Skip fast search if enough freepages isolated */ if (cc->nr_freepages >=3D cc->nr_migratepages) break; @@ -2369,7 +2388,14 @@ static enum compact_result compact_finished(struct c= ompact_control *cc) { int ret; =20 + /* + * luf_takeoff_{start,end}() is required to identify whether + * this compaction context is tlb shootdownable for luf'd pages. 
+ */ + luf_takeoff_start(); ret =3D __compact_finished(cc); + luf_takeoff_end(); + trace_mm_compaction_finished(cc->zone, cc->order, ret); if (ret =3D=3D COMPACT_NO_SUITABLE_PAGE) ret =3D COMPACT_CONTINUE; diff --git a/mm/internal.h b/mm/internal.h index 2bb54bc04260b..3a6da77d04ed3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -662,7 +662,7 @@ static inline void clear_zone_contiguous(struct zone *z= one) zone->contiguous =3D false; } =20 -extern int __isolate_free_page(struct page *page, unsigned int order); +extern int __isolate_free_page(struct page *page, unsigned int order, bool= willputback); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05a1098f8c61f..f2ea69596ff15 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -869,8 +869,13 @@ static inline void del_page_from_free_list(struct page= *page, struct zone *zone, static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { - return list_first_entry_or_null(&area->free_list[migratetype], + struct page *page =3D list_first_entry_or_null(&area->free_list[migratety= pe], struct page, buddy_list); + + if (page && luf_takeoff_check(page)) + return page; + + return NULL; } =20 /* @@ -1579,6 +1584,8 @@ static __always_inline void page_del_and_expand(struc= t zone *zone, int nr_pages =3D 1 << high; =20 __del_page_from_free_list(page, zone, high, migratetype); + if (unlikely(!luf_takeoff_check_and_fold(page))) + VM_WARN_ON(1); nr_pages -=3D expand(zone, page, low, high, migratetype); account_freepages(zone, -nr_pages, migratetype); } @@ -1950,6 +1957,13 @@ bool move_freepages_block_isolate(struct zone *zone,= struct page *page, =20 del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); + + /* + * No need to luf_takeoff_check_and_fold() since it's + * going back to buddy. luf_key will be handed over in + * split_large_buddy(). + */ + set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; @@ -1961,6 +1975,13 @@ bool move_freepages_block_isolate(struct zone *zone,= struct page *page, =20 del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); + + /* + * No need to luf_takeoff_check_and_fold() since it's + * going back to buddy. luf_key will be handed over in + * split_large_buddy(). + */ + set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; @@ -2085,6 +2106,8 @@ steal_suitable_fallback(struct zone *zone, struct pag= e *page, unsigned int nr_added; =20 del_page_from_free_list(page, zone, current_order, block_type); + if (unlikely(!luf_takeoff_check_and_fold(page))) + VM_WARN_ON(1); change_pageblock_range(page, current_order, start_type); nr_added =3D expand(zone, page, order, current_order, start_type); account_freepages(zone, nr_added, start_type); @@ -2165,6 +2188,9 @@ int find_suitable_fallback(struct free_area *area, un= signed int order, if (free_area_empty(area, fallback_mt)) continue; =20 + if (luf_takeoff_no_shootdown()) + continue; + if (can_steal_fallback(order, migratetype)) *can_steal =3D true; =20 @@ -2256,6 +2282,11 @@ static bool unreserve_highatomic_pageblock(const str= uct alloc_context *ac, pageblock_nr_pages) continue; =20 + /* + * luf_takeoff_{start,end}() is required for + * get_page_from_free_area() to use luf_takeoff_check(). 
+ */ + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); for (order =3D 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area =3D &(zone->free_area[order]); @@ -2313,10 +2344,12 @@ static bool unreserve_highatomic_pageblock(const st= ruct alloc_context *ac, WARN_ON_ONCE(ret =3D=3D -1); if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); + luf_takeoff_end(); return ret; } } spin_unlock_irqrestore(&zone->lock, flags); + luf_takeoff_end(); } =20 return false; @@ -2494,6 +2527,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned i= nt order, unsigned long flags; int i; =20 + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); for (i =3D 0; i < count; ++i) { struct page *page =3D __rmqueue(zone, order, migratetype, @@ -2518,6 +2552,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned = int order, list_add_tail(&page->pcp_list, list); } spin_unlock_irqrestore(&zone->lock, flags); + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); =20 return i; } @@ -3012,7 +3050,7 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); =20 -int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order, bool willpu= tback) { struct zone *zone =3D page_zone(page); int mt =3D get_pageblock_migratetype(page); @@ -3031,6 +3069,8 @@ int __isolate_free_page(struct page *page, unsigned i= nt order) } =20 del_page_from_free_list(page, zone, order, mt); + if (unlikely(!willputback && !luf_takeoff_check_and_fold(page))) + VM_WARN_ON(1); =20 /* * Set the pageblock if the isolated page is at least half of a @@ -3110,6 +3150,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zon= e, struct zone *zone, =20 do { page =3D NULL; + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); if (alloc_flags & ALLOC_HIGHATOMIC) page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); @@ -3127,10 +3168,15 @@ struct page *rmqueue_buddy(struct zone *preferred_z= one, struct zone *zone, =20 if (!page) { spin_unlock_irqrestore(&zone->lock, flags); + luf_takeoff_end(); return NULL; } } spin_unlock_irqrestore(&zone->lock, flags); + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); } while (check_new_pages(page, order)); =20 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3214,6 +3260,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, uns= igned int order, } =20 page =3D list_first_entry(list, struct page, pcp_list); + if (!luf_takeoff_check_and_fold(page)) + return NULL; list_del(&page->pcp_list); pcp->count -=3D 1 << order; } while (check_new_pages(page, order)); @@ -3231,11 +3279,13 @@ static struct page *rmqueue_pcplist(struct zone *pr= eferred_zone, struct page *page; unsigned long __maybe_unused UP_flags; =20 + luf_takeoff_start(); /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { pcp_trylock_finish(UP_flags); + luf_takeoff_end(); return NULL; } =20 @@ -3249,6 +3299,10 @@ static struct page *rmqueue_pcplist(struct zone *pre= ferred_zone, page =3D __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, li= st); pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + /* + * Check and flush before using the pages taken off. 
+ */ + luf_takeoff_end(); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -4853,6 +4907,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int = preferred_nid, if (unlikely(!zone)) goto failed; =20 + luf_takeoff_start(); /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); @@ -4891,6 +4946,10 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int= preferred_nid, =20 pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); =20 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); @@ -4900,6 +4959,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int = preferred_nid, =20 failed_irq: pcp_trylock_finish(UP_flags); + luf_takeoff_end(); =20 failed: page =3D __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); @@ -7036,6 +7096,7 @@ unsigned long __offline_isolated_pages(unsigned long = start_pfn, =20 offline_mem_sections(pfn, end_pfn); zone =3D page_zone(pfn_to_page(pfn)); + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); while (pfn < end_pfn) { page =3D pfn_to_page(pfn); @@ -7064,9 +7125,15 @@ unsigned long __offline_isolated_pages(unsigned long= start_pfn, VM_WARN_ON(get_pageblock_migratetype(page) !=3D MIGRATE_ISOLATE); order =3D buddy_order(page); del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); + if (unlikely(!luf_takeoff_check_and_fold(page))) + VM_WARN_ON(1); pfn +=3D (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); =20 return end_pfn - start_pfn - already_offline; } @@ -7142,6 +7209,7 @@ bool take_page_off_buddy(struct page *page) unsigned int order; bool ret =3D false; =20 + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); for (order =3D 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head =3D page - (pfn & ((1 << order) - 1)); @@ -7154,6 +7222,8 @@ bool take_page_off_buddy(struct page *page) =20 del_page_from_free_list(page_head, zone, page_order, migratetype); + if (unlikely(!luf_takeoff_check_and_fold(page_head))) + VM_WARN_ON(1); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); @@ -7164,6 +7234,11 @@ bool take_page_off_buddy(struct page *page) break; } spin_unlock_irqrestore(&zone->lock, flags); + + /* + * Check and flush before using the pages taken off. 
+ */ + luf_takeoff_end(); return ret; } =20 diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 8467838d4dbc8..eae33d188762b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -211,6 +211,7 @@ static void unset_migratetype_isolate(struct page *page= , int migratetype) struct page *buddy; =20 zone =3D page_zone(page); + luf_takeoff_start(); spin_lock_irqsave(&zone->lock, flags); if (!is_migrate_isolate_page(page)) goto out; @@ -229,7 +230,7 @@ static void unset_migratetype_isolate(struct page *page= , int migratetype) buddy =3D find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { - isolated_page =3D !!__isolate_free_page(page, order); + isolated_page =3D !!__isolate_free_page(page, order, true); /* * Isolating a free page in an isolated pageblock * is expected to always work as watermarks don't @@ -269,6 +270,7 @@ static void unset_migratetype_isolate(struct page *page= , int migratetype) zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + luf_takeoff_end(zone); } =20 static inline struct page * diff --git a/mm/page_reporting.c b/mm/page_reporting.c index c05afb7a395f1..03a7f5f6dc073 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -167,6 +167,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, if (list_empty(list)) return err; =20 + luf_takeoff_start(); spin_lock_irq(&zone->lock); =20 /* @@ -191,6 +192,11 @@ page_reporting_cycle(struct page_reporting_dev_info *p= rdev, struct zone *zone, if (PageReported(page)) continue; =20 + if (!luf_takeoff_check(page)) { + VM_WARN_ON(1); + continue; + } + /* * If we fully consumed our budget then update our * state to indicate that we are requesting additional @@ -204,7 +210,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, =20 /* Attempt to pull page from list and place in scatterlist */ if (*offset) { - if (!__isolate_free_page(page, order)) { + if (!__isolate_free_page(page, order, false)) { next =3D page; break; } @@ -227,6 +233,11 @@ page_reporting_cycle(struct page_reporting_dev_info *p= rdev, struct zone *zone, /* release lock before waiting on report processing */ spin_unlock_irq(&zone->lock); =20 + /* + * Check and flush before using the pages taken off. + */ + luf_takeoff_end(); + /* begin processing pages in local list */ err =3D prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); =20 @@ -236,6 +247,8 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, /* update budget to reflect call to report function */ budget--; =20 + luf_takeoff_start(); + /* reacquire zone lock and resume processing */ spin_lock_irq(&zone->lock); =20 @@ -259,6 +272,11 @@ page_reporting_cycle(struct page_reporting_dev_info *p= rdev, struct zone *zone, =20 spin_unlock_irq(&zone->lock); =20 + /* + * Check and flush before using the pages taken off. 
+ */ + luf_takeoff_end(); + return err; } =20 --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id D52BD1E5705 for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; cv=none; b=rHnu2GCMqbsce/P89DZ68mPKc4qc+D/VFNgw7TTsQbJKwH20ASqdu7SBj8M8+bewL4xmhYP1pq07OqaIWIU3wdlm29e0tmjfCQapzv3Dx4mxYKFSnMbtqCgN7BG8ENg2k9YUJ1VX9zv1G3u70nDAHYm7l1hpQvJOvILfPpE4WCo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; c=relaxed/simple; bh=VV5q9y2jCgNFyBLSL/GSQGFNOMM0kMUwOHWIY1uqOgg=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=u1aw540hENCdaiDaNeqA5F3Cc7qtQm5BSP8B+qOVCBCRiNyvSSeC+YWW1Nw0Mfta364Iwgj2xLjR811bw3SQX+tOovEX6prS4KeCAqaHZyXAW34A+0PugD/ChbpiaSzM5fqAaEBZKa6gzpri4KUxXg4BBKsxOEXJRUSwaLWutaw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-ef-67b6bba6fd7c From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 13/26] mm: introduce pend_list in struct free_area to track luf'd pages Date: Thu, 20 Feb 2025 14:20:14 +0900 Message-Id: <20250220052027.58847-14-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrOLMWRmVeSWpSXmKPExsXC9ZZnke6y3dvSDX4eM7WYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ vROaGQs+T2as2HflMHMD446yLkZODgkBE4lZi04yw9jTZj1gB7HZBNQlbtz4CRYXETCTONj6 ByzOLHCXSeJAPxuILSwQL3G/fxVYDYuAqsTKW/PAbF6g+t9/pkDNlJdYveEAmM0JFP8xoxes V0jAVOLdgktMXYxcQDXv2SQ23bnLCtEgKXFwxQ2WCYy8CxgZVjEKZeaV5SZm5pjoZVTmZVbo JefnbmIEBv+y2j/ROxg/XQg+xCjAwajEwzujdVu6EGtiWXFl7iFGCQ5mJRHetvot6UK8KYmV ValF+fFFpTmpxYcYpTlYlMR5jb6VpwgJpCeWpGanphakFsFkmTg4pRoYvRdeDw4L/fdw06vX 9g8Ndr+2PdL8xrvffvORUwaVlWtN+QwrOezuNrN8at5godinLXvq9/Qfi8QuTAkslHb4bv9p qsmiE4oZrQyCf14eC/uhVTevs3Pa8p5a+6aV2kFBCxslAirZ92yYGWTyde7t/boJJ4z3ur9O rdfOkRUp1j0WxX+5qI1HiaU4I9FQi7moOBEAsDiU8XoCAAA= X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrHLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0gxuHtC3mrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAldE7oZmx4PNkxop9Vw4zNzDuKOti5OSQEDCRmDbrATuIzSagLnHjxk9mEFtE wEziYOsfsDizwF0miQP9bCC2sEC8xP3+VWA1LAKqEitvzQOzeYHqf/+ZwgwxU15i9YYDYDYn 
UPzHjF6wXiEBU4l3Cy4xTWDkWsDIsIpRJDOvLDcxM8dUrzg7ozIvs0IvOT93EyMwlJfV/pm4 g/HLZfdDjAIcjEo8vA8eb00XYk0sK67MPcQowcGsJMLbVr8lXYg3JbGyKrUoP76oNCe1+BCj NAeLkjivV3hqgpBAemJJanZqakFqEUyWiYNTqoExMULLrFJKhHEmr++NmJdbPjF5PPx7ffXp k2e3blk9/cb5DXx2EQeuTLWN3ry6ZcYV0d9fEh/5d63qP79SwLOTTcO9ZmLPimsMMx4vW+vM wbbaXOV/rF2vmbXVphf1/BflnpY16z/zlJkpcDn486WXeccNCnZ47tgWbNUYPyMtr7Vi1saO Y75pSizFGYmGWsxFxYkAQp74/2ECAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" luf'd pages requires tlb shootdown on exiting from page allocator. For some page allocation request, it's okay to return luf'd page followed by tlb shootdown but it's not okay for e.g. irq context. This patch splitted the list in free_area into two, 'free_list' for non-luf'd pages and 'pend_list' for luf'd pages so that the buddy allocator can work better with various conditions of context. Signed-off-by: Byungchul Park --- include/linux/mmzone.h | 3 ++ kernel/power/snapshot.c | 14 ++++++ kernel/vmcore_info.c | 2 + mm/compaction.c | 33 ++++++++++--- mm/internal.h | 17 ++++++- mm/mm_init.c | 2 + mm/page_alloc.c | 105 ++++++++++++++++++++++++++++++++++------ mm/page_reporting.c | 22 ++++++--- mm/vmstat.c | 15 ++++++ 9 files changed, 184 insertions(+), 29 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16f..ac3178b5fc50b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -116,6 +116,7 @@ extern int page_group_by_mobility_disabled; MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; + struct list_head pend_list[MIGRATE_TYPES]; unsigned long nr_free; }; =20 @@ -995,6 +996,8 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; + /* Count pages that need tlb shootdown on allocation */ + atomic_long_t nr_luf_pages; } ____cacheline_internodealigned_in_smp; =20 enum pgdat_flags { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 30894d8f0a781..863b0c54185dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1288,6 +1288,20 @@ static void mark_free_pages(struct zone *zone) swsusp_set_page_free(pfn_to_page(pfn + i)); } } + + list_for_each_entry(page, + &zone->free_area[order].pend_list[t], buddy_list) { + unsigned long i; + + pfn =3D page_to_pfn(page); + for (i =3D 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count =3D WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } } spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1fec61603ef32..638deb57f9ddd 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -188,11 +188,13 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(zone, vm_stat); VMCOREINFO_OFFSET(zone, spanned_pages); VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(free_area, pend_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_LENGTH(free_area.pend_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); diff --git a/mm/compaction.c b/mm/compaction.c index 
90f5c34f333db..27f3d743762bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1590,24 +1590,28 @@ static void fast_isolate_freepages(struct compact_c= ontrol *cc) order =3D next_search_order(cc, order)) { struct free_area *area =3D &cc->zone->free_area[order]; struct list_head *freelist; + struct list_head *high_pfn_list; struct page *freepage; unsigned long flags; unsigned int order_scanned =3D 0; unsigned long high_pfn =3D 0; + bool consider_pend =3D false; + bool can_shootdown; =20 if (!area->nr_free) continue; =20 - luf_takeoff_start(); + can_shootdown =3D luf_takeoff_start(); spin_lock_irqsave(&cc->zone->lock, flags); freelist =3D &area->free_list[MIGRATE_MOVABLE]; +retry: list_for_each_entry_reverse(freepage, freelist, buddy_list) { unsigned long pfn; =20 order_scanned++; nr_scanned++; =20 - if (!luf_takeoff_check(freepage)) + if (unlikely(consider_pend && !luf_takeoff_check(freepage))) goto scan_next; =20 pfn =3D page_to_pfn(freepage); @@ -1620,26 +1624,34 @@ static void fast_isolate_freepages(struct compact_c= ontrol *cc) cc->fast_search_fail =3D 0; cc->search_order =3D order; page =3D freepage; - break; + goto done; } =20 if (pfn >=3D min_pfn && pfn > high_pfn) { high_pfn =3D pfn; + high_pfn_list =3D freelist; =20 /* Shorten the scan if a candidate is found */ limit >>=3D 1; } scan_next: if (order_scanned >=3D limit) - break; + goto done; } =20 + if (!consider_pend && can_shootdown) { + consider_pend =3D true; + freelist =3D &area->pend_list[MIGRATE_MOVABLE]; + goto retry; + } +done: /* Use a maximum candidate pfn if a preferred one was not found */ if (!page && high_pfn) { page =3D pfn_to_page(high_pfn); =20 /* Update freepage for the list reorder below */ freepage =3D page; + freelist =3D high_pfn_list; } =20 /* Reorder to so a future search skips recent pages */ @@ -2036,18 +2048,20 @@ static unsigned long fast_find_migrateblock(struct = compact_control *cc) struct list_head *freelist; unsigned long flags; struct page *freepage; + bool consider_pend =3D false; =20 if (!area->nr_free) continue; =20 spin_lock_irqsave(&cc->zone->lock, flags); freelist =3D &area->free_list[MIGRATE_MOVABLE]; +retry: list_for_each_entry(freepage, freelist, buddy_list) { unsigned long free_pfn; =20 if (nr_scanned++ >=3D limit) { move_freelist_tail(freelist, freepage); - break; + goto done; } =20 free_pfn =3D page_to_pfn(freepage); @@ -2070,9 +2084,16 @@ static unsigned long fast_find_migrateblock(struct c= ompact_control *cc) pfn =3D cc->zone->zone_start_pfn; cc->fast_search_fail =3D 0; found_block =3D true; - break; + goto done; } } + + if (!consider_pend) { + consider_pend =3D true; + freelist =3D &area->pend_list[MIGRATE_MOVABLE]; + goto retry; + } +done: spin_unlock_irqrestore(&cc->zone->lock, flags); } =20 diff --git a/mm/internal.h b/mm/internal.h index 3a6da77d04ed3..0dc374553f9b5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -836,11 +836,16 @@ void init_cma_reserved_pageblock(struct page *page); int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); =20 -static inline bool free_area_empty(struct free_area *area, int migratetype) +static inline bool free_list_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } =20 +static inline bool free_area_empty(struct free_area *area, int migratetype) +{ + return list_empty(&area->free_list[migratetype]) && + list_empty(&area->pend_list[migratetype]); +} /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio 
*folio); =20 @@ -1590,12 +1595,22 @@ void luf_takeoff_end(void); bool luf_takeoff_no_shootdown(void); bool luf_takeoff_check(struct page *page); bool luf_takeoff_check_and_fold(struct page *page); + +static inline bool non_luf_pages_ok(struct zone *zone) +{ + unsigned long nr_free =3D zone_page_state(zone, NR_FREE_PAGES); + unsigned long min_wm =3D min_wmark_pages(zone); + unsigned long nr_luf_pages =3D atomic_long_read(&zone->nr_luf_pages); + + return nr_free - nr_luf_pages > min_wm; +} #else static inline bool luf_takeoff_start(void) { return false; } static inline void luf_takeoff_end(void) {} static inline bool luf_takeoff_no_shootdown(void) { return true; } static inline bool luf_takeoff_check(struct page *page) { return true; } static inline bool luf_takeoff_check_and_fold(struct page *page) { return = true; } +static inline bool non_luf_pages_ok(struct zone *zone) { return true; } #endif =20 /* pagewalk.c */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 1c205b0a86ed5..12b96cd6a87b0 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1396,12 +1396,14 @@ static void __meminit zone_init_free_lists(struct z= one *zone) unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + INIT_LIST_HEAD(&zone->free_area[order].pend_list[t]); zone->free_area[order].nr_free =3D 0; } =20 #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); #endif + atomic_long_set(&zone->nr_luf_pages, 0); } =20 void __meminit init_currently_empty_zone(struct zone *zone, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2ea69596ff15..65acc437d8387 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -804,15 +804,28 @@ static inline void __add_to_free_list(struct page *pa= ge, struct zone *zone, bool tail) { struct free_area *area =3D &zone->free_area[order]; + struct list_head *list; =20 VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype, "page type is %lu, passed migratetype is %d (nr=3D%d)\n", get_pageblock_migratetype(page), migratetype, 1 << order); =20 + /* + * When identifying whether a page requires tlb shootdown, false + * positive is okay because it will cause just additional tlb + * shootdown. + */ + if (page_luf_key(page)) { + list =3D &area->pend_list[migratetype]; + atomic_long_add(1 << order, &zone->nr_luf_pages); + } else + list =3D &area->free_list[migratetype]; + if (tail) - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, list); else - list_add(&page->buddy_list, &area->free_list[migratetype]); + list_add(&page->buddy_list, list); + area->nr_free++; } =20 @@ -831,7 +844,20 @@ static inline void move_to_free_list(struct page *page= , struct zone *zone, "page type is %lu, passed migratetype is %d (nr=3D%d)\n", get_pageblock_migratetype(page), old_mt, 1 << order); =20 - list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + /* + * The page might have been taken from a pfn where it's not + * clear which list was used. Therefore, conservatively + * consider it as pend_list, not to miss any true ones that + * require tlb shootdown. + * + * When identifying whether a page requires tlb shootdown, false + * positive is okay because it will cause just additional tlb + * shootdown. 
+ */ + if (page_luf_key(page)) + list_move_tail(&page->buddy_list, &area->pend_list[new_mt]); + else + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); =20 account_freepages(zone, -(1 << order), old_mt); account_freepages(zone, 1 << order, new_mt); @@ -848,6 +874,9 @@ static inline void __del_page_from_free_list(struct pag= e *page, struct zone *zon if (page_reported(page)) __ClearPageReported(page); =20 + if (page_luf_key(page)) + atomic_long_sub(1 << order, &zone->nr_luf_pages); + list_del(&page->buddy_list); __ClearPageBuddy(page); zone->free_area[order].nr_free--; @@ -866,15 +895,48 @@ static inline void del_page_from_free_list(struct pag= e *page, struct zone *zone, account_freepages(zone, -(1 << order), migratetype); } =20 -static inline struct page *get_page_from_free_area(struct free_area *area, - int migratetype) +static inline struct page *get_page_from_free_area(struct zone *zone, + struct free_area *area, int migratetype) { - struct page *page =3D list_first_entry_or_null(&area->free_list[migratety= pe], - struct page, buddy_list); + struct page *page; + bool pend_first; =20 - if (page && luf_takeoff_check(page)) - return page; + /* + * XXX: Make the decision preciser if needed e.g. using + * zone_watermark_ok() or its family, but for now, don't want to + * make it heavier. + * + * Try free_list, holding non-luf pages, first if there are + * enough non-luf pages to aggressively defer tlb flush, but + * should try pend_list first instead if not. + */ + pend_first =3D !non_luf_pages_ok(zone); + + if (pend_first) { + page =3D list_first_entry_or_null(&area->pend_list[migratetype], + struct page, buddy_list); + + if (page && luf_takeoff_check(page)) + return page; + + page =3D list_first_entry_or_null(&area->free_list[migratetype], + struct page, buddy_list); + + if (page) + return page; + } else { + page =3D list_first_entry_or_null(&area->free_list[migratetype], + struct page, buddy_list); + + if (page) + return page; =20 + page =3D list_first_entry_or_null(&area->pend_list[migratetype], + struct page, buddy_list); + + if (page && luf_takeoff_check(page)) + return page; + } return NULL; } =20 @@ -1027,6 +1089,8 @@ static inline void __free_one_page(struct page *page, =20 if (fpi_flags & FPI_TO_TAIL) to_tail =3D true; + else if (page_luf_key(page)) + to_tail =3D true; else if (is_shuffle_order(order)) to_tail =3D shuffle_pick_tail(); else @@ -1556,6 +1620,8 @@ static inline unsigned int expand(struct zone *zone, = struct page *page, int low, unsigned int nr_added =3D 0; =20 while (high > low) { + bool tail =3D false; + high--; size >>=3D 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); @@ -1569,7 +1635,10 @@ static inline unsigned int expand(struct zone *zone,= struct page *page, int low, if (set_page_guard(zone, &page[size], high)) continue; =20 - __add_to_free_list(&page[size], zone, high, migratetype, false); + if (page_luf_key(&page[size])) + tail =3D true; + + __add_to_free_list(&page[size], zone, high, migratetype, tail); set_buddy_order(&page[size], high); nr_added +=3D size; } @@ -1754,7 +1823,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un= signed int order, /* Find a page of the appropriate size in the preferred list */ for (current_order =3D order; current_order < NR_PAGE_ORDERS; ++current_o= rder) { area =3D &(zone->free_area[current_order]); - page =3D get_page_from_free_area(area, migratetype); + page =3D get_page_from_free_area(zone, area, migratetype); if (!page) continue; =20 @@ -2188,7 +2257,8 @@ int 
find_suitable_fallback(struct free_area *area, un= signed int order, if (free_area_empty(area, fallback_mt)) continue; =20 - if (luf_takeoff_no_shootdown()) + if (free_list_empty(area, fallback_mt) && + luf_takeoff_no_shootdown()) continue; =20 if (can_steal_fallback(order, migratetype)) @@ -2292,7 +2362,7 @@ static bool unreserve_highatomic_pageblock(const stru= ct alloc_context *ac, struct free_area *area =3D &(zone->free_area[order]); int mt; =20 - page =3D get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + page =3D get_page_from_free_area(zone, area, MIGRATE_HIGHATOMIC); if (!page) continue; =20 @@ -2430,7 +2500,7 @@ __rmqueue_fallback(struct zone *zone, int order, int = start_migratetype, VM_BUG_ON(current_order > MAX_PAGE_ORDER); =20 do_steal: - page =3D get_page_from_free_area(area, fallback_mt); + page =3D get_page_from_free_area(zone, area, fallback_mt); =20 /* take off list, maybe claim block, expand remainder */ page =3D steal_suitable_fallback(zone, page, current_order, order, @@ -7180,6 +7250,8 @@ static void break_down_buddy_pages(struct zone *zone,= struct page *page, struct page *current_buddy; =20 while (high > low) { + bool tail =3D false; + high--; size >>=3D 1; =20 @@ -7193,7 +7265,10 @@ static void break_down_buddy_pages(struct zone *zone= , struct page *page, if (set_page_guard(zone, current_buddy, high)) continue; =20 - add_to_free_list(current_buddy, zone, high, migratetype, false); + if (page_luf_key(current_buddy)) + tail =3D true; + + add_to_free_list(current_buddy, zone, high, migratetype, tail); set_buddy_order(current_buddy, high); } } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 03a7f5f6dc073..e152b22fbba8a 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -159,15 +159,17 @@ page_reporting_cycle(struct page_reporting_dev_info *= prdev, struct zone *zone, struct page *page, *next; long budget; int err =3D 0; + bool consider_pend =3D false; + bool can_shootdown; =20 /* * Perform early check, if free area is empty there is * nothing to process so we can skip this free_list. */ - if (list_empty(list)) + if (free_area_empty(area, mt)) return err; =20 - luf_takeoff_start(); + can_shootdown =3D luf_takeoff_start(); spin_lock_irq(&zone->lock); =20 /* @@ -185,14 +187,14 @@ page_reporting_cycle(struct page_reporting_dev_info *= prdev, struct zone *zone, * should always be a power of 2. */ budget =3D DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); - +retry: /* loop through free list adding unreported pages to sg list */ list_for_each_entry_safe(page, next, list, lru) { /* We are going to skip over the reported pages. 
*/ if (PageReported(page)) continue; =20 - if (!luf_takeoff_check(page)) { + if (unlikely(consider_pend && !luf_takeoff_check(page))) { VM_WARN_ON(1); continue; } @@ -205,14 +207,14 @@ page_reporting_cycle(struct page_reporting_dev_info *= prdev, struct zone *zone, if (budget < 0) { atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); next =3D page; - break; + goto done; } =20 /* Attempt to pull page from list and place in scatterlist */ if (*offset) { if (!__isolate_free_page(page, order, false)) { next =3D page; - break; + goto done; } =20 /* Add page to scatter list */ @@ -263,9 +265,15 @@ page_reporting_cycle(struct page_reporting_dev_info *p= rdev, struct zone *zone, =20 /* exit on error */ if (err) - break; + goto done; } =20 + if (!consider_pend && can_shootdown) { + consider_pend =3D true; + list =3D &area->pend_list[mt]; + goto retry; + } +done: /* Rotate any leftover pages to the head of the freelist */ if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, li= st)) list_rotate_to_front(&next->lru, list); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c9..3fb9a5f6dd6da 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1581,6 +1581,21 @@ static void pagetypeinfo_showfree_print(struct seq_f= ile *m, break; } } + list_for_each(curr, &area->pend_list[mtype]) { + /* + * Cap the pend_list iteration because it might + * be really large and we are under a spinlock + * so a long time spent here could trigger a + * hard lockup detector. Anyway this is a + * debugging tool so knowing there is a handful + * of pages of this order should be more than + * sufficient. + */ + if (++freecount >=3D 100000) { + overflow =3D true; + break; + } + } seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); spin_unlock_irq(&zone->lock); cond_resched(); --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id DA38F1E571A for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029767; cv=none; b=uf8U7ZHEF7x41ouh8wZN4JnTEOO0zgbxyXiGXfiXhnxwYBxo5ETPWQIbEUFZ8Z1RRl4pCma/axPhP7BhFxAZwD8+puT3BQCqw096XpJwCVs5ha5kDu3DdLLpOGP2wj8AF9Vqz0Y8kRRZ9jbWtY8uI5TerSiH5H1VBfWp3hPefuU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029767; c=relaxed/simple; bh=w7vX7JgcAEQHAdfHnEtOdFTLy8UIyanj3an2oyK8r3E=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=u2mDYE2Idu+9rG+2fX6xnLHH/k4sQGdqM39QrwITPOI8aOVCk9O35fbu8t5Mc6NOXKADPGEZHDLv/MY+/fOHGGZJJ4+r3OBldV9a4yvuDS2OX7tUahoN7p5videhZB9JqUqIF4apHozKht1G84qZPFXe6fhXF7V1k4Dqcb9O010= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-f4-67b6bba6ccf4 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, 
dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 14/26] mm/rmap: recognize read-only tlb entries during batched tlb flush Date: Thu, 20 Feb 2025 14:20:15 +0900 Message-Id: <20250220052027.58847-15-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrMLMWRmVeSWpSXmKPExsXC9ZZnoe6y3dvSDU5cMLeYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ P/ZsZC04KVbxpHUrWwPjC6EuRk4OCQETiTWT2lm6GDnA7Edz+UHCbALqEjdu/GQGsUUEzCQO tv5hB7GZBe4ySRzoZwOxhQUSJNY2P2ABsVkEVCVmNhxlAhnDC1Tf2F0OMV1eYvWGA2BjOIHC P2b0grUKCZhKvFtwiQmi5jObxLsd6hC2pMTBFTdYJjDyLmBkWMUolJlXlpuYmWOil1GZl1mh l5yfu4kRGPTLav9E72D8dCH4EKMAB6MSD++M1m3pQqyJZcWVuYcYJTiYlUR42+q3pAvxpiRW VqUW5ccXleakFh9ilOZgURLnNfpWniIkkJ5YkpqdmlqQWgSTZeLglGpgzPlhdov7meX1hzlX j7eIndjz/NQBa59mlz2uVx+zeyfJPbG9bXpQ8VRw2ZXGgvtii24+77AN+bCo4f5XlfMW2Qpd sTHCd12Xl3qedX6+9tHaANZ3J3P2b9kfq9y7fnF27vt/YicuWS++OH+fDue9uBARCfZn2Xyt Z5PY32m5H6j7d9y4sdP7oBJLcUaioRZzUXEiAFB+uU92AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrNLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g5U3dS3mrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlfFjz0bWgpNiFU9at7I1ML4Q6mLk4JAQMJF4NJe/i5GTg01AXeLGjZ/MILaI gJnEwdY/7CA2s8BdJokD/WwgtrBAgsTa5gcsIDaLgKrEzIajTCBjeIHqG7vLQcISAvISqzcc ABvDCRT+MaMXrFVIwFTi3YJLTBMYuRYwMqxiFMnMK8tNzMwx1SvOzqjMy6zQS87P3cQIDOJl tX8m7mD8ctn9EKMAB6MSD++Dx1vThVgTy4orcw8xSnAwK4nwttVvSRfiTUmsrEotyo8vKs1J LT7EKM3BoiTO6xWemiAkkJ5YkpqdmlqQWgSTZeLglGpg1LkjXdLXPGX5tN8CbH3Hpggxr9z6 XeOBpoS47P71YiuMjF//X65ldlPM2+rrMxEra3+WyZtTLmvcXpXWtGzPutn/fPUZ9mw8oMv1 3IG35IPY3ir749UaZ8NX8e1y0L7XekdBTFT7wWFOkcctobuPzZqjGW33yzFLR69jhVfiQvXp y7YJHNrkpsRSnJFoqMVcVJwIALEjBHJeAgAA X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that requires to recognize read-only tlb entries and handle them in a different way. The newly introduced API in this patch, fold_ubc(), will be used by luf mechanism. 
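
To make the intent concrete, here is a stand-alone user-space model of the
two-batch idea. It is for illustration only and is not part of the patch; the
names in it (struct batch, add_pending(), fold()) are invented for the sketch.
Pending unmaps of read-only ptes are collected separately from writable ones,
and the read-only batch is merged into the main one only when a flush is
actually about to happen:

/*
 * Stand-alone model of the two-batch idea (not kernel code).
 * A "batch" is just a CPU bitmask here; pending unmaps of read-only
 * ptes go to a separate batch and are only merged into the main batch
 * when a real flush is about to be performed.
 */
#include <stdbool.h>
#include <stdio.h>

struct batch {
	unsigned long cpumask;		/* CPUs that may hold stale entries */
	bool flush_required;
};

/* route a pending unmap into the right batch */
static void add_pending(struct batch *rw, struct batch *ro,
			unsigned long cpu_bit, bool writable)
{
	struct batch *b = writable ? rw : ro;

	b->cpumask |= cpu_bit;
	b->flush_required = true;
}

/* merge the read-only batch into the main one just before flushing */
static void fold(struct batch *dst, struct batch *src)
{
	dst->cpumask |= src->cpumask;
	dst->flush_required |= src->flush_required;
	src->cpumask = 0;
	src->flush_required = false;
}

int main(void)
{
	struct batch rw = { 0 }, ro = { 0 };

	add_pending(&rw, &ro, 1UL << 0, true);	/* writable pte seen on CPU0 */
	add_pending(&rw, &ro, 1UL << 3, false);	/* read-only pte seen on CPU3 */

	fold(&rw, &ro);		/* roughly what try_to_unmap_flush() does first */
	printf("flush cpumask: 0x%lx\n", rw.cpumask);	/* prints 0x9 */
	return 0;
}

Keeping the read-only entries apart is what later allows luf to defer their
flush while the batch for writable mappings keeps being flushed as before.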
Signed-off-by: Byungchul Park --- include/linux/sched.h | 1 + mm/rmap.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c4ff83e1d5953..a217d6011fdfe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1381,6 +1381,7 @@ struct task_struct { =20 struct tlbflush_unmap_batch tlb_ubc; struct tlbflush_unmap_batch tlb_ubc_takeoff; + struct tlbflush_unmap_batch tlb_ubc_ro; =20 /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/mm/rmap.c b/mm/rmap.c index 1581b1a00f974..3ed6234dd777e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -775,6 +775,7 @@ void fold_luf_batch(struct luf_batch *dst, struct luf_b= atch *src) void try_to_unmap_flush_takeoff(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; =20 if (!tlb_ubc_takeoff->flush_required) @@ -789,6 +790,9 @@ void try_to_unmap_flush_takeoff(void) if (arch_tlbbatch_done(&tlb_ubc->arch, &tlb_ubc_takeoff->arch)) reset_batch(tlb_ubc); =20 + if (arch_tlbbatch_done(&tlb_ubc_ro->arch, &tlb_ubc_takeoff->arch)) + reset_batch(tlb_ubc_ro); + reset_batch(tlb_ubc_takeoff); } =20 @@ -801,7 +805,9 @@ void try_to_unmap_flush_takeoff(void) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; =20 + fold_batch(tlb_ubc, tlb_ubc_ro, true); if (!tlb_ubc->flush_required) return; =20 @@ -813,8 +819,9 @@ void try_to_unmap_flush(void) void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; =20 - if (tlb_ubc->writable) + if (tlb_ubc->writable || tlb_ubc_ro->writable) try_to_unmap_flush(); } =20 @@ -831,13 +838,18 @@ void try_to_unmap_flush_dirty(void) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr) { - struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc; int batch; bool writable =3D pte_dirty(pteval); =20 if (!pte_accessible(mm, pteval)) return; =20 + if (pte_write(pteval)) + tlb_ubc =3D ¤t->tlb_ubc; + else + tlb_ubc =3D ¤t->tlb_ubc_ro; + arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required =3D true; =20 --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id DD80F1E571F for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; cv=none; b=Xz6VcgI/kZbEG8HI3AjEJi2DfV7YLQFM8UEPjLpQHknuief42NuLFYalPPemtjma0P8hcYJxQLKJRKdXe2Kdfpg+kSJwHDig2UxmISTTzl1c38OYwnaXp+k5eBU/TqOWjt3wcctLx9cNlbyI9JXcGw/I9QtdAYbKex66ttgMDWo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029768; c=relaxed/simple; bh=j7QU/35dX1hh0elbRPf8gMTWbEEz1zESJHGhkIar0Ck=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=oKOmspIOAf9UEIYifP6FZ8Z7JJPeSX5hFycxRprX8mU6knd1WUUeVcU10wyhbT4xwNi/IKg13Ki/Cq66/WUVBi6M8D5JeX4sRzmWJEZOGVDNvPTuQuqt/Ij2k2UUFW8GeiX/dDpdn/FaJP1Bl/WQUCZruO8ijX83fqgmmMBpZrI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass 
smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-f9-67b6bba672fe From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 15/26] fs, filemap: refactor to gather the scattered ->write_{begin,end}() calls Date: Thu, 20 Feb 2025 14:20:16 +0900 Message-Id: <20250220052027.58847-16-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrCLMWRmVeSWpSXmKPExsXC9ZZnoe6y3dvSDU7vs7KYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ 388mFszMrujZfYGtgbE1tIuRk0NCwESires2M4zd8aaRFcRmE1CXuHHjJ1hcRMBM4mDrH3YQ m1ngLpPEgX62LkYODmGBDIkJK6xBwiwCqhLXNneDtfICle/Ys5ERYqS8xOoNB8DGcALFf8zo ZQOxhQRMJd4tuMTUxcgFVPOZTeLX3s9sEA2SEgdX3GCZwMi7gJFhFaNQZl5ZbmJmjoleRmVe ZoVecn7uJkZg2C+r/RO9g/HTheBDjAIcjEo8vDNat6ULsSaWFVfmHmKU4GBWEuFtq9+SLsSb klhZlVqUH19UmpNafIhRmoNFSZzX6Ft5ipBAemJJanZqakFqEUyWiYNTqoFx1ca/ahuM56hH 1Xsd274v+fLC/17ZPw6m6X38knPm++09M58k6fzbVnXjjB1f/lzjfd3lU3YLqJjXPHRd9Mxi x8HTnnF5LztUNz91Pd74Ys6BSctLC2/qP2w+8K3A7tMBs8tTaldaFUxiSFX5d/DUpxWmGn3c +c/nWYbPUJl33H5fcYbKG+6DW5VYijMSDbWYi4oTAb3RVLJ3AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrNLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g/d7DSzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlfH9bGLBzOyKnt0X2BoYW0O7GDk5JARMJDreNLKC2GwC6hI3bvxkBrFFBMwk Drb+YQexmQXuMkkc6GfrYuTgEBbIkJiwwhokzCKgKnFtczdYKy9Q+Y49GxkhRspLrN5wAGwM J1D8x4xeNhBbSMBU4t2CS0wTGLkWMDKsYhTJzCvLTczMMdUrzs6ozMus0EvOz93ECAziZbV/ Ju5g/HLZ/RCjAAejEg/vg8db04VYE8uKK3MPMUpwMCuJ8LbVb0kX4k1JrKxKLcqPLyrNSS0+ xCjNwaIkzusVnpogJJCeWJKanZpakFoEk2Xi4JRqYOwL21D2p295TsuKHdzXTq60uv7q+1S/ A6vWx9r2xbB7+5qpWsip7gxs1dn68WzI5Qv71/M8/rP2nb1xyx/OHxezgu3q3YNfqH34e9h/ rU2PxgVrvom1H1bzXODc2n1QrVK+8PjaFZN2H69zbrXfXH6VtbpPqTfr0mTZ/kOPln0WVq+c +5Xx5QwlluKMREMt5qLiRADutSDtXgIAAA== X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that requires to hook when updating page cache that might have pages that have been mapped on any tasks so that tlb flush needed can be performed. 
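
The pattern being set up here can be modeled in a few lines of plain C, for
illustration only and not as part of the patch; the names used (struct aops,
mapping_write_begin(), luf_flush_hook()) are invented for the sketch, and the
hook shown is what a later patch adds, not this one. Once every call site
funnels through one wrapper, that wrapper becomes the single place to hook:

/*
 * Stand-alone model of the wrapper pattern (not kernel code): every
 * caller goes through one helper so that a later change, here modeled
 * as luf_flush_hook(), only has to touch a single place.
 */
#include <stdio.h>

struct aops {
	int (*write_begin)(long pos, unsigned len);
	int (*write_end)(long pos, unsigned len, unsigned copied);
};

struct mapping {
	const struct aops *a_ops;
};

static void luf_flush_hook(void)
{
	/* stands in for the deferred tlb flush added by a later patch */
	printf("flush stale tlb entries\n");
}

static int mapping_write_begin(struct mapping *m, long pos, unsigned len)
{
	int ret = m->a_ops->write_begin(pos, len);

	if (!ret)
		luf_flush_hook();	/* single hook point for all callers */
	return ret;
}

static int mapping_write_end(struct mapping *m, long pos, unsigned len,
			     unsigned copied)
{
	return m->a_ops->write_end(pos, len, copied);
}

static int dummy_begin(long pos, unsigned len) { return 0; }
static int dummy_end(long pos, unsigned len, unsigned copied) { return copied; }

int main(void)
{
	const struct aops ops = { dummy_begin, dummy_end };
	struct mapping m = { &ops };

	if (!mapping_write_begin(&m, 0, 16))
		mapping_write_end(&m, 0, 16, 16);
	return 0;
}

That is the whole point of this refactor: no behavioural change now, but
exactly one spot to hook when luf needs to clean stale tlb entries on page
cache updates.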
Signed-off-by: Byungchul Park --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 11 ++++------- fs/affs/file.c | 4 ++-- fs/buffer.c | 14 ++++++-------- fs/exfat/file.c | 5 ++--- fs/ext4/verity.c | 5 ++--- fs/f2fs/super.c | 5 ++--- fs/f2fs/verity.c | 5 ++--- fs/namei.c | 5 ++--- include/linux/fs.h | 18 ++++++++++++++++++ mm/filemap.c | 5 ++--- 10 files changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i9= 15/gem/i915_gem_shmem.c index fe69f2c8527d7..1d475d681d3de 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -422,7 +422,6 @@ shmem_pwrite(struct drm_i915_gem_object *obj, const struct drm_i915_gem_pwrite *arg) { struct address_space *mapping =3D obj->base.filp->f_mapping; - const struct address_space_operations *aops =3D mapping->a_ops; char __user *user_data =3D u64_to_user_ptr(arg->data_ptr); u64 remain; loff_t pos; @@ -481,7 +480,7 @@ shmem_pwrite(struct drm_i915_gem_object *obj, if (err) return err; =20 - err =3D aops->write_begin(obj->base.filp, mapping, pos, len, + err =3D mapping_write_begin(obj->base.filp, mapping, pos, len, &folio, &data); if (err < 0) return err; @@ -492,7 +491,7 @@ shmem_pwrite(struct drm_i915_gem_object *obj, pagefault_enable(); kunmap_local(vaddr); =20 - err =3D aops->write_end(obj->base.filp, mapping, pos, len, + err =3D mapping_write_end(obj->base.filp, mapping, pos, len, len - unwritten, folio, data); if (err < 0) return err; @@ -658,7 +657,6 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_= private *i915, { struct drm_i915_gem_object *obj; struct file *file; - const struct address_space_operations *aops; loff_t pos; int err; =20 @@ -670,21 +668,20 @@ i915_gem_object_create_shmem_from_data(struct drm_i91= 5_private *i915, GEM_BUG_ON(obj->write_domain !=3D I915_GEM_DOMAIN_CPU); =20 file =3D obj->base.filp; - aops =3D file->f_mapping->a_ops; pos =3D 0; do { unsigned int len =3D min_t(typeof(size), size, PAGE_SIZE); struct folio *folio; void *fsdata; =20 - err =3D aops->write_begin(file, file->f_mapping, pos, len, + err =3D mapping_write_begin(file, file->f_mapping, pos, len, &folio, &fsdata); if (err < 0) goto fail; =20 memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len); =20 - err =3D aops->write_end(file, file->f_mapping, pos, len, len, + err =3D mapping_write_end(file, file->f_mapping, pos, len, len, folio, fsdata); if (err < 0) goto fail; diff --git a/fs/affs/file.c b/fs/affs/file.c index a5a861dd52230..10e7f53828e93 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -885,9 +885,9 @@ affs_truncate(struct inode *inode) loff_t isize =3D inode->i_size; int res; =20 - res =3D mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fs= data); + res =3D mapping_write_begin(NULL, mapping, isize, 0, &folio, &fsdata); if (!res) - res =3D mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fs= data); + res =3D mapping_write_end(NULL, mapping, isize, 0, 0, folio, fsdata); else inode->i_size =3D AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/buffer.c b/fs/buffer.c index 88e765b0699fe..7cb0295500937 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2456,7 +2456,6 @@ EXPORT_SYMBOL(block_read_full_folio); int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *aops =3D mapping->a_ops; struct folio *folio; void *fsdata =3D NULL; int err; @@ -2465,11 +2464,11 @@ int 
generic_cont_expand_simple(struct inode *inode,= loff_t size) if (err) goto out; =20 - err =3D aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); + err =3D mapping_write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; =20 - err =3D aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); + err =3D mapping_write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); =20 out: @@ -2481,7 +2480,6 @@ static int cont_expand_zero(struct file *file, struct= address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode =3D mapping->host; - const struct address_space_operations *aops =3D mapping->a_ops; unsigned int blocksize =3D i_blocksize(inode); struct folio *folio; void *fsdata =3D NULL; @@ -2501,12 +2499,12 @@ static int cont_expand_zero(struct file *file, stru= ct address_space *mapping, } len =3D PAGE_SIZE - zerofrom; =20 - err =3D aops->write_begin(file, mapping, curpos, len, + err =3D mapping_write_begin(file, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err =3D aops->write_end(file, mapping, curpos, len, len, + err =3D mapping_write_end(file, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2534,12 +2532,12 @@ static int cont_expand_zero(struct file *file, stru= ct address_space *mapping, } len =3D offset - zerofrom; =20 - err =3D aops->write_begin(file, mapping, curpos, len, + err =3D mapping_write_begin(file, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err =3D aops->write_end(file, mapping, curpos, len, len, + err =3D mapping_write_end(file, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index a25d7eb789f4c..242563b9dec95 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -539,7 +539,6 @@ static int exfat_extend_valid_size(struct file *file, l= off_t new_valid_size) struct inode *inode =3D file_inode(file); struct exfat_inode_info *ei =3D EXFAT_I(inode); struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *ops =3D mapping->a_ops; =20 pos =3D ei->valid_size; while (pos < new_valid_size) { @@ -550,11 +549,11 @@ static int exfat_extend_valid_size(struct file *file,= loff_t new_valid_size) if (pos + len > new_valid_size) len =3D new_valid_size - pos; =20 - err =3D ops->write_begin(file, mapping, pos, len, &folio, NULL); + err =3D mapping_write_begin(file, mapping, pos, len, &folio, NULL); if (err) goto out; =20 - err =3D ops->write_end(file, mapping, pos, len, len, folio, NULL); + err =3D mapping_write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos +=3D len; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce979..64fa43f80c73e 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -68,7 +68,6 @@ static int pagecache_write(struct inode *inode, const voi= d *buf, size_t count, loff_t pos) { struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *aops =3D mapping->a_ops; =20 if (pos + count > inode->i_sb->s_maxbytes) return -EFBIG; @@ -80,13 +79,13 @@ static int pagecache_write(struct inode *inode, const v= oid *buf, size_t count, void *fsdata =3D NULL; int res; =20 - res =3D aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); + res =3D mapping_write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; =20 memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, 
n); =20 - res =3D aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); + res =3D mapping_write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res !=3D n) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87ab5696bd482..f8d5ee466807c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2678,7 +2678,6 @@ static ssize_t f2fs_quota_write(struct super_block *s= b, int type, { struct inode *inode =3D sb_dqopt(sb)->files[type]; struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *a_ops =3D mapping->a_ops; int offset =3D off & (sb->s_blocksize - 1); size_t towrite =3D len; struct folio *folio; @@ -2690,7 +2689,7 @@ static ssize_t f2fs_quota_write(struct super_block *s= b, int type, tocopy =3D min_t(unsigned long, sb->s_blocksize - offset, towrite); retry: - err =3D a_ops->write_begin(NULL, mapping, off, tocopy, + err =3D mapping_write_begin(NULL, mapping, off, tocopy, &folio, &fsdata); if (unlikely(err)) { if (err =3D=3D -ENOMEM) { @@ -2703,7 +2702,7 @@ static ssize_t f2fs_quota_write(struct super_block *s= b, int type, =20 memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy); =20 - a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + mapping_write_end(NULL, mapping, off, tocopy, tocopy, folio, fsdata); offset =3D 0; towrite -=3D tocopy; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09e..b232589546d39 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -72,7 +72,6 @@ static int pagecache_write(struct inode *inode, const voi= d *buf, size_t count, loff_t pos) { struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *aops =3D mapping->a_ops; =20 if (pos + count > F2FS_BLK_TO_BYTES(max_file_blocks(inode))) return -EFBIG; @@ -84,13 +83,13 @@ static int pagecache_write(struct inode *inode, const v= oid *buf, size_t count, void *fsdata =3D NULL; int res; =20 - res =3D aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); + res =3D mapping_write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; =20 memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); =20 - res =3D aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); + res =3D mapping_write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res !=3D n) diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac20..14a701ecf1a7e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5349,7 +5349,6 @@ EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping =3D inode->i_mapping; - const struct address_space_operations *aops =3D mapping->a_ops; bool nofs =3D !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata =3D NULL; @@ -5359,7 +5358,7 @@ int page_symlink(struct inode *inode, const char *sym= name, int len) retry: if (nofs) flags =3D memalloc_nofs_save(); - err =3D aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); + err =3D mapping_write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) @@ -5367,7 +5366,7 @@ int page_symlink(struct inode *inode, const char *sym= name, int len) =20 memcpy(folio_address(folio), symname, len - 1); =20 - err =3D aops->write_end(NULL, mapping, 0, len - 1, len - 1, + err =3D mapping_write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3559446279c15..bfd8aaeb78bb8 100644 --- 
a/include/linux/fs.h +++ b/include/linux/fs.h @@ -494,6 +494,24 @@ struct address_space { #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 =20 +static inline int mapping_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) +{ + return mapping->a_ops->write_begin(file, mapping, pos, len, foliop, + fsdata); +} + +static inline int mapping_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) +{ + return mapping->a_ops->write_end(file, mapping, pos, len, copied, + folio, fsdata); +} + /* * Returns true if any of the pages in the mapping are marked with the tag. */ diff --git a/mm/filemap.c b/mm/filemap.c index e582a1545d2ae..a4930449fc705 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4016,7 +4016,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, str= uct iov_iter *i) struct file *file =3D iocb->ki_filp; loff_t pos =3D iocb->ki_pos; struct address_space *mapping =3D file->f_mapping; - const struct address_space_operations *a_ops =3D mapping->a_ops; size_t chunk =3D mapping_max_folio_size(mapping); long status =3D 0; ssize_t written =3D 0; @@ -4050,7 +4049,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, str= uct iov_iter *i) break; } =20 - status =3D a_ops->write_begin(file, mapping, pos, bytes, + status =3D mapping_write_begin(file, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; @@ -4065,7 +4064,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, str= uct iov_iter *i) copied =3D copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); =20 - status =3D a_ops->write_end(file, mapping, pos, bytes, copied, + status =3D mapping_write_end(file, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status !=3D copied)) { iov_iter_revert(i, copied - max(status, 0L)); --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.hynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 06A851E5734 for ; Thu, 20 Feb 2025 05:36:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029770; cv=none; b=lyplY8ngUS7peizU+prZCRmh8IdBEWRsYTWMCiILbSiquW2XVkNdYciVghaVDeyd6BO0zAoKIWkT4Qk2drfFkHJr3edQjlfMSnLXpYa6cCPlo7faAJ5EicPH26B/DFXdrVJhcLmxMa7nCEVLnkOJztKTUbjiXo3HWrdYY5PuAQ4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029770; c=relaxed/simple; bh=sBgFgGiwQ2AQvkC45eUex4ANimKUgDhVvJX/lv+z+54=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=X9b4l3TQdjVThWn7yIUcM9M/UoC2Be4elVabkjVlFDwUvzjJB3dvrt+H1sgOqjGytNo4eQCPUPq/zkIqODgQu/w8QQJFZQ9c5KQT8GU40BeZ+sOGpiArSZc3tukTmase7x1GrCT4PHf3VeGbpteNdjPvgTsr6iWEUnB0wjGJGs8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-fe-67b6bba6fd6f From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, 
hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 16/26] mm: implement LUF(Lazy Unmap Flush) defering tlb flush when folios get unmapped Date: Thu, 20 Feb 2025 14:20:17 +0900 Message-Id: <20250220052027.58847-17-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrBLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDXbtYLSYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ M7d/YCvovcdYsfrRNuYGxpb1jF2MnBwSAiYSv29NYIex/07oZgOx2QTUJW7c+MkMYosImEkc bP0DVsMscJdJ4kA/WI2wQJ7Eq7l/weawCKhKLHmzHqyeF6j+655JzBAz5SVWbzgAZnMCxX/M 6AXrFRIwlXi34BJTFyMXUM1nNonpzbNYIRokJQ6uuMEygZF3ASPDKkahzLyy3MTMHBO9jMq8 zAq95PzcTYzA8F9W+yd6B+OnC8GHGAU4GJV4eGe0bksXYk0sK67MPcQowcGsJMLbVr8lXYg3 JbGyKrUoP76oNCe1+BCjNAeLkjiv0bfyFCGB9MSS1OzU1ILUIpgsEwenVANjbNrfeIH/wVur pc61yf6LX2pfu2oX24UPthZxR2sWFN5M+Okp2BwW435KeNuEh3POylbNt/m7SXv6Y6l3txfK iU2fU36X+UXT75x78hdm6d0uC6heceLEkwY5nqS+Kn8D1syW317aDPtvS+t92u1ycL+8/hO1 663/l/od/9X30N2kY29lVeh7JZbijERDLeai4kQAa5ymkHsCAAA= X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrPLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0gzk3jS3mrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlTFz+we2gt57jBWrH21jbmBsWc/YxcjJISFgIvF3QjcbiM0moC5x48ZPZhBb RMBM4mDrH3YQm1ngLpPEgX6wGmGBPIlXc/+C9bIIqEosebMerJ4XqP7rnknMEDPlJVZvOABm cwLFf8zoBesVEjCVeLfgEtMERq4FjAyrGEUy88pyEzNzTPWKszMq8zIr9JLzczcxAoN5We2f iTsYv1x2P8QowMGoxMP74PHWdCHWxLLiytxDjBIczEoivG31W9KFeFMSK6tSi/Lji0pzUosP MUpzsCiJ83qFpyYICaQnlqRmp6YWpBbBZJk4OKUaGOfxVjBz+5rknE1b3JFoOcXTy8RK8snt NTu95hoee3Q9mPXuBLW1Lz8/fZ33pLf1bnLGvI2RX0qk84/c3HdcYuHT2vk6735s1vv7c8P3 k7O4/a3YGuM+ys5/nCAYffHg0zXMm6bNDdtTrRBslO6Rtmajq86SR6XX8hrL9s68mV/z+fGT l19+GLIqsRRnJBpqMRcVJwIANMT5emICAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" A new mechanism, LUF(Lazy Unmap Flush), defers tlb flush until folios that have been unmapped and freed, eventually get allocated again. It's safe for folios that had been mapped read-only and were unmapped, as long as the contents of the folios don't change while staying in pcp or buddy so we can still read the data through the stale tlb entries. tlb flush can be defered when folios get unmapped as long as it guarantees to perform tlb flush needed, before the folios actually become used, of course, only if all the corresponding ptes don't have write permission. Otherwise, the system will get messed up. To achieve that, for the folios that map only to non-writable tlb entries, prevent tlb flush during unmapping but perform it just before the folios actually become used, out of buddy or pcp. However, we should cancel the pending by LUF and perform the deferred TLB flush right away when: 1. a writable pte is newly set through fault handler 2. a file is updated 3. kasan needs poisoning on free 4. 
the kernel wants to init pages on free No matter what type of workload is used for performance evaluation, the result would be positive thanks to the unconditional reduction of tlb flushes, tlb misses and interrupts. For the test, I picked up one of the most popular and heavy workload, llama.cpp that is a LLM(Large Language Model) inference engine. The result would depend on memory latency and how often reclaim runs, which implies tlb miss overhead and how many times unmapping happens. In my system, the result shows: 1. tlb shootdown interrupts are reduced about 97%. 2. The test program runtime is reduced about 4.5%. The test environment and the test set are like: Machine: bare metal, x86_64, Intel(R) Xeon(R) Gold 6430 CPU: 1 socket 64 core with hyper thread on Numa: 2 nodes (64 CPUs DRAM 42GB, no CPUs CXL expander 98GB) Config: swap off, numa balancing tiering on, demotion enabled llama.cpp/main -m $(70G_model1) -p "who are you?" -s 1 -t 15 -n 20 & llama.cpp/main -m $(70G_model2) -p "who are you?" -s 1 -t 15 -n 20 & llama.cpp/main -m $(70G_model3) -p "who are you?" -s 1 -t 15 -n 20 & wait where, -t: nr of threads, -s: seed used to make the runtime stable, -n: nr of tokens that determines the runtime, -p: prompt to ask, -m: LLM model to use. Run the test set 5 times successively with caches dropped every run via 'echo 3 > /proc/sys/vm/drop_caches'. Each inference prints its runtime at the end of each. The results are like: 1. Runtime from the output of llama.cpp BEFORE ------ llama_print_timings: total time =3D 883450.54 ms / 24 tokens llama_print_timings: total time =3D 861665.91 ms / 24 tokens llama_print_timings: total time =3D 898079.02 ms / 24 tokens llama_print_timings: total time =3D 879897.69 ms / 24 tokens llama_print_timings: total time =3D 892360.75 ms / 24 tokens llama_print_timings: total time =3D 884587.85 ms / 24 tokens llama_print_timings: total time =3D 861023.19 ms / 24 tokens llama_print_timings: total time =3D 900022.18 ms / 24 tokens llama_print_timings: total time =3D 878771.88 ms / 24 tokens llama_print_timings: total time =3D 889027.98 ms / 24 tokens llama_print_timings: total time =3D 880783.90 ms / 24 tokens llama_print_timings: total time =3D 856475.29 ms / 24 tokens llama_print_timings: total time =3D 896842.21 ms / 24 tokens llama_print_timings: total time =3D 878883.53 ms / 24 tokens llama_print_timings: total time =3D 890122.10 ms / 24 tokens AFTER ----- llama_print_timings: total time =3D 871060.86 ms / 24 tokens llama_print_timings: total time =3D 825609.53 ms / 24 tokens llama_print_timings: total time =3D 836854.81 ms / 24 tokens llama_print_timings: total time =3D 843147.99 ms / 24 tokens llama_print_timings: total time =3D 831426.65 ms / 24 tokens llama_print_timings: total time =3D 873939.23 ms / 24 tokens llama_print_timings: total time =3D 826127.69 ms / 24 tokens llama_print_timings: total time =3D 835489.26 ms / 24 tokens llama_print_timings: total time =3D 842589.62 ms / 24 tokens llama_print_timings: total time =3D 833700.66 ms / 24 tokens llama_print_timings: total time =3D 875996.19 ms / 24 tokens llama_print_timings: total time =3D 826401.73 ms / 24 tokens llama_print_timings: total time =3D 839341.28 ms / 24 tokens llama_print_timings: total time =3D 841075.10 ms / 24 tokens llama_print_timings: total time =3D 835136.41 ms / 24 tokens 2. 
tlb shootdowns from 'cat /proc/interrupts' BEFORE ------ TLB: 80911532 93691786 100296251 111062810 109769109 109862429 108968588 119175230 115779676 118377498 119325266 120300143 124514185 116697222 121068466 118031913 122660681 117494403 121819907 116960596 120936335 117217061 118630217 122322724 119595577 111693298 119232201 120030377 115334687 113179982 118808254 116353592 140987367 137095516 131724276 139742240 136501150 130428761 127585535 132483981 133430250 133756207 131786710 126365824 129812539 133850040 131742690 125142213 128572830 132234350 131945922 128417707 133355434 129972846 126331823 134050849 133991626 121129038 124637283 132830916 126875507 122322440 125776487 124340278 TLB shootdowns AFTER ----- TLB: 2121206 2615108 2983494 2911950 3055086 3092672 3204894 3346082 3286744 3307310 3357296 3315940 3428034 3112596 3143325 3185551 3186493 3322314 3330523 3339663 3156064 3272070 3296309 3198962 3332662 3315870 3234467 3353240 3281234 3300666 3345452 3173097 4009196 3932215 3898735 3726531 3717982 3671726 3728788 3724613 3799147 3691764 3620630 3684655 3666688 3393974 3448651 3487593 3446357 3618418 3671920 3712949 3575264 3715385 3641513 3630897 3691047 3630690 3504933 3662647 3629926 3443044 3832970 3548813 TLB shootdowns Signed-off-by: Byungchul Park --- include/asm-generic/tlb.h | 5 ++ include/linux/fs.h | 12 +++- include/linux/mm_types.h | 6 ++ include/linux/sched.h | 9 +++ kernel/sched/core.c | 1 + mm/internal.h | 94 ++++++++++++++++++++++++- mm/memory.c | 15 ++++ mm/pgtable-generic.c | 2 + mm/rmap.c | 141 +++++++++++++++++++++++++++++++++++--- mm/truncate.c | 55 +++++++++++++-- mm/vmscan.c | 12 +++- 11 files changed, 333 insertions(+), 19 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b756..4a99351be111e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -549,6 +549,11 @@ static inline void tlb_start_vma(struct mmu_gather *tl= b, struct vm_area_struct * =20 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_stru= ct *vma) { + /* + * Don't leave stale tlb entries for this vma. + */ + luf_flush(0); + if (tlb->fullmm) return; =20 diff --git a/include/linux/fs.h b/include/linux/fs.h index bfd8aaeb78bb8..ec88270221bfe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -499,8 +499,18 @@ static inline int mapping_write_begin(struct file *fil= e, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - return mapping->a_ops->write_begin(file, mapping, pos, len, foliop, + int ret; + + ret =3D mapping->a_ops->write_begin(file, mapping, pos, len, foliop, fsdata); + + /* + * Ensure to clean stale tlb entries for this mapping. 
+ */ + if (!ret) + luf_flush(0); + + return ret; } =20 static inline int mapping_write_end(struct file *file, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 39a6b5124b01f..b3eb5a4e45efb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1270,6 +1270,12 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, s= truct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct= *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); =20 +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) +void luf_flush(unsigned short luf_key); +#else +static inline void luf_flush(unsigned short luf_key) {} +#endif + struct vm_fault; =20 /** diff --git a/include/linux/sched.h b/include/linux/sched.h index a217d6011fdfe..94321d51b91e8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1382,6 +1382,15 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; struct tlbflush_unmap_batch tlb_ubc_takeoff; struct tlbflush_unmap_batch tlb_ubc_ro; + struct tlbflush_unmap_batch tlb_ubc_luf; + +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) + /* + * whether all the mappings of a folio during unmap are read-only + * so that luf can work on the folio + */ + bool can_luf; +#endif =20 /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 719e0ed1e9761..aea08d8a9e258 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5225,6 +5225,7 @@ static struct rq *finish_task_switch(struct task_stru= ct *prev) if (mm) { membarrier_mm_sync_core_before_usermode(mm); mmdrop_lazy_tlb_sched(mm); + luf_flush(0); } =20 if (unlikely(prev_state =3D=3D TASK_DEAD)) { diff --git a/mm/internal.h b/mm/internal.h index 0dc374553f9b5..fe4a1c174895f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1604,13 +1604,105 @@ static inline bool non_luf_pages_ok(struct zone *z= one) =20 return nr_free - nr_luf_pages > min_wm; } -#else + +unsigned short fold_unmap_luf(void); + +/* + * Reset the indicator indicating there are no writable mappings at the + * beginning of every rmap traverse for unmap. luf can work only when + * all the mappings are read-only. + */ +static inline void can_luf_init(struct folio *f) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + current->can_luf =3D false; + /* + * Pages might get updated inside buddy. + */ + else if (want_init_on_free()) + current->can_luf =3D false; + /* + * Pages might get updated inside buddy. + */ + else if (!should_skip_kasan_poison(folio_page(f, 0))) + current->can_luf =3D false; + /* + * XXX: Remove the constraint once luf handles zone device folio. + */ + else if (unlikely(folio_is_zone_device(f))) + current->can_luf =3D false; + /* + * XXX: Remove the constraint once luf handles hugetlb folio. + */ + else if (unlikely(folio_test_hugetlb(f))) + current->can_luf =3D false; + /* + * XXX: Remove the constraint once luf handles large folio. + */ + else if (unlikely(folio_test_large(f))) + current->can_luf =3D false; + /* + * Can track write of anon folios through fault handler. + */ + else if (folio_test_anon(f)) + current->can_luf =3D true; + /* + * Can track write of file folios through page cache or truncation. + */ + else if (folio_mapping(f)) + current->can_luf =3D true; + /* + * For niehter anon nor file folios, do not apply luf. + */ + else + current->can_luf =3D false; +} + +/* + * Mark the folio is not applicable to luf once it found a writble or + * dirty pte during rmap traverse for unmap. 
+ */ +static inline void can_luf_fail(void) +{ + current->can_luf =3D false; +} + +/* + * Check if all the mappings are read-only. + */ +static inline bool can_luf_test(void) +{ + return current->can_luf; +} + +static inline bool can_luf_vma(struct vm_area_struct *vma) +{ + /* + * Shared region requires a medium like file to keep all the + * associated mm_struct. luf makes use of strcut address_space + * for that purpose. + */ + if (vma->vm_flags & VM_SHARED) + return !!vma->vm_file; + + /* + * Private region can be handled through its mm_struct. + */ + return true; +} +#else /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ static inline bool luf_takeoff_start(void) { return false; } static inline void luf_takeoff_end(void) {} static inline bool luf_takeoff_no_shootdown(void) { return true; } static inline bool luf_takeoff_check(struct page *page) { return true; } static inline bool luf_takeoff_check_and_fold(struct page *page) { return = true; } static inline bool non_luf_pages_ok(struct zone *zone) { return true; } +static inline unsigned short fold_unmap_luf(void) { return 0; } + +static inline void can_luf_init(struct folio *f) {} +static inline void can_luf_fail(void) {} +static inline bool can_luf_test(void) { return false; } +static inline bool can_luf_vma(struct vm_area_struct *vma) { return false;= } #endif =20 /* pagewalk.c */ diff --git a/mm/memory.c b/mm/memory.c index 209885a4134f7..0e85c49bc5028 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6081,6 +6081,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma= , unsigned long address, struct mm_struct *mm =3D vma->vm_mm; vm_fault_t ret; bool is_droppable; + bool flush =3D false; =20 __set_current_state(TASK_RUNNING); =20 @@ -6106,6 +6107,14 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vm= a, unsigned long address, =20 lru_gen_enter_fault(vma); =20 + /* + * Any potential cases that make pte writable even forcely + * should be considered. + */ + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE) || + flags & FAULT_FLAG_WRITE) + flush =3D true; + if (unlikely(is_vm_hugetlb_page(vma))) ret =3D hugetlb_fault(vma->vm_mm, vma, address, flags); else @@ -6137,6 +6146,12 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vm= a, unsigned long address, out: mm_account_fault(mm, regs, address, flags, ret); =20 + /* + * Ensure to clean stale tlb entries for this vma. + */ + if (flush) + luf_flush(0); + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37a..215d8d93560fd 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -99,6 +99,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsign= ed long address, pte =3D ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); + else + luf_flush(0); return pte; } #endif diff --git a/mm/rmap.c b/mm/rmap.c index 3ed6234dd777e..0aaf02b1b34c3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -646,7 +646,7 @@ static atomic_long_t luf_ugen =3D ATOMIC_LONG_INIT(LUF_= UGEN_INIT); /* * Don't return invalid luf_ugen, zero. */ -static unsigned long __maybe_unused new_luf_ugen(void) +static unsigned long new_luf_ugen(void) { unsigned long ugen =3D atomic_long_inc_return(&luf_ugen); =20 @@ -723,7 +723,7 @@ static atomic_t luf_kgen =3D ATOMIC_INIT(1); /* * Don't return invalid luf_key, zero. 
*/ -static unsigned short __maybe_unused new_luf_key(void) +static unsigned short new_luf_key(void) { unsigned short luf_key =3D atomic_inc_return(&luf_kgen); =20 @@ -776,6 +776,7 @@ void try_to_unmap_flush_takeoff(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; =20 if (!tlb_ubc_takeoff->flush_required) @@ -793,9 +794,72 @@ void try_to_unmap_flush_takeoff(void) if (arch_tlbbatch_done(&tlb_ubc_ro->arch, &tlb_ubc_takeoff->arch)) reset_batch(tlb_ubc_ro); =20 + if (arch_tlbbatch_done(&tlb_ubc_luf->arch, &tlb_ubc_takeoff->arch)) + reset_batch(tlb_ubc_luf); + reset_batch(tlb_ubc_takeoff); } =20 +/* + * Should be called just before try_to_unmap_flush() to optimize the tlb + * shootdown using arch_tlbbatch_done(). + */ +unsigned short fold_unmap_luf(void) +{ + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; + struct luf_batch *lb; + unsigned long new_ugen; + unsigned short new_key; + unsigned long flags; + + if (!tlb_ubc_luf->flush_required) + return 0; + + /* + * fold_unmap_luf() is always followed by try_to_unmap_flush(). + */ + if (arch_tlbbatch_done(&tlb_ubc_luf->arch, &tlb_ubc->arch)) { + tlb_ubc_luf->flush_required =3D false; + tlb_ubc_luf->writable =3D false; + } + + /* + * Check again after shrinking. + */ + if (!tlb_ubc_luf->flush_required) + return 0; + + new_ugen =3D new_luf_ugen(); + new_key =3D new_luf_key(); + + /* + * Update the next entry of luf_batch table, that is the oldest + * entry among the candidate, hopefully tlb flushes have been + * done for all of the CPUs. + */ + lb =3D &luf_batch[new_key]; + write_lock_irqsave(&lb->lock, flags); + __fold_luf_batch(lb, tlb_ubc_luf, new_ugen); + write_unlock_irqrestore(&lb->lock, flags); + + reset_batch(tlb_ubc_luf); + return new_key; +} + +void luf_flush(unsigned short luf_key) +{ + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct luf_batch *lb =3D &luf_batch[luf_key]; + unsigned long flags; + + read_lock_irqsave(&lb->lock, flags); + fold_batch(tlb_ubc, &lb->batch, false); + read_unlock_irqrestore(&lb->lock, flags); + try_to_unmap_flush(); +} +EXPORT_SYMBOL(luf_flush); + /* * Flush TLB entries for recently unmapped pages from remote CPUs. 
It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -806,8 +870,10 @@ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; =20 fold_batch(tlb_ubc, tlb_ubc_ro, true); + fold_batch(tlb_ubc, tlb_ubc_luf, true); if (!tlb_ubc->flush_required) return; =20 @@ -820,8 +886,9 @@ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; =20 - if (tlb_ubc->writable || tlb_ubc_ro->writable) + if (tlb_ubc->writable || tlb_ubc_ro->writable || tlb_ubc_luf->writable) try_to_unmap_flush(); } =20 @@ -836,7 +903,8 @@ void try_to_unmap_flush_dirty(void) (TLB_FLUSH_BATCH_PENDING_MASK / 2) =20 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, - unsigned long uaddr) + unsigned long uaddr, + struct vm_area_struct *vma) { struct tlbflush_unmap_batch *tlb_ubc; int batch; @@ -845,7 +913,16 @@ static void set_tlb_ubc_flush_pending(struct mm_struct= *mm, pte_t pteval, if (!pte_accessible(mm, pteval)) return; =20 - if (pte_write(pteval)) + if (can_luf_test()) { + /* + * luf cannot work with the folio once it found a + * writable or dirty mapping on it. + */ + if (pte_write(pteval) || !can_luf_vma(vma)) + can_luf_fail(); + } + + if (!can_luf_test()) tlb_ubc =3D ¤t->tlb_ubc; else tlb_ubc =3D ¤t->tlb_ubc_ro; @@ -853,6 +930,21 @@ static void set_tlb_ubc_flush_pending(struct mm_struct= *mm, pte_t pteval, arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required =3D true; =20 + if (can_luf_test()) { + struct luf_batch *lb; + unsigned long flags; + + /* + * Accumulate to the 0th entry right away so that + * luf_flush(0) can be uesed to properly perform pending + * TLB flush once this unmapping is observed. + */ + lb =3D &luf_batch[0]; + write_lock_irqsave(&lb->lock, flags); + __fold_luf_batch(lb, tlb_ubc, new_luf_ugen()); + write_unlock_irqrestore(&lb->lock, flags); + } + /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. @@ -907,6 +999,8 @@ static bool should_defer_flush(struct mm_struct *mm, en= um ttu_flags flags) * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchr= onise * via the PTL. + * + * LUF(Lazy Unmap Flush) also relies on this for mprotect/munmap/etc. */ void flush_tlb_batched_pending(struct mm_struct *mm) { @@ -916,6 +1010,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) =20 if (pending !=3D flushed) { arch_flush_tlb_batched_pending(mm); + /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. @@ -926,7 +1021,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm) } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, - unsigned long uaddr) + unsigned long uaddr, + struct vm_area_struct *vma) { } =20 @@ -1292,6 +1388,11 @@ int folio_mkclean(struct folio *folio) =20 rmap_walk(folio, &rwc); =20 + /* + * Ensure to clean stale tlb entries for this mapping. 
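fold_unmap_luf() and luf_flush() above communicate through a small keyed table: the batch built during reclaim is folded into a slot, the 16-bit key of that slot is what later travels with the freed pages, and whoever needs the flush completed looks the slot up by key. The toy model below captures that indirection only; the table size, the bitmask cpuset and every name are assumptions made for the sketch (the series folds the slot into the caller's batch under a lock rather than clearing it).

#include <assert.h>
#include <stdint.h>

typedef uint64_t cpuset_t;

#define NR_SLOTS 16     /* the series sizes its table by a 16-bit key; keep it tiny here */

struct slot {
        cpuset_t cpus;          /* CPUs still owing a flush for this slot */
        unsigned long ugen;     /* generation of the newest folded request */
};

static struct slot table[NR_SLOTS];
static unsigned long ugen_counter;
static unsigned int key_counter;

/* Fold a pending CPU set into the next slot and hand back its key. */
static unsigned int fold_pending(cpuset_t cpus)
{
        unsigned int key = ++key_counter % NR_SLOTS;

        table[key].cpus |= cpus;
        table[key].ugen = ++ugen_counter;
        return key;
}

/* Complete the deferred shootdown recorded under @key. */
static cpuset_t flush_key(unsigned int key)
{
        cpuset_t cpus = table[key].cpus;

        /* A real kernel would IPI @cpus before reusing the memory. */
        table[key].cpus = 0;
        return cpus;
}

int main(void)
{
        unsigned int key = fold_pending(0x5);   /* CPUs 0 and 2 owe a flush */

        assert(flush_key(key) == 0x5);
        assert(flush_key(key) == 0);            /* nothing left the second time */
        return 0;
}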
+ */ + luf_flush(0); + return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); @@ -1961,7 +2062,7 @@ static bool try_to_unmap_one(struct folio *folio, str= uct vm_area_struct *vma, */ pteval =3D ptep_get_and_clear(mm, address, pvmw.pte); =20 - set_tlb_ubc_flush_pending(mm, pteval, address); + set_tlb_ubc_flush_pending(mm, pteval, address, vma); } else { pteval =3D ptep_clear_flush(vma, address, pvmw.pte); } @@ -2132,6 +2233,8 @@ static bool try_to_unmap_one(struct folio *folio, str= uct vm_area_struct *vma, =20 mmu_notifier_invalidate_range_end(&range); =20 + if (!ret) + can_luf_fail(); return ret; } =20 @@ -2164,11 +2267,21 @@ void try_to_unmap(struct folio *folio, enum ttu_fla= gs flags) .done =3D folio_not_mapped, .anon_lock =3D folio_lock_anon_vma_read, }; + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; + + can_luf_init(folio); =20 if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); + + if (can_luf_test()) + fold_batch(tlb_ubc_luf, tlb_ubc_ro, true); + else + fold_batch(tlb_ubc, tlb_ubc_ro, true); } =20 /* @@ -2338,7 +2451,7 @@ static bool try_to_migrate_one(struct folio *folio, s= truct vm_area_struct *vma, */ pteval =3D ptep_get_and_clear(mm, address, pvmw.pte); =20 - set_tlb_ubc_flush_pending(mm, pteval, address); + set_tlb_ubc_flush_pending(mm, pteval, address, vma); } else { pteval =3D ptep_clear_flush(vma, address, pvmw.pte); } @@ -2494,6 +2607,8 @@ static bool try_to_migrate_one(struct folio *folio, s= truct vm_area_struct *vma, =20 mmu_notifier_invalidate_range_end(&range); =20 + if (!ret) + can_luf_fail(); return ret; } =20 @@ -2513,6 +2628,9 @@ void try_to_migrate(struct folio *folio, enum ttu_fla= gs flags) .done =3D folio_not_mapped, .anon_lock =3D folio_lock_anon_vma_read, }; + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; + struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; =20 /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and @@ -2537,10 +2655,17 @@ void try_to_migrate(struct folio *folio, enum ttu_f= lags flags) if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma =3D invalid_migration_vma; =20 + can_luf_init(folio); + if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); + + if (can_luf_test()) + fold_batch(tlb_ubc_luf, tlb_ubc_ro, true); + else + fold_batch(tlb_ubc, tlb_ubc_ro, true); } =20 #ifdef CONFIG_DEVICE_PRIVATE diff --git a/mm/truncate.c b/mm/truncate.c index e5151703ba04a..14618c53f1910 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -124,6 +124,11 @@ void folio_invalidate(struct folio *folio, size_t offs= et, size_t length) =20 if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); + + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); } EXPORT_SYMBOL_GPL(folio_invalidate); =20 @@ -161,6 +166,11 @@ int truncate_inode_folio(struct address_space *mapping= , struct folio *folio) =20 truncate_cleanup_folio(folio); filemap_remove_folio(folio); + + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); return 0; } =20 @@ -206,6 +216,12 @@ bool truncate_inode_partial_folio(struct folio *folio,= loff_t start, loff_t end) =20 if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); + + /* + * Ensure to clean stale tlb entries for this mapping. 
+ */ + luf_flush(0); + if (!folio_test_large(folio)) return true; if (split_folio(folio) =3D=3D 0) @@ -247,19 +263,28 @@ EXPORT_SYMBOL(generic_error_remove_folio); */ long mapping_evict_folio(struct address_space *mapping, struct folio *foli= o) { + long ret =3D 0; + /* The page may have been truncated before it was locked */ if (!mapping) - return 0; + goto out; if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return 0; + goto out; /* The refcount will be elevated if any page in the folio is mapped */ if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) - return 0; + goto out; if (!filemap_release_folio(folio, 0)) - return 0; + goto out; =20 - return remove_mapping(mapping, folio); + ret =3D remove_mapping(mapping, folio); +out: + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); + + return ret; } =20 /** @@ -299,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *m= apping, bool same_folio; =20 if (mapping_empty(mapping)) - return; + goto out; =20 /* * 'start' and 'end' always covers the range of pages to be fully @@ -387,6 +412,12 @@ void truncate_inode_pages_range(struct address_space *= mapping, truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); } + +out: + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); } EXPORT_SYMBOL(truncate_inode_pages_range); =20 @@ -502,6 +533,11 @@ unsigned long mapping_try_invalidate(struct address_sp= ace *mapping, folio_batch_release(&fbatch); cond_resched(); } + + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); return count; } =20 @@ -594,7 +630,7 @@ int invalidate_inode_pages2_range(struct address_space = *mapping, int did_range_unmap =3D 0; =20 if (mapping_empty(mapping)) - return 0; + goto out; =20 folio_batch_init(&fbatch); index =3D start; @@ -664,6 +700,11 @@ int invalidate_inode_pages2_range(struct address_space= *mapping, if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } +out: + /* + * Ensure to clean stale tlb entries for this mapping. + */ + luf_flush(0); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); diff --git a/mm/vmscan.c b/mm/vmscan.c index 2970a8f35d3d3..ffc4a48710f1d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -821,6 +821,8 @@ static int __remove_mapping(struct address_space *mappi= ng, struct folio *folio, */ long remove_mapping(struct address_space *mapping, struct folio *folio) { + long ret =3D 0; + if (__remove_mapping(mapping, folio, false, NULL)) { /* * Unfreezing the refcount with 1 effectively @@ -828,9 +830,15 @@ long remove_mapping(struct address_space *mapping, str= uct folio *folio) * atomic operation. */ folio_ref_unfreeze(folio, 1); - return folio_nr_pages(folio); + ret =3D folio_nr_pages(folio); } - return 0; + + /* + * Ensure to clean stale tlb entries for this mapping. 
+ */ + luf_flush(0); + + return ret; } =20 /** --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 4573E1E7640 for ; Thu, 20 Feb 2025 05:36:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029769; cv=none; b=PgHekLAL6nA/9X6Q9Ikmq5NzyPdqUb1RXbsy5C91oUbKw197U1JcQhNH5aTZH+CLkx8ojgMBonyqacRFpHjWv3tNteSTALzHQ3Q7t5WkZUoqjwFhlC5422+xOORbAtUlpt4LRRpXhW/gnMypPf/92PmEAdxgttnOiUuI7aTTHQs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029769; c=relaxed/simple; bh=Q7vnjTLWOrbiW5f59YTqwg06V+yzI/mytdoniBJqdJ8=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=DD9ld4URQexQR1X8XwKzOtaqnZETjwPnbdOK6m/EXKExA7MFap5boCCoXgIljZ0TtCLn7fuUsYA0iMtH8i+Ikv38TSTEhdkxwRBbSisvjF9Ae+xuLxdqojcTDpnl7AklaAr2TkorhaRmWSGIfwQOFqpmvzbaey43aK/+AthSBUQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-03-67b6bba6c75d From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 17/26] x86/tlb, riscv/tlb, arm64/tlbflush, mm: remove cpus from tlb shootdown that already have been done Date: Thu, 20 Feb 2025 14:20:18 +0900 Message-Id: <20250220052027.58847-18-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrOLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDeZuZLGYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ PbsvsBU8qa14/WwqUwPj3PQuRk4OCQETiTfLzrHA2F9mLGAGsdkE1CVu3PgJZosImEkcbP3D DmIzC9xlkjjQz9bFyMUhLNDIKHH4xEEmkASLgKrEuv+v2UBsXqCGc58eM0IMlZdYveEA2CBO oPiPGb1gNUICphLvFlxiAhkkIfCZTeJg30KoKyQlDq64wTKBkXcBI8MqRqHMvLLcxMwcE72M yrzMCr3k/NxNjMDgX1b7J3oH46cLwYcYBTgYlXh4Z7RuSxdiTSwrrsw9xCjBwawkwttWvyVd iDclsbIqtSg/vqg0J7X4EKM0B4uSOK/Rt/IUIYH0xJLU7NTUgtQimCwTB6dUA+OqMpaTN4MW c7xftXZpbLcUx/1zL7OvnlyydMHLAgub/EtrTi7xLzk5fUHY/ZSU7Enl9raXnW9v2MCUwHjM 5dGhhpSwNY9zbqd/qCtNz4q8YxTFWub7qKyThVtlmcfvXUq3Z14/c2jl/DXmbWVywT4PrI9n yxkVzDBxmvNjq3pe1fzMLtV28XIlluKMREMt5qLiRAA4o4pdegIAAA== X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrHLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g6OHzSzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAldGz+wJbwZPaitfPpjI1MM5N72Lk5JAQMJH4MmMBM4jNJqAucePGTzBbRMBM 4mDrH3YQm1ngLpPEgX62LkYuDmGBRkaJwycOMoEkWARUJdb9f80GYvMCNZz79JgRYqi8xOoN 
B8AGcQLFf8zoBasREjCVeLfgEtMERq4FjAyrGEUy88pyEzNzTPWKszMq8zIr9JLzczcxAkN5 We2fiTsYv1x2P8QowMGoxMP74PHWdCHWxLLiytxDjBIczEoivG31W9KFeFMSK6tSi/Lji0pz UosPMUpzsCiJ83qFpyYICaQnlqRmp6YWpBbBZJk4OKUaGJXPXDF0FNj1XHTD0eXRaa9NzyTe 239kpdSGmb2Oq5V7tsZNOF7I2XUtUqF8YYOVdVH3vsZrSl8kH5hOioh2XKOeH/I46KLt4/hd hYxndmkLbPuUUFti65WzPd7LIGJrfdfb91fZr220MgwJ3a34pb3/2eTNQnoqb++8dMvm0d3O uiRgQe9TNiWW4oxEQy3mouJEAG9p8vZhAgAA X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" luf mechanism performs tlb shootdown for mappings that have been unmapped in lazy manner. However, it doesn't have to perform tlb shootdown to cpus that already have been done by others since the tlb shootdown was desired. Since luf already introduced its own generation number used as a global timestamp, luf_ugen, it's possible to selectively pick cpus that have been done tlb flush required. This patch introduced APIs that use the generation number to select and remove those cpus so that it can perform tlb shootdown with a smaller cpumask, for all the CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH archs, x86, riscv, and arm64. Signed-off-by: Byungchul Park --- arch/arm64/include/asm/tlbflush.h | 26 +++++++ arch/riscv/include/asm/tlbflush.h | 4 ++ arch/riscv/mm/tlbflush.c | 108 ++++++++++++++++++++++++++++++ arch/x86/include/asm/tlbflush.h | 4 ++ arch/x86/mm/tlb.c | 108 ++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + mm/internal.h | 4 ++ mm/page_alloc.c | 32 +++++++-- mm/rmap.c | 46 ++++++++++++- 9 files changed, 327 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlb= flush.h index a62e1ea61e4af..f8290bec32e01 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -354,6 +354,32 @@ static inline void arch_tlbbatch_flush(struct arch_tlb= flush_unmap_batch *batch) dsb(ish); } =20 +static inline bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_bat= ch *batch, unsigned long ugen) +{ + /* + * Nothing is needed in this architecture. + */ + return true; +} + +static inline bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *ba= tch, unsigned long ugen) +{ + /* + * Nothing is needed in this architecture. 
+ */ + return true; +} + +static inline void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batc= h *batch, unsigned long ugen) +{ + /* nothing to do */ +} + +static inline void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long u= gen) +{ + /* nothing to do */ +} + static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *b= atch) { /* nothing to do */ diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlb= flush.h index 1dc7d30273d59..ec5caeb3cf8ef 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -65,6 +65,10 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unma= p_batch *batch, unsigned long uaddr); void arch_flush_tlb_batched_pending(struct mm_struct *mm); void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_batch *batch, uns= igned long ugen); +bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, unsigned = long ugen); +void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batch, unsi= gned long ugen); +void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen); =20 static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *b= atch) { diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 36f996af6256c..93afb7a299003 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -202,3 +202,111 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_b= atch *batch) __flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE); } + +static DEFINE_PER_CPU(atomic_long_t, ugen_done); + +static int __init luf_init_arch(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + atomic_long_set(per_cpu_ptr(&ugen_done, cpu), LUF_UGEN_INIT - 1); + + return 0; +} +early_initcall(luf_init_arch); + +/* + * batch will not be updated. + */ +bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + goto out; + + for_each_cpu(cpu, &batch->cpumask) { + unsigned long done; + + done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); + if (ugen_before(done, ugen)) + return false; + } + return true; +out: + return cpumask_empty(&batch->cpumask); +} + +bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + goto out; + + for_each_cpu(cpu, &batch->cpumask) { + unsigned long done; + + done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); + if (!ugen_before(done, ugen)) + cpumask_clear_cpu(cpu, &batch->cpumask); + } +out: + return cpumask_empty(&batch->cpumask); +} + +void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + return; + + for_each_cpu(cpu, &batch->cpumask) { + atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); + unsigned long old =3D atomic_long_read(done); + + /* + * It's racy. The race results in unnecessary tlb flush + * because of the smaller ugen_done than it should be. + * However, it's okay in terms of correctness. + */ + if (!ugen_before(old, ugen)) + continue; + + /* + * It's for optimization. Just skip on fail than retry. 
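arch_tlbbatch_check_done() and arch_tlbbatch_diet() above compare generations with ugen_before(), which is defined elsewhere in the series. Assuming it follows the usual wrap-safe signed-difference idiom (the same trick the kernel's time_before() relies on), it can be modeled and sanity-checked in user space as below; ugen_before_model() is a name invented for this sketch.

#include <assert.h>
#include <limits.h>

/* Stand-alone sketch: wrap-safe "a comes before b" for unsigned generations. */
static inline int ugen_before_model(unsigned long a, unsigned long b)
{
        /* The signed difference stays correct across counter wrap-around. */
        return (long)(a - b) < 0;
}

int main(void)
{
        assert(ugen_before_model(1, 2));
        assert(!ugen_before_model(2, 2));
        /* Still ordered correctly right after the counter wraps. */
        assert(ugen_before_model(ULONG_MAX, 1));
        return 0;
}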
+ */ + atomic_long_cmpxchg(done, old, ugen); + } +} + +void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen) +{ + int cpu; + + if (!ugen) + return; + + for_each_cpu(cpu, mm_cpumask(mm)) { + atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); + unsigned long old =3D atomic_long_read(done); + + /* + * It's racy. The race results in unnecessary tlb flush + * because of the smaller ugen_done than it should be. + * However, it's okay in terms of correctness. + */ + if (!ugen_before(old, ugen)) + continue; + + /* + * It's for optimization. Just skip on fail than retry. + */ + atomic_long_cmpxchg(done, old, ugen); + } +} diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflus= h.h index 0ae9564c7301e..1fc5bacd72dff 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -293,6 +293,10 @@ static inline void arch_flush_tlb_batched_pending(stru= ct mm_struct *mm) } =20 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +extern bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_batch *bat= ch, unsigned long ugen); +extern bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, un= signed long ugen); +extern void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batc= h, unsigned long ugen); +extern void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen); =20 static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *b= atch) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 860e49b223fd7..975f58fa4b30f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1240,6 +1240,114 @@ void __flush_tlb_all(void) } EXPORT_SYMBOL_GPL(__flush_tlb_all); =20 +static DEFINE_PER_CPU(atomic_long_t, ugen_done); + +static int __init luf_init_arch(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + atomic_long_set(per_cpu_ptr(&ugen_done, cpu), LUF_UGEN_INIT - 1); + + return 0; +} +early_initcall(luf_init_arch); + +/* + * batch will not be updated. + */ +bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + goto out; + + for_each_cpu(cpu, &batch->cpumask) { + unsigned long done; + + done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); + if (ugen_before(done, ugen)) + return false; + } + return true; +out: + return cpumask_empty(&batch->cpumask); +} + +bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + goto out; + + for_each_cpu(cpu, &batch->cpumask) { + unsigned long done; + + done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); + if (!ugen_before(done, ugen)) + cpumask_clear_cpu(cpu, &batch->cpumask); + } +out: + return cpumask_empty(&batch->cpumask); +} + +void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batch, + unsigned long ugen) +{ + int cpu; + + if (!ugen) + return; + + for_each_cpu(cpu, &batch->cpumask) { + atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); + unsigned long old =3D atomic_long_read(done); + + /* + * It's racy. The race results in unnecessary tlb flush + * because of the smaller ugen_done than it should be. + * However, it's okay in terms of correctness. + */ + if (!ugen_before(old, ugen)) + continue; + + /* + * It's for optimization. Just skip on fail than retry. 
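The per-CPU ugen_done counters above carry the whole optimization: each CPU records the newest generation it has flushed through, so a pending shootdown can be checked (arch_tlbbatch_check_done) or trimmed (arch_tlbbatch_diet) against those records and only CPUs that are genuinely behind receive an IPI. A user-space model of the trimming step, with a bitmask standing in for the cpumask and invented names, looks like this:

#include <assert.h>
#include <stdint.h>

#define NR_CPUS_MODEL 4

typedef uint64_t cpuset_t;

/* Newest generation each CPU is known to have flushed through. */
static unsigned long ugen_done[NR_CPUS_MODEL];

static int before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

/* Drop CPUs that already flushed past @ugen; return 1 if none remain. */
static int diet(cpuset_t *cpus, unsigned long ugen)
{
        for (int cpu = 0; cpu < NR_CPUS_MODEL; cpu++) {
                if (!(*cpus & ((cpuset_t)1 << cpu)))
                        continue;
                if (!before(ugen_done[cpu], ugen))
                        *cpus &= ~((cpuset_t)1 << cpu);
        }
        return *cpus == 0;
}

int main(void)
{
        cpuset_t pending = 0xf;         /* CPUs 0-3 pending at generation 10 */

        ugen_done[1] = 12;              /* CPU 1 flushed past 10 already */
        ugen_done[3] = 10;              /* CPU 3 flushed exactly at 10: done too */
        assert(!diet(&pending, 10));
        assert(pending == 0x5);         /* only CPUs 0 and 2 still need an IPI */
        return 0;
}

The racy cmpxchg in the hunk above is tolerable for the reason its comments give: losing the race only leaves ugen_done smaller than it could be, which at worst costs an extra flush, never a missed one.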
+ */ + atomic_long_cmpxchg(done, old, ugen); + } +} + +void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen) +{ + int cpu; + + if (!ugen) + return; + + for_each_cpu(cpu, mm_cpumask(mm)) { + atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); + unsigned long old =3D atomic_long_read(done); + + /* + * It's racy. The race results in unnecessary tlb flush + * because of the smaller ugen_done than it should be. + * However, it's okay in terms of correctness. + */ + if (!ugen_before(old, ugen)) + continue; + + /* + * It's for optimization. Just skip on fail than retry. + */ + atomic_long_cmpxchg(done, old, ugen); + } +} + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { struct flush_tlb_info *info; diff --git a/include/linux/sched.h b/include/linux/sched.h index 94321d51b91e8..5c6c4fd021973 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1377,6 +1377,7 @@ struct task_struct { #if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) int luf_no_shootdown; int luf_takeoff_started; + unsigned long luf_ugen; #endif =20 struct tlbflush_unmap_batch tlb_ubc; diff --git a/mm/internal.h b/mm/internal.h index fe4a1c174895f..77657c17af204 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1246,6 +1246,7 @@ void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void try_to_unmap_flush_takeoff(void); void flush_tlb_batched_pending(struct mm_struct *mm); +void reset_batch(struct tlbflush_unmap_batch *batch); void fold_batch(struct tlbflush_unmap_batch *dst, struct tlbflush_unmap_ba= tch *src, bool reset); void fold_luf_batch(struct luf_batch *dst, struct luf_batch *src); #else @@ -1261,6 +1262,9 @@ static inline void try_to_unmap_flush_takeoff(void) static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } +static inline void reset_batch(struct tlbflush_unmap_batch *batch) +{ +} static inline void fold_batch(struct tlbflush_unmap_batch *dst, struct tlb= flush_unmap_batch *src, bool reset) { } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 65acc437d8387..3032fedd8392b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -668,9 +668,11 @@ bool luf_takeoff_start(void) */ void luf_takeoff_end(void) { + struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; unsigned long flags; bool no_shootdown; bool outmost =3D false; + unsigned long cur_luf_ugen; =20 local_irq_save(flags); VM_WARN_ON(!current->luf_takeoff_started); @@ -697,10 +699,19 @@ void luf_takeoff_end(void) if (no_shootdown) goto out; =20 + cur_luf_ugen =3D current->luf_ugen; + + current->luf_ugen =3D 0; + + if (cur_luf_ugen && arch_tlbbatch_diet(&tlb_ubc_takeoff->arch, cur_luf_ug= en)) + reset_batch(tlb_ubc_takeoff); + try_to_unmap_flush_takeoff(); out: - if (outmost) + if (outmost) { VM_WARN_ON(current->luf_no_shootdown); + VM_WARN_ON(current->luf_ugen); + } } =20 /* @@ -757,6 +768,7 @@ bool luf_takeoff_check_and_fold(struct page *page) struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; unsigned short luf_key =3D page_luf_key(page); struct luf_batch *lb; + unsigned long lb_ugen; unsigned long flags; =20 /* @@ -770,13 +782,25 @@ bool luf_takeoff_check_and_fold(struct page *page) if (!luf_key) return true; =20 - if (current->luf_no_shootdown) - return false; - lb =3D &luf_batch[luf_key]; read_lock_irqsave(&lb->lock, flags); + lb_ugen =3D lb->ugen; + + if (arch_tlbbatch_check_done(&lb->batch.arch, lb_ugen)) { + read_unlock_irqrestore(&lb->lock, flags); + return true; + } + + if (current->luf_no_shootdown) { + 
read_unlock_irqrestore(&lb->lock, flags); + return false; + } + fold_batch(tlb_ubc_takeoff, &lb->batch, false); read_unlock_irqrestore(&lb->lock, flags); + + if (!current->luf_ugen || ugen_before(current->luf_ugen, lb_ugen)) + current->luf_ugen =3D lb_ugen; return true; } #endif diff --git a/mm/rmap.c b/mm/rmap.c index 0aaf02b1b34c3..cf6667fb18fe2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -656,7 +656,7 @@ static unsigned long new_luf_ugen(void) return ugen; } =20 -static void reset_batch(struct tlbflush_unmap_batch *batch) +void reset_batch(struct tlbflush_unmap_batch *batch) { arch_tlbbatch_clear(&batch->arch); batch->flush_required =3D false; @@ -743,8 +743,14 @@ static void __fold_luf_batch(struct luf_batch *dst_lb, * more tlb shootdown might be needed to fulfill the newer * request. Conservertively keep the newer one. */ - if (!dst_lb->ugen || ugen_before(dst_lb->ugen, src_ugen)) + if (!dst_lb->ugen || ugen_before(dst_lb->ugen, src_ugen)) { + /* + * Good chance to shrink the batch using the old ugen. + */ + if (dst_lb->ugen && arch_tlbbatch_diet(&dst_lb->batch.arch, dst_lb->ugen= )) + reset_batch(&dst_lb->batch); dst_lb->ugen =3D src_ugen; + } fold_batch(&dst_lb->batch, src_batch, false); } =20 @@ -772,17 +778,45 @@ void fold_luf_batch(struct luf_batch *dst, struct luf= _batch *src) read_unlock_irqrestore(&src->lock, flags); } =20 +static unsigned long tlb_flush_start(void) +{ + /* + * Memory barrier implied in the atomic operation prevents + * reading luf_ugen from happening after the following + * tlb flush. + */ + return new_luf_ugen(); +} + +static void tlb_flush_end(struct arch_tlbflush_unmap_batch *arch, + struct mm_struct *mm, unsigned long ugen) +{ + /* + * Prevent the following marking from placing prior to the + * actual tlb flush. + */ + smp_mb(); + + if (arch) + arch_tlbbatch_mark_ugen(arch, ugen); + if (mm) + arch_mm_mark_ugen(mm, ugen); +} + void try_to_unmap_flush_takeoff(void) { struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; + unsigned long ugen; =20 if (!tlb_ubc_takeoff->flush_required) return; =20 + ugen =3D tlb_flush_start(); arch_tlbbatch_flush(&tlb_ubc_takeoff->arch); + tlb_flush_end(&tlb_ubc_takeoff->arch, NULL, ugen); =20 /* * Now that tlb shootdown of tlb_ubc_takeoff has been performed, @@ -871,13 +905,17 @@ void try_to_unmap_flush(void) struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc_ro =3D ¤t->tlb_ubc_ro; struct tlbflush_unmap_batch *tlb_ubc_luf =3D ¤t->tlb_ubc_luf; + unsigned long ugen; =20 fold_batch(tlb_ubc, tlb_ubc_ro, true); fold_batch(tlb_ubc, tlb_ubc_luf, true); if (!tlb_ubc->flush_required) return; =20 + ugen =3D tlb_flush_start(); arch_tlbbatch_flush(&tlb_ubc->arch); + tlb_flush_end(&tlb_ubc->arch, NULL, ugen); + reset_batch(tlb_ubc); } =20 @@ -1009,7 +1047,11 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed =3D batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; =20 if (pending !=3D flushed) { + unsigned long ugen; + + ugen =3D tlb_flush_start(); arch_flush_tlb_batched_pending(mm); + tlb_flush_end(NULL, mm, ugen); =20 /* * If the new TLB flushing is pending during flushing, leave --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 991FD1E8345 for ; Thu, 20 Feb 2025 05:36:06 +0000 
(UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029769; cv=none; b=WKmWhMcRv6aqYLPIgN4+MNLEFxs5N28bc/O6zX+SEo6hBVndpumdcH61Fulikg6sM/l8R+a9GaX9vqABEwmKXXXH1uUzs2rtTSuBr2gbUEhp8cyIRwWGhSd051U3ZA04KWEHXuPu7o9k6spRJu2ANfXcXn7vsLVPxY3hPQX0oUs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029769; c=relaxed/simple; bh=ibhtPcxsan2ORPeRajX4nLWvkXK8XgQ6EXtn+fJovlo=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=m610iB0JRmJ4UFyisG6/ucIzliGHZijfJWMeLpXHVQEhP4ygtcVBs5Af3BvmOMzTJyptSuxA2GIEs5Kz7ndT3GXCDutllvue/duXygROXjalSjDoGM6maAQ1NALi0ysedJRXaNbLDlXNrIl9yUC2/4HeUOKg4gmwZPqRJ0oxk0Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-08-67b6bba767e5 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 18/26] mm/page_alloc: retry 3 times to take pcp pages on luf check failure Date: Thu, 20 Feb 2025 14:20:19 +0900 Message-Id: <20250220052027.58847-19-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrMLMWRmVeSWpSXmKPExsXC9ZZnke7y3dvSDVpesVnMWb+GzeLzhn9s Fi82tDNafF3/i9ni6ac+FovLu+awWdxb85/V4vyutawWO5buY7K4dGABk8Xx3gNMFvPvfWaz 2LxpKrPF8SlTGS1+/wAqPjlrMouDgMf31j4Wj52z7rJ7LNhU6rF5hZbH4j0vmTw2repk89j0 aRK7x7tz59g9Tsz4zeIx72Sgx/t9V9k8tv6y82iceo3N4/MmuQC+KC6blNSczLLUIn27BK6M XV9+MhU84ao4/HITcwPjFo4uRg4OCQETiS13hGDM010uXYycHGwC6hI3bvxkBrFFBMwkDrb+ YQexmQXuMkkc6GcDsYUFkiROz3vPCmKzCKhK/H3ezAgyhheofntzGEhYQkBeYvWGA2BjOIHC P2b0grUKCZhKvFtwiamLkQuo5jObxKSnS1kgGiQlDq64wTKBkXcBI8MqRqHMvLLcxMwcE72M yrzMCr3k/NxNjMCgX1b7J3oH46cLwYcYBTgYlXh4Z7RuSxdiTSwrrsw9xCjBwawkwttWvyVd iDclsbIqtSg/vqg0J7X4EKM0B4uSOK/Rt/IUIYH0xJLU7NTUgtQimCwTB6dUA+PSYMcWnQkq 7Eafp5ns4pp3XCGXu9bNfsrEWeuPl4YzZ9k4K92Iv+lY9+hlytktKwxude141fXmy+2qJIFv 69zWvJ0a8ff0S4sPt+skprOLHdzm43bid3Db+hs2a/5qpu3geLvm7iPxdae2svjqPnaY1XFi WdZfma1VBjdWzvvBrmO4cNPuj18SlViKMxINtZiLihMB+JLdZ3YCAAA= X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrNLMWRmVeSWpSXmKPExsXC5WfdrLts97Z0g6lTrC3mrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlbHry0+mgidcFYdfbmJuYNzC0cXIwSEhYCJxusuli5GTg01AXeLGjZ/MILaI gJnEwdY/7CA2s8BdJokD/WwgtrBAksTpee9ZQWwWAVWJv8+bGUHG8ALVb28OAwlLCMhLrN5w AGwMJ1D4x4xesFYhAVOJdwsuMU1g5FrAyLCKUSQzryw3MTPHVK84O6MyL7NCLzk/dxMjMIiX 1f6ZuIPxy2X3Q4wCHIxKPLwPHm9NF2JNLCuuzD3EKMHBrCTC21a/JV2INyWxsiq1KD++qDQn tfgQozQHi5I4r1d4aoKQQHpiSWp2ampBahFMlomDU6qBMTvRs3+fScG/3O+HxW8ZCC/efUyM /ZMf/63QKu5Io7NbK9s1ii00Lu30ktBIKT43yfWpspkrS8+nnFspUZH+mv2f/ZZsn3WbcdHu 
nrVrtUV/mrbsPtWb96pmXlXEjdXG7TMdUn1SUua6aX8q8wpMORdo4daxhWOHEqP83cTvtUci tG+bTK5TYinOSDTUYi4qTgQA9wFB2l4CAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Signed-off-by: Byungchul Park --- mm/page_alloc.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3032fedd8392b..0b6e7f235c4a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3339,6 +3339,12 @@ struct page *__rmqueue_pcplist(struct zone *zone, un= signed int order, { struct page *page; =20 + /* + * give up taking page from pcp if it fails to take pcp page + * 3 times due to the tlb shootdownable issue. + */ + int try_luf_pages =3D 3; + do { if (list_empty(list)) { int batch =3D nr_pcp_alloc(pcp, zone, order); @@ -3353,11 +3359,21 @@ struct page *__rmqueue_pcplist(struct zone *zone, u= nsigned int order, return NULL; } =20 - page =3D list_first_entry(list, struct page, pcp_list); - if (!luf_takeoff_check_and_fold(page)) + list_for_each_entry(page, list, pcp_list) { + if (luf_takeoff_check_and_fold(page)) { + list_del(&page->pcp_list); + pcp->count -=3D 1 << order; + break; + } + if (!--try_luf_pages) + return NULL; + } + + /* + * If all the pages in the list fails... + */ + if (list_entry_is_head(page, list, pcp_list)) return NULL; - list_del(&page->pcp_list); - pcp->count -=3D 1 << order; } while (check_new_pages(page, order)); =20 return page; --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id E21951E9919 for ; Thu, 20 Feb 2025 05:36:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029771; cv=none; b=VinOnqMiZtIDAj2GuFQGNBEKyPgELnvcY9HTX81WGc2ZsZsHGFr5BNqO6O/2HJXRsV3hZfe0+ZFkXpeuFTh7D1e4wSVypqSCqLZ8ZCxsdrvIgYOPs901cbUcz4VDXKgZ3CN6vXBthX5FQHxEaOBVAPZgjPXu8LkzxU5syqUKxDc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029771; c=relaxed/simple; bh=JbvgcH015+NA/RjyJRa1CVDlVMj9X3fQL4RUWiaByBM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=J0nRGOBVMRVThUy5Pp2YrJHWG088xrMSriYM+exeZEuw8CtHMrqI4SW+h9O+U/7IdE9knXCprtI63uR+gzUaWyDvGw20nSrpv9/PikhgAYDvvHgDSSVTfKHwki5SWkLe4j8kyVEq1tUWvB5dUwbx8LbKtvhMi0oE12s0pbrCe/U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-0d-67b6bba748fa From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 19/26] mm: skip luf tlb flush for luf'd mm that already has been done Date: Thu, 20 Feb 2025 14:20:20 +0900 Message-Id: 
<20250220052027.58847-20-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrKLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDWbM4bWYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ K7ofsRacta3YcNyhgbHdqIuRk0NCwESieeZ5Zhj76YtjTCA2m4C6xI0bP8HiIgJmEgdb/7CD 2MwCd5kkDvSzgdjCArESs3asZgGxWQRUJd7sOwVWzwtU3/nhFdRMeYnVGw6A2ZxA8R8zesF6 hQRMJd4tuAS0iwuo5j2bxLYzH6EaJCUOrrjBMoGRdwEjwypGocy8stzEzBwTvYzKvMwKveT8 3E2MwMBfVvsnegfjpwvBhxgFOBiVeHhntG5LF2JNLCuuzD3EKMHBrCTC21a/JV2INyWxsiq1 KD++qDQntfgQozQHi5I4r9G38hQhgfTEktTs1NSC1CKYLBMHp1QD49p/iQEPVnjsjfuRceze +ic5Rclr1rxfW5dq35Tnm73DpiN2Tl7cIbHeqbz8npyZH0QSY9sX1bE4iyi8TuQN1Nfj03rt p+lhWPKi5cefyrW7L213EX8vETnzq3plkq3QJRu2W66B0g7JHh5Pw7d+2+9tcWfRn4cNsRXL WHmfVwj+2OT07MEUJZbijERDLeai4kQAqLvAnngCAAA= X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrDLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0g1PtzBZz1q9hs/i84R+b xYsN7YwWX9f/YrZ4+qmPxeLw3JOsFpd3zWGzuLfmP6vF+V1rWS12LN3HZHHpwAImi+O9B5gs 5t/7zGaxedNUZovjU6YyWvz+AVR8ctZkFgdBj++tfSweO2fdZfdYsKnUY/MKLY/Fe14yeWxa 1cnmsenTJHaPd+fOsXucmPGbxWPeyUCP9/uusnksfvGByWPrLzuPxqnX2Dw+b5IL4I/isklJ zcksSy3St0vgyljR/Yi14KxtxYbjDg2M7UZdjJwcEgImEk9fHGMCsdkE1CVu3PjJDGKLCJhJ HGz9ww5iMwvcZZI40M8GYgsLxErM2rGaBcRmEVCVeLPvFFg9L1B954dXzBAz5SVWbzgAZnMC xX/M6AXrFRIwlXi34BLTBEauBYwMqxhFMvPKchMzc0z1irMzKvMyK/SS83M3MQLDeFntn4k7 GL9cdj/EKMDBqMTD++Dx1nQh1sSy4srcQ4wSHMxKIrxt9VvShXhTEiurUovy44tKc1KLDzFK c7AoifN6hacmCAmkJ5akZqemFqQWwWSZODilGhgVkh5HP29I0P5TYnToldmyfXmbbaZdK/y8 1e7HjO2sKQe+ann5Lz3otuF5kXndQdW0JlXlYJf5/Tsfl9yO+f7pOaM9i1NtV8HRDp/eqU9P /uudv+LdXRX+t8KPf2raLta9t/iK0+8zGpOsD7198Siu6fP3qvdlMlO9ZqYqxPtZP3EQC+DL c61RYinOSDTUYi4qTgQATtMI518CAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Fault hander performs tlb flush pended by luf when a new pte becomes to have write permission, no matter whether tlb flush required has been performed or not. By storing luf generation number, luf_ugen, in struct mm_struct, we can skip unnecessary tlb flush. Signed-off-by: Byungchul Park --- include/asm-generic/tlb.h | 2 +- include/linux/mm_types.h | 9 +++++ kernel/fork.c | 1 + kernel/sched/core.c | 2 +- mm/memory.c | 22 ++++++++++-- mm/pgtable-generic.c | 2 +- mm/rmap.c | 74 +++++++++++++++++++++++++++++++++++++-- 7 files changed, 104 insertions(+), 8 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4a99351be111e..94b329a5127a7 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -552,7 +552,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, = struct vm_area_struct *vm /* * Don't leave stale tlb entries for this vma. 
*/ - luf_flush(0); + luf_flush_vma(vma); =20 if (tlb->fullmm) return; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b3eb5a4e45efb..8de4c190ad514 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -38,8 +38,10 @@ struct luf_batch { unsigned long ugen; rwlock_t lock; }; +void luf_batch_init(struct luf_batch *lb); #else struct luf_batch {}; +static inline void luf_batch_init(struct luf_batch *lb) {} #endif =20 /* @@ -1022,6 +1024,9 @@ struct mm_struct { * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; + + /* luf batch for this mm */ + struct luf_batch luf_batch; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ atomic_t tlb_flush_batched; @@ -1272,8 +1277,12 @@ extern void tlb_finish_mmu(struct mmu_gather *tlb); =20 #if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) void luf_flush(unsigned short luf_key); +void luf_flush_mm(struct mm_struct *mm); +void luf_flush_vma(struct vm_area_struct *vma); #else static inline void luf_flush(unsigned short luf_key) {} +static inline void luf_flush_mm(struct mm_struct *mm) {} +static inline void luf_flush_vma(struct vm_area_struct *vma) {} #endif =20 struct vm_fault; diff --git a/kernel/fork.c b/kernel/fork.c index 0061cf2450efd..593e74235ea8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1268,6 +1268,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm= , struct task_struct *p, memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); + luf_batch_init(&mm->luf_batch); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aea08d8a9e258..c7665cb93f617 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5225,7 +5225,7 @@ static struct rq *finish_task_switch(struct task_stru= ct *prev) if (mm) { membarrier_mm_sync_core_before_usermode(mm); mmdrop_lazy_tlb_sched(mm); - luf_flush(0); + luf_flush_mm(mm); } =20 if (unlikely(prev_state =3D=3D TASK_DEAD)) { diff --git a/mm/memory.c b/mm/memory.c index 0e85c49bc5028..b02f86b1adb91 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6081,6 +6081,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma= , unsigned long address, struct mm_struct *mm =3D vma->vm_mm; vm_fault_t ret; bool is_droppable; + struct address_space *mapping =3D NULL; bool flush =3D false; =20 __set_current_state(TASK_RUNNING); @@ -6112,9 +6113,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vm= a, unsigned long address, * should be considered. */ if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE) || - flags & FAULT_FLAG_WRITE) + flags & FAULT_FLAG_WRITE) { flush =3D true; =20 + /* + * Doesn't care the !VM_SHARED cases because it won't + * update the pages that might be shared with others. + */ + if (vma->vm_flags & VM_SHARED && vma->vm_file) + mapping =3D vma->vm_file->f_mapping; + } + if (unlikely(is_vm_hugetlb_page(vma))) ret =3D hugetlb_fault(vma->vm_mm, vma, address, flags); else @@ -6149,8 +6158,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vm= a, unsigned long address, /* * Ensure to clean stale tlb entries for this vma. */ - if (flush) - luf_flush(0); + if (flush) { + /* + * If it has a VM_SHARED mapping, all the mms involved + * should be luf_flush'ed. 
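What the new mm->luf_batch buys is scoping: rather than draining the global key-0 batch on every fault that may grant write access, the fault path only has to cover flushes recorded against this mm, and even those can be skipped when every CPU involved has already flushed past the recorded generation, which is the arch_tlbbatch_diet() check that luf_flush_mm() performs further down in this patch. A minimal model of that per-object decision, with invented names and a bitmask cpuset, is sketched below.

#include <assert.h>
#include <stdint.h>

typedef uint64_t cpuset_t;

/* Per-object record of deferred shootdowns (per mm_struct in the series). */
struct scope {
        cpuset_t cpus;
        unsigned long ugen;
};

/* Newest generation each CPU is known to have flushed through. */
static unsigned long cpu_done[2];

/* Record a deferred unmap seen on @cpu against @s at generation @ugen. */
static void scope_record(struct scope *s, int cpu, unsigned long ugen)
{
        s->cpus |= (cpuset_t)1 << cpu;
        if ((long)(s->ugen - ugen) < 0)
                s->ugen = ugen;
}

/* Return 1 if a fault granting write access still has to flush for @s. */
static int scope_needs_flush(const struct scope *s)
{
        for (int cpu = 0; cpu < 2; cpu++) {
                if (!(s->cpus & ((cpuset_t)1 << cpu)))
                        continue;
                if ((long)(cpu_done[cpu] - s->ugen) < 0)
                        return 1;       /* this CPU is still behind */
        }
        return 0;
}

int main(void)
{
        struct scope mm_scope = { 0, 0 };

        scope_record(&mm_scope, 0, 7);
        assert(scope_needs_flush(&mm_scope));   /* CPU 0 has not flushed yet */
        cpu_done[0] = 9;                        /* another flush covered generation 7 */
        assert(!scope_needs_flush(&mm_scope));  /* the fault path may skip the IPI */
        return 0;
}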
+ */ + if (mapping) + luf_flush(0); + luf_flush_mm(mm); + } =20 return ret; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 215d8d93560fd..5a876c1c93a80 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -100,7 +100,7 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsi= gned long address, if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); else - luf_flush(0); + luf_flush_vma(vma); return pte; } #endif diff --git a/mm/rmap.c b/mm/rmap.c index cf6667fb18fe2..e0304dc74c3a7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -695,7 +695,7 @@ void fold_batch(struct tlbflush_unmap_batch *dst, */ struct luf_batch luf_batch[NR_LUF_BATCH]; =20 -static void luf_batch_init(struct luf_batch *lb) +void luf_batch_init(struct luf_batch *lb) { rwlock_init(&lb->lock); reset_batch(&lb->batch); @@ -778,6 +778,31 @@ void fold_luf_batch(struct luf_batch *dst, struct luf_= batch *src) read_unlock_irqrestore(&src->lock, flags); } =20 +static void fold_luf_batch_mm(struct luf_batch *dst, + struct mm_struct *mm) +{ + unsigned long flags; + bool need_fold =3D false; + + read_lock_irqsave(&dst->lock, flags); + if (arch_tlbbatch_need_fold(&dst->batch.arch, mm)) + need_fold =3D true; + read_unlock(&dst->lock); + + write_lock(&dst->lock); + if (unlikely(need_fold)) + arch_tlbbatch_add_pending(&dst->batch.arch, mm, 0); + + /* + * dst->ugen represents sort of request for tlb shootdown. The + * newer it is, the more tlb shootdown might be needed to + * fulfill the newer request. Keep the newest one not to miss + * necessary tlb shootdown. + */ + dst->ugen =3D new_luf_ugen(); + write_unlock_irqrestore(&dst->lock, flags); +} + static unsigned long tlb_flush_start(void) { /* @@ -894,6 +919,49 @@ void luf_flush(unsigned short luf_key) } EXPORT_SYMBOL(luf_flush); =20 +void luf_flush_vma(struct vm_area_struct *vma) +{ + struct mm_struct *mm; + struct address_space *mapping =3D NULL; + + if (!vma) + return; + + mm =3D vma->vm_mm; + /* + * Doesn't care the !VM_SHARED cases because it won't + * update the pages that might be shared with others. + */ + if (vma->vm_flags & VM_SHARED && vma->vm_file) + mapping =3D vma->vm_file->f_mapping; + + if (mapping) + luf_flush(0); + luf_flush_mm(mm); +} + +void luf_flush_mm(struct mm_struct *mm) +{ + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct luf_batch *lb; + unsigned long flags; + unsigned long lb_ugen; + + if (!mm) + return; + + lb =3D &mm->luf_batch; + read_lock_irqsave(&lb->lock, flags); + fold_batch(tlb_ubc, &lb->batch, false); + lb_ugen =3D lb->ugen; + read_unlock_irqrestore(&lb->lock, flags); + + if (arch_tlbbatch_diet(&tlb_ubc->arch, lb_ugen)) + return; + + try_to_unmap_flush(); +} + /* * Flush TLB entries for recently unmapped pages from remote CPUs. 
It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -962,8 +1030,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struc= t *mm, pte_t pteval, =20 if (!can_luf_test()) tlb_ubc =3D ¤t->tlb_ubc; - else + else { tlb_ubc =3D ¤t->tlb_ubc_ro; + fold_luf_batch_mm(&mm->luf_batch, mm); + } =20 arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required =3D true; --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 5AAEA1EB181 for ; Thu, 20 Feb 2025 05:36:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; cv=none; b=XxCXZplPuRTMcnLx589IbwJhq/KKWZjmVdyjzxUhpY+ZTuH2qjMEj1qUXB/HHeKK2UeHs5+ZF8M9UeD6rvfYBKRt5nanH3D43XH7vNfT0dFwGb9RVMeGDjXH+y5XztJnwFxNg+N0KBx0dLBwMXMBtMdDLJGnuntZDE+YhtV6jRY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; c=relaxed/simple; bh=aNFdWmd/IwFyCddRF7YVe0SPx1Jlwdld/eJ0DPmsUqs=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=k/1CPTuIcLJL6TMxjYHgtg0GYPsqLQEPcdIVsJtokHCDw/z9I9Axwi4r2sgqVRDCGGlCWr2iU58jvN/8c0+c8wsSmgEpVGSs358nJXAuQ4ksMLU9llBwcR0VTBzYRLcf6JjpeZBoxPFdUdVo4CRGk6RdAKA+OkNgLIm2zPfBtlc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-12-67b6bba718ad From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 20/26] mm, fs: skip tlb flushes for luf'd filemap that already has been done Date: Thu, 20 Feb 2025 14:20:21 +0900 Message-Id: <20250220052027.58847-21-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrKLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDZp3CFrMWb+GzeLzhn9s Fi82tDNafF3/i9ni6ac+FovLu+awWdxb85/V4vyutawWO5buY7K4dGABk8Xx3gNMFvPvfWaz 2LxpKrPF8SlTGS1+/wAqPjlrMouDgMf31j4Wj52z7rJ7LNhU6rF5hZbH4j0vmTw2repk89j0 aRK7x7tz59g9Tsz4zeIx72Sgx/t9V9k8tv6y82iceo3N4/MmuQC+KC6blNSczLLUIn27BK6M pQ8WMBdcCqg4uG8XWwPjWYcuRk4OCQETiRNP3zPC2NN3XGUBsdkE1CVu3PjJDGKLCJhJHGz9 ww5iMwvcZZI40M/WxcjBISyQInHgnQZImEVAVeJn4zEmEJsXqLx31TUmiJHyEqs3HAAbwwkU /zGjlw3EFhIwlXi34BJQDRdQzWc2ieOfnrFCNEhKHFxxg2UCI+8CRoZVjEKZeWW5iZk5JnoZ lXmZFXrJ+bmbGIGBv6z2T/QOxk8Xgg8xCnAwKvHwzmjdli7EmlhWXJl7iFGCg1lJhLetfku6 EG9KYmVValF+fFFpTmrxIUZpDhYlcV6jb+UpQgLpiSWp2ampBalFMFkmDk6pBsbci8EnElQl DrVo5kx1c7znJDjZ69em24H5IUe2TjyfVqzNoRQUsO3uk65EXcGqxWFWphq/r24SXFta/TWy eSrDwUOn/u+/EmHZvIRRacoZf73l8k0njgot+vV6vx1T0v4Kn9ky329dmBq9+Ijylr4/nEZX lj+fNYNR0ER60e6c4HjnyPrctNtKLMUZiYZazEXFiQAFu6/BeAIAAA== X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrDLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0g18f2S3mrF/DZvF5wz82 
ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlbH0wQLmgksBFQf37WJrYDzr0MXIySEhYCIxfcdVFhCbTUBd4saNn8wgtoiA mcTB1j/sIDazwF0miQP9bF2MHBzCAikSB95pgIRZBFQlfjYeYwKxeYHKe1ddY4IYKS+xesMB sDGcQPEfM3rZQGwhAVOJdwsuMU1g5FrAyLCKUSQzryw3MTPHVK84O6MyL7NCLzk/dxMjMIyX 1f6ZuIPxy2X3Q4wCHIxKPLwPHm9NF2JNLCuuzD3EKMHBrCTC21a/JV2INyWxsiq1KD++qDQn tfgQozQHi5I4r1d4aoKQQHpiSWp2ampBahFMlomDU6qBsXjb1Vn7X9zP7HIrCK5ZZTFlypzJ M4wau3MnvGIL6AqOfmtlybexSN9Q9+PB1faP5WZlvl7IqnFE4cNH9xUbJwj33597NlE3b8GD sNY5zREfZu1WevJla3F5+beOdU2GeQoro/xOf2ap12VSXiftE73WgEH6zsada+Z99zj4SIB/ t8weiZ7VN5RYijMSDbWYi4oTAR3U4EpfAgAA X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" For luf'd filemap, tlb shootdown is performed when updating page cache, no matter whether tlb flushes required already has been done or not. By storing luf meta data in struct address_space and updating the luf meta data properly, we can skip unnecessary tlb flush. Signed-off-by: Byungchul Park --- fs/inode.c | 1 + include/linux/fs.h | 4 ++- include/linux/mm_types.h | 2 ++ mm/memory.c | 4 +-- mm/rmap.c | 59 +++++++++++++++++++++++++--------------- mm/truncate.c | 14 +++++----- mm/vmscan.c | 2 +- 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 46fbd5b234822..e155e51be2d28 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -404,6 +404,7 @@ static void __address_space_init_once(struct address_sp= ace *mapping) init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); + luf_batch_init(&mapping->luf_batch); mapping->i_mmap =3D RB_ROOT_CACHED; } =20 diff --git a/include/linux/fs.h b/include/linux/fs.h index ec88270221bfe..0cc588c704cd1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -461,6 +461,7 @@ extern const struct address_space_operations empty_aops; * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space. + * @luf_batch: Data to track need of tlb flush by luf. */ struct address_space { struct inode *host; @@ -482,6 +483,7 @@ struct address_space { struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; + struct luf_batch luf_batch; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@ -508,7 +510,7 @@ static inline int mapping_write_begin(struct file *file, * Ensure to clean stale tlb entries for this mapping. 
*/ if (!ret) - luf_flush(0); + luf_flush_mapping(mapping); =20 return ret; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8de4c190ad514..c50cfc1c6282f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1279,10 +1279,12 @@ extern void tlb_finish_mmu(struct mmu_gather *tlb); void luf_flush(unsigned short luf_key); void luf_flush_mm(struct mm_struct *mm); void luf_flush_vma(struct vm_area_struct *vma); +void luf_flush_mapping(struct address_space *mapping); #else static inline void luf_flush(unsigned short luf_key) {} static inline void luf_flush_mm(struct mm_struct *mm) {} static inline void luf_flush_vma(struct vm_area_struct *vma) {} +static inline void luf_flush_mapping(struct address_space *mapping) {} #endif =20 struct vm_fault; diff --git a/mm/memory.c b/mm/memory.c index b02f86b1adb91..c98af5e567e89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6161,10 +6161,10 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *v= ma, unsigned long address, if (flush) { /* * If it has a VM_SHARED mapping, all the mms involved - * should be luf_flush'ed. + * in the struct address_space should be luf_flush'ed. */ if (mapping) - luf_flush(0); + luf_flush_mapping(mapping); luf_flush_mm(mm); } =20 diff --git a/mm/rmap.c b/mm/rmap.c index e0304dc74c3a7..0cb13e8fcd739 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -691,7 +691,7 @@ void fold_batch(struct tlbflush_unmap_batch *dst, #define NR_LUF_BATCH (1 << (sizeof(short) * 8)) =20 /* - * Use 0th entry as accumulated batch. + * XXX: Reserve the 0th entry for later use. */ struct luf_batch luf_batch[NR_LUF_BATCH]; =20 @@ -936,7 +936,7 @@ void luf_flush_vma(struct vm_area_struct *vma) mapping =3D vma->vm_file->f_mapping; =20 if (mapping) - luf_flush(0); + luf_flush_mapping(mapping); luf_flush_mm(mm); } =20 @@ -962,6 +962,29 @@ void luf_flush_mm(struct mm_struct *mm) try_to_unmap_flush(); } =20 +void luf_flush_mapping(struct address_space *mapping) +{ + struct tlbflush_unmap_batch *tlb_ubc =3D ¤t->tlb_ubc; + struct luf_batch *lb; + unsigned long flags; + unsigned long lb_ugen; + + if (!mapping) + return; + + lb =3D &mapping->luf_batch; + read_lock_irqsave(&lb->lock, flags); + fold_batch(tlb_ubc, &lb->batch, false); + lb_ugen =3D lb->ugen; + read_unlock_irqrestore(&lb->lock, flags); + + if (arch_tlbbatch_diet(&tlb_ubc->arch, lb_ugen)) + return; + + try_to_unmap_flush(); +} +EXPORT_SYMBOL(luf_flush_mapping); + /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -1010,7 +1033,8 @@ void try_to_unmap_flush_dirty(void) =20 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + struct address_space *mapping) { struct tlbflush_unmap_batch *tlb_ubc; int batch; @@ -1032,27 +1056,15 @@ static void set_tlb_ubc_flush_pending(struct mm_str= uct *mm, pte_t pteval, tlb_ubc =3D ¤t->tlb_ubc; else { tlb_ubc =3D ¤t->tlb_ubc_ro; + fold_luf_batch_mm(&mm->luf_batch, mm); + if (mapping) + fold_luf_batch_mm(&mapping->luf_batch, mm); } =20 arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required =3D true; =20 - if (can_luf_test()) { - struct luf_batch *lb; - unsigned long flags; - - /* - * Accumulate to the 0th entry right away so that - * luf_flush(0) can be uesed to properly perform pending - * TLB flush once this unmapping is observed. 
- */ - lb =3D &luf_batch[0]; - write_lock_irqsave(&lb->lock, flags); - __fold_luf_batch(lb, tlb_ubc, new_luf_ugen()); - write_unlock_irqrestore(&lb->lock, flags); - } - /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. @@ -1134,7 +1146,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm) #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + struct address_space *mapping) { } =20 @@ -1503,7 +1516,7 @@ int folio_mkclean(struct folio *folio) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); =20 return cleaned; } @@ -2037,6 +2050,7 @@ static bool try_to_unmap_one(struct folio *folio, str= uct vm_area_struct *vma, enum ttu_flags flags =3D (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz =3D 0; + struct address_space *mapping =3D folio_mapping(folio); =20 /* * When racing against e.g. zap_pte_range() on another cpu, @@ -2174,7 +2188,7 @@ static bool try_to_unmap_one(struct folio *folio, str= uct vm_area_struct *vma, */ pteval =3D ptep_get_and_clear(mm, address, pvmw.pte); =20 - set_tlb_ubc_flush_pending(mm, pteval, address, vma); + set_tlb_ubc_flush_pending(mm, pteval, address, vma, mapping); } else { pteval =3D ptep_clear_flush(vma, address, pvmw.pte); } @@ -2414,6 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, s= truct vm_area_struct *vma, enum ttu_flags flags =3D (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz =3D 0; + struct address_space *mapping =3D folio_mapping(folio); =20 /* * When racing against e.g. zap_pte_range() on another cpu, @@ -2563,7 +2578,7 @@ static bool try_to_migrate_one(struct folio *folio, s= truct vm_area_struct *vma, */ pteval =3D ptep_get_and_clear(mm, address, pvmw.pte); =20 - set_tlb_ubc_flush_pending(mm, pteval, address, vma); + set_tlb_ubc_flush_pending(mm, pteval, address, vma, mapping); } else { pteval =3D ptep_clear_flush(vma, address, pvmw.pte); } diff --git a/mm/truncate.c b/mm/truncate.c index 14618c53f1910..f9a3416610231 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -128,7 +128,7 @@ void folio_invalidate(struct folio *folio, size_t offse= t, size_t length) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(folio->mapping); } EXPORT_SYMBOL_GPL(folio_invalidate); =20 @@ -170,7 +170,7 @@ int truncate_inode_folio(struct address_space *mapping,= struct folio *folio) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); return 0; } =20 @@ -220,7 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, = loff_t start, loff_t end) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(folio->mapping); =20 if (!folio_test_large(folio)) return true; @@ -282,7 +282,7 @@ long mapping_evict_folio(struct address_space *mapping,= struct folio *folio) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); =20 return ret; } @@ -417,7 +417,7 @@ void truncate_inode_pages_range(struct address_space *m= apping, /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); =20 @@ -537,7 +537,7 @@ unsigned long mapping_try_invalidate(struct address_spa= ce *mapping, /* * Ensure to clean stale tlb entries for this mapping. 
*/ - luf_flush(0); + luf_flush_mapping(mapping); return count; } =20 @@ -704,7 +704,7 @@ int invalidate_inode_pages2_range(struct address_space = *mapping, /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); diff --git a/mm/vmscan.c b/mm/vmscan.c index ffc4a48710f1d..cbca027d2a10e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -836,7 +836,7 @@ long remove_mapping(struct address_space *mapping, stru= ct folio *folio) /* * Ensure to clean stale tlb entries for this mapping. */ - luf_flush(0); + luf_flush_mapping(mapping); =20 return ret; } --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 523EC1A9B4E for ; Thu, 20 Feb 2025 05:36:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; cv=none; b=dQV2fchm2If2pihomNY59U7/rwPv79qGkcgGV0djyWqTLT4LPbkXt5wvUWrDgZUrIJqnnGtmTYf8q+weeB5XsAKIsRV5TONqC2vSSq85Xv00k4iKqPcGIC8hZ9qjsqoD3gUErHjkznZlkgf+sc0vXdbWTI0MD7cg1+eHmrhKFAw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; c=relaxed/simple; bh=z0wUCn9bKKNR5vbYMEsxSaOb7Sjpa/9nbJ27NHSiqaQ=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=TNNg+5uPYxwP9zeuYuDT2cXZJ+G/iRU5OEkrb6h8odZKAH9fhXJ561tnjueZWBK7xNbLTDu6zf6uazpU6WS7G3Ov4mOcj1vV4+JtVZmHDIjWlyyhqJRH45Jy9gYy8RIv412q24yHnziC9ArDgR4YOz++kLWSasYopDEb9evAJWs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-17-67b6bba717eb From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 21/26] mm: perform luf tlb shootdown per zone in batched manner Date: Thu, 20 Feb 2025 14:20:22 +0900 Message-Id: <20250220052027.58847-22-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrBLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDea+ErOYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ f87tZy6438NUsfH/HMYGxj/XGbsYOTkkBEwkXnyfxw5jP7rxH8xmE1CXuHHjJzOILSJgJnGw 9Q9YnFngLpPEgX42EFtYIFzi99I7rCA2i4CqxLOna8Bm8gLVb7mxnxViprzE6g0HwOZwAsV/ zOgF6xUSMJV4t+ASUxcjF1DNZzaJjrXT2SAaJCUOrrjBMoGRdwEjwypGocy8stzEzBwTvYzK vMwKveT83E2MwPBfVvsnegfjpwvBhxgFOBiVeHhntG5LF2JNLCuuzD3EKMHBrCTC21a/JV2I NyWxsiq1KD++qDQntfgQozQHi5I4r9G38hQhgfTEktTs1NSC1CKYLBMHp1QDo9AnntNepS/s 7kW/2Mt863fB50MTE6tdz0Y/MMn58Czy+b+Zx6XWlpt92/7o0/xV/8Na9WSK188orXmTkFT/ 
+pJKs03MreLC/73HbiuVfir7z9M/WUcu3epBcXaR63Xngy5vH9ql5c1LPnI58qHqySLGwqU7 6gr4gn0NSvSz9q29ycyYu/WWqhJLcUaioRZzUXEiAFHrTcV7AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrPLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0gwv3uS3mrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlfHn3H7mgvs9TBUb/89hbGD8c52xi5GTQ0LAROLRjf/sIDabgLrEjRs/mUFs EQEziYOtf8DizAJ3mSQO9LOB2MIC4RK/l95hBbFZBFQlnj1dAzaHF6h+y439rBAz5SVWbzgA NocTKP5jRi9Yr5CAqcS7BZeYJjByLWBkWMUokplXlpuYmWOqV5ydUZmXWaGXnJ+7iREYzMtq /0zcwfjlsvshRgEORiUe3gePt6YLsSaWFVfmHmKU4GBWEuFtq9+SLsSbklhZlVqUH19UmpNa fIhRmoNFSZzXKzw1QUggPbEkNTs1tSC1CCbLxMEp1cA4q6dGsWb9c9kO+Qeu4XWnRHyb4hfe ZpRdye3SdWBJ/3KZB18P3k7YfqD2kfreKxq7DXfpNM74cIC/+NXpy0UL3TaqT6w04LX3e7Hg SnDk0teaCqni04494vftmr13a54j7z33iX9UYi+o+sRMkxTQN+zq4uIsZZull/4u1m/qmdx1 y965vJ2uxFKckWioxVxUnAgAJ6ln02ICAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Each luf page in buddy has its pending tlb shootdown information and performs the corresponding tlb shootdown on exit from buddy. However, every exit from buddy causes small but frequent IPIs. Even though total IPIs get reduced, unnecessary waits on conflict CPUs in IPI handler have been observed via perf profiling. Thus, made it perfrom luf tlb shootdown per zone in batched manner when pages exit from buddy so as to avoid frequent IPIs. Signed-off-by: Byungchul Park --- include/linux/mm.h | 44 ++++- include/linux/mm_types.h | 19 +- include/linux/mmzone.h | 9 + include/linux/sched.h | 2 + mm/compaction.c | 10 +- mm/internal.h | 13 +- mm/mm_init.c | 5 + mm/page_alloc.c | 363 +++++++++++++++++++++++++++++++-------- mm/page_reporting.c | 9 +- mm/rmap.c | 6 +- 10 files changed, 383 insertions(+), 97 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 53a5f1cb21e0d..46638e86e8073 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4161,12 +4161,16 @@ static inline int do_mseal(unsigned long start, siz= e_t len_in, unsigned long fla } #endif =20 -#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) /* * luf_ugen will start with 2 so that 1 can be regarded as a passed one. */ #define LUF_UGEN_INIT 2 +/* + * zone_ugen will start with 2 so that 1 can be regarded as done. + */ +#define ZONE_UGEN_INIT 2 =20 +#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) static inline bool ugen_before(unsigned long a, unsigned long b) { /* @@ -4177,7 +4181,11 @@ static inline bool ugen_before(unsigned long a, unsi= gned long b) =20 static inline unsigned long next_ugen(unsigned long ugen) { - if (ugen + 1) + /* + * Avoid zero even in unsigned short range so as to treat + * '(unsigned short)ugen =3D=3D 0' as invalid. + */ + if ((unsigned short)(ugen + 1)) return ugen + 1; /* * Avoid invalid ugen, zero. @@ -4187,7 +4195,11 @@ static inline unsigned long next_ugen(unsigned long = ugen) =20 static inline unsigned long prev_ugen(unsigned long ugen) { - if (ugen - 1) + /* + * Avoid zero even in unsigned short range so as to treat + * '(unsigned short)ugen =3D=3D 0' as invalid. + */ + if ((unsigned short)(ugen - 1)) return ugen - 1; /* * Avoid invalid ugen, zero. 
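As an illustration (not part of the patch), the generation helpers above compare with wraparound semantics and reserve 0 as "invalid" even when the value is truncated to unsigned short. A minimal standalone sketch, assuming ugen_before() uses the usual signed-difference trick (its body is elided by the hunk context above); the example_* names are hypothetical:

	/* Illustrative only -- not from the patch. */
	#include <stdbool.h>
	#include <stdio.h>

	/* Assumed semantics: "a is an older generation than b", wrap-safe. */
	static bool example_ugen_before(unsigned long a, unsigned long b)
	{
		return (long)(a - b) < 0;
	}

	/* Advance a generation, skipping 0 even in the unsigned short range. */
	static unsigned long example_next_ugen(unsigned long ugen)
	{
		if ((unsigned short)(ugen + 1))
			return ugen + 1;
		return ugen + 2;	/* 0 is reserved as "invalid" */
	}

	int main(void)
	{
		unsigned long g = 0xfffful;	/* low 16 bits about to wrap */

		g = example_next_ugen(g);	/* 0x10001, skipping 0x10000 */
		printf("%#lx %d\n", g, example_ugen_before(0xfffful, g));
		return 0;
	}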
@@ -4195,4 +4207,30 @@ static inline unsigned long prev_ugen(unsigned long = ugen) return ugen - 2; } #endif + +/* + * return the biggest ugen but it should be before the real zone_ugen. + */ +static inline unsigned long page_zone_ugen(struct zone *zone, struct page = *page) +{ + unsigned long zone_ugen =3D zone->zone_ugen; + unsigned short short_zone_ugen =3D page->zone_ugen; + unsigned long cand1, cand2; + + if (!short_zone_ugen) + return 0; + + cand1 =3D (zone_ugen & ~(unsigned long)USHRT_MAX) | short_zone_ugen; + cand2 =3D cand1 - USHRT_MAX - 1; + + if (!ugen_before(zone_ugen, cand1)) + return cand1; + + return cand2; +} + +static inline void set_page_zone_ugen(struct page *page, unsigned short zo= ne_ugen) +{ + page->zone_ugen =3D zone_ugen; +} #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c50cfc1c6282f..e3132e1e5e5d2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -132,11 +132,20 @@ struct page { */ unsigned short order; =20 - /* - * For tracking need of tlb flush, - * by luf(lazy unmap flush). - */ - unsigned short luf_key; + union { + /* + * For tracking need of + * tlb flush, by + * luf(lazy unmap flush). + */ + unsigned short luf_key; + + /* + * Casted zone_ugen with + * unsigned short. + */ + unsigned short zone_ugen; + }; }; }; }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ac3178b5fc50b..3c1b04d21fda9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -117,6 +117,7 @@ extern int page_group_by_mobility_disabled; struct free_area { struct list_head free_list[MIGRATE_TYPES]; struct list_head pend_list[MIGRATE_TYPES]; + unsigned long pend_zone_ugen[MIGRATE_TYPES]; unsigned long nr_free; }; =20 @@ -998,6 +999,14 @@ struct zone { atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; /* Count pages that need tlb shootdown on allocation */ atomic_long_t nr_luf_pages; + /* Generation number for that tlb shootdown has been done */ + unsigned long zone_ugen_done; + /* Generation number to control zone batched tlb shootdown */ + unsigned long zone_ugen; + /* Approximate latest luf_ugen that have ever entered */ + unsigned long luf_ugen; + /* Accumulated tlb batch for this zone */ + struct tlbflush_unmap_batch zone_batch; } ____cacheline_internodealigned_in_smp; =20 enum pgdat_flags { diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c6c4fd021973..463cb2fb8f919 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1378,6 +1378,8 @@ struct task_struct { int luf_no_shootdown; int luf_takeoff_started; unsigned long luf_ugen; + unsigned long zone_ugen; + unsigned long wait_zone_ugen; #endif =20 struct tlbflush_unmap_batch tlb_ubc; diff --git a/mm/compaction.c b/mm/compaction.c index 27f3d743762bb..a7f17867decae 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -653,7 +653,7 @@ static unsigned long isolate_freepages_block(struct com= pact_control *cc, goto isolate_fail; } =20 - if (!luf_takeoff_check(page)) + if (!luf_takeoff_check(cc->zone, page)) goto isolate_fail; =20 /* Found a free page, will break it into order-0 pages */ @@ -689,7 +689,7 @@ static unsigned long isolate_freepages_block(struct com= pact_control *cc, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(cc->zone); =20 /* * Be careful to not go outside of the pageblock. 
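As a worked example (not part of the patch), page_zone_ugen() added earlier in this patch recovers a full generation from the 16 bits stored in struct page by borrowing the high bits of zone->zone_ugen and stepping back one 64K window whenever the result would be newer than the zone's current generation. A standalone sketch with plain integers; the example_* name is hypothetical and the wrap-safe comparison is open-coded:

	/* Illustrative only -- mirrors the arithmetic of page_zone_ugen(). */
	#include <assert.h>

	#define EX_USHRT_MAX	0xfffful

	static unsigned long example_page_zone_ugen(unsigned long zone_ugen,
						    unsigned short page_ugen)
	{
		unsigned long cand1, cand2;

		if (!page_ugen)
			return 0;		/* 0 means "no pending shootdown" */

		/* Borrow the high bits of the zone's current generation. */
		cand1 = (zone_ugen & ~EX_USHRT_MAX) | page_ugen;
		cand2 = cand1 - EX_USHRT_MAX - 1;	/* one 64K window earlier */

		/* A page cannot have been stamped with a future generation. */
		if ((long)(zone_ugen - cand1) >= 0)
			return cand1;
		return cand2;
	}

	int main(void)
	{
		/* zone at 0x10005; a page stamped at 0x10003 keeps only 0x0003 */
		assert(example_page_zone_ugen(0x10005, 0x0003) == 0x10003);
		/* a page stamped at 0xfff0, before the wrap: previous window */
		assert(example_page_zone_ugen(0x10005, 0xfff0) == 0xfff0);
		return 0;
	}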
@@ -1611,7 +1611,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) order_scanned++; nr_scanned++; =20 - if (unlikely(consider_pend && !luf_takeoff_check(freepage))) + if (unlikely(consider_pend && !luf_takeoff_check(cc->zone, freepage))) goto scan_next; =20 pfn =3D page_to_pfn(freepage); @@ -1679,7 +1679,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(cc->zone); =20 /* Skip fast search if enough freepages isolated */ if (cc->nr_freepages >=3D cc->nr_migratepages) @@ -2415,7 +2415,7 @@ static enum compact_result compact_finished(struct co= mpact_control *cc) */ luf_takeoff_start(); ret =3D __compact_finished(cc); - luf_takeoff_end(); + luf_takeoff_end(cc->zone); =20 trace_mm_compaction_finished(cc->zone, cc->order, ret); if (ret =3D=3D COMPACT_NO_SUITABLE_PAGE) diff --git a/mm/internal.h b/mm/internal.h index 77657c17af204..e634eaf220f00 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1595,10 +1595,10 @@ static inline void accept_page(struct page *page) #if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) extern struct luf_batch luf_batch[]; bool luf_takeoff_start(void); -void luf_takeoff_end(void); +void luf_takeoff_end(struct zone *zone); bool luf_takeoff_no_shootdown(void); -bool luf_takeoff_check(struct page *page); -bool luf_takeoff_check_and_fold(struct page *page); +bool luf_takeoff_check(struct zone *zone, struct page *page); +bool luf_takeoff_check_and_fold(struct zone *zone, struct page *page); =20 static inline bool non_luf_pages_ok(struct zone *zone) { @@ -1608,7 +1608,6 @@ static inline bool non_luf_pages_ok(struct zone *zone) =20 return nr_free - nr_luf_pages > min_wm; } - unsigned short fold_unmap_luf(void); =20 /* @@ -1696,10 +1695,10 @@ static inline bool can_luf_vma(struct vm_area_struc= t *vma) } #else /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ static inline bool luf_takeoff_start(void) { return false; } -static inline void luf_takeoff_end(void) {} +static inline void luf_takeoff_end(struct zone *zone) {} static inline bool luf_takeoff_no_shootdown(void) { return true; } -static inline bool luf_takeoff_check(struct page *page) { return true; } -static inline bool luf_takeoff_check_and_fold(struct page *page) { return = true; } +static inline bool luf_takeoff_check(struct zone *zone, struct page *page)= { return true; } +static inline bool luf_takeoff_check_and_fold(struct zone *zone, struct pa= ge *page) { return true; } static inline bool non_luf_pages_ok(struct zone *zone) { return true; } static inline unsigned short fold_unmap_luf(void) { return 0; } =20 diff --git a/mm/mm_init.c b/mm/mm_init.c index 12b96cd6a87b0..58e616ceef52a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1397,6 +1397,7 @@ static void __meminit zone_init_free_lists(struct zon= e *zone) for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); INIT_LIST_HEAD(&zone->free_area[order].pend_list[t]); + zone->free_area[order].pend_zone_ugen[t] =3D ZONE_UGEN_INIT; zone->free_area[order].nr_free =3D 0; } =20 @@ -1404,6 +1405,10 @@ static void __meminit zone_init_free_lists(struct zo= ne *zone) INIT_LIST_HEAD(&zone->unaccepted_pages); #endif atomic_long_set(&zone->nr_luf_pages, 0); + zone->zone_ugen_done =3D ZONE_UGEN_INIT - 1; + zone->zone_ugen =3D ZONE_UGEN_INIT; + zone->luf_ugen =3D LUF_UGEN_INIT - 1; + reset_batch(&zone->zone_batch); } =20 void __meminit init_currently_empty_zone(struct zone *zone, diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index 0b6e7f235c4a1..b81931c6f2cfd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -663,16 +663,29 @@ bool luf_takeoff_start(void) return !no_shootdown; } =20 +static void wait_zone_ugen_done(struct zone *zone, unsigned long zone_ugen) +{ + while (ugen_before(READ_ONCE(zone->zone_ugen_done), zone_ugen)) + cond_resched(); +} + +static void set_zone_ugen_done(struct zone *zone, unsigned long zone_ugen) +{ + WRITE_ONCE(zone->zone_ugen_done, zone_ugen); +} + /* * Should be called within the same context of luf_takeoff_start(). */ -void luf_takeoff_end(void) +void luf_takeoff_end(struct zone *zone) { struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; unsigned long flags; bool no_shootdown; bool outmost =3D false; unsigned long cur_luf_ugen; + unsigned long cur_zone_ugen; + unsigned long cur_wait_zone_ugen; =20 local_irq_save(flags); VM_WARN_ON(!current->luf_takeoff_started); @@ -700,6 +713,8 @@ void luf_takeoff_end(void) goto out; =20 cur_luf_ugen =3D current->luf_ugen; + cur_zone_ugen =3D current->zone_ugen; + cur_wait_zone_ugen =3D current->wait_zone_ugen; =20 current->luf_ugen =3D 0; =20 @@ -707,10 +722,38 @@ void luf_takeoff_end(void) reset_batch(tlb_ubc_takeoff); =20 try_to_unmap_flush_takeoff(); + + if (cur_wait_zone_ugen || cur_zone_ugen) { + /* + * pcp(zone =3D=3D NULL) doesn't work with zone batch. + */ + if (zone) { + current->zone_ugen =3D 0; + current->wait_zone_ugen =3D 0; + + /* + * Guarantee that tlb shootdown required for the + * zone_ugen has been completed once observing + * 'zone_ugen_done'. + */ + smp_mb(); + + /* + * zone->zone_ugen_done should be updated + * sequentially. + */ + if (cur_wait_zone_ugen) + wait_zone_ugen_done(zone, cur_wait_zone_ugen); + if (cur_zone_ugen) + set_zone_ugen_done(zone, cur_zone_ugen); + } + } out: if (outmost) { VM_WARN_ON(current->luf_no_shootdown); VM_WARN_ON(current->luf_ugen); + VM_WARN_ON(current->zone_ugen); + VM_WARN_ON(current->wait_zone_ugen); } } =20 @@ -741,9 +784,9 @@ bool luf_takeoff_no_shootdown(void) * Should be called with either zone lock held and irq disabled or pcp * lock held. */ -bool luf_takeoff_check(struct page *page) +bool luf_takeoff_check(struct zone *zone, struct page *page) { - unsigned short luf_key =3D page_luf_key(page); + unsigned long zone_ugen; =20 /* * No way. Delimit using luf_takeoff_{start,end}(). @@ -753,7 +796,29 @@ bool luf_takeoff_check(struct page *page) return false; } =20 - if (!luf_key) + if (!zone) { + unsigned short luf_key =3D page_luf_key(page); + + if (!luf_key) + return true; + + if (current->luf_no_shootdown) + return false; + + return true; + } + + zone_ugen =3D page_zone_ugen(zone, page); + if (!zone_ugen) + return true; + + /* + * Should not be zero since zone-zone_ugen has been updated in + * __free_one_page() -> update_zone_batch(). + */ + VM_WARN_ON(!zone->zone_ugen); + + if (!ugen_before(READ_ONCE(zone->zone_ugen_done), zone_ugen)) return true; =20 return !current->luf_no_shootdown; @@ -763,13 +828,11 @@ bool luf_takeoff_check(struct page *page) * Should be called with either zone lock held and irq disabled or pcp * lock held. */ -bool luf_takeoff_check_and_fold(struct page *page) +bool luf_takeoff_check_and_fold(struct zone *zone, struct page *page) { struct tlbflush_unmap_batch *tlb_ubc_takeoff =3D ¤t->tlb_ubc_takeof= f; - unsigned short luf_key =3D page_luf_key(page); - struct luf_batch *lb; - unsigned long lb_ugen; unsigned long flags; + unsigned long zone_ugen; =20 /* * No way. Delimit using luf_takeoff_{start,end}(). 
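As an illustration (not part of the patch), the takeoff API above is meant to bracket a window in which pages are taken off the buddy lists. The sketch below is condensed from how rmqueue_bulk() uses it later in this patch; it is schematic rather than a standalone program:

	/* Schematic only -- condensed from rmqueue_bulk() in this patch. */
	luf_takeoff_start();		/* decide whether shootdown is allowed */
	spin_lock_irqsave(&zone->lock, flags);

	page = __rmqueue(zone, order, migratetype, alloc_flags);
	/*
	 * Taking a page off the free lists goes through
	 * luf_takeoff_check_and_fold(zone, page), which either folds
	 * zone->zone_batch into current->tlb_ubc_takeoff or records a
	 * zone_ugen that luf_takeoff_end() will wait on.
	 */

	spin_unlock_irqrestore(&zone->lock, flags);
	luf_takeoff_end(zone);		/* flush, then publish zone_ugen_done */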
@@ -779,28 +842,94 @@ bool luf_takeoff_check_and_fold(struct page *page) return false; } =20 - if (!luf_key) - return true; + /* + * pcp case + */ + if (!zone) { + unsigned short luf_key =3D page_luf_key(page); + struct luf_batch *lb; + unsigned long lb_ugen; =20 - lb =3D &luf_batch[luf_key]; - read_lock_irqsave(&lb->lock, flags); - lb_ugen =3D lb->ugen; + if (!luf_key) + return true; + + lb =3D &luf_batch[luf_key]; + read_lock_irqsave(&lb->lock, flags); + lb_ugen =3D lb->ugen; + + if (arch_tlbbatch_check_done(&lb->batch.arch, lb_ugen)) { + read_unlock_irqrestore(&lb->lock, flags); + return true; + } + + if (current->luf_no_shootdown) { + read_unlock_irqrestore(&lb->lock, flags); + return false; + } =20 - if (arch_tlbbatch_check_done(&lb->batch.arch, lb_ugen)) { + fold_batch(tlb_ubc_takeoff, &lb->batch, false); read_unlock_irqrestore(&lb->lock, flags); + + if (!current->luf_ugen || ugen_before(current->luf_ugen, lb_ugen)) + current->luf_ugen =3D lb_ugen; return true; } =20 - if (current->luf_no_shootdown) { - read_unlock_irqrestore(&lb->lock, flags); + zone_ugen =3D page_zone_ugen(zone, page); + if (!zone_ugen) + return true; + + /* + * Should not be zero since zone-zone_ugen has been updated in + * __free_one_page() -> update_zone_batch(). + */ + VM_WARN_ON(!zone->zone_ugen); + + if (!ugen_before(READ_ONCE(zone->zone_ugen_done), zone_ugen)) + return true; + + if (current->luf_no_shootdown) return false; - } =20 - fold_batch(tlb_ubc_takeoff, &lb->batch, false); - read_unlock_irqrestore(&lb->lock, flags); + /* + * zone batched flush has been already set. + */ + if (current->zone_ugen) + return true; + + /* + * Others are already performing tlb shootdown for us. All we + * need is to wait for those to complete. + */ + if (zone_ugen !=3D zone->zone_ugen) { + if (!current->wait_zone_ugen || + ugen_before(current->wait_zone_ugen, zone_ugen)) + current->wait_zone_ugen =3D zone_ugen; + /* + * It's the first time that zone->zone_ugen has been set to + * current->zone_ugen. current->luf_ugen also get set. + */ + } else { + current->wait_zone_ugen =3D prev_ugen(zone->zone_ugen); + current->zone_ugen =3D zone->zone_ugen; + current->luf_ugen =3D zone->luf_ugen; + + /* + * Now that tlb shootdown for the zone_ugen will be + * performed at luf_takeoff_end(), advance it so that + * the next zone->lock holder can efficiently avoid + * unnecessary tlb shootdown. + */ + zone->zone_ugen =3D next_ugen(zone->zone_ugen); =20 - if (!current->luf_ugen || ugen_before(current->luf_ugen, lb_ugen)) - current->luf_ugen =3D lb_ugen; + /* + * All the luf pages will eventually become non-luf + * pages by tlb flushing at luf_takeoff_end() and, + * flush_pend_list_if_done() will empty pend_list. + */ + atomic_long_set(&zone->nr_luf_pages, 0); + fold_batch(tlb_ubc_takeoff, &zone->zone_batch, true); + } return true; } #endif @@ -822,6 +951,42 @@ static inline void account_freepages(struct zone *zone= , int nr_pages, zone->nr_free_highatomic + nr_pages); } =20 +static void flush_pend_list_if_done(struct zone *zone, + struct free_area *area, int migratetype) +{ + unsigned long zone_ugen_done =3D READ_ONCE(zone->zone_ugen_done); + + /* + * tlb shootdown required for the zone_ugen already has been + * done. Thus, let's move pages in pend_list to free_list to + * secure more non-luf pages. 
+ */ + if (!ugen_before(zone_ugen_done, area->pend_zone_ugen[migratetype])) + list_splice_init(&area->pend_list[migratetype], + &area->free_list[migratetype]); +} + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +/* + * Should be called with zone->lock held and irq disabled. + */ +static void update_zone_batch(struct zone *zone, unsigned short luf_key) +{ + unsigned long lb_ugen; + struct luf_batch *lb =3D &luf_batch[luf_key]; + + read_lock(&lb->lock); + fold_batch(&zone->zone_batch, &lb->batch, false); + lb_ugen =3D lb->ugen; + read_unlock(&lb->lock); + + if (ugen_before(zone->luf_ugen, lb_ugen)) + zone->luf_ugen =3D lb_ugen; +} +#else +static void update_zone_batch(struct zone *zone, unsigned short luf_key) {} +#endif + /* Used for pages not on another list */ static inline void __add_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype, @@ -830,6 +995,12 @@ static inline void __add_to_free_list(struct page *pag= e, struct zone *zone, struct free_area *area =3D &zone->free_area[order]; struct list_head *list; =20 + /* + * Good chance to flush pend_list just before updating the + * {free,pend}_list. + */ + flush_pend_list_if_done(zone, area, migratetype); + VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype, "page type is %lu, passed migratetype is %d (nr=3D%d)\n", get_pageblock_migratetype(page), migratetype, 1 << order); @@ -839,8 +1010,9 @@ static inline void __add_to_free_list(struct page *pag= e, struct zone *zone, * positive is okay because it will cause just additional tlb * shootdown. */ - if (page_luf_key(page)) { + if (page_zone_ugen(zone, page)) { list =3D &area->pend_list[migratetype]; + area->pend_zone_ugen[migratetype] =3D zone->zone_ugen; atomic_long_add(1 << order, &zone->nr_luf_pages); } else list =3D &area->free_list[migratetype]; @@ -862,6 +1034,7 @@ static inline void move_to_free_list(struct page *page= , struct zone *zone, unsigned int order, int old_mt, int new_mt) { struct free_area *area =3D &zone->free_area[order]; + unsigned long zone_ugen =3D page_zone_ugen(zone, page); =20 /* Free page moving can fail, so it happens before the type update */ VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D old_mt, @@ -878,9 +1051,12 @@ static inline void move_to_free_list(struct page *pag= e, struct zone *zone, * positive is okay because it will cause just additional tlb * shootdown. */ - if (page_luf_key(page)) + if (zone_ugen) { list_move_tail(&page->buddy_list, &area->pend_list[new_mt]); - else + if (!area->pend_zone_ugen[new_mt] || + ugen_before(area->pend_zone_ugen[new_mt], zone_ugen)) + area->pend_zone_ugen[new_mt] =3D zone_ugen; + } else list_move_tail(&page->buddy_list, &area->free_list[new_mt]); =20 account_freepages(zone, -(1 << order), old_mt); @@ -898,7 +1074,7 @@ static inline void __del_page_from_free_list(struct pa= ge *page, struct zone *zon if (page_reported(page)) __ClearPageReported(page); =20 - if (page_luf_key(page)) + if (page_zone_ugen(zone, page)) atomic_long_sub(1 << order, &zone->nr_luf_pages); =20 list_del(&page->buddy_list); @@ -936,29 +1112,39 @@ static inline struct page *get_page_from_free_area(s= truct zone *zone, */ pend_first =3D !non_luf_pages_ok(zone); =20 + /* + * Good chance to flush pend_list just before updating the + * {free,pend}_list. 
+ */ + flush_pend_list_if_done(zone, area, migratetype); + if (pend_first) { page =3D list_first_entry_or_null(&area->pend_list[migratetype], struct page, buddy_list); =20 - if (page && luf_takeoff_check(page)) + if (page && luf_takeoff_check(zone, page)) return page; =20 page =3D list_first_entry_or_null(&area->free_list[migratetype], struct page, buddy_list); =20 - if (page) + if (page) { + set_page_zone_ugen(page, 0); return page; + } } else { page =3D list_first_entry_or_null(&area->free_list[migratetype], struct page, buddy_list); =20 - if (page) + if (page) { + set_page_zone_ugen(page, 0); return page; + } =20 page =3D list_first_entry_or_null(&area->pend_list[migratetype], struct page, buddy_list); =20 - if (page && luf_takeoff_check(page)) + if (page && luf_takeoff_check(zone, page)) return page; } return NULL; @@ -1023,6 +1209,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_pfn; struct page *buddy; bool to_tail; + unsigned long zone_ugen; =20 VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -1034,20 +1221,25 @@ static inline void __free_one_page(struct page *pag= e, account_freepages(zone, 1 << order, migratetype); =20 /* - * Use the page's luf_key unchanged if luf_key =3D=3D 0. Worth - * noting that page_luf_key() will be 0 in most cases since it's - * initialized at free_pages_prepare(). + * Use the page's zone_ugen unchanged if luf_key =3D=3D 0. Worth + * noting that page_zone_ugen() will be 0 in most cases since + * it's initialized at free_pages_prepare(). + * + * Update page's zone_ugen and zone's batch only if a valid + * luf_key was passed. */ - if (luf_key) - set_page_luf_key(page, luf_key); - else - luf_key =3D page_luf_key(page); + if (luf_key) { + zone_ugen =3D zone->zone_ugen; + set_page_zone_ugen(page, (unsigned short)zone_ugen); + update_zone_batch(zone, luf_key); + } else + zone_ugen =3D page_zone_ugen(zone, page); =20 while (order < MAX_PAGE_ORDER) { int buddy_mt =3D migratetype; - unsigned short buddy_luf_key; + unsigned long buddy_zone_ugen; =20 - if (!luf_key && compaction_capture(capc, page, order, migratetype)) { + if (!zone_ugen && compaction_capture(capc, page, order, migratetype)) { account_freepages(zone, -(1 << order), migratetype); return; } @@ -1080,17 +1272,15 @@ static inline void __free_one_page(struct page *pag= e, else __del_page_from_free_list(buddy, zone, order, buddy_mt); =20 + buddy_zone_ugen =3D page_zone_ugen(zone, buddy); + /* - * !buddy_luf_key && !luf_key : do nothing - * buddy_luf_key && !luf_key : luf_key =3D buddy_luf_key - * !buddy_luf_key && luf_key : do nothing - * buddy_luf_key && luf_key : merge two into luf_key + * if (!zone_ugen && !buddy_zone_ugen) : nothing to do + * if ( zone_ugen && !buddy_zone_ugen) : nothing to do */ - buddy_luf_key =3D page_luf_key(buddy); - if (buddy_luf_key && !luf_key) - luf_key =3D buddy_luf_key; - else if (buddy_luf_key && luf_key) - fold_luf_batch(&luf_batch[luf_key], &luf_batch[buddy_luf_key]); + if ((!zone_ugen && buddy_zone_ugen) || + ( zone_ugen && buddy_zone_ugen && ugen_before(zone_ugen, buddy_zone_= ugen))) + zone_ugen =3D buddy_zone_ugen; =20 if (unlikely(buddy_mt !=3D migratetype)) { /* @@ -1103,7 +1293,7 @@ static inline void __free_one_page(struct page *page, =20 combined_pfn =3D buddy_pfn & pfn; page =3D page + (combined_pfn - pfn); - set_page_luf_key(page, luf_key); + set_page_zone_ugen(page, zone_ugen); pfn =3D combined_pfn; order++; } @@ -1446,6 +1636,7 @@ static void free_pcppages_bulk(struct 
zone *zone, int= count, do { unsigned long pfn; int mt; + unsigned short luf_key; =20 page =3D list_last_entry(list, struct page, pcp_list); pfn =3D page_to_pfn(page); @@ -1456,7 +1647,16 @@ static void free_pcppages_bulk(struct zone *zone, in= t count, count -=3D nr_pages; pcp->count -=3D nr_pages; =20 - __free_one_page(page, pfn, zone, order, mt, FPI_NONE, 0); + /* + * page private in pcp stores luf_key while it + * stores zone_ugen in buddy. Thus, the private + * needs to be cleared and the luf_key needs to + * be passed to buddy. + */ + luf_key =3D page_luf_key(page); + set_page_private(page, 0); + + __free_one_page(page, pfn, zone, order, mt, FPI_NONE, luf_key); =20 trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); @@ -1499,7 +1699,15 @@ static void free_one_page(struct zone *zone, struct = page *page, * valid luf_key can be passed only if order =3D=3D 0. */ VM_WARN_ON(luf_key && order); - set_page_luf_key(page, luf_key); + + /* + * Update page's zone_ugen and zone's batch only if a valid + * luf_key was passed. + */ + if (luf_key) { + set_page_zone_ugen(page, (unsigned short)zone->zone_ugen); + update_zone_batch(zone, luf_key); + } =20 split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); @@ -1659,7 +1867,7 @@ static inline unsigned int expand(struct zone *zone, = struct page *page, int low, if (set_page_guard(zone, &page[size], high)) continue; =20 - if (page_luf_key(&page[size])) + if (page_zone_ugen(zone, &page[size])) tail =3D true; =20 __add_to_free_list(&page[size], zone, high, migratetype, tail); @@ -1677,7 +1885,7 @@ static __always_inline void page_del_and_expand(struc= t zone *zone, int nr_pages =3D 1 << high; =20 __del_page_from_free_list(page, zone, high, migratetype); - if (unlikely(!luf_takeoff_check_and_fold(page))) + if (unlikely(!luf_takeoff_check_and_fold(zone, page))) VM_WARN_ON(1); nr_pages -=3D expand(zone, page, low, high, migratetype); account_freepages(zone, -nr_pages, migratetype); @@ -2199,7 +2407,7 @@ steal_suitable_fallback(struct zone *zone, struct pag= e *page, unsigned int nr_added; =20 del_page_from_free_list(page, zone, current_order, block_type); - if (unlikely(!luf_takeoff_check_and_fold(page))) + if (unlikely(!luf_takeoff_check_and_fold(zone, page))) VM_WARN_ON(1); change_pageblock_range(page, current_order, start_type); nr_added =3D expand(zone, page, order, current_order, start_type); @@ -2438,12 +2646,12 @@ static bool unreserve_highatomic_pageblock(const st= ruct alloc_context *ac, WARN_ON_ONCE(ret =3D=3D -1); if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); - luf_takeoff_end(); + luf_takeoff_end(zone); return ret; } } spin_unlock_irqrestore(&zone->lock, flags); - luf_takeoff_end(); + luf_takeoff_end(zone); } =20 return false; @@ -2644,12 +2852,15 @@ static int rmqueue_bulk(struct zone *zone, unsigned= int order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); + + /* + * Reset all the luf fields. tlb shootdown will be + * performed at luf_takeoff_end() below if needed. + */ + set_page_private(page, 0); } spin_unlock_irqrestore(&zone->lock, flags); - /* - * Check and flush before using the pages taken off. 
- */ - luf_takeoff_end(); + luf_takeoff_end(zone); =20 return i; } @@ -3163,7 +3374,7 @@ int __isolate_free_page(struct page *page, unsigned i= nt order, bool willputback) } =20 del_page_from_free_list(page, zone, order, mt); - if (unlikely(!willputback && !luf_takeoff_check_and_fold(page))) + if (unlikely(!willputback && !luf_takeoff_check_and_fold(zone, page))) VM_WARN_ON(1); =20 /* @@ -3262,7 +3473,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zon= e, struct zone *zone, =20 if (!page) { spin_unlock_irqrestore(&zone->lock, flags); - luf_takeoff_end(); + luf_takeoff_end(zone); return NULL; } } @@ -3270,7 +3481,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zon= e, struct zone *zone, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(zone); } while (check_new_pages(page, order)); =20 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3360,7 +3571,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, uns= igned int order, } =20 list_for_each_entry(page, list, pcp_list) { - if (luf_takeoff_check_and_fold(page)) { + if (luf_takeoff_check_and_fold(NULL, page)) { list_del(&page->pcp_list); pcp->count -=3D 1 << order; break; @@ -3395,7 +3606,7 @@ static struct page *rmqueue_pcplist(struct zone *pref= erred_zone, pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { pcp_trylock_finish(UP_flags); - luf_takeoff_end(); + luf_takeoff_end(NULL); return NULL; } =20 @@ -3412,7 +3623,7 @@ static struct page *rmqueue_pcplist(struct zone *pref= erred_zone, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(NULL); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -3451,6 +3662,7 @@ struct page *rmqueue(struct zone *preferred_zone, migratetype); =20 out: + /* Separate test+clear to avoid unnecessary atomics */ if ((alloc_flags & ALLOC_KSWAPD) && unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { @@ -5059,7 +5271,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int = preferred_nid, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(NULL); =20 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); @@ -5069,7 +5281,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int = preferred_nid, =20 failed_irq: pcp_trylock_finish(UP_flags); - luf_takeoff_end(); + luf_takeoff_end(NULL); =20 failed: page =3D __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); @@ -7235,7 +7447,7 @@ unsigned long __offline_isolated_pages(unsigned long = start_pfn, VM_WARN_ON(get_pageblock_migratetype(page) !=3D MIGRATE_ISOLATE); order =3D buddy_order(page); del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); - if (unlikely(!luf_takeoff_check_and_fold(page))) + if (unlikely(!luf_takeoff_check_and_fold(zone, page))) VM_WARN_ON(1); pfn +=3D (1 << order); } @@ -7243,7 +7455,7 @@ unsigned long __offline_isolated_pages(unsigned long = start_pfn, /* * Check and flush before using the pages taken off. 
*/ - luf_takeoff_end(); + luf_takeoff_end(zone); =20 return end_pfn - start_pfn - already_offline; } @@ -7305,7 +7517,7 @@ static void break_down_buddy_pages(struct zone *zone,= struct page *page, if (set_page_guard(zone, current_buddy, high)) continue; =20 - if (page_luf_key(current_buddy)) + if (page_zone_ugen(zone, current_buddy)) tail =3D true; =20 add_to_free_list(current_buddy, zone, high, migratetype, tail); @@ -7337,7 +7549,7 @@ bool take_page_off_buddy(struct page *page) =20 del_page_from_free_list(page_head, zone, page_order, migratetype); - if (unlikely(!luf_takeoff_check_and_fold(page_head))) + if (unlikely(!luf_takeoff_check_and_fold(zone, page_head))) VM_WARN_ON(1); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); @@ -7353,7 +7565,7 @@ bool take_page_off_buddy(struct page *page) /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(zone); return ret; } =20 @@ -7372,6 +7584,13 @@ bool put_page_back_buddy(struct page *page) int migratetype =3D get_pfnblock_migratetype(page, pfn); =20 ClearPageHWPoisonTakenOff(page); + + /* + * Reset all the luf fields. tlb shootdown has already + * been performed by take_page_off_buddy(). + */ + set_page_private(page, 0); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE, 0); if (TestClearPageHWPoison(page)) { ret =3D true; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index e152b22fbba8a..b23d3ed34ec07 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -118,7 +118,8 @@ page_reporting_drain(struct page_reporting_dev_info *pr= dev, =20 /* * Ensure private is zero before putting into the - * allocator. + * allocator. tlb shootdown has already been performed + * at isolation. */ set_page_private(page, 0); =20 @@ -194,7 +195,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, if (PageReported(page)) continue; =20 - if (unlikely(consider_pend && !luf_takeoff_check(page))) { + if (unlikely(consider_pend && !luf_takeoff_check(zone, page))) { VM_WARN_ON(1); continue; } @@ -238,7 +239,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(zone); =20 /* begin processing pages in local list */ err =3D prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); @@ -283,7 +284,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, /* * Check and flush before using the pages taken off. */ - luf_takeoff_end(); + luf_takeoff_end(zone); =20 return err; } diff --git a/mm/rmap.c b/mm/rmap.c index 0cb13e8fcd739..ebe91ff1bcb16 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -650,7 +650,11 @@ static unsigned long new_luf_ugen(void) { unsigned long ugen =3D atomic_long_inc_return(&luf_ugen); =20 - if (!ugen) + /* + * Avoid zero even in unsigned short range so as to treat + * '(unsigned short)ugen =3D=3D 0' as invalid. 
+ */ + if (!(unsigned short)ugen) ugen =3D atomic_long_inc_return(&luf_ugen); =20 return ugen; --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 09CD31EA7C9 for ; Thu, 20 Feb 2025 05:36:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029772; cv=none; b=Ixv5R4C9sONh8wwz6vM3j/0QmNN8baL17Lq2Kv/wVieiaoMNAyPHmPJ/fWEn3kD6Fs3LZE5CHzi+MkgmCkMgtZQI2BLJgL73k6GuqerVfNnhlforGkw8axQNhYVquySNZBhykc4xkP2RT3ndzOz5HN2siIQa0C9R+LH90rwizD0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029772; c=relaxed/simple; bh=kJUOmAXIN+Be88pGq5nqP4G6HpOEuI3dn+5Ika8a0VA=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=OwAiVf12CzMpGt+TWqFSJjGKK/8uK+hrgE9uzA9ZvK8tcGe9AOcC83eUI64UGUHUfzmGqf8czG6G/u0vDdZHsb9UsrT7KCBjnf7F3LrfO+3ZIQvFp1G/f5n0D3WVyYrWGFXwfev0UeF4zSuHLoV7Fs0LA5DBm8+Qdbx7NK5psRU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-1c-67b6bba7c1f0 From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 22/26] mm/page_alloc: not allow to tlb shootdown if !preemptable() && non_luf_pages_ok() Date: Thu, 20 Feb 2025 14:20:23 +0900 Message-Id: <20250220052027.58847-23-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrCLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDTafkbKYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ /z8LFXyxqnjyaxJrA+M7gy5GDg4JAROJCQ/9uhg5wcwZvfeYQWw2AXWJGzd+gtkiAmYSB1v/ sIPYzAJ3mSQO9LOB2MICBRLbfpwFi7MIqEosODeZBcTmBao/umcyO8RMeYnVGw6AzeEEiv+Y 0QvWKyRgKvFuwSWmLkYuoJr3bBKXmjczQjRIShxccYNlAiPvAkaGVYxCmXlluYmZOSZ6GZV5 mRV6yfm5mxiBYb+s9k/0DsZPF4IPMQpwMCrx8M5o3ZYuxJpYVlyZe4hRgoNZSYS3rX5LuhBv SmJlVWpRfnxRaU5q8SFGaQ4WJXFeo2/lKUIC6YklqdmpqQWpRTBZJg5OqQZGqb/ib325lm7U UV2V5PjJkflWyEXbJsZVFQpHXhycpc0a+Ez9Za3bt5odtx9MjLlzaOLt92pim7WZll2Ynb/n 3ku9U171OvN72DZe7z0zWdR9wp13LoYLvTp3f5Pe5JAcflVFz3phb424T6oqo7X2gcK3bk+O vQ5bpfx/tkRe0jEzKa/dR5aoKrEUZyQaajEXFScCAB6fEFp3AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrDLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0g62/BSzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlfH/s1DBF6uKJ78msTYwvjPoYuTkkBAwkZjRe48ZxGYTUJe4ceMnmC0iYCZx 
sPUPO4jNLHCXSeJAPxuILSxQILHtx1mwOIuAqsSCc5NZQGxeoPqjeyazQ8yUl1i94QDYHE6g +I8ZvWC9QgKmEu8WXGKawMi1gJFhFaNIZl5ZbmJmjqlecXZGZV5mhV5yfu4mRmAYL6v9M3EH 45fL7ocYBTgYlXh4Hzzemi7EmlhWXJl7iFGCg1lJhLetfku6EG9KYmVValF+fFFpTmrxIUZp DhYlcV6v8NQEIYH0xJLU7NTUgtQimCwTB6dUA+OKor7yNV8CgneJvYxO6zj1tKOtUPXc/jam j+aXcvfd6DU3lN+3e6l/9NHNlZvmJNb5vzhx6ejWwB3Pt8ZKLfJgWybJU14fNH3tmbVV4u8U Mja+5f4TKl01qfT16oUn9vQ9Ulq/4EzE9YeL576ocf+UXO2/JrAy97ZTx4rvh8IULnfrfJu+ 76qREktxRqKhFnNRcSIAHburZl8CAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Do not perform tlb shootdown if the context is in preempt disable and there are already enough non luf pages, not to hurt preemptibility. Signed-off-by: Byungchul Park --- mm/compaction.c | 6 +++--- mm/internal.h | 5 +++-- mm/page_alloc.c | 27 +++++++++++++++------------ mm/page_isolation.c | 2 +- mm/page_reporting.c | 4 ++-- 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a7f17867decae..8fa9de6db2441 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -605,7 +605,7 @@ static unsigned long isolate_freepages_block(struct com= pact_control *cc, =20 page =3D pfn_to_page(blockpfn); =20 - luf_takeoff_start(); + luf_takeoff_start(cc->zone); /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn +=3D stride, page +=3D stride) { int isolated; @@ -1601,7 +1601,7 @@ static void fast_isolate_freepages(struct compact_con= trol *cc) if (!area->nr_free) continue; =20 - can_shootdown =3D luf_takeoff_start(); + can_shootdown =3D luf_takeoff_start(cc->zone); spin_lock_irqsave(&cc->zone->lock, flags); freelist =3D &area->free_list[MIGRATE_MOVABLE]; retry: @@ -2413,7 +2413,7 @@ static enum compact_result compact_finished(struct co= mpact_control *cc) * luf_takeoff_{start,end}() is required to identify whether * this compaction context is tlb shootdownable for luf'd pages. 
*/ - luf_takeoff_start(); + luf_takeoff_start(cc->zone); ret =3D __compact_finished(cc); luf_takeoff_end(cc->zone); =20 diff --git a/mm/internal.h b/mm/internal.h index e634eaf220f00..fba19c283ac48 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1594,7 +1594,7 @@ static inline void accept_page(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ #if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) extern struct luf_batch luf_batch[]; -bool luf_takeoff_start(void); +bool luf_takeoff_start(struct zone *zone); void luf_takeoff_end(struct zone *zone); bool luf_takeoff_no_shootdown(void); bool luf_takeoff_check(struct zone *zone, struct page *page); @@ -1608,6 +1608,7 @@ static inline bool non_luf_pages_ok(struct zone *zone) =20 return nr_free - nr_luf_pages > min_wm; } + unsigned short fold_unmap_luf(void); =20 /* @@ -1694,7 +1695,7 @@ static inline bool can_luf_vma(struct vm_area_struct = *vma) return true; } #else /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ -static inline bool luf_takeoff_start(void) { return false; } +static inline bool luf_takeoff_start(struct zone *zone) { return false; } static inline void luf_takeoff_end(struct zone *zone) {} static inline bool luf_takeoff_no_shootdown(void) { return true; } static inline bool luf_takeoff_check(struct zone *zone, struct page *page)= { return true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b81931c6f2cfd..ccbe49b78190a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -623,22 +623,25 @@ compaction_capture(struct capture_control *capc, stru= ct page *page, #endif /* CONFIG_COMPACTION */ =20 #if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) -static bool no_shootdown_context(void) +static bool no_shootdown_context(struct zone *zone) { /* - * If it performs with irq disabled, that might cause a deadlock. - * Avoid tlb shootdown in this case. + * Tries to avoid tlb shootdown if !preemptible(). However, it + * should be allowed under heavy memory pressure. */ + if (zone && non_luf_pages_ok(zone)) + return !(preemptible() && in_task()); + return !(!irqs_disabled() && in_task()); } =20 /* * Can be called with zone lock released and irq enabled. */ -bool luf_takeoff_start(void) +bool luf_takeoff_start(struct zone *zone) { unsigned long flags; - bool no_shootdown =3D no_shootdown_context(); + bool no_shootdown =3D no_shootdown_context(zone); =20 local_irq_save(flags); =20 @@ -2588,7 +2591,7 @@ static bool unreserve_highatomic_pageblock(const stru= ct alloc_context *ac, * luf_takeoff_{start,end}() is required for * get_page_from_free_area() to use luf_takeoff_check(). 
*/ - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); for (order =3D 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area =3D &(zone->free_area[order]); @@ -2829,7 +2832,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned i= nt order, unsigned long flags; int i; =20 - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); for (i =3D 0; i < count; ++i) { struct page *page =3D __rmqueue(zone, order, migratetype, @@ -3455,7 +3458,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zon= e, struct zone *zone, =20 do { page =3D NULL; - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); if (alloc_flags & ALLOC_HIGHATOMIC) page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); @@ -3600,7 +3603,7 @@ static struct page *rmqueue_pcplist(struct zone *pref= erred_zone, struct page *page; unsigned long __maybe_unused UP_flags; =20 - luf_takeoff_start(); + luf_takeoff_start(NULL); /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); @@ -5229,7 +5232,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int = preferred_nid, if (unlikely(!zone)) goto failed; =20 - luf_takeoff_start(); + luf_takeoff_start(NULL); /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); pcp =3D pcp_spin_trylock(zone->per_cpu_pageset); @@ -7418,7 +7421,7 @@ unsigned long __offline_isolated_pages(unsigned long = start_pfn, =20 offline_mem_sections(pfn, end_pfn); zone =3D page_zone(pfn_to_page(pfn)); - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); while (pfn < end_pfn) { page =3D pfn_to_page(pfn); @@ -7536,7 +7539,7 @@ bool take_page_off_buddy(struct page *page) unsigned int order; bool ret =3D false; =20 - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); for (order =3D 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head =3D page - (pfn & ((1 << order) - 1)); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index eae33d188762b..ccd36838f9cff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -211,7 +211,7 @@ static void unset_migratetype_isolate(struct page *page= , int migratetype) struct page *buddy; =20 zone =3D page_zone(page); - luf_takeoff_start(); + luf_takeoff_start(zone); spin_lock_irqsave(&zone->lock, flags); if (!is_migrate_isolate_page(page)) goto out; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index b23d3ed34ec07..83b66e7f0d257 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -170,7 +170,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, if (free_area_empty(area, mt)) return err; =20 - can_shootdown =3D luf_takeoff_start(); + can_shootdown =3D luf_takeoff_start(zone); spin_lock_irq(&zone->lock); =20 /* @@ -250,7 +250,7 @@ page_reporting_cycle(struct page_reporting_dev_info *pr= dev, struct zone *zone, /* update budget to reflect call to report function */ budget--; =20 - luf_takeoff_start(); + luf_takeoff_start(zone); =20 /* reacquire zone lock and resume processing */ spin_lock_irq(&zone->lock); --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 2E60B1E3DC4 for ; Thu, 20 Feb 2025 05:36:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none 
From: Byungchul Park
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com
Subject: [RFC PATCH v12 23/26] mm: separate move/undo parts from migrate_pages_batch()
Date: Thu, 20 Feb 2025 14:20:24 +0900
Message-Id: <20250220052027.58847-24-byungchul@sk.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20250220052027.58847-1-byungchul@sk.com>
References: <20250220052027.58847-1-byungchul@sk.com>
X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Functionally, no change. This is a preparation for luf mechanism that requires to use separated folio lists for its own handling during migration. Refactored migrate_pages_batch() so as to separate move/undo parts from migrate_pages_batch(). Signed-off-by: Byungchul Park --- mm/migrate.c | 134 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index dfb5eba3c5223..5e12023dbc75a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1695,6 +1695,81 @@ static int migrate_hugetlbs(struct list_head *from, = new_folio_t get_new_folio, return nr_failed; } =20 +static void migrate_folios_move(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, + struct migrate_pages_stats *stats, + int *retry, int *thp_retry, int *nr_failed, + int *nr_retry_pages) +{ + struct folio *folio, *folio2, *dst, *dst2; + bool is_thp; + int nr_pages; + int rc; + + dst =3D list_first_entry(dst_folios, struct folio, lru); + dst2 =3D list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + is_thp =3D folio_test_large(folio) && folio_test_pmd_mappable(folio); + nr_pages =3D folio_nr_pages(folio); + + cond_resched(); + + rc =3D migrate_folio_move(put_new_folio, private, + folio, dst, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the unmap_folios list + * Other errno: put on ret_folios list + */ + switch (rc) { + case -EAGAIN: + *retry +=3D 1; + *thp_retry +=3D is_thp; + *nr_retry_pages +=3D nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded +=3D nr_pages; + stats->nr_thp_succeeded +=3D is_thp; + break; + default: + *nr_failed +=3D 1; + stats->nr_thp_failed +=3D is_thp; + stats->nr_failed_pages +=3D nr_pages; + break; + } + dst =3D dst2; + dst2 =3D list_next_entry(dst, lru); + } +} + +static void migrate_folios_undo(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + struct list_head *ret_folios) +{ + struct folio *folio, *folio2, *dst, *dst2; + + dst =3D list_first_entry(dst_folios, struct folio, lru); + dst2 =3D list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + int old_page_state =3D 0; + struct anon_vma *anon_vma =3D NULL; + + __migrate_folio_extract(dst, &old_page_state, &anon_vma); + migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret_folios); + list_del(&dst->lru); + migrate_folio_undo_dst(dst, true, put_new_folio, private); + dst =3D dst2; + dst2 =3D list_next_entry(dst, lru); + } +} + /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. 
@@ -1717,7 +1792,7 @@ static int migrate_pages_batch(struct list_head *from, int pass =3D 0; bool is_thp =3D false; bool is_large =3D false; - struct folio *folio, *folio2, *dst =3D NULL, *dst2; + struct folio *folio, *folio2, *dst =3D NULL; int rc, rc_saved =3D 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); @@ -1888,42 +1963,11 @@ static int migrate_pages_batch(struct list_head *fr= om, thp_retry =3D 0; nr_retry_pages =3D 0; =20 - dst =3D list_first_entry(&dst_folios, struct folio, lru); - dst2 =3D list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - is_thp =3D folio_test_large(folio) && folio_test_pmd_mappable(folio); - nr_pages =3D folio_nr_pages(folio); - - cond_resched(); - - rc =3D migrate_folio_move(put_new_folio, private, - folio, dst, mode, - reason, ret_folios); - /* - * The rules are: - * Success: folio will be freed - * -EAGAIN: stay on the unmap_folios list - * Other errno: put on ret_folios list - */ - switch(rc) { - case -EAGAIN: - retry++; - thp_retry +=3D is_thp; - nr_retry_pages +=3D nr_pages; - break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded +=3D nr_pages; - stats->nr_thp_succeeded +=3D is_thp; - break; - default: - nr_failed++; - stats->nr_thp_failed +=3D is_thp; - stats->nr_failed_pages +=3D nr_pages; - break; - } - dst =3D dst2; - dst2 =3D list_next_entry(dst, lru); - } + /* Move the unmapped folios */ + migrate_folios_move(&unmap_folios, &dst_folios, + put_new_folio, private, mode, reason, + ret_folios, stats, &retry, &thp_retry, + &nr_failed, &nr_retry_pages); } nr_failed +=3D retry; stats->nr_thp_failed +=3D thp_retry; @@ -1932,20 +1976,8 @@ static int migrate_pages_batch(struct list_head *fro= m, rc =3D rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ - dst =3D list_first_entry(&dst_folios, struct folio, lru); - dst2 =3D list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - int old_page_state =3D 0; - struct anon_vma *anon_vma =3D NULL; - - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, - anon_vma, true, ret_folios); - list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_folio, private); - dst =3D dst2; - dst2 =3D list_next_entry(dst, lru); - } + migrate_folios_undo(&unmap_folios, &dst_folios, + put_new_folio, private, ret_folios); =20 return rc; } --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id BE9851EB1A6 for ; Thu, 20 Feb 2025 05:36:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; cv=none; b=r0WvEUGPdHFY2eTd+KpiT7oWmbMhSIL3pNHUUAzQvmbu0s7VMgsOlF/bz/rA9s+kf8jN6ifEJfWC/3PcLH5P2H9v5HQ0l3lcng3o6P/v0cvuQqSxahXVU6Jrd6FXAcQmPWbi/6gYOwR02JbeAmjzAaParwvYh5/y9bd4k5T//aU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; c=relaxed/simple; bh=A6kHinvGbcpszadU980q9bBOHrb8jZD1A7j8NRJzceg=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=hq6JUTYsDKqB6DagrGU4FEUvwybXJOHBnE2KOPD9frrFDkQGPXQZbcAWWl9/KwtbuA24QQtloUUwLJAzPCp0jb2jSVdLpxPRaYiWt5IjvukFFTNk227LdxsUZoQvtlTNO5rsTd7OXOGadIYcK11F7Atgs3GXmFXfKsW51OC3HQs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none 
From: Byungchul Park
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com
Subject: [RFC PATCH v12 24/26] mm/migrate: apply luf mechanism to unmapping during migration
Date: Thu, 20 Feb 2025 14:20:25 +0900
Message-Id: <20250220052027.58847-25-byungchul@sk.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20250220052027.58847-1-byungchul@sk.com>
References: <20250220052027.58847-1-byungchul@sk.com>
Content-Transfer-Encoding: quoted-printable
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"

A new mechanism, LUF (Lazy Unmap Flush), defers the TLB flush for folios that have been unmapped and freed until they eventually get allocated again. This is safe for folios that had been mapped read-only and were then unmapped, since their contents do not change while they stay in pcp or buddy, so the data can still be read correctly through the stale TLB entries.

Apply the mechanism to unmapping during migration.
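The flow added here can be summarized by the following condensed sketch. It is illustrative pseudo-C rather than the exact code in the diff below (retry accounting, the dst folio bookkeeping and error paths are omitted), but every identifier it uses is taken from the patch:

	/* Unmap phase: try_to_migrate() now reports whether every mapping
	 * of the folio was read-only, i.e. whether its flush may be deferred. */
	can_luf = try_to_migrate(src, ttu_flags);
	if (can_luf)
		list_move_tail(&src->lru, &unmap_folios_luf);	/* flush deferred */
	else
		list_move_tail(&src->lru, &unmap_folios);	/* flush as before */

	/* Move phase: bundle the deferred entries under a key first, then
	 * flush only what cannot be deferred. */
	luf_key = fold_unmap_luf();
	try_to_unmap_flush();
	migrate_folios_move(&unmap_folios, ...);	/* called with luf_key == 0 */
	migrate_folios_move(&unmap_folios_luf, ...);	/* called with luf_key */

	/* On success, a folio migrated from the luf list is freed with its
	 * luf_key, so its stale TLB entries are shot down no later than the
	 * moment the page gets allocated again. */
	free_unref_page(&src->page, folio_order(src), luf_key);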
Signed-off-by: Byungchul Park --- include/linux/mm.h | 2 ++ include/linux/rmap.h | 2 +- mm/migrate.c | 65 ++++++++++++++++++++++++++++++++++---------- mm/rmap.c | 15 ++++++---- mm/swap.c | 2 +- 5 files changed, 63 insertions(+), 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 46638e86e8073..5c81c9831bc5d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1476,6 +1476,8 @@ static inline void folio_put(struct folio *folio) __folio_put(folio); } =20 +void page_cache_release(struct folio *folio); + /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a04088f3f2..cedba4812ccc7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -660,7 +660,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct = folio *folio, int folio_referenced(struct folio *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); =20 -void try_to_migrate(struct folio *folio, enum ttu_flags flags); +bool try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); =20 int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, diff --git a/mm/migrate.c b/mm/migrate.c index 5e12023dbc75a..6b77efee4ebd7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1172,7 +1172,8 @@ static void migrate_folio_undo_dst(struct folio *dst,= bool locked, =20 /* Cleanup src folio upon migration success */ static void migrate_folio_done(struct folio *src, - enum migrate_reason reason) + enum migrate_reason reason, + unsigned short luf_key) { /* * Compaction can migrate also non-LRU pages which are @@ -1183,16 +1184,30 @@ static void migrate_folio_done(struct folio *src, mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); =20 - if (reason !=3D MR_MEMORY_FAILURE) - /* We release the page in page_handle_poison. */ + /* We release the page in page_handle_poison. */ + if (reason =3D=3D MR_MEMORY_FAILURE) + luf_flush(luf_key); + else if (!luf_key) folio_put(src); + else { + /* + * Should be the last reference. + */ + if (unlikely(!folio_put_testzero(src))) + VM_WARN_ON(1); + + page_cache_release(src); + mem_cgroup_uncharge(src); + free_unref_page(&src->page, folio_order(src), luf_key); + } } =20 /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + enum migrate_reason reason, struct list_head *ret, + bool *can_luf) { struct folio *dst; int rc =3D -EAGAIN; @@ -1208,7 +1223,7 @@ static int migrate_folio_unmap(new_folio_t get_new_fo= lio, folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. */ list_del(&src->lru); - migrate_folio_done(src, reason); + migrate_folio_done(src, reason, 0); return MIGRATEPAGE_SUCCESS; } =20 @@ -1325,7 +1340,7 @@ static int migrate_folio_unmap(new_folio_t get_new_fo= lio, /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); - try_to_migrate(src, mode =3D=3D MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); + *can_luf =3D try_to_migrate(src, mode =3D=3D MIGRATE_ASYNC ? 
TTU_BATCH_F= LUSH : 0); old_page_state |=3D PAGE_WAS_MAPPED; } =20 @@ -1353,7 +1368,7 @@ static int migrate_folio_unmap(new_folio_t get_new_fo= lio, static int migrate_folio_move(free_folio_t put_new_folio, unsigned long pr= ivate, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) + struct list_head *ret, unsigned short luf_key) { int rc; int old_page_state =3D 0; @@ -1407,7 +1422,7 @@ static int migrate_folio_move(free_folio_t put_new_fo= lio, unsigned long private, if (anon_vma) put_anon_vma(anon_vma); folio_unlock(src); - migrate_folio_done(src, reason); + migrate_folio_done(src, reason, luf_key); =20 return rc; out: @@ -1702,7 +1717,7 @@ static void migrate_folios_move(struct list_head *src= _folios, struct list_head *ret_folios, struct migrate_pages_stats *stats, int *retry, int *thp_retry, int *nr_failed, - int *nr_retry_pages) + int *nr_retry_pages, unsigned short luf_key) { struct folio *folio, *folio2, *dst, *dst2; bool is_thp; @@ -1719,7 +1734,7 @@ static void migrate_folios_move(struct list_head *src= _folios, =20 rc =3D migrate_folio_move(put_new_folio, private, folio, dst, mode, - reason, ret_folios); + reason, ret_folios, luf_key); /* * The rules are: * Success: folio will be freed @@ -1796,7 +1811,11 @@ static int migrate_pages_batch(struct list_head *fro= m, int rc, rc_saved =3D 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); + LIST_HEAD(unmap_folios_luf); + LIST_HEAD(dst_folios_luf); bool nosplit =3D (reason =3D=3D MR_NUMA_MISPLACED); + unsigned short luf_key; + bool can_luf; =20 VM_WARN_ON_ONCE(mode !=3D MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); @@ -1871,9 +1890,11 @@ static int migrate_pages_batch(struct list_head *fro= m, continue; } =20 + can_luf =3D false; rc =3D migrate_folio_unmap(get_new_folio, put_new_folio, private, folio, &dst, mode, reason, - ret_folios); + ret_folios, &can_luf); + /* * The rules are: * Success: folio will be freed @@ -1919,7 +1940,8 @@ static int migrate_pages_batch(struct list_head *from, /* nr_failed isn't updated for not used */ stats->nr_thp_failed +=3D thp_retry; rc_saved =3D rc; - if (list_empty(&unmap_folios)) + if (list_empty(&unmap_folios) && + list_empty(&unmap_folios_luf)) goto out; else goto move; @@ -1933,8 +1955,13 @@ static int migrate_pages_batch(struct list_head *fro= m, stats->nr_thp_succeeded +=3D is_thp; break; case MIGRATEPAGE_UNMAP: - list_move_tail(&folio->lru, &unmap_folios); - list_add_tail(&dst->lru, &dst_folios); + if (can_luf) { + list_move_tail(&folio->lru, &unmap_folios_luf); + list_add_tail(&dst->lru, &dst_folios_luf); + } else { + list_move_tail(&folio->lru, &unmap_folios); + list_add_tail(&dst->lru, &dst_folios); + } break; default: /* @@ -1954,6 +1981,8 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed +=3D thp_retry; stats->nr_failed_pages +=3D nr_retry_pages; move: + /* Should be before try_to_unmap_flush() */ + luf_key =3D fold_unmap_luf(); /* Flush TLBs for all unmapped folios */ try_to_unmap_flush(); =20 @@ -1967,7 +1996,11 @@ static int migrate_pages_batch(struct list_head *fro= m, migrate_folios_move(&unmap_folios, &dst_folios, put_new_folio, private, mode, reason, ret_folios, stats, &retry, &thp_retry, - &nr_failed, &nr_retry_pages); + &nr_failed, &nr_retry_pages, 0); + migrate_folios_move(&unmap_folios_luf, &dst_folios_luf, + put_new_folio, private, mode, reason, + ret_folios, stats, &retry, &thp_retry, + &nr_failed, &nr_retry_pages, luf_key); } nr_failed +=3D retry; 
stats->nr_thp_failed +=3D thp_retry; @@ -1978,6 +2011,8 @@ static int migrate_pages_batch(struct list_head *from, /* Cleanup remaining folios */ migrate_folios_undo(&unmap_folios, &dst_folios, put_new_folio, private, ret_folios); + migrate_folios_undo(&unmap_folios_luf, &dst_folios_luf, + put_new_folio, private, ret_folios); =20 return rc; } diff --git a/mm/rmap.c b/mm/rmap.c index ebe91ff1bcb16..b6b61b8103655 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2750,8 +2750,9 @@ static bool try_to_migrate_one(struct folio *folio, s= truct vm_area_struct *vma, * * Tries to remove all the page table entries which are mapping this folio= and * replace them with special swap entries. Caller must hold the folio lock. + * Return true if all the mappings are read-only, otherwise false. */ -void try_to_migrate(struct folio *folio, enum ttu_flags flags) +bool try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc =3D { .rmap_one =3D try_to_migrate_one, @@ -2769,11 +2770,11 @@ void try_to_migrate(struct folio *folio, enum ttu_f= lags flags) */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) - return; + return false; =20 if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) - return; + return false; =20 /* * During exec, a temporary VMA is setup and later moved. @@ -2793,10 +2794,12 @@ void try_to_migrate(struct folio *folio, enum ttu_f= lags flags) else rmap_walk(folio, &rwc); =20 - if (can_luf_test()) + if (can_luf_test()) { fold_batch(tlb_ubc_luf, tlb_ubc_ro, true); - else - fold_batch(tlb_ubc, tlb_ubc_ro, true); + return true; + } + fold_batch(tlb_ubc, tlb_ubc_ro, true); + return false; } =20 #ifdef CONFIG_DEVICE_PRIVATE diff --git a/mm/swap.c b/mm/swap.c index 54b0ba10dbb86..d6c29fdc67ca5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -84,7 +84,7 @@ static void __page_cache_release(struct folio *folio, str= uct lruvec **lruvecp, * This path almost never happens for VM activity - pages are normally fre= ed * in batches. But it gets used by networking - and for compound pages. 
*/ -static void page_cache_release(struct folio *folio) +void page_cache_release(struct folio *folio) { struct lruvec *lruvec =3D NULL; unsigned long flags; --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.skhynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 442781EB9ED for ; Thu, 20 Feb 2025 05:36:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; cv=none; b=HkRkA+LS0iQShT1DYSXwF1BklH5yVlHzPzeHKzNtpRQ8WP0blyAz1P4t1M4DHDvZf+pFAghRGmJe37GV83jK8HnjgiSEKg+dFJyhvzPh73ImHUHAkzZovdiwOpgncopD4tmtUDP4cxVC2D7IqoRNrR1KpG3fCtm0gud/ZVjVOZ4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029773; c=relaxed/simple; bh=R/34JHDbuoMfx94XJV9RCFhRV7+t316aSkMsBuloTdk=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=gVZQKRGFay2LYKxHNFGeNyFAyUCkXPp9EvU+VUN2X6BJyXHnn7oI6VY8t8sC9Lf5I46l9wixCNv4rMxptTWMseKBTdmJ5plwlZVufpWmURt69bZiTw5wgI7f7q3VQGfekuW9ItbiNypUAKOs5D0TEGS7fDts2Si2qvjx4sHzv5I= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-2c-67b6bba7d1ea From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 25/26] mm/vmscan: apply luf mechanism to unmapping during folio reclaim Date: Thu, 20 Feb 2025 14:20:26 +0900 Message-Id: <20250220052027.58847-26-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrKLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDfYsVbGYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ u9doF5zVr7i1YCZzA+MLtS5GTg4JAROJaw8OscLYD4/+ZQGx2QTUJW7c+MkMYosImEkcbP3D DmIzC9xlkjjQzwZiCwvESxz5dgOshkVAVWLb+gVgNbxA9aeu34eaKS+xesMBsBpOoPiPGb1g vUICphLvFlxi6mLkAqp5zyaxZe9ydogGSYmDK26wTGDkXcDIsIpRKDOvLDcxM8dEL6MyL7NC Lzk/dxMjMPCX1f6J3sH46ULwIUYBDkYlHt4ZrdvShVgTy4orcw8xSnAwK4nwttVvSRfiTUms rEotyo8vKs1JLT7EKM3BoiTOa/StPEVIID2xJDU7NbUgtQgmy8TBKdXAyFX5TjnlXPL5P/Ut 60vypuxwao5Wehad11vRnWXHJdyptnizjfrGEvOZZ7hamJb0Wd1nmXTRWL+74uDx7tT9RwTn cIo6PFWVX2fEoqv9UHM7s0unQPIx3cWHVPUDDvxRq92nJ3IxQI/hydNE05/7t+XbTUwT41rh 9LVthuqtos81brffbdRSYinOSDTUYi4qTgQAC5qRB3gCAAA= X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrDLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0g3kvpSzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlbF7jXbBWf2KWwtmMjcwvlDrYuTkkBAwkXh49C8LiM0moC5x48ZPZhBbRMBM 
4mDrH3YQm1ngLpPEgX42EFtYIF7iyLcbYDUsAqoS29YvAKvhBao/df0+K8RMeYnVGw6A1XAC xX/M6AXrFRIwlXi34BLTBEauBYwMqxhFMvPKchMzc0z1irMzKvMyK/SS83M3MQLDeFntn4k7 GL9cdj/EKMDBqMTD++Dx1nQh1sSy4srcQ4wSHMxKIrxt9VvShXhTEiurUovy44tKc1KLDzFK c7AoifN6hacmCAmkJ5akZqemFqQWwWSZODilGhjvrs+bpHgq486nzRm/9aO03u18/FdCTu2P qvi/uDmye99Zcy5sL809s/uc0j5JabWXBrMPn1xhqeK30fDMrHdr7sk5z2M38da4Xlt49vq3 P6yzjN0CDu/iq5HYu/Oc7MSWT0e+d6zaXiZxc/3816fuHvNY+qTqEOeelW3mTEstnMRfZUU1 rzg4SYmlOCPRUIu5qDgRAMF1a/tfAgAA X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" A new mechanism, LUF(Lazy Unmap Flush), defers tlb flush until folios that have been unmapped and freed, eventually get allocated again. It's safe for folios that had been mapped read only and were unmapped, since the contents of the folios don't change while staying in pcp or buddy so we can still read the data through the stale tlb entries. Applied the mechanism to unmapping during folio reclaim. Signed-off-by: Byungchul Park --- include/linux/rmap.h | 5 +++-- mm/rmap.c | 11 +++++++---- mm/vmscan.c | 37 ++++++++++++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index cedba4812ccc7..854b41441d466 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -661,7 +661,7 @@ int folio_referenced(struct folio *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); =20 bool try_to_migrate(struct folio *folio, enum ttu_flags flags); -void try_to_unmap(struct folio *, enum ttu_flags flags); +bool try_to_unmap(struct folio *, enum ttu_flags flags); =20 int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct page **pages, @@ -794,8 +794,9 @@ static inline int folio_referenced(struct folio *folio,= int is_locked, return 0; } =20 -static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags) +static inline bool try_to_unmap(struct folio *folio, enum ttu_flags flags) { + return false; } =20 static inline int folio_mkclean(struct folio *folio) diff --git a/mm/rmap.c b/mm/rmap.c index b6b61b8103655..55003eb0b4936 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2386,10 +2386,11 @@ static int folio_not_mapped(struct folio *folio) * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). + * Return true if all the mappings are read-only, otherwise false. * * Context: Caller must hold the folio lock. 
*/ -void try_to_unmap(struct folio *folio, enum ttu_flags flags) +bool try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc =3D { .rmap_one =3D try_to_unmap_one, @@ -2408,10 +2409,12 @@ void try_to_unmap(struct folio *folio, enum ttu_fla= gs flags) else rmap_walk(folio, &rwc); =20 - if (can_luf_test()) + if (can_luf_test()) { fold_batch(tlb_ubc_luf, tlb_ubc_ro, true); - else - fold_batch(tlb_ubc, tlb_ubc_ro, true); + return true; + } + fold_batch(tlb_ubc, tlb_ubc_ro, true); + return false; } =20 /* diff --git a/mm/vmscan.c b/mm/vmscan.c index cbca027d2a10e..1ece0ccfccefb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1052,14 +1052,17 @@ static unsigned int shrink_folio_list(struct list_h= ead *folio_list, struct reclaim_stat *stat, bool ignore_references) { struct folio_batch free_folios; + struct folio_batch free_folios_luf; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed =3D 0; unsigned int pgactivate =3D 0; bool do_demote_pass; struct swap_iocb *plug =3D NULL; + unsigned short luf_key; =20 folio_batch_init(&free_folios); + folio_batch_init(&free_folios_luf); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass =3D can_demote(pgdat->node_id, sc); @@ -1071,6 +1074,7 @@ static unsigned int shrink_folio_list(struct list_hea= d *folio_list, enum folio_references references =3D FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; + bool can_luf =3D false; =20 cond_resched(); =20 @@ -1309,7 +1313,7 @@ static unsigned int shrink_folio_list(struct list_hea= d *folio_list, if (folio_test_large(folio)) flags |=3D TTU_SYNC; =20 - try_to_unmap(folio, flags); + can_luf =3D try_to_unmap(folio, flags); if (folio_mapped(folio)) { stat->nr_unmap_fail +=3D nr_pages; if (!was_swapbacked && @@ -1453,6 +1457,8 @@ static unsigned int shrink_folio_list(struct list_hea= d *folio_list, * leave it off the LRU). */ nr_reclaimed +=3D nr_pages; + if (can_luf) + luf_flush(fold_unmap_luf()); continue; } } @@ -1485,6 +1491,19 @@ static unsigned int shrink_folio_list(struct list_he= ad *folio_list, nr_reclaimed +=3D nr_pages; =20 folio_unqueue_deferred_split(folio); + + if (can_luf) { + if (folio_batch_add(&free_folios_luf, folio) =3D=3D 0) { + mem_cgroup_uncharge_folios(&free_folios); + mem_cgroup_uncharge_folios(&free_folios_luf); + luf_key =3D fold_unmap_luf(); + try_to_unmap_flush(); + free_unref_folios(&free_folios, 0); + free_unref_folios(&free_folios_luf, luf_key); + } + continue; + } + if (folio_batch_add(&free_folios, folio) =3D=3D 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -1519,9 +1538,21 @@ static unsigned int shrink_folio_list(struct list_he= ad *folio_list, list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); + if (can_luf) + luf_flush(fold_unmap_luf()); } /* 'folio_list' is always empty here */ =20 + /* + * Finalize this turn before demote_folio_list(). 
+ */ + mem_cgroup_uncharge_folios(&free_folios); + mem_cgroup_uncharge_folios(&free_folios_luf); + luf_key =3D fold_unmap_luf(); + try_to_unmap_flush(); + free_unref_folios(&free_folios, 0); + free_unref_folios(&free_folios_luf, luf_key); + /* Migrate folios selected for demotion */ stat->nr_demoted =3D demote_folio_list(&demote_folios, pgdat); nr_reclaimed +=3D stat->nr_demoted; @@ -1554,10 +1585,6 @@ static unsigned int shrink_folio_list(struct list_he= ad *folio_list, =20 pgactivate =3D stat->nr_activate[0] + stat->nr_activate[1]; =20 - mem_cgroup_uncharge_folios(&free_folios); - try_to_unmap_flush(); - free_unref_folios(&free_folios, 0); - list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); =20 --=20 2.17.1 From nobody Sun Dec 14 12:13:33 2025 Received: from invmail4.hynix.com (exvmail4.hynix.com [166.125.252.92]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 8A8211EBA14 for ; Thu, 20 Feb 2025 05:36:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=166.125.252.92 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029775; cv=none; b=sp/KaWqyXEeh6qC5VfJCZ0CXV0fN0Fg1dk81OiKi+lH6YvP734t1AuBhGk+BfS5ha4KpTXFCvlyMN8l81ZMUZrju6noV25RLMP9/0rIQtVNEJKiE6QXVSKpxSRvIYKQwSufZo+M3TgCNSjK1g5PS0mi2PNKCWSrrQdraTjbn12g= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1740029775; c=relaxed/simple; bh=8cjiuzwkFX3AlqVLR0qyEp6gA9iYW/MALgIX6OXBfdQ=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References; b=OMCHjdEc7TOQH0PogpmpFbiH5PYgeHGGdxzawSC+Si9jI8rhlIDL3UAMgcPqJFu6krbjeQTH4KJcr0jZV6MnGu35r90qrDr9COlJhYz6TX3Qh8N09uJlc1UayrfzlQQ+JI6wkKC/XAfwOc4jPlnBaPipMWrngUCL9nxzEGsjMgY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com; spf=pass smtp.mailfrom=sk.com; arc=none smtp.client-ip=166.125.252.92 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=sk.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=sk.com X-AuditID: a67dfc5b-3c9ff7000001d7ae-32-67b6bba7c76f From: Byungchul Park To: linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: kernel_team@skhynix.com, akpm@linux-foundation.org, ying.huang@intel.com, vernhao@tencent.com, mgorman@techsingularity.net, hughd@google.com, willy@infradead.org, david@redhat.com, peterz@infradead.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, rjgolo@gmail.com Subject: [RFC PATCH v12 26/26] mm/luf: implement luf debug feature Date: Thu, 20 Feb 2025 14:20:27 +0900 Message-Id: <20250220052027.58847-27-byungchul@sk.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20250220052027.58847-1-byungchul@sk.com> References: <20250220052027.58847-1-byungchul@sk.com> X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFnrBLMWRmVeSWpSXmKPExsXC9ZZnoe7y3dvSDY781LCYs34Nm8XnDf/Y LF5saGe0+Lr+F7PF0099LBaXd81hs7i35j+rxflda1ktdizdx2Rx6cACJovjvQeYLObf+8xm sXnTVGaL41OmMlr8/gFUfHLWZBYHAY/vrX0sHjtn3WX3WLCp1GPzCi2PxXteMnlsWtXJ5rHp 0yR2j3fnzrF7nJjxm8Vj3slAj/f7rrJ5bP1l59E49Rqbx+dNcgF8UVw2Kak5mWWpRfp2CVwZ +99NZC24eJKxYv6a80wNjHuXMHYxcnJICJhI/H81lwnG/rV8ASuIzSagLnHjxk9mEFtEwEzi YOsfdhCbWeAuk8SBfjYQW1jASWLbpqNgc1gEVCXmPF/IAmLzAtWfWncXar68xOoNB8DmcALF f8zoBesVEjCVeLfgEtBeLqCaz2wSn5+cZoZokJQ4uOIGywRG3gWMDKsYhTLzynITM3NM9DIq 8zIr9JLzczcxAsN/We2f6B2Mny4EH2IU4GBU4uGd0botXYg1say4MvcQowQHs5IIb1v9lnQh 3pTEyqrUovz4otKc1OJDjNIcLErivEbfylOEBNITS1KzU1MLUotgskwcnFINjHlVHLaf1We6 
uZ03Xyd2bwFXxfUPj2aGS6ju7HXQWVIwLWBFOXtPlZsh09fteWXBMadOKHcIbt3Ut/X/rcVK Bp+7uLeJLNCd7+fJ7mZ+dvW7SJ/vd9adiWXdbLlBtq7ymv1iqx3NAXlKTUsernnKEHQ/SJ7J 491+g5rEM4I+O//ZvFnlc47vkRJLcUaioRZzUXEiAIVnIgB7AgAA X-Brightmail-Tracker: H4sIAAAAAAAAA+NgFjrPLMWRmVeSWpSXmKPExsXC5WfdrLt897Z0g9UT5CzmrF/DZvF5wz82 ixcb2hktvq7/xWzx9FMfi8XhuSdZLS7vmsNmcW/Nf1aL87vWslrsWLqPyeLSgQVMFsd7DzBZ zL/3mc1i86apzBbHp0xltPj9A6j45KzJLA6CHt9b+1g8ds66y+6xYFOpx+YVWh6L97xk8ti0 qpPNY9OnSewe786dY/c4MeM3i8e8k4Ee7/ddZfNY/OIDk8fWX3YejVOvsXl83iQXwB/FZZOS mpNZllqkb5fAlbH/3UTWgosnGSvmrznP1MC4dwljFyMnh4SAicSv5QtYQWw2AXWJGzd+MoPY IgJmEgdb/7CD2MwCd5kkDvSzgdjCAk4S2zYdBetlEVCVmPN8IQuIzQtUf2rdXaiZ8hKrNxwA m8MJFP8xoxesV0jAVOLdgktMExi5FjAyrGIUycwry03MzDHVK87OqMzLrNBLzs/dxAgM5mW1 fybuYPxy2f0QowAHoxIP74PHW9OFWBPLiitzDzFKcDArifC21W9JF+JNSaysSi3Kjy8qzUkt PsQozcGiJM7rFZ6aICSQnliSmp2aWpBaBJNl4uCUamDkLTu88uLcvaKTz2lZ23uvl8iNDtLO +257/PKfqAW1p20nvXRKn75KNVB36vE1TLOiZ365JPyycBuHxaZ77Vs95khsebh2Vc2FOG2p Jex8T0XO3FxxeLfosgvMPA0u/32UhdYE/nrcmXL4fgCLiJOXWIinPDPT0/9Bkr9dDSZV3/KQ 86ph8HBQYinOSDTUYi4qTgQApwYahmICAAA= X-CFilter-Loop: Reflected Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" We need luf debug feature to detect when luf goes wrong by any chance. As a RFC, suggest a simple implementation to report problematic situations by luf. Signed-off-by: Byungchul Park --- arch/riscv/include/asm/tlbflush.h | 3 + arch/riscv/mm/tlbflush.c | 35 ++++- arch/x86/include/asm/pgtable.h | 10 ++ arch/x86/include/asm/tlbflush.h | 3 + arch/x86/mm/pgtable.c | 10 ++ arch/x86/mm/tlb.c | 35 ++++- include/linux/highmem-internal.h | 5 + include/linux/mm.h | 20 ++- include/linux/mm_types.h | 16 +-- include/linux/mm_types_task.h | 16 +++ include/linux/sched.h | 5 + mm/highmem.c | 1 + mm/memory.c | 12 ++ mm/page_alloc.c | 34 ++++- mm/page_ext.c | 3 + mm/rmap.c | 229 ++++++++++++++++++++++++++++++ 16 files changed, 418 insertions(+), 19 deletions(-) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlb= flush.h index ec5caeb3cf8ef..9451f3d22f229 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -69,6 +69,9 @@ bool arch_tlbbatch_check_done(struct arch_tlbflush_unmap_= batch *batch, unsigned bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, unsigned = long ugen); void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batch, unsi= gned long ugen); void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen); +#ifdef CONFIG_LUF_DEBUG +extern void print_lufd_arch(void); +#endif =20 static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *b= atch) { diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 93afb7a299003..de91bfe0426c2 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -216,6 +216,25 @@ static int __init luf_init_arch(void) } early_initcall(luf_init_arch); =20 +#ifdef CONFIG_LUF_DEBUG +static DEFINE_SPINLOCK(luf_debug_lock); +#define lufd_lock(f) spin_lock_irqsave(&luf_debug_lock, (f)) +#define lufd_unlock(f) spin_unlock_irqrestore(&luf_debug_lock, (f)) + +void print_lufd_arch(void) +{ + int cpu; + + pr_cont("LUFD ARCH:"); + for_each_cpu(cpu, cpu_possible_mask) + pr_cont(" %lu", atomic_long_read(per_cpu_ptr(&ugen_done, cpu))); + pr_cont("\n"); +} +#else +#define lufd_lock(f) do { (void)(f); } while(0) +#define lufd_unlock(f) do { (void)(f); } while(0) +#endif + /* * batch will 
not be updated. */ @@ -223,17 +242,22 @@ bool arch_tlbbatch_check_done(struct arch_tlbflush_un= map_batch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) goto out; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { unsigned long done; =20 done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); - if (ugen_before(done, ugen)) + if (ugen_before(done, ugen)) { + lufd_unlock(flags); return false; + } } + lufd_unlock(flags); return true; out: return cpumask_empty(&batch->cpumask); @@ -243,10 +267,12 @@ bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_ba= tch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) goto out; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { unsigned long done; =20 @@ -254,6 +280,7 @@ bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batc= h *batch, if (!ugen_before(done, ugen)) cpumask_clear_cpu(cpu, &batch->cpumask); } + lufd_unlock(flags); out: return cpumask_empty(&batch->cpumask); } @@ -262,10 +289,12 @@ void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unm= ap_batch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) return; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); unsigned long old =3D atomic_long_read(done); @@ -283,15 +312,18 @@ void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unm= ap_batch *batch, */ atomic_long_cmpxchg(done, old, ugen); } + lufd_unlock(flags); } =20 void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) return; =20 + lufd_lock(flags); for_each_cpu(cpu, mm_cpumask(mm)) { atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); unsigned long old =3D atomic_long_read(done); @@ -309,4 +341,5 @@ void arch_mm_mark_ugen(struct mm_struct *mm, unsigned l= ong ugen) */ atomic_long_cmpxchg(done, old, ugen); } + lufd_unlock(flags); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 593f10aabd45a..414bcabb23b51 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -695,12 +695,22 @@ static inline pud_t pud_mkyoung(pud_t pud) return pud_set_flags(pud, _PAGE_ACCESSED); } =20 +#ifdef CONFIG_LUF_DEBUG +pud_t pud_mkwrite(pud_t pud); +static inline pud_t __pud_mkwrite(pud_t pud) +{ + pud =3D pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); +} +#else static inline pud_t pud_mkwrite(pud_t pud) { pud =3D pud_set_flags(pud, _PAGE_RW); =20 return pud_clear_saveddirty(pud); } +#endif =20 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflus= h.h index 1fc5bacd72dff..2825f4befb272 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -297,6 +297,9 @@ extern bool arch_tlbbatch_check_done(struct arch_tlbflu= sh_unmap_batch *batch, un extern bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_batch *batch, un= signed long ugen); extern void arch_tlbbatch_mark_ugen(struct arch_tlbflush_unmap_batch *batc= h, unsigned long ugen); extern void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen); +#ifdef CONFIG_LUF_DEBUG +extern void print_lufd_arch(void); +#endif =20 static inline void arch_tlbbatch_clear(struct arch_tlbflush_unmap_batch *b= atch) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5745a354a241c..f72e4cfdb0a8d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -901,6 +901,7 @@ 
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) =20 pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { + lufd_check_pages(pte_page(pte), 0); if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); =20 @@ -911,6 +912,7 @@ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) =20 pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { + lufd_check_pages(pmd_page(pmd), PMD_ORDER); if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); =20 @@ -919,6 +921,14 @@ pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vm= a) return pmd_clear_saveddirty(pmd); } =20 +#ifdef CONFIG_LUF_DEBUG +pud_t pud_mkwrite(pud_t pud) +{ + lufd_check_pages(pud_page(pud), PUD_ORDER); + return __pud_mkwrite(pud); +} +#endif + void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 975f58fa4b30f..e9ae0d8f73442 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1253,6 +1253,25 @@ static int __init luf_init_arch(void) } early_initcall(luf_init_arch); =20 +#ifdef CONFIG_LUF_DEBUG +static DEFINE_SPINLOCK(luf_debug_lock); +#define lufd_lock(f) spin_lock_irqsave(&luf_debug_lock, (f)) +#define lufd_unlock(f) spin_unlock_irqrestore(&luf_debug_lock, (f)) + +void print_lufd_arch(void) +{ + int cpu; + + pr_cont("LUFD ARCH:"); + for_each_cpu(cpu, cpu_possible_mask) + pr_cont(" %lu", atomic_long_read(per_cpu_ptr(&ugen_done, cpu))); + pr_cont("\n"); +} +#else +#define lufd_lock(f) do { (void)(f); } while(0) +#define lufd_unlock(f) do { (void)(f); } while(0) +#endif + /* * batch will not be updated. */ @@ -1260,17 +1279,22 @@ bool arch_tlbbatch_check_done(struct arch_tlbflush_= unmap_batch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) goto out; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { unsigned long done; =20 done =3D atomic_long_read(per_cpu_ptr(&ugen_done, cpu)); - if (ugen_before(done, ugen)) + if (ugen_before(done, ugen)) { + lufd_unlock(flags); return false; + } } + lufd_unlock(flags); return true; out: return cpumask_empty(&batch->cpumask); @@ -1280,10 +1304,12 @@ bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_= batch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) goto out; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { unsigned long done; =20 @@ -1291,6 +1317,7 @@ bool arch_tlbbatch_diet(struct arch_tlbflush_unmap_ba= tch *batch, if (!ugen_before(done, ugen)) cpumask_clear_cpu(cpu, &batch->cpumask); } + lufd_unlock(flags); out: return cpumask_empty(&batch->cpumask); } @@ -1299,10 +1326,12 @@ void arch_tlbbatch_mark_ugen(struct arch_tlbflush_u= nmap_batch *batch, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) return; =20 + lufd_lock(flags); for_each_cpu(cpu, &batch->cpumask) { atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); unsigned long old =3D atomic_long_read(done); @@ -1320,15 +1349,18 @@ void arch_tlbbatch_mark_ugen(struct arch_tlbflush_u= nmap_batch *batch, */ atomic_long_cmpxchg(done, old, ugen); } + lufd_unlock(flags); } =20 void arch_mm_mark_ugen(struct mm_struct *mm, unsigned long ugen) { int cpu; + unsigned long flags; =20 if (!ugen) return; =20 + lufd_lock(flags); for_each_cpu(cpu, mm_cpumask(mm)) { atomic_long_t *done =3D per_cpu_ptr(&ugen_done, cpu); unsigned long old =3D atomic_long_read(done); @@ -1346,6 +1378,7 @@ void arch_mm_mark_ugen(struct mm_struct *mm, unsigned= long ugen) */ atomic_long_cmpxchg(done, old, ugen); } + lufd_unlock(flags); } =20 void 
arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-inter= nal.h index dd100e849f5e0..0792530d1be7b 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -41,6 +41,7 @@ static inline void *kmap(struct page *page) { void *addr; =20 + lufd_check_pages(page, 0); might_sleep(); if (!PageHighMem(page)) addr =3D page_address(page); @@ -161,6 +162,7 @@ static inline struct page *kmap_to_page(void *addr) =20 static inline void *kmap(struct page *page) { + lufd_check_pages(page, 0); might_sleep(); return page_address(page); } @@ -177,11 +179,13 @@ static inline void kunmap(struct page *page) =20 static inline void *kmap_local_page(struct page *page) { + lufd_check_pages(page, 0); return page_address(page); } =20 static inline void *kmap_local_folio(struct folio *folio, size_t offset) { + lufd_check_folio(folio); return page_address(&folio->page) + offset; } =20 @@ -204,6 +208,7 @@ static inline void __kunmap_local(const void *addr) =20 static inline void *kmap_atomic(struct page *page) { + lufd_check_pages(page, 0); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c81c9831bc5d..9572fbbb9d73f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,24 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); =20 +#ifdef CONFIG_LUF_DEBUG +void lufd_check_folio(struct folio *f); +void lufd_check_pages(const struct page *p, unsigned int order); +void lufd_check_zone_pages(struct zone *zone, struct page *page, unsigned = int order); +void lufd_check_queued_pages(void); +void lufd_queue_page_for_check(struct page *page, int order); +void lufd_mark_folio(struct folio *f, unsigned short luf_key); +void lufd_mark_pages(struct page *p, unsigned int order, unsigned short lu= f_key); +#else +static inline void lufd_check_folio(struct folio *f) {} +static inline void lufd_check_pages(const struct page *p, unsigned int ord= er) {} +static inline void lufd_check_zone_pages(struct zone *zone, struct page *p= age, unsigned int order) {} +static inline void lufd_check_queued_pages(void) {} +static inline void lufd_queue_page_for_check(struct page *page, int order)= {} +static inline void lufd_mark_folio(struct folio *f, unsigned short luf_key= ) {} +static inline void lufd_mark_pages(struct page *p, unsigned int order, uns= igned short luf_key) {} +#endif + #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; =20 @@ -113,7 +131,7 @@ extern int mmap_rnd_compat_bits __read_mostly; #endif =20 #ifndef page_to_virt -#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) +#define page_to_virt(x) ({ lufd_check_pages(x, 0); __va(PFN_PHYS(page_to_p= fn(x)));}) #endif =20 #ifndef lm_alias diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e3132e1e5e5d2..e0c5712dc46ff 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -22,6 +22,10 @@ =20 #include =20 +#ifdef CONFIG_LUF_DEBUG +extern struct page_ext_operations luf_debug_ops; +#endif + #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif @@ -32,18 +36,6 @@ struct address_space; struct mem_cgroup; =20 -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -struct luf_batch { - struct tlbflush_unmap_batch batch; - unsigned long ugen; - rwlock_t lock; -}; -void luf_batch_init(struct luf_batch *lb); -#else -struct luf_batch {}; -static inline void 
luf_batch_init(struct luf_batch *lb) {} -#endif - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index bff5706b76e14..b5dfc451c009b 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -9,6 +9,7 @@ */ =20 #include +#include =20 #include =20 @@ -67,4 +68,19 @@ struct tlbflush_unmap_batch { #endif }; =20 +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +struct luf_batch { + struct tlbflush_unmap_batch batch; + unsigned long ugen; + rwlock_t lock; +}; +void luf_batch_init(struct luf_batch *lb); +#else +struct luf_batch {}; +static inline void luf_batch_init(struct luf_batch *lb) {} +#endif + +#if defined(CONFIG_LUF_DEBUG) +#define NR_LUFD_PAGES 512 +#endif #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 463cb2fb8f919..eb1487fa101e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1380,6 +1380,11 @@ struct task_struct { unsigned long luf_ugen; unsigned long zone_ugen; unsigned long wait_zone_ugen; +#if defined(CONFIG_LUF_DEBUG) + struct page *lufd_pages[NR_LUFD_PAGES]; + int lufd_pages_order[NR_LUFD_PAGES]; + int lufd_pages_nr; +#endif #endif =20 struct tlbflush_unmap_batch tlb_ubc; diff --git a/mm/highmem.c b/mm/highmem.c index ef3189b36cadb..a323d5a655bf9 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -576,6 +576,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_= t prot) { void *kmap; =20 + lufd_check_pages(page, 0); /* * To broaden the usage of the actual kmap_local() machinery always map * pages when debugging is enabled and the architecture has no problems diff --git a/mm/memory.c b/mm/memory.c index c98af5e567e89..89d047867d60d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6124,6 +6124,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vm= a, unsigned long address, mapping =3D vma->vm_file->f_mapping; } =20 +#ifdef CONFIG_LUF_DEBUG + if (luf_flush) { + /* + * If it has a VM_SHARED mapping, all the mms involved + * in the struct address_space should be luf_flush'ed. 
+ */ + if (mapping) + luf_flush_mapping(mapping); + luf_flush_mm(mm); + } +#endif + if (unlikely(is_vm_hugetlb_page(vma))) ret =3D hugetlb_fault(vma->vm_mm, vma, address, flags); else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccbe49b78190a..c8ab60c60bb08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -758,6 +758,8 @@ void luf_takeoff_end(struct zone *zone) VM_WARN_ON(current->zone_ugen); VM_WARN_ON(current->wait_zone_ugen); } + + lufd_check_queued_pages(); } =20 /* @@ -853,8 +855,10 @@ bool luf_takeoff_check_and_fold(struct zone *zone, str= uct page *page) struct luf_batch *lb; unsigned long lb_ugen; =20 - if (!luf_key) + if (!luf_key) { + lufd_check_pages(page, buddy_order(page)); return true; + } =20 lb =3D &luf_batch[luf_key]; read_lock_irqsave(&lb->lock, flags); @@ -875,12 +879,15 @@ bool luf_takeoff_check_and_fold(struct zone *zone, st= ruct page *page) =20 if (!current->luf_ugen || ugen_before(current->luf_ugen, lb_ugen)) current->luf_ugen =3D lb_ugen; + lufd_queue_page_for_check(page, buddy_order(page)); return true; } =20 zone_ugen =3D page_zone_ugen(zone, page); - if (!zone_ugen) + if (!zone_ugen) { + lufd_check_pages(page, buddy_order(page)); return true; + } =20 /* * Should not be zero since zone-zone_ugen has been updated in @@ -888,17 +895,23 @@ bool luf_takeoff_check_and_fold(struct zone *zone, st= ruct page *page) */ VM_WARN_ON(!zone->zone_ugen); =20 - if (!ugen_before(READ_ONCE(zone->zone_ugen_done), zone_ugen)) + if (!ugen_before(READ_ONCE(zone->zone_ugen_done), zone_ugen)) { + lufd_check_pages(page, buddy_order(page)); return true; + } =20 if (current->luf_no_shootdown) return false; =20 + lufd_check_zone_pages(zone, page, buddy_order(page)); + /* * zone batched flush has been already set. */ - if (current->zone_ugen) + if (current->zone_ugen) { + lufd_queue_page_for_check(page, buddy_order(page)); return true; + } =20 /* * Others are already performing tlb shootdown for us. 
All we @@ -933,6 +946,7 @@ bool luf_takeoff_check_and_fold(struct zone *zone, stru= ct page *page) atomic_long_set(&zone->nr_luf_pages, 0); fold_batch(tlb_ubc_takeoff, &zone->zone_batch, true); } + lufd_queue_page_for_check(page, buddy_order(page)); return true; } #endif @@ -1238,6 +1252,11 @@ static inline void __free_one_page(struct page *page, } else zone_ugen =3D page_zone_ugen(zone, page); =20 + if (!zone_ugen) + lufd_check_pages(page, order); + else + lufd_check_zone_pages(zone, page, order); + while (order < MAX_PAGE_ORDER) { int buddy_mt =3D migratetype; unsigned long buddy_zone_ugen; @@ -1299,6 +1318,10 @@ static inline void __free_one_page(struct page *page, set_page_zone_ugen(page, zone_ugen); pfn =3D combined_pfn; order++; + if (!zone_ugen) + lufd_check_pages(page, order); + else + lufd_check_zone_pages(zone, page, order); } =20 done_merging: @@ -3201,6 +3224,8 @@ void free_unref_page(struct page *page, unsigned int = order, unsigned long pfn =3D page_to_pfn(page); int migratetype; =20 + lufd_mark_pages(page, order, luf_key); + if (!pcp_allowed_order(order)) { __free_pages_ok(page, order, FPI_NONE, luf_key); return; @@ -3253,6 +3278,7 @@ void free_unref_folios(struct folio_batch *folios, un= signed short luf_key) unsigned long pfn =3D folio_pfn(folio); unsigned int order =3D folio_order(folio); =20 + lufd_mark_folio(folio, luf_key); if (!free_pages_prepare(&folio->page, order)) continue; /* diff --git a/mm/page_ext.c b/mm/page_ext.c index 641d93f6af4c1..be40bc2a93378 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -89,6 +89,9 @@ static struct page_ext_operations *page_ext_ops[] __initd= ata =3D { #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif +#ifdef CONFIG_LUF_DEBUG + &luf_debug_ops, +#endif }; =20 unsigned long page_ext_size; diff --git a/mm/rmap.c b/mm/rmap.c index 55003eb0b4936..fd6d5cb0fa8d0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1161,6 +1161,235 @@ static bool should_defer_flush(struct mm_struct *mm= , enum ttu_flags flags) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ =20 +#ifdef CONFIG_LUF_DEBUG + +static bool need_luf_debug(void) +{ + return true; +} + +static void init_luf_debug(void) +{ + /* Do nothing */ +} + +struct page_ext_operations luf_debug_ops =3D { + .size =3D sizeof(struct luf_batch), + .need =3D need_luf_debug, + .init =3D init_luf_debug, + .need_shared_flags =3D false, +}; + +static bool __lufd_check_zone_pages(struct page *page, int nr, + struct tlbflush_unmap_batch *batch, unsigned long ugen) +{ + int i; + + for (i =3D 0; i < nr; i++) { + struct page_ext *page_ext; + struct luf_batch *lb; + unsigned long lb_ugen; + unsigned long flags; + bool ret; + + page_ext =3D page_ext_get(page + i); + if (!page_ext) + continue; + + lb =3D (struct luf_batch *)page_ext_data(page_ext, &luf_debug_ops); + write_lock_irqsave(&lb->lock, flags); + lb_ugen =3D lb->ugen; + ret =3D arch_tlbbatch_done(&lb->batch.arch, &batch->arch); + write_unlock_irqrestore(&lb->lock, flags); + page_ext_put(page_ext); + + if (!ret || ugen_before(ugen, lb_ugen)) + return false; + } + return true; +} + +void lufd_check_zone_pages(struct zone *zone, struct page *page, unsigned = int order) +{ + bool warn; + static bool once =3D false; + + if (!page || !zone) + return; + + warn =3D !__lufd_check_zone_pages(page, 1 << order, + &zone->zone_batch, zone->luf_ugen); + + if (warn && !READ_ONCE(once)) { + WRITE_ONCE(once, true); + VM_WARN(1, "LUFD: ugen(%lu) page(%p) order(%u)\n", + atomic_long_read(&luf_ugen), page, order); + print_lufd_arch(); + } +} + +static bool 
__lufd_check_pages(const struct page *page, int nr) +{ + int i; + + for (i =3D 0; i < nr; i++) { + struct page_ext *page_ext; + struct luf_batch *lb; + unsigned long lb_ugen; + unsigned long flags; + bool ret; + + page_ext =3D page_ext_get(page + i); + if (!page_ext) + continue; + + lb =3D (struct luf_batch *)page_ext_data(page_ext, &luf_debug_ops); + write_lock_irqsave(&lb->lock, flags); + lb_ugen =3D lb->ugen; + ret =3D arch_tlbbatch_diet(&lb->batch.arch, lb_ugen); + write_unlock_irqrestore(&lb->lock, flags); + page_ext_put(page_ext); + + if (!ret) + return false; + } + return true; +} + +void lufd_queue_page_for_check(struct page *page, int order) +{ + struct page **parray =3D current->lufd_pages; + int *oarray =3D current->lufd_pages_order; + + if (!page) + return; + + if (current->lufd_pages_nr >=3D NR_LUFD_PAGES) { + VM_WARN_ONCE(1, "LUFD: NR_LUFD_PAGES is too small.\n"); + return; + } + + *(parray + current->lufd_pages_nr) =3D page; + *(oarray + current->lufd_pages_nr) =3D order; + current->lufd_pages_nr++; +} + +void lufd_check_queued_pages(void) +{ + struct page **parray =3D current->lufd_pages; + int *oarray =3D current->lufd_pages_order; + int i; + + for (i =3D 0; i < current->lufd_pages_nr; i++) + lufd_check_pages(*(parray + i), *(oarray + i)); + current->lufd_pages_nr =3D 0; +} + +void lufd_check_folio(struct folio *folio) +{ + struct page *page; + int nr; + bool warn; + static bool once =3D false; + + if (!folio) + return; + + page =3D folio_page(folio, 0); + nr =3D folio_nr_pages(folio); + + warn =3D !__lufd_check_pages(page, nr); + + if (warn && !READ_ONCE(once)) { + WRITE_ONCE(once, true); + VM_WARN(1, "LUFD: ugen(%lu) page(%p) nr(%d)\n", + atomic_long_read(&luf_ugen), page, nr); + print_lufd_arch(); + } +} +EXPORT_SYMBOL(lufd_check_folio); + +void lufd_check_pages(const struct page *page, unsigned int order) +{ + bool warn; + static bool once =3D false; + + if (!page) + return; + + warn =3D !__lufd_check_pages(page, 1 << order); + + if (warn && !READ_ONCE(once)) { + WRITE_ONCE(once, true); + VM_WARN(1, "LUFD: ugen(%lu) page(%p) order(%u)\n", + atomic_long_read(&luf_ugen), page, order); + print_lufd_arch(); + } +} +EXPORT_SYMBOL(lufd_check_pages); + +static void __lufd_mark_pages(struct page *page, int nr, unsigned short lu= f_key) +{ + int i; + + for (i =3D 0; i < nr; i++) { + struct page_ext *page_ext; + struct luf_batch *lb; + + page_ext =3D page_ext_get(page + i); + if (!page_ext) + continue; + + lb =3D (struct luf_batch *)page_ext_data(page_ext, &luf_debug_ops); + fold_luf_batch(lb, &luf_batch[luf_key]); + page_ext_put(page_ext); + } +} + +void lufd_mark_folio(struct folio *folio, unsigned short luf_key) +{ + struct page *page; + int nr; + bool warn; + static bool once =3D false; + + if (!luf_key) + return; + + page =3D folio_page(folio, 0); + nr =3D folio_nr_pages(folio); + + warn =3D !__lufd_check_pages(page, nr); + __lufd_mark_pages(page, nr, luf_key); + + if (warn && !READ_ONCE(once)) { + WRITE_ONCE(once, true); + VM_WARN(1, "LUFD: ugen(%lu) page(%p) nr(%d)\n", + atomic_long_read(&luf_ugen), page, nr); + print_lufd_arch(); + } +} + +void lufd_mark_pages(struct page *page, unsigned int order, unsigned short= luf_key) +{ + bool warn; + static bool once =3D false; + + if (!luf_key) + return; + + warn =3D !__lufd_check_pages(page, 1 << order); + __lufd_mark_pages(page, 1 << order, luf_key); + + if (warn && !READ_ONCE(once)) { + WRITE_ONCE(once, true); + VM_WARN(1, "LUFD: ugen(%lu) page(%p) order(%u)\n", + atomic_long_read(&luf_ugen), page, order); + print_lufd_arch(); 
+ } +} +#endif + /** * page_address_in_vma - The virtual address of a page in this VMA. * @folio: The folio containing the page. --=20 2.17.1
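To summarize the debug scheme: every page carries a struct luf_batch in its page_ext (luf_debug_ops), lufd_mark_pages()/lufd_mark_folio() fold the pending batch into it whenever a page is freed with a non-zero luf_key, and lufd_check_pages()/lufd_check_folio() verify at each use site (kmap*(), page_to_virt(), pte/pmd/pud_mkwrite(), the page allocation paths) that the recorded batch has already been flushed, warning once otherwise. Below is a condensed sketch of the core check with the lb->lock handling and the warn-once reporting stripped out; the helper name is made up for illustration, while the calls it makes are the ones the patch uses:

	static bool lufd_page_is_clean(const struct page *page)
	{
		struct page_ext *page_ext = page_ext_get(page);
		struct luf_batch *lb;
		bool clean;

		if (!page_ext)
			return true;

		lb = (struct luf_batch *)page_ext_data(page_ext, &luf_debug_ops);
		/* arch_tlbbatch_diet() drops CPUs whose deferred flush has
		 * already completed and returns true when nothing is pending. */
		clean = arch_tlbbatch_diet(&lb->batch.arch, lb->ugen);
		page_ext_put(page_ext);

		return clean;
	}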