From nobody Sun Feb  8 15:08:19 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 0E6E0CD13D2
	for <linux-kernel@archiver.kernel.org>; Mon, 18 Sep 2023 07:35:04 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S240091AbjIRHej (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Mon, 18 Sep 2023 03:34:39 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:34882 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S240398AbjIRHeS (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Mon, 18 Sep 2023 03:34:18 -0400
Received: from mgamail.intel.com (mgamail.intel.com [192.55.52.43])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1ABEECFC
        for <linux-kernel@vger.kernel.org>;
 Mon, 18 Sep 2023 00:33:38 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1695022419; x=1726558419;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=mlfQoxCuP8sG2p4kkpY1b2EQ7BccGlB5GeB5heEQ4Is=;
  b=c9z2Ne3bcRPhO2tiwD5iKVAHOF6c45+YfA53xjhLFh4SDCHfiKFSZxPe
   B+SV7NUc0/ediSNOw4TDWjRBJmHO2Y6Ohrr/9ETZfpVOlKvGNX9DPP5ST
   iq8MnkC6UD7LmRMr2yx8AEndpAgwaCPsLPbFbDJxlSzsOWHXdBMwRzINt
   vB/Hah4RGmy2bgOXAnW9rUpBIBjPLjso0ZE4r+pRuNasLApzX0+76d1Yu
   0/bJHcL0f9ir0de7xuq4YX+d7Q3evAUq+LyRzhn6WOCVDErlqhDN3xFgl
   2AdzjCY+dLn7wOkBsVLZxNIWPEf+ny9jjRMu4aF2fB6BjxIMDiMWKnEIU
   Q==;
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="465932097"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="465932097"
Received: from fmsmga001.fm.intel.com ([10.253.24.23])
  by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 18 Sep 2023 00:33:37 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="888929628"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="888929628"
Received: from fyin-dev.sh.intel.com ([10.239.159.24])
  by fmsmga001.fm.intel.com with ESMTP; 18 Sep 2023 00:32:52 -0700
From: Yin Fengwei <fengwei.yin@intel.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
        akpm@linux-foundation.org, yuzhao@google.com, willy@infradead.org,
        hughd@google.com, yosryahmed@google.com, ryan.roberts@arm.com,
        david@redhat.com, shy828301@gmail.com
Cc: fengwei.yin@intel.com
Subject: [PATCH v3 1/3] mm: add functions folio_in_range() and
 folio_within_vma()
Date: Mon, 18 Sep 2023 15:33:16 +0800
Message-Id: <20230918073318.1181104-2-fengwei.yin@intel.com>
X-Mailer: git-send-email 2.39.2
In-Reply-To: <20230918073318.1181104-1-fengwei.yin@intel.com>
References: <20230918073318.1181104-1-fengwei.yin@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Add folio_within_range(), which will be used to check whether the
folio is mapped to a specific VMA and whether the mapping address of
the folio is in the range.

Also add a helper function folio_within_vma(), based on
folio_within_range(), to check whether the folio is in the range of
the vma.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/internal.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/mm/internal.h b/mm/internal.h
index 346d82260964..9e2a5b32c659 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -587,6 +587,56 @@ extern long faultin_vma_page_range(struct vm_area_stru=
ct *vma,
 				   bool write, int *locked);
 extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 			       unsigned long bytes);
+
+/*
+ * NOTE: This function can't tell whether the folio is "fully mapped" in
+ * the range. "Fully mapped" means all the pages of the folio are
+ * associated with the page table of the range, while this function only
+ * checks whether the folio range is within [start, end), clamped to the
+ * VMA. The caller needs to do the page table check itself if it cares
+ * about the page table association.
+ *
+ * Typical usage (like mlock or madvise) is:
+ * Caller knows at least 1 page of folio is associated with page table of
+ * VMA and the range [start, end) intersects the VMA range. Caller wants
+ * to know whether the folio is fully associated with the range. It calls
+ * this function to check whether the folio is in the range first. Then
+ * checks the page table to see if the folio is fully mapped to the range.
+ */
+static inline bool
+folio_within_range(struct folio *folio, struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	pgoff_t pgoff, addr;
+	unsigned long vma_pglen =3D (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
+	if (start > end)
+		return false;
+
+	if (start < vma->vm_start)
+		start =3D vma->vm_start;
+
+	if (end > vma->vm_end)
+		end =3D vma->vm_end;
+
+	pgoff =3D folio_pgoff(folio);
+
+	/* bail out if the folio start offset is outside the vma pgoff range */
+	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
+		return false;
+
+	addr =3D vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	/* NOTE(review): end - addr wraps if end < addr; confirm callers */
+	return !(addr < start || end - addr < folio_size(folio));
+}
+
+static inline bool
+folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
+{
+	return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
+}
+
 /*
  * mlock_vma_folio() and munlock_vma_folio():
  * should be called with vma's mmap_lock held for read or write,
--=20
2.39.2
From nobody Sun Feb  8 15:08:19 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 2F7D0CD13D8
	for <linux-kernel@archiver.kernel.org>; Mon, 18 Sep 2023 07:35:04 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S240241AbjIRHes (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Mon, 18 Sep 2023 03:34:48 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:53232 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S240537AbjIRHed (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Mon, 18 Sep 2023 03:34:33 -0400
Received: from mgamail.intel.com (mgamail.intel.com [134.134.136.126])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9DA7ECC8
        for <linux-kernel@vger.kernel.org>;
 Mon, 18 Sep 2023 00:33:52 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1695022432; x=1726558432;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=2yvamEVzhzGGtx2e9kET8hOBuj45FWaLJl5k8kbHNoc=;
  b=UTe/bYNAmIPpwlysghvAfGRTC9gCCPstTKa8HwWRMmnvFinuNfyfbZiB
   jz/Fj9eWQPrSZJH+la2a70MZvloQd0C2POumJjUAKadB/hfYHgON6S35E
   gopjWHc/6DbTcmQ5ayFWrbcJlTdMEjoIe4TcexUJ6UyW3nxiLlCOwUQHw
   iBkVSPp3CvKdJAGhg7zOTWoorgHoYlpkAigi2MzLpR7G5N6ksKnppsmHY
   kqhCPx4cr51uTtoQ4UhTD528TP9RjJLpUaK4LYZPWu/2XGG2vBNwxtyPS
   HFYLeSJ0dgIvCAVZ6d37L7rrK53EMFfYmcocU4u0LZu8fz68PSlymO0Z/
   w==;
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="364630957"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="364630957"
Received: from fmsmga004.fm.intel.com ([10.253.24.48])
  by orsmga106.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 18 Sep 2023 00:33:51 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="815915431"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="815915431"
Received: from fyin-dev.sh.intel.com ([10.239.159.24])
  by fmsmga004.fm.intel.com with ESMTP; 18 Sep 2023 00:33:48 -0700
From: Yin Fengwei <fengwei.yin@intel.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
        akpm@linux-foundation.org, yuzhao@google.com, willy@infradead.org,
        hughd@google.com, yosryahmed@google.com, ryan.roberts@arm.com,
        david@redhat.com, shy828301@gmail.com
Cc: fengwei.yin@intel.com
Subject: [PATCH v3 2/3] mm: handle large folio when large folio in VM_LOCKED
 VMA range
Date: Mon, 18 Sep 2023 15:33:17 +0800
Message-Id: <20230918073318.1181104-3-fengwei.yin@intel.com>
X-Mailer: git-send-email 2.39.2
In-Reply-To: <20230918073318.1181104-1-fengwei.yin@intel.com>
References: <20230918073318.1181104-1-fengwei.yin@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

If a large folio is in the range of a VM_LOCKED VMA, it should be
mlocked to avoid being picked by page reclaim, which may split
the large folio and then mlock each page again.

Mlock this kind of large folio to prevent it from being picked by
page reclaim.

For a large folio which crosses the boundary of a VM_LOCKED VMA
or is not fully mapped to a VM_LOCKED VMA, we'd better not mlock
it. So if the system is under memory pressure, this kind of
large folio will be split and the pages out of the VM_LOCKED VMA
can be reclaimed.

Ideally, for a large folio, we should mlock it when the large folio
is fully mapped to the VMA and munlock it if any page is unmapped
from the VMA. But it's not easy to detect whether the large folio is
fully mapped to the VMA in some cases (like add/remove rmap). So we
update mlock_vma_folio() and munlock_vma_folio() to mlock/munlock
the folio according to vma->vm_flags. Let the caller decide whether
it should call these two functions.

For add rmap, only mlock normal 4K folios and postpone large folio
handling to the page reclaim phase. It is possible to reuse the page
table iterator to detect whether a folio is fully mapped or not during
the page reclaim phase. For remove rmap, invoke munlock_vma_folio()
to munlock the folio unconditionally because removing the rmap makes
the folio not fully mapped to the VMA.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/internal.h | 23 ++++++++++--------
 mm/rmap.c     | 66 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 9e2a5b32c659..c1441fd9898e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -645,14 +645,10 @@ folio_within_vma(struct folio *folio, struct vm_area_=
struct *vma)
  * mlock is usually called at the end of page_add_*_rmap(), munlock at
  * the end of page_remove_rmap(); but new anon folios are managed by
  * folio_add_lru_vma() calling mlock_new_folio().
- *
- * @compound is used to include pmd mappings of THPs, but filter out
- * pte mappings of THPs, which cannot be consistently counted: a pte
- * mapping of the THP head cannot be distinguished by the page alone.
  */
 void mlock_folio(struct folio *folio);
 static inline void mlock_vma_folio(struct folio *folio,
-			struct vm_area_struct *vma, bool compound)
+				struct vm_area_struct *vma)
 {
 	/*
 	 * The VM_SPECIAL check here serves two purposes.
@@ -662,17 +658,24 @@ static inline void mlock_vma_folio(struct folio *foli=
o,
 	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
 	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
 	 */
-	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) =3D=3D VM_LOCKED) &&
-	    (compound || !folio_test_large(folio)))
+	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) =3D=3D VM_LOCKED))
 		mlock_folio(folio);
 }
=20
 void munlock_folio(struct folio *folio);
 static inline void munlock_vma_folio(struct folio *folio,
-			struct vm_area_struct *vma, bool compound)
+					struct vm_area_struct *vma)
 {
-	if (unlikely(vma->vm_flags & VM_LOCKED) &&
-	    (compound || !folio_test_large(folio)))
+	/*
+	 * munlock if the function is called. Ideally, we should only
+	 * do munlock if any page of folio is unmapped from VMA and
+	 * cause folio not fully mapped to VMA.
+	 *
+	 * But it's not easy to confirm that's the situation. So we
+	 * always munlock the folio and page reclaim will correct it
+	 * if it's wrong.
+	 */
+	if (unlikely(vma->vm_flags & VM_LOCKED))
 		munlock_folio(folio);
 }
=20
diff --git a/mm/rmap.c b/mm/rmap.c
index 789a2beb8b3a..e4b92e585df9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -798,6 +798,7 @@ struct folio_referenced_arg {
 	unsigned long vm_flags;
 	struct mem_cgroup *memcg;
 };
+
 /*
  * arg: folio_referenced_arg will be passed
  */
@@ -807,17 +808,33 @@ static bool folio_referenced_one(struct folio *folio,
 	struct folio_referenced_arg *pra =3D arg;
 	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 	int referenced =3D 0;
+	unsigned long start =3D address, ptes =3D 0;
=20
 	while (page_vma_mapped_walk(&pvmw)) {
 		address =3D pvmw.address;
=20
-		if ((vma->vm_flags & VM_LOCKED) &&
-		    (!folio_test_large(folio) || !pvmw.pte)) {
-			/* Restore the mlock which got missed */
-			mlock_vma_folio(folio, vma, !pvmw.pte);
-			page_vma_mapped_walk_done(&pvmw);
-			pra->vm_flags |=3D VM_LOCKED;
-			return false; /* To break the loop */
+		if (vma->vm_flags & VM_LOCKED) {
+			if (!folio_test_large(folio) || !pvmw.pte) {
+				/* Restore the mlock which got missed */
+				mlock_vma_folio(folio, vma);
+				page_vma_mapped_walk_done(&pvmw);
+				pra->vm_flags |=3D VM_LOCKED;
+				return false; /* To break the loop */
+			}
+			/*
+			 * A large folio fully mapped to the VMA will
+			 * be handled after the pvmw loop.
+			 *
+			 * A large folio crossing VMA boundaries is
+			 * expected to be picked by page reclaim, but
+			 * references of pages which are in the range
+			 * of the VM_LOCKED vma should be skipped, as
+			 * page reclaim should only count references
+			 * of pages outside the VM_LOCKED vma range.
+			 */
+			ptes++;
+			pra->mapcount--;
+			continue;
 		}
=20
 		if (pvmw.pte) {
@@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio,
 		pra->mapcount--;
 	}
=20
+	if ((vma->vm_flags & VM_LOCKED) &&
+			folio_test_large(folio) &&
+			folio_within_vma(folio, vma)) {
+		unsigned long s_align, e_align;
+
+		s_align =3D ALIGN_DOWN(start, PMD_SIZE);
+		e_align =3D ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
+
+		/* folio doesn't cross page table boundary and fully mapped */
+		if ((s_align =3D=3D e_align) && (ptes =3D=3D folio_nr_pages(folio))) {
+			/* Restore the mlock which got missed */
+			mlock_vma_folio(folio, vma);
+			pra->vm_flags |=3D VM_LOCKED;
+			return false; /* To break the loop */
+		}
+	}
+
 	if (referenced)
 		folio_clear_idle(folio);
 	if (folio_test_clear_young(folio))
@@ -1252,7 +1286,14 @@ void page_add_anon_rmap(struct page *page, struct vm=
_area_struct *vma,
 	VM_WARN_ON_FOLIO(page_mapcount(page) > 1 && PageAnonExclusive(page),
 			 folio);
=20
-	mlock_vma_folio(folio, vma, compound);
+	/*
+	 * For large folio, only mlock it if it's fully mapped to VMA. It's
+	 * not easy to check whether the large folio is fully mapped to VMA
+	 * here. Only mlock normal 4K folio and leave page reclaim to handle
+	 * large folio.
+	 */
+	if (!folio_test_large(folio))
+		mlock_vma_folio(folio, vma);
 }
=20
 /**
@@ -1352,7 +1393,9 @@ void folio_add_file_rmap_range(struct folio *folio, s=
truct page *page,
 	if (nr)
 		__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
=20
-	mlock_vma_folio(folio, vma, compound);
+	/* See comments in page_add_anon_rmap() */
+	if (!folio_test_large(folio))
+		mlock_vma_folio(folio, vma);
 }
=20
 /**
@@ -1463,7 +1506,7 @@ void page_remove_rmap(struct page *page, struct vm_ar=
ea_struct *vma,
 	 * it's only reliable while mapped.
 	 */
=20
-	munlock_vma_folio(folio, vma, compound);
+	munlock_vma_folio(folio, vma);
 }
=20
 /*
@@ -1524,7 +1567,8 @@ static bool try_to_unmap_one(struct folio *folio, str=
uct vm_area_struct *vma,
 		if (!(flags & TTU_IGNORE_MLOCK) &&
 		    (vma->vm_flags & VM_LOCKED)) {
 			/* Restore the mlock which got missed */
-			mlock_vma_folio(folio, vma, false);
+			if (!folio_test_large(folio))
+				mlock_vma_folio(folio, vma);
 			page_vma_mapped_walk_done(&pvmw);
 			ret =3D false;
 			break;
--=20
2.39.2
From nobody Sun Feb  8 15:08:19 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id C69A1C46CA1
	for <linux-kernel@archiver.kernel.org>; Mon, 18 Sep 2023 07:36:07 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S240005AbjIRHfm (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Mon, 18 Sep 2023 03:35:42 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39338 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S238432AbjIRHfJ (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Mon, 18 Sep 2023 03:35:09 -0400
Received: from mgamail.intel.com (mgamail.intel.com [134.134.136.126])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0C41710C7
        for <linux-kernel@vger.kernel.org>;
 Mon, 18 Sep 2023 00:34:06 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1695022446; x=1726558446;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=Q8dtjLXEWzMrNVlVSld+SWqxhUjc/Zr37nPzybsHJrg=;
  b=QZSa3r5yWdMaUXZ1cRPNsLloPeAXwYAqgzN2XoCIURJICsqB8Jous8Ae
   h6zo7l6ikh9jka9cAkdTRL8f4XbjZtfIqNsbIUO11gKy4HBhlISRQgKHK
   JOQtUHPECF+OKMpf5HVHyIgWy1Baaqg9yBqJei7BVHxCq+aW/WtLPImgn
   4drf8BDGK2q9Ku+eNXq1/f8t7BxAxUpwsXD6TlSRmjbZH8Sifg90HwvJW
   QBT8gtCdFTb2Sy3SGtXj1lblgDBuvADYwLuvWT4bsC8Pw//zwT+oFeCYa
   1pALFCeR2L8dDUjpC7tlnd4BT91YXc4Q7oycUVbJc5nSkJfNPnnpVElcX
   g==;
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="364630974"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="364630974"
Received: from fmsmga004.fm.intel.com ([10.253.24.48])
  by orsmga106.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 18 Sep 2023 00:34:05 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10836"; a="815915477"
X-IronPort-AV: E=Sophos;i="6.02,156,1688454000";
   d="scan'208";a="815915477"
Received: from fyin-dev.sh.intel.com ([10.239.159.24])
  by fmsmga004.fm.intel.com with ESMTP; 18 Sep 2023 00:34:02 -0700
From: Yin Fengwei <fengwei.yin@intel.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
        akpm@linux-foundation.org, yuzhao@google.com, willy@infradead.org,
        hughd@google.com, yosryahmed@google.com, ryan.roberts@arm.com,
        david@redhat.com, shy828301@gmail.com
Cc: fengwei.yin@intel.com
Subject: [PATCH v3 3/3] mm: mlock: update mlock_pte_range to handle large
 folio
Date: Mon, 18 Sep 2023 15:33:18 +0800
Message-Id: <20230918073318.1181104-4-fengwei.yin@intel.com>
X-Mailer: git-send-email 2.39.2
In-Reply-To: <20230918073318.1181104-1-fengwei.yin@intel.com>
References: <20230918073318.1181104-1-fengwei.yin@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The current kernel only locks base size folios during the mlock syscall.
Add large folio support with the following rules:
  - Only mlock a large folio when it's in the VM_LOCKED VMA range
    and fully mapped to the page table.

    A fully mapped folio is required because if the folio is not
    fully mapped to a VM_LOCKED VMA and the system is under memory
    pressure, page reclaim is allowed to pick up this folio, split
    it and reclaim the pages which are not in the VM_LOCKED VMA.

  - munlock will apply to a large folio which is in the VMA range
    or crosses the VMA boundary.

    This is required to handle the case where the large folio is
    mlocked and the VMA is later split in the middle of the large
    folio.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/mlock.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/mm/mlock.c b/mm/mlock.c
index 06bdfab83b58..42b6865f8f82 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -305,6 +305,58 @@ void munlock_folio(struct folio *folio)
 	local_unlock(&mlock_fbatch.lock);
 }
=20
+static inline unsigned int folio_mlock_step(struct folio *folio,
+		pte_t *pte, unsigned long addr, unsigned long end)
+{
+	unsigned int count, i, nr =3D folio_nr_pages(folio);
+	unsigned long pfn =3D folio_pfn(folio);
+	pte_t ptent =3D ptep_get(pte);
+
+	if (!folio_test_large(folio))
+		return 1;	/* not a large folio: step is one pte */
+
+	count =3D pfn + nr - pte_pfn(ptent);	/* pages left in the folio */
+	count =3D min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);
+
+	for (i =3D 0; i < count; i++, pte++) {
+		pte_t entry =3D ptep_get(pte);
+
+		if (!pte_present(entry))
+			break;
+		if (pte_pfn(entry) - pfn >=3D nr)
+			break;	/* pfn is outside this folio */
+	}
+
+	return i;	/* leading ptes that are present and map the folio */
+}
+
+static inline bool allow_mlock_munlock(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, unsigned int step)
+{
+	/*
+	 * For munlock, allow munlocking a large folio which is only
+	 * partially mapped to the VMA, as it's possible that the large
+	 * folio was mlocked and the VMA was split later.
+	 *
+	 * During memory pressure, such a large folio can be split,
+	 * and the pages which are not in a VM_LOCKED VMA can be
+	 * reclaimed.
+	 */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return true;
+
+	/* folio not in range [start, end), skip mlock */
+	if (!folio_within_range(folio, vma, start, end))
+		return false;
+
+	/* step is not the folio page count: not fully mapped, skip mlock */
+	if (step !=3D folio_nr_pages(folio))
+		return false;
+
+	return true;	/* in range and fully mapped */
+}
+
 static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 			   unsigned long end, struct mm_walk *walk)
=20
@@ -314,6 +366,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long ad=
dr,
 	pte_t *start_pte, *pte;
 	pte_t ptent;
 	struct folio *folio;
+	unsigned int step =3D 1;
+	unsigned long start =3D addr;
=20
 	ptl =3D pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
@@ -334,6 +388,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long ad=
dr,
 		walk->action =3D ACTION_AGAIN;
 		return 0;
 	}
+
 	for (pte =3D start_pte; addr !=3D end; pte++, addr +=3D PAGE_SIZE) {
 		ptent =3D ptep_get(pte);
 		if (!pte_present(ptent))
@@ -341,12 +396,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long =
addr,
 		folio =3D vm_normal_folio(vma, addr, ptent);
 		if (!folio || folio_is_zone_device(folio))
 			continue;
-		if (folio_test_large(folio))
-			continue;
+
+		step =3D folio_mlock_step(folio, pte, addr, end);
+		if (!allow_mlock_munlock(folio, vma, start, end, step))
+			goto next_entry;
+
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_folio(folio);
 		else
 			munlock_folio(folio);
+
+next_entry:
+		pte +=3D step - 1;
+		addr +=3D (step - 1) << PAGE_SHIFT;
 	}
 	pte_unmap(start_pte);
 out:
--=20
2.39.2