From nobody Sat May 9 10:44:47 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 747EFC433F5 for ; Mon, 16 May 2022 12:54:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234380AbiEPMyo (ORCPT ); Mon, 16 May 2022 08:54:44 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:43686 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S243588AbiEPMyV (ORCPT ); Mon, 16 May 2022 08:54:21 -0400 Received: from mail-wm1-x32a.google.com (mail-wm1-x32a.google.com [IPv6:2a00:1450:4864:20::32a]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 814873A1B2 for ; Mon, 16 May 2022 05:53:47 -0700 (PDT) Received: by mail-wm1-x32a.google.com with SMTP id k126so8643986wme.2 for ; Mon, 16 May 2022 05:53:47 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=1zMWiBKrxJRt4dm7AECg/jq1FaXgoQHTOEXoqP7ndjs=; b=mZvVdyV4R2NcvGq9PkFdJMQvHTRv8eCR1cvl/UDrc9cP/TP6x0tncRCcBuir+2FIdK ZxTdX/iI8L9vq2skwvMtr/gCeUCuGpaAuhNu6q5UvR9CSxtwi99hwLYDmkxcHe00t6ew 6Zlr/N+Ko8pCvL44oTBUHUbtvTel3xc3XVycs6ybFhIZTQtMLHry6Ehj8oSBsjUKEMg5 20Taq3qqMX3qqjoT18hnG05xQ0gVV/pttQErniIEaCnO/+DXYisblcvK+B/0Gi+6BRCa KZSCqQcW8MKFlWmybnuiVfeSrIkjVcaMrgVTl1uSSL9UGZUDcC827xITl6iJcZaVSe+9 oU2w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=1zMWiBKrxJRt4dm7AECg/jq1FaXgoQHTOEXoqP7ndjs=; b=wj9/u32Es4pSj7Ipr093+bK45P730ZgasRqfhwmMpjkvhHBZZ5TLjsBbiEf4ddKuUK LrIuli9QObtM0wTGsY46g7WIsV1pJeZe4qw9S9oOejy40mK/VJcKTjUaMK6D4+q/GU4i 
eU3B0xPkDeqlkCrV7dm9HY6sfCywBeQmW+vYbfTCIZG7edX9iMgqymLlwsgLRY8P9uJU SpchOFhAh+VbhZFy8vIBJ7XTumcpEZ6CJ6O96VX+3uQC4Pxrx6qxpVceJ1cqTA8bggYS 9Lj5ma2HEncpvZNqsHY5sAiKvMVKeIVZs0jUT6M4hvs/+v1A+QBhhh6GJYbHsIuoVrYz w4Tg== X-Gm-Message-State: AOAM532qQQJcdbULEgE6rSglXj18uGgUPVEpYL7z1Cy+jxj2jiip1kH1 07us6MO50/4frj4R4lcV4O8= X-Google-Smtp-Source: ABdhPJw8lMmpeW5ADElTuWJi8C91ciKJd0nw+e53cYaNTI5gWF8KbPB3ZQGhRbb9YDktYFilc2BG8A== X-Received: by 2002:a05:600c:3d8c:b0:394:6097:9994 with SMTP id bi12-20020a05600c3d8c00b0039460979994mr26918715wmb.29.1652705625931; Mon, 16 May 2022 05:53:45 -0700 (PDT) Received: from orion.localdomain ([93.99.228.15]) by smtp.gmail.com with ESMTPSA id u23-20020a05600c00d700b003942a244ec2sm9958565wmm.7.2022.05.16.05.53.45 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 16 May 2022 05:53:45 -0700 (PDT) Received: by orion.localdomain (Postfix, from userid 1003) id 8BBBCA0E76; Mon, 16 May 2022 14:54:07 +0200 (CEST) From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= To: linux-mm@kvack.org Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Subject: [RFC PATCH v3 1/6] [PATCH 1/6] mm: refactor of vma_merge() Date: Mon, 16 May 2022 14:54:00 +0200 Message-Id: <20220516125405.1675-2-matenajakub@gmail.com> X-Mailer: git-send-email 2.35.1 In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com> References: <20220516125405.1675-1-matenajakub@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Refactor vma_merge() to make it shorter, more understandable and suitable for tracing of successful merges that are made possible by following patches in the series. 
The main change is the elimination of code duplication in the merge-next check. This is done by first doing the checks and caching the results before executing the merge itself. The exit paths are also unified.

Signed-off-by: Jakub Matěna
Acked-by: Kirill A. Shutemov
---
 mm/mmap.c | 81 +++++++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 45 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 3aa839f81e63..4a4611443593 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1171,7 +1171,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
-	int err;
+	int err = -1;
+	bool merge_prev = false;
+	bool merge_next = false;
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
@@ -1190,66 +1192,55 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	VM_WARN_ON(area && end > area->vm_end);
 	VM_WARN_ON(addr >= end);
 
-	/*
-	 * Can it merge with the predecessor?
-	 */
+	/* Can we merge the predecessor? */
 	if (prev && prev->vm_end == addr &&
 			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags,
 					    anon_vma, file, pgoff,
 					    vm_userfaultfd_ctx, anon_name)) {
-		/*
-		 * OK, it can.  Can we now merge in the successor as well?
-		 */
-		if (next && end == next->vm_start &&
-				mpol_equal(policy, vma_policy(next)) &&
-				can_vma_merge_before(next, vm_flags,
-						     anon_vma, file,
-						     pgoff+pglen,
-						     vm_userfaultfd_ctx, anon_name) &&
-				is_mergeable_anon_vma(prev->anon_vma,
-						      next->anon_vma, NULL)) {
-							/* cases 1, 6 */
-			err = __vma_adjust(prev, prev->vm_start,
-					 next->vm_end, prev->vm_pgoff, NULL,
-					 prev);
-		} else					/* cases 2, 5, 7 */
-			err = __vma_adjust(prev, prev->vm_start,
-					 end, prev->vm_pgoff, NULL, prev);
-		if (err)
-			return NULL;
-		khugepaged_enter_vma_merge(prev, vm_flags);
-		return prev;
+		merge_prev = true;
+		area = prev;
 	}
-
-	/*
-	 * Can this new request be merged in front of next?
-	 */
+	/* Can we merge the successor? */
 	if (next && end == next->vm_start &&
 			mpol_equal(policy, vma_policy(next)) &&
 			can_vma_merge_before(next, vm_flags,
 					     anon_vma, file,
 					     pgoff+pglen,
 					     vm_userfaultfd_ctx, anon_name)) {
+		merge_next = true;
+	}
+	/* Can we merge both the predecessor and the successor? */
+	if (merge_prev && merge_next &&
+			is_mergeable_anon_vma(prev->anon_vma,
+					      next->anon_vma, NULL)) {	/* cases 1, 6 */
+		err = __vma_adjust(prev, prev->vm_start,
+				   next->vm_end, prev->vm_pgoff, NULL,
+				   prev);
+	} else if (merge_prev) {			/* cases 2, 5, 7 */
+		err = __vma_adjust(prev, prev->vm_start,
+				   end, prev->vm_pgoff, NULL, prev);
+	} else if (merge_next) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = __vma_adjust(prev, prev->vm_start,
-					 addr, prev->vm_pgoff, NULL, next);
-		else {					/* cases 3, 8 */
+					   addr, prev->vm_pgoff, NULL, next);
+		else					/* cases 3, 8 */
 			err = __vma_adjust(area, addr, next->vm_end,
-					 next->vm_pgoff - pglen, NULL, next);
-			/*
-			 * In case 3 area is already equal to next and
-			 * this is a noop, but in case 8 "area" has
-			 * been removed and next was expanded over it.
-			 */
-			area = next;
-		}
-		if (err)
-			return NULL;
-		khugepaged_enter_vma_merge(area, vm_flags);
-		return area;
+				   next->vm_pgoff - pglen, NULL, next);
+		/*
+		 * In cases 3 and 4 area is already equal to next and
+		 * this is a noop, but in case 8 "area" has
+		 * been removed and next was expanded over it.
+		 */
+		area = next;
 	}
 
-	return NULL;
+	/*
+	 * Cannot merge with predecessor or successor or error in __vma_adjust?
+	 */
+	if (err)
+		return NULL;
+	khugepaged_enter_vma_merge(area, vm_flags);
+	return area;
 }
 
 /*
-- 
2.35.1

From nobody Sat May 9 10:44:47 2026
From: Jakub Matěna
To: linux-mm@kvack.org
Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, Jakub Matěna
Subject: [RFC PATCH v3 2/6] mm: add merging after mremap resize
Date: Mon, 16 May 2022 14:54:01 +0200
Message-Id: <20220516125405.1675-3-matenajakub@gmail.com>
In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com>
References: <20220516125405.1675-1-matenajakub@gmail.com>

When an mremap
call results in expansion, it might be possible to merge the VMA with the
next VMA, which might become adjacent. This patch adds a vma_merge() call
after the expansion is done to try to merge.

Signed-off-by: Jakub Matěna
---
 mm/mremap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 303d3290b938..75cda854ec58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,6 +9,7 @@
  */
 
 #include
+#include
 #include
 #include
 #include
@@ -1022,8 +1023,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 	}
 
-	if (vma_adjust(vma, vma->vm_start, addr + new_len,
-		       vma->vm_pgoff, NULL)) {
+	if (!vma_merge(mm, vma, addr + old_len, addr + new_len,
+		       vma->vm_flags, vma->anon_vma, vma->vm_file,
+		       vma->vm_pgoff + (old_len >> PAGE_SHIFT), vma_policy(vma),
+		       vma->vm_userfaultfd_ctx, anon_vma_name(vma))) {
 		vm_unacct_memory(pages);
 		ret = -ENOMEM;
 		goto out;
-- 
2.35.1

From nobody Sat May 9 10:44:47 2026
From: Jakub Matěna
To: linux-mm@kvack.org
Cc: patches@lists.linux.dev,
linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, Jakub Matěna
Subject: [RFC PATCH v3 3/6] mm: add migration waiting and rmap locking to pagewalk
Date: Mon, 16 May 2022 14:54:02 +0200
Message-Id: <20220516125405.1675-4-matenajakub@gmail.com>
In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com>
References: <20220516125405.1675-1-matenajakub@gmail.com>

The following patches need to wait for migration and take rmap locks before they work with the pte itself. This feature is a compact change and is therefore extracted into this patch.

In order to wait for migration when a page is being migrated, a new flag is added to pagewalk that optionally enables waiting for migration at the walk_pte_range_inner() level. A similar flag was added to take rmap locks at the same level. When waiting for migration, the pte lock and the rmap locks must be dropped and taken again after the migration has ended. The same mechanism is used when pte_entry() sets ACTION_AGAIN, which happens in the following patch when a deadlock is encountered because of a different lock order used during the page update.

Migration waiting is done only at the PTE level and presumes that no pmd entry is specified. If pmd_entry() is set together with the page migration flag, a warning is logged. PMD migration waiting can be implemented later if anyone needs it.

At this time, flags can be specified only by calling walk_page_vma(). If needed, flags can also be added to other pagewalk API calls.
Signed-off-by: Jakub Matěna
---
 fs/proc/task_mmu.c       |  4 +--
 include/linux/pagewalk.h | 11 ++++++-
 include/linux/rmap.h     |  2 ++
 mm/mremap.c              | 17 +---------
 mm/pagewalk.c            | 71 +++++++++++++++++++++++++++++++++++++---
 mm/rmap.c                | 16 +++++++++
 6 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..fd72263456e9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -794,7 +794,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 #endif
 	/* mmap_lock is held in m_start */
 	if (!start)
-		walk_page_vma(vma, ops, mss);
+		walk_page_vma(vma, ops, mss, 0);
 	else
 		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
 }
@@ -1938,7 +1938,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_puts(m, " huge");
 
 	/* mmap_lock is held by m_start */
-	walk_page_vma(vma, &show_numa_ops, md);
+	walk_page_vma(vma, &show_numa_ops, md, 0);
 
 	if (!md->pages)
 		goto out;
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index ac7b38ad5903..07345df51324 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -70,6 +70,13 @@ enum page_walk_action {
 	ACTION_AGAIN = 2
 };
 
+/* Walk flags */
+
+/* Wait for migration before pte entry, not implemented for pmd entries */
+#define WALK_MIGRATION	0x1
+/* Take rmap locks before pte entries */
+#define WALK_LOCK_RMAP	0x2
+
 /**
  * struct mm_walk - walk_page_range data
  * @ops:	operation to call during the walk
@@ -77,6 +84,7 @@ enum page_walk_action {
  * @pgd:	pointer to PGD; only valid with no_vma (otherwise set to NULL)
  * @vma:	vma currently walked (NULL if walking outside vmas)
  * @action:	next action to perform (see enum page_walk_action)
+ * @flags:	flags performing additional operations (see walk flags)
  * @no_vma:	walk ignoring vmas (vma will always be NULL)
  * @private:	private data for callbacks' usage
  *
@@ -88,6 +96,7 @@ struct mm_walk {
 	pgd_t *pgd;
 	struct vm_area_struct *vma;
 	enum page_walk_action action;
+	unsigned long flags;
 	bool no_vma;
 	void *private;
 };
@@ -100,7 +109,7 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 			  pgd_t *pgd,
 			  void *private);
 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
-		void *private);
+		void *private, unsigned long flags);
 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		      pgoff_t nr, const struct mm_walk_ops *ops,
 		      void *private);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 17230c458341..d2d5e511dd93 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -138,6 +138,8 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
  */
 void anon_vma_init(void);	/* create anon_vma_cachep */
 int __anon_vma_prepare(struct vm_area_struct *);
+void take_rmap_locks(struct vm_area_struct *vma);
+void drop_rmap_locks(struct vm_area_struct *vma);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
diff --git a/mm/mremap.c b/mm/mremap.c
index 75cda854ec58..309fab7ed706 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -101,22 +102,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 	return pmd;
 }
 
-static void take_rmap_locks(struct vm_area_struct *vma)
-{
-	if (vma->vm_file)
-		i_mmap_lock_write(vma->vm_file->f_mapping);
-	if (vma->anon_vma)
-		anon_vma_lock_write(vma->anon_vma);
-}
-
-static void drop_rmap_locks(struct vm_area_struct *vma)
-{
-	if (vma->anon_vma)
-		anon_vma_unlock_write(vma->anon_vma);
-	if (vma->vm_file)
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
-}
-
 static pte_t move_soft_dirty_pte(pte_t pte)
 {
 	/*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 9b3db11a4d1d..0bfb8c9255f3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,6 +3,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 /*
  * We want to know the real level where a entry is located ignoring any
@@ -20,14 +23,62 @@ static int real_depth(int depth)
 	return depth;
 }
 
+/*
+ * Relock pte lock and optionally rmap locks to prevent possible deadlock
+ * @pte: Locked pte
+ * @addr: Address of the pte
+ * @walk: Pagewalk structure
+ * @ptl: Pte spinlock
+ * @pmd: Pmd to wait for migration
+ */
+static void walk_pte_relock(pte_t **pte, unsigned long addr, struct mm_walk *walk,
+			    spinlock_t *ptl, pmd_t *pmd)
+{
+	if (walk->no_vma)
+		pte_unmap(*pte);
+	else
+		pte_unmap_unlock(*pte, ptl);
+
+	if (walk->flags & WALK_LOCK_RMAP)
+		drop_rmap_locks(walk->vma);
+
+	if (walk->flags & WALK_MIGRATION)
+		migration_entry_wait(walk->mm, pmd, addr);
+
+	if (walk->flags & WALK_LOCK_RMAP)
+		take_rmap_locks(walk->vma);
+
+	if (walk->no_vma)
+		*pte = pte_offset_map(pmd, addr);
+	else
+		*pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+}
+
 static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
-				unsigned long end, struct mm_walk *walk)
+				unsigned long end, struct mm_walk *walk,
+				spinlock_t *ptl, pmd_t *pmd)
 {
 	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	for (;;) {
+		walk->action = ACTION_SUBTREE;
+		if ((walk->flags & WALK_MIGRATION) && !pte_present(*pte)) {
+			swp_entry_t entry;
+
+			if (!pte_none(*pte)) {
+				entry = pte_to_swp_entry(*pte);
+				if (is_migration_entry(entry)) {
+					walk_pte_relock(&pte, addr, walk, ptl, pmd);
+					continue; /* retry iteration */
+				}
+			}
+		}
 		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+		if (walk->action == ACTION_AGAIN) {
+			walk_pte_relock(&pte, addr, walk, ptl, pmd);
+			continue; /* retry iteration */
+		}
 		if (err)
 			break;
 		if (addr >= end - PAGE_SIZE)
@@ -45,16 +96,22 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	int err = 0;
 	spinlock_t *ptl;
 
+	if (walk->flags & WALK_LOCK_RMAP)
+		take_rmap_locks(walk->vma);
+
 	if (walk->no_vma) {
 		pte = pte_offset_map(pmd, addr);
-		err = walk_pte_range_inner(pte, addr, end, walk);
+		err = walk_pte_range_inner(pte, addr, end, walk, ptl, pmd);
 		pte_unmap(pte);
 	} else {
 		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
-		err = walk_pte_range_inner(pte, addr, end, walk);
+		err = walk_pte_range_inner(pte, addr, end, walk, ptl, pmd);
 		pte_unmap_unlock(pte, ptl);
 	}
 
+	if (walk->flags & WALK_LOCK_RMAP)
+		drop_rmap_locks(walk->vma);
+
 	return err;
 }
 
@@ -124,8 +181,11 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
-		if (ops->pmd_entry)
+		if (ops->pmd_entry) {
+			/* Migration waiting is not implemented for pmd entries */
+			WARN_ON_ONCE(walk->flags & WALK_MIGRATION);
 			err = ops->pmd_entry(pmd, addr, next, walk);
+		}
 		if (err)
 			break;
 
@@ -507,13 +567,14 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 }
 
 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
-		void *private)
+		void *private, unsigned long flags)
 {
 	struct mm_walk walk = {
 		.ops		= ops,
 		.mm		= vma->vm_mm,
 		.vma		= vma,
 		.private	= private,
+		.flags		= flags
 	};
 	int err;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..d4d95ada0946 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2200,6 +2200,22 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
 EXPORT_SYMBOL_GPL(make_device_exclusive_range);
 #endif
 
+void take_rmap_locks(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (vma->anon_vma)
+		anon_vma_lock_write(vma->anon_vma);
+}
+
+void drop_rmap_locks(struct vm_area_struct *vma)
+{
+	if (vma->anon_vma)
+		anon_vma_unlock_write(vma->anon_vma);
+	if (vma->vm_file)
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
 void __put_anon_vma(struct anon_vma *anon_vma)
 {
 	struct anon_vma *root = anon_vma->root;
-- 
2.35.1

From nobody Sat May 9 10:44:47 2026
From: Jakub Matěna
To: linux-mm@kvack.org
Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, Jakub Matěna
Subject: [RFC PATCH v3 4/6] mm: adjust page offset in mremap
Date: Mon, 16 May 2022 14:54:03 +0200
Message-Id: <20220516125405.1675-5-matenajakub@gmail.com>
In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com>
References: <20220516125405.1675-1-matenajakub@gmail.com>

Adjust the page offset of a VMA when it is moved to a new location by mremap.
This is possible for all VMAs that do not share their anonymous pages with other processes, which is checked by walking the anon_vma tree and looking for parent-child relationships. Possibly redundantly, the same is also checked for the individual struct pages belonging to the given VMA, by looking at their mapcount, or at the swap entry reference count if a page is swapped out. All of this is done in can_update_faulted_pgoff() and is_shared_pte().

If none of the pages is shared, we proceed with the page offset update. This means updating the page offset in copy_vma(), which is used when creating the VMA copy or possibly when deciding whether to merge with a neighboring VMA. We also set pgoff_updatable to true so that the page offsets of the individual pages are updated later, in move_page_tables(), when the individual pte entries are moved to the target VMA. The page offset update forces the move to happen at the pte level, using move_ptes(), because the page update must happen atomically with the move, and that is not possible when moving bigger entries like pmd or pud. Swapped-out pages do not need to be updated, because their page offset is reconstructed automatically from the VMA after the page is swapped in.

As mentioned above, there is a small window between checking and actually updating the page offset of the pages, and likewise between merging the VMAs and updating the pages. This could potentially interfere with an rmap walk, but fortunately in that case the rmap walk can use the still existing old VMA, as it would before the mremap started. Any other change to the VMA or its pages is prevented by mmap_lock, which prevents forking and therefore also COW and hence a raised mapcount. Because the pages are not shared but belong to a single process, no other process can fork and thereby increase the mapcount of the pages in question.
If a page is shared, we cannot update its page offset, because that would interfere with the page offset for the other processes using the page. The page offset is effectively immutable as long as the page is used by more than one process.

Previously, adjusting the page offset was possible only for not yet faulted VMAs, even though a page offset matching the virtual address of the anonymous VMA is necessary to successfully merge with another VMA.

Signed-off-by: Jakub Matěna
Reported-by: kernel test robot
---
 fs/exec.c                |   2 +-
 include/linux/mm.h       |   4 +-
 include/linux/pagewalk.h |   2 +
 include/linux/rmap.h     |   2 +
 mm/mmap.c                | 113 +++++++++++++++++++++++++++++++------
 mm/mremap.c              | 117 +++++++++++++++++++++++++++++++++------
 mm/pagewalk.c            |   2 +-
 mm/rmap.c                |  41 ++++++++++++++
 8 files changed, 244 insertions(+), 39 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e3e55d5e0be1..207f60fcb2b4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -709,7 +709,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length, false))
+				       vma, new_start, length, false, false))
 		return -ENOMEM;
 
 	lru_add_drain();
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e34edb775334..d8e482aef901 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1951,7 +1951,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks);
+		bool need_rmap_locks, bool update_pgoff);
 
 /*
  * Flags used by change_protection().
For now we make it a bitmap so @@ -2637,7 +2637,7 @@ extern void __vma_link_rb(struct mm_struct *, struct = vm_area_struct *, extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); + bool *need_rmap_locks, bool *update_pgoff); extern void exit_mmap(struct mm_struct *); =20 static inline int check_data_rlimit(unsigned long rlim, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 07345df51324..11c99c8d343b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -101,6 +101,8 @@ struct mm_walk { void *private; }; =20 +int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk); int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d2d5e511dd93..9fee804f47ea 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -144,6 +144,8 @@ void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); =20 +bool rbt_no_children(struct anon_vma *av); + static inline int anon_vma_prepare(struct vm_area_struct *vma) { if (likely(vma->anon_vma)) diff --git a/mm/mmap.c b/mm/mmap.c index 4a4611443593..3ca78baaee13 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include =20 #include #include @@ -3189,28 +3191,100 @@ int insert_vm_struct(struct mm_struct *mm, struct = vm_area_struct *vma) return 0; } =20 +/* + * is_shared_pte() - Check if the given pte points to a page that is not s= hared between processes. 
+ * @pte: pte to check + * @addr: Address where the page is mapped + * @end: Not used + * @walk: Pagewalk structure holding pointer to VMA where the page belongs + */ +static int is_shared_pte(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + int err; + struct page *page; + struct vm_area_struct *old =3D walk->vma; + + if (is_swap_pte(*pte)) { + swp_entry_t entry =3D pte_to_swp_entry(*pte); + struct swap_info_struct *info =3D swp_swap_info(entry); + /* + * If the reference count is higher than one than the swap slot is used = by + * more than one process or the swap cache is active, which means that t= he + * page is mapped by at least one process and swapped out by at least one + * process, so in both cases this means the page is shared. + * There can also exist continuation pages if the reference count is too + * high to fit in just one cell. This is specified by the flag COUNT_CON= TINUED, + * which again triggers the below condition if set. + */ + return info->swap_map[swp_offset(entry)] > 1; + } + + if (!pte_present(*pte)) + return 0; + page =3D vm_normal_page(old, addr, *pte); + if (page =3D=3D NULL) + return 0; + /* Check page is not shared with other processes */ + err =3D page_mapcount(page) + page_swapcount(page) > 1; + return err; +} + +/** + * can_update_faulted_pgoff() - Check if pgoff update is possible for faul= ted pages of a vma + * @vma: VMA which should be moved + * @addr: new virtual address + * If the vma and its pages are not shared with another process, updating + * the new pgoff and also updating index parameter (copy of the pgoff) in + * all faulted pages is possible. 
+ */ +static bool can_update_faulted_pgoff(struct vm_area_struct *vma, unsigned = long addr) +{ + const struct mm_walk_ops can_update_pgoff_ops =3D { + .pte_entry =3D is_shared_pte + }; + + /* Check vma is not shared with other processes */ + if (vma->anon_vma->root !=3D vma->anon_vma || !rbt_no_children(vma->anon_= vma)) + return 1; + /* walk_page_vma() returns 0 on success */ + return !walk_page_vma(vma, &can_update_pgoff_ops, NULL, WALK_MIGRATION | = WALK_LOCK_RMAP); +} + /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks) + bool *need_rmap_locks, bool *update_pgoff) { struct vm_area_struct *vma =3D *vmap; unsigned long vma_start =3D vma->vm_start; struct mm_struct *mm =3D vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - bool faulted_in_anon_vma =3D true; + bool anon_pgoff_updated =3D false; + *need_rmap_locks =3D false; + *update_pgoff =3D false; =20 /* - * If anonymous vma has not yet been faulted, update new pgoff + * Try to update new pgoff for anonymous vma * to match new location, to increase its chance of merging. 
*/ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { - pgoff =3D addr >> PAGE_SHIFT; - faulted_in_anon_vma =3D false; + if (unlikely(vma_is_anonymous(vma))) { + if (!vma->anon_vma) { + pgoff =3D addr >> PAGE_SHIFT; + anon_pgoff_updated =3D true; + } else { + anon_pgoff_updated =3D can_update_faulted_pgoff(vma, addr); + if (anon_pgoff_updated) { + /* Update pgoff of the copied VMA */ + pgoff =3D addr >> PAGE_SHIFT; + *update_pgoff =3D true; + *need_rmap_locks =3D true; + } + } } =20 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) @@ -3227,19 +3301,25 @@ struct vm_area_struct *copy_vma(struct vm_area_stru= ct **vmap, /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't - * been faulted in yet and we were allowed to - * reset the dst vma->vm_pgoff to the - * destination address of the mremap to allow - * the merge to happen. mremap must change the - * vm_pgoff linearity between src and dst vmas - * (in turn preventing a vma_merge) to be - * safe. It is only safe to keep the vm_pgoff - * linear if there are no pages mapped yet. + * been faulted in yet or is not shared and + * we were allowed to reset the dst + * vma->vm_pgoff to the destination address of + * the mremap to allow the merge to happen. + * mremap must change the vm_pgoff linearity + * between src and dst vmas (in turn + * preventing a vma_merge) to be safe. It is + * only safe to keep the vm_pgoff linear if + * there are no pages mapped yet or the none + * of the pages are shared with another process. */ - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); + VM_BUG_ON_VMA(!anon_pgoff_updated, new_vma); *vmap =3D vma =3D new_vma; } - *need_rmap_locks =3D (new_vma->vm_pgoff <=3D vma->vm_pgoff); + /* + * If new_vma is located before the old vma, rmap traversal order is alt= ered + * and we need to apply rmap locks on vma later. 
+ */ + *need_rmap_locks |=3D (new_vma->vm_pgoff <=3D vma->vm_pgoff); } else { new_vma =3D vm_area_dup(vma); if (!new_vma) @@ -3256,7 +3336,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct= **vmap, if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks =3D false; } return new_vma; =20 diff --git a/mm/mremap.c b/mm/mremap.c index 309fab7ed706..2ef444abb08a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include =20 #include @@ -117,10 +118,66 @@ static pte_t move_soft_dirty_pte(pte_t pte) return pte; } =20 +/* + * update_pgoff_page() - Update page offset stored in page->index, if the = page is not NULL. + * @addr: new address to calculate the page offset. + * @page: page to update + */ +static int update_pgoff_page(unsigned long addr, struct page *page) +{ + if (page !=3D NULL) { + get_page(page); + if (!trylock_page(page)) { + put_page(page); + return -1; + } + page->index =3D addr >> PAGE_SHIFT; + unlock_page(page); + put_page(page); + } + return 0; +} + +/* + * update_pgoff_pte_inner() - Wait for migration and update page offset of + * a page represented by pte, if the pte points to mapped page. + */ +static int update_pgoff_pte_inner(pte_t *old_pte, unsigned long old_addr, + struct vm_area_struct *vma, spinlock_t *old_ptl, + pmd_t *old_pmd, unsigned long new_addr) +{ + struct page *page; + /* + * If pte is in migration state then wait for migration + * and return with -1 to trigger relocking mechanism in move_ptes(). 
+ */ + if (!pte_present(*old_pte)) { + if (!pte_none(*old_pte)) { + swp_entry_t entry; + entry =3D pte_to_swp_entry(*old_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(vma->vm_mm, old_pmd, old_addr); + return -1; + } + } + /* + * If there is no migration entry, but at the same + * time the page is not present then the page offset + * will be reconstructed automatically from the + * VMA after the page is moved back into RAM. + */ + return 0; + } + + page =3D vm_normal_page(vma, old_addr, *old_pte); + return update_pgoff_page(new_addr, page); +} + static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) + unsigned long new_addr, bool need_rmap_locks, + bool update_pgoff) { struct mm_struct *mm =3D vma->vm_mm; pte_t *old_pte, *new_pte, pte; @@ -146,6 +203,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t= *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ + +retry: if (need_rmap_locks) take_rmap_locks(vma); =20 @@ -166,6 +225,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_= t *old_pmd, if (pte_none(*old_pte)) continue; =20 + if (update_pgoff) + if (update_pgoff_pte_inner(old_pte, old_addr, vma, old_ptl, + old_pmd, new_addr)) + break; /* Causes unlock after for cycle and goto retry */ pte =3D ptep_get_and_clear(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure @@ -194,6 +257,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t= *old_pmd, pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); + if (old_addr < old_end) + goto retry; } =20 #ifndef arch_supports_page_table_move @@ -422,11 +487,19 @@ static __always_inline unsigned long get_extent(enum = pgt_entry entry, * pgt_entry. Returns true if the move was successful, else false. 
*/ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vm= a, - unsigned long old_addr, unsigned long new_addr, - void *old_entry, void *new_entry, bool need_rmap_locks) + struct vm_area_struct *new_vma, unsigned long old_addr, + unsigned long new_addr, void *old_entry, void *new_entry, + bool need_rmap_locks, bool update_pgoff) { bool moved =3D false; =20 + /* + * In case of page offset update move must be done + * at the pte level using move_ptes() + */ + if (update_pgoff) + return false; + /* See comment in move_ptes() */ if (need_rmap_locks) take_rmap_locks(vma); @@ -465,7 +538,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct= vm_area_struct *vma, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks) + bool need_rmap_locks, bool update_pgoff) { unsigned long extent, old_end; struct mmu_notifier_range range; @@ -492,7 +565,14 @@ unsigned long move_page_tables(struct vm_area_struct *= vma, * If extent is PUD-sized try to speed up the move by moving at the * PUD level if possible. */ - extent =3D get_extent(NORMAL_PUD, old_addr, old_end, new_addr); + if (update_pgoff) + /* + * In case of pgoff update, extent is set to PMD + * and is done using move_ptes() + */ + extent =3D get_extent(NORMAL_PMD, old_addr, old_end, new_addr); + else + extent =3D get_extent(NORMAL_PUD, old_addr, old_end, new_addr); =20 old_pud =3D get_old_pud(vma->vm_mm, old_addr); if (!old_pud) @@ -502,15 +582,15 @@ unsigned long move_page_tables(struct vm_area_struct = *vma, break; if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { if (extent =3D=3D HPAGE_PUD_SIZE) { - move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr, - old_pud, new_pud, need_rmap_locks); + move_pgt_entry(HPAGE_PUD, vma, new_vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks, update_pgoff); /* We ignore and continue on error? 
*/ continue; } } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent =3D=3D PUD_SIZE) { =20 - if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, - old_pud, new_pud, true)) + if (move_pgt_entry(NORMAL_PUD, vma, new_vma, old_addr, new_addr, + old_pud, new_pud, true, update_pgoff)) continue; } =20 @@ -524,8 +604,8 @@ unsigned long move_page_tables(struct vm_area_struct *v= ma, if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent =3D=3D HPAGE_PMD_SIZE && - move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr, - old_pmd, new_pmd, need_rmap_locks)) + move_pgt_entry(HPAGE_PMD, vma, new_vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks, update_pgoff)) continue; split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) @@ -536,15 +616,15 @@ unsigned long move_page_tables(struct vm_area_struct = *vma, * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. */ - if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, - old_pmd, new_pmd, true)) + if (move_pgt_entry(NORMAL_PMD, vma, new_vma, old_addr, new_addr, + old_pmd, new_pmd, true, update_pgoff)) continue; } =20 if (pte_alloc(new_vma->vm_mm, new_pmd)) break; move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, - new_pmd, new_addr, need_rmap_locks); + new_pmd, new_addr, need_rmap_locks, update_pgoff); } =20 mmu_notifier_invalidate_range_end(&range); @@ -568,7 +648,8 @@ static unsigned long move_vma(struct vm_area_struct *vm= a, unsigned long hiwater_vm; int split =3D 0; int err =3D 0; - bool need_rmap_locks; + bool need_rmap_locks =3D false; + bool update_pgoff =3D false; =20 /* * We'd prefer to avoid failure later on in do_munmap: @@ -608,7 +689,7 @@ static unsigned long move_vma(struct vm_area_struct *vm= a, =20 new_pgoff =3D vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma =3D copy_vma(&vma, new_addr, new_len, new_pgoff, - &need_rmap_locks); + &need_rmap_locks, &update_pgoff); if (!new_vma) 
{ if (vm_flags & VM_ACCOUNT) vm_unacct_memory(to_account >> PAGE_SHIFT); @@ -616,7 +697,7 @@ static unsigned long move_vma(struct vm_area_struct *vm= a, } =20 moved_len =3D move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks); + need_rmap_locks, update_pgoff); if (moved_len < old_len) { err =3D -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { @@ -630,7 +711,7 @@ static unsigned long move_vma(struct vm_area_struct *vm= a, * and then proceed to unmap new area instead of old. */ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true); + true, update_pgoff); vma =3D new_vma; old_len =3D new_len; old_addr =3D new_addr; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0bfb8c9255f3..d603962ddd52 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -89,7 +89,7 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long= addr, return err; } =20 -static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long en= d, +int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t *pte; diff --git a/mm/rmap.c b/mm/rmap.c index d4d95ada0946..b1bddabd21c6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -389,6 +389,47 @@ int anon_vma_fork(struct vm_area_struct *vma, struct v= m_area_struct *pvma) return -ENOMEM; } =20 + +/* + * rbst_no_children() - Used by rbt_no_children to check node subtree. + * Check if none of the VMAs connected to the node subtree via + * anon_vma_chain are in child relationship to the given anon_vma. 
+ * @av: anon_vma to check
+ * @node: node to check in this level
+ */
+static bool rbst_no_children(struct anon_vma *av, struct rb_node *node)
+{
+	struct anon_vma_chain *model;
+	struct anon_vma_chain *avc;
+
+	if (node == NULL)	/* leaf node */
+		return true;
+	avc = container_of(node, typeof(*(model)), rb);
+	if (avc->vma->anon_vma != av)
+		/*
+		 * Inequality implies avc belongs
+		 * to a VMA of a child process
+		 */
+		return false;
+	return (rbst_no_children(av, node->rb_left) &&
+		rbst_no_children(av, node->rb_right));
+}
+
+/*
+ * rbt_no_children() - Check if none of the VMAs connected to the given
+ * anon_vma via anon_vma_chain are in child relationship
+ * @av: anon_vma to check if it has children
+ */
+bool rbt_no_children(struct anon_vma *av)
+{
+	struct rb_node *root_node;
+
+	if (av == NULL || av->degree <= 1)	/* Higher degree might not necessarily imply children */
+		return true;
+	root_node = av->rb_root.rb_root.rb_node;
+	return rbst_no_children(av, root_node);
+}
+
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
	struct anon_vma_chain *avc, *next;
--
2.35.1

From nobody Sat May 9 10:44:47 2026
From: Jakub Matěna
To: linux-mm@kvack.org
Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, Jakub Matěna
Subject: [RFC PATCH v3 5/6] mm: enable merging of VMAs with different anon_vmas
Date: Mon, 16 May 2022 14:54:04 +0200
Message-Id: <20220516125405.1675-6-matenajakub@gmail.com>
In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com>
References: <20220516125405.1675-1-matenajakub@gmail.com>

Enable merging of a VMA even when it is linked to a different anon_vma than the VMA it is being merged into, but only if the VMA in question does not share any pages with a parent or child process. This enables merges that would otherwise not be possible and therefore decreases the number of VMAs of a process.

In this patch the VMA is checked only at the level of its anon_vma to find out whether it shares any pages with a parent or child process. The check is performed in is_mergeable_anon_vma(), which is part of vma_merge(). Here it is not easily possible to check the mapcount of individual pages (as opposed to the previous commit "mm: adjust page offset in mremap"), because vma_merge() does not have a pointer to the VMA or any other means of easily accessing its page structures.

The following two paragraphs use cases 1 through 8 as described in the comment above vma_merge(). The update itself is done during __vma_adjust() for cases 4 through 8 and partially for case 1.
Other cases must be solved elsewhere, because __vma_adjust() can only work with pages that already reside at the location of the merge, in other words only if a VMA already exists there. That is not true for cases 2, 3 and partially case 1, where the next VMA is already present but the middle part is not. Cases 1, 2 and 3 either expand or move a VMA to the location of the merge, but at the time of the merge the mapping is not there yet, so the page update has to be done later. The easy way out is when the pages do not exist yet and there is nothing to update; this happens e.g. when expanding a mapping in mmap_region() or in do_brk_flags(). On the other hand, the pages do exist and have to be updated during an mremap call that moves an already existing and possibly faulted mapping. In that case the page update is done in move_page_tables(). This is quite simple, as the previous commit "mm: adjust page offset in mremap" already introduced the page update, so the only change is passing one more parameter. If an rmap walk happens between __vma_adjust() and the page update done in move_page_tables(), the old VMA and old anon_vma are used, just as they would have been before the merge started.

Cases 4 through 8 correspond to merges that result from an mprotect call or any other flag update that does not move or resize the mapping. Together with part of case 1, the update of the physical pages is then handled directly in __vma_adjust(), as mentioned before. First, the address range to update is determined; it depends on the specific case (1, 4, 5, 6, 7 or 8) and is stored in the variables pu_start and pu_end. Second, the value to store in page->mapping is determined: it is always the anon_vma belonging to the expand parameter of the __vma_adjust() call.
The reason for updating the pages is that __vma_adjust() updates vm_start and vm_end of the involved VMAs, so the physical pages may afterwards belong to a different VMA and anon_vma than before. The problem is that these two updates (VMAs and pages) should happen atomically from the rmap walk point of view. Normally this would be solved with rmap locks, but page migration uses an rmap walk at its start and at its end, and holding the rmap locks could trap a migration in the middle. The ptes would then not point to actual pages but remain in the migration entry state, which would block the page update. The solution is to drop the rmap lock, allowing any page migration to finish, and then page walk all the relevant pages, waiting for possible migration to end and updating page->mapping (i.e. the anon_vma). After that the rmap lock is taken again. This whole page update must be done after the expand VMA has already been enlarged but while the source VMA still has its original range. That way, if an rmap walk happens while the pages are being updated, the rmap lock works either with the old or the new anon_vma, and therefore also with the old or the new VMA. If a page is swapped out, a zero page or a KSM page, it is left unchanged and the correct mapping is reconstructed from the VMA itself when the page returns to its normal state.

Again, as explained in the previous commit "mm: adjust page offset in mremap", the mapcount of the pages cannot change between the vma_merge() checks and the actual merge in __vma_adjust(), because a potential fork is prevented by mmap_lock.

If one of the VMAs is not yet faulted and therefore has no anon_vma assigned, this patch is not needed and the merge happens even without it.
Signed-off-by: Jakub Mat=C4=9Bna Reported-by: kernel test robot --- include/linux/pagewalk.h | 2 + include/linux/rmap.h | 15 ++++++- mm/mmap.c | 60 ++++++++++++++++++++++++--- mm/mremap.c | 22 +++++++--- mm/pagewalk.c | 2 +- mm/rmap.c | 87 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 176 insertions(+), 12 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 11c99c8d343b..9685d1a26f17 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -106,6 +106,8 @@ int walk_pte_range(pmd_t *pmd, unsigned long addr, unsi= gned long end, int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk); int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9fee804f47ea..c1ba908f92e6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -138,6 +138,8 @@ static inline void anon_vma_unlock_read(struct anon_vma= *anon_vma) */ void anon_vma_init(void); /* create anon_vma_cachep */ int __anon_vma_prepare(struct vm_area_struct *); +void reconnect_pages_range(struct mm_struct *mm, unsigned long start, unsi= gned long end, + struct vm_area_struct *target, struct vm_area_struct *source); void take_rmap_locks(struct vm_area_struct *vma); void drop_rmap_locks(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *); @@ -154,10 +156,21 @@ static inline int anon_vma_prepare(struct vm_area_str= uct *vma) return __anon_vma_prepare(vma); } =20 +/** + * anon_vma_merge() - Merge anon_vmas of the given VMAs + * @vma: VMA being merged to + * @next: VMA being merged + */ static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { - VM_BUG_ON_VMA(vma->anon_vma !=3D next->anon_vma, vma); + struct 
anon_vma *anon_vma1 =3D vma->anon_vma; + struct anon_vma *anon_vma2 =3D next->anon_vma; + + VM_BUG_ON_VMA(anon_vma1 && anon_vma2 && anon_vma1 !=3D anon_vma2 && + ((anon_vma2 !=3D anon_vma2->root) + || !rbt_no_children(anon_vma2)), vma); + unlink_anon_vmas(next); } =20 diff --git a/mm/mmap.c b/mm/mmap.c index 3ca78baaee13..e7760e378a68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -753,6 +753,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned l= ong start, bool start_changed =3D false, end_changed =3D false; long adjust_next =3D 0; int remove_next =3D 0; + unsigned long pu_start =3D 0; + unsigned long pu_end =3D 0; =20 if (next && !insert) { struct vm_area_struct *exporter =3D NULL, *importer =3D NULL; @@ -778,6 +780,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned l= ong start, remove_next =3D 3; VM_WARN_ON(file !=3D next->vm_file); swap(vma, next); + pu_start =3D start; + pu_end =3D vma->vm_end; } else { VM_WARN_ON(expand !=3D vma); /* @@ -789,6 +793,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned = long start, end !=3D next->vm_next->vm_end); /* trim end to next, for case 6 first pass */ end =3D next->vm_end; + VM_WARN_ON(vma =3D=3D NULL); + + pu_start =3D next->vm_start; + pu_end =3D next->vm_end; } =20 exporter =3D next; @@ -810,6 +818,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned l= ong start, exporter =3D next; importer =3D vma; VM_WARN_ON(expand !=3D importer); + pu_start =3D vma->vm_end; + pu_end =3D end; } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -820,6 +830,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned l= ong start, exporter =3D vma; importer =3D next; VM_WARN_ON(expand !=3D importer); + pu_start =3D end; + pu_end =3D vma->vm_end; } =20 /* @@ -863,8 +875,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned l= ong start, if (!anon_vma && adjust_next) anon_vma =3D next->anon_vma; if (anon_vma) { - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma !=3D 
 				next->anon_vma);
 		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		if (adjust_next)
@@ -887,6 +897,31 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			end_changed = true;
 		}
 		vma->vm_pgoff = pgoff;
+
+	/* Update the anon_vma stored in pages in the range specified by pu_start and pu_end */
+	if (anon_vma && next && anon_vma != next->anon_vma && pu_start != pu_end) {
+		struct vm_area_struct *source;
+
+		anon_vma_unlock_write(anon_vma);
+		VM_WARN_ON(expand == vma && next->anon_vma &&
+			   (next->anon_vma != next->anon_vma->root
+			    || !rbt_no_children(next->anon_vma)));
+		VM_WARN_ON(expand == next &&
+			   (anon_vma != anon_vma->root || !rbt_no_children(anon_vma)));
+		VM_WARN_ON(expand != vma && expand != next);
+		VM_WARN_ON(expand == NULL);
+		if (expand == vma)
+			source = next;
+		else
+			source = vma;
+		/*
+		 * Page walk over affected address range.
+		 * Wait for migration and update page->mapping.
+		 */
+		reconnect_pages_range(mm, pu_start, pu_end, expand, source);
+		anon_vma_lock_write(anon_vma);
+	}
+
 	if (adjust_next) {
 		next->vm_start += adjust_next;
 		next->vm_pgoff += adjust_next >> PAGE_SHIFT;
@@ -991,6 +1026,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
+			pu_start = next->vm_start;
+			pu_end = next->vm_end;
 			goto again;
 		}
 		else if (next)
@@ -1067,7 +1104,20 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
 	if ((!anon_vma1 || !anon_vma2) &&
 	    (!vma || list_is_singular(&vma->anon_vma_chain)))
 		return 1;
-	return anon_vma1 == anon_vma2;
+	if (anon_vma1 == anon_vma2)
+		return 1;
+	/*
+	 * Different anon_vma but not shared by several processes
+	 */
+	else if ((anon_vma1 && anon_vma2) &&
+		 (anon_vma1 == anon_vma1->root)
+		 && (rbt_no_children(anon_vma1)))
+		return 1;
+	/*
+	 * Different anon_vma and shared -> unmergeable
+	 */
+	else
+		return 0;
 }
 
 /*
@@ -1213,8 +1263,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	}
 	/* Can we merge both the predecessor and the successor? */
 	if (merge_prev && merge_next &&
-			is_mergeable_anon_vma(prev->anon_vma,
-				next->anon_vma, NULL)) {	/* cases 1, 6 */
+			is_mergeable_anon_vma(next->anon_vma,
+				prev->anon_vma, NULL)) {	/* cases 1, 6 */
 		err = __vma_adjust(prev, prev->vm_start,
 					next->vm_end, prev->vm_pgoff, NULL,
 					prev);
diff --git a/mm/mremap.c b/mm/mremap.c
index 2ef444abb08a..3b2428288b0e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -119,12 +119,16 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 }
 
 /*
- * update_pgoff_page() - Update page offset stored in page->index, if the page is not NULL.
+ * update_pgoff_page() - Update page offset stored in page->index
+ * and anon_vma in page->mapping, if the page is not NULL.
  * @addr: new address to calculate the page offset.
  * @page: page to update
+ * @vma: vma to get anon_vma
  */
-static int update_pgoff_page(unsigned long addr, struct page *page)
+static int update_pgoff_page(unsigned long addr, struct page *page, struct vm_area_struct *vma)
 {
+	struct anon_vma *page_anon_vma;
+	unsigned long anon_mapping;
 	if (page != NULL) {
 		get_page(page);
 		if (!trylock_page(page)) {
@@ -132,6 +136,13 @@ static int update_pgoff_page(unsigned long addr, struct page *page)
 			return -1;
 		}
 		page->index = addr >> PAGE_SHIFT;
+
+		anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+		page_anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+		if (page_anon_vma != vma->anon_vma
+		    && page_anon_vma != NULL) { /* NULL in case of ZERO_PAGE or KSM page */
+			page_move_anon_rmap(page, vma); /* Update physical page's mapping */
+		}
 		unlock_page(page);
 		put_page(page);
 	}
@@ -144,7 +155,8 @@ static int update_pgoff_page(unsigned long addr, struct page *page)
  */
 static int update_pgoff_pte_inner(pte_t *old_pte, unsigned long old_addr,
 		struct vm_area_struct *vma, spinlock_t *old_ptl,
-		pmd_t *old_pmd, unsigned long new_addr)
+		pmd_t *old_pmd, unsigned long new_addr,
+		struct vm_area_struct *new_vma)
 {
 	struct page *page;
 	/*
@@ -170,7 +182,7 @@ static int update_pgoff_pte_inner(pte_t *old_pte, unsigned long old_addr,
 	}
 
 	page = vm_normal_page(vma, old_addr, *old_pte);
-	return update_pgoff_page(new_addr, page);
+	return update_pgoff_page(new_addr, page, new_vma);
 }
 
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
@@ -227,7 +239,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 		if (update_pgoff)
 			if (update_pgoff_pte_inner(old_pte, old_addr, vma, old_ptl,
-					old_pmd, new_addr))
+					old_pmd, new_addr, new_vma))
 				break; /* Causes unlock after for cycle and goto retry */
 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		/*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d603962ddd52..4076a5ecdec0 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -419,7 +419,7 @@ static int walk_page_test(unsigned long start, unsigned long end,
 	return 0;
 }
 
-static int __walk_page_range(unsigned long start, unsigned long end,
+int __walk_page_range(unsigned long start, unsigned long end,
 		struct mm_walk *walk)
 {
 	int err = 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index b1bddabd21c6..7caa6ec6110a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -389,6 +390,92 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	return -ENOMEM;
 }
 
+/*
+ * reconnect_page() - If the page is not NULL and has a non-NULL anon_vma,
+ * reconnect the page to an anon_vma of the given new VMA.
+ * @page: Page to reconnect to different anon_vma
+ * @old: Old VMA the page is connected to
+ * @new: New VMA the page will be reconnected to
+ */
+static int reconnect_page(struct page *page, struct vm_area_struct *old,
+		struct vm_area_struct *new)
+{
+	struct anon_vma *page_anon_vma;
+	unsigned long anon_mapping;
+	/* Do some checks and lock the page */
+	if (page == NULL)
+		return 0; /* Virtual memory page is not mapped */
+	get_page(page);
+	if (!trylock_page(page)) {
+		put_page(page);
+		return -1;
+	}
+	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+	page_anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	if (page_anon_vma != NULL) { /* NULL in case of ZERO_PAGE or KSM page */
+		VM_WARN_ON(page_anon_vma != old->anon_vma);
+		VM_WARN_ON(old->anon_vma == new->anon_vma);
+		/* Update physical page's mapping */
+		page_move_anon_rmap(page, new);
+	}
+	unlock_page(page);
+	put_page(page);
+	return 0;
+}
+
+/*
+ * reconnect_page_pte() - Reconnect page mapped by pte from old anon_vma
+ * to new anon_vma.
+ * @pte: pte to work with
+ * @addr: Address where the page should be mapped.
+ * @end: Not used
+ * @walk: Pagewalk structure holding pointer to old and new VMAs.
+ */
+static int reconnect_page_pte(pte_t *pte, unsigned long addr,
+		unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *old = walk->vma;
+	struct page *page;
+
+	/*
+	 * Page's anon_vma will be reconstructed automatically from the
+	 * VMA after the data is moved back into RAM
+	 */
+	if (!pte_present(*pte))
+		return 0;
+
+	page = vm_normal_page(old, addr, *pte);
+
+	if (reconnect_page(page, old, walk->private) == -1)
+		walk->action = ACTION_AGAIN;
+	return 0;
+}
+
+/*
+ * reconnect_pages_range() - Reconnect physical pages to anon_vma of target VMA
+ * @mm: Memory descriptor
+ * @start: range start
+ * @end: range end
+ * @target: VMA to newly contain all physical pages
+ * @source: VMA which contains all the physical pages before reconnecting them
+ */
+void reconnect_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+		struct vm_area_struct *target, struct vm_area_struct *source)
+{
+	const struct mm_walk_ops reconnect_pages_ops = {
+		.pte_entry = reconnect_page_pte
+	};
+
+	struct mm_walk walk = {
+		.ops = &reconnect_pages_ops,
+		.mm = mm,
+		.private = target,
+		.flags = WALK_MIGRATION & WALK_LOCK_RMAP,
+		.vma = source
+	};
+	/* Modify page->mapping for all pages in range */
+	__walk_page_range(start, end, &walk);
+}
 
 /*
  * rbst_no_children() - Used by rbt_no_children to check node subtree.
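The relaxed mergeability rule above (merge when one side has no anon_vma, when both sides share one, or when the anon_vmas differ but the first is its own root and its interval tree holds no entries from other processes) can be sketched in plain userspace C. This is a toy model, not kernel code: `struct toy_anon_vma` and `nr_foreign_children` are illustrative stand-ins for the kernel's `anon_vma` and the `rbt_no_children()` check.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Toy stand-in for struct anon_vma: only the two properties the check reads. */
struct toy_anon_vma {
	struct toy_anon_vma *root;   /* points to itself unless inherited via fork */
	int nr_foreign_children;     /* 0 models rbt_no_children() == true */
};

/*
 * Sketch of the refactored mergeability decision:
 * - one side NULL          -> mergeable
 * - same anon_vma          -> mergeable
 * - different, but av1 is its own root with no foreign children
 *                          -> mergeable (pages can be reconnected)
 * - otherwise (shared)     -> unmergeable
 */
static bool toy_is_mergeable(struct toy_anon_vma *av1, struct toy_anon_vma *av2)
{
	if (!av1 || !av2)
		return true;
	if (av1 == av2)
		return true;
	if (av1 == av1->root && av1->nr_foreign_children == 0)
		return true;
	return false;
}
```

The interesting branch is the third one: two distinct anon_vmas are still mergeable when nothing outside the current process references the first, because the series can then rewrite `page->mapping` for its pages.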
-- 
2.35.1

From: Jakub Matěna
To: linux-mm@kvack.org
Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, vbabka@suse.cz, mhocko@kernel.org, mgorman@techsingularity.net, willy@infradead.org, liam.howlett@oracle.com, hughd@google.com, kirill@shutemov.name, riel@surriel.com, rostedt@goodmis.org, peterz@infradead.org, david@redhat.com, Jakub Matěna
Subject: [RFC PATCH v3 6/6] mm: add tracing for VMA merges
Date: Mon, 16 May 2022 14:54:05 +0200
Message-Id: <20220516125405.1675-7-matenajakub@gmail.com>
In-Reply-To: <20220516125405.1675-1-matenajakub@gmail.com>
References: <20220516125405.1675-1-matenajakub@gmail.com>

Add trace support for vma_merge() to measure successful and unsuccessful merges of two VMAs
with distinct anon_vmas, and trace support for merges made possible by the
page offset update introduced by a previous patch in this series.

Signed-off-by: Jakub Matěna
---
 include/trace/events/mmap.h | 83 +++++++++++++++++++++++++++++++++++++
 mm/internal.h               | 12 ++++++
 mm/mmap.c                   | 69 ++++++++++++++++--------------
 3 files changed, 133 insertions(+), 31 deletions(-)

diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
index 4661f7ba07c0..bad7abe4899c 100644
--- a/include/trace/events/mmap.h
+++ b/include/trace/events/mmap.h
@@ -6,6 +6,27 @@
 #define _TRACE_MMAP_H
 
 #include
+#include <../mm/internal.h>
+
+#define AV_MERGE_TYPES \
+	EM(MERGE_FAILED) \
+	EM(AV_MERGE_FAILED) \
+	EM(AV_MERGE_NULL) \
+	EM(AV_MERGE_SAME) \
+	EMe(AV_MERGE_DIFFERENT)
+
+#undef EM
+#undef EMe
+#define EM(a) TRACE_DEFINE_ENUM(a);
+#define EMe(a) TRACE_DEFINE_ENUM(a);
+
+AV_MERGE_TYPES
+
+#undef EM
+#undef EMe
+
+#define EM(a) { a, #a },
+#define EMe(a) { a, #a }
 
 TRACE_EVENT(vm_unmapped_area,
 
@@ -42,6 +63,68 @@ TRACE_EVENT(vm_unmapped_area,
 		__entry->low_limit, __entry->high_limit, __entry->align_mask,
 		__entry->align_offset)
 );
+
+TRACE_EVENT(vm_av_merge,
+
+	TP_PROTO(int merged, enum vma_merge_res merge_prev,
+		enum vma_merge_res merge_next, enum vma_merge_res merge_both),
+
+	TP_ARGS(merged, merge_prev, merge_next, merge_both),
+
+	TP_STRUCT__entry(
+		__field(int, merged)
+		__field(enum vma_merge_res, predecessor_different_av)
+		__field(enum vma_merge_res, successor_different_av)
+		__field(enum vma_merge_res, predecessor_with_successor_different_av)
+		__field(int, same_count)
+		__field(int, diff_count)
+		__field(int, failed_count)
+	),
+
+	TP_fast_assign(
+		__entry->merged = merged == 0;
+		__entry->predecessor_different_av = merge_prev;
+		__entry->successor_different_av = merge_next;
+		__entry->predecessor_with_successor_different_av = merge_both;
+		__entry->same_count = (merge_prev == AV_MERGE_SAME)
+			+ (merge_next == AV_MERGE_SAME)
+			+ (merge_both == AV_MERGE_SAME);
+		__entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
+			+ (merge_next == AV_MERGE_DIFFERENT)
+			+ (merge_both == AV_MERGE_DIFFERENT);
+		__entry->failed_count = (merge_prev == AV_MERGE_FAILED)
+			+ (merge_next == AV_MERGE_FAILED)
+			+ (merge_both == AV_MERGE_FAILED);
+	),
+
+	TP_printk("merged=%d predecessor=%s successor=%s predecessor_with_successor=%s same_count=%d diff_count=%d failed_count=%d",
+		__entry->merged,
+		__print_symbolic(__entry->predecessor_different_av, AV_MERGE_TYPES),
+		__print_symbolic(__entry->successor_different_av, AV_MERGE_TYPES),
+		__print_symbolic(__entry->predecessor_with_successor_different_av, AV_MERGE_TYPES),
+		__entry->same_count, __entry->diff_count, __entry->failed_count)
+
+);
+
+TRACE_EVENT(vm_pgoff_merge,
+
+	TP_PROTO(struct vm_area_struct *vma, bool anon_pgoff_updated),
+
+	TP_ARGS(vma, anon_pgoff_updated),
+
+	TP_STRUCT__entry(
+		__field(bool, faulted)
+		__field(bool, updated)
+	),
+
+	TP_fast_assign(
+		__entry->faulted = vma->anon_vma;
+		__entry->updated = anon_pgoff_updated;
+	),
+
+	TP_printk("faulted=%d updated=%d\n",
+		__entry->faulted, __entry->updated)
+);
 #endif
 
 /* This part must be outside protection */
diff --git a/mm/internal.h b/mm/internal.h
index cf16280ce132..9284e779f53d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -35,6 +35,18 @@ struct folio_batch;
 /* Do not use these with a slab allocator */
 #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
 
+/*
+ * The following values indicate the reason for merge success or failure.
+ */
+enum vma_merge_res {
+	MERGE_FAILED,
+	AV_MERGE_FAILED,
+	AV_MERGE_NULL,
+	MERGE_OK = AV_MERGE_NULL,
+	AV_MERGE_SAME,
+	AV_MERGE_DIFFERENT,
+};
+
 void page_writeback_init(void);
 
 static inline void *folio_raw_mapping(struct folio *folio)
diff --git a/mm/mmap.c b/mm/mmap.c
index e7760e378a68..3cecc2efe763 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1103,21 +1103,21 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
 	 */
 	if ((!anon_vma1 || !anon_vma2) &&
 	    (!vma || list_is_singular(&vma->anon_vma_chain)))
-		return 1;
+		return AV_MERGE_NULL;
 	if (anon_vma1 == anon_vma2)
-		return 1;
+		return AV_MERGE_SAME;
 	/*
 	 * Different anon_vma but not shared by several processes
 	 */
 	else if ((anon_vma1 && anon_vma2) &&
 		 (anon_vma1 == anon_vma1->root)
 		 && (rbt_no_children(anon_vma1)))
-		return 1;
+		return AV_MERGE_DIFFERENT;
 	/*
 	 * Different anon_vma and shared -> unmergeable
 	 */
 	else
-		return 0;
+		return AV_MERGE_FAILED;
 }
 
 /*
@@ -1138,12 +1138,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 		struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 		struct anon_vma_name *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
-	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name))
 		if (vma->vm_pgoff == vm_pgoff)
-			return 1;
-	}
-	return 0;
+			return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
+	return MERGE_FAILED;
 }
 
 /*
@@ -1160,14 +1158,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 		struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 		struct anon_vma_name *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
-	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name)) {
 		pgoff_t vm_pglen;
 		vm_pglen = vma_pages(vma);
 		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
-			return 1;
+			return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
 	}
-	return 0;
+	return MERGE_FAILED;
 }
 
 /*
@@ -1224,8 +1221,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
 	int err = -1;
-	bool merge_prev = false;
-	bool merge_next = false;
+	/*
+	 * The following three variables store values indicating
+	 * whether this VMA and its anon_vma can be merged and
+	 * also the type of failure or success.
+	 */
+	enum vma_merge_res merge_prev = MERGE_FAILED;
+	enum vma_merge_res merge_both = MERGE_FAILED;
+	enum vma_merge_res merge_next = MERGE_FAILED;
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
@@ -1246,32 +1249,34 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 
 	/* Can we merge the predecessor? */
 	if (prev && prev->vm_end == addr &&
-			mpol_equal(vma_policy(prev), policy) &&
-			can_vma_merge_after(prev, vm_flags,
+			mpol_equal(vma_policy(prev), policy)) {
+		merge_prev = can_vma_merge_after(prev, vm_flags,
 					anon_vma, file, pgoff,
-					vm_userfaultfd_ctx, anon_name)) {
-		merge_prev = true;
-		area = prev;
+					vm_userfaultfd_ctx, anon_name);
 	}
 
 	/* Can we merge the successor? */
 	if (next && end == next->vm_start &&
-			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen,
-					vm_userfaultfd_ctx, anon_name)) {
-		merge_next = true;
+			mpol_equal(policy, vma_policy(next))) {
+		merge_next = can_vma_merge_before(next, vm_flags,
+					anon_vma, file, pgoff+pglen,
+					vm_userfaultfd_ctx, anon_name);
 	}
 
 	/* Can we merge both the predecessor and the successor? */
-	if (merge_prev && merge_next &&
-			is_mergeable_anon_vma(next->anon_vma,
-				prev->anon_vma, NULL)) {	/* cases 1, 6 */
+	if (merge_prev >= MERGE_OK && merge_next >= MERGE_OK)
+		merge_both = is_mergeable_anon_vma(next->anon_vma, prev->anon_vma, NULL);
+
+	if (merge_both >= MERGE_OK) {	/* cases 1, 6 */
 		err = __vma_adjust(prev, prev->vm_start,
 					next->vm_end, prev->vm_pgoff, NULL,
 					prev);
-	} else if (merge_prev) {	/* cases 2, 5, 7 */
+		area = prev;
+	} else if (merge_prev >= MERGE_OK) {	/* cases 2, 5, 7 */
 		err = __vma_adjust(prev, prev->vm_start,
 					end, prev->vm_pgoff, NULL, prev);
-	} else if (merge_next) {
+		area = prev;
+	} else if (merge_next >= MERGE_OK) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = __vma_adjust(prev, prev->vm_start,
 					addr, prev->vm_pgoff, NULL, next);
@@ -1285,7 +1290,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 		 */
 		area = next;
 	}
-
+	trace_vm_av_merge(err, merge_prev, merge_next, merge_both);
 	/*
 	 * Cannot merge with predecessor or successor or error in __vma_adjust?
 	 */
@@ -3346,6 +3351,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	/*
 	 * Source vma may have been merged into new_vma
 	 */
+	trace_vm_pgoff_merge(vma, anon_pgoff_updated);
+
 	if (unlikely(vma_start >= new_vma->vm_start &&
 		     vma_start < new_vma->vm_end)) {
 		/*
-- 
2.35.1
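The patch leans on the ordering of `enum vma_merge_res`: every value at or above `MERGE_OK` (aliased to `AV_MERGE_NULL`) is a success variant, so callers test `res >= MERGE_OK` instead of comparing against each success value. A minimal userspace sketch of that convention, mirroring only the enum from the patch (the `merge_succeeded()` helper is illustrative, not part of the patch):

```c
#include <assert.h>

/*
 * Mirror of the patch's result enum. The two failure reasons sort below
 * MERGE_OK; the three success reasons (NULL/absent anon_vma, same
 * anon_vma, different-but-unshared anon_vma) sort at or above it.
 */
enum vma_merge_res {
	MERGE_FAILED,
	AV_MERGE_FAILED,
	AV_MERGE_NULL,
	MERGE_OK = AV_MERGE_NULL,   /* lowest value that counts as success */
	AV_MERGE_SAME,
	AV_MERGE_DIFFERENT,
};

/* One comparison classifies any result while keeping the reason for tracing. */
static int merge_succeeded(enum vma_merge_res res)
{
	return res >= MERGE_OK;
}
```

This is why `vma_merge()` can both branch on a single comparison and still hand the precise reason (`AV_MERGE_SAME` vs. `AV_MERGE_DIFFERENT`, etc.) to `trace_vm_av_merge()`.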