From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
To: akpm@linux-foundation.org
Cc: ryan.roberts@arm.com, david@redhat.com, willy@infradead.org,
    linux-mm@kvack.org, linux-kernel@vger.kernel.org, catalin.marinas@arm.com,
    will@kernel.org, Liam.Howlett@oracle.com, lorenzo.stoakes@oracle.com,
    vbabka@suse.cz, jannh@google.com, anshuman.khandual@arm.com,
    peterx@redhat.com, joey.gouly@arm.com, ioworker0@gmail.com,
    baohua@kernel.org, kevin.brodsky@arm.com, quic_zhenhuah@quicinc.com,
    christophe.leroy@csgroup.eu, yangyicong@hisilicon.com,
    linux-arm-kernel@lists.infradead.org, hughd@google.com,
    yang@os.amperecomputing.com, ziy@nvidia.com, Dev Jain
Subject: [PATCH v5 1/7] mm: Refactor MM_CP_PROT_NUMA skipping case into new function
Date: Fri, 18 Jul 2025 14:32:38 +0530
Message-Id: <20250718090244.21092-2-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

Reduce indentation by refactoring the prot_numa case into a new function.
No functional change intended.
Signed-off-by: Dev Jain
Reviewed-by: Barry Song
Reviewed-by: Lorenzo Stoakes
---
 mm/mprotect.c | 101 +++++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 46 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88709c01177b..2a9c73bd0778 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -83,6 +83,59 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	return pte_dirty(pte);
 }
 
+static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
+			   pte_t oldpte, pte_t *pte, int target_node)
+{
+	struct folio *folio;
+	bool toptier;
+	int nid;
+
+	/* Avoid TLB flush if possible */
+	if (pte_protnone(oldpte))
+		return true;
+
+	folio = vm_normal_folio(vma, addr, oldpte);
+	if (!folio)
+		return true;
+
+	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+		return true;
+
+	/* Also skip shared copy-on-write pages */
+	if (is_cow_mapping(vma->vm_flags) &&
+	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+		return true;
+
+	/*
+	 * While migration can move some dirty pages,
+	 * it cannot move them all from MIGRATE_ASYNC
+	 * context.
+	 */
+	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+		return true;
+
+	/*
+	 * Don't mess with PTEs if page is already on the node
+	 * a single-threaded process is running on.
+	 */
+	nid = folio_nid(folio);
+	if (target_node == nid)
+		return true;
+
+	toptier = node_is_toptier(nid);
+
+	/*
+	 * Skip scanning top tier node if normal numa
+	 * balancing is disabled
+	 */
+	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+		return true;
+
+	if (folio_use_access_time(folio))
+		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+	return false;
+}
+
 static long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -117,53 +170,9 @@ static long change_pte_range(struct mmu_gather *tlb,
 		 * pages. See similar comment in change_huge_pmd.
 		 */
 		if (prot_numa) {
-			struct folio *folio;
-			int nid;
-			bool toptier;
-
-			/* Avoid TLB flush if possible */
-			if (pte_protnone(oldpte))
-				continue;
-
-			folio = vm_normal_folio(vma, addr, oldpte);
-			if (!folio || folio_is_zone_device(folio) ||
-			    folio_test_ksm(folio))
-				continue;
-
-			/* Also skip shared copy-on-write pages */
-			if (is_cow_mapping(vma->vm_flags) &&
-			    (folio_maybe_dma_pinned(folio) ||
-			     folio_maybe_mapped_shared(folio)))
-				continue;
-
-			/*
-			 * While migration can move some dirty pages,
-			 * it cannot move them all from MIGRATE_ASYNC
-			 * context.
-			 */
-			if (folio_is_file_lru(folio) &&
-			    folio_test_dirty(folio))
-				continue;
-
-			/*
-			 * Don't mess with PTEs if page is already on the node
-			 * a single-threaded process is running on.
-			 */
-			nid = folio_nid(folio);
-			if (target_node == nid)
-				continue;
-			toptier = node_is_toptier(nid);
-
-			/*
-			 * Skip scanning top tier node if normal numa
-			 * balancing is disabled
-			 */
-			if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
-			    toptier)
+			if (prot_numa_skip(vma, addr, oldpte, pte,
+					   target_node))
 				continue;
-			if (folio_use_access_time(folio))
-				folio_xchg_access_time(folio,
-					jiffies_to_msecs(jiffies));
 		}
 
 		oldpte = ptep_modify_prot_start(vma, addr, pte);
-- 
2.30.2

From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 2/7] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs
Date: Fri, 18 Jul 2025 14:32:39 +0530
Message-Id: <20250718090244.21092-3-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

For the MM_CP_PROT_NUMA skipping case, observe that, if we skip an
iteration due to the underlying folio satisfying any of the skip
conditions, then for all subsequent ptes which map the same folio,
the iteration will be skipped for them too. Therefore, we can optimize
by using folio_pte_batch() to batch skip the iterations.

Use prot_numa_skip() introduced in the previous patch to determine
whether we need to skip the iteration. Change its signature to have a
double pointer to a folio, which will be used by
mprotect_folio_pte_batch() to determine the number of iterations we can
safely skip.

Signed-off-by: Dev Jain
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Ryan Roberts
---
 mm/mprotect.c | 55 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2a9c73bd0778..97adc62c50ab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -83,28 +83,43 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	return pte_dirty(pte);
 }
 
+static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+		pte_t pte, int max_nr_ptes)
+{
+	/* No underlying folio, so cannot batch */
+	if (!folio)
+		return 1;
+
+	if (!folio_test_large(folio))
+		return 1;
+
+	return folio_pte_batch(folio, ptep, pte, max_nr_ptes);
+}
+
 static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
-			   pte_t oldpte, pte_t *pte, int target_node)
+			   pte_t oldpte, pte_t *pte, int target_node,
+			   struct folio **foliop)
 {
-	struct folio *folio;
+	struct folio *folio = NULL;
+	bool ret = true;
 	bool toptier;
 	int nid;
 
 	/* Avoid TLB flush if possible */
 	if (pte_protnone(oldpte))
-		return true;
+		goto skip;
 
 	folio = vm_normal_folio(vma, addr, oldpte);
 	if (!folio)
-		return true;
+		goto skip;
 
 	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
-		return true;
+		goto skip;
 
 	/* Also skip shared copy-on-write pages */
 	if (is_cow_mapping(vma->vm_flags) &&
 	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
-		return true;
+		goto skip;
 
 	/*
 	 * While migration can move some dirty pages,
@@ -112,7 +127,7 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
 	 * context.
 	 */
 	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
-		return true;
+		goto skip;
 
 	/*
 	 * Don't mess with PTEs if page is already on the node
@@ -120,7 +135,7 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
 	 */
 	nid = folio_nid(folio);
 	if (target_node == nid)
-		return true;
+		goto skip;
 
 	toptier = node_is_toptier(nid);
 
@@ -129,11 +144,15 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
 	 * balancing is disabled
 	 */
 	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
-		return true;
+		goto skip;
 
+	ret = false;
 	if (folio_use_access_time(folio))
 		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
-	return false;
+
+skip:
+	*foliop = folio;
+	return ret;
 }
 
 static long change_pte_range(struct mmu_gather *tlb,
@@ -147,6 +166,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+	int nr_ptes;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -161,8 +181,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
+		nr_ptes = 1;
 		oldpte = ptep_get(pte);
 		if (pte_present(oldpte)) {
+			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+			struct folio *folio;
 			pte_t ptent;
 
 			/*
@@ -170,9 +193,15 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * pages. See similar comment in change_huge_pmd.
 			 */
 			if (prot_numa) {
-				if (prot_numa_skip(vma, addr, oldpte, pte,
-						   target_node))
+				int ret = prot_numa_skip(vma, addr, oldpte, pte,
+							 target_node, &folio);
+				if (ret) {
+
+					/* determine batch to skip */
+					nr_ptes = mprotect_folio_pte_batch(folio,
+							pte, oldpte, max_nr_ptes);
 					continue;
+				}
 			}
 
 			oldpte = ptep_modify_prot_start(vma, addr, pte);
@@ -289,7 +318,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				pages++;
 			}
 		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-- 
2.30.2
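As a side illustration of the batch-skip idea in the patch above, the
following stand-alone user-space sketch models it with plain arrays. The
folio ids, array contents and function names are invented for the example;
in the kernel the length of such a run is computed by folio_pte_batch()
instead.

#include <stdio.h>

/* Model: consecutive "ptes" share a folio when their folio id is equal. */
static int batch_len(const int *folio_id, int i, int max)
{
	int n = 1;

	while (n < max && folio_id[i + n] == folio_id[i])
		n++;
	return n;
}

int main(void)
{
	/* 8 ptes: a 4-page folio, a 1-page folio, then a 3-page folio */
	const int folio_id[] = { 1, 1, 1, 1, 2, 3, 3, 3 };
	const int nr = sizeof(folio_id) / sizeof(folio_id[0]);

	for (int i = 0; i < nr; ) {
		int n = batch_len(folio_id, i, nr - i);

		/* One skip/process decision covers the whole batch. */
		printf("ptes %d..%d map one folio: handled as a batch of %d\n",
		       i, i + n - 1, n);
		i += n;
	}
	return 0;
}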
From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 3/7] mm: Add batched versions of ptep_modify_prot_start/commit
Date: Fri, 18 Jul 2025 14:32:40 +0530
Message-Id: <20250718090244.21092-4-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

Batch ptep_modify_prot_start/commit in preparation for optimizing
mprotect, implementing them as a simple loop over the corresponding
single pte helpers. An architecture may override these helpers.

Signed-off-by: Dev Jain
Reviewed-by: Barry Song
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Ryan Roberts
---
 include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  4 +-
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cf1515c163e2..e3b99920be05 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1331,7 +1331,9 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 
 /*
  * Commit an update to a pte, leaving any hardware-controlled bits in
- * the PTE unmodified.
+ * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
+ * additionally have young and/or dirty bits set where previously they were not,
+ * so the updated pte may have these additional changes.
  */
 static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 					   unsigned long addr,
@@ -1340,6 +1342,86 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 	__ptep_modify_prot_commit(vma, addr, ptep, pte);
 }
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
+/**
+ * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
+ * over a batch of ptes, which protects against asynchronous hardware
+ * modifications to the ptes. The intention is not to prevent the hardware from
+ * making pte updates, but to prevent any updates it may make from being lost.
+ * Please see the comment above ptep_modify_prot_start() for full description.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
+ * in the batch.
+ *
+ * Note that PTE bits in the PTE batch besides the PFN can differ.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. All other PTE bits must be identical for
+ * all PTEs in the batch except for young and dirty bits. The PTEs are all in
+ * the same PMD.
+ */
+#ifndef modify_prot_start_ptes
+static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+	pte_t pte, tmp_pte;
+
+	pte = ptep_modify_prot_start(vma, addr, ptep);
+	while (--nr) {
+		ptep++;
+		addr += PAGE_SIZE;
+		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
+		if (pte_dirty(tmp_pte))
+			pte = pte_mkdirty(pte);
+		if (pte_young(tmp_pte))
+			pte = pte_mkyoung(pte);
+	}
+	return pte;
+}
+#endif
+
+/**
+ * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
+ * hardware-controlled bits in the PTE unmodified.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @old_pte: Old page table entry (for the first entry) which is now cleared.
+ * @pte: New page table entry to be set.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_commit().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all in the same
+ * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by
+ * ptep_modify_prot_start() may additionally have young and/or dirty bits set
+ * where previously they were not, so the updated ptes may have these
+ * additional changes.
+ */
+#ifndef modify_prot_commit_ptes
+static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) {
+		ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+
+		/* Advance PFN only, set same prot */
+		old_pte = pte_next_pfn(old_pte);
+		pte = pte_next_pfn(pte);
+	}
+}
+#endif
+
 #endif /* CONFIG_MMU */
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 97adc62c50ab..4977f198168e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -204,7 +204,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				}
 			}
 
-			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
 			if (uffd_wp)
@@ -230,7 +230,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent, vma);
 
-			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
 			pages++;
-- 
2.30.2
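To show what the generic helper's accumulation of access/dirty bits means
in practice, here is a stand-alone user-space model. The pte representation
and bit values are invented for the example; only the fold-young/dirty-
into-the-first-entry logic mirrors modify_prot_start_ptes() above.

#include <stdio.h>

#define MODEL_PTE_YOUNG 0x1UL
#define MODEL_PTE_DIRTY 0x2UL

typedef unsigned long model_pte_t;

/* Model of the generic loop: return the first entry with a/d bits OR-ed in. */
static model_pte_t model_start_ptes(const model_pte_t *ptep, unsigned int nr)
{
	model_pte_t pte = ptep[0];

	for (unsigned int i = 1; i < nr; i++)
		pte |= ptep[i] & (MODEL_PTE_YOUNG | MODEL_PTE_DIRTY);
	return pte;
}

int main(void)
{
	/* First entry is clean and old, but another entry in the batch is not. */
	model_pte_t batch[4] = { 0, 0, MODEL_PTE_YOUNG | MODEL_PTE_DIRTY, 0 };
	model_pte_t pte = model_start_ptes(batch, 4);

	printf("young=%d dirty=%d\n",
	       (int)!!(pte & MODEL_PTE_YOUNG), (int)!!(pte & MODEL_PTE_DIRTY));
	return 0;
}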
From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 4/7] mm: Introduce FPB_RESPECT_WRITE for PTE batching infrastructure
Date: Fri, 18 Jul 2025 14:32:41 +0530
Message-Id: <20250718090244.21092-5-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

Patch 6 optimizes mprotect() by batch clearing the ptes, masking in the
new protections, and batch setting the ptes. Suppose that the first pte
of the batch is writable - with the current implementation of
folio_pte_batch(), it is not guaranteed that the other ptes in the batch
are already writable too, so we may incorrectly end up setting the
writable bit on all ptes via modify_prot_commit_ptes().

Therefore, introduce FPB_RESPECT_WRITE so that a batch only contains ptes
that agree on the writable bit.

Signed-off-by: Dev Jain
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Ryan Roberts
---
 mm/internal.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 5b0f71e5434b..28d2d5b051df 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -208,17 +208,20 @@ typedef int __bitwise fpb_t;
 /* Compare PTEs respecting the soft-dirty bit. */
 #define FPB_RESPECT_SOFT_DIRTY		((__force fpb_t)BIT(1))
 
+/* Compare PTEs respecting the writable bit. */
+#define FPB_RESPECT_WRITE		((__force fpb_t)BIT(2))
+
 /*
  * Merge PTE write bits: if any PTE in the batch is writable, modify the
  * PTE at @ptentp to be writable.
  */
-#define FPB_MERGE_WRITE			((__force fpb_t)BIT(2))
+#define FPB_MERGE_WRITE			((__force fpb_t)BIT(3))
 
 /*
  * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
  * modify the PTE at @ptentp to be young or dirty, respectively.
  */
-#define FPB_MERGE_YOUNG_DIRTY		((__force fpb_t)BIT(3))
+#define FPB_MERGE_YOUNG_DIRTY		((__force fpb_t)BIT(4))
 
 static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
 {
@@ -226,7 +229,9 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
 		pte = pte_mkclean(pte);
 	if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
 		pte = pte_clear_soft_dirty(pte);
-	return pte_wrprotect(pte_mkold(pte));
+	if (likely(!(flags & FPB_RESPECT_WRITE)))
+		pte = pte_wrprotect(pte);
+	return pte_mkold(pte);
 }
 
 /**
-- 
2.30.2

From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 5/7] mm: Split can_change_pte_writable() into private and shared parts
Date: Fri, 18 Jul 2025 14:32:42 +0530
Message-Id: <20250718090244.21092-6-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

In preparation for patch 6 and modularizing the code in general, split
can_change_pte_writable() into private and shared VMA parts.
No functional change intended.

Suggested-by: Lorenzo Stoakes
Signed-off-by: Dev Jain
Reviewed-by: Lorenzo Stoakes
(Suggested-by here makes me somewhat biased :P) :>)
---
 mm/mprotect.c | 50 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4977f198168e..a1c7d8a4648d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -40,11 +40,8 @@
 
 #include "internal.h"
 
-bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte)
+static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte)
 {
-	struct page *page;
-
 	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
 		return false;
 
@@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	if (userfaultfd_pte_wp(vma, pte))
 		return false;
 
-	if (!(vma->vm_flags & VM_SHARED)) {
-		/*
-		 * Writable MAP_PRIVATE mapping: We can only special-case on
-		 * exclusive anonymous pages, because we know that our
-		 * write-fault handler similarly would map them writable without
-		 * any additional checks while holding the PT lock.
-		 */
-		page = vm_normal_page(vma, addr, pte);
-		return page && PageAnon(page) && PageAnonExclusive(page);
-	}
+	return true;
+}
+
+static bool can_change_private_pte_writable(struct vm_area_struct *vma,
+					    unsigned long addr, pte_t pte)
+{
+	struct page *page;
+
+	if (!maybe_change_pte_writable(vma, pte))
+		return false;
+
+	/*
+	 * Writable MAP_PRIVATE mapping: We can only special-case on
+	 * exclusive anonymous pages, because we know that our
+	 * write-fault handler similarly would map them writable without
+	 * any additional checks while holding the PT lock.
+	 */
+	page = vm_normal_page(vma, addr, pte);
+	return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
+static bool can_change_shared_pte_writable(struct vm_area_struct *vma,
+					   pte_t pte)
+{
+	if (!maybe_change_pte_writable(vma, pte))
+		return false;
 
 	VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
 
@@ -83,6 +96,15 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	return pte_dirty(pte);
 }
 
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t pte)
+{
+	if (!(vma->vm_flags & VM_SHARED))
+		return can_change_private_pte_writable(vma, addr, pte);
+
+	return can_change_shared_pte_writable(vma, pte);
+}
+
 static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
 		pte_t pte, int max_nr_ptes)
 {
-- 
2.30.2
From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 6/7] mm: Optimize mprotect() by PTE batching
Date: Fri, 18 Jul 2025 14:32:43 +0530
Message-Id: <20250718090244.21092-7-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

Use folio_pte_batch to batch process a large folio. Note that PTE
batching here will save a few function calls, and this strategy in
certain cases (not this one) batches atomic operations in general, so we
have a performance win for all arches. This patch paves the way for
patch 7 which will help us elide the TLBI per contig block on arm64.

The correctness of this patch rests on the correctness of setting the
new ptes based upon information only from the first pte of the batch
(which may also have accumulated a/d bits via modify_prot_start_ptes()).

Observe that the flag combination we pass to mprotect_folio_pte_batch()
guarantees that the batch is uniform w.r.t. the soft-dirty bit and the
writable bit. Therefore, the only bits which may differ are the a/d
bits. So we only need to worry about code which is concerned with the
a/d bits of the PTEs.

Setting extra a/d bits on the new ptes where previously they were not
set is fine: setting the access bit when it was not set is not a
correctness problem, but will only possibly delay the reclaim of the
page mapped by the pte (which is in fact intended, because the kernel
just operated on this region via mprotect()!). Setting the dirty bit
when it was not set is again not a correctness problem, but will only
possibly force an unnecessary writeback.

So now we need to reason about whether something can go wrong via
can_change_pte_writable(). The pte_protnone, pte_needs_soft_dirty_wp,
and userfaultfd_pte_wp cases are solved due to uniformity in the
corresponding bits guaranteed by the flag combination. The ptes all
belong to the same VMA (since callers guarantee that [start, end) will
lie within the VMA), therefore the conditional based on the VMA is also
safe to batch around.
Since the dirty bit on the PTE really is just an indication that the
folio got written to - even if the PTE is not actually dirty but one of
the PTEs in the batch is, the wp-fault optimization can be made.
Therefore, it is safe to batch around pte_dirty() in
can_change_shared_pte_writable() (in fact this is better, since without
batching it may happen that some ptes aren't changed to writable just
because they are not dirty, even though the other ptes mapping the same
large folio are dirty).

To batch around the PageAnonExclusive case, we must check the
corresponding condition for every single page. Therefore, from the large
folio batch, we process sub batches of ptes mapping pages with the same
PageAnonExclusive condition, and process that sub batch, then determine
and process the next sub batch, and so on. Note that this does not cause
any extra overhead; if, say, the size of the folio batch is 512, then
the sub batch processing in total will take 512 iterations, which is the
same as what we would have done before.

For pte_needs_flush(): ppc does not care about the a/d bits. For x86,
PAGE_SAVED_DIRTY is ignored. We will flush only when a/d bits get
cleared; since we can only have extra a/d bits due to batching, we will
only have an extra flush, not a case where we elide a flush due to
batching when we shouldn't have.

Signed-off-by: Dev Jain
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Zi Yan
---
 mm/mprotect.c | 125 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 12 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index a1c7d8a4648d..2ddd37b2f462 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -106,7 +106,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 }
 
 static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
-		pte_t pte, int max_nr_ptes)
+		pte_t pte, int max_nr_ptes, fpb_t flags)
 {
 	/* No underlying folio, so cannot batch */
 	if (!folio)
@@ -115,7 +115,7 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
 	if (!folio_test_large(folio))
 		return 1;
 
-	return folio_pte_batch(folio, ptep, pte, max_nr_ptes);
+	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
 }
 
 static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
@@ -177,6 +177,102 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
 	return ret;
 }
 
+/* Set nr_ptes number of ptes, starting from idx */
+static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
+		int idx, bool set_write, struct mmu_gather *tlb)
+{
+	/*
+	 * Advance the position in the batch by idx; note that if idx > 0,
+	 * then the nr_ptes passed here is <= batch size - idx.
+	 */
+	addr += idx * PAGE_SIZE;
+	ptep += idx;
+	oldpte = pte_advance_pfn(oldpte, idx);
+	ptent = pte_advance_pfn(ptent, idx);
+
+	if (set_write)
+		ptent = pte_mkwrite(ptent, vma);
+
+	modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes);
+	if (pte_needs_flush(oldpte, ptent))
+		tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
+}
+
+/*
+ * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
+ * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
+ * that the ptes point to consecutive pages of the same anon large folio.
+ */
+static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+		struct page *first_page, bool expected_anon_exclusive)
+{
+	int idx;
+
+	for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
+		if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
+			break;
+	}
+	return idx - start_idx;
+}
+
+/*
+ * This function is a result of trying our very best to retain the
+ * "avoid the write-fault handler" optimization. In can_change_pte_writable(),
+ * if the vma is a private vma, and we cannot determine whether to change
+ * the pte to writable just from the vma and the pte, we then need to look
+ * at the actual page pointed to by the pte. Unfortunately, if we have a
+ * batch of ptes pointing to consecutive pages of the same anon large folio,
+ * the anon-exclusivity (or the negation) of the first page does not guarantee
+ * the anon-exclusivity (or the negation) of the other pages corresponding to
+ * the pte batch; hence in this case it is incorrect to decide to change or
+ * not change the ptes to writable just by using information from the first
+ * pte of the batch. Therefore, we must individually check all pages and
+ * retrieve sub-batches.
+ */
+static void commit_anon_folio_batch(struct vm_area_struct *vma,
+		struct folio *folio, unsigned long addr, pte_t *ptep,
+		pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+	struct page *first_page = folio_page(folio, 0);
+	bool expected_anon_exclusive;
+	int sub_batch_idx = 0;
+	int len;
+
+	while (nr_ptes) {
+		expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
+		len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
+					first_page, expected_anon_exclusive);
+		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len,
+				       sub_batch_idx, expected_anon_exclusive, tlb);
+		sub_batch_idx += len;
+		nr_ptes -= len;
+	}
+}
+
+static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+		struct folio *folio, unsigned long addr, pte_t *ptep,
+		pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+	bool set_write;
+
+	if (vma->vm_flags & VM_SHARED) {
+		set_write = can_change_shared_pte_writable(vma, ptent);
+		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+				       /* idx = */ 0, set_write, tlb);
+		return;
+	}
+
+	set_write = maybe_change_pte_writable(vma, ptent) &&
+		    (folio && folio_test_anon(folio));
+	if (!set_write) {
+		prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+				       /* idx = */ 0, set_write, tlb);
+		return;
+	}
+	commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb);
+}
+
 static long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -206,8 +302,9 @@ static long change_pte_range(struct mmu_gather *tlb,
 		nr_ptes = 1;
 		oldpte = ptep_get(pte);
 		if (pte_present(oldpte)) {
+			const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
 			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
-			struct folio *folio;
+			struct folio *folio = NULL;
 			pte_t ptent;
 
 			/*
@@ -221,11 +318,16 @@ static long change_pte_range(struct mmu_gather *tlb,
 
 					/* determine batch to skip */
 					nr_ptes = mprotect_folio_pte_batch(folio,
-							pte, oldpte, max_nr_ptes);
+							pte, oldpte, max_nr_ptes, /* flags = */ 0);
 					continue;
 				}
 			}
 
+			if (!folio)
+				folio = vm_normal_folio(vma, addr, oldpte);
+
+			nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
+
 			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
@@ -248,14 +350,13 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * COW or special handling is required.
 			 */
 			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
-			    !pte_write(ptent) &&
-			    can_change_pte_writable(vma, addr, ptent))
-				ptent = pte_mkwrite(ptent, vma);
-
-			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
-			if (pte_needs_flush(oldpte, ptent))
-				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
-			pages++;
+			    !pte_write(ptent))
+				set_write_prot_commit_flush_ptes(vma, folio,
+				addr, pte, oldpte, ptent, nr_ptes, tlb);
+			else
+				prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
+				nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+			pages += nr_ptes;
 		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 			pte_t newpte;
-- 
2.30.2
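For context, this is the kind of user-space pattern the series speeds up:
fault in a large anonymous mapping and change its protection in one
mprotect() call. The sketch below is illustrative only; whether the region
is actually backed by large folios mapped at PTE level depends on the
kernel's THP/mTHP configuration, and the sizes chosen here are arbitrary.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 32UL << 20;	/* 32 MiB, arbitrary for the example */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask for large folios so the PTE batches can actually be long. */
	madvise(buf, len, MADV_HUGEPAGE);

	/* Fault the pages in so there are PTEs to change. */
	memset(buf, 1, len);

	/* A single protection change over the whole range. */
	if (mprotect(buf, len, PROT_READ)) {
		perror("mprotect");
		return 1;
	}

	puts("mprotect() over the whole mapping succeeded");
	munmap(buf, len);
	return 0;
}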
From nobody Mon Oct 6 17:02:03 2025
From: Dev Jain
Subject: [PATCH v5 7/7] arm64: Add batched versions of ptep_modify_prot_start/commit
Date: Fri, 18 Jul 2025 14:32:44 +0530
Message-Id: <20250718090244.21092-8-dev.jain@arm.com>
In-Reply-To: <20250718090244.21092-1-dev.jain@arm.com>
References: <20250718090244.21092-1-dev.jain@arm.com>

Override the generic definition of modify_prot_start_ptes() to use
get_and_clear_full_ptes(). This helper does a TLBI only for the starting
and ending contpte block of the range, whereas the current implementation
will call ptep_get_and_clear() for every contpte block, thus doing a TLBI
on every contpte block. Therefore, we have a performance win.

The arm64 definition of pte_accessible() allows us to batch in the errata
specific case:

#define pte_accessible(mm, pte) \
	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))

All ptes are obviously present in the folio batch, and they are also
valid.

Override the generic definition of modify_prot_commit_ptes() to simply
use set_ptes() to map the new ptes into the pagetable.

Reviewed-by: Ryan Roberts
Signed-off-by: Dev Jain
Reviewed-by: Catalin Marinas
---
 arch/arm64/include/asm/pgtable.h | 10 ++++++++++
 arch/arm64/mm/mmu.c              | 28 +++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ba63c8736666..abd2dee416b3 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1643,6 +1643,16 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
 				    unsigned long addr, pte_t *ptep,
 				    pte_t old_pte, pte_t new_pte);
 
+#define modify_prot_start_ptes modify_prot_start_ptes
+extern pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+				    unsigned long addr, pte_t *ptep,
+				    unsigned int nr);
+
+#define modify_prot_commit_ptes modify_prot_commit_ptes
+extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+				    pte_t *ptep, pte_t old_pte, pte_t pte,
+				    unsigned int nr);
+
 #ifdef CONFIG_ARM64_CONTPTE
 
 /*
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3d5fb37424ab..abd9725796e9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -1524,24 +1525,41 @@ static int __init prevent_bootmem_remove_init(void)
 early_initcall(prevent_bootmem_remove_init);
 #endif
 
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, unsigned int nr)
 {
+	pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
+
 	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
 		 * Break-before-make (BBM) is required for all user space mappings
 		 * when the permission changes from executable to non-executable
 		 * in cases where cpu is affected with errata #2645198.
 		 */
-		if (pte_user_exec(ptep_get(ptep)))
-			return ptep_clear_flush(vma, addr, ptep);
+		if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+			__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
+					  PAGE_SIZE, true, 3);
 	}
-	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+	return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+	return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, pte_t old_pte, pte_t pte,
+			     unsigned int nr)
+{
+	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
 }
 
 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep, pte_t old_pte, pte_t pte)
 {
-	set_pte_at(vma->vm_mm, addr, ptep, pte);
+	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
 }
 
 /*
-- 
2.30.2
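A rough way to observe the effect of the series end to end is to time
repeated mprotect() calls over such a mapping, as in the sketch below. This
is not the benchmark used to justify the series; absolute numbers depend
entirely on the kernel configuration (THP/mTHP sizes, arm64 contpte
support) and the hardware, so treat it only as a starting point.

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB, arbitrary */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct timespec t0, t1;

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	madvise(buf, len, MADV_HUGEPAGE);
	memset(buf, 1, len);		/* populate the mapping */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < 100; i++) {
		/* Toggle protections so every iteration changes the PTEs. */
		mprotect(buf, len, PROT_READ);
		mprotect(buf, len, PROT_READ | PROT_WRITE);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("200 mprotect() calls took %.3f ms\n",
	       (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6);
	munmap(buf, len);
	return 0;
}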