From nobody Wed Oct 8 11:37:19 2025
From: Dev Jain <dev.jain@arm.com>
To: akpm@linux-foundation.org
Cc: ryan.roberts@arm.com, david@redhat.com, willy@infradead.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	catalin.marinas@arm.com, will@kernel.org, Liam.Howlett@oracle.com,
	lorenzo.stoakes@oracle.com, vbabka@suse.cz, jannh@google.com,
	anshuman.khandual@arm.com, peterx@redhat.com, joey.gouly@arm.com,
	ioworker0@gmail.com, baohua@kernel.org, kevin.brodsky@arm.com,
	quic_zhenhuah@quicinc.com, christophe.leroy@csgroup.eu,
	yangyicong@hisilicon.com, linux-arm-kernel@lists.infradead.org,
	hughd@google.com, yang@os.amperecomputing.com, ziy@nvidia.com,
	Dev Jain <dev.jain@arm.com>
Subject: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs
Date: Sat, 28 Jun 2025 17:04:32 +0530
Message-Id: <20250628113435.46678-2-dev.jain@arm.com>
In-Reply-To: <20250628113435.46678-1-dev.jain@arm.com>
References: <20250628113435.46678-1-dev.jain@arm.com>

In the prot_numa case, there are several situations in which we can skip
to the next iteration. Since the skip condition is based on the folio and
not on the individual PTEs, we can skip an entire PTE batch. Additionally,
refactor all of this into a new function to clean up the existing code.
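To illustrate the loop-shape change in isolation: the per-PTE "continue"
becomes a per-batch decision, and the loop advances by the batch length.
A minimal stand-alone C sketch of that pattern (hypothetical names and a
plain array standing in for PTEs/folios, not kernel code):

	#include <stdio.h>

	/* Hypothetical stand-in for folio_pte_batch(): number of consecutive
	 * entries starting at i that belong to the same group ("folio"). */
	static int batch_len(const int *group, int i, int n)
	{
		int nr = 1;

		while (i + nr < n && group[i + nr] == group[i])
			nr++;
		return nr;
	}

	int main(void)
	{
		/* One entry per "PTE"; the skip decision depends only on the
		 * group, so deciding once per batch equals deciding per entry. */
		int group[] = { 0, 0, 0, 1, 2, 2, 2, 2 };
		int n = sizeof(group) / sizeof(group[0]);
		int skipped = 0;

		for (int i = 0; i < n; ) {
			int nr = batch_len(group, i, n);

			if (group[i] % 2 == 0)	/* group-level "skip" condition */
				skipped += nr;	/* skip the whole batch at once */
			i += nr;		/* advance by the batch size */
		}
		printf("skipped %d of %d entries\n", skipped, n);
		return 0;
	}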
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 87 insertions(+), 47 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88709c01177b..af10a7fbe6b8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	return pte_dirty(pte);
 }
 
+static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
+		pte_t *ptep, pte_t pte, int max_nr_ptes)
+{
+	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+
+	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
+		return 1;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
+			       NULL, NULL, NULL);
+}
+
+static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
+		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
+		int max_nr_ptes)
+{
+	struct folio *folio = NULL;
+	int nr_ptes = 1;
+	bool toptier;
+	int nid;
+
+	/* Avoid TLB flush if possible */
+	if (pte_protnone(oldpte))
+		goto skip_batch;
+
+	folio = vm_normal_folio(vma, addr, oldpte);
+	if (!folio)
+		goto skip_batch;
+
+	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+		goto skip_batch;
+
+	/* Also skip shared copy-on-write pages */
+	if (is_cow_mapping(vma->vm_flags) &&
+	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+		goto skip_batch;
+
+	/*
+	 * While migration can move some dirty pages,
+	 * it cannot move them all from MIGRATE_ASYNC
+	 * context.
+	 */
+	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+		goto skip_batch;
+
+	/*
+	 * Don't mess with PTEs if page is already on the node
+	 * a single-threaded process is running on.
+	 */
+	nid = folio_nid(folio);
+	if (target_node == nid)
+		goto skip_batch;
+
+	toptier = node_is_toptier(nid);
+
+	/*
+	 * Skip scanning top tier node if normal numa
+	 * balancing is disabled
+	 */
+	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+		goto skip_batch;
+
+	if (folio_use_access_time(folio)) {
+		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+		/* Do not skip in this case */
+		nr_ptes = 0;
+		goto out;
+	}
+
+skip_batch:
+	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
+out:
+	*foliop = folio;
+	return nr_ptes;
+}
+
 static long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+	int nr_ptes;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
+		nr_ptes = 1;
 		oldpte = ptep_get(pte);
 		if (pte_present(oldpte)) {
+			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+			struct folio *folio = NULL;
 			pte_t ptent;
 
 			/*
@@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * pages. See similar comment in change_huge_pmd.
 			 */
 			if (prot_numa) {
-				struct folio *folio;
-				int nid;
-				bool toptier;
-
-				/* Avoid TLB flush if possible */
-				if (pte_protnone(oldpte))
-					continue;
-
-				folio = vm_normal_folio(vma, addr, oldpte);
-				if (!folio || folio_is_zone_device(folio) ||
-				    folio_test_ksm(folio))
-					continue;
-
-				/* Also skip shared copy-on-write pages */
-				if (is_cow_mapping(vma->vm_flags) &&
-				    (folio_maybe_dma_pinned(folio) ||
-				     folio_maybe_mapped_shared(folio)))
-					continue;
-
-				/*
-				 * While migration can move some dirty pages,
-				 * it cannot move them all from MIGRATE_ASYNC
-				 * context.
-				 */
-				if (folio_is_file_lru(folio) &&
-				    folio_test_dirty(folio))
-					continue;
-
-				/*
-				 * Don't mess with PTEs if page is already on the node
-				 * a single-threaded process is running on.
-				 */
-				nid = folio_nid(folio);
-				if (target_node == nid)
-					continue;
-				toptier = node_is_toptier(nid);
-
-				/*
-				 * Skip scanning top tier node if normal numa
-				 * balancing is disabled
-				 */
-				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
-				    toptier)
+				nr_ptes = prot_numa_skip_ptes(&folio, vma,
+							      addr, oldpte, pte,
+							      target_node,
+							      max_nr_ptes);
+				if (nr_ptes)
 					continue;
-				if (folio_use_access_time(folio))
-					folio_xchg_access_time(folio,
-						jiffies_to_msecs(jiffies));
 			}
 
 			oldpte = ptep_modify_prot_start(vma, addr, pte);
@@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				pages++;
 			}
 		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-- 
2.30.2

From nobody Wed Oct 8 11:37:19 2025
From: Dev Jain <dev.jain@arm.com>
To: akpm@linux-foundation.org
Cc: ryan.roberts@arm.com, david@redhat.com, willy@infradead.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	catalin.marinas@arm.com, will@kernel.org, Liam.Howlett@oracle.com,
	lorenzo.stoakes@oracle.com,
	vbabka@suse.cz, jannh@google.com, anshuman.khandual@arm.com,
	peterx@redhat.com, joey.gouly@arm.com, ioworker0@gmail.com,
	baohua@kernel.org, kevin.brodsky@arm.com, quic_zhenhuah@quicinc.com,
	christophe.leroy@csgroup.eu, yangyicong@hisilicon.com,
	linux-arm-kernel@lists.infradead.org, hughd@google.com,
	yang@os.amperecomputing.com, ziy@nvidia.com,
	Dev Jain <dev.jain@arm.com>
Subject: [PATCH v4 2/4] mm: Add batched versions of ptep_modify_prot_start/commit
Date: Sat, 28 Jun 2025 17:04:33 +0530
Message-Id: <20250628113435.46678-3-dev.jain@arm.com>
In-Reply-To: <20250628113435.46678-1-dev.jain@arm.com>
References: <20250628113435.46678-1-dev.jain@arm.com>

Batch ptep_modify_prot_start/commit in preparation for optimizing
mprotect(). Architectures can override these helpers; if they do not, the
helpers fall back to a simple loop over the corresponding single-PTE
helpers.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/pgtable.h | 83 ++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  4 +-
 2 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cf1515c163e2..662f39e7475a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1331,7 +1331,8 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 
 /*
  * Commit an update to a pte, leaving any hardware-controlled bits in
- * the PTE unmodified.
+ * the PTE unmodified. The pte may have been "upgraded" w.r.t a/d bits compared
+ * to the old_pte, as in, it may have a/d bits on which were off in old_pte.
  */
 static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 					   unsigned long addr,
@@ -1340,6 +1341,86 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 	__ptep_modify_prot_commit(vma, addr, ptep, pte);
 }
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
+/**
+ * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
+ * over a batch of ptes, which protects against asynchronous hardware
+ * modifications to the ptes. The intention is not to prevent the hardware from
+ * making pte updates, but to prevent any updates it may make from being lost.
+ * Please see the comment above ptep_modify_prot_start() for full description.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
+ * in the batch.
+ *
+ * Note that PTE bits in the PTE batch besides the PFN can differ.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ * Since the batch is determined from folio_pte_batch, the PTEs must differ
+ * only in a/d bits (and the soft dirty bit; see fpb_t flags in
+ * mprotect_folio_pte_batch()).
+ */
+#ifndef modify_prot_start_ptes
+static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+	pte_t pte, tmp_pte;
+
+	pte = ptep_modify_prot_start(vma, addr, ptep);
+	while (--nr) {
+		ptep++;
+		addr += PAGE_SIZE;
+		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
+		if (pte_dirty(tmp_pte))
+			pte = pte_mkdirty(pte);
+		if (pte_young(tmp_pte))
+			pte = pte_mkyoung(pte);
+	}
+	return pte;
+}
+#endif
+
+/**
+ * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
+ * hardware-controlled bits in the PTE unmodified.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @old_pte: Old page table entry (for the first entry) which is now cleared.
+ * @pte: New page table entry to be set.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_commit().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all in the same
+ * PMD. On exit, the set ptes in the batch map the same folio. The pte may have
+ * been "upgraded" w.r.t a/d bits compared to the old_pte, as in, it may have
+ * a/d bits on which were off in old_pte.
+ */
+#ifndef modify_prot_commit_ptes
+static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; ++i) {
+		ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+		ptep++;
+		addr += PAGE_SIZE;
+		old_pte = pte_next_pfn(old_pte);
+		pte = pte_next_pfn(pte);
+	}
+}
+#endif
+
 #endif /* CONFIG_MMU */
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index af10a7fbe6b8..627b0d67cc4a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -206,7 +206,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				continue;
 			}
 
-			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
 			if (uffd_wp)
@@ -232,7 +232,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent, vma);
 
-			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
 			pages++;
-- 
2.30.2

From nobody Wed Oct 8 11:37:19 2025
From: Dev Jain <dev.jain@arm.com>
To: akpm@linux-foundation.org
Cc: ryan.roberts@arm.com, david@redhat.com, willy@infradead.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	catalin.marinas@arm.com, will@kernel.org, Liam.Howlett@oracle.com,
	lorenzo.stoakes@oracle.com, vbabka@suse.cz, jannh@google.com,
	anshuman.khandual@arm.com, peterx@redhat.com, joey.gouly@arm.com,
	ioworker0@gmail.com, baohua@kernel.org, kevin.brodsky@arm.com,
	quic_zhenhuah@quicinc.com, christophe.leroy@csgroup.eu,
	yangyicong@hisilicon.com, linux-arm-kernel@lists.infradead.org,
	hughd@google.com, yang@os.amperecomputing.com, ziy@nvidia.com,
	Dev Jain <dev.jain@arm.com>
Subject: [PATCH v4 3/4] mm: Optimize mprotect() by PTE-batching
Date: Sat, 28 Jun 2025 17:04:34 +0530
Message-Id: <20250628113435.46678-4-dev.jain@arm.com>
In-Reply-To: <20250628113435.46678-1-dev.jain@arm.com>
References: <20250628113435.46678-1-dev.jain@arm.com>

Use folio_pte_batch() to batch-process a large folio. Reuse the folio from
the prot_numa case if possible.

For all cases other than the PageAnonExclusive case, if a condition holds
true for one PTE in the batch, it will hold true for the other PTEs in the
batch too; for pte_needs_soft_dirty_wp(), we do not pass
FPB_IGNORE_SOFT_DIRTY. modify_prot_start_ptes() collects the dirty and
access bits across the batch, therefore batching across pte_dirty(): this
is correct since the dirty bit on the PTE is really just an indication
that the folio got written to, so even if the PTE is not actually dirty
(but one of the PTEs in the batch is), the wp-fault optimization can be
made.

The crux now is how to batch around the PageAnonExclusive case: we must
check the corresponding condition for every single page. Therefore, from
the large folio batch, we process sub-batches of PTEs mapping pages with
the same PageAnonExclusive value, then determine and process the next
sub-batch, and so on. Note that this does not cause any extra overhead;
if, say, the size of the folio batch is 512, then the sub-batch processing
in total will take 512 iterations, which is the same as what we would have
done before.
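For intuition, the sub-batch walk is just a partition of a run of N
per-page flags into maximal runs of equal value; the runs are handled one
at a time and their lengths sum to N. A stand-alone C sketch of that idea
(hypothetical names and a plain bool array standing in for
PageAnonExclusive(), not the kernel code itself):

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical analogue of anon_exclusive_batch(): length of the run
	 * of equal flag values starting at idx; the value is returned via
	 * *value. */
	static int run_len(const bool *flag, int idx, int max_nr, bool *value)
	{
		int nr = 1;

		*value = flag[idx];
		while (nr < max_nr && flag[idx + nr] == *value)
			nr++;
		return nr;
	}

	int main(void)
	{
		/* One flag per page of the folio batch. */
		bool flag[] = { true, true, false, false, false, true };
		int nr_left = sizeof(flag) / sizeof(flag[0]);
		int idx = 0;

		while (nr_left) {
			bool value;
			int nr = run_len(flag, idx, nr_left, &value);

			/* one decision applied to the whole sub-batch */
			printf("sub-batch at %d: %d pages, exclusive=%d\n",
			       idx, nr, value);
			idx += nr;
			nr_left -= nr;
		}
		return 0;
	}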
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/mprotect.c | 143 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 117 insertions(+), 26 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 627b0d67cc4a..28c7ce7728ff 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -40,35 +40,47 @@
 
 #include "internal.h"
 
-bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte)
-{
-	struct page *page;
+enum tristate {
+	TRI_FALSE = 0,
+	TRI_TRUE = 1,
+	TRI_MAYBE = -1,
+};
 
+/*
+ * Returns enum tristate indicating whether the pte can be changed to writable.
+ * If TRI_MAYBE is returned, then the folio is anonymous and the user must
+ * additionally check PageAnonExclusive() for every page in the desired range.
+ */
+static int maybe_change_pte_writable(struct vm_area_struct *vma,
+				     unsigned long addr, pte_t pte,
+				     struct folio *folio)
+{
 	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
-		return false;
+		return TRI_FALSE;
 
 	/* Don't touch entries that are not even readable. */
 	if (pte_protnone(pte))
-		return false;
+		return TRI_FALSE;
 
 	/* Do we need write faults for softdirty tracking? */
 	if (pte_needs_soft_dirty_wp(vma, pte))
-		return false;
+		return TRI_FALSE;
 
 	/* Do we need write faults for uffd-wp tracking? */
 	if (userfaultfd_pte_wp(vma, pte))
-		return false;
+		return TRI_FALSE;
 
 	if (!(vma->vm_flags & VM_SHARED)) {
 		/*
 		 * Writable MAP_PRIVATE mapping: We can only special-case on
 		 * exclusive anonymous pages, because we know that our
 		 * write-fault handler similarly would map them writable without
-		 * any additional checks while holding the PT lock.
+		 * any additional checks while holding the PT lock. So if the
+		 * folio is not anonymous, we know we cannot change pte to
+		 * writable. If it is anonymous then the caller must further
+		 * check that the page is AnonExclusive().
 		 */
-		page = vm_normal_page(vma, addr, pte);
-		return page && PageAnon(page) && PageAnonExclusive(page);
+		return (!folio || folio_test_anon(folio)) ? TRI_MAYBE : TRI_FALSE;
 	}
 
 	VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
@@ -80,15 +92,61 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	 * FS was already notified and we can simply mark the PTE writable
 	 * just like the write-fault handler would do.
 	 */
-	return pte_dirty(pte);
+	return pte_dirty(pte) ? TRI_TRUE : TRI_FALSE;
+}
+
+/*
+ * Returns the number of pages within the folio, starting from the page
+ * indicated by pgidx and up to pgidx + max_nr, that have the same value of
+ * PageAnonExclusive(). Must only be called for anonymous folios. Value of
+ * PageAnonExclusive() is returned in *exclusive.
+ */
+static int anon_exclusive_batch(struct folio *folio, int pgidx, int max_nr,
+				bool *exclusive)
+{
+	struct page *page;
+	int nr = 1;
+
+	if (!folio) {
+		*exclusive = false;
+		return nr;
+	}
+
+	page = folio_page(folio, pgidx++);
+	*exclusive = PageAnonExclusive(page);
+	while (nr < max_nr) {
+		page = folio_page(folio, pgidx++);
+		if ((*exclusive) != PageAnonExclusive(page))
+			break;
+		nr++;
+	}
+
+	return nr;
+}
+
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t pte)
+{
+	struct page *page;
+	int ret;
+
+	ret = maybe_change_pte_writable(vma, addr, pte, NULL);
+	if (ret == TRI_MAYBE) {
+		page = vm_normal_page(vma, addr, pte);
+		ret = page && PageAnon(page) && PageAnonExclusive(page);
+	}
+
+	return ret;
 }
 
 static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
-		pte_t *ptep, pte_t pte, int max_nr_ptes)
+		pte_t *ptep, pte_t pte, int max_nr_ptes, fpb_t switch_off_flags)
 {
-	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+
+	flags &= ~switch_off_flags;
 
-	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
+	if (!folio || !folio_test_large(folio))
 		return 1;
 
 	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
@@ -154,7 +212,8 @@ static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma
 	}
 
 skip_batch:
-	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
+	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte,
+					   max_nr_ptes, 0);
 out:
 	*foliop = folio;
 	return nr_ptes;
@@ -191,7 +250,10 @@ static long change_pte_range(struct mmu_gather *tlb,
 		if (pte_present(oldpte)) {
 			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
 			struct folio *folio = NULL;
-			pte_t ptent;
+			int sub_nr_ptes, pgidx = 0;
+			pte_t ptent, newpte;
+			bool sub_set_write;
+			int set_write;
 
 			/*
 			 * Avoid trapping faults against the zero or KSM
@@ -206,6 +268,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 				continue;
 			}
 
+			if (!folio)
+				folio = vm_normal_folio(vma, addr, oldpte);
+
+			nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte,
+						max_nr_ptes, FPB_IGNORE_SOFT_DIRTY);
 			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
@@ -227,15 +294,39 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * example, if a PTE is already dirty and no other
 			 * COW or special handling is required.
 			 */
-			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
-			    !pte_write(ptent) &&
-			    can_change_pte_writable(vma, addr, ptent))
-				ptent = pte_mkwrite(ptent, vma);
-
-			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
-			if (pte_needs_flush(oldpte, ptent))
-				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
-			pages++;
+			set_write = (cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+				    !pte_write(ptent);
+			if (set_write)
+				set_write = maybe_change_pte_writable(vma, addr, ptent, folio);
+
+			while (nr_ptes) {
+				if (set_write == TRI_MAYBE) {
+					sub_nr_ptes = anon_exclusive_batch(folio,
+						pgidx, nr_ptes, &sub_set_write);
+				} else {
+					sub_nr_ptes = nr_ptes;
+					sub_set_write = (set_write == TRI_TRUE);
+				}
+
+				if (sub_set_write)
+					newpte = pte_mkwrite(ptent, vma);
+				else
+					newpte = ptent;
+
+				modify_prot_commit_ptes(vma, addr, pte, oldpte,
+							newpte, sub_nr_ptes);
+				if (pte_needs_flush(oldpte, newpte))
+					tlb_flush_pte_range(tlb, addr,
+							sub_nr_ptes * PAGE_SIZE);
+
+				addr += sub_nr_ptes * PAGE_SIZE;
+				pte += sub_nr_ptes;
+				oldpte = pte_advance_pfn(oldpte, sub_nr_ptes);
+				ptent = pte_advance_pfn(ptent, sub_nr_ptes);
+				nr_ptes -= sub_nr_ptes;
+				pages += sub_nr_ptes;
+				pgidx += sub_nr_ptes;
+			}
 		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 			pte_t newpte;
-- 
2.30.2

From nobody Wed Oct 8 11:37:19 2025
From: Dev Jain <dev.jain@arm.com>
To: akpm@linux-foundation.org
Cc: ryan.roberts@arm.com, david@redhat.com, willy@infradead.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	catalin.marinas@arm.com, will@kernel.org, Liam.Howlett@oracle.com,
	lorenzo.stoakes@oracle.com, vbabka@suse.cz, jannh@google.com,
	anshuman.khandual@arm.com, peterx@redhat.com, joey.gouly@arm.com,
	ioworker0@gmail.com, baohua@kernel.org, kevin.brodsky@arm.com,
	quic_zhenhuah@quicinc.com, christophe.leroy@csgroup.eu,
	yangyicong@hisilicon.com,
	linux-arm-kernel@lists.infradead.org, hughd@google.com,
	yang@os.amperecomputing.com, ziy@nvidia.com,
	Dev Jain <dev.jain@arm.com>
Subject: [PATCH v4 4/4] arm64: Add batched versions of ptep_modify_prot_start/commit
Date: Sat, 28 Jun 2025 17:04:35 +0530
Message-Id: <20250628113435.46678-5-dev.jain@arm.com>
In-Reply-To: <20250628113435.46678-1-dev.jain@arm.com>
References: <20250628113435.46678-1-dev.jain@arm.com>

Override the generic definition of modify_prot_start_ptes() to use
get_and_clear_full_ptes(). This helper does a TLBI only for the starting
and ending contpte blocks of the range, whereas the current implementation
calls ptep_get_and_clear() for every contpte block and therefore does a
TLBI on every contpte block. Therefore, we have a performance win.

The arm64 definition of pte_accessible() allows us to batch in the
errata-specific case:

  #define pte_accessible(mm, pte)	\
	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))

All ptes in the folio batch are obviously present, and they are also
valid.

Override the generic definition of modify_prot_commit_ptes() to simply use
set_ptes() to map the new ptes into the pagetable.

Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
---
 arch/arm64/include/asm/pgtable.h | 10 ++++++++++
 arch/arm64/mm/mmu.c              | 28 +++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ba63c8736666..abd2dee416b3 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1643,6 +1643,16 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
 				    unsigned long addr, pte_t *ptep,
 				    pte_t old_pte, pte_t new_pte);
 
+#define modify_prot_start_ptes modify_prot_start_ptes
+extern pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+				    unsigned long addr, pte_t *ptep,
+				    unsigned int nr);
+
+#define modify_prot_commit_ptes modify_prot_commit_ptes
+extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+				    pte_t *ptep, pte_t old_pte, pte_t pte,
+				    unsigned int nr);
+
 #ifdef CONFIG_ARM64_CONTPTE
 
 /*
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3d5fb37424ab..38325616f467 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -1524,24 +1525,41 @@ static int __init prevent_bootmem_remove_init(void)
 early_initcall(prevent_bootmem_remove_init);
 #endif
 
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, unsigned int nr)
 {
+	pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, 0);
+
 	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
 		 * Break-before-make (BBM) is required for all user space mappings
 		 * when the permission changes from executable to non-executable
 		 * in cases where cpu is affected with errata #2645198.
 		 */
-		if (pte_user_exec(ptep_get(ptep)))
-			return ptep_clear_flush(vma, addr, ptep);
+		if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+			__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
+					  PAGE_SIZE, true, 3);
 	}
-	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+	return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+	return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, pte_t old_pte, pte_t pte,
+			     unsigned int nr)
+{
+	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
 }
 
 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep, pte_t old_pte, pte_t pte)
 {
-	set_pte_at(vma->vm_mm, addr, ptep, pte);
+	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
 }
 
 /*
-- 
2.30.2
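As a rough, stand-alone illustration of the TLBI savings claimed in patch
4/4 (assumed numbers: 16 PTEs per contpte block; a counting model only,
not derived from the kernel sources):

	#include <stdio.h>

	#define CONTPTE_NR 16	/* assumed PTEs per contpte block */

	/* Clearing entry by entry: roughly one invalidation per contpte
	 * block touched by the range. */
	static int flushes_per_entry_clear(int start, int nr)
	{
		int first_block = start / CONTPTE_NR;
		int last_block = (start + nr - 1) / CONTPTE_NR;

		return last_block - first_block + 1;
	}

	/* Batched clear: at most the two partially covered blocks at the
	 * edges of the range need an invalidation. */
	static int flushes_batched_clear(int start, int nr)
	{
		int head_partial = (start % CONTPTE_NR) != 0;
		int tail_partial = ((start + nr) % CONTPTE_NR) != 0;

		return head_partial + tail_partial;
	}

	int main(void)
	{
		int start = 8, nr = 512;	/* 512 PTEs, unaligned start */

		printf("per-entry clears: ~%d invalidations\n",
		       flushes_per_entry_clear(start, nr));
		printf("batched clear:    ~%d invalidations\n",
		       flushes_batched_clear(start, nr));
		return 0;
	}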