Date: Wed, 4 Dec 2024 19:26:50 +0100
From: Guillaume Morin
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, guillaume@morinfr.org, Muchun Song,
 Andrew Morton, Peter Xu, David Hildenbrand, Eric Hagberg
Subject: [PATCH v1] hugetlb: support FOLL_FORCE|FOLL_WRITE

FOLL_FORCE|FOLL_WRITE has never been properly supported for hugetlb
mappings. Since commit 1d8d14641fd94, we explicitly reject it.

However, running software from hugetlb mappings is a useful optimization,
and multiple tools, such as Intel iodlr or libhugetlbfs, exist to make use
of it. Debuggers, for instance, rely on FOLL_FORCE|FOLL_WRITE (e.g. via
ptrace() or /proc/<pid>/mem) to install breakpoints in read-only text, and
that currently fails when the text is backed by hugetlb pages.

Support FOLL_FORCE|FOLL_WRITE for hugetlb: factor the common FOLL_FORCE
checks out of can_follow_write_pte()/can_follow_write_pmd() into
can_follow_write_common(), add the equivalent can_follow_write_pud() for
hugetlb PUD mappings, and teach hugetlb_wp() to break COW without mapping
the PTE writable when the VMA lacks VM_WRITE.

Cc: Muchun Song
Cc: Andrew Morton
Cc: Peter Xu
Cc: David Hildenbrand
Cc: Eric Hagberg
Signed-off-by: Guillaume Morin
---
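Not part of the patch: below is a minimal userspace sketch of the use case
this enables, written against a few assumptions (x86-64, 2 MB default
hugepage size, at least one page reserved in the hugetlb pool via
vm.nr_hugepages). It forces a one-byte write into a read-only, private
MAP_HUGETLB mapping through /proc/self/mem, which goes through GUP with
FOLL_FORCE|FOLL_WRITE, much like a debugger planting a breakpoint in
hugepage-backed text. Without this patch the pwrite() is expected to fail;
with it, the write should succeed while the mapping itself stays read-only.

/* Illustration only -- not part of this patch. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 2UL * 1024 * 1024;  /* assumes 2 MB default hugepage size */
        unsigned char insn = 0xcc;              /* e.g. an x86 breakpoint byte */

        /* Read-only, private hugetlb mapping, standing in for hugepage-backed text. */
        void *p = mmap(NULL, len, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }

        /* Writes through /proc/<pid>/mem use GUP with FOLL_FORCE|FOLL_WRITE. */
        int fd = open("/proc/self/mem", O_RDWR);
        if (fd < 0) {
                perror("open(/proc/self/mem)");
                return 1;
        }

        if (pwrite(fd, &insn, sizeof(insn), (off_t)(uintptr_t)p) !=
            (ssize_t)sizeof(insn)) {
                /* Expected on kernels that still reject FOLL_FORCE|FOLL_WRITE for hugetlb. */
                perror("pwrite(/proc/self/mem)");
                return 1;
        }

        /* The COW copy is visible through the still read-only mapping. */
        printf("forced write succeeded: *p = 0x%02x\n", *(unsigned char *)p);

        close(fd);
        munmap(p, len);
        return 0;
}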
 mm/gup.c     | 93 ++++++++++++++++++++++++++--------------------------
 mm/hugetlb.c | 20 ++++++-----
 2 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 746070a1d8bf..c680edf33248 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -587,6 +587,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
 }
 #endif  /* CONFIG_HAVE_GUP_FAST */
 
+/* Common code for can_follow_write_* */
+static inline bool can_follow_write_common(struct page *page,
+                struct vm_area_struct *vma, unsigned int flags)
+{
+        /* Maybe FOLL_FORCE is set to override it? */
+        if (!(flags & FOLL_FORCE))
+                return false;
+
+        /* But FOLL_FORCE has no effect on shared mappings */
+        if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+                return false;
+
+        /* ... or read-only private ones */
+        if (!(vma->vm_flags & VM_MAYWRITE))
+                return false;
+
+        /* ... or already writable ones that just need to take a write fault */
+        if (vma->vm_flags & VM_WRITE)
+                return false;
+
+        /*
+         * See can_change_pte_writable(): we broke COW and could map the page
+         * writable if we have an exclusive anonymous page ...
+         */
+        return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
 static struct page *no_page_table(struct vm_area_struct *vma,
                 unsigned int flags, unsigned long address)
 {
@@ -613,6 +640,22 @@ static struct page *no_page_table(struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
+static inline bool can_follow_write_pud(pud_t pud, struct page *page,
+                                        struct vm_area_struct *vma,
+                                        unsigned int flags)
+{
+        /* If the pud is writable, we can write to the page. */
+        if (pud_write(pud))
+                return true;
+
+        if (!can_follow_write_common(page, vma, flags))
+                return false;
+
+        /* ... and a write-fault isn't required for other reasons. */
+        return !vma_soft_dirty_enabled(vma) || pud_soft_dirty(pud);
+}
+
 static struct page *follow_huge_pud(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp,
                                     int flags, struct follow_page_context *ctx)
@@ -625,7 +668,8 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
 
         assert_spin_locked(pud_lockptr(mm, pudp));
 
-        if ((flags & FOLL_WRITE) && !pud_write(pud))
+        if ((flags & FOLL_WRITE) &&
+            !can_follow_write_pud(pud, page, vma, flags))
                 return NULL;
 
         if (!pud_present(pud))
@@ -677,27 +721,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
         if (pmd_write(pmd))
                 return true;
 
-        /* Maybe FOLL_FORCE is set to override it? */
-        if (!(flags & FOLL_FORCE))
-                return false;
-
-        /* But FOLL_FORCE has no effect on shared mappings */
-        if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
-                return false;
-
-        /* ... or read-only private ones */
-        if (!(vma->vm_flags & VM_MAYWRITE))
-                return false;
-
-        /* ... or already writable ones that just need to take a write fault */
-        if (vma->vm_flags & VM_WRITE)
-                return false;
-
-        /*
-         * See can_change_pte_writable(): we broke COW and could map the page
-         * writable if we have an exclusive anonymous page ...
-         */
-        if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+        if (!can_follow_write_common(page, vma, flags))
                 return false;
 
         /* ... and a write-fault isn't required for other reasons. */
@@ -798,27 +822,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
         if (pte_write(pte))
                 return true;
 
-        /* Maybe FOLL_FORCE is set to override it? */
-        if (!(flags & FOLL_FORCE))
-                return false;
-
-        /* But FOLL_FORCE has no effect on shared mappings */
-        if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
-                return false;
-
-        /* ... or read-only private ones */
-        if (!(vma->vm_flags & VM_MAYWRITE))
-                return false;
-
-        /* ... or already writable ones that just need to take a write fault */
-        if (vma->vm_flags & VM_WRITE)
-                return false;
-
-        /*
-         * See can_change_pte_writable(): we broke COW and could map the page
-         * writable if we have an exclusive anonymous page ...
-         */
-        if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+        if (!can_follow_write_common(page, vma, flags))
                 return false;
 
         /* ... and a write-fault isn't required for other reasons. */
@@ -1285,9 +1289,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                 if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
                         if (!(gup_flags & FOLL_FORCE))
                                 return -EFAULT;
-                        /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
-                        if (is_vm_hugetlb_page(vma))
-                                return -EFAULT;
                         /*
                          * We used to let the write,force case do COW in a
                          * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea2ed8e301ef..52517b7ce308 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5169,6 +5169,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
         update_mmu_cache(vma, address, ptep);
 }
 
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+                                         unsigned long address, pte_t *ptep)
+{
+        if (vma->vm_flags & VM_WRITE)
+                set_huge_ptep_writable(vma, address, ptep);
+}
+
 bool is_hugetlb_entry_migration(pte_t pte)
 {
         swp_entry_t swp;
@@ -5802,13 +5809,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
         if (!unshare && huge_pte_uffd_wp(pte))
                 return 0;
 
-        /*
-         * hugetlb does not support FOLL_FORCE-style write faults that keep the
-         * PTE mapped R/O such as maybe_mkwrite() would do.
-         */
-        if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
-                return VM_FAULT_SIGSEGV;
-
         /* Let's take out MAP_SHARED mappings first. */
         if (vma->vm_flags & VM_MAYSHARE) {
                 set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5837,7 +5837,8 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
                         SetPageAnonExclusive(&old_folio->page);
                 }
                 if (likely(!unshare))
-                        set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+                        set_huge_ptep_maybe_writable(vma, vmf->address,
+                                                     vmf->pte);
 
                 delayacct_wpcopy_end();
                 return 0;
@@ -5943,7 +5944,8 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
         spin_lock(vmf->ptl);
         vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
         if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
-                pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+                const bool writable = !unshare && (vma->vm_flags & VM_WRITE);
+                pte_t newpte = make_huge_pte(vma, &new_folio->page, writable);
 
                 /* Break COW or unshare */
                 huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
-- 
2.39.1

-- 
Guillaume Morin