When restoring a reservation for an anonymous page, we need to check
whether we are freeing a surplus page. However, __unmap_hugepage_range()
reads h->surplus_huge_pages without holding hugetlb_lock, which is a
data race.

Therefore, add the missing hugetlb_lock.
Reported-by: syzbot+417aeb05fd190f3a6da9@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9
Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
---
mm/hugetlb.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b4c718..e8d95a314df2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5951,6 +5951,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 * If there we are freeing a surplus, do not set the restore
 		 * reservation bit.
 		 */
+		spin_lock_irq(&hugetlb_lock);
+
 		if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
 		    folio_test_anon(folio)) {
 			folio_set_hugetlb_restore_reserve(folio);
@@ -5958,6 +5960,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			adjust_reservation = true;
 		}
 
+		spin_unlock_irq(&hugetlb_lock);
 		spin_unlock(ptl);
 
 		/*
--
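
For readers following the report, the race in question is between the unlocked read of h->surplus_huge_pages in the hunk above and updaters that modify the counter under hugetlb_lock. The interleaving below is an assumed illustration, not taken from the syzbot report:

```
/*
 * Assumed interleaving (illustration only):
 *
 *   CPU0: __unmap_hugepage_range()           CPU1: surplus updater
 *   ------------------------------------     ------------------------------
 *   if (!h->surplus_huge_pages && ...)       spin_lock_irq(&hugetlb_lock);
 *       ... plain, unlocked read ...         h->surplus_huge_pages++;
 *   folio_set_hugetlb_restore_reserve();     spin_unlock_irq(&hugetlb_lock);
 *   adjust_reservation = true;
 *
 * CPU0's read is neither locked nor annotated, so it constitutes a data
 * race with CPU1's locked update; the hunks above serialize the check and
 * the flag update by taking hugetlb_lock around them.
 */
```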
On Fri, 22 Aug 2025 14:58:57 +0900 Jeongjun Park <aha310510@gmail.com> wrote:

> When restoring a reservation for an anonymous page, we need to check
> whether we are freeing a surplus page. However, __unmap_hugepage_range()
> reads h->surplus_huge_pages without holding hugetlb_lock, which is a
> data race.
>
> Therefore, add the missing hugetlb_lock.
>
> ...
>
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5951,6 +5951,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		 * If there we are freeing a surplus, do not set the restore
>  		 * reservation bit.
>  		 */
> +		spin_lock_irq(&hugetlb_lock);
> +
>  		if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
>  		    folio_test_anon(folio)) {
>  			folio_set_hugetlb_restore_reserve(folio);
> @@ -5958,6 +5960,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			adjust_reservation = true;
>  		}
>
> +		spin_unlock_irq(&hugetlb_lock);
>  		spin_unlock(ptl);

Does hugetlb_lock nest inside page_table_lock?

It's a bit sad to be taking a global lock just to defend against some
alleged data race which probably never happens.  Doing it once per
hugepage probably won't matter but still, is there something more
proportionate that we can do here?
Hello Andrew,

Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Fri, 22 Aug 2025 14:58:57 +0900 Jeongjun Park <aha310510@gmail.com> wrote:
>
> > When restoring a reservation for an anonymous page, we need to check
> > whether we are freeing a surplus page. However, __unmap_hugepage_range()
> > reads h->surplus_huge_pages without holding hugetlb_lock, which is a
> > data race.
> >
> > Therefore, add the missing hugetlb_lock.
> >
> > ...
>
> Does hugetlb_lock nest inside page_table_lock?
>
> It's a bit sad to be taking a global lock just to defend against some
> alleged data race which probably never happens.  Doing it once per
> hugepage probably won't matter but still, is there something more
> proportionate that we can do here?

I think it would be better to move the page_table_lock unlock call
after the hugetlb_remove_rmap() call.

```
		pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
		if (huge_pte_dirty(pte))
			folio_mark_dirty(folio);
		/* Leave a uffd-wp pte marker if needed */
		if (huge_pte_uffd_wp(pte) &&
		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
			set_huge_pte_at(mm, address, ptep,
					make_pte_marker(PTE_MARKER_UFFD_WP),
					sz);
		hugetlb_count_sub(pages_per_huge_page(h), mm);
		hugetlb_remove_rmap(folio);
```

In __unmap_hugepage_range(), after all of the above code has been
executed, the PTE/TLB/rmap are all properly cleaned up. Therefore,
there's no need to continue protecting folio_set_hugetlb_restore_reserve(),
which only sets bits in folio->private, with the page_table_lock.

Regards,
Jeongjun Park
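
A rough sketch of that reordering, combined with the lock taken in the patch above; this is only an illustration built from the lines quoted in this thread, not a posted v2, and the surrounding context is approximate:

```
		hugetlb_count_sub(pages_per_huge_page(h), mm);
		hugetlb_remove_rmap(folio);
		/* sketch: ptl dropped earlier, right after the rmap removal */
		spin_unlock(ptl);

		/*
		 * If there we are freeing a surplus, do not set the restore
		 * reservation bit.
		 */
		/* sketch: hugetlb_lock is now taken outside of ptl */
		spin_lock_irq(&hugetlb_lock);
		if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
		    folio_test_anon(folio)) {
			folio_set_hugetlb_restore_reserve(folio);
			adjust_reservation = true;
		}
		spin_unlock_irq(&hugetlb_lock);
```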
+	/*
+	 * Check surplus_huge_pages without taking hugetlb_lock.
+	 * A race here is okay:
+	 * - If surplus goes 0 -> nonzero, we skip restore.
+	 * - If surplus goes nonzero -> 0, we also skip.
+	 * In both cases we just miss a restore, which is safe.
+	 */
+	{
+		unsigned long surplus = READ_ONCE(h->surplus_huge_pages);
+
+		if (!surplus &&
+		    __vma_private_lock(vma) &&
+		    folio_test_anon(folio) &&
+		    READ_ONCE(h->surplus_huge_pages) == surplus) {
+			folio_set_hugetlb_restore_reserve(folio);
+			adjust_reservation = true;
+		}
+	}

 	spin_unlock(ptl);

On 8/23/2025 5:07 AM, Andrew Morton wrote:
> On Fri, 22 Aug 2025 14:58:57 +0900 Jeongjun Park <aha310510@gmail.com> wrote:
>
>> When restoring a reservation for an anonymous page, we need to check
>> whether we are freeing a surplus page. However, __unmap_hugepage_range()
>> reads h->surplus_huge_pages without holding hugetlb_lock, which is a
>> data race.
>>
>> Therefore, add the missing hugetlb_lock.
>>
>> ...
>
> Does hugetlb_lock nest inside page_table_lock?
>
> It's a bit sad to be taking a global lock just to defend against some
> alleged data race which probably never happens.  Doing it once per
> hugepage probably won't matter but still, is there something more
> proportionate that we can do here?
>
Hello Giorgi,

Giorgi Tchankvetadze <giorgitchankvetadze1997@gmail.com> wrote:
>
> +	/*
> +	 * Check surplus_huge_pages without taking hugetlb_lock.
> +	 * A race here is okay:
> +	 * - If surplus goes 0 -> nonzero, we skip restore.
> +	 * - If surplus goes nonzero -> 0, we also skip.
> +	 * In both cases we just miss a restore, which is safe.
> +	 */
> +	{
> +		unsigned long surplus = READ_ONCE(h->surplus_huge_pages);
> +
> +		if (!surplus &&
> +		    __vma_private_lock(vma) &&
> +		    folio_test_anon(folio) &&
> +		    READ_ONCE(h->surplus_huge_pages) == surplus) {
> +			folio_set_hugetlb_restore_reserve(folio);
> +			adjust_reservation = true;
> +		}
> +	}
>
>  	spin_unlock(ptl);
>

Why do you think skipping the restoration is safe? As the comment in the
code explains, if the scheduled restoration of an anonymous page's
reservation is not performed in time, the backing page can be stolen. If
the original owner then tries to fault the stolen page back in, the fault
cannot be satisfied and results in a SIGBUS error.

Of course, this only happens on a rare race, but in workloads that use
hugetlb heavily, surplus_huge_pages increases and decreases frequently,
and backing pages whose reservations are not restored in time because of
this race keep accumulating, so this is not a race that can be ignored.

> On 8/23/2025 5:07 AM, Andrew Morton wrote:
> > On Fri, 22 Aug 2025 14:58:57 +0900 Jeongjun Park <aha310510@gmail.com> wrote:
> >
> > ...
> >
> > Does hugetlb_lock nest inside page_table_lock?
> >
> > It's a bit sad to be taking a global lock just to defend against some
> > alleged data race which probably never happens.  Doing it once per
> > hugepage probably won't matter but still, is there something more
> > proportionate that we can do here?

Regards,
Jeongjun Park
On 8/22/25 1:58 AM, Jeongjun Park wrote:
> When restoring a reservation for an anonymous page, we need to check
> whether we are freeing a surplus page. However, __unmap_hugepage_range()
> reads h->surplus_huge_pages without holding hugetlb_lock, which is a
> data race.
>
> Therefore, add the missing hugetlb_lock.
>

Makes sense, as alloc_surplus_hugetlb_folio() takes the hugetlb_lock
when reading the hstate.

Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>

> Reported-by: syzbot+417aeb05fd190f3a6da9@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9
> Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed")
> Signed-off-by: Jeongjun Park <aha310510@gmail.com>
> ---
>  mm/hugetlb.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> ...
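
For reference, the locking pattern being cited looks roughly like this in alloc_surplus_hugetlb_folio(); this is a sketch from memory of mm/hugetlb.c rather than a verbatim quote, so the exact lines may differ:

```
	spin_lock_irq(&hugetlb_lock);
	/* surplus accounting is read and updated only under hugetlb_lock here */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		spin_unlock_irq(&hugetlb_lock);
		return NULL;
	}
	spin_unlock_irq(&hugetlb_lock);
```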