While investigating HVO for THPs [1], it turns out that speculative
PFN walkers like compaction can race with vmemmap modifications, e.g.,
CPU 1 (vmemmap modifier)         CPU 2 (speculative PFN walker)
-------------------------------  ------------------------------
Allocates an LRU folio page1
                                 Sees page1
Frees page1

Allocates a hugeTLB folio page2
(page1 being a tail of page2)

Updates vmemmap mapping page1
                                 get_page_unless_zero(page1)
Even though page1->_refcount is zero after HVO, get_page_unless_zero()
can still try to modify this read-only field, resulting in a crash.
An independent report [2] confirmed this race.
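
As a side note on why the walker's side is a write at all:
get_page_unless_zero() is a thin wrapper around
page_ref_add_unless(page, 1, 0), which before this patch was a bare
atomic_add_unless() on page->_refcount -- a load followed by a
compare-and-swap loop. The sketch below is a minimal userspace model of
that pattern, not the kernel code; the point is that the
compare-and-swap needs write access to the counter, so a walker that
loaded a stale non-zero count can still attempt a store into a struct
page that HVO has since mapped read-only.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/*
	 * Userspace model of the atomic_add_unless() pattern used by
	 * get_page_unless_zero(): load the counter and, unless it equals
	 * 'u', try to compare-and-swap in the incremented value.  The CAS
	 * below is the store that must not land on a struct page whose
	 * vmemmap has been remapped read-only by HVO.
	 */
	static bool add_unless(atomic_int *v, int nr, int u)
	{
		int c = atomic_load(v);

		while (c != u) {
			/* on failure, 'c' is refreshed with the current value */
			if (atomic_compare_exchange_weak(v, &c, c + nr))
				return true;
		}
		return false;
	}

	int main(void)
	{
		atomic_int refcount = 2;
		bool got;

		got = add_unless(&refcount, 1, 0);	/* takes the reference: 2 -> 3 */
		printf("nonzero count: got=%d refcount=%d\n", got, atomic_load(&refcount));

		atomic_store(&refcount, 0);		/* as after HVO */
		got = add_unless(&refcount, 1, 0);	/* sees zero, no CAS attempted */
		printf("zero count:    got=%d refcount=%d\n", got, atomic_load(&refcount));
		return 0;
	}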
Two approaches to fixing this race have been discussed:
1. Make RO vmemmap RW so that get_page_unless_zero() can fail without
triggering a PF.
2. Use RCU to make sure get_page_unless_zero() either sees zero
page->_refcount through the old vmemmap or non-zero page->_refcount
through the new one.
The second approach is preferred here because:
1. It can prevent illegal modifications to struct page[] that has been
HVO'ed;
2. It can be generalized, in a way similar to ZERO_PAGE(), to fix
similar races in other places, e.g., arch_remove_memory() on x86
[3], which frees vmemmap mapping offlined struct page[].
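
To make the "either the old vmemmap or the new one" guarantee concrete,
below is a toy userspace model of the pairing this patch implements.
It is illustration only, not kernel code: the toy_* names are invented
here, the grace period is stronger than real RCU (it waits for all
readers, not just pre-existing ones), and the read-only remapping is
reduced to a flag.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Toy grace period: wait until no reader is inside its critical section. */
	static atomic_int active_readers;

	static void toy_rcu_read_lock(void)   { atomic_fetch_add(&active_readers, 1); }
	static void toy_rcu_read_unlock(void) { atomic_fetch_sub(&active_readers, 1); }
	static void toy_synchronize_rcu(void)
	{
		while (atomic_load(&active_readers))
			;	/* spin until every reader has left */
	}

	/* Stand-in for one struct page and for the RO remapping done by HVO. */
	struct toy_page {
		atomic_int refcount;
		bool vmemmap_read_only;
	};

	/* Speculative walker: the page_ref_add_unless() side of the fix. */
	static bool toy_get_page_unless_zero(struct toy_page *page)
	{
		bool ret = false;

		toy_rcu_read_lock();
		/* only write when a non-zero count is seen, i.e. via a writable view */
		if (atomic_load(&page->refcount) != 0) {
			atomic_fetch_add(&page->refcount, 1);
			ret = true;
		}
		toy_rcu_read_unlock();
		return ret;
	}

	/*
	 * vmemmap modifier: the hugetlb_vmemmap_optimize_folio() side of the
	 * fix; the refcount has already been frozen to zero by the caller.
	 */
	static void toy_hvo_optimize(struct toy_page *page)
	{
		/* wait out walkers that might still write through the old view */
		toy_synchronize_rcu();
		page->vmemmap_read_only = true;
	}

	int main(void)
	{
		struct toy_page page = { .refcount = 1 };
		bool got;

		got = toy_get_page_unless_zero(&page);	/* old, writable view */
		printf("before HVO: got=%d refcount=%d\n", got, atomic_load(&page.refcount));

		atomic_store(&page.refcount, 0);	/* pretend all refs were dropped */
		toy_hvo_optimize(&page);

		got = toy_get_page_unless_zero(&page);	/* sees zero, never writes */
		printf("after HVO:  got=%d refcount=%d ro=%d\n",
		       got, atomic_load(&page.refcount), page.vmemmap_read_only);
		return 0;
	}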
While adding synchronize_rcu(), the goal is to be surgical rather than
optimized. Specifically, calls to synchronize_rcu() on the error
handling paths could be coalesced, but they are not, for the sake of
simplicity: notably, this fix removes ~50% more lines than it adds.
According to the hugetlb_optimize_vmemmap section in
Documentation/admin-guide/sysctl/vm.rst, enabling HVO makes allocating
or freeing hugeTLB pages "~2x slower than before". Having
synchronize_rcu() on top makes those operations even slower, and this
also affects the user interface /proc/sys/vm/nr_overcommit_hugepages.
[1] https://lore.kernel.org/20240229183436.4110845-4-yuzhao@google.com/
[2] https://lore.kernel.org/917FFC7F-0615-44DD-90EE-9F85F8EA9974@linux.dev/
[3] https://lore.kernel.org/be130a96-a27e-4240-ad78-776802f57cad@redhat.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Muchun Song <muchun.song@linux.dev>
---
include/linux/page_ref.h | 8 +++++-
mm/hugetlb.c | 53 ++++++----------------------------------
mm/hugetlb_vmemmap.c | 16 ++++++++++++
3 files changed, 30 insertions(+), 47 deletions(-)
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 490d0ad6e56d..8c236c651d1d 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -230,7 +230,13 @@ static inline int folio_ref_dec_return(struct folio *folio)
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
{
- bool ret = atomic_add_unless(&page->_refcount, nr, u);
+ bool ret = false;
+
+ rcu_read_lock();
+ /* avoid writing to the vmemmap area being remapped */
+ if (!page_is_fake_head(page) && page_ref_count(page) != u)
+ ret = atomic_add_unless(&page->_refcount, nr, u);
+ rcu_read_unlock();
if (page_ref_tracepoint_active(page_ref_mod_unless))
__page_ref_mod_unless(page, nr, ret);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9691624fcb79..0a69e194b517 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1629,13 +1629,10 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
* folio appears as just a compound page. Otherwise, wait until after
* allocating vmemmap to clear the flag.
*
- * A reference is held on the folio, except in the case of demote.
- *
* Must be called with hugetlb lock held.
*/
-static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
- bool adjust_surplus,
- bool demote)
+static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
{
int nid = folio_nid(folio);
@@ -1649,6 +1646,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
list_del(&folio->lru);
if (folio_test_hugetlb_freed(folio)) {
+ folio_clear_hugetlb_freed(folio);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
}
@@ -1665,33 +1663,13 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
if (!folio_test_hugetlb_vmemmap_optimized(folio))
__folio_clear_hugetlb(folio);
- /*
- * In the case of demote we do not ref count the page as it will soon
- * be turned into a page of smaller size.
- */
- if (!demote)
- folio_ref_unfreeze(folio, 1);
-
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
-static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
- bool adjust_surplus)
-{
- __remove_hugetlb_folio(h, folio, adjust_surplus, false);
-}
-
-static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
- bool adjust_surplus)
-{
- __remove_hugetlb_folio(h, folio, adjust_surplus, true);
-}
-
static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- int zeroed;
int nid = folio_nid(folio);
VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
@@ -1715,21 +1693,6 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
*/
folio_set_hugetlb_vmemmap_optimized(folio);
- /*
- * This folio is about to be managed by the hugetlb allocator and
- * should have no users. Drop our reference, and check for others
- * just in case.
- */
- zeroed = folio_put_testzero(folio);
- if (unlikely(!zeroed))
- /*
- * It is VERY unlikely soneone else has taken a ref
- * on the folio. In this case, we simply return as
- * free_huge_folio() will be called when this other ref
- * is dropped.
- */
- return;
-
arch_clear_hugetlb_flags(folio);
enqueue_hugetlb_folio(h, folio);
}
@@ -1783,6 +1746,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
spin_unlock_irq(&hugetlb_lock);
}
+ folio_ref_unfreeze(folio, 1);
+
/*
* Non-gigantic pages demoted from CMA allocated gigantic pages
* need to be given back to CMA in free_gigantic_folio.
@@ -3106,11 +3071,8 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
free_new:
spin_unlock_irq(&hugetlb_lock);
- if (new_folio) {
- /* Folio has a zero ref count, but needs a ref to be freed */
- folio_ref_unfreeze(new_folio, 1);
+ if (new_folio)
update_and_free_hugetlb_folio(h, new_folio, false);
- }
return ret;
}
@@ -3965,7 +3927,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
- remove_hugetlb_folio_for_demote(h, folio, false);
+ remove_hugetlb_folio(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
/*
@@ -3979,7 +3941,6 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
if (rc) {
/* Allocation of vmemmmap failed, we can not demote folio */
spin_lock_irq(&hugetlb_lock);
- folio_ref_unfreeze(folio, 1);
add_hugetlb_folio(h, folio, false);
return rc;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index fa00d61b6c5a..829112b0a914 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -455,6 +455,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
unsigned long vmemmap_reuse;
VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
+
if (!folio_test_hugetlb_vmemmap_optimized(folio))
return 0;
@@ -490,6 +492,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
*/
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
+ /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
+ synchronize_rcu();
+
return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}
@@ -514,6 +519,9 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
long restored = 0;
long ret = 0;
+ /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
+ synchronize_rcu();
+
list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
if (folio_test_hugetlb_vmemmap_optimized(folio)) {
ret = __hugetlb_vmemmap_restore_folio(h, folio,
@@ -559,6 +567,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
unsigned long vmemmap_reuse;
VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
+
if (!vmemmap_should_optimize_folio(h, folio))
return ret;
@@ -610,6 +620,9 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
LIST_HEAD(vmemmap_pages);
+ /* avoid writes from page_ref_add_unless() while folding vmemmap */
+ synchronize_rcu();
+
__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
free_vmemmap_page_list(&vmemmap_pages);
}
@@ -653,6 +666,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
flush_tlb_all();
+ /* avoid writes from page_ref_add_unless() while folding vmemmap */
+ synchronize_rcu();
+
list_for_each_entry(folio, folio_list, lru) {
int ret;
--
2.45.2.803.g4e1b14247a-goog
On 28.06.24 00:27, Yu Zhao wrote:
> diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
> index 490d0ad6e56d..8c236c651d1d 100644
> --- a/include/linux/page_ref.h
> +++ b/include/linux/page_ref.h
> @@ -230,7 +230,13 @@ static inline int folio_ref_dec_return(struct folio *folio)
>
> static inline bool page_ref_add_unless(struct page *page, int nr, int u)
> {
> - bool ret = atomic_add_unless(&page->_refcount, nr, u);
> + bool ret = false;
> +
> + rcu_read_lock();
> + /* avoid writing to the vmemmap area being remapped */
> + if (!page_is_fake_head(page) && page_ref_count(page) != u)
> + ret = atomic_add_unless(&page->_refcount, nr, u);
> + rcu_read_unlock();

The page_is_fake_head() thingy in page_ref.h is a bit suboptimal;
currently it really only works on _refcount. But I get why it is
required right now, hmmm.
(Independent of this: all users of page_ref_add_unless seem to pass
u==0; maybe we should clean that up at some point. It is hard to
imagine other use cases for refcounts besides "unless 0".)
--
Cheers,
David / dhildenb
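
A hypothetical shape for the cleanup David suggests above -- the
demo_* names are invented here for illustration and are not an existing
kernel API, and a minimal stand-in type replaces struct page. The idea
is simply that the "unless" value could be dropped from the public
helper while the general form stays internal.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* minimal stand-in for struct page, carrying only the refcount */
	struct demo_page {
		atomic_int _refcount;
	};

	/* general form, which could stay internal: add nr unless the count equals u */
	static bool demo_page_ref_add_unless(struct demo_page *page, int nr, int u)
	{
		int c = atomic_load(&page->_refcount);

		while (c != u) {
			if (atomic_compare_exchange_weak(&page->_refcount, &c, c + nr))
				return true;
		}
		return false;
	}

	/* the only variant callers appear to need: "unless zero" */
	static bool demo_page_ref_add_unless_zero(struct demo_page *page, int nr)
	{
		return demo_page_ref_add_unless(page, nr, 0);
	}

	int main(void)
	{
		struct demo_page page = { ._refcount = 1 };

		printf("took ref: %d\n", demo_page_ref_add_unless_zero(&page, 1));
		atomic_store(&page._refcount, 0);
		printf("took ref: %d\n", demo_page_ref_add_unless_zero(&page, 1));
		return 0;
	}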
On Thu, 27 Jun 2024 16:27:05 -0600 Yu Zhao <yuzhao@google.com> wrote:

> While investigating HVO for THPs [1], it turns out that speculative
> PFN walkers like compaction can race with vmemmap modifications, e.g.,
>
> CPU 1 (vmemmap modifier)         CPU 2 (speculative PFN walker)
> -------------------------------  ------------------------------
> Allocates an LRU folio page1
>                                  Sees page1
> Frees page1
>
> Allocates a hugeTLB folio page2
> (page1 being a tail of page2)
>
> Updates vmemmap mapping page1
>                                  get_page_unless_zero(page1)
>
> Even though page1->_refcount is zero after HVO, get_page_unless_zero()
> can still try to modify this read-only field, resulting in a crash.

Ah. So we should backport this into earlier kernels, yes?

Are we able to identify a Fixes: for this? Looks difficult.

This seems quite hard to trigger. Do any particular userspace actions
invoke the race?
On Thu, Jun 27, 2024 at 4:47 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> Ah. So we should backport this into earlier kernels, yes?
>
> Are we able to identify a Fixes: for this? Looks difficult.
>
> This seems quite hard to trigger. Do any particular userspace actions
> invoke the race?

Yes, *very* hard to trigger:
1. Most hugeTLB use cases I know of are static, i.e., reserved at boot
   time, because allocating at runtime is not reliable at all.
2. On top of that, someone has to be very unlucky to trip over the
   above, because the race window is so small -- I wasn't able to
   trigger it with a stress test that does nothing but that (with THPs
   though).

So I don't think it's worth cc'ing stable, unless Muchun recommends.
> On Jun 28, 2024, at 07:04, Yu Zhao <yuzhao@google.com> wrote:
>
> On Thu, Jun 27, 2024 at 4:47 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>>
>> Ah. So we should backport this into earlier kernels, yes?
>>
>> Are we able to identify a Fixes: for this? Looks difficult.
>>
>> This seems quite hard to trigger. Do any particular userspace actions
>> invoke the race?
>
> Yes, *very* hard to trigger:
> 1. Most hugeTLB use cases I know of are static, i.e., reserved at boot
>    time, because allocating at runtime is not reliable at all.
> 2. On top of that, someone has to be very unlucky to trip over the
>    above, because the race window is so small -- I wasn't able to
>    trigger it with a stress test that does nothing but that (with THPs
>    though).
>
> So I don't think it's worth cc'ing stable, unless Muchun recommends.

I agree with Yu. Thanks.