In the caller of map_pte(), we may modify the pvmw->pte after acquiring
the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At
this time, the pte_same() check is not performed after the pvmw->ptl held,
so we should get pmdval and do pmd_same() check to ensure the stability of
pvmw->pmd.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa2087..6410f29b37c1b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
{
pte_t ptent;
+ pmd_t pmdval;
if (pvmw->flags & PVMW_SYNC) {
/* Use the stricter lookup */
@@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
return !!pvmw->pte;
}
+again:
/*
* It is important to return the ptl corresponding to pte,
* in case *pvmw->pmd changes underneath us; so we need to
@@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* proceeds to loop over next ptes, and finds a match later.
* Though, in most cases, page lock already protects this.
*/
- pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
- pvmw->address, ptlp);
+ pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, &pmdval, ptlp);
if (!pvmw->pte)
return false;
+ *pmdvalp = pmdval;
ptent = ptep_get(pvmw->pte);
@@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
} else if (!pte_present(ptent)) {
return false;
}
+ spin_lock(*ptlp);
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, *ptlp);
+ goto again;
+ }
pvmw->ptl = *ptlp;
- spin_lock(pvmw->ptl);
+
return true;
}
@@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw, &ptl)) {
+ if (!map_pte(pvmw, &pmde, &ptl)) {
if (!pvmw->pte)
goto restart;
goto next_pte;
@@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->ptl) {
pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
+ if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, pvmw->ptl);
+ pvmw->ptl = NULL;
+ pvmw->pte = NULL;
+ goto restart;
+ }
}
goto this_pte;
} while (pvmw->address < end);
--
2.20.1
> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote: > > In the caller of map_pte(), we may modify the pvmw->pte after acquiring > the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At > this time, the pte_same() check is not performed after the pvmw->ptl held, > so we should get pmdval and do pmd_same() check to ensure the stability of > pvmw->pmd. > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> > --- > mm/page_vma_mapped.c | 25 ++++++++++++++++++++----- > 1 file changed, 20 insertions(+), 5 deletions(-) > > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c > index ae5cc42aa2087..6410f29b37c1b 100644 > --- a/mm/page_vma_mapped.c > +++ b/mm/page_vma_mapped.c > @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) > return false; > } > > -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) > +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, > + spinlock_t **ptlp) > { > pte_t ptent; > + pmd_t pmdval; Why declare a new variable? Can’t we use *pmdvalp instead? > > if (pvmw->flags & PVMW_SYNC) { > /* Use the stricter lookup */ > @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) > return !!pvmw->pte; > } > > +again: > /* > * It is important to return the ptl corresponding to pte, > * in case *pvmw->pmd changes underneath us; so we need to > @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) > * proceeds to loop over next ptes, and finds a match later. > * Though, in most cases, page lock already protects this. > */ > - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, > - pvmw->address, ptlp); > + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, > + pvmw->address, &pmdval, ptlp); > if (!pvmw->pte) > return false; > + *pmdvalp = pmdval; > > ptent = ptep_get(pvmw->pte); > > @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) > } else if (!pte_present(ptent)) { > return false; > } > + spin_lock(*ptlp); > + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) { > + pte_unmap_unlock(pvmw->pte, *ptlp); > + goto again; > + } > pvmw->ptl = *ptlp; > - spin_lock(pvmw->ptl); > + > return true; > } > > @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) > step_forward(pvmw, PMD_SIZE); > continue; > } > - if (!map_pte(pvmw, &ptl)) { > + if (!map_pte(pvmw, &pmde, &ptl)) { > if (!pvmw->pte) > goto restart; > goto next_pte; > @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) > if (!pvmw->ptl) { > pvmw->ptl = ptl; > spin_lock(pvmw->ptl); > + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { > + pte_unmap_unlock(pvmw->pte, pvmw->ptl); > + pvmw->ptl = NULL; > + pvmw->pte = NULL; > + goto restart; > + } > } > goto this_pte; > } while (pvmw->address < end); > -- > 2.20.1 >
On 2024/9/24 16:25, Muchun Song wrote: > > >> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote: >> >> In the caller of map_pte(), we may modify the pvmw->pte after acquiring >> the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At >> this time, the pte_same() check is not performed after the pvmw->ptl held, >> so we should get pmdval and do pmd_same() check to ensure the stability of >> pvmw->pmd. >> >> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> >> --- >> mm/page_vma_mapped.c | 25 ++++++++++++++++++++----- >> 1 file changed, 20 insertions(+), 5 deletions(-) >> >> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c >> index ae5cc42aa2087..6410f29b37c1b 100644 >> --- a/mm/page_vma_mapped.c >> +++ b/mm/page_vma_mapped.c >> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) >> return false; >> } >> >> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, >> + spinlock_t **ptlp) >> { >> pte_t ptent; >> + pmd_t pmdval; > > Why declare a new variable? Can’t we use *pmdvalp instead? It's just a coding habit, both are fine for me. > >> >> if (pvmw->flags & PVMW_SYNC) { >> /* Use the stricter lookup */ >> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >> return !!pvmw->pte; >> } >> >> +again: >> /* >> * It is important to return the ptl corresponding to pte, >> * in case *pvmw->pmd changes underneath us; so we need to >> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >> * proceeds to loop over next ptes, and finds a match later. >> * Though, in most cases, page lock already protects this. >> */ >> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, >> - pvmw->address, ptlp); >> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, >> + pvmw->address, &pmdval, ptlp); >> if (!pvmw->pte) >> return false; >> + *pmdvalp = pmdval; >> >> ptent = ptep_get(pvmw->pte); >> >> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >> } else if (!pte_present(ptent)) { >> return false; >> } >> + spin_lock(*ptlp); >> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) { >> + pte_unmap_unlock(pvmw->pte, *ptlp); >> + goto again; >> + } >> pvmw->ptl = *ptlp; >> - spin_lock(pvmw->ptl); >> + >> return true; >> } >> >> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >> step_forward(pvmw, PMD_SIZE); >> continue; >> } >> - if (!map_pte(pvmw, &ptl)) { >> + if (!map_pte(pvmw, &pmde, &ptl)) { >> if (!pvmw->pte) >> goto restart; >> goto next_pte; >> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >> if (!pvmw->ptl) { >> pvmw->ptl = ptl; >> spin_lock(pvmw->ptl); >> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { >> + pte_unmap_unlock(pvmw->pte, pvmw->ptl); >> + pvmw->ptl = NULL; >> + pvmw->pte = NULL; >> + goto restart; >> + } >> } >> goto this_pte; >> } while (pvmw->address < end); >> -- >> 2.20.1 >>
> On Sep 24, 2024, at 16:33, Qi Zheng <zhengqi.arch@bytedance.com> wrote: > > > > On 2024/9/24 16:25, Muchun Song wrote: >>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote: >>> >>> In the caller of map_pte(), we may modify the pvmw->pte after acquiring >>> the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At >>> this time, the pte_same() check is not performed after the pvmw->ptl held, >>> so we should get pmdval and do pmd_same() check to ensure the stability of >>> pvmw->pmd. >>> >>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> >>> --- >>> mm/page_vma_mapped.c | 25 ++++++++++++++++++++----- >>> 1 file changed, 20 insertions(+), 5 deletions(-) >>> >>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c >>> index ae5cc42aa2087..6410f29b37c1b 100644 >>> --- a/mm/page_vma_mapped.c >>> +++ b/mm/page_vma_mapped.c >>> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) >>> return false; >>> } >>> >>> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, >>> + spinlock_t **ptlp) >>> { >>> pte_t ptent; >>> + pmd_t pmdval; >> Why declare a new variable? Can’t we use *pmdvalp instead? > > It's just a coding habit, both are fine for me. Agree. But sometime it could make code look a little simpler. > >>> >>> if (pvmw->flags & PVMW_SYNC) { >>> /* Use the stricter lookup */ >>> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>> return !!pvmw->pte; >>> } >>> >>> +again: >>> /* >>> * It is important to return the ptl corresponding to pte, >>> * in case *pvmw->pmd changes underneath us; so we need to >>> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>> * proceeds to loop over next ptes, and finds a match later. >>> * Though, in most cases, page lock already protects this. >>> */ >>> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, >>> - pvmw->address, ptlp); >>> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, >>> + pvmw->address, &pmdval, ptlp); >>> if (!pvmw->pte) >>> return false; >>> + *pmdvalp = pmdval; For instance, here, it is unnecessary if pmdvalp is passed directly to pte_offset_map_rw_nolock. >>> >>> ptent = ptep_get(pvmw->pte); >>> >>> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>> } else if (!pte_present(ptent)) { >>> return false; >>> } >>> + spin_lock(*ptlp); >>> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) { >>> + pte_unmap_unlock(pvmw->pte, *ptlp); >>> + goto again; >>> + } >>> pvmw->ptl = *ptlp; >>> - spin_lock(pvmw->ptl); >>> + >>> return true; >>> } >>> >>> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >>> step_forward(pvmw, PMD_SIZE); >>> continue; >>> } >>> - if (!map_pte(pvmw, &ptl)) { >>> + if (!map_pte(pvmw, &pmde, &ptl)) { >>> if (!pvmw->pte) >>> goto restart; >>> goto next_pte; >>> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >>> if (!pvmw->ptl) { >>> pvmw->ptl = ptl; >>> spin_lock(pvmw->ptl); >>> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { >>> + pte_unmap_unlock(pvmw->pte, pvmw->ptl); >>> + pvmw->ptl = NULL; >>> + pvmw->pte = NULL; >>> + goto restart; >>> + } >>> } >>> goto this_pte; >>> } while (pvmw->address < end); >>> -- >>> 2.20.1
On 2024/9/24 16:39, Muchun Song wrote: > > >> On Sep 24, 2024, at 16:33, Qi Zheng <zhengqi.arch@bytedance.com> wrote: >> >> >> >> On 2024/9/24 16:25, Muchun Song wrote: >>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote: >>>> >>>> In the caller of map_pte(), we may modify the pvmw->pte after acquiring >>>> the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At >>>> this time, the pte_same() check is not performed after the pvmw->ptl held, >>>> so we should get pmdval and do pmd_same() check to ensure the stability of >>>> pvmw->pmd. >>>> >>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> >>>> --- >>>> mm/page_vma_mapped.c | 25 ++++++++++++++++++++----- >>>> 1 file changed, 20 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c >>>> index ae5cc42aa2087..6410f29b37c1b 100644 >>>> --- a/mm/page_vma_mapped.c >>>> +++ b/mm/page_vma_mapped.c >>>> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) >>>> return false; >>>> } >>>> >>>> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>>> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, >>>> + spinlock_t **ptlp) >>>> { >>>> pte_t ptent; >>>> + pmd_t pmdval; >>> Why declare a new variable? Can’t we use *pmdvalp instead? >> >> It's just a coding habit, both are fine for me. > > Agree. But sometime it could make code look a little simpler. > >> >>>> >>>> if (pvmw->flags & PVMW_SYNC) { >>>> /* Use the stricter lookup */ >>>> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>>> return !!pvmw->pte; >>>> } >>>> >>>> +again: >>>> /* >>>> * It is important to return the ptl corresponding to pte, >>>> * in case *pvmw->pmd changes underneath us; so we need to >>>> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>>> * proceeds to loop over next ptes, and finds a match later. >>>> * Though, in most cases, page lock already protects this. >>>> */ >>>> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, >>>> - pvmw->address, ptlp); >>>> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, >>>> + pvmw->address, &pmdval, ptlp); >>>> if (!pvmw->pte) >>>> return false; >>>> + *pmdvalp = pmdval; > > For instance, here, it is unnecessary if pmdvalp is passed directly to > pte_offset_map_rw_nolock. OK, will use pmdvalp directly. ;) > >>>> >>>> ptent = ptep_get(pvmw->pte); >>>> >>>> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) >>>> } else if (!pte_present(ptent)) { >>>> return false; >>>> } >>>> + spin_lock(*ptlp); >>>> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) { >>>> + pte_unmap_unlock(pvmw->pte, *ptlp); >>>> + goto again; >>>> + } >>>> pvmw->ptl = *ptlp; >>>> - spin_lock(pvmw->ptl); >>>> + >>>> return true; >>>> } >>>> >>>> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >>>> step_forward(pvmw, PMD_SIZE); >>>> continue; >>>> } >>>> - if (!map_pte(pvmw, &ptl)) { >>>> + if (!map_pte(pvmw, &pmde, &ptl)) { >>>> if (!pvmw->pte) >>>> goto restart; >>>> goto next_pte; >>>> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >>>> if (!pvmw->ptl) { >>>> pvmw->ptl = ptl; >>>> spin_lock(pvmw->ptl); >>>> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { >>>> + pte_unmap_unlock(pvmw->pte, pvmw->ptl); >>>> + pvmw->ptl = NULL; >>>> + pvmw->pte = NULL; >>>> + goto restart; >>>> + } >>>> } >>>> goto this_pte; >>>> } while (pvmw->address < end); >>>> -- >>>> 2.20.1 > >
© 2016 - 2024 Red Hat, Inc.