In the caller of map_pte(), we may modify pvmw->pte after acquiring
pvmw->ptl, so convert it to use pte_offset_map_rw_nolock(). Here, no
pte_same() check is performed after pvmw->ptl is held, so we must
record pmdval and do a pmd_same() check to ensure the stability of
pvmw->pmd.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa2087..6410f29b37c1b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
{
pte_t ptent;
+ pmd_t pmdval;
if (pvmw->flags & PVMW_SYNC) {
/* Use the stricter lookup */
@@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
return !!pvmw->pte;
}
+again:
/*
* It is important to return the ptl corresponding to pte,
* in case *pvmw->pmd changes underneath us; so we need to
@@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* proceeds to loop over next ptes, and finds a match later.
* Though, in most cases, page lock already protects this.
*/
- pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
- pvmw->address, ptlp);
+ pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, &pmdval, ptlp);
if (!pvmw->pte)
return false;
+ *pmdvalp = pmdval;
ptent = ptep_get(pvmw->pte);
@@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
} else if (!pte_present(ptent)) {
return false;
}
+ spin_lock(*ptlp);
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, *ptlp);
+ goto again;
+ }
pvmw->ptl = *ptlp;
- spin_lock(pvmw->ptl);
+
return true;
}
@@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw, &ptl)) {
+ if (!map_pte(pvmw, &pmde, &ptl)) {
if (!pvmw->pte)
goto restart;
goto next_pte;
@@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->ptl) {
pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
+ if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, pvmw->ptl);
+ pvmw->ptl = NULL;
+ pvmw->pte = NULL;
+ goto restart;
+ }
}
goto this_pte;
} while (pvmw->address < end);
--
2.20.1
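
For readers skimming the diff above, the heart of the conversion is the
map-without-lock / lock / recheck-pmd / retry sequence in map_pte(). Below
is a minimal sketch of that sequence (not buildable on its own);
example_map() and its argument list are made up for illustration, while
pte_offset_map_rw_nolock(), pmdp_get_lockless(), pmd_same() and
pte_unmap_unlock() are the kernel helpers the patch itself uses.

static bool example_map(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, pte_t **ptep,
			pmd_t *pmdvalp, spinlock_t **ptlp)
{
again:
	/* Map the PTE page without its lock, recording the pmd value seen. */
	*ptep = pte_offset_map_rw_nolock(vma->vm_mm, pmd, addr, pmdvalp, ptlp);
	if (!*ptep)
		return false;

	spin_lock(*ptlp);
	/*
	 * No pte_same() check is done after taking the lock, so the only
	 * way to know the mapped PTE page is still the right one is to
	 * recheck that the pmd entry has not changed underneath us.
	 */
	if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(*ptep, *ptlp);
		goto again;	/* pmd changed: remap and retry */
	}

	return true;	/* *ptep stays stable as long as *ptlp is held */
}
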
> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
> In the caller of map_pte(), we may modify pvmw->pte after acquiring
> pvmw->ptl, so convert it to use pte_offset_map_rw_nolock(). Here, no
> pte_same() check is performed after pvmw->ptl is held, so we must
> record pmdval and do a pmd_same() check to ensure the stability of
> pvmw->pmd.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
> 1 file changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index ae5cc42aa2087..6410f29b37c1b 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
> return false;
> }
>
> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
> + spinlock_t **ptlp)
> {
> pte_t ptent;
> + pmd_t pmdval;
Why declare a new variable? Can’t we use *pmdvalp instead?
>
> if (pvmw->flags & PVMW_SYNC) {
> /* Use the stricter lookup */
> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
> return !!pvmw->pte;
> }
>
> +again:
> /*
> * It is important to return the ptl corresponding to pte,
> * in case *pvmw->pmd changes underneath us; so we need to
> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
> * proceeds to loop over next ptes, and finds a match later.
> * Though, in most cases, page lock already protects this.
> */
> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
> - pvmw->address, ptlp);
> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
> + pvmw->address, &pmdval, ptlp);
> if (!pvmw->pte)
> return false;
> + *pmdvalp = pmdval;
>
> ptent = ptep_get(pvmw->pte);
>
> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
> } else if (!pte_present(ptent)) {
> return false;
> }
> + spin_lock(*ptlp);
> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
> + pte_unmap_unlock(pvmw->pte, *ptlp);
> + goto again;
> + }
> pvmw->ptl = *ptlp;
> - spin_lock(pvmw->ptl);
> +
> return true;
> }
>
> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> step_forward(pvmw, PMD_SIZE);
> continue;
> }
> - if (!map_pte(pvmw, &ptl)) {
> + if (!map_pte(pvmw, &pmde, &ptl)) {
> if (!pvmw->pte)
> goto restart;
> goto next_pte;
> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> if (!pvmw->ptl) {
> pvmw->ptl = ptl;
> spin_lock(pvmw->ptl);
> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
> + pte_unmap_unlock(pvmw->pte, pvmw->ptl);
> + pvmw->ptl = NULL;
> + pvmw->pte = NULL;
> + goto restart;
> + }
> }
> goto this_pte;
> } while (pvmw->address < end);
> --
> 2.20.1
>
On 2024/9/24 16:25, Muchun Song wrote:
>
>
>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>> In the caller of map_pte(), we may modify pvmw->pte after acquiring
>> pvmw->ptl, so convert it to use pte_offset_map_rw_nolock(). Here, no
>> pte_same() check is performed after pvmw->ptl is held, so we must
>> record pmdval and do a pmd_same() check to ensure the stability of
>> pvmw->pmd.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
>> 1 file changed, 20 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>> index ae5cc42aa2087..6410f29b37c1b 100644
>> --- a/mm/page_vma_mapped.c
>> +++ b/mm/page_vma_mapped.c
>> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
>> return false;
>> }
>>
>> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
>> + spinlock_t **ptlp)
>> {
>> pte_t ptent;
>> + pmd_t pmdval;
>
> Why declare a new variable? Can’t we use *pmdvalp instead?
It's just a coding habit; both are fine for me.
>
>>
>> if (pvmw->flags & PVMW_SYNC) {
>> /* Use the stricter lookup */
>> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>> return !!pvmw->pte;
>> }
>>
>> +again:
>> /*
>> * It is important to return the ptl corresponding to pte,
>> * in case *pvmw->pmd changes underneath us; so we need to
>> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>> * proceeds to loop over next ptes, and finds a match later.
>> * Though, in most cases, page lock already protects this.
>> */
>> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>> - pvmw->address, ptlp);
>> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>> + pvmw->address, &pmdval, ptlp);
>> if (!pvmw->pte)
>> return false;
>> + *pmdvalp = pmdval;
>>
>> ptent = ptep_get(pvmw->pte);
>>
>> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>> } else if (!pte_present(ptent)) {
>> return false;
>> }
>> + spin_lock(*ptlp);
>> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
>> + pte_unmap_unlock(pvmw->pte, *ptlp);
>> + goto again;
>> + }
>> pvmw->ptl = *ptlp;
>> - spin_lock(pvmw->ptl);
>> +
>> return true;
>> }
>>
>> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>> step_forward(pvmw, PMD_SIZE);
>> continue;
>> }
>> - if (!map_pte(pvmw, &ptl)) {
>> + if (!map_pte(pvmw, &pmde, &ptl)) {
>> if (!pvmw->pte)
>> goto restart;
>> goto next_pte;
>> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>> if (!pvmw->ptl) {
>> pvmw->ptl = ptl;
>> spin_lock(pvmw->ptl);
>> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
>> + pte_unmap_unlock(pvmw->pte, pvmw->ptl);
>> + pvmw->ptl = NULL;
>> + pvmw->pte = NULL;
>> + goto restart;
>> + }
>> }
>> goto this_pte;
>> } while (pvmw->address < end);
>> --
>> 2.20.1
>>
> On Sep 24, 2024, at 16:33, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
>
>
> On 2024/9/24 16:25, Muchun Song wrote:
>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>
>>> In the caller of map_pte(), we may modify pvmw->pte after acquiring
>>> pvmw->ptl, so convert it to use pte_offset_map_rw_nolock(). Here, no
>>> pte_same() check is performed after pvmw->ptl is held, so we must
>>> record pmdval and do a pmd_same() check to ensure the stability of
>>> pvmw->pmd.
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>> mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
>>> 1 file changed, 20 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>>> index ae5cc42aa2087..6410f29b37c1b 100644
>>> --- a/mm/page_vma_mapped.c
>>> +++ b/mm/page_vma_mapped.c
>>> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
>>> return false;
>>> }
>>>
>>> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
>>> + spinlock_t **ptlp)
>>> {
>>> pte_t ptent;
>>> + pmd_t pmdval;
>> Why declare a new variable? Can’t we use *pmdvalp instead?
>
> It's just a coding habit; both are fine for me.
Agree. But sometimes it could make the code look a little simpler.
>
>>>
>>> if (pvmw->flags & PVMW_SYNC) {
>>> /* Use the stricter lookup */
>>> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>> return !!pvmw->pte;
>>> }
>>>
>>> +again:
>>> /*
>>> * It is important to return the ptl corresponding to pte,
>>> * in case *pvmw->pmd changes underneath us; so we need to
>>> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>> * proceeds to loop over next ptes, and finds a match later.
>>> * Though, in most cases, page lock already protects this.
>>> */
>>> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>>> - pvmw->address, ptlp);
>>> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>>> + pvmw->address, &pmdval, ptlp);
>>> if (!pvmw->pte)
>>> return false;
>>> + *pmdvalp = pmdval;
For instance, here, it is unnecessary if pmdvalp is passed directly to
pte_offset_map_rw_nolock.
>>>
>>> ptent = ptep_get(pvmw->pte);
>>>
>>> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>> } else if (!pte_present(ptent)) {
>>> return false;
>>> }
>>> + spin_lock(*ptlp);
>>> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
>>> + pte_unmap_unlock(pvmw->pte, *ptlp);
>>> + goto again;
>>> + }
>>> pvmw->ptl = *ptlp;
>>> - spin_lock(pvmw->ptl);
>>> +
>>> return true;
>>> }
>>>
>>> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>> step_forward(pvmw, PMD_SIZE);
>>> continue;
>>> }
>>> - if (!map_pte(pvmw, &ptl)) {
>>> + if (!map_pte(pvmw, &pmde, &ptl)) {
>>> if (!pvmw->pte)
>>> goto restart;
>>> goto next_pte;
>>> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>> if (!pvmw->ptl) {
>>> pvmw->ptl = ptl;
>>> spin_lock(pvmw->ptl);
>>> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
>>> + pte_unmap_unlock(pvmw->pte, pvmw->ptl);
>>> + pvmw->ptl = NULL;
>>> + pvmw->pte = NULL;
>>> + goto restart;
>>> + }
>>> }
>>> goto this_pte;
>>> } while (pvmw->address < end);
>>> --
>>> 2.20.1
On 2024/9/24 16:39, Muchun Song wrote:
>
>
>> On Sep 24, 2024, at 16:33, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>>
>>
>> On 2024/9/24 16:25, Muchun Song wrote:
>>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>>
>>>> In the caller of map_pte(), we may modify pvmw->pte after acquiring
>>>> pvmw->ptl, so convert it to use pte_offset_map_rw_nolock(). Here, no
>>>> pte_same() check is performed after pvmw->ptl is held, so we must
>>>> record pmdval and do a pmd_same() check to ensure the stability of
>>>> pvmw->pmd.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>> mm/page_vma_mapped.c | 25 ++++++++++++++++++++-----
>>>> 1 file changed, 20 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>>>> index ae5cc42aa2087..6410f29b37c1b 100644
>>>> --- a/mm/page_vma_mapped.c
>>>> +++ b/mm/page_vma_mapped.c
>>>> @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
>>>> return false;
>>>> }
>>>>
>>>> -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>>> +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
>>>> + spinlock_t **ptlp)
>>>> {
>>>> pte_t ptent;
>>>> + pmd_t pmdval;
>>> Why declare a new variable? Can’t we use *pmdvalp instead?
>>
>> It's just a coding habit; both are fine for me.
>
> Agree. But sometimes it could make the code look a little simpler.
>
>>
>>>>
>>>> if (pvmw->flags & PVMW_SYNC) {
>>>> /* Use the stricter lookup */
>>>> @@ -25,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>>> return !!pvmw->pte;
>>>> }
>>>>
>>>> +again:
>>>> /*
>>>> * It is important to return the ptl corresponding to pte,
>>>> * in case *pvmw->pmd changes underneath us; so we need to
>>>> @@ -32,10 +35,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>>> * proceeds to loop over next ptes, and finds a match later.
>>>> * Though, in most cases, page lock already protects this.
>>>> */
>>>> - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>>>> - pvmw->address, ptlp);
>>>> + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
>>>> + pvmw->address, &pmdval, ptlp);
>>>> if (!pvmw->pte)
>>>> return false;
>>>> + *pmdvalp = pmdval;
>
> For instance, here, it is unnecessary if pmdvalp is passed directly to
> pte_offset_map_rw_nolock.
OK, will use pmdvalp directly. ;)
>
>>>>
>>>> ptent = ptep_get(pvmw->pte);
>>>>
>>>> @@ -67,8 +71,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
>>>> } else if (!pte_present(ptent)) {
>>>> return false;
>>>> }
>>>> + spin_lock(*ptlp);
>>>> + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
>>>> + pte_unmap_unlock(pvmw->pte, *ptlp);
>>>> + goto again;
>>>> + }
>>>> pvmw->ptl = *ptlp;
>>>> - spin_lock(pvmw->ptl);
>>>> +
>>>> return true;
>>>> }
>>>>
>>>> @@ -278,7 +287,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>>> step_forward(pvmw, PMD_SIZE);
>>>> continue;
>>>> }
>>>> - if (!map_pte(pvmw, &ptl)) {
>>>> + if (!map_pte(pvmw, &pmde, &ptl)) {
>>>> if (!pvmw->pte)
>>>> goto restart;
>>>> goto next_pte;
>>>> @@ -307,6 +316,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>>> if (!pvmw->ptl) {
>>>> pvmw->ptl = ptl;
>>>> spin_lock(pvmw->ptl);
>>>> + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
>>>> + pte_unmap_unlock(pvmw->pte, pvmw->ptl);
>>>> + pvmw->ptl = NULL;
>>>> + pvmw->pte = NULL;
>>>> + goto restart;
>>>> + }
>>>> }
>>>> goto this_pte;
>>>> } while (pvmw->address < end);
>>>> --
>>>> 2.20.1
>
>
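
Going by the exchange above, the v2 of this patch would presumably drop the
local pmdval and pass pmdvalp straight through to pte_offset_map_rw_nolock().
The following is only a rough sketch of what that simplification might look
like in map_pte(), not the actual v2; the elided parts (marked with comments)
are unchanged from the patch at the top of the thread.

static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
		    spinlock_t **ptlp)
{
	pte_t ptent;

	/* ... PVMW_SYNC handling as in the patch ... */

again:
	/* Fill *pmdvalp directly; no local pmdval variable is needed. */
	pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
					     pvmw->address, pmdvalp, ptlp);
	if (!pvmw->pte)
		return false;

	ptent = ptep_get(pvmw->pte);

	/* ... migration/device-private/!pte_present() checks as in the patch ... */

	spin_lock(*ptlp);
	if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) {
		pte_unmap_unlock(pvmw->pte, *ptlp);
		goto again;
	}
	pvmw->ptl = *ptlp;

	return true;
}
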