In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
acquiring the ptl, so convert it to use pte_offset_map_rw_nolock(). Since
no pte_same() check is performed after the PTL is taken here, we record
pgt_pmd and do a pmd_same() check once the PTL is held.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/khugepaged.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6498721d4783a..8ab79c13d077f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
 		pml = pmd_lock(mm, pmd);
 
-	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
 	if (!start_pte)	/* mmap_lock + page lock should prevent this */
 		goto abort;
 	if (!pml)
@@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	else if (ptl != pml)
 		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
+	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+		goto abort;
+
 	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		nr_ptes++;
 	}
 
-	pte_unmap(start_pte);
 	if (!pml)
 		spin_unlock(ptl);
 
@@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	/* step 4: remove empty page table */
 	if (!pml) {
 		pml = pmd_lock(mm, pmd);
-		if (ptl != pml)
+		if (ptl != pml) {
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+				spin_unlock(pml);
+				goto abort;
+			}
+		}
 	}
 	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	pmdp_get_lockless_sync();
 	if (ptl != pml)
 		spin_unlock(ptl);
+	pte_unmap(start_pte);
 	spin_unlock(pml);
 
 	mmu_notifier_invalidate_range_end(&range);
--
2.20.1
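
For context, here is a minimal sketch of the usage pattern the patch converts to:
map the PTE table with pte_offset_map_rw_nolock(), take the PTE lock, and
revalidate the recorded pmd value with pmd_same() before modifying any entries.
The function name and surroundings below are invented for illustration and are
not part of the patch.

#include <linux/mm.h>

/* Illustrative only; not taken from mm/khugepaged.c. */
static bool modify_pte_entries(struct mm_struct *mm, pmd_t *pmd,
			       unsigned long addr)
{
	pmd_t pmdval;
	spinlock_t *ptl;
	pte_t *pte;

	/* Map the PTE table without taking ptl, recording *pmd in pmdval. */
	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return false;

	spin_lock(ptl);
	/*
	 * The PTE table may have been freed and *pmd changed between the
	 * lockless map above and taking ptl, so revalidate before writing.
	 */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return false;
	}

	/* It is now safe to modify the pte entries (and the pmd). */

	pte_unmap_unlock(pte, ptl);
	return true;
}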
> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
> this time, the pte_same() check is not performed after the PTL held. So we
> should get pgt_pmd and do pmd_same() check after the ptl held.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> mm/khugepaged.c | 14 +++++++++++---
> 1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6498721d4783a..8ab79c13d077f 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
> pml = pmd_lock(mm, pmd);
>
> - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
> + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
> if (!start_pte) /* mmap_lock + page lock should prevent this */
> goto abort;
> if (!pml)
> @@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> else if (ptl != pml)
> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>
> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
> + goto abort;
> +
> /* step 2: clear page table and adjust rmap */
> for (i = 0, addr = haddr, pte = start_pte;
> i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> @@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> nr_ptes++;
> }
>
> - pte_unmap(start_pte);
> if (!pml)
> spin_unlock(ptl);
>
> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> /* step 4: remove empty page table */
> if (!pml) {
> pml = pmd_lock(mm, pmd);
> - if (ptl != pml)
> + if (ptl != pml) {
> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
> + spin_unlock(pml);
> + goto abort;
This would drop the folio reference and the mm counter twice: once in step 3 and again at the abort label.
> + }
> + }
> }
> pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
> pmdp_get_lockless_sync();
> if (ptl != pml)
> spin_unlock(ptl);
> + pte_unmap(start_pte);
> spin_unlock(pml);
Why not?
	pte_unmap_unlock(start_pte, ptl);
	if (pml != ptl)
		spin_unlock(pml);
>
> mmu_notifier_invalidate_range_end(&range);
> --
> 2.20.1
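
To see why the comment about a double drop applies: in collapse_pte_mapped_thp(),
step 3 and the abort label both use nr_ptes to undo the per-PTE folio references
and the mm counter. Paraphrased from memory (not an exact quote of the kernel
source), the structure is roughly:

	/* step 3: set proper refcount and mm_counters. */
	if (nr_ptes) {
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}

	/* step 4: remove empty page table */
	/* ... a "goto abort" taken here would run the block below again ... */

abort:
	if (nr_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}

So jumping to abort from the new step-4 check, after step 3 has already run,
would subtract the references and the counter a second time.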
On 2024/9/24 15:14, Muchun Song wrote:
>
>
>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>> this time, the pte_same() check is not performed after the PTL held. So we
>> should get pgt_pmd and do pmd_same() check after the ptl held.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> mm/khugepaged.c | 14 +++++++++++---
>> 1 file changed, 11 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 6498721d4783a..8ab79c13d077f 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>> if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>> pml = pmd_lock(mm, pmd);
>>
>> - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>> + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>> if (!start_pte) /* mmap_lock + page lock should prevent this */
>> goto abort;
>> if (!pml)
>> @@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>> else if (ptl != pml)
>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>
>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>> + goto abort;
>> +
>> /* step 2: clear page table and adjust rmap */
>> for (i = 0, addr = haddr, pte = start_pte;
>> i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> @@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>> nr_ptes++;
>> }
>>
>> - pte_unmap(start_pte);
>> if (!pml)
>> spin_unlock(ptl);
>>
>> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>> /* step 4: remove empty page table */
>> if (!pml) {
>> pml = pmd_lock(mm, pmd);
>> - if (ptl != pml)
>> + if (ptl != pml) {
>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>> + spin_unlock(pml);
>> + goto abort;
>
> Drop the reference of folio and the mm counter twice at the label of abort and the step 3.
My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?
>
>> + }
>> + }
>> }
>> pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
>> pmdp_get_lockless_sync();
>> if (ptl != pml)
>> spin_unlock(ptl);
>> + pte_unmap(start_pte);
>> spin_unlock(pml);
>
> Why not?
>
> pte_unmap_unlock(start_pte, ptl);
> if (pml != ptl)
> spin_unlock(pml);
Both are fine, will do.
Thanks,
Qi
>
>>
>> mmu_notifier_invalidate_range_end(&range);
>> --
>> 2.20.1
> On Sep 24, 2024, at 15:29, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
>
>
> On 2024/9/24 15:14, Muchun Song wrote:
>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>> this time, the pte_same() check is not performed after the PTL held. So we
>>> should get pgt_pmd and do pmd_same() check after the ptl held.
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>> mm/khugepaged.c | 14 +++++++++++---
>>> 1 file changed, 11 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 6498721d4783a..8ab79c13d077f 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>> if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>> pml = pmd_lock(mm, pmd);
>>>
>>> - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>> + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>> if (!start_pte) /* mmap_lock + page lock should prevent this */
>>> goto abort;
>>> if (!pml)
>>> @@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>> else if (ptl != pml)
>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>
>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>> + goto abort;
>>> +
>>> /* step 2: clear page table and adjust rmap */
>>> for (i = 0, addr = haddr, pte = start_pte;
>>> i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>> @@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>> nr_ptes++;
>>> }
>>>
>>> - pte_unmap(start_pte);
>>> if (!pml)
>>> spin_unlock(ptl);
>>>
>>> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>> /* step 4: remove empty page table */
>>> if (!pml) {
>>> pml = pmd_lock(mm, pmd);
>>> - if (ptl != pml)
>>> + if (ptl != pml) {
>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>> + spin_unlock(pml);
>>> + goto abort;
>> Drop the reference of folio and the mm counter twice at the label of abort and the step 3.
>
> My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?
Or add a new label "out" just below "abort", then go to out.
>
>>> + }
>>> + }
>>> }
>>> pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
>>> pmdp_get_lockless_sync();
>>> if (ptl != pml)
>>> spin_unlock(ptl);
>>> + pte_unmap(start_pte);
>>> spin_unlock(pml);
>> Why not?
>> pte_unmap_unlock(start_pte, ptl);
>> if (pml != ptl)
>> spin_unlock(pml);
>
> Both are fine, will do.
>
> Thanks,
> Qi
>
>>>
>>> mmu_notifier_invalidate_range_end(&range);
>>> --
>>> 2.20.1
On 2024/9/24 16:52, Muchun Song wrote:
>
>
>> On Sep 24, 2024, at 15:29, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>>
>>
>> On 2024/9/24 15:14, Muchun Song wrote:
>>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>>> this time, the pte_same() check is not performed after the PTL held. So we
>>>> should get pgt_pmd and do pmd_same() check after the ptl held.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>> mm/khugepaged.c | 14 +++++++++++---
>>>> 1 file changed, 11 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>> index 6498721d4783a..8ab79c13d077f 100644
>>>> --- a/mm/khugepaged.c
>>>> +++ b/mm/khugepaged.c
>>>> @@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>> if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>>> pml = pmd_lock(mm, pmd);
>>>>
>>>> - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>>> + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>>> if (!start_pte) /* mmap_lock + page lock should prevent this */
>>>> goto abort;
>>>> if (!pml)
>>>> @@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>> else if (ptl != pml)
>>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>
>>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>>> + goto abort;
>>>> +
>>>> /* step 2: clear page table and adjust rmap */
>>>> for (i = 0, addr = haddr, pte = start_pte;
>>>> i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>>> @@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>> nr_ptes++;
>>>> }
>>>>
>>>> - pte_unmap(start_pte);
>>>> if (!pml)
>>>> spin_unlock(ptl);
>>>>
>>>> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>> /* step 4: remove empty page table */
>>>> if (!pml) {
>>>> pml = pmd_lock(mm, pmd);
>>>> - if (ptl != pml)
>>>> + if (ptl != pml) {
>>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>>> + spin_unlock(pml);
>>>> + goto abort;
>>> Drop the reference of folio and the mm counter twice at the label of abort and the step 3.
>>
>> My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?
>
> Or add a new label "out" just below the "abort". Then go to out.
In that case, we also need to call flush_tlb_mm() first, like the
following:

	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
		spin_unlock(pml);
		flush_tlb_mm(mm);
		goto out;
	}
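
Put together, the step-4 failure path and the cleanup labels might end up
roughly as below. This is an illustrative sketch, not the final patch: the
"out" label follows Muchun's suggestion, and here pml is released by the
shared code under "out" rather than in the branch itself, so each lock is
dropped exactly once.

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml) {
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
				/*
				 * Step 3 already dropped the folio refs and
				 * the mm counter; only the TLB flush that
				 * pmdp_collapse_flush() would have done is
				 * still needed, so do it and skip the
				 * nr_ptes block under "abort".
				 */
				flush_tlb_mm(mm);
				goto out;
			}
		}
	}
	/* ... collapse succeeds: flush the pmd, unlock, notify ... */

abort:
	if (nr_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}
out:
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);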
>
>>
>>>> + }
>>>> + }
>>>> }
>>>> pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
>>>> pmdp_get_lockless_sync();
>>>> if (ptl != pml)
>>>> spin_unlock(ptl);
>>>> + pte_unmap(start_pte);
>>>> spin_unlock(pml);
>>> Why not?
>>> pte_unmap_unlock(start_pte, ptl);
>>> if (pml != ptl)
>>> spin_unlock(pml);
>>
>> Both are fine, will do.
>>
>> Thanks,
>> Qi
>>
>>>>
>>>> mmu_notifier_invalidate_range_end(&range);
>>>> --
>>>> 2.20.1
>
>
> On Sep 24, 2024, at 16:57, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
>
>
> On 2024/9/24 16:52, Muchun Song wrote:
>>> On Sep 24, 2024, at 15:29, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>
>>>
>>>
>>> On 2024/9/24 15:14, Muchun Song wrote:
>>>>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>>>> this time, the pte_same() check is not performed after the PTL held. So we
>>>>> should get pgt_pmd and do pmd_same() check after the ptl held.
>>>>>
>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>> ---
>>>>> mm/khugepaged.c | 14 +++++++++++---
>>>>> 1 file changed, 11 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>> index 6498721d4783a..8ab79c13d077f 100644
>>>>> --- a/mm/khugepaged.c
>>>>> +++ b/mm/khugepaged.c
>>>>> @@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>> if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>>>> pml = pmd_lock(mm, pmd);
>>>>>
>>>>> - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>>>> + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>>>> if (!start_pte) /* mmap_lock + page lock should prevent this */
>>>>> goto abort;
>>>>> if (!pml)
>>>>> @@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>> else if (ptl != pml)
>>>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>>
>>>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>>>> + goto abort;
>>>>> +
>>>>> /* step 2: clear page table and adjust rmap */
>>>>> for (i = 0, addr = haddr, pte = start_pte;
>>>>> i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>>>> @@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>> nr_ptes++;
>>>>> }
>>>>>
>>>>> - pte_unmap(start_pte);
>>>>> if (!pml)
>>>>> spin_unlock(ptl);
>>>>>
>>>>> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>> /* step 4: remove empty page table */
>>>>> if (!pml) {
>>>>> pml = pmd_lock(mm, pmd);
>>>>> - if (ptl != pml)
>>>>> + if (ptl != pml) {
>>>>> spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>> + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>>>> + spin_unlock(pml);
>>>>> + goto abort;
>>>> Drop the reference of folio and the mm counter twice at the label of abort and the step 3.
>>>
>>> My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?
>> Or add a new label "out" just below the "abort". Then go to out.
>
> For this way, we also need to call flush_tlb_mm() first, like the
> following:
>
> if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
> spin_unlock(pml);
> flush_tlb_mm(mm);
> goto out;
> }
Fine.
>
>>>
>>>>> + }
>>>>> + }
>>>>> }
>>>>> pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
>>>>> pmdp_get_lockless_sync();
>>>>> if (ptl != pml)
>>>>> spin_unlock(ptl);
>>>>> + pte_unmap(start_pte);
>>>>> spin_unlock(pml);
>>>> Why not?
>>>> pte_unmap_unlock(start_pte, ptl);
>>>> if (pml != ptl)
>>>> spin_unlock(pml);
>>>
>>> Both are fine, will do.
>>>
>>> Thanks,
>>> Qi
>>>
>>>>>
>>>>> mmu_notifier_invalidate_range_end(&range);
>>>>> --
>>>>> 2.20.1