In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
acquiring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
this time, the pte_same() check is not performed after the PTL is held,
so we should get pgt_pmd and do a pmd_same() check after the PTL is held.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/khugepaged.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6498721d4783a..8ab79c13d077f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1605,7 +1605,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
 		pml = pmd_lock(mm, pmd);
 
-	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
 	if (!start_pte)		/* mmap_lock + page lock should prevent this */
 		goto abort;
 	if (!pml)
@@ -1613,6 +1613,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	else if (ptl != pml)
 		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
+	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+		goto abort;
+
 	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1645,7 +1648,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		nr_ptes++;
 	}
 
-	pte_unmap(start_pte);
 	if (!pml)
 		spin_unlock(ptl);
 
@@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	/* step 4: remove empty page table */
 	if (!pml) {
 		pml = pmd_lock(mm, pmd);
-		if (ptl != pml)
+		if (ptl != pml) {
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+				spin_unlock(pml);
+				goto abort;
+			}
+		}
 	}
 	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	pmdp_get_lockless_sync();
 	if (ptl != pml)
 		spin_unlock(ptl);
+	pte_unmap(start_pte);
 	spin_unlock(pml);
 
 	mmu_notifier_invalidate_range_end(&range);
--
2.20.1
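
For context, the pattern the commit message relies on boils down to the following minimal sketch (not the real collapse_pte_mapped_thp(), which also has to juggle the pmd lock pml as shown in the diff above): map the PTE page without its lock, remember the pmd value that was seen, then re-validate that value once the lock is actually held.

	pmd_t pmdval;
	spinlock_t *ptl;
	pte_t *pte;

	/* Map the PTE page without taking its lock; pmdval records the pmd we saw. */
	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return;			/* page table already gone */

	spin_lock(ptl);
	/* The pmd may have changed (e.g. been collapsed) before we got the lock. */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return;
	}
	/* Now it is safe to modify the pte entries (and later the pmd) under ptl. */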
> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
> [full patch quoted; trimmed here to the hunk under discussion]
>
> @@ -1658,13 +1660,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>  	/* step 4: remove empty page table */
>  	if (!pml) {
>  		pml = pmd_lock(mm, pmd);
> -		if (ptl != pml)
> +		if (ptl != pml) {
>  			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
> +				spin_unlock(pml);
> +				goto abort;

This drops the folio reference and the mm counter twice: once in step 3
and again at the abort label.

> +			}
> +		}
>  	}
>  	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
>  	pmdp_get_lockless_sync();
>  	if (ptl != pml)
>  		spin_unlock(ptl);
> +	pte_unmap(start_pte);
>  	spin_unlock(pml);

Why not?

	pte_unmap_unlock(start_pte, ptl);
	if (pml != ptl)
		spin_unlock(pml);

>
>  	mmu_notifier_invalidate_range_end(&range);
> --
> 2.20.1
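
For readers without the full function at hand, the double drop pointed out above comes from step 3 and the abort path subtracting the same counts. Roughly (paraphrased from the surrounding kernel code of that era, not quoted from the patch itself):

	/* step 3: set proper refcount and mm_counters. */
	if (nr_ptes) {
		folio_ref_sub(folio, nr_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}
	...
abort:
	if (nr_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_ptes);	/* subtracted a second time if we branch here after step 3 */
		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
	}

So jumping to abort from step 4 with nr_ptes still non-zero would drop the folio reference and the mm counter twice.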
On 2024/9/24 15:14, Muchun Song wrote:
>> On Sep 24, 2024, at 14:11, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>> [patch quoted; trimmed to the lines under discussion]
>>
>> +			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>> +				spin_unlock(pml);
>> +				goto abort;
>
> This drops the folio reference and the mm counter twice: once in step 3
> and again at the abort label.

My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?

>> +	pte_unmap(start_pte);
>>  	spin_unlock(pml);
>
> Why not?
>
> 	pte_unmap_unlock(start_pte, ptl);
> 	if (pml != ptl)
> 		spin_unlock(pml);

Both are fine, will do.

Thanks,
Qi
> On Sep 24, 2024, at 15:29, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
> On 2024/9/24 15:14, Muchun Song wrote:
>
>> [earlier quotes trimmed]
>>
>> This drops the folio reference and the mm counter twice: once in step 3
>> and again at the abort label.
>
> My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?

Or add a new label "out" just below the "abort". Then go to out.
On 2024/9/24 16:52, Muchun Song wrote:
>> On Sep 24, 2024, at 15:29, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>> [earlier quotes trimmed]
>>
>> My bad, should set nr_ptes to 0 and call flush_tlb_mm() here, right?
>
> Or add a new label "out" just below the "abort". Then go to out.

In that case, we also need to call flush_tlb_mm() first, like the
following:

	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
		spin_unlock(pml);
		flush_tlb_mm(mm);
		goto out;
	}
> On Sep 24, 2024, at 16:57, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
> On 2024/9/24 16:52, Muchun Song wrote:
>
>> [earlier quotes trimmed]
>>
>> Or add a new label "out" just below the "abort". Then go to out.
>
> In that case, we also need to call flush_tlb_mm() first, like the
> following:
>
> 	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
> 		spin_unlock(pml);
> 		flush_tlb_mm(mm);
> 		goto out;
> 	}

Fine.
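
Putting the thread together, the revised step 4 bail-out and unlock order would look roughly like this (a sketch assembled from the replies above; the exact placement of the new "out" label below "abort" is an assumption here, not a posted v2):

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml) {
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
				spin_unlock(pml);
				/* ptes were already cleared in step 2, so the TLB flush is still needed */
				flush_tlb_mm(mm);
				/* "out" assumed to sit below abort's nr_ptes handling, avoiding the double drop */
				goto out;
			}
		}
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	pte_unmap_unlock(start_pte, ptl);
	if (pml != ptl)
		spin_unlock(pml);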