Currently, if a PTE points to a FS DAX page, vm_normal_page() will
return NULL, as these pages have their own special refcounting scheme. A
future change will allow FS DAX pages to be refcounted the same as any
other normal page.

Therefore vm_normal_page() will start returning FS DAX pages. To avoid
any change in behaviour, callers that don't expect FS DAX pages will
need to explicitly check for this. As vm_normal_page() can already
return ZONE_DEVICE pages, most callers already include a check for any
ZONE_DEVICE page.

However, some callers don't, so add explicit checks where required.
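For illustration, every hunk below applies the same pattern - treat a
DAX page returned by vm_normal_page()/vm_normal_folio() as if no normal
page had been returned (sketch only, using the helper named in this
series):

	page = vm_normal_page(vma, addr, ptent);
	if (page && is_device_dax_page(page))
		page = NULL;	/* preserve the old "not a normal page" behaviour */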
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
arch/x86/mm/pat/memtype.c | 4 +++-
fs/proc/task_mmu.c | 16 ++++++++++++----
mm/memcontrol-v1.c | 2 +-
3 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 1fa0bf6..eb84593 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
resource_size_t *phys)
{
+ struct folio *folio;
pte_t *ptep, pte;
spinlock_t *ptl;
@@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
pte = ptep_get(ptep);
/* Never return PFNs of anon folios in COW mappings. */
- if (vm_normal_folio(vma, vma->vm_start, pte)) {
+ folio = vm_normal_folio(vma, vma->vm_start, pte);
+ if (folio || (folio && !folio_is_device_dax(folio))) {
pte_unmap_unlock(ptep, ptl);
return -EINVAL;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5f171ad..456b010 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
+ if (page && is_device_dax_page(page))
+ page = NULL;
young = pte_young(ptent);
dirty = pte_dirty(ptent);
present = true;
@@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
+ if (page && is_device_dax_page(page))
+ page = NULL;
present = true;
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
return false;
folio = vm_normal_folio(vma, addr, pte);
- if (!folio)
+ if (!folio || folio_is_device_dax(folio))
return false;
return folio_maybe_dma_pinned(folio);
}
@@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
frame = pte_pfn(pte);
flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
+ if (page && is_device_dax_page(page))
+ page = NULL;
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
if (pte_uffd_wp(pte))
@@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page(vma, addr, pte);
- if (page && !PageAnon(page))
+ if (page && !PageAnon(page) &&
+ !is_device_dax_page(page))
categories |= PAGE_IS_FILE;
}
@@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page_pmd(vma, addr, pmd);
- if (page && !PageAnon(page))
+ if (page && !PageAnon(page) &&
+ !is_device_dax_page(page))
categories |= PAGE_IS_FILE;
}
@@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
return NULL;
page = vm_normal_page_pmd(vma, addr, pmd);
- if (!page)
+ if (!page || is_device_dax_page(page))
return NULL;
if (PageReserved(page))
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index b37c0d8..e16053c 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
{
struct page *page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_device_dax_page(page))
return NULL;
if (PageAnon(page)) {
if (!(mc.flags & MOVE_ANON))
--
git-series 0.9.1
Alistair Popple wrote:
> Currently, if a PTE points to a FS DAX page, vm_normal_page() will
> return NULL, as these pages have their own special refcounting scheme. A
> future change will allow FS DAX pages to be refcounted the same as any
> other normal page.
>
> Therefore vm_normal_page() will start returning FS DAX pages. To avoid
> any change in behaviour, callers that don't expect FS DAX pages will
> need to explicitly check for this. As vm_normal_page() can already
> return ZONE_DEVICE pages, most callers already include a check for any
> ZONE_DEVICE page.
>
> However, some callers don't, so add explicit checks where required.

I would expect justification for each of these conversions, and
hopefully with fsdax returning fully formed folios there is less need to
sprinkle these checks around.

At a minimum I think this patch needs to be broken up by file touched.

> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> ---
> arch/x86/mm/pat/memtype.c | 4 +++-
> fs/proc/task_mmu.c | 16 ++++++++++++----
> mm/memcontrol-v1.c | 2 +-
> 3 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> index 1fa0bf6..eb84593 100644
> --- a/arch/x86/mm/pat/memtype.c
> +++ b/arch/x86/mm/pat/memtype.c
> @@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
> static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
> resource_size_t *phys)
> {
> + struct folio *folio;
> pte_t *ptep, pte;
> spinlock_t *ptl;
>
> @@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
> pte = ptep_get(ptep);
>
> /* Never return PFNs of anon folios in COW mappings. */
> - if (vm_normal_folio(vma, vma->vm_start, pte)) {
> + folio = vm_normal_folio(vma, vma->vm_start, pte);
> + if (folio || (folio && !folio_is_device_dax(folio))) {

...for example, I do not immediately see why follow_phys() would need to
be careful with fsdax pages?

...but I do see why copy_page_range() (which calls follow_phys() through
track_pfn_copy()) might care. It just turns out that vma_needs_copy(),
afaics, bypasses dax MAP_SHARED mappings.

So this touch of memtype.c looks like it can be dropped.

> pte_unmap_unlock(ptep, ptl);
> return -EINVAL;
> }
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 5f171ad..456b010 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>
> if (pte_present(ptent)) {
> page = vm_normal_page(vma, addr, ptent);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> young = pte_young(ptent);
> dirty = pte_dirty(ptent);
> present = true;
> @@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>
> if (pmd_present(*pmd)) {
> page = vm_normal_page_pmd(vma, addr, *pmd);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> present = true;
> } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
> swp_entry_t entry = pmd_to_swp_entry(*pmd);

The above can be replaced with a catch like

if (folio_test_device(folio))
return;

...in smaps_account() since ZONE_DEVICE pages are not suitable to
account as they do not reflect any memory pressure on the system memory
pool.
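A minimal sketch of where that catch could sit, assuming smaps_account()
keeps roughly its current shape and reading folio_test_device() as the
existing folio_is_zone_device() helper (illustrative only, not part of
the posted series):

	static void smaps_account(struct mem_size_stats *mss, struct page *page,
				  bool compound, bool young, bool dirty,
				  bool locked, bool present)
	{
		struct folio *folio = page_folio(page);

		/*
		 * ZONE_DEVICE pages do not reflect pressure on the system
		 * memory pool, so skip accounting them here rather than
		 * filtering at every vm_normal_page() call site.
		 */
		if (folio_is_zone_device(folio))
			return;

		/* ... existing accounting logic unchanged ... */
	}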
> @@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
> if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
> return false;
> folio = vm_normal_folio(vma, addr, pte);
> - if (!folio)
> + if (!folio || folio_is_device_dax(folio))
> return false;
> return folio_maybe_dma_pinned(folio);

The whole point of ZONE_DEVICE is to account for DMA, so I see no reason
for pte_is_pinned() to special-case dax. The caller of pte_is_pinned()
is doing it for soft_dirty reasons, and I believe soft_dirty is already
disabled for vma_is_dax(). I assume MEMORY_DEVICE_PRIVATE also does not
support soft-dirty, so I expect all ZONE_DEVICE already opt out of this.

> }
> @@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
> frame = pte_pfn(pte);
> flags |= PM_PRESENT;
> page = vm_normal_page(vma, addr, pte);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> if (pte_soft_dirty(pte))
> flags |= PM_SOFT_DIRTY;
> if (pte_uffd_wp(pte))
> @@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
>
> if (p->masks_of_interest & PAGE_IS_FILE) {
> page = vm_normal_page(vma, addr, pte);
> - if (page && !PageAnon(page))
> + if (page && !PageAnon(page) &&
> + !is_device_dax_page(page))
> categories |= PAGE_IS_FILE;
> }
>
> @@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
>
> if (p->masks_of_interest & PAGE_IS_FILE) {
> page = vm_normal_page_pmd(vma, addr, pmd);
> - if (page && !PageAnon(page))
> + if (page && !PageAnon(page) &&
> + !is_device_dax_page(page))
> categories |= PAGE_IS_FILE;
> }
>
> @@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
> return NULL;
>
> page = vm_normal_page_pmd(vma, addr, pmd);
> - if (!page)
> + if (!page || is_device_dax_page(page))
> return NULL;

I am not immediately seeing a reason to block pagemap_read() from
interrogating dax-backed virtual mappings. I think these protections can
be dropped.

>
> if (PageReserved(page))
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d8..e16053c 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
> {
> struct page *page = vm_normal_page(vma, addr, ptent);
>
> - if (!page)
> + if (!page || is_device_dax_page(page))
> return NULL;
> if (PageAnon(page)) {
> if (!(mc.flags & MOVE_ANON))

I think this is better handled with something like this to disable all
memcg accounting for ZONE_DEVICE pages:

diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index b37c0d870816..cfc43e8c59fe 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -940,8 +940,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
- if (folio_is_device_private(folio) ||
- folio_is_device_coherent(folio))
+ if (folio_is_device(folio))
ret = MC_TARGET_DEVICE;
if (target)
target->folio = folio;
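For reference, folio_is_device() in the suggestion above does not appear
to be an existing helper name (folio_is_zone_device(),
folio_is_device_private() and folio_is_device_coherent() are). Read as a
catch-all for any ZONE_DEVICE folio it would presumably be a thin
wrapper along these lines (hypothetical sketch only):

	/* Hypothetical catch-all for any ZONE_DEVICE-backed folio. */
	static inline bool folio_is_device(const struct folio *folio)
	{
		return folio_is_zone_device(folio);
	}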
Dan Williams <dan.j.williams@intel.com> writes:
> Alistair Popple wrote:
>> Currently, if a PTE points to a FS DAX page, vm_normal_page() will
>> return NULL, as these pages have their own special refcounting scheme. A
>> future change will allow FS DAX pages to be refcounted the same as any
>> other normal page.
>>
>> Therefore vm_normal_page() will start returning FS DAX pages. To avoid
>> any change in behaviour, callers that don't expect FS DAX pages will
>> need to explicitly check for this. As vm_normal_page() can already
>> return ZONE_DEVICE pages, most callers already include a check for any
>> ZONE_DEVICE page.
>>
>> However, some callers don't, so add explicit checks where required.
>
> I would expect justification for each of these conversions, and
> hopefully with fsdax returning fully formed folios there is less need to
> sprinkle these checks around.
>
> At a minimum I think this patch needs to be broken up by file touched.

Good idea.

>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> ---
>> arch/x86/mm/pat/memtype.c | 4 +++-
>> fs/proc/task_mmu.c | 16 ++++++++++++----
>> mm/memcontrol-v1.c | 2 +-
>> 3 files changed, 16 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
>> index 1fa0bf6..eb84593 100644
>> --- a/arch/x86/mm/pat/memtype.c
>> +++ b/arch/x86/mm/pat/memtype.c
>> @@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
>> static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
>> resource_size_t *phys)
>> {
>> + struct folio *folio;
>> pte_t *ptep, pte;
>> spinlock_t *ptl;
>>
>> @@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
>> pte = ptep_get(ptep);
>>
>> /* Never return PFNs of anon folios in COW mappings. */
>> - if (vm_normal_folio(vma, vma->vm_start, pte)) {
>> + folio = vm_normal_folio(vma, vma->vm_start, pte);
>> + if (folio || (folio && !folio_is_device_dax(folio))) {
>
> ...for example, I do not immediately see why follow_phys() would need to
> be careful with fsdax pages?

The intent was to maintain the original behaviour as much as possible,
partly to reduce the chance of unintended bugs/consequences and partly
to maintain my sanity by not having to dig too deeply into all the
callers. I see I got this a little bit wrong, though - it only filters
FSDAX pages and not device DAX (my intent was to filter both).

> ...but I do see why copy_page_range() (which calls follow_phys() through
> track_pfn_copy()) might care. It just turns out that vma_needs_copy(),
> afaics, bypasses dax MAP_SHARED mappings.
>
> So this touch of memtype.c looks like it can be dropped.

Ok. Although it feels safer to leave it (along with a check for device
DAX). Someone can always remove it in the future if they really do want
DAX pages, but this is all x86-specific so I will take your guidance
here.
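To make the "got this a little bit wrong" point concrete: as posted,
"folio || (folio && !folio_is_device_dax(folio))" reduces to just
"folio", so the new filter never fires. Filtering both device DAX and
FS DAX, as intended, would presumably look more like the following
sketch (folio_is_fsdax() is a placeholder name for an FS DAX test, not
a helper from the posted series):

	folio = vm_normal_folio(vma, vma->vm_start, pte);
	/*
	 * DAX folios keep the old behaviour: vm_normal_folio() used to
	 * return NULL for them, so don't refuse to report their PFN.
	 */
	if (folio && !folio_is_device_dax(folio) && !folio_is_fsdax(folio)) {
		pte_unmap_unlock(ptep, ptl);
		return -EINVAL;
	}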
>> pte_unmap_unlock(ptep, ptl);
>> return -EINVAL;
>> }
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 5f171ad..456b010 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>>
>> if (pte_present(ptent)) {
>> page = vm_normal_page(vma, addr, ptent);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> young = pte_young(ptent);
>> dirty = pte_dirty(ptent);
>> present = true;
>> @@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>>
>> if (pmd_present(*pmd)) {
>> page = vm_normal_page_pmd(vma, addr, *pmd);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> present = true;
>> } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
>> swp_entry_t entry = pmd_to_swp_entry(*pmd);
>
> The above can be replaced with a catch like
>
> if (folio_test_device(folio))
> return;
>
> ...in smaps_account() since ZONE_DEVICE pages are not suitable to
> account as they do not reflect any memory pressure on the system memory
> pool.

Sounds good.

>> @@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
>> if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
>> return false;
>> folio = vm_normal_folio(vma, addr, pte);
>> - if (!folio)
>> + if (!folio || folio_is_device_dax(folio))
>> return false;
>> return folio_maybe_dma_pinned(folio);
>
> The whole point of ZONE_DEVICE is to account for DMA, so I see no reason
> for pte_is_pinned() to special-case dax. The caller of pte_is_pinned()
> is doing it for soft_dirty reasons, and I believe soft_dirty is already
> disabled for vma_is_dax(). I assume MEMORY_DEVICE_PRIVATE also does not
> support soft-dirty, so I expect all ZONE_DEVICE already opt out of this.

Actually, soft-dirty is theoretically supported on DEVICE_PRIVATE pages
in the sense that the soft-dirty bits are copied around. Whether or not
it actually works is a different question, though; I've certainly never
tried it. Again, I was just trying to maintain previous behaviour, but I
can drop this check.

>> }
>> @@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
>> frame = pte_pfn(pte);
>> flags |= PM_PRESENT;
>> page = vm_normal_page(vma, addr, pte);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> if (pte_soft_dirty(pte))
>> flags |= PM_SOFT_DIRTY;
>> if (pte_uffd_wp(pte))
>> @@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
>>
>> if (p->masks_of_interest & PAGE_IS_FILE) {
>> page = vm_normal_page(vma, addr, pte);
>> - if (page && !PageAnon(page))
>> + if (page && !PageAnon(page) &&
>> + !is_device_dax_page(page))
>> categories |= PAGE_IS_FILE;
>> }
>>
>> @@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
>>
>> if (p->masks_of_interest & PAGE_IS_FILE) {
>> page = vm_normal_page_pmd(vma, addr, pmd);
>> - if (page && !PageAnon(page))
>> + if (page && !PageAnon(page) &&
>> + !is_device_dax_page(page))
>> categories |= PAGE_IS_FILE;
>> }
>>
>> @@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
>> return NULL;
>>
>> page = vm_normal_page_pmd(vma, addr, pmd);
>> - if (!page)
>> + if (!page || is_device_dax_page(page))
>> return NULL;
>
> I am not immediately seeing a reason to block pagemap_read() from
> interrogating dax-backed virtual mappings. I think these protections can
> be dropped.

Ok.
>>
>> if (PageReserved(page))
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index b37c0d8..e16053c 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>> {
>> struct page *page = vm_normal_page(vma, addr, ptent);
>>
>> - if (!page)
>> + if (!page || is_device_dax_page(page))
>> return NULL;
>> if (PageAnon(page)) {
>> if (!(mc.flags & MOVE_ANON))
>
> I think this is better handled with something like this to disable all
> memcg accounting for ZONE_DEVICE pages:

Ok, thanks for the review.

> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d870816..cfc43e8c59fe 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -940,8 +940,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
> */
> if (folio_memcg(folio) == mc.from) {
> ret = MC_TARGET_PAGE;
> - if (folio_is_device_private(folio) ||
> - folio_is_device_coherent(folio))
> + if (folio_is_device(folio))
> ret = MC_TARGET_DEVICE;
> if (target)
> target->folio = folio;