Currently, if a PTE points to a FS DAX page, vm_normal_page() will
return NULL because these pages have their own special refcounting
scheme. A future change will allow FS DAX pages to be refcounted the
same as any other normal page.

Therefore vm_normal_page() will start returning FS DAX pages. To avoid
any change in behaviour, callers that don't expect FS DAX pages will
need to check for this explicitly. As vm_normal_page() can already
return ZONE_DEVICE pages, most callers already include a check for any
ZONE_DEVICE page.

However, some callers don't, so add explicit checks where required.
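
For example (the pattern used in the fs/proc/task_mmu.c hunks below), a
caller that cannot handle DAX pages simply treats them as if
vm_normal_page() had returned NULL:

	page = vm_normal_page(vma, addr, ptent);
	if (page && is_device_dax_page(page))
		page = NULL;
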
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
arch/x86/mm/pat/memtype.c | 4 +++-
fs/proc/task_mmu.c | 16 ++++++++++++----
mm/memcontrol-v1.c | 2 +-
3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 1fa0bf6..eb84593 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
resource_size_t *phys)
{
+ struct folio *folio;
pte_t *ptep, pte;
spinlock_t *ptl;

@@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
pte = ptep_get(ptep);

/* Never return PFNs of anon folios in COW mappings. */
- if (vm_normal_folio(vma, vma->vm_start, pte)) {
+ folio = vm_normal_folio(vma, vma->vm_start, pte);
+ if (folio || (folio && !folio_is_device_dax(folio))) {
pte_unmap_unlock(ptep, ptl);
return -EINVAL;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5f171ad..456b010 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,

if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
+ if (page && is_device_dax_page(page))
+ page = NULL;
young = pte_young(ptent);
dirty = pte_dirty(ptent);
present = true;
@@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,

if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
+ if (page && is_device_dax_page(page))
+ page = NULL;
present = true;
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
return false;
folio = vm_normal_folio(vma, addr, pte);
- if (!folio)
+ if (!folio || folio_is_device_dax(folio))
return false;
return folio_maybe_dma_pinned(folio);
}
@@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
frame = pte_pfn(pte);
flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
+ if (page && is_device_dax_page(page))
+ page = NULL;
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
if (pte_uffd_wp(pte))
@@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,

if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page(vma, addr, pte);
- if (page && !PageAnon(page))
+ if (page && !PageAnon(page) &&
+ !is_device_dax_page(page))
categories |= PAGE_IS_FILE;
}

@@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,

if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page_pmd(vma, addr, pmd);
- if (page && !PageAnon(page))
+ if (page && !PageAnon(page) &&
+ !is_device_dax_page(page))
categories |= PAGE_IS_FILE;
}

@@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
return NULL;

page = vm_normal_page_pmd(vma, addr, pmd);
- if (!page)
+ if (!page || is_device_dax_page(page))
return NULL;

if (PageReserved(page))
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index b37c0d8..e16053c 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
{
struct page *page = vm_normal_page(vma, addr, ptent);

- if (!page)
+ if (!page || is_device_dax_page(page))
return NULL;
if (PageAnon(page)) {
if (!(mc.flags & MOVE_ANON))
--
git-series 0.9.1
Alistair Popple wrote:
> Currently, if a PTE points to a FS DAX page, vm_normal_page() will
> return NULL because these pages have their own special refcounting
> scheme. A future change will allow FS DAX pages to be refcounted the
> same as any other normal page.
>
> Therefore vm_normal_page() will start returning FS DAX pages. To avoid
> any change in behaviour, callers that don't expect FS DAX pages will
> need to check for this explicitly. As vm_normal_page() can already
> return ZONE_DEVICE pages, most callers already include a check for any
> ZONE_DEVICE page.
>
> However, some callers don't, so add explicit checks where required.
I would expect justification for each of these conversions, and
hopefully with fsdax returning fully formed folios there is less need to
sprinkle these checks around.

At a minimum I think this patch needs to be broken up by file touched.
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> ---
> arch/x86/mm/pat/memtype.c | 4 +++-
> fs/proc/task_mmu.c | 16 ++++++++++++----
> mm/memcontrol-v1.c | 2 +-
> 3 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> index 1fa0bf6..eb84593 100644
> --- a/arch/x86/mm/pat/memtype.c
> +++ b/arch/x86/mm/pat/memtype.c
> @@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
> static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
> resource_size_t *phys)
> {
> + struct folio *folio;
> pte_t *ptep, pte;
> spinlock_t *ptl;
>
> @@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
> pte = ptep_get(ptep);
>
> /* Never return PFNs of anon folios in COW mappings. */
> - if (vm_normal_folio(vma, vma->vm_start, pte)) {
> + folio = vm_normal_folio(vma, vma->vm_start, pte);
> + if (folio || (folio && !folio_is_device_dax(folio))) {
...for example, I do not immediately see why follow_phys() would need to
be careful with fsdax pages?

...but I do see why copy_page_range() (which calls follow_phys() through
track_pfn_copy()) might care. It just turns out that vma_needs_copy(),
afaics, bypasses dax MAP_SHARED mappings.

So this touch of memtype.c looks like it can be dropped.
> pte_unmap_unlock(ptep, ptl);
> return -EINVAL;
> }
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 5f171ad..456b010 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>
> if (pte_present(ptent)) {
> page = vm_normal_page(vma, addr, ptent);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> young = pte_young(ptent);
> dirty = pte_dirty(ptent);
> present = true;
> @@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>
> if (pmd_present(*pmd)) {
> page = vm_normal_page_pmd(vma, addr, *pmd);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> present = true;
> } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
> swp_entry_t entry = pmd_to_swp_entry(*pmd);
The above can be replaced with a catch like:

	if (folio_test_device(folio))
		return;

...in smaps_account() since ZONE_DEVICE pages are not suitable to
account as they do not reflect any memory pressure on the system memory
pool.
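
In context, roughly (an untested sketch; this assumes smaps_account()
still resolves the folio as its first step, and folio_test_device() here
stands in for folio_is_zone_device()):

	static void smaps_account(struct mem_size_stats *mss, struct page *page,
				  bool compound, bool young, bool dirty,
				  bool locked, bool present)
	{
		struct folio *folio = page_folio(page);

		/*
		 * ZONE_DEVICE memory is never on the LRU and does not
		 * reflect pressure on the system memory pool, so skip
		 * accounting it here.
		 */
		if (folio_test_device(folio))
			return;
		...
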
> @@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
> if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
> return false;
> folio = vm_normal_folio(vma, addr, pte);
> - if (!folio)
> + if (!folio || folio_is_device_dax(folio))
> return false;
> return folio_maybe_dma_pinned(folio);
The whole point of ZONE_DEVICE is to account for DMA so I see no reason
for pte_is_pinned() to special case dax. The caller of pte_is_pinned()
is doing it for soft_dirty reasons, and I believe soft_dirty is already
disabled for vma_is_dax(). I assume MEMORY_DEVICE_PRIVATE also does not
support soft-dirty, so I expect all ZONE_DEVICE already opt-out of this.
> }
> @@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
> frame = pte_pfn(pte);
> flags |= PM_PRESENT;
> page = vm_normal_page(vma, addr, pte);
> + if (page && is_device_dax_page(page))
> + page = NULL;
> if (pte_soft_dirty(pte))
> flags |= PM_SOFT_DIRTY;
> if (pte_uffd_wp(pte))
> @@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
>
> if (p->masks_of_interest & PAGE_IS_FILE) {
> page = vm_normal_page(vma, addr, pte);
> - if (page && !PageAnon(page))
> + if (page && !PageAnon(page) &&
> + !is_device_dax_page(page))
> categories |= PAGE_IS_FILE;
> }
>
> @@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
>
> if (p->masks_of_interest & PAGE_IS_FILE) {
> page = vm_normal_page_pmd(vma, addr, pmd);
> - if (page && !PageAnon(page))
> + if (page && !PageAnon(page) &&
> + !is_device_dax_page(page))
> categories |= PAGE_IS_FILE;
> }
>
> @@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
> return NULL;
>
> page = vm_normal_page_pmd(vma, addr, pmd);
> - if (!page)
> + if (!page || is_device_dax_page(page))
> return NULL;
I am not immediately seeing a reason to block pagemap_read() from
interrogating dax-backed virtual mappings. I think these protections can
be dropped.
>
> if (PageReserved(page))
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d8..e16053c 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
> {
> struct page *page = vm_normal_page(vma, addr, ptent);
>
> - if (!page)
> + if (!page || is_device_dax_page(page))
> return NULL;
> if (PageAnon(page)) {
> if (!(mc.flags & MOVE_ANON))
I think this is better handled with something like this to disable all
memcg accounting for ZONE_DEVICE pages:

diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index b37c0d870816..cfc43e8c59fe 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -940,8 +940,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
- if (folio_is_device_private(folio) ||
- folio_is_device_coherent(folio))
+ if (folio_is_device(folio))
ret = MC_TARGET_DEVICE;
if (target)
target->folio = folio;
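
(folio_is_device() is assumed above; if no such catch-all helper exists
yet it could be a trivial wrapper, something like:

	/* catch-all for any ZONE_DEVICE folio, regardless of type */
	static inline bool folio_is_device(const struct folio *folio)
	{
		return folio_is_zone_device(folio);
	}

...rather than open-coding the device-private/coherent/dax checks at
each call site.)
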
Dan Williams <dan.j.williams@intel.com> writes:
> Alistair Popple wrote:
>> Currently, if a PTE points to a FS DAX page, vm_normal_page() will
>> return NULL because these pages have their own special refcounting
>> scheme. A future change will allow FS DAX pages to be refcounted the
>> same as any other normal page.
>>
>> Therefore vm_normal_page() will start returning FS DAX pages. To avoid
>> any change in behaviour, callers that don't expect FS DAX pages will
>> need to check for this explicitly. As vm_normal_page() can already
>> return ZONE_DEVICE pages, most callers already include a check for any
>> ZONE_DEVICE page.
>>
>> However, some callers don't, so add explicit checks where required.
>
> I would expect justification for each of these conversions, and
> hopefully with fsdax returning fully formed folios there is less need to
> sprinkle these checks around.
>
> At a minimum I think this patch needs to be broken up by file touched.
Good idea.
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> ---
>> arch/x86/mm/pat/memtype.c | 4 +++-
>> fs/proc/task_mmu.c | 16 ++++++++++++----
>> mm/memcontrol-v1.c | 2 +-
>> 3 files changed, 16 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
>> index 1fa0bf6..eb84593 100644
>> --- a/arch/x86/mm/pat/memtype.c
>> +++ b/arch/x86/mm/pat/memtype.c
>> @@ -951,6 +951,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
>> static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
>> resource_size_t *phys)
>> {
>> + struct folio *folio;
>> pte_t *ptep, pte;
>> spinlock_t *ptl;
>>
>> @@ -960,7 +961,8 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
>> pte = ptep_get(ptep);
>>
>> /* Never return PFNs of anon folios in COW mappings. */
>> - if (vm_normal_folio(vma, vma->vm_start, pte)) {
>> + folio = vm_normal_folio(vma, vma->vm_start, pte);
>> + if (folio || (folio && !folio_is_device_dax(folio))) {
>
> ...for example, I do not immediately see why follow_phys() would need to
> be careful with fsdax pages?
The intent was to maintain the original behaviour as much as
possible, partly to reduce the chance of unintended bugs/consequences
and partly to maintain my sanity by not having to dig too deeply into
all the callers.

I see I got this a little bit wrong though: it only filters FSDAX pages
and not device DAX (my intent was to filter both).
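
What I had intended was something more like this (untested, and
assuming a folio_is_fsdax() helper alongside folio_is_device_dax()):

	folio = vm_normal_folio(vma, vma->vm_start, pte);
	/* only normal (non-DAX) folios should trigger -EINVAL here */
	if (folio && !folio_is_fsdax(folio) && !folio_is_device_dax(folio)) {
		pte_unmap_unlock(ptep, ptl);
		return -EINVAL;
	}
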
> ...but I do see why copy_page_range() (which calls follow_phys() through
> track_pfn_copy()) might care. It just turns out that vma_needs_copy(),
> afaics, bypasses dax MAP_SHARED mappings.
>
> So this touch of memtype.c looks like it can be dropped.
Ok, although it feels safer to leave it (along with a check for device
DAX). Someone can always remove it in future if they really do want DAX
pages, but this is all x86-specific so I will take your guidance here.
>> pte_unmap_unlock(ptep, ptl);
>> return -EINVAL;
>> }
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 5f171ad..456b010 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -816,6 +816,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>>
>> if (pte_present(ptent)) {
>> page = vm_normal_page(vma, addr, ptent);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> young = pte_young(ptent);
>> dirty = pte_dirty(ptent);
>> present = true;
>> @@ -864,6 +866,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>>
>> if (pmd_present(*pmd)) {
>> page = vm_normal_page_pmd(vma, addr, *pmd);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> present = true;
>> } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
>> swp_entry_t entry = pmd_to_swp_entry(*pmd);
>
> The above can be replaced with a catch like
>
> if (folio_test_device(folio))
> return;
>
> ...in smaps_account() since ZONE_DEVICE pages are not suitable to
> account as they do not reflect any memory pressure on the system memory
> pool.
Sounds good.
>> @@ -1385,7 +1389,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
>> if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
>> return false;
>> folio = vm_normal_folio(vma, addr, pte);
>> - if (!folio)
>> + if (!folio || folio_is_device_dax(folio))
>> return false;
>> return folio_maybe_dma_pinned(folio);
>
> The whole point of ZONE_DEVICE is to account for DMA so I see no reason
> for pte_is_pinned() to special case dax. The caller of pte_is_pinned()
> is doing it for soft_dirty reasons, and I believe soft_dirty is already
> disabled for vma_is_dax(). I assume MEMORY_DEVICE_PRIVATE also does not
> support soft-dirty, so I expect all ZONE_DEVICE already opt-out of this.
Actually, soft-dirty is theoretically supported on DEVICE_PRIVATE pages
in the sense that the soft-dirty bits are copied around. Whether or not
it actually works is a different question though; I've certainly never
tried it.

Again, I was just trying to maintain the previous behaviour, but I can
drop this check.
>> }
>> @@ -1710,6 +1714,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
>> frame = pte_pfn(pte);
>> flags |= PM_PRESENT;
>> page = vm_normal_page(vma, addr, pte);
>> + if (page && is_device_dax_page(page))
>> + page = NULL;
>> if (pte_soft_dirty(pte))
>> flags |= PM_SOFT_DIRTY;
>> if (pte_uffd_wp(pte))
>> @@ -2096,7 +2102,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
>>
>> if (p->masks_of_interest & PAGE_IS_FILE) {
>> page = vm_normal_page(vma, addr, pte);
>> - if (page && !PageAnon(page))
>> + if (page && !PageAnon(page) &&
>> + !is_device_dax_page(page))
>> categories |= PAGE_IS_FILE;
>> }
>>
>> @@ -2158,7 +2165,8 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
>>
>> if (p->masks_of_interest & PAGE_IS_FILE) {
>> page = vm_normal_page_pmd(vma, addr, pmd);
>> - if (page && !PageAnon(page))
>> + if (page && !PageAnon(page) &&
>> + !is_device_dax_page(page))
>> categories |= PAGE_IS_FILE;
>> }
>>
>> @@ -2919,7 +2927,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
>> return NULL;
>>
>> page = vm_normal_page_pmd(vma, addr, pmd);
>> - if (!page)
>> + if (!page || is_device_dax_page(page))
>> return NULL;
>
> I am not immediately seeing a reason to block pagemap_read() from
> interrogating dax-backed virtual mappings. I think these protections can
> be dropped.
Ok.
>>
>> if (PageReserved(page))
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index b37c0d8..e16053c 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -667,7 +667,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>> {
>> struct page *page = vm_normal_page(vma, addr, ptent);
>>
>> - if (!page)
>> + if (!page || is_device_dax_page(page))
>> return NULL;
>> if (PageAnon(page)) {
>> if (!(mc.flags & MOVE_ANON))
>
> I think this is better handled with something like this to disable all
> memcg accounting for ZONE_DEVICE pages:
Ok, thanks for the review.
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d870816..cfc43e8c59fe 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -940,8 +940,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
> */
> if (folio_memcg(folio) == mc.from) {
> ret = MC_TARGET_PAGE;
> - if (folio_is_device_private(folio) ||
> - folio_is_device_coherent(folio))
> + if (folio_is_device(folio))
> ret = MC_TARGET_DEVICE;
> if (target)
> target->folio = folio;