Extend core huge page management functions to handle device-private THP
entries. This enables proper handling of large device-private folios in
fundamental MM operations.
The following functions have been updated:
- copy_huge_pmd(): Handle device-private entries during fork/clone
- zap_huge_pmd(): Properly free device-private THP during munmap
- change_huge_pmd(): Support protection changes on device-private THP
- __pte_offset_map(): Add device-private entry awareness
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
include/linux/swapops.h | 32 +++++++++++++++++++++++
mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++-------
mm/pgtable-generic.c | 2 +-
3 files changed, 80 insertions(+), 10 deletions(-)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..2687928a8146 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+/**
+ * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
+ * @pmd: The PMD to check
+ *
+ * Returns true if the PMD contains a swap entry that represents a device private
+ * page mapping. This is used for ZONE_DEVICE private pages that have been
+ * migrated to device memory but still need special handling during various
+ * memory management operations.
+ *
+ * Return: 1 if PMD contains device private entry, 0 otherwise
+ */
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+ return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+ return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
static inline int non_swap_entry(swp_entry_t entry)
{
return swp_type(entry) >= MAX_SWAPFILES;
}
+static inline int is_pmd_non_present_folio_entry(pmd_t pmd)
+{
+ return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd);
+}
+
#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1b81680b4225..8e0a1747762d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1703,17 +1703,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (!is_readable_migration_entry(entry)) {
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd));
+
+ if (is_writable_migration_entry(entry) ||
+ is_readable_exclusive_migration_entry(entry)) {
+ entry = make_readable_migration_entry(swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
if (pmd_swp_uffd_wp(*src_pmd))
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
+ } else if (is_device_private_entry(entry)) {
+ /*
+ * For device private entries, since there are no
+ * read exclusive entries, writable = !readable
+ */
+ if (is_writable_device_private_entry(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+
+ src_folio = pfn_swap_entry_folio(entry);
+ VM_WARN_ON(!folio_test_large(src_folio));
+
+ folio_get(src_folio);
+ /*
+ * folio_try_dup_anon_rmap_pmd does not fail for
+ * device private entries.
+ */
+ folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
+ dst_vma, src_vma);
}
+
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -2211,15 +2239,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_remove_rmap_pmd(folio, page, vma);
WARN_ON_ONCE(folio_mapcount(folio) < 0);
VM_BUG_ON_PAGE(!PageHead(page), page);
- } else if (thp_migration_supported()) {
+ } else if (is_pmd_non_present_folio_entry(orig_pmd)) {
swp_entry_t entry;
- VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
- } else
- WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (!thp_migration_supported())
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+ }
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
@@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_mark_accessed(folio);
}
+ if (folio_is_device_private(folio)) {
+ folio_remove_rmap_pmd(folio, &folio->page, vma);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
+ folio_put(folio);
+ }
+
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2367,7 +2402,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
- VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd));
if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
@@ -2380,6 +2415,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ } else if (is_writable_device_private_entry(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
} else {
newpmd = *pmd;
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..0c847cdf4fd3 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
if (pmdvalp)
*pmdvalp = pmdval;
- if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
goto nomap;
if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
--
2.51.0
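
For reference, a minimal kernel-context sketch (not part of the patch; it only
uses predicates visible in the hunks above, such as pmd_none(), pmd_trans_huge(),
is_pmd_migration_entry() and the new is_pmd_device_private_entry()) of how the
new helpers compose:

/* Illustrative sketch only -- not part of the patch above. */
static const char *huge_pmd_kind(pmd_t pmdval)
{
	if (pmd_none(pmdval))
		return "none";
	if (is_pmd_migration_entry(pmdval))
		return "non-present: THP under migration";
	if (is_pmd_device_private_entry(pmdval))
		return "non-present: device-private THP (folio lives in device memory)";
	if (pmd_trans_huge(pmdval))
		return "present THP";
	/* is_pmd_non_present_folio_entry() is the OR of the two swap-entry cases above */
	return "present page table or other entry";
}

is_pmd_non_present_folio_entry() is simply the OR of the two swap-entry cases,
which is what copy_huge_pmd(), zap_huge_pmd() and change_huge_pmd() now assert
via VM_WARN_ON() in the hunks above.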
On Wed, Oct 1, 2025 at 4:20 PM Balbir Singh <balbirs@nvidia.com> wrote:
>
> Extend core huge page management functions to handle device-private THP
> entries. This enables proper handling of large device-private folios in
> fundamental MM operations.
>
> [...]
>
> @@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> folio_mark_accessed(folio);
> }
>
> + if (folio_is_device_private(folio)) {
> + folio_remove_rmap_pmd(folio, &folio->page, vma);
> + WARN_ON_ONCE(folio_mapcount(folio) < 0);
> + folio_put(folio);
> + }
IIUC, a device-private THP is always anonymous, right? would it make sense
to move this folio_is_device_private() block inside the folio_test_anon()
check above?
> +
>
> [...]
On 10/13/25 02:46, Lance Yang wrote:
> On Wed, Oct 1, 2025 at 4:20 PM Balbir Singh <balbirs@nvidia.com> wrote:
>>
>> Extend core huge page management functions to handle device-private THP
>> entries. This enables proper handling of large device-private folios in
>> fundamental MM operations.
>>
>> [...]
>>
>> @@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>> folio_mark_accessed(folio);
>> }
>>
>> + if (folio_is_device_private(folio)) {
>> + folio_remove_rmap_pmd(folio, &folio->page, vma);
>> + WARN_ON_ONCE(folio_mapcount(folio) < 0);
>> + folio_put(folio);
>> + }
>
> IIUC, a device-private THP is always anonymous, right? would it make sense
> to move this folio_is_device_private() block inside the folio_test_anon()
> check above?
>
Yes, they are, there is discussion on file-backed mapping at
https://lwn.net/Articles/1016124/. I don't see a benefit from moving it, do you?
Balbir
[...]
On 2025/10/13 08:01, Balbir Singh wrote:
> On 10/13/25 02:46, Lance Yang wrote:
>> On Wed, Oct 1, 2025 at 4:20 PM Balbir Singh <balbirs@nvidia.com> wrote:
>>>
>>> Extend core huge page management functions to handle device-private THP
>>> entries. This enables proper handling of large device-private folios in
>>> fundamental MM operations.
>>>
>>> [...]
>>>
>>> @@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>> folio_mark_accessed(folio);
>>> }
>>>
>>> + if (folio_is_device_private(folio)) {
>>> + folio_remove_rmap_pmd(folio, &folio->page, vma);
>>> + WARN_ON_ONCE(folio_mapcount(folio) < 0);
>>> + folio_put(folio);
>>> + }
>>
>> IIUC, a device-private THP is always anonymous, right? would it make sense
>> to move this folio_is_device_private() block inside the folio_test_anon()
>> check above?
>>
> Yes, they are, there is discussion on file-backed mapping at
> https://lwn.net/Articles/1016124/. I don't see a benefit from moving it, do you?
Ah, I see. Never mind :)
Cheers,
Lance
This patch triggers a regression for s390x kvm as qemu guests can no longer start

error: kvm run failed Cannot allocate memory
PSW=mask 0000000180000000 addr 000000007fd00600
R00=0000000000000000 R01=0000000000000000 R02=0000000000000000 R03=0000000000000000
R04=0000000000000000 R05=0000000000000000 R06=0000000000000000 R07=0000000000000000
R08=0000000000000000 R09=0000000000000000 R10=0000000000000000 R11=0000000000000000
R12=0000000000000000 R13=0000000000000000 R14=0000000000000000 R15=0000000000000000
C00=00000000000000e0 C01=0000000000000000 C02=0000000000000000 C03=0000000000000000
C04=0000000000000000 C05=0000000000000000 C06=0000000000000000 C07=0000000000000000
C08=0000000000000000 C09=0000000000000000 C10=0000000000000000 C11=0000000000000000
C12=0000000000000000 C13=0000000000000000 C14=00000000c2000000 C15=0000000000000000

KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?

Christian Borntraeger
On 17.10.25 16:49, Christian Borntraeger wrote:
> This patch triggers a regression for s390x kvm as qemu guests can no longer start
>
> error: kvm run failed Cannot allocate memory
> PSW=mask 0000000180000000 addr 000000007fd00600
> [...]
>
> KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?

Only when running KVM guests and apart from that everything else seems to be fine?

That's weird :)

--
Cheers

David / dhildenb
Am 17.10.25 um 16:54 schrieb David Hildenbrand:
> On 17.10.25 16:49, Christian Borntraeger wrote:
>> This patch triggers a regression for s390x kvm as qemu guests can no longer start
>>
>> error: kvm run failed Cannot allocate memory
>> PSW=mask 0000000180000000 addr 000000007fd00600
>> [...]
>>
>> KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?
>
> Only when running KVM guests and apart from that everything else seems to be fine?

We have other weirdness in linux-next but in different areas. Could that somehow be
related to us disabling THP for the kvm address space?
On 17.10.25 17:01, Christian Borntraeger wrote:
> Am 17.10.25 um 16:54 schrieb David Hildenbrand:
>> On 17.10.25 16:49, Christian Borntraeger wrote:
>>> This patch triggers a regression for s390x kvm as qemu guests can no longer start
>>> [...]
>>> KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?
>>
>> Only when running KVM guests and apart from that everything else seems to be fine?
>
> We have other weirdness in linux-next but in different areas. Could that somehow be
> related to us disabling THP for the kvm address space?

Not sure ... it's a bit weird. I mean, when KVM disables THPs we essentially just remap everything to be mapped by PTEs. So there shouldn't be any PMDs in that whole process.

Remapping a file THP (shmem) implies zapping the THP completely.

I assume your kernel config has CONFIG_ZONE_DEVICE and CONFIG_ARCH_ENABLE_THP_MIGRATION set, right?

I'd rule out copy_huge_pmd(), zap_huge_pmd() as well.

What happens if you revert the change in mm/pgtable-generic.c?

But the whole -ENOMEM error is a weird symptom.

--
Cheers

David / dhildenb
Am 17.10.25 um 17:07 schrieb David Hildenbrand:
> On 17.10.25 17:01, Christian Borntraeger wrote:
>> Am 17.10.25 um 16:54 schrieb David Hildenbrand:
>>> On 17.10.25 16:49, Christian Borntraeger wrote:
>>>> This patch triggers a regression for s390x kvm as qemu guests can no longer start
>>>>
>>>> error: kvm run failed Cannot allocate memory
>>>> PSW=mask 0000000180000000 addr 000000007fd00600
>>>> R00=0000000000000000 R01=0000000000000000 R02=0000000000000000 R03=0000000000000000
>>>> R04=0000000000000000 R05=0000000000000000 R06=0000000000000000 R07=0000000000000000
>>>> R08=0000000000000000 R09=0000000000000000 R10=0000000000000000 R11=0000000000000000
>>>> R12=0000000000000000 R13=0000000000000000 R14=0000000000000000 R15=0000000000000000
>>>> C00=00000000000000e0 C01=0000000000000000 C02=0000000000000000 C03=0000000000000000
>>>> C04=0000000000000000 C05=0000000000000000 C06=0000000000000000 C07=0000000000000000
>>>> C08=0000000000000000 C09=0000000000000000 C10=0000000000000000 C11=0000000000000000
>>>> C12=0000000000000000 C13=0000000000000000 C14=00000000c2000000 C15=0000000000000000
>>>>
>>>> KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?
>>>
>>> Only when running KVM guests and apart from that everything else seems to be fine?
>>
>> We have other weirdness in linux-next but in different areas. Could that somehow be
>> related to us disabling THP for the kvm address space?
>
> Not sure ... it's a bit weird. I mean, when KVM disables THPs we essentially just remap everything to be mapped by PTEs. So there shouldn't be any PMDs in that whole process.
>
> Remapping a file THP (shmem) implies zapping the THP completely.
>
>
> I assume your kernel config has CONFIG_ZONE_DEVICE and CONFIG_ARCH_ENABLE_THP_MIGRATION set, right?
yes.
>
> I'd rule out copy_huge_pmd(), zap_huge_pmd() as well.
>
>
> What happens if you revert the change in mm/pgtable-generic.c?
That partial revert seems to fix the issue
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c847cdf4fd3..567e2d084071 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
if (pmdvalp)
*pmdvalp = pmdval;
- if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
+ if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
goto nomap;
if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
On 17.10.25 17:20, Christian Borntraeger wrote:
> Am 17.10.25 um 17:07 schrieb David Hildenbrand:
>> [...]
>> What happens if you revert the change in mm/pgtable-generic.c?
>
> That partial revert seems to fix the issue
>
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 0c847cdf4fd3..567e2d084071 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>
>         if (pmdvalp)
>                 *pmdvalp = pmdval;
> -       if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
> +       if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>                 goto nomap;
>         if (unlikely(pmd_trans_huge(pmdval)))
>                 goto nomap;

Okay, but that means that effectively we stumble over a PMD entry that is not
a migration entry but still non-present.

And I would expect that it's a page table, because otherwise the change
wouldn't make a difference.

And the weird thing is that this only triggers sometimes, because if it would
always trigger nothing would ever work.

Is there some weird scenario where s390x might set a left page table mapped
in a PMD to non-present?

Staring at the definition of pmd_present() on s390x it's really just

	return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0;

Maybe this is happening in the gmap code only and not actually in the core-mm code?

--
Cheers

David / dhildenb
On 10/18/25 04:07, David Hildenbrand wrote:
> On 17.10.25 17:20, Christian Borntraeger wrote:
>> [...]
>> That partial revert seems to fix the issue
>> [...]
>
> Okay, but that means that effectively we stumble over a PMD entry that is not
> a migration entry but still non-present.
>
> And I would expect that it's a page table, because otherwise the change
> wouldn't make a difference.
>
> Is there some weird scenario where s390x might set a left page table mapped
> in a PMD to non-present?
>
> Staring at the definition of pmd_present() on s390x it's really just
>
> 	return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0;
>
> Maybe this is happening in the gmap code only and not actually in the core-mm code?

I am not an s390 expert, but just looking at the code:

So the check on s390 is effectively

	segment_entry/present = false or segment_entry_empty/invalid = true

Given that the revert works, the check changes to

	segment_entry/present = false or pmd_migration_entry (PAGE_INVALID | PAGE_PROTECT)

So it isn't the first check of segment_entry/present = false. Sounds like for
s390 we would want __pte_offset_map to allow mappings with
segment_entry_empty/invalid entries?

Any chance we can get the stack trace and a dump of the PMD entry when the
issue occurs?

In the meanwhile, does this fix/workaround work?

diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c847cdf4fd3..31c1754d5bd4 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)

        if (pmdvalp)
                *pmdvalp = pmdval;
-       if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
+       if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
                goto nomap;
        if (unlikely(pmd_trans_huge(pmdval)))
                goto nomap;

Thanks David and Christian!
Balbir
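
For reference, a small self-contained sketch of the behavioural difference being
discussed; PRESENT_BIT and MIGRATION_BIT are made-up placeholder values, not the
real s390 _SEGMENT_ENTRY_* definitions:

#include <stdbool.h>
#include <stdio.h>

#define PRESENT_BIT	0x1UL	/* placeholder for the arch "present" bit */
#define MIGRATION_BIT	0x2UL	/* placeholder for a migration swap-entry encoding */

static bool pmd_none_sk(unsigned long v)		{ return v == 0; }
static bool pmd_present_sk(unsigned long v)		{ return v & PRESENT_BIT; }
static bool is_pmd_migration_entry_sk(unsigned long v)	{ return v & MIGRATION_BIT; }

/* Old ___pte_offset_map() test: bail out only for empty PMDs and migration entries. */
static bool old_nomap(unsigned long v)
{
	return pmd_none_sk(v) || is_pmd_migration_entry_sk(v);
}

/* New test: bail out for anything the architecture reports as non-present. */
static bool new_nomap(unsigned long v)
{
	return pmd_none_sk(v) || !pmd_present_sk(v);
}

int main(void)
{
	/* gmap-style entry: points at a page table but the present bit is not set */
	unsigned long gmap_pmd = 0x1000UL;

	printf("old check bails out: %d\n", old_nomap(gmap_pmd));	/* 0: walk continues */
	printf("new check bails out: %d\n", new_nomap(gmap_pmd));	/* 1: goto nomap */
	return 0;
}

With the new test such an entry takes the nomap path, which is one plausible way
to end up at the "Cannot allocate memory" seen in the KVM report above.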
Am 17.10.25 um 23:56 schrieb Balbir Singh:
> In the meanwhile, does this fix/workaround work?
>
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 0c847cdf4fd3..31c1754d5bd4 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>
>         if (pmdvalp)
>                 *pmdvalp = pmdval;
> -       if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
> +       if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
>                 goto nomap;
>         if (unlikely(pmd_trans_huge(pmdval)))
>                 goto nomap;
>

Yes, this seems to work.

CC Claudio.
On 20.10.25 09:00, Christian Borntraeger wrote:
> Am 17.10.25 um 23:56 schrieb Balbir Singh:
>
>> In the meanwhile, does this fix/workaround work?
>>
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 0c847cdf4fd3..31c1754d5bd4 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>
>>         if (pmdvalp)
>>                 *pmdvalp = pmdval;
>> -       if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>> +       if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
>>                 goto nomap;
>>         if (unlikely(pmd_trans_huge(pmdval)))
>>                 goto nomap;
>>
>
> Yes, this seems to work.

Right, but that's not what we will want here. We'll have to adjust s390x
gmap code (which is getting redesigned either way) to only take the page
lock.

In the end, we'll want here later a single

	if (!pmd_present(pmdval))
		goto nomap;

--
Cheers

David / dhildenb
On Mon, 20 Oct 2025 10:41:28 +0200
David Hildenbrand <david@redhat.com> wrote:
> On 20.10.25 09:00, Christian Borntraeger wrote:
> > Am 17.10.25 um 23:56 schrieb Balbir Singh:
> >
> >> In the meanwhile, does this fix/workaround work?
> >>
> >> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> >> index 0c847cdf4fd3..31c1754d5bd4 100644
> >> --- a/mm/pgtable-generic.c
> >> +++ b/mm/pgtable-generic.c
> >> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
> >>
> >> if (pmdvalp)
> >> *pmdvalp = pmdval;
> >> - if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
> >> + if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
> >> goto nomap;
> >> if (unlikely(pmd_trans_huge(pmdval)))
> >> goto nomap;
> >>
> >
> > Yes, this seems to work.
>
> Right, but that's not what we will want here. We'll have to adjust s390x
> gmap code (which is getting redesigned either way) to only take the page
> lock.
>
> In the end, we'll want here later a single
>
> if (!pmd_present(pmdval))
> goto nomap;
>
this seems to do the trick:
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 8ff6bba107e8..22c448b32340 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
					| _SEGMENT_ENTRY_GMAP_UC
| _SEGMENT_ENTRY;
} else
- *table = pmd_val(*pmd) &
- _SEGMENT_ENTRY_HARDWARE_BITS;
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS)
+ | _SEGMENT_ENTRY;
}
} else if (*table & _SEGMENT_ENTRY_PROTECT &&
!(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
it marks non-leaf gmap segment (pmd) entries as present, just as normal
pmds would be.
I think it's a good enough fix for now, pending the rewrite, which I
hope to get in the next merge window
Am 27.10.25 um 17:47 schrieb Claudio Imbrenda:
> On Mon, 20 Oct 2025 10:41:28 +0200
> David Hildenbrand <david@redhat.com> wrote:
>
>> On 20.10.25 09:00, Christian Borntraeger wrote:
>>> Am 17.10.25 um 23:56 schrieb Balbir Singh:
>>>
>>>> In the meanwhile, does this fix/workaround work?
>>>>
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 0c847cdf4fd3..31c1754d5bd4 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>>>
>>>> if (pmdvalp)
>>>> *pmdvalp = pmdval;
>>>> - if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>>>> + if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
>>>> goto nomap;
>>>> if (unlikely(pmd_trans_huge(pmdval)))
>>>> goto nomap;
>>>>
>>>
>>> Yes, this seems to work.
>>
>> Right, but that's not what we will want here. We'll have to adjust s390x
>> gmap code (which is getting redesigned either way) to only take the page
>> lock.
>>
>> In the end, we'll want here later a single
>>
>> if (!pmd_present(pmdval))
>> goto nomap;
>>
>
> this seems to do the trick:
>
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 8ff6bba107e8..22c448b32340 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>					| _SEGMENT_ENTRY_GMAP_UC
> | _SEGMENT_ENTRY;
> } else
> - *table = pmd_val(*pmd) &
> - _SEGMENT_ENTRY_HARDWARE_BITS;
> + *table = (pmd_val(*pmd) &
> + _SEGMENT_ENTRY_HARDWARE_BITS)
> + | _SEGMENT_ENTRY;
> }
> } else if (*table & _SEGMENT_ENTRY_PROTECT &&
> !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
>
>
Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
can you send a proper patch? I guess we should add it to Andrew's mm tree to keep it close to the patch that uncovered the issue.
s390 maintainers cced.
This patch solves the issue uncovered by patch caf527048be8
("mm/huge_memory: add device-private THP support to PMD operations"),
which is at the moment in -next.
@Andrew: do you think it's possible to squeeze this patch in -next
_before_ the patches that introduce the issue? This will guarantee that
the patch is merged first, and will not break bisections once merged.
Claudio Imbrenda (1):
KVM: s390: Fix missing present bit for gmap puds
arch/s390/mm/gmap.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--
2.51.0
On Tue, 28 Oct 2025 14:01:49 +0100 Claudio Imbrenda <imbrenda@linux.ibm.com> wrote:

> @Andrew: do you think it's possible to squeeze this patch in -next
> _before_ the patches that introduce the issue? This will guarantee that
> the patch is merged first, and will not break bisections once merged.

no problem, thanks.
For hugetlbs, gmap puds have the present bit set. For normal puds
(which point to ptes), the bit is not set. This is in contrast to the
normal userspace puds, which always have the bit set for present pmds.
This causes issues when ___pte_offset_map() is modified to only check
for the present bit.
The solution to the problem is simply to always set the present bit for
present gmap pmds.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/
Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
arch/s390/mm/gmap.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 8ff6bba107e8..22c448b32340 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
| _SEGMENT_ENTRY_GMAP_UC
| _SEGMENT_ENTRY;
} else
- *table = pmd_val(*pmd) &
- _SEGMENT_ENTRY_HARDWARE_BITS;
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS)
+ | _SEGMENT_ENTRY;
}
} else if (*table & _SEGMENT_ENTRY_PROTECT &&
!(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
--
2.51.0
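
For reference, a rough sketch of what the one-line change buys; SW_PRESENT and
HW_MASK below are placeholders, not the real _SEGMENT_ENTRY /
_SEGMENT_ENTRY_HARDWARE_BITS values:

#include <stdbool.h>
#include <stdio.h>

#define SW_PRESENT	0x001UL		/* placeholder for _SEGMENT_ENTRY */
#define HW_MASK		(~0x7ffUL)	/* placeholder for _SEGMENT_ENTRY_HARDWARE_BITS */

static bool pmd_present_sk(unsigned long entry)
{
	return entry & SW_PRESENT;	/* shape of the s390 pmd_present() quoted earlier */
}

int main(void)
{
	/* a process pmd pointing at a page table, software present bit set */
	unsigned long pmd = 0x12345000UL | SW_PRESENT;

	unsigned long old_gmap = pmd & HW_MASK;			/* before the fix */
	unsigned long new_gmap = (pmd & HW_MASK) | SW_PRESENT;	/* after the fix */

	printf("old gmap entry present: %d\n", pmd_present_sk(old_gmap));	/* 0 */
	printf("new gmap entry present: %d\n", pmd_present_sk(new_gmap));	/* 1 */
	return 0;
}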
On 28.10.25 14:01, Claudio Imbrenda wrote:
> For hugetlbs, gmap puds have the present bit set. For normal puds
> (which point to ptes), the bit is not set. This is in contrast to the
> normal userspace puds, which always have the bit set for present pmds.
>
> This causes issues when ___pte_offset_map() is modified to only check
> for the present bit.
>
> The solution to the problem is simply to always set the present bit for
> present gmap pmds.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/
> Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> ---
> arch/s390/mm/gmap.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 8ff6bba107e8..22c448b32340 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
> | _SEGMENT_ENTRY_GMAP_UC
> | _SEGMENT_ENTRY;
> } else
> - *table = pmd_val(*pmd) &
> - _SEGMENT_ENTRY_HARDWARE_BITS;
I'd add a comment here like
/* Make sure that pmd_present() will work on these entries. */
> + *table = (pmd_val(*pmd) &
> + _SEGMENT_ENTRY_HARDWARE_BITS)
> + | _SEGMENT_ENTRY;
> }
> } else if (*table & _SEGMENT_ENTRY_PROTECT &&
> !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
Reviewed-by: David Hildenbrand <david@redhat.com>
--
Cheers
David / dhildenb
On Wed, 29 Oct 2025 11:00:14 +0100
David Hildenbrand <david@redhat.com> wrote:
> On 28.10.25 14:01, Claudio Imbrenda wrote:
> > For hugetlbs, gmap puds have the present bit set. For normal puds
> > (which point to ptes), the bit is not set. This is in contrast to the
> > normal userspace puds, which always have the bit set for present pmds.
> >
> > This causes issues when ___pte_offset_map() is modified to only check
> > for the present bit.
> >
> > The solution to the problem is simply to always set the present bit for
> > present gmap pmds.
> >
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/
> > Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> > Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> > ---
> > arch/s390/mm/gmap.c | 5 +++--
> > 1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 8ff6bba107e8..22c448b32340 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> > @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
> > | _SEGMENT_ENTRY_GMAP_UC
> > | _SEGMENT_ENTRY;
> > } else
> > - *table = pmd_val(*pmd) &
> > - _SEGMENT_ENTRY_HARDWARE_BITS;
>
> I'd add a comment here like
>
> /* Make sure that pmd_present() will work on these entries. */
the whole file is going away very soon anyway
>
> > + *table = (pmd_val(*pmd) &
> > + _SEGMENT_ENTRY_HARDWARE_BITS)
> > + | _SEGMENT_ENTRY;
> > }
> > } else if (*table & _SEGMENT_ENTRY_PROTECT &&
> > !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
>
> Reviewed-by: David Hildenbrand <david@redhat.com>
>
On 10/29/25 00:01, Claudio Imbrenda wrote:
> For hugetlbs, gmap puds have the present bit set. For normal puds
> (which point to ptes), the bit is not set. This is in contrast to the
> normal userspace puds, which always have the bit set for present pmds.
>
> This causes issues when ___pte_offset_map() is modified to only check
> for the present bit.
>
> The solution to the problem is simply to always set the present bit for
> present gmap pmds.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/
> Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> ---
> arch/s390/mm/gmap.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 8ff6bba107e8..22c448b32340 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
> | _SEGMENT_ENTRY_GMAP_UC
> | _SEGMENT_ENTRY;
> } else
> - *table = pmd_val(*pmd) &
> - _SEGMENT_ENTRY_HARDWARE_BITS;
> + *table = (pmd_val(*pmd) &
> + _SEGMENT_ENTRY_HARDWARE_BITS)
> + | _SEGMENT_ENTRY;
> }
> } else if (*table & _SEGMENT_ENTRY_PROTECT &&
> !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
Acked-by: Balbir Singh <balbirs@nvidia.com>
On 10/28/25 04:06, Christian Borntraeger wrote:
> Am 27.10.25 um 17:47 schrieb Claudio Imbrenda:
>> On Mon, 20 Oct 2025 10:41:28 +0200
>> David Hildenbrand <david@redhat.com> wrote:
>>
>>> On 20.10.25 09:00, Christian Borntraeger wrote:
>>>> Am 17.10.25 um 23:56 schrieb Balbir Singh:
>>>>
>>>>> In the meanwhile, does this fix/workaround work?
>>>>>
>>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>>> index 0c847cdf4fd3..31c1754d5bd4 100644
>>>>> --- a/mm/pgtable-generic.c
>>>>> +++ b/mm/pgtable-generic.c
>>>>> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>>>> if (pmdvalp)
>>>>> *pmdvalp = pmdval;
>>>>> - if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>>>>> + if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
>>>>> goto nomap;
>>>>> if (unlikely(pmd_trans_huge(pmdval)))
>>>>> goto nomap;
>>>>>
>>>>
>>>> Yes, this seems to work.
>>>
>>> Right, but that's not what we will want here. We'll have to adjust s390x
>>> gmap code (which is getting redesigned either way) to only take the page
>>> lock.
>>>
>>> In the end, we'll want here later a single
>>>
>>> if (!pmd_present(pmdval))
>>> goto nomap;
>>>
>>
>> this seems to do the trick:
>>
>> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
>> index 8ff6bba107e8..22c448b32340 100644
>> --- a/arch/s390/mm/gmap.c
>> +++ b/arch/s390/mm/gmap.c
>> @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>> | _SEGMENT_ENTRY_GMAP_UC
>> | _SEGMENT_ENTRY;
>> } else
>> - *table = pmd_val(*pmd) &
>> - _SEGMENT_ENTRY_HARDWARE_BITS;
>> + *table = (pmd_val(*pmd) &
>> + _SEGMENT_ENTRY_HARDWARE_BITS)
>> + | _SEGMENT_ENTRY;
>> }
>> } else if (*table & _SEGMENT_ENTRY_PROTECT &&
>> !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
>>
>>
>
> Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
>
> can you send a proper patch? I guess we should add it to Andrew's mm tree to keep it close to the patch that uncovered the issue.
> s390 maintainers CCed.
Thanks for finding the fix. Ideally, we want this fix just before my series if possible!
Balbir
On 27.10.25 17:47, Claudio Imbrenda wrote:
> On Mon, 20 Oct 2025 10:41:28 +0200
> David Hildenbrand <david@redhat.com> wrote:
>
>> On 20.10.25 09:00, Christian Borntraeger wrote:
>>> Am 17.10.25 um 23:56 schrieb Balbir Singh:
>>>
>>>> In the meanwhile, does this fix/workaround work?
>>>>
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 0c847cdf4fd3..31c1754d5bd4 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>>>
>>>> if (pmdvalp)
>>>> *pmdvalp = pmdval;
>>>> - if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>>>> + if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
>>>> goto nomap;
>>>> if (unlikely(pmd_trans_huge(pmdval)))
>>>> goto nomap;
>>>>
>>>
>>> Yes, this seems to work.
>>
>> Right, but that's not what we will want here. We'll have to adjust s390x
>> gmap code (which is getting redesigned either way) to only take the page
>> lock.
>>
>> In the end, we'll want here later a single
>>
>> if (!pmd_present(pmdval))
>> goto nomap;
>>
>
> this seems to do the trick:
>
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 8ff6bba107e8..22c448b32340 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
> | _SEGMENT_ENTRY_GMAP_UC
> | _SEGMENT_ENTRY;
> } else
> - *table = pmd_val(*pmd) &
> - _SEGMENT_ENTRY_HARDWARE_BITS;
> + *table = (pmd_val(*pmd) &
> + _SEGMENT_ENTRY_HARDWARE_BITS)
> + | _SEGMENT_ENTRY;
Probably worth adding a comment. I remember we don't reuse this bit as a
SW bit in gmap code, right?
> }
> } else if (*table & _SEGMENT_ENTRY_PROTECT &&
> !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
>
>
>
> it marks non-leaf gmap segment (pmd) entries as present, just as normal
> pmds would be.
Yeah, I looked into hand-coding the PTL lookup but it just gets nasty
real quick.
>
> I think it's a good enough fix for now, pending the rewrite, which I
> hope to get in the next merge window
Agreed.
--
Cheers
David / dhildenb
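For context on how the gmap fix and the ___pte_offset_map() change interact, here is a rough stand-alone model (invented flag values, not the real s390 or core-mm encodings) comparing the interim bail-out condition quoted above with the stricter pmd_present()-only check that is planned later:

#include <stdio.h>

/* Illustrative flags only -- not the real encodings. */
#define F_PRESENT	0x1u	/* software present bit, what pmd_present() tests */
#define F_SWAP_TYPE	0x2u	/* stands in for migration/device-private entries */

static int pmd_none_m(unsigned v)                { return v == 0; }
static int pmd_present_m(unsigned v)             { return v & F_PRESENT; }
static int non_present_folio_entry_m(unsigned v) { return v & F_SWAP_TYPE; }

/* Does the walker map the PTE table, under each variant of the check? */
static int maps_interim(unsigned v)	/* interim: bail only on swap-style entries */
{
	return !(pmd_none_m(v) || non_present_folio_entry_m(v));
}

static int maps_final(unsigned v)	/* planned final form: bail on !present */
{
	return !(pmd_none_m(v) || !pmd_present_m(v));
}

int main(void)
{
	unsigned gmap_pmd_old = 0x100u;             /* linked PTE table, present bit missing */
	unsigned gmap_pmd_new = 0x100u | F_PRESENT; /* after the __gmap_link() fix */

	printf("old entry:   interim=%d final=%d\n",
	       maps_interim(gmap_pmd_old), maps_final(gmap_pmd_old));
	printf("fixed entry: interim=%d final=%d\n",
	       maps_interim(gmap_pmd_new), maps_final(gmap_pmd_new));
	/*
	 * old entry:   interim=1 final=0 -> only works with the interim check
	 * fixed entry: interim=1 final=1 -> works either way, so the strict
	 * pmd_present() check can eventually go in, as suggested above.
	 */
	return 0;
}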
On Mon, 20 Oct 2025 10:41:28 +0200
David Hildenbrand <david@redhat.com> wrote:

> On 20.10.25 09:00, Christian Borntraeger wrote:
> > Am 17.10.25 um 23:56 schrieb Balbir Singh:
> >
> >> In the meanwhile, does this fix/workaround work?
> >>
> >> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> >> index 0c847cdf4fd3..31c1754d5bd4 100644
> >> --- a/mm/pgtable-generic.c
> >> +++ b/mm/pgtable-generic.c
> >> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
> >>
> >> 	if (pmdvalp)
> >> 		*pmdvalp = pmdval;
> >> -	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
> >> +	if (unlikely(pmd_none(pmdval) || is_pmd_non_present_folio_entry(pmdval)))
> >> 		goto nomap;
> >> 	if (unlikely(pmd_trans_huge(pmdval)))
> >> 		goto nomap;
> >>
> >
> > Yes, this seems to work.
>
> Right, but that's not what we will want here. We'll have to adjust s390x

I'm looking into that

> gmap code (which is getting redesigned either way) to only take the page

unfortunately the rework won't make it in 6.18, so I'll have to quickly
cobble together a fix

> lock.
>
> In the end, we'll want here later a single
>
> if (!pmd_present(pmdval))
> 	goto nomap;
>
On 17.10.25 23:56, Balbir Singh wrote:
> On 10/18/25 04:07, David Hildenbrand wrote:
>> On 17.10.25 17:20, Christian Borntraeger wrote:
>>>
>>>
>>> Am 17.10.25 um 17:07 schrieb David Hildenbrand:
>>>> On 17.10.25 17:01, Christian Borntraeger wrote:
>>>>> Am 17.10.25 um 16:54 schrieb David Hildenbrand:
>>>>>> On 17.10.25 16:49, Christian Borntraeger wrote:
>>>>>>> This patch triggers a regression for s390x kvm as qemu guests can no longer start
>>>>>>>
>>>>>>> error: kvm run failed Cannot allocate memory
>>>>>>> PSW=mask 0000000180000000 addr 000000007fd00600
>>>>>>> R00=0000000000000000 R01=0000000000000000 R02=0000000000000000 R03=0000000000000000
>>>>>>> R04=0000000000000000 R05=0000000000000000 R06=0000000000000000 R07=0000000000000000
>>>>>>> R08=0000000000000000 R09=0000000000000000 R10=0000000000000000 R11=0000000000000000
>>>>>>> R12=0000000000000000 R13=0000000000000000 R14=0000000000000000 R15=0000000000000000
>>>>>>> C00=00000000000000e0 C01=0000000000000000 C02=0000000000000000 C03=0000000000000000
>>>>>>> C04=0000000000000000 C05=0000000000000000 C06=0000000000000000 C07=0000000000000000
>>>>>>> C08=0000000000000000 C09=0000000000000000 C10=0000000000000000 C11=0000000000000000
>>>>>>> C12=0000000000000000 C13=0000000000000000 C14=00000000c2000000 C15=0000000000000000
>>>>>>>
>>>>>>> KVM on s390x does not use THP so far, will investigate. Does anyone have a quick idea?
>>>>>>
>>>>>> Only when running KVM guests and apart from that everything else seems to be fine?
>>>>>
>>>>> We have other weirdness in linux-next but in different areas. Could that somehow be
>>>>> related to use disabling THP for the kvm address space?
>>>>
>>>> Not sure ... it's a bit weird. I mean, when KVM disables THPs we essentially just remap everything to be mapped by PTEs. So there shouldn't be any PMDs in that whole process.
>>>>
>>>> Remapping a file THP (shmem) implies zapping the THP completely.
>>>>
>>>>
>>>> I assume in your kernel config has CONFIG_ZONE_DEVICE and CONFIG_ARCH_ENABLE_THP_MIGRATION set, right?
>>>
>>> yes.
>>>
>>>>
>>>> I'd rule out copy_huge_pmd(), zap_huge_pmd() a well.
>>>>
>>>>
>>>> What happens if you revert the change in mm/pgtable-generic.c?
>>>
>>> That partial revert seems to fix the issue
>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>> index 0c847cdf4fd3..567e2d084071 100644
>>> --- a/mm/pgtable-generic.c
>>> +++ b/mm/pgtable-generic.c
>>> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>> 	if (pmdvalp)
>>> 		*pmdvalp = pmdval;
>>> -	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>>> +	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>>
>> Okay, but that means that effectively we stumble over a PMD entry that is not a migration entry but still non-present.
>>
>> And I would expect that it's a page table, because otherwise the change
>> wouldn't make a difference.
>>
>> And the weird thing is that this only triggers sometimes, because if
>> it would always trigger nothing would ever work.
>>
>> Is there some weird scenario where s390x might set a left page table mapped in a PMD to non-present?
>>
>
> Good point
>
>> Staring at the definition of pmd_present() on s390x it's really just
>>
>> return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0;
>>
>>
>> Maybe this is happening in the gmap code only and not actually in the core-mm code?
>>
>
>
> I am not an s390 expert, but just looking at the code
>
> So the check on s390 effectively
>
> segment_entry/present = false or segment_entry_empty/invalid = true

pmd_present() == true iff _SEGMENT_ENTRY_PRESENT is set

because

return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0;

is the same as

return pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT;

But that means we have something where _SEGMENT_ENTRY_PRESENT is not set.

I suspect that can only be the gmap tables.

Likely __gmap_link() does not set _SEGMENT_ENTRY_PRESENT, which is fine
because it's a software managed bit for "ordinary" page tables, not gmap
tables.

Which raises the question why someone would wrongly use
pte_offset_map()/__pte_offset_map() on the gmap tables.

I cannot immediately spot any such usage in kvm/gmap code, though.

-- 
Cheers

David / dhildenb
On 18.10.25 00:15, David Hildenbrand wrote:
> On 17.10.25 23:56, Balbir Singh wrote:
>> On 10/18/25 04:07, David Hildenbrand wrote:

[...]

>> I am not an s390 expert, but just looking at the code
>>
>> So the check on s390 effectively
>>
>> segment_entry/present = false or segment_entry_empty/invalid = true
>
> pmd_present() == true iff _SEGMENT_ENTRY_PRESENT is set
>
> because
>
> return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0;
>
> is the same as
>
> return pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT;
>
> But that means we have something where _SEGMENT_ENTRY_PRESENT is not set.
>
> I suspect that can only be the gmap tables.
>
> Likely __gmap_link() does not set _SEGMENT_ENTRY_PRESENT, which is fine
> because it's a software managed bit for "ordinary" page tables, not gmap
> tables.
>
> Which raises the question why someone would wrongly use
> pte_offset_map()/__pte_offset_map() on the gmap tables.
>
> I cannot immediately spot any such usage in kvm/gmap code, though.
>

Ah, it's all that pte_alloc_map_lock() stuff in gmap.c.

Oh my.

So we're mapping a user PTE table that is linked into the gmap tables through a PMD table that does not have the right sw bits set we would expect in a user PMD table.

What's also scary is that pte_alloc_map_lock() would try to pte_alloc() a user page table in the gmap, which sounds completely wrong?

Yeah, when walking the gmap and wanting to lock the linked user PTE table, we should probably never use the pte_*map variants but obtain
the lock through pte_lockptr().

All magic we end up doing with RCU etc in __pte_offset_map_lock()
does not apply to the gmap PMD table.

-- 
Cheers

David / dhildenb
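As a very rough, untested sketch of that locking idea: only pte_lockptr() and the spin_lock()/spin_unlock() calls below are existing kernel interfaces; the surrounding function name and its preconditions are invented here purely for illustration, not taken from the gmap rework.

/*
 * Rough sketch only -- not the actual gmap code. Assumes the caller
 * already has a stable pointer to the *user* PMD entry that links the
 * PTE table, i.e. the table is known to exist and cannot go away.
 */
static void gmap_with_user_ptl(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl = pte_lockptr(mm, pmd);	/* lock of the linked PTE table */

	spin_lock(ptl);
	/*
	 * ... inspect/update the user PTE table here ...
	 *
	 * No pte_alloc_map_lock()/__pte_offset_map_lock(): those expect an
	 * ordinary user PMD (present bit set, RCU-mapped table) and may even
	 * try to allocate a page table, which makes no sense for a gmap walk.
	 */
	spin_unlock(ptl);
}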
Am 18.10.25 um 00:41 schrieb David Hildenbrand:
> On 18.10.25 00:15, David Hildenbrand wrote:
>> On 17.10.25 23:56, Balbir Singh wrote:

[...]

> Ah, it's all that pte_alloc_map_lock() stuff in gmap.c.
>
> Oh my.
>
> So we're mapping a user PTE table that is linked into the gmap tables through a PMD table that does not have the right sw bits set we would expect in a user PMD table.
>
> What's also scary is that pte_alloc_map_lock() would try to pte_alloc() a user page table in the gmap, which sounds completely wrong?
>
> Yeah, when walking the gmap and wanting to lock the linked user PTE table, we should probably never use the pte_*map variants but obtain
> the lock through pte_lockptr().
>
> All magic we end up doing with RCU etc in __pte_offset_map_lock()
> does not apply to the gmap PMD table.
>

CC Claudio.