Extend migrate_vma_collect_pmd() to handle partially mapped large
folios that require splitting before migration can proceed.
During PTE walk in the collection phase, if a large folio is only
partially mapped in the migration range, it must be split to ensure
the folio is correctly migrated.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 94 insertions(+)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..f45ef182287d 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Returns 0 on success
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+ struct page *fault_page)
+{
+ int ret;
+ struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+ struct folio *new_fault_folio = NULL;
+
+ if (folio != fault_folio) {
+ folio_get(folio);
+ folio_lock(folio);
+ }
+
+ ret = split_folio(folio);
+ if (ret) {
+ if (folio != fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return ret;
+ }
+
+ new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
static int migrate_vma_collect_pmd(pmd_t *pmdp,
unsigned long start,
unsigned long end,
@@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
* page table entry. Other special swap entries are not
* migratable, and we ignore regular swapped page.
*/
+ struct folio *folio;
+
entry = pte_to_swp_entry(pte);
if (!is_device_private_entry(entry))
goto next;
@@ -147,6 +196,29 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pgmap->owner != migrate->pgmap_owner)
goto next;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ /*
+ * The reason for finding pmd present with a
+ * large folio for the pte is partial unmaps.
+ * Split the folio now for the migration to be
+ * handled correctly
+ */
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ goto next;
+ }
+
+ addr = start;
+ goto again;
+ }
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_writable_device_private_entry(entry))
@@ -171,6 +243,28 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pgmap->owner != migrate->pgmap_owner)
goto next;
}
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ /*
+ * The reason for finding pmd present with a
+ * large folio for the pte is partial unmaps.
+ * Split the folio now for the migration to be
+ * handled correctly
+ */
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ goto next;
+ }
+
+ addr = start;
+ goto again;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
--
2.50.1
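
For context on what the collection loop stores around the new split calls: each src[] slot that migrate_vma_collect_pmd() fills is an encoded mpfn built with migrate_pfn() plus flag bits such as MIGRATE_PFN_MIGRATE and MIGRATE_PFN_WRITE, and migrate_pfn_to_page() decodes it again. A minimal sketch of decoding one entry (illustrative only; demo_show_entry() is a made-up helper, not part of the patch):

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/printk.h>

/* Illustrative only: decode one entry produced during collection. */
static void demo_show_entry(unsigned long mpfn)
{
        /* NULL when the slot is not MIGRATE_PFN_VALID */
        struct page *page = migrate_pfn_to_page(mpfn);

        if (!page)
                return;
        pr_info("pfn %lx migrate=%d write=%d\n",
                page_to_pfn(page),
                !!(mpfn & MIGRATE_PFN_MIGRATE),
                !!(mpfn & MIGRATE_PFN_WRITE));
}
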
On Mon, 8 Sep 2025 10:04:38 +1000 Balbir Singh <balbirs@nvidia.com> wrote:
> Extend migrate_vma_collect_pmd() to handle partially mapped large
> folios that require splitting before migration can proceed.
>
> During PTE walk in the collection phase, if a large folio is only
> partially mapped in the migration range, it must be split to ensure
> the folio is correctly migrated.
>
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> ---
> mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 94 insertions(+)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index abd9f6850db6..f45ef182287d 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
> return 0;
> }
>
> +/**
> + * migrate_vma_split_folio() - Helper function to split a THP folio
> + * @folio: the folio to split
> + * @fault_page: struct page associated with the fault if any
> + *
> + * Returns 0 on success
> + */
> +static int migrate_vma_split_folio(struct folio *folio,
> + struct page *fault_page)
> +{
> + int ret;
> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
> + struct folio *new_fault_folio = NULL;
> +
> + if (folio != fault_folio) {
> + folio_get(folio);
> + folio_lock(folio);
> + }
Can fault_folio ever be non-null and different from folio? Apologies for
not knowing the lock ordering rules but this jumps out.
> +
> + ret = split_folio(folio);
> + if (ret) {
> + if (folio != fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> + return ret;
> + }
> +
> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
> +
> + /*
> + * Ensure the lock is held on the correct
> + * folio after the split
> + */
> + if (!new_fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + } else if (folio != new_fault_folio) {
> + folio_get(new_fault_folio);
> + folio_lock(new_fault_folio);
> + folio_unlock(folio);
> + folio_put(folio);
> + }
Same question here, do we need trylocks?
-chris
On 9/19/25 02:42, Chris Mason wrote:
> On Mon, 8 Sep 2025 10:04:38 +1000 Balbir Singh <balbirs@nvidia.com> wrote:
>
>> Extend migrate_vma_collect_pmd() to handle partially mapped large
>> folios that require splitting before migration can proceed.
>>
>> During PTE walk in the collection phase, if a large folio is only
>> partially mapped in the migration range, it must be split to ensure
>> the folio is correctly migrated.
>>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> ---
>> mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 94 insertions(+)
>>
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index abd9f6850db6..f45ef182287d 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
>> return 0;
>> }
>>
>> +/**
>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>> + * @folio: the folio to split
>> + * @fault_page: struct page associated with the fault if any
>> + *
>> + * Returns 0 on success
>> + */
>> +static int migrate_vma_split_folio(struct folio *folio,
>> + struct page *fault_page)
>> +{
>> + int ret;
>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> + struct folio *new_fault_folio = NULL;
>> +
>> + if (folio != fault_folio) {
>> + folio_get(folio);
>> + folio_lock(folio);
>> + }
>
> Can fault_folio ever be non-null and different from folio? Apologies for
> not knowing the lock ordering rules but this jumps out.
>
Yes, migration can occur in fault context or be driver-driven.
>> +
>> + ret = split_folio(folio);
>> + if (ret) {
>> + if (folio != fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> + return ret;
>> + }
>> +
>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> +
>> + /*
>> + * Ensure the lock is held on the correct
>> + * folio after the split
>> + */
>> + if (!new_fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + } else if (folio != new_fault_folio) {
>> + folio_get(new_fault_folio);
>> + folio_lock(new_fault_folio);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>
> Same question here, do we need trylocks?
>
Since we had the folio lock before, the assumption is that we can
still grab the lock after split and it's OK to wait, since this
is not a hot-path.
Balbir Singh
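
To make the lock-handoff point concrete: split_folio() distributes the pages of the large folio into new smaller folios, and only the folio containing the original head page stays locked, so the folio that fault_page belongs to has to be re-resolved afterwards. A minimal sketch of that invariant, mirroring the helper above (illustrative only; demo_split_and_relock() is a made-up name and assumes the caller already holds a reference and the lock on the large folio):

#include <linux/mm.h>
#include <linux/huge_mm.h>

static int demo_split_and_relock(struct page *fault_page)
{
        struct folio *before = page_folio(fault_page);  /* large folio, locked */
        int ret = split_folio(before);                  /* requires the folio lock */

        if (ret)
                return ret;

        /*
         * Only the folio containing the original head page remains locked
         * after the split; if fault_page was not the head page, it now
         * belongs to a different, unlocked folio, so hand the lock over.
         */
        if (page_folio(fault_page) != before) {
                struct folio *after = page_folio(fault_page);

                folio_get(after);
                folio_lock(after);
                folio_unlock(before);
                folio_put(before);
        }
        return 0;
}
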
On 9/19/25 4:36 AM, Balbir Singh wrote:
> On 9/19/25 02:42, Chris Mason wrote:
>> On Mon, 8 Sep 2025 10:04:38 +1000 Balbir Singh <balbirs@nvidia.com> wrote:
>>
>>> Extend migrate_vma_collect_pmd() to handle partially mapped large
>>> folios that require splitting before migration can proceed.
>>>
>>> During PTE walk in the collection phase, if a large folio is only
>>> partially mapped in the migration range, it must be split to ensure
>>> the folio is correctly migrated.
>>>
>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>>> ---
>>> mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 94 insertions(+)
>>>
>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>> index abd9f6850db6..f45ef182287d 100644
>>> --- a/mm/migrate_device.c
>>> +++ b/mm/migrate_device.c
>>> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
>>> return 0;
>>> }
>>>
>>> +/**
>>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>>> + * @folio: the folio to split
>>> + * @fault_page: struct page associated with the fault if any
>>> + *
>>> + * Returns 0 on success
>>> + */
>>> +static int migrate_vma_split_folio(struct folio *folio,
>>> + struct page *fault_page)
>>> +{
>>> + int ret;
>>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>>> + struct folio *new_fault_folio = NULL;
>>> +
>>> + if (folio != fault_folio) {
>>> + folio_get(folio);
>>> + folio_lock(folio);
>>> + }
>>
>> Can fault_folio ever be non-null and different from folio? Apologies for
>> not knowing the lock ordering rules but this jumps out.
>>
>
> Yes, migration can occur in fault context or be driver-driven.
>
>>> +
>>> + ret = split_folio(folio);
>>> + if (ret) {
>>> + if (folio != fault_folio) {
>>> + folio_unlock(folio);
>>> + folio_put(folio);
>>> + }
>>> + return ret;
>>> + }
>>> +
>>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>>> +
>>> + /*
>>> + * Ensure the lock is held on the correct
>>> + * folio after the split
>>> + */
>>> + if (!new_fault_folio) {
>>> + folio_unlock(folio);
>>> + folio_put(folio);
>>> + } else if (folio != new_fault_folio) {
>>> + folio_get(new_fault_folio);
>>> + folio_lock(new_fault_folio);
>>> + folio_unlock(folio);
>>> + folio_put(folio);
>>> + }
>>
>> Same question here, do we need trylocks?
>>
>
> Since we had the folio lock before, the assumption is that we can
> still grab the lock after split and it's OK to wait, since this
> is not a hot-path.
I think the lock ordering rules either let us take two folio locks without
trylock or they don't... holding the lock in the past shouldn't change
things. The same holds true above: two locks with no defined ordering, so one
or both of these locking sites should be able to deadlock.
But obviously I'm a tourist here, and I need to refresh the review
queue, so I'll move on ;) Thanks for taking a look at it.
-chris
Hi,
On 9/8/25 03:04, Balbir Singh wrote:
> Extend migrate_vma_collect_pmd() to handle partially mapped large
> folios that require splitting before migration can proceed.
>
> During PTE walk in the collection phase, if a large folio is only
> partially mapped in the migration range, it must be split to ensure
> the folio is correctly migrated.
>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
> Cc: Rakie Kim <rakie.kim@sk.com>
> Cc: Byungchul Park <byungchul@sk.com>
> Cc: Gregory Price <gourry@gourry.net>
> Cc: Ying Huang <ying.huang@linux.alibaba.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Nico Pache <npache@redhat.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Dev Jain <dev.jain@arm.com>
> Cc: Barry Song <baohua@kernel.org>
> Cc: Lyude Paul <lyude@redhat.com>
> Cc: Danilo Krummrich <dakr@kernel.org>
> Cc: David Airlie <airlied@gmail.com>
> Cc: Simona Vetter <simona@ffwll.ch>
> Cc: Ralph Campbell <rcampbell@nvidia.com>
> Cc: Mika Penttilä <mpenttil@redhat.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
>
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> ---
> mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 94 insertions(+)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index abd9f6850db6..f45ef182287d 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
> return 0;
> }
>
> +/**
> + * migrate_vma_split_folio() - Helper function to split a THP folio
> + * @folio: the folio to split
> + * @fault_page: struct page associated with the fault if any
> + *
> + * Returns 0 on success
> + */
> +static int migrate_vma_split_folio(struct folio *folio,
> + struct page *fault_page)
> +{
> + int ret;
> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
> + struct folio *new_fault_folio = NULL;
> +
> + if (folio != fault_folio) {
> + folio_get(folio);
> + folio_lock(folio);
> + }
> +
> + ret = split_folio(folio);
> + if (ret) {
> + if (folio != fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> + return ret;
> + }
> +
> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
> +
> + /*
> + * Ensure the lock is held on the correct
> + * folio after the split
> + */
> + if (!new_fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + } else if (folio != new_fault_folio) {
> + folio_get(new_fault_folio);
> + folio_lock(new_fault_folio);
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> +
> + return 0;
> +}
> +
> static int migrate_vma_collect_pmd(pmd_t *pmdp,
> unsigned long start,
> unsigned long end,
> @@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> * page table entry. Other special swap entries are not
> * migratable, and we ignore regular swapped page.
> */
> + struct folio *folio;
> +
> entry = pte_to_swp_entry(pte);
> if (!is_device_private_entry(entry))
> goto next;
> @@ -147,6 +196,29 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> pgmap->owner != migrate->pgmap_owner)
> goto next;
>
> + folio = page_folio(page);
> + if (folio_test_large(folio)) {
> + int ret;
> +
> + /*
> + * The reason for finding pmd present with a
> + * large folio for the pte is partial unmaps.
> + * Split the folio now for the migration to be
> + * handled correctly
> + */
> + pte_unmap_unlock(ptep, ptl);
> + ret = migrate_vma_split_folio(folio,
> + migrate->fault_page);
> +
> + if (ret) {
> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + goto next;
> + }
> +
> + addr = start;
> + goto again;
> + }
> +
> mpfn = migrate_pfn(page_to_pfn(page)) |
> MIGRATE_PFN_MIGRATE;
> if (is_writable_device_private_entry(entry))
> @@ -171,6 +243,28 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> pgmap->owner != migrate->pgmap_owner)
> goto next;
> }
> + folio = page_folio(page);
> + if (folio_test_large(folio)) {
> + int ret;
> +
> + /*
> + * The reason for finding pmd present with a
> + * large folio for the pte is partial unmaps.
> + * Split the folio now for the migration to be
> + * handled correctly
> + */
This comment is still unchanged; there are other reasons for PTE-mapped large folios.
Also, all mTHPs are now split, which is a change of behavior for order < PMD_ORDER
(currently they are ignored).
> + pte_unmap_unlock(ptep, ptl);
> + ret = migrate_vma_split_folio(folio,
> + migrate->fault_page);
> +
> + if (ret) {
> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + goto next;
> + }
> +
> + addr = start;
> + goto again;
> + }
> mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
> }
--Mika
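
As background for the comment above: besides mTHP folios (order < PMD_ORDER), which are PTE-mapped by design, a partial unmap of a PMD-mapped THP only splits the page-table mapping and leaves the folio large, so the PTE walk can legitimately find a present large folio for either reason. A minimal userspace illustration of the partial-unmap case, assuming anonymous THP is enabled (illustrative only, not part of the patch):

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t sz = 2UL << 20;                     /* one 2 MiB THP            */
        char *buf = aligned_alloc(sz, sz);         /* PMD-aligned allocation   */

        madvise(buf, sz, MADV_HUGEPAGE);           /* ask for a huge page      */
        memset(buf, 1, sz);                        /* fault it in, PMD-mapped  */

        /*
         * Discarding a single 4 KiB page in the middle splits the PMD
         * mapping into PTEs but keeps the folio large: the rest of the
         * range is now a PTE-mapped large folio.
         */
        madvise(buf + sz / 2, 4096, MADV_DONTNEED);
        return 0;
}
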
On 9/8/25 14:14, Mika Penttilä wrote:
> Hi,
>
> On 9/8/25 03:04, Balbir Singh wrote:
>
>> Extend migrate_vma_collect_pmd() to handle partially mapped large
>> folios that require splitting before migration can proceed.
>>
>> During PTE walk in the collection phase, if a large folio is only
>> partially mapped in the migration range, it must be split to ensure
>> the folio is correctly migrated.
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>> Cc: Rakie Kim <rakie.kim@sk.com>
>> Cc: Byungchul Park <byungchul@sk.com>
>> Cc: Gregory Price <gourry@gourry.net>
>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>> Cc: Nico Pache <npache@redhat.com>
>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>> Cc: Dev Jain <dev.jain@arm.com>
>> Cc: Barry Song <baohua@kernel.org>
>> Cc: Lyude Paul <lyude@redhat.com>
>> Cc: Danilo Krummrich <dakr@kernel.org>
>> Cc: David Airlie <airlied@gmail.com>
>> Cc: Simona Vetter <simona@ffwll.ch>
>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>> Cc: Mika Penttilä <mpenttil@redhat.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Francois Dugast <francois.dugast@intel.com>
>>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> ---
>> mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 94 insertions(+)
>>
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index abd9f6850db6..f45ef182287d 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
>> return 0;
>> }
>>
>> +/**
>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>> + * @folio: the folio to split
>> + * @fault_page: struct page associated with the fault if any
>> + *
>> + * Returns 0 on success
>> + */
>> +static int migrate_vma_split_folio(struct folio *folio,
>> + struct page *fault_page)
>> +{
>> + int ret;
>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> + struct folio *new_fault_folio = NULL;
>> +
>> + if (folio != fault_folio) {
>> + folio_get(folio);
>> + folio_lock(folio);
>> + }
>> +
>> + ret = split_folio(folio);
>> + if (ret) {
>> + if (folio != fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> + return ret;
>> + }
>> +
>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> +
>> + /*
>> + * Ensure the lock is held on the correct
>> + * folio after the split
>> + */
>> + if (!new_fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + } else if (folio != new_fault_folio) {
>> + folio_get(new_fault_folio);
>> + folio_lock(new_fault_folio);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> unsigned long start,
>> unsigned long end,
>> @@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> * page table entry. Other special swap entries are not
>> * migratable, and we ignore regular swapped page.
>> */
>> + struct folio *folio;
>> +
>> entry = pte_to_swp_entry(pte);
>> if (!is_device_private_entry(entry))
>> goto next;
>> @@ -147,6 +196,29 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> pgmap->owner != migrate->pgmap_owner)
>> goto next;
>>
>> + folio = page_folio(page);
>> + if (folio_test_large(folio)) {
>> + int ret;
>> +
>> + /*
>> + * The reason for finding pmd present with a
>> + * large folio for the pte is partial unmaps.
>> + * Split the folio now for the migration to be
>> + * handled correctly
>> + */
>> + pte_unmap_unlock(ptep, ptl);
>> + ret = migrate_vma_split_folio(folio,
>> + migrate->fault_page);
>> +
>> + if (ret) {
>> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> + goto next;
>> + }
>> +
>> + addr = start;
>> + goto again;
>> + }
>> +
>> mpfn = migrate_pfn(page_to_pfn(page)) |
>> MIGRATE_PFN_MIGRATE;
>> if (is_writable_device_private_entry(entry))
>> @@ -171,6 +243,28 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> pgmap->owner != migrate->pgmap_owner)
>> goto next;
>> }
>> + folio = page_folio(page);
>> + if (folio_test_large(folio)) {
>> + int ret;
>> +
>> + /*
>> + * The reason for finding pmd present with a
>> + * large folio for the pte is partial unmaps.
>> + * Split the folio now for the migration to be
>> + * handled correctly
>> + */
>
> This comment is still unchanged; there are other reasons for PTE-mapped large folios.
> Also, all mTHPs are now split, which is a change of behavior for order < PMD_ORDER
> (currently they are ignored).
Oh! Sorry, I missed it. I am attaching the version with the comments removed.
On the behaviour change, I agree, but it is required for migration to occur.
Updated patch below:
mm/migrate_device: handle partially mapped folios during collection
Extend migrate_vma_collect_pmd() to handle partially mapped large
folios that require splitting before migration can proceed.
During PTE walk in the collection phase, if a large folio is only
partially mapped in the migration range, it must be split to ensure
the folio is correctly migrated.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
mm/migrate_device.c | 82 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..0afdc8b67c60 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Returns 0 on success
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+ struct page *fault_page)
+{
+ int ret;
+ struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+ struct folio *new_fault_folio = NULL;
+
+ if (folio != fault_folio) {
+ folio_get(folio);
+ folio_lock(folio);
+ }
+
+ ret = split_folio(folio);
+ if (ret) {
+ if (folio != fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return ret;
+ }
+
+ new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
static int migrate_vma_collect_pmd(pmd_t *pmdp,
unsigned long start,
unsigned long end,
@@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
* page table entry. Other special swap entries are not
* migratable, and we ignore regular swapped page.
*/
+ struct folio *folio;
+
entry = pte_to_swp_entry(pte);
if (!is_device_private_entry(entry))
goto next;
@@ -147,6 +196,23 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pgmap->owner != migrate->pgmap_owner)
goto next;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ goto next;
+ }
+
+ addr = start;
+ goto again;
+ }
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_writable_device_private_entry(entry))
@@ -171,6 +237,22 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pgmap->owner != migrate->pgmap_owner)
goto next;
}
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ goto next;
+ }
+
+ addr = start;
+ goto again;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
--
2.50.1
Balbir Singh
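
For completeness, a sketch of where this collection change sits from a driver's point of view: migrate_vma_setup() is the call during which migrate_vma_collect_pmd() runs, so a large folio that is only partially covered by [start, end) gets split there, and each base page is then reported individually in src[]. The sketch below is illustrative only; demo_migrate_range(), NPAGES and the owner cookie are made up, and error handling is trimmed.

#include <linux/migrate.h>

#define NPAGES  16      /* hypothetical: range is assumed to span NPAGES pages */

static int demo_migrate_range(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end,
                              void *pgmap_owner)
{
        unsigned long src[NPAGES] = { 0 };
        unsigned long dst[NPAGES] = { 0 };
        struct migrate_vma args = {
                .vma            = vma,
                .start          = start,
                .end            = end,
                .src            = src,
                .dst            = dst,
                .pgmap_owner    = pgmap_owner,
                .flags          = MIGRATE_VMA_SELECT_SYSTEM |
                                  MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
                .fault_page     = NULL, /* driver-driven, not a CPU fault */
        };
        int ret;

        /* Collection runs here; partially mapped large folios are split. */
        ret = migrate_vma_setup(&args);
        if (ret)
                return ret;

        /*
         * ... allocate destination pages and fill dst[] for the entries
         *     in src[] that are still marked MIGRATE_PFN_MIGRATE ...
         */
        migrate_vma_pages(&args);
        /* ... copy source to destination for successfully isolated pages ... */
        migrate_vma_finalize(&args);
        return 0;
}
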