When the PMD swap entry is device private and owned by the caller,
skip the range faulting and instead just set the correct HMM PFNs.
This is similar to the logic for PTEs in hmm_vma_handle_pte().
For now, each hmm_pfns[i] entry is populated as it is currently done
in hmm_vma_handle_pmd() but this might not be necessary. A follow-up
optimization could be to make use of the order and skip populating
subsequent PFNs.
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
---
mm/hmm.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/mm/hmm.c b/mm/hmm.c
index f2415b4b2cdd..63ec1b18a656 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -355,6 +355,31 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	}
 
 	if (!pmd_present(pmd)) {
+		swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+		/*
+		 * Don't fault in device private pages owned by the caller,
+		 * just report the PFNs.
+		 */
+		if (is_device_private_entry(entry) &&
+		    pfn_swap_entry_folio(entry)->pgmap->owner ==
+		    range->dev_private_owner) {
+			unsigned long cpu_flags = HMM_PFN_VALID |
+				hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
+			unsigned long pfn = swp_offset_pfn(entry);
+			unsigned long i;
+
+			if (is_writable_device_private_entry(entry))
+				cpu_flags |= HMM_PFN_WRITE;
+
+			for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+				hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+				hmm_pfns[i] |= pfn | cpu_flags;
+			}
+
+			return 0;
+		}
+
 		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
 			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
--
2.43.0
On Tue, Jul 22, 2025 at 09:34:45PM +0200, Francois Dugast wrote:
> When the PMD swap entry is device private and owned by the caller,
> skip the range faulting and instead just set the correct HMM PFNs.
> This is similar to the logic for PTEs in hmm_vma_handle_pte().

[...]

> +			return 0;
> +		}
> +
>  		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>  			return -EFAULT;

I think here if this is a is_device_private_entry(entry), we need to
call hmm_vma_fault.

Matt

>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
On Thu, Aug 07, 2025 at 05:21:45PM -0700, Matthew Brost wrote:
> On Tue, Jul 22, 2025 at 09:34:45PM +0200, Francois Dugast wrote:
> > When the PMD swap entry is device private and owned by the caller,
> > skip the range faulting and instead just set the correct HMM PFNs.
> > This is similar to the logic for PTEs in hmm_vma_handle_pte().

[...]

> >  		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> >  			return -EFAULT;
>
> I think here if this is a is_device_private_entry(entry), we need to
> call hmm_vma_fault.

Yes that seems needed, thanks for the catch.

Francois

> Matt
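For reference, the change being asked for would roughly look like the sketch
below. This is only an illustration, not the actual follow-up patch; it reuses
helpers that already exist in mm/hmm.c (hmm_range_need_fault(), hmm_vma_fault()
and hmm_pfns_fill()) and mirrors what hmm_vma_handle_pte() already does for
device-private PTEs not owned by the caller:

	if (!pmd_present(pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);
		unsigned int required_fault;

		if (is_device_private_entry(entry) &&
		    pfn_swap_entry_folio(entry)->pgmap->owner ==
		    range->dev_private_owner) {
			/* populate hmm_pfns[] as in the hunk above ... */
			return 0;
		}

		required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
						      npages, 0);
		if (required_fault) {
			/*
			 * Device private entry not owned by the caller:
			 * fault the data back to system memory rather than
			 * failing the whole walk.
			 */
			if (is_device_private_entry(entry))
				return hmm_vma_fault(addr, end, required_fault,
						     walk);
			return -EFAULT;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

The key difference from the patch as posted is the hmm_vma_fault() call for
device-private entries owned by someone else, which would otherwise be turned
into -EFAULT or HMM_PFN_ERROR.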
On 7/23/25 05:34, Francois Dugast wrote:
> When the PMD swap entry is device private and owned by the caller,
> skip the range faulting and instead just set the correct HMM PFNs.
> This is similar to the logic for PTEs in hmm_vma_handle_pte().
>
> For now, each hmm_pfns[i] entry is populated as it is currently done
> in hmm_vma_handle_pmd() but this might not be necessary. A follow-up
> optimization could be to make use of the order and skip populating
> subsequent PFNs.

I think we should test and remove these now.

[...]

> +			for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> +				hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> +				hmm_pfns[i] |= pfn | cpu_flags;
> +			}
> +

As discussed, can we remove these.

> +			return 0;
> +		}

All of this should be under CONFIG_ARCH_ENABLE_THP_MIGRATION.

> +
>  		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>  			return -EFAULT;
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

Balbir Singh
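The CONFIG_ARCH_ENABLE_THP_MIGRATION point could be addressed along the lines
below. This is only a sketch, under the assumption that a non-present PMD can
carry a device-private entry only when the architecture supports THP migration:

	if (!pmd_present(pmd)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		/*
		 * Device-private (and migration) entries can only appear in
		 * a PMD when the architecture supports THP migration, so the
		 * whole block can be compiled out otherwise.
		 */
		if (is_device_private_entry(entry) &&
		    pfn_swap_entry_folio(entry)->pgmap->owner ==
		    range->dev_private_owner) {
			/* ... populate hmm_pfns[] and return 0 ... */
		}
#endif
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

If swapops.h provides stubs for pmd_to_swp_entry() when the option is off, an
IS_ENABLED() check would work just as well as the #ifdef.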
On Thu, Jul 24, 2025 at 10:25:11AM +1000, Balbir Singh wrote:
> On 7/23/25 05:34, Francois Dugast wrote:
> > For now, each hmm_pfns[i] entry is populated as it is currently done
> > in hmm_vma_handle_pmd() but this might not be necessary. A follow-up
> > optimization could be to make use of the order and skip populating
> > subsequent PFNs.
>
> I think we should test and remove these now.

+Jason, Leon – perhaps either of you can provide insight into why
hmm_vma_handle_pmd fully populates the HMM PFNs when a higher-order page
is found.

If we can be assured that changing this won’t break other parts of the
kernel, I agree it should be removed. A snippet of documentation should
also be added indicating that when higher-order PFNs are found,
subsequent PFNs within the range will remain unpopulated. I can verify
that GPU SVM works just fine without these PFNs being populated.

Matt

[...]
On Wed, Jul 23, 2025 at 10:02:58PM -0700, Matthew Brost wrote:
> +Jason, Leon – perhaps either of you can provide insight into why
> hmm_vma_handle_pmd fully populates the HMM PFNs when a higher-order page
> is found.

I'm not sure why this is buried in some weird unrelated thread, please
post patches normally and CC the right people.

At least the main patch looks reasonable to me, and probably should do
pgd as well while at it?

> If we can be assured that changing this won’t break other parts of the
> kernel, I agree it should be removed. A snippet of documentation should
> also be added indicating that when higher-order PFNs are found,
> subsequent PFNs within the range will remain unpopulated. I can verify
> that GPU SVM works just fine without these PFNs being populated.

We can only do this if someone audits all the current users to confirm
they are compatible. Do that and then it is OK to propose the change.

I think the current version evolved as an optimization so I would not be
surprised to see that some callers need fixing.

Jason
On 7/24/25 08:02, Matthew Brost wrote:
> +Jason, Leon – perhaps either of you can provide insight into why
> hmm_vma_handle_pmd fully populates the HMM PFNs when a higher-order page
> is found.
>
> If we can be assured that changing this won’t break other parts of the
> kernel, I agree it should be removed. A snippet of documentation should
> also be added indicating that when higher-order PFNs are found,
> subsequent PFNs within the range will remain unpopulated. I can verify
> that GPU SVM works just fine without these PFNs being populated.

afaics the device can consume the range as smaller pages also, and some
hmm users depend on that.

--Mika

[...]
On Thu, Jul 24, 2025 at 08:46:11AM +0300, Mika Penttilä wrote:
> afaics the device can consume the range as smaller pages also, and some
> hmm users depend on that.

Sure, but I think that should be fixed in the device code. If a
large-order PFN is found, the subsequent PFNs can clearly be inferred.
It's a micro-optimization here, but devices or callers capable of
handling this properly shouldn't force a hacky, less optimal behavior on
core code. If anything relies on the current behavior, we should fix it
and ensure correctness.

Matt

[...]
On 7/24/25 08:57, Matthew Brost wrote:
> Sure, but I think that should be fixed in the device code. If a
> large-order PFN is found, the subsequent PFNs can clearly be inferred.
> It's a micro-optimization here, but devices or callers capable of
> handling this properly shouldn't force a hacky, less optimal behavior on
> core code. If anything relies on the current behavior, we should fix it
> and ensure correctness.

Yes, sure, device code can be changed, but I meant to say we can't just
delete those lines without breaking existing users.

--Mika

[...]
On Thu, Jul 24, 2025 at 09:04:36AM +0300, Mika Penttilä wrote:
> Yes, sure, device code can be changed, but I meant to say we can't just
> delete those lines without breaking existing users.

Mika is right. The RDMA subsystem and the HMM users there need to be
updated. We have a special flag (IB_ACCESS_HUGETLB) that prepares the
whole RDMA stack to handle large-order PFNs. If this flag is not
provided, we need to fall back to the basic device page size (4k), and
for that we expect a fully populated PFN list.

Thanks
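To make the compatibility concern concrete: if core HMM stopped filling the
tail entries behind a high-order PFN, callers that still want a flat per-4k
list (such as the RDMA path described above when IB_ACCESS_HUGETLB is not set)
would have to expand the result themselves. A hypothetical helper along these
lines — expand_hmm_order_pfns() does not exist in the kernel, it only
illustrates the idea using hmm_pfn_to_map_order() and the HMM_PFN_FLAGS mask
from include/linux/hmm.h:

	/* Hypothetical caller-side helper, not existing kernel code. */
	static void expand_hmm_order_pfns(unsigned long *hmm_pfns,
					  unsigned long npages)
	{
		unsigned long i = 0;

		while (i < npages) {
			unsigned long nr = 1UL << hmm_pfn_to_map_order(hmm_pfns[i]);
			unsigned long pfn = hmm_pfns[i] & ~HMM_PFN_FLAGS;
			unsigned long flags = hmm_pfns[i] & HMM_PFN_FLAGS;
			unsigned long j;

			/*
			 * Derive the tail entries from the head PFN; keeping
			 * the same flags (including the order) mirrors what
			 * hmm_vma_handle_pmd() writes for every entry today.
			 */
			for (j = 1; j < nr && i + j < npages; j++)
				hmm_pfns[i + j] = (pfn + j) | flags;

			i += nr;
		}
	}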
On Tue, 22 Jul 2025 21:34:45 +0200 Francois Dugast <francois.dugast@intel.com> wrote:

> When the PMD swap entry is device private and owned by the caller,
> skip the range faulting and instead just set the correct HMM PFNs.
> This is similar to the logic for PTEs in hmm_vma_handle_pte().

Please always tell us why a patch does something, not only what it does.

> For now, each hmm_pfns[i] entry is populated as it is currently done
> in hmm_vma_handle_pmd() but this might not be necessary. A follow-up
> optimization could be to make use of the order and skip populating
> subsequent PFNs.

I infer from this paragraph that this patch is a performance
optimization? Have its effects been measured?

> +		/*
> +		 * Don't fault in device private pages owned by the caller,
> +		 * just report the PFNs.
> +		 */

Similarly, this tells us "what" it does, which is fairly obvious from
the code itself. What is not obvious from the code is the "why".

[...]
On Tue, Jul 22, 2025 at 01:07:21PM -0700, Andrew Morton wrote:
> Please always tell us why a patch does something, not only what it does.

Sure, let me improve this in the next version.

> I infer from this paragraph that this patch is a performance
> optimization? Have its effects been measured?

Yes, this performance optimization would come from avoiding the loop
over the range but it has neither been properly tested nor measured
yet.

> Similarly, this tells us "what" it does, which is fairly obvious from
> the code itself. What is not obvious from the code is the "why".

Indeed, will fix.

[...]
On Wed, Jul 23, 2025 at 05:34:17PM +0200, Francois Dugast wrote:
> On Tue, Jul 22, 2025 at 01:07:21PM -0700, Andrew Morton wrote:
> > I infer from this paragraph that this patch is a performance
> > optimization? Have its effects been measured?
>
> Yes, this performance optimization would come from avoiding the loop
> over the range but it has neither been properly tested nor measured
> yet.

This is also a functional change. Once THP device pages are enabled (for
performance), we will encounter device-private swap entries in PMDs. At
that point, the correct behavior is to populate HMM PFNs from the swap
entry when dev_private_owner matches; otherwise, trigger a fault if the
HMM range-walk input requests one, or skip it in the non-faulting case.

It’s harmless to merge this patch before THP device pages are enabled,
since with the current code base we never find device-private swap
entries in PMDs.

I'd include something like the above explanation in the patch commit
message, or in code comments if needed.

Matt

[...]
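From the driver side, the effect of this patch is only visible through the
returned HMM PFNs. A rough usage sketch, assumed driver code rather than part
of this series: notifier, mm and my_pgmap_owner are placeholders (the owner
must match the pgmap->owner the driver set for its device-private memory), and
the usual mmu_interval_read_retry() loop around hmm_range_fault() is omitted:

	unsigned long pfns[512];	/* one 2MiB PMD of 4k entries (x86-64) */
	struct hmm_range range = {
		.notifier		= &notifier,
		.notifier_seq		= mmu_interval_read_begin(&notifier),
		.start			= start,
		.end			= start + PMD_SIZE,
		.hmm_pfns		= pfns,
		.default_flags		= HMM_PFN_REQ_FAULT,
		.dev_private_owner	= my_pgmap_owner,
	};
	int ret;

	mmap_read_lock(mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(mm);

	if (!ret && (pfns[0] & HMM_PFN_VALID) &&
	    hmm_pfn_to_map_order(pfns[0]) == PMD_SHIFT - PAGE_SHIFT) {
		/*
		 * The whole PMD-aligned range is backed by one device-private
		 * THP owned by this driver; with this patch the walk reports
		 * it directly instead of faulting it back to system memory.
		 */
	}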