The single "real" user in the tree of make_device_exclusive_range() always
requests making only a single address exclusive. The current implementation
is hard to fix for properly supporting anonymous THP / large folios and
for avoiding messing with rmap walks in weird ways.

So let's always process a single address/page and return folio + page to
minimize page -> folio lookups. This is a preparation for further
changes.

Reject any non-anonymous or hugetlb folios early, directly after GUP.
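
With this, a caller boils down to something like the following (sketch only;
error handling and the MMU-notifier interaction of real drivers omitted --
see the nouveau and test_hmm hunks below for the actual conversions):

	mmap_read_lock(mm);
	page = make_device_exclusive(mm, addr, owner, &folio);
	mmap_read_unlock(mm);
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... program the device mapping under the driver-specific lock ... */

	folio_unlock(folio);
	folio_put(folio);
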
Signed-off-by: David Hildenbrand <david@redhat.com>
---
Documentation/mm/hmm.rst | 2 +-
Documentation/translations/zh_CN/mm/hmm.rst | 2 +-
drivers/gpu/drm/nouveau/nouveau_svm.c | 5 +-
include/linux/mmu_notifier.h | 2 +-
include/linux/rmap.h | 5 +-
lib/test_hmm.c | 45 +++++------
mm/rmap.c | 90 +++++++++++----------
7 files changed, 75 insertions(+), 76 deletions(-)
diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
index f6d53c37a2ca..7d61b7a8b65b 100644
--- a/Documentation/mm/hmm.rst
+++ b/Documentation/mm/hmm.rst
@@ -400,7 +400,7 @@ Exclusive access memory
Some devices have features such as atomic PTE bits that can be used to implement
atomic access to system memory. To support atomic operations to a shared virtual
memory page such a device needs access to that page which is exclusive of any
-userspace access from the CPU. The ``make_device_exclusive_range()`` function
+userspace access from the CPU. The ``make_device_exclusive()`` function
can be used to make a memory range inaccessible from userspace.
This replaces all mappings for pages in the given range with special swap
diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst
index 0669f947d0bc..22c210f4e94f 100644
--- a/Documentation/translations/zh_CN/mm/hmm.rst
+++ b/Documentation/translations/zh_CN/mm/hmm.rst
@@ -326,7 +326,7 @@ devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``s
一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一
个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU
-的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一
+的任何用户空间访问。 ``make_device_exclusive()`` 函数可以用来使一
个内存范围不能从用户空间访问。
这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index b4da82ddbb6b..39e3740980bb 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -609,10 +609,9 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm,
notifier_seq = mmu_interval_read_begin(¬ifier->notifier);
mmap_read_lock(mm);
- ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE,
- &page, drm->dev);
+ page = make_device_exclusive(mm, start, drm->dev, &folio);
mmap_read_unlock(mm);
- if (ret <= 0 || !page) {
+ if (IS_ERR(page)) {
ret = -EINVAL;
goto out;
}
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index e2dd57ca368b..d4e714661826 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -46,7 +46,7 @@ struct mmu_interval_notifier;
* @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
* longer have exclusive access to the page. When sent during creation of an
* exclusive range the owner will be initialised to the value provided by the
- * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
+ * caller of make_device_exclusive(), otherwise the owner will be NULL.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 683a04088f3f..86425d42c1a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -663,9 +663,8 @@ int folio_referenced(struct folio *, int is_locked,
void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *arg);
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 056f2e411d7b..9e1b07a227a3 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -780,10 +780,8 @@ static int dmirror_exclusive(struct dmirror *dmirror,
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
- struct page *pages[64];
struct dmirror_bounce bounce;
- unsigned long next;
- int ret;
+ int ret = 0;
start = cmd->addr;
end = start + size;
@@ -795,36 +793,31 @@ static int dmirror_exclusive(struct dmirror *dmirror,
return -EINVAL;
mmap_read_lock(mm);
- for (addr = start; addr < end; addr = next) {
- unsigned long mapped = 0;
- int i;
-
- next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT));
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ struct folio *folio;
+ struct page *page;
- ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
- /*
- * Do dmirror_atomic_map() iff all pages are marked for
- * exclusive access to avoid accessing uninitialized
- * fields of pages.
- */
- if (ret == (next - addr) >> PAGE_SHIFT)
- mapped = dmirror_atomic_map(addr, next, pages, dmirror);
- for (i = 0; i < ret; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+		page = make_device_exclusive(mm, addr, NULL, &folio);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ break;
}
- if (addr + (mapped << PAGE_SHIFT) < next) {
- mmap_read_unlock(mm);
- mmput(mm);
- return -EBUSY;
- }
+		ret = dmirror_atomic_map(addr, addr + PAGE_SIZE, &page, dmirror);
+		/* dmirror_atomic_map() returns the number of pages it mapped */
+		ret = (ret == 1) ? 0 : -EBUSY;
+ folio_unlock(folio);
+ folio_put(folio);
+
+ if (ret)
+ break;
}
mmap_read_unlock(mm);
mmput(mm);
+ if (ret)
+ return -EBUSY;
+
/* Return the migrated data for verification. */
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
diff --git a/mm/rmap.c b/mm/rmap.c
index 17fbfa61f7ef..676df4fba5b0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2495,70 +2495,78 @@ static bool folio_make_device_exclusive(struct folio *folio,
.arg = &args,
};
- /*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
- */
- if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
- return false;
-
rmap_walk(folio, &rwc);
return args.valid && !folio_mapcount(folio);
}
/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * make_device_exclusive() - Mark an address for exclusive use by a device
* @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
+ * @addr: the virtual address to mark for exclusive device access
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with a special
+ * device-exclusive non-swap entry, preventing userspace CPU access. The
+ * function will return with the folio locked and referenced.
*
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
+ * On fault these special device-exclusive entries are replaced with the
+ * original PTE under folio lock, after calling MMU notifiers.
*
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
*
* A driver using this to program access from a device must use a mmu notifier
* critical section to hold a device specific lock during programming. Once
* programming is complete it should drop the page lock and reference after
* which point CPU access to the page will revoke the exclusive access.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
*/
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
+ struct folio *folio;
+ struct page *page;
+ long npages;
+
+ mmap_assert_locked(mm);
- npages = get_user_pages_remote(mm, start, npages,
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by the device,
+ * the GUP call would undo that first by triggering a fault.
+ */
+ npages = get_user_pages_remote(mm, addr, 1,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ &page, NULL);
+ if (npages != 1)
+ return ERR_PTR(npages);
+ folio = page_folio(page);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EBUSY);
}
- return npages;
+ if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EBUSY);
+ }
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
void __put_anon_vma(struct anon_vma *anon_vma)
--
2.48.1
On Wed, Jan 29, 2025 at 12:54:01PM +0100, David Hildenbrand wrote:
> The single "real" user in the tree of make_device_exclusive_range() always
> requests making only a single address exclusive. The current implementation
> is hard to fix for properly supporting anonymous THP / large folios and
> for avoiding messing with rmap walks in weird ways.
>
> So let's always process a single address/page and return folio + page to
> minimize page -> folio lookups. This is a preparation for further
> changes.
>
> Reject any non-anonymous or hugetlb folios early, directly after GUP.
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
Yeah this makes sense. Even for pmd entries I think we want to make this
very explicit with an explicit hugetlb opt-in I think.
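
I.e. eventually something roughly like the below, purely as a sketch of what
I mean (the flag name is entirely made up):

	/* hypothetical opt-in, not part of this series */
	#define MDE_ALLOW_HUGETLB	(1 << 0)

	struct page *make_device_exclusive(struct mm_struct *mm,
					   unsigned long addr, void *owner,
					   unsigned int flags,
					   struct folio **foliop);
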
Acked-by: Simona Vetter <simona.vetter@ffwll.ch>
--
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
On 30.01.25 14:46, Simona Vetter wrote:
> On Wed, Jan 29, 2025 at 12:54:01PM +0100, David Hildenbrand wrote:
>> The single "real" user in the tree of make_device_exclusive_range() always
>> requests making only a single address exclusive. The current implementation
>> is hard to fix for properly supporting anonymous THP / large folios and
>> for avoiding messing with rmap walks in weird ways.
>>
>> So let's always process a single address/page and return folio + page to
>> minimize page -> folio lookups. This is a preparation for further
>> changes.
>>
>> Reject any non-anonymous or hugetlb folios early, directly after GUP.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>
> Yeah this makes sense. Even for pmd entries I think we want to make this
> very explicit with an explicit hugetlb opt-in I think.
>
> Acked-by: Simona Vetter <simona.vetter@ffwll.ch>

Thanks, I'll fold in the following:

diff --git a/mm/rmap.c b/mm/rmap.c
index 676df4fba5b0..94256925682d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2525,6 +2525,10 @@ static bool folio_make_device_exclusive(struct folio *folio,
  * programming is complete it should drop the page lock and reference after
  * which point CPU access to the page will revoke the exclusive access.
  *
+ * Note: This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before the
+ * conversion happens on a single PTE corresponding to @addr.
+ *
  * Returns: pointer to mapped page on success, otherwise a negative error.
  */
 struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,

--
Cheers,

David / dhildenb
On Wed, Jan 29, 2025 at 12:54:01PM +0100, David Hildenbrand wrote:
> The single "real" user in the tree of make_device_exclusive_range() always
> requests making only a single address exclusive. The current implementation
> is hard to fix for properly supporting anonymous THP / large folios and
> for avoiding messing with rmap walks in weird ways.
>
> So let's always process a single address/page and return folio + page to
> minimize page -> folio lookups. This is a preparation for further
> changes.
>
> Reject any non-anonymous or hugetlb folios early, directly after GUP.
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
[...]
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 17fbfa61f7ef..676df4fba5b0 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -2495,70 +2495,78 @@ static bool folio_make_device_exclusive(struct folio *folio,
> .arg = &args,
> };
>
> - /*
> - * Restrict to anonymous folios for now to avoid potential writeback
> - * issues.
> - */
> - if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
> - return false;
A bit of churn with the previous patch but I suppose it makes a stable backport
of the previous patch easier. The rest looks good so:
Reviewed-by: Alistair Popple <apopple@nvidia.com>
On Thu, Jan 30, 2025 at 04:57:39PM +1100, Alistair Popple wrote:
> On Wed, Jan 29, 2025 at 12:54:01PM +0100, David Hildenbrand wrote:
[...]
> > -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
> > - unsigned long end, struct page **pages,
> > - void *owner)
> > +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
> > + void *owner, struct folio **foliop)
> > {
> > - long npages = (end - start) >> PAGE_SHIFT;
> > - long i;
> > + struct folio *folio;
> > + struct page *page;
> > + long npages;
> > +
> > + mmap_assert_locked(mm);
> >
> > - npages = get_user_pages_remote(mm, start, npages,
> > + /*
> > + * Fault in the page writable and try to lock it; note that if the
> > + * address would already be marked for exclusive use by the device,
> > + * the GUP call would undo that first by triggering a fault.
> > + */
> > + npages = get_user_pages_remote(mm, addr, 1,
> > FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
> > - pages, NULL);
> > - if (npages < 0)
> > - return npages;
> > -
> > - for (i = 0; i < npages; i++, start += PAGE_SIZE) {
> > - struct folio *folio = page_folio(pages[i]);
> > - if (PageTail(pages[i]) || !folio_trylock(folio)) {
> > - folio_put(folio);
> > - pages[i] = NULL;
> > - continue;
> > - }
> > + &page, NULL);
> > + if (npages != 1)
> > + return ERR_PTR(npages);
> > + folio = page_folio(page);
> >
> > - if (!folio_make_device_exclusive(folio, mm, start, owner)) {
> > - folio_unlock(folio);
> > - folio_put(folio);
> > - pages[i] = NULL;
> > - }
> > + if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
> > + folio_put(folio);
> > + return ERR_PTR(-EOPNOTSUPP);
> > + }
> > +
> > + if (!folio_trylock(folio)) {
Actually I think we can make this folio_lock(folio) now. The only reason for
the trylock was to avoid deadlocks between other threads looping over a range
of folios while holding folio locks, which is something the migration code
also does.
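
I.e. something like this on top, as a completely untested sketch:

-	if (!folio_trylock(folio)) {
-		folio_put(folio);
-		return ERR_PTR(-EBUSY);
-	}
+	folio_lock(folio);
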
- Alistair
On 31.01.25 01:28, Alistair Popple wrote:
> On Thu, Jan 30, 2025 at 04:57:39PM +1100, Alistair Popple wrote:
>> On Wed, Jan 29, 2025 at 12:54:01PM +0100, David Hildenbrand wrote:
>
> [...]
>
>>> -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
>>> - unsigned long end, struct page **pages,
>>> - void *owner)
>>> +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
>>> + void *owner, struct folio **foliop)
>>> {
>>> - long npages = (end - start) >> PAGE_SHIFT;
>>> - long i;
>>> + struct folio *folio;
>>> + struct page *page;
>>> + long npages;
>>> +
>>> + mmap_assert_locked(mm);
>>>
>>> - npages = get_user_pages_remote(mm, start, npages,
>>> + /*
>>> + * Fault in the page writable and try to lock it; note that if the
>>> + * address would already be marked for exclusive use by the device,
>>> + * the GUP call would undo that first by triggering a fault.
>>> + */
>>> + npages = get_user_pages_remote(mm, addr, 1,
>>> FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
>>> - pages, NULL);
>>> - if (npages < 0)
>>> - return npages;
>>> -
>>> - for (i = 0; i < npages; i++, start += PAGE_SIZE) {
>>> - struct folio *folio = page_folio(pages[i]);
>>> - if (PageTail(pages[i]) || !folio_trylock(folio)) {
>>> - folio_put(folio);
>>> - pages[i] = NULL;
>>> - continue;
>>> - }
>>> + &page, NULL);
>>> + if (npages != 1)
>>> + return ERR_PTR(npages);
>>> + folio = page_folio(page);
>>>
>>> - if (!folio_make_device_exclusive(folio, mm, start, owner)) {
>>> - folio_unlock(folio);
>>> - folio_put(folio);
>>> - pages[i] = NULL;
>>> - }
>>> + if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
>>> + folio_put(folio);
>>> + return ERR_PTR(-EOPNOTSUPP);
>>> + }
>>> +
>>> + if (!folio_trylock(folio)) {
>
> Actually I think we can make this folio_lock(folio) now. The only reason for
> the trylock was to avoid deadlock between other threads looping over a range
> of folios while holding folio locks which is something the migration code also
> does.
Okay, let me do that in a separate patch. Thanks!
--
Cheers,
David / dhildenb
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 17fbfa61f7ef..676df4fba5b0 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -2495,70 +2495,78 @@ static bool folio_make_device_exclusive(struct folio *folio,
>>  		.arg = &args,
>>  	};
>>
>> -	/*
>> -	 * Restrict to anonymous folios for now to avoid potential writeback
>> -	 * issues.
>> -	 */
>> -	if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
>> -		return false;
>
> A bit of churn with the previous patch but I suppose it makes a stable backport
> of the previous patch easier.

Yes, that's the idea. Thanks!

--
Cheers,

David / dhildenb