[PATCH 1/6] mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers

Posted by Thomas Hellström 1 month, 1 week ago
GPU use-cases for mmu_interval_notifiers with HMM often involve
starting a GPU operation and then waiting for it to complete.
These operations are typically context preemption or TLB flushing.

With a single-pass notifier per GPU this doesn't scale in
multi-GPU scenarios. In those scenarios we'd want to first start
preemption or TLB flushing on all GPUs and, as a second pass, wait
for them to complete.

One can do this on a per-driver basis by multiplexing per-driver
notifiers, but that would mean sharing the notifier "user" lock
across all GPUs, and that doesn't scale well either, so adding support
for multi-pass in the core appears to be the right choice.

Implement two-pass capability in the mmu_interval_notifier. Use a
linked list for the final passes, both to avoid a second interval
tree walk (minimizing the impact on use-cases that don't need the
multi-pass functionality) and to make it easy to pass data between
the two passes.
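
As an illustration, a two-pass driver-side implementation could look
along the lines of the sketch below. All gpu_* names (gpu_device,
gpu_start_tlb_inval(), gpu_wait_tlb_inval()) are made up for the
example; only the mmu_interval_notifier_* API is from this patch:

/*
 * Hypothetical driver-side sketch; the gpu_* types and functions are
 * invented for illustration only.
 */
struct gpu_notifier {
	struct mmu_interval_notifier notifier;
	struct gpu_device *gpu;
};

struct gpu_notifier_finish {
	struct mmu_interval_notifier_finish base;
	struct gpu_device *gpu;	/* State shared between the two passes. */
};

static void gpu_invalidate_finish(struct mmu_interval_notifier_finish *final,
				  const struct mmu_notifier_range *range,
				  unsigned long cur_seq)
{
	struct gpu_notifier_finish *gf =
		container_of(final, struct gpu_notifier_finish, base);

	/* Second pass: wait for the invalidation started in the first pass. */
	gpu_wait_tlb_inval(gf->gpu);
	kfree(gf);
}

static bool gpu_invalidate_start(struct mmu_interval_notifier *interval_sub,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq,
				 struct mmu_interval_notifier_finish **final)
{
	struct gpu_notifier *gn =
		container_of(interval_sub, struct gpu_notifier, notifier);
	struct gpu_notifier_finish *gf;

	/* Typically done under the driver's notifier lock. */
	mmu_interval_set_seq(interval_sub, cur_seq);

	/* GFP_NOWAIT, since we may be called in a non-blockable context. */
	gf = kzalloc(sizeof(*gf), GFP_NOWAIT);
	if (!gf) {
		/* Fall back to single-pass operation. */
		gpu_start_tlb_inval(gn->gpu);
		gpu_wait_tlb_inval(gn->gpu);
		return true;
	}

	gf->base.finish = gpu_invalidate_finish;
	gf->gpu = gn->gpu;

	/* First pass: start the invalidation, defer the wait. */
	gpu_start_tlb_inval(gn->gpu);
	*final = &gf->base;
	return true;
}

static const struct mmu_interval_notifier_ops gpu_notifier_ops = {
	.invalidate_start = gpu_invalidate_start,
};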

v1:
- Restrict to two passes (Jason Gunthorpe)
- Improve documentation (Jason Gunthorpe)
- Improve function naming (Alistair Popple)

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Simona Vetter <simona.vetter@ffwll.ch>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: <dri-devel@lists.freedesktop.org>
Cc: <linux-mm@kvack.org>
Cc: <linux-kernel@vger.kernel.org>

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/linux/mmu_notifier.h | 42 ++++++++++++++++++++++++
 mm/mmu_notifier.c            | 63 ++++++++++++++++++++++++++++++------
 2 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d1094c2d5fb6..14cfb3735699 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -233,16 +233,58 @@ struct mmu_notifier {
 	unsigned int users;
 };
 
+/**
+ * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction
+ * @link: List link for the notifiers pending pass list
+ *
+ * Allocate, typically using GFP_NOWAIT, in the interval notifier's first pass.
+ * If allocation fails (which is quite possible under memory pressure), fall back
+ * to single-pass operation. Note that with a large number of notifiers
+ * implementing two passes, allocation with GFP_NOWAIT will become increasingly
+ * likely to fail, so consider implementing a small pool instead of using
+ * kmalloc() allocations.
+ *
+ * If the implementation needs to pass data between the two passes,
+ * the recommended way is to embed struct mmu_interval_notifier_finish into a larger
+ * structure that also contains the data needed to be shared. Keep in mind that
+ * a notifier callback can be invoked in parallel, and each invocation needs its
+ * own struct mmu_interval_notifier_finish.
+ */
+struct mmu_interval_notifier_finish {
+	struct list_head link;
+	/**
+	 * @finish: Driver callback for the finish pass.
+	 * @final: Pointer to the mmu_interval_notifier_finish structure.
+	 * @range: The mmu_notifier_range.
+	 * @cur_seq: The current sequence set by the first pass.
+	 *
+	 * Note that there is no error reporting for additional passes.
+	 */
+	void (*finish)(struct mmu_interval_notifier_finish *final,
+		       const struct mmu_notifier_range *range,
+		       unsigned long cur_seq);
+};
+
 /**
  * struct mmu_interval_notifier_ops
  * @invalidate: Upon return the caller must stop using any SPTEs within this
  *              range. This function can sleep. Return false only if sleeping
  *              was required but mmu_notifier_range_blockable(range) is false.
+ * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier
+ *                    callbacks where the call to @invalidate_start is the first
+ *                    pass and any struct mmu_interval_notifier_finish pointer
+ *                    returned in the @final parameter describes the final pass.
+ *                    If @final is %NULL on return, then no final pass will be
+ *                    called.
  */
 struct mmu_interval_notifier_ops {
 	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
 			   const struct mmu_notifier_range *range,
 			   unsigned long cur_seq);
+	bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq,
+				 struct mmu_interval_notifier_finish **final);
 };
 
 struct mmu_interval_notifier {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8e0125dc0522..fceadcd8ca24 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -260,6 +260,18 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
 }
 EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
 
+static void mn_itree_final_pass(struct list_head *final_passes,
+				const struct mmu_notifier_range *range,
+				unsigned long cur_seq)
+{
+	struct mmu_interval_notifier_finish *f, *next;
+
+	list_for_each_entry_safe(f, next, final_passes, link) {
+		list_del(&f->link);
+		f->finish(f, range, cur_seq);
+	}
+}
+
 static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 			     struct mm_struct *mm)
 {
@@ -271,6 +283,7 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 		.end = ULONG_MAX,
 	};
 	struct mmu_interval_notifier *interval_sub;
+	LIST_HEAD(final_passes);
 	unsigned long cur_seq;
 	bool ret;
 
@@ -278,11 +291,25 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 		     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
 	     interval_sub;
 	     interval_sub = mn_itree_inv_next(interval_sub, &range)) {
-		ret = interval_sub->ops->invalidate(interval_sub, &range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_start) {
+			struct mmu_interval_notifier_finish *final = NULL;
+
+			ret = interval_sub->ops->invalidate_start(interval_sub,
+								  &range,
+								  cur_seq,
+								  &final);
+			if (ret && final)
+				list_add_tail(&final->link, &final_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    &range,
+							    cur_seq);
+		}
 		WARN_ON(!ret);
 	}
 
+	mn_itree_final_pass(&final_passes, &range, cur_seq);
 	mn_itree_inv_end(subscriptions);
 }
 
@@ -430,7 +457,9 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 			       const struct mmu_notifier_range *range)
 {
 	struct mmu_interval_notifier *interval_sub;
+	LIST_HEAD(final_passes);
 	unsigned long cur_seq;
+	int err = 0;
 
 	for (interval_sub =
 		     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
@@ -438,23 +467,39 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 	     interval_sub = mn_itree_inv_next(interval_sub, range)) {
 		bool ret;
 
-		ret = interval_sub->ops->invalidate(interval_sub, range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_start) {
+			struct mmu_interval_notifier_finish *final = NULL;
+
+			ret = interval_sub->ops->invalidate_start(interval_sub,
+								  range,
+								  cur_seq,
+								  &final);
+			if (ret && final)
+				list_add_tail(&final->link, &final_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    range,
+							    cur_seq);
+		}
 		if (!ret) {
 			if (WARN_ON(mmu_notifier_range_blockable(range)))
 				continue;
-			goto out_would_block;
+			err = -EAGAIN;
+			break;
 		}
 	}
-	return 0;
 
-out_would_block:
+	mn_itree_final_pass(&final_passes, range, cur_seq);
+
 	/*
 	 * On -EAGAIN the non-blocking caller is not allowed to call
 	 * invalidate_range_end()
 	 */
-	mn_itree_inv_end(subscriptions);
-	return -EAGAIN;
+	if (err)
+		mn_itree_inv_end(subscriptions);
+
+	return err;
 }
 
 static int mn_hlist_invalidate_range_start(
-- 
2.50.1

Re: [PATCH 1/6] mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers
Posted by Jason Gunthorpe 3 weeks, 3 days ago
On Thu, Aug 21, 2025 at 01:46:21PM +0200, Thomas Hellström wrote:
> +struct mmu_interval_notifier_finish {
> +	struct list_head link;
> +	/**
> +	 * @finish: Driver callback for the finish pass.
> +	 * @final: Pointer to the mmu_interval_notifier_finish structure.
> +	 * @range: The mmu_notifier_range.
> +	 * @cur_seq: The current sequence set by the first pass.
> +	 *
> +	 * Note that there is no error reporting for additional passes.
> +	 */
> +	void (*finish)(struct mmu_interval_notifier_finish *final,
> +		       const struct mmu_notifier_range *range,
> +		       unsigned long cur_seq);

I would rather this be in mmu_interval_notifier_ops, though I guess I
see why it was done like this; I don't think it is a great idea for
DRM to wrap the notifier library with yet another library :\

Regardless

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Re: [PATCH 1/6] mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers
Posted by Thomas Hellström 3 weeks, 3 days ago
Hi, Jason

On 9/9/25 19:18, Jason Gunthorpe wrote:
> On Thu, Aug 21, 2025 at 01:46:21PM +0200, Thomas Hellström wrote:
>> +struct mmu_interval_notifier_finish {
>> +	struct list_head link;
>> +	/**
>> +	 * @finish: Driver callback for the finish pass.
>> +	 * @final: Pointer to the mmu_interval_notifier_finish structure.
>> +	 * @range: The mmu_notifier_range.
>> +	 * @cur_seq: The current sequence set by the first pass.
>> +	 *
>> +	 * Note that there is no error reporting for additional passes.
>> +	 */
>> +	void (*finish)(struct mmu_interval_notifier_finish *final,
>> +		       const struct mmu_notifier_range *range,
>> +		       unsigned long cur_seq);
> I would rather this be in mmu_interval_notifier_ops

Thanks for reviewing.

We could have struct mmu_interval_notifier_finish carry a pointer to 
mmu_interval_notifier_ops, or even to the mmu_interval_notifier itself. 
Now that you mention it, IIRC Linus has pointed out that he strongly 
prefers function pointers in const ops wherever possible. I'd like to 
keep the linked list, though, as we've discussed before, for passing 
state and to avoid a second interval tree traversal.
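
E.g. a rough sketch of that direction (not the code from this patch)
could look like:

struct mmu_interval_notifier_finish_ops {
	void (*finish)(struct mmu_interval_notifier_finish *final,
		       const struct mmu_notifier_range *range,
		       unsigned long cur_seq);
};

struct mmu_interval_notifier_finish {
	struct list_head link;
	const struct mmu_interval_notifier_finish_ops *ops;
};

with mn_itree_final_pass() then calling f->ops->finish(f, range,
cur_seq).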

Thanks,

Thomas



>   though I guess I
> see why it was done like this, I don't think it is a great idea for
> DRM to wrapper the notifier library with yet another library :\
>
> Regardless
>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
>
> Jason
Re: [PATCH 1/6] mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers
Posted by Thomas Hellström 1 month ago
Hi,

@Jason, @Alistair, Gentle ping, could you have a look and R-B, Ack if
OK?

Thanks,
Thomas

