[RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes

Posted by Thomas Hellström 1 month, 3 weeks ago
GPU use-cases for mmu_interval_notifiers with HMM often involve
starting a GPU operation and then waiting for it to complete.
These operations are typically context preemption or TLB flushing.

With single-pass notifiers per GPU this doesn't scale in
multi-GPU scenarios. In those scenarios we'd want to first start
preemption or TLB flushing on all GPUs and as a second pass wait
for them to complete on all GPUs.

One can do this on a per-driver basis by multiplexing per-driver
notifiers, but that would mean sharing the notifier "user" lock
across all GPUs, and that doesn't scale well either, so adding support
for multi-pass in the core appears to be the right choice.

Implement multi-pass capability in the mmu_interval_notifier. Use a
linked list for the additional passes to minimize the impact for
use-cases that don't need the multi-pass functionality.
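
A driver-side sketch of how the callbacks could be used follows. All
my_*() names are hypothetical and for illustration only, and driver
locking around the sequence update is omitted:

	struct my_pass {
		struct mmu_interval_notifier_pass base;
		struct my_fence *fence;	/* Wait state carried into the second pass. */
	};

	static struct mmu_interval_notifier_pass *
	my_second_pass(struct mmu_interval_notifier_pass *p,
		       const struct mmu_notifier_range *range,
		       unsigned long cur_seq)
	{
		struct my_pass *pass = container_of(p, struct my_pass, base);

		/* Second pass: wait for the operation started in the first pass. */
		my_fence_wait(pass->fence);
		kfree(pass);

		return NULL;	/* Done, no further passes for this notifier. */
	}

	static bool my_invalidate_multipass(struct mmu_interval_notifier *mni,
					    const struct mmu_notifier_range *range,
					    unsigned long cur_seq,
					    struct mmu_interval_notifier_pass **pass)
	{
		struct my_pass *p = kzalloc(sizeof(*p), GFP_NOWAIT);
		struct my_fence *fence;

		mmu_interval_set_seq(mni, cur_seq);
		/* First pass: only start the invalidation on this GPU. */
		fence = my_start_tlb_invalidation(mni, range);
		if (p) {
			p->base.pass = my_second_pass;
			p->fence = fence;
			*pass = &p->base;
		} else {
			/* Allocation failed: fall back to single-pass operation. */
			my_fence_wait(fence);
		}

		return true;
	}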

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Simona Vetter <simona.vetter@ffwll.ch>
Cc: Dave Airlie <airlied@gmail.com>
Cc: <dri-devel@lists.freedesktop.org>
Cc: <linux-mm@kvack.org>
Cc: <linux-kernel@vger.kernel.org>

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/linux/mmu_notifier.h | 30 ++++++++++++++++
 mm/mmu_notifier.c            | 67 +++++++++++++++++++++++++++++++-----
 2 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d1094c2d5fb6..1107a8eafd8a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -233,6 +233,32 @@ struct mmu_notifier {
 	unsigned int users;
 };
 
+/**
+ * struct mmu_interval_notifier_pass - mmu_interval_notifier multi-pass abstraction
+ * @link: List link for the notifiers pending pass list
+ *
+ * Allocated by the driver in the interval notifier's first pass, typically
+ * using GFP_NOWAIT. If allocation fails (which is not unlikely under memory
+ * pressure), fall back to single-pass operation.
+ */
+struct mmu_interval_notifier_pass {
+	struct list_head link;
+	/**
+	 * @pass: Driver callback for an additional pass.
+	 * @additional_pass: Pointer to the mmu_interval_notifier_pass structure.
+	 * @range: The mmu_notifier_range.
+	 * @cur_seq: The current sequence set by the first pass.
+	 *
+	 * Return: Either a pointer to a valid mmu_interval_notifier_pass for
+	 * another pass to be called, or %NULL if processing is complete for this
+	 * notifier. There is no error reporting mechanism for additional passes.
+	 */
+	struct mmu_interval_notifier_pass *
+	(*pass) (struct mmu_interval_notifier_pass *additional_pass,
+		 const struct mmu_notifier_range *range,
+		 unsigned long cur_seq);
+};
+
 /**
  * struct mmu_interval_notifier_ops
  * @invalidate: Upon return the caller must stop using any SPTEs within this
@@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops {
 	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
 			   const struct mmu_notifier_range *range,
 			   unsigned long cur_seq);
+	bool (*invalidate_multipass)(struct mmu_interval_notifier *interval_sub,
+				     const struct mmu_notifier_range *range,
+				     unsigned long cur_seq,
+				     struct mmu_interval_notifier_pass **pass);
 };
 
 struct mmu_interval_notifier {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8e0125dc0522..dd6af87db103 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -260,6 +260,22 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
 }
 EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
 
+static void mn_itree_additional_passes(struct list_head *additional_passes,
+				       const struct mmu_notifier_range *range,
+				       unsigned long cur_seq)
+{
+	struct mmu_interval_notifier_pass *p, *next;
+
+	while (!list_empty(additional_passes)) {
+		list_for_each_entry_safe(p, next, additional_passes, link) {
+			list_del_init(&p->link);
+			p = p->pass(p, range, cur_seq);
+			if (p)
+				list_add_tail(&p->link, additional_passes);
+		}
+	}
+}
+
 static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 			     struct mm_struct *mm)
 {
@@ -272,17 +288,32 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 	};
 	struct mmu_interval_notifier *interval_sub;
 	unsigned long cur_seq;
+	LIST_HEAD(additional_passes);
 	bool ret;
 
 	for (interval_sub =
 		     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
 	     interval_sub;
 	     interval_sub = mn_itree_inv_next(interval_sub, &range)) {
-		ret = interval_sub->ops->invalidate(interval_sub, &range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_multipass) {
+			struct mmu_interval_notifier_pass *second = NULL;
+
+			ret = interval_sub->ops->invalidate_multipass(interval_sub,
+								      &range,
+								      cur_seq,
+								      &second);
+			if (ret && second)
+				list_add_tail(&second->link, &additional_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    &range,
+							    cur_seq);
+		}
 		WARN_ON(!ret);
 	}
 
+	mn_itree_additional_passes(&additional_passes, &range, cur_seq);
 	mn_itree_inv_end(subscriptions);
 }
 
@@ -431,6 +462,8 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 {
 	struct mmu_interval_notifier *interval_sub;
 	unsigned long cur_seq;
+	LIST_HEAD(additional_passes);
+	int err = 0;
 
 	for (interval_sub =
 		     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
@@ -438,23 +471,39 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 	     interval_sub = mn_itree_inv_next(interval_sub, range)) {
 		bool ret;
 
-		ret = interval_sub->ops->invalidate(interval_sub, range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_multipass) {
+			struct mmu_interval_notifier_pass *second = NULL;
+
+			ret = interval_sub->ops->invalidate_multipass(interval_sub,
+								      range,
+								      cur_seq,
+								      &second);
+			if (ret && second)
+				list_add_tail(&second->link, &additional_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    range,
+							    cur_seq);
+		}
 		if (!ret) {
 			if (WARN_ON(mmu_notifier_range_blockable(range)))
 				continue;
-			goto out_would_block;
+			err = -EAGAIN;
+			break;
 		}
 	}
-	return 0;
 
-out_would_block:
+	mn_itree_additional_passes(&additional_passes, range, cur_seq);
+
 	/*
 	 * On -EAGAIN the non-blocking caller is not allowed to call
 	 * invalidate_range_end()
 	 */
-	mn_itree_inv_end(subscriptions);
-	return -EAGAIN;
+	if (err)
+		mn_itree_inv_end(subscriptions);
+
+	return err;
 }
 
 static int mn_hlist_invalidate_range_start(
-- 
2.50.1

Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Alistair Popple 1 month, 2 weeks ago
On Sat, Aug 09, 2025 at 03:51:32PM +0200, Thomas Hellström wrote:
> GPU use-cases for mmu_interval_notifiers with hmm often involve
> starting a gpu operation and then waiting for it to complete.
> These operations are typically context preemption or TLB flushing.
> 
> With single-pass notifiers per GPU this doesn't scale in
> multi-gpu scenarios. In those scenarios we'd want to first start
> preemption- or TLB flushing on all GPUs and as a second pass wait
> for them to complete on all gpus.
> 
> One can do this on per-driver basis multiplexing per-driver
> notifiers but that would mean sharing the notifier "user" lock
> across all GPUs and that doesn't scale well either, so adding support
> for multi-pass in the core appears like the right choice.
> 
> Implement multi-pass capability in the mmu_interval_notifier. Use a
> linked list for the additional passes to minimize the impact for
> use-cases that don't need the multi-pass functionality.
> 
> Cc: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Simona Vetter <simona.vetter@ffwll.ch>
> Cc: Dave Airlie <airlied@gmail.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Cc: <linux-mm@kvack.org>
> Cc: <linux-kernel@vger.kernel.org>
> 
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>  include/linux/mmu_notifier.h | 30 ++++++++++++++++
>  mm/mmu_notifier.c            | 67 +++++++++++++++++++++++++++++++-----
>  2 files changed, 88 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index d1094c2d5fb6..1107a8eafd8a 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -233,6 +233,32 @@ struct mmu_notifier {
>  	unsigned int users;
>  };
>  
> +/**
> + * struct mmu_interval_notifier_pass - mmu_interval_notifier multi-pass abstraction
> + * @link: List link for the notifiers pending pass list
> + *
> + * Allocate, typically using GFP_NOWAIT in the interval notifier's first pass.
> + * If allocation fails (which is not unlikely under memory pressure), fall back
> + * to single-pass operation.
> + */
> +struct mmu_interval_notifier_pass {

If we limit the number of passes to two maybe call this
`mmu_interval_notifier_finish()`? ...

> +	struct list_head link;
> +	/**
> +	 * @pass: Driver callback for additionall pass.
> +	 * @additional_pass: Pointer to the mmu_interval_notifier_pass structure.
> +	 * @range: The mmu_notifier_range.
> +	 * @cur_seq: The current sequence set by the first pass.
> +	 *
> +	 * Return: Either a pointer to a valid mmu_interval_notifier_pass for
> +	 * another pass to be called, or %NULL if processing is complete for this
> +	 * notifier. There is no error reporting mechanism for additional passes.
> +	 */
> +	struct mmu_interval_notifier_pass *
> +	(*pass) (struct mmu_interval_notifier_pass *additional_pass,

... and call this `finish()` ...

> +		 const struct mmu_notifier_range *range,
> +		 unsigned long cur_seq);
> +};
> +
>  /**
>   * struct mmu_interval_notifier_ops
>   * @invalidate: Upon return the caller must stop using any SPTEs within this
> @@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops {
>  	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
>  			   const struct mmu_notifier_range *range,
>  			   unsigned long cur_seq);
> +	bool (*invalidate_multipass)(struct mmu_interval_notifier *interval_sub,

... and then this could be called `invalidate_start()`. That might address some
of the concerns with naming.

> +				     const struct mmu_notifier_range *range,
> +				     unsigned long cur_seq,
> +				     struct mmu_interval_notifier_pass **pass);
>  };
>  
>  struct mmu_interval_notifier {
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index 8e0125dc0522..dd6af87db103 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -260,6 +260,22 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
>  }
>  EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
>  
> +static void mn_itree_additional_passes(struct list_head *additional_passes,
> +				       const struct mmu_notifier_range *range,
> +				       unsigned long cur_seq)
> +{
> +	struct mmu_interval_notifier_pass *p, *next;
> +
> +	while (!list_empty(additional_passes)) {
> +		list_for_each_entry_safe(p, next, additional_passes, link) {
> +			list_del_init(&p->link);
> +			p = p->pass(p, range, cur_seq);
> +			if (p)
> +				list_add_tail(&p->link, additional_passes);
> +		}
> +	}
> +}
> +
>  static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
>  			     struct mm_struct *mm)
>  {
> @@ -272,17 +288,32 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
>  	};
>  	struct mmu_interval_notifier *interval_sub;
>  	unsigned long cur_seq;
> +	LIST_HEAD(additional_passes);
>  	bool ret;
>  
>  	for (interval_sub =
>  		     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
>  	     interval_sub;
>  	     interval_sub = mn_itree_inv_next(interval_sub, &range)) {
> -		ret = interval_sub->ops->invalidate(interval_sub, &range,
> -						    cur_seq);
> +		if (interval_sub->ops->invalidate_multipass) {
> +			struct mmu_interval_notifier_pass *second = NULL;
> +
> +			ret = interval_sub->ops->invalidate_multipass(interval_sub,
> +								      &range,
> +								      cur_seq,
> +								      &second);
> +			if (ret && second)
> +				list_add_tail(&second->link, &additional_passes);
> +
> +		} else {
> +			ret = interval_sub->ops->invalidate(interval_sub,
> +							    &range,
> +							    cur_seq);
> +		}
>  		WARN_ON(!ret);
>  	}
>  
> +	mn_itree_additional_passes(&additional_passes, &range, cur_seq);
>  	mn_itree_inv_end(subscriptions);
>  }
>  
> @@ -431,6 +462,8 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
>  {
>  	struct mmu_interval_notifier *interval_sub;
>  	unsigned long cur_seq;
> +	LIST_HEAD(additional_passes);
> +	int err = 0;
>  
>  	for (interval_sub =
>  		     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
> @@ -438,23 +471,39 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
>  	     interval_sub = mn_itree_inv_next(interval_sub, range)) {
>  		bool ret;
>  
> -		ret = interval_sub->ops->invalidate(interval_sub, range,
> -						    cur_seq);
> +		if (interval_sub->ops->invalidate_multipass) {
> +			struct mmu_interval_notifier_pass *second = NULL;
> +
> +			ret = interval_sub->ops->invalidate_multipass(interval_sub,
> +								      range,
> +								      cur_seq,
> +								      &second);
> +			if (ret && second)
> +				list_add_tail(&second->link, &additional_passes);
> +
> +		} else {
> +			ret = interval_sub->ops->invalidate(interval_sub,
> +							    range,
> +							    cur_seq);
> +		}
>  		if (!ret) {
>  			if (WARN_ON(mmu_notifier_range_blockable(range)))
>  				continue;
> -			goto out_would_block;
> +			err = -EAGAIN;
> +			break;
>  		}
>  	}
> -	return 0;
>  
> -out_would_block:
> +	mn_itree_additional_passes(&additional_passes, range, cur_seq);
> +
>  	/*
>  	 * On -EAGAIN the non-blocking caller is not allowed to call
>  	 * invalidate_range_end()
>  	 */
> -	mn_itree_inv_end(subscriptions);
> -	return -EAGAIN;
> +	if (err)
> +		mn_itree_inv_end(subscriptions);
> +
> +	return err;
>  }
>  
>  static int mn_hlist_invalidate_range_start(
> -- 
> 2.50.1
> 
> 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Thomas Hellström 1 month, 2 weeks ago
On Tue, 2025-08-19 at 20:03 +1000, Alistair Popple wrote:
> On Sat, Aug 09, 2025 at 03:51:32PM +0200, Thomas Hellström wrote:
> > GPU use-cases for mmu_interval_notifiers with hmm often involve
> > starting a gpu operation and then waiting for it to complete.
> > These operations are typically context preemption or TLB flushing.
> > 
> > With single-pass notifiers per GPU this doesn't scale in
> > multi-gpu scenarios. In those scenarios we'd want to first start
> > preemption- or TLB flushing on all GPUs and as a second pass wait
> > for them to complete on all gpus.
> > 
> > One can do this on per-driver basis multiplexing per-driver
> > notifiers but that would mean sharing the notifier "user" lock
> > across all GPUs and that doesn't scale well either, so adding
> > support
> > for multi-pass in the core appears like the right choice.
> > 
> > Implement multi-pass capability in the mmu_interval_notifier. Use a
> > linked list for the additional passes to minimize the impact for
> > use-cases that don't need the multi-pass functionality.
> > 
> > Cc: Jason Gunthorpe <jgg@ziepe.ca>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Simona Vetter <simona.vetter@ffwll.ch>
> > Cc: Dave Airlie <airlied@gmail.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Cc: <linux-mm@kvack.org>
> > Cc: <linux-kernel@vger.kernel.org>
> > 
> > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > ---
> >  include/linux/mmu_notifier.h | 30 ++++++++++++++++
> >  mm/mmu_notifier.c            | 67 +++++++++++++++++++++++++++++++-
> > ----
> >  2 files changed, 88 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/linux/mmu_notifier.h
> > b/include/linux/mmu_notifier.h
> > index d1094c2d5fb6..1107a8eafd8a 100644
> > --- a/include/linux/mmu_notifier.h
> > +++ b/include/linux/mmu_notifier.h
> > @@ -233,6 +233,32 @@ struct mmu_notifier {
> >  	unsigned int users;
> >  };
> >  
> > +/**
> > + * struct mmu_interval_notifier_pass - mmu_interval_notifier
> > multi-pass abstraction
> > + * @link: List link for the notifiers pending pass list
> > + *
> > + * Allocate, typically using GFP_NOWAIT in the interval notifier's
> > first pass.
> > + * If allocation fails (which is not unlikely under memory
> > pressure), fall back
> > + * to single-pass operation.
> > + */
> > +struct mmu_interval_notifier_pass {
> 
> If we limit the number of passes to two maybe call this
> `mmu_interval_notifier_finish()`? ...
> 
> > +	struct list_head link;
> > +	/**
> > +	 * @pass: Driver callback for additionall pass.
> > +	 * @additional_pass: Pointer to the
> > mmu_interval_notifier_pass structure.
> > +	 * @range: The mmu_notifier_range.
> > +	 * @cur_seq: The current sequence set by the first pass.
> > +	 *
> > +	 * Return: Either a pointer to a valid
> > mmu_interval_notifier_pass for
> > +	 * another pass to be called, or %NULL if processing is
> > complete for this
> > +	 * notifier. There is no error reporting mechanism for
> > additional passes.
> > +	 */
> > +	struct mmu_interval_notifier_pass *
> > +	(*pass) (struct mmu_interval_notifier_pass
> > *additional_pass,
> 

> 
> > +		 const struct mmu_notifier_range *range,
> > +		 unsigned long cur_seq);
> > +};
> > +
> >  /**
> >   * struct mmu_interval_notifier_ops
> >   * @invalidate: Upon return the caller must stop using any SPTEs
> > within this
> > @@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops {
> >  	bool (*invalidate)(struct mmu_interval_notifier
> > *interval_sub,
> >  			   const struct mmu_notifier_range *range,
> >  			   unsigned long cur_seq);
> > +	bool (*invalidate_multipass)(struct mmu_interval_notifier
> > *interval_sub,
> 
> ... and then this could be called `invalidate_start()`. That might
> address some
> of the concerns with naming.

Makes sense. I'll have a look at that.

/Thomas


> 
> > +				     const struct
> > mmu_notifier_range *range,
> > +				     unsigned long cur_seq,
> > +				     struct
> > mmu_interval_notifier_pass **pass);
> >  };
> >  
> >  struct mmu_interval_notifier {
> > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> > index 8e0125dc0522..dd6af87db103 100644
> > --- a/mm/mmu_notifier.c
> > +++ b/mm/mmu_notifier.c
> > @@ -260,6 +260,22 @@ mmu_interval_read_begin(struct
> > mmu_interval_notifier *interval_sub)
> >  }
> >  EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
> >  
> > +static void mn_itree_additional_passes(struct list_head
> > *additional_passes,
> > +				       const struct
> > mmu_notifier_range *range,
> > +				       unsigned long cur_seq)
> > +{
> > +	struct mmu_interval_notifier_pass *p, *next;
> > +
> > +	while (!list_empty(additional_passes)) {
> > +		list_for_each_entry_safe(p, next,
> > additional_passes, link) {
> > +			list_del_init(&p->link);
> > +			p = p->pass(p, range, cur_seq);
> > +			if (p)
> > +				list_add_tail(&p->link,
> > additional_passes);
> > +		}
> > +	}
> > +}
> > +
> >  static void mn_itree_release(struct mmu_notifier_subscriptions
> > *subscriptions,
> >  			     struct mm_struct *mm)
> >  {
> > @@ -272,17 +288,32 @@ static void mn_itree_release(struct
> > mmu_notifier_subscriptions *subscriptions,
> >  	};
> >  	struct mmu_interval_notifier *interval_sub;
> >  	unsigned long cur_seq;
> > +	LIST_HEAD(additional_passes);
> >  	bool ret;
> >  
> >  	for (interval_sub =
> >  		     mn_itree_inv_start_range(subscriptions,
> > &range, &cur_seq);
> >  	     interval_sub;
> >  	     interval_sub = mn_itree_inv_next(interval_sub,
> > &range)) {
> > -		ret = interval_sub->ops->invalidate(interval_sub,
> > &range,
> > -						    cur_seq);
> > +		if (interval_sub->ops->invalidate_multipass) {
> > +			struct mmu_interval_notifier_pass *second
> > = NULL;
> > +
> > +			ret = interval_sub->ops-
> > >invalidate_multipass(interval_sub,
> > +								  
> >     &range,
> > +								  
> >     cur_seq,
> > +								  
> >     &second);
> > +			if (ret && second)
> > +				list_add_tail(&second->link,
> > &additional_passes);
> > +
> > +		} else {
> > +			ret = interval_sub->ops-
> > >invalidate(interval_sub,
> > +							   
> > &range,
> > +							   
> > cur_seq);
> > +		}
> >  		WARN_ON(!ret);
> >  	}
> >  
> > +	mn_itree_additional_passes(&additional_passes, &range,
> > cur_seq);
> >  	mn_itree_inv_end(subscriptions);
> >  }
> >  
> > @@ -431,6 +462,8 @@ static int mn_itree_invalidate(struct
> > mmu_notifier_subscriptions *subscriptions,
> >  {
> >  	struct mmu_interval_notifier *interval_sub;
> >  	unsigned long cur_seq;
> > +	LIST_HEAD(additional_passes);
> > +	int err = 0;
> >  
> >  	for (interval_sub =
> >  		     mn_itree_inv_start_range(subscriptions,
> > range, &cur_seq);
> > @@ -438,23 +471,39 @@ static int mn_itree_invalidate(struct
> > mmu_notifier_subscriptions *subscriptions,
> >  	     interval_sub = mn_itree_inv_next(interval_sub,
> > range)) {
> >  		bool ret;
> >  
> > -		ret = interval_sub->ops->invalidate(interval_sub,
> > range,
> > -						    cur_seq);
> > +		if (interval_sub->ops->invalidate_multipass) {
> > +			struct mmu_interval_notifier_pass *second
> > = NULL;
> > +
> > +			ret = interval_sub->ops-
> > >invalidate_multipass(interval_sub,
> > +								  
> >     range,
> > +								  
> >     cur_seq,
> > +								  
> >     &second);
> > +			if (ret && second)
> > +				list_add_tail(&second->link,
> > &additional_passes);
> > +
> > +		} else {
> > +			ret = interval_sub->ops-
> > >invalidate(interval_sub,
> > +							    range,
> > +							   
> > cur_seq);
> > +		}
> >  		if (!ret) {
> >  			if
> > (WARN_ON(mmu_notifier_range_blockable(range)))
> >  				continue;
> > -			goto out_would_block;
> > +			err = -EAGAIN;
> > +			break;
> >  		}
> >  	}
> > -	return 0;
> >  
> > -out_would_block:
> > +	mn_itree_additional_passes(&additional_passes, range,
> > cur_seq);
> > +
> >  	/*
> >  	 * On -EAGAIN the non-blocking caller is not allowed to
> > call
> >  	 * invalidate_range_end()
> >  	 */
> > -	mn_itree_inv_end(subscriptions);
> > -	return -EAGAIN;
> > +	if (err)
> > +		mn_itree_inv_end(subscriptions);
> > +
> > +	return err;
> >  }
> >  
> >  static int mn_hlist_invalidate_range_start(
> > -- 
> > 2.50.1
> > 
> > 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Jason Gunthorpe 1 month, 2 weeks ago
On Sat, Aug 09, 2025 at 03:51:32PM +0200, Thomas Hellström wrote:
> GPU use-cases for mmu_interval_notifiers with hmm often involve
> starting a gpu operation and then waiting for it to complete.
> These operations are typically context preemption or TLB flushing.
> 
> With single-pass notifiers per GPU this doesn't scale in
> multi-gpu scenarios. In those scenarios we'd want to first start
> preemption- or TLB flushing on all GPUs and as a second pass wait
> for them to complete on all gpus.

The idea seems reasonable but I'm not sure I like the naming of
'multipass' or necessarily the complexity.

This is sort of a co-operative multithreading thing.

Do you really need a linked list here? At least justify the design
choices in the commit message.

> +struct mmu_interval_notifier_pass {
> +	struct list_head link;
> +	/**
> +	 * @pass: Driver callback for additionall pass.
> +	 * @additional_pass: Pointer to the mmu_interval_notifier_pass structure.
> +	 * @range: The mmu_notifier_range.
> +	 * @cur_seq: The current sequence set by the first pass.
> +	 *
> +	 * Return: Either a pointer to a valid mmu_interval_notifier_pass for
> +	 * another pass to be called, or %NULL if processing is complete for this
> +	 * notifier. There is no error reporting mechanism for additional passes.
> +	 */
> +	struct mmu_interval_notifier_pass *
> +	(*pass) (struct mmu_interval_notifier_pass *additional_pass,
> +		 const struct mmu_notifier_range *range,
> +		 unsigned long cur_seq);
> +};
> +
>  /**
>   * struct mmu_interval_notifier_ops
>   * @invalidate: Upon return the caller must stop using any SPTEs within this
> @@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops {
>  	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
>  			   const struct mmu_notifier_range *range,
>  			   unsigned long cur_seq);
> +	bool (*invalidate_multipass)(struct mmu_interval_notifier *interval_sub,
> +				     const struct mmu_notifier_range *range,
> +				     unsigned long cur_seq,
> +				     struct mmu_interval_notifier_pass **pass);

Couldn't this just have a pass number counter and some return code to
indicate this notifier is done?

Or do you really need more than 2 passes? Start/finish make sense
too. Otherwise you may have issues overlapping the backgroundable
operations between different driver types?

> +static void mn_itree_additional_passes(struct list_head *additional_passes,
> +				       const struct mmu_notifier_range *range,
> +				       unsigned long cur_seq)
> +{
> +	struct mmu_interval_notifier_pass *p, *next;
> +
> +	while (!list_empty(additional_passes)) {
> +		list_for_each_entry_safe(p, next, additional_passes, link) {
> +			list_del_init(&p->link);
> +			p = p->pass(p, range, cur_seq);
> +			if (p)
> +				list_add_tail(&p->link, additional_passes);
> +		}
> +	}

Done like this it is very naive: if one driver has only 'prepare' and 'wait
for device ack' passes, then it will immediately stop being concurrent
while another device may still be working on its 3rd pass.

So either this should be more complicated to properly support
different numbers of passes per registration or we should just support
two passes and be done with it?

Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Matthew Brost 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 01:07:26PM -0300, Jason Gunthorpe wrote:
> On Sat, Aug 09, 2025 at 03:51:32PM +0200, Thomas Hellström wrote:
> > GPU use-cases for mmu_interval_notifiers with hmm often involve
> > starting a gpu operation and then waiting for it to complete.
> > These operations are typically context preemption or TLB flushing.
> > 
> > With single-pass notifiers per GPU this doesn't scale in
> > multi-gpu scenarios. In those scenarios we'd want to first start
> > preemption- or TLB flushing on all GPUs and as a second pass wait
> > for them to complete on all gpus.
> 
> The idea seems reasonable but I'm not sure I like the naming of
> 'multipass' or necessarily the complexity.
> 
> This is sort of a co-operative multithreading thing.
> 
> Do you really need a linked list here? At least justify the design
> choices in the commit message..
> 

I think this choice makes sense: it allows embedding the wait state from
the initial notifier call into the pass structure. Patch [6] shows this
by attaching the issued TLB invalidation fences to the pass. Since a
single notifier may be invoked multiple times with different ranges but
the same seqno, I think this is the correct design choice—otherwise
there’s no unambiguous way to track per-invocation wait state. I agree
this should be documented in both the commit message and kernel-doc.

Matt

[6] https://patchwork.freedesktop.org/patch/667844/?series=152725&rev=1

> > +struct mmu_interval_notifier_pass {
> > +	struct list_head link;
> > +	/**
> > +	 * @pass: Driver callback for additionall pass.
> > +	 * @additional_pass: Pointer to the mmu_interval_notifier_pass structure.
> > +	 * @range: The mmu_notifier_range.
> > +	 * @cur_seq: The current sequence set by the first pass.
> > +	 *
> > +	 * Return: Either a pointer to a valid mmu_interval_notifier_pass for
> > +	 * another pass to be called, or %NULL if processing is complete for this
> > +	 * notifier. There is no error reporting mechanism for additional passes.
> > +	 */
> > +	struct mmu_interval_notifier_pass *
> > +	(*pass) (struct mmu_interval_notifier_pass *additional_pass,
> > +		 const struct mmu_notifier_range *range,
> > +		 unsigned long cur_seq);
> > +};
> > +
> >  /**
> >   * struct mmu_interval_notifier_ops
> >   * @invalidate: Upon return the caller must stop using any SPTEs within this
> > @@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops {
> >  	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
> >  			   const struct mmu_notifier_range *range,
> >  			   unsigned long cur_seq);
> > +	bool (*invalidate_multipass)(struct mmu_interval_notifier *interval_sub,
> > +				     const struct mmu_notifier_range *range,
> > +				     unsigned long cur_seq,
> > +				     struct mmu_interval_notifier_pass **pass);
> 
> Couldn't this just have a pass number counter and some return code to
> indicate this notifier is done?
> 
> Or do you really need more than 2 passes? Start/finish make sense
> too. Otherwise you may have issues overlapping the backgroundable
> operations between different driver types?
> 
> > +static void mn_itree_additional_passes(struct list_head *additional_passes,
> > +				       const struct mmu_notifier_range *range,
> > +				       unsigned long cur_seq)
> > +{
> > +	struct mmu_interval_notifier_pass *p, *next;
> > +
> > +	while (!list_empty(additional_passes)) {
> > +		list_for_each_entry_safe(p, next, additional_passes, link) {
> > +			list_del_init(&p->link);
> > +			p = p->pass(p, range, cur_seq);
> > +			if (p)
> > +				list_add_tail(&p->link, additional_passes);
> > +		}
> > +	}
> 
> Like this is very naive, if one driver has only 'prepare' and 'wait
> for device ack' passes, then it will immediately stop being concurrent
> while another device may be still working on its 3rd pass.
> 
> So either this should be more complicated to properly support
> different numbers of passes per registration or we should just support
> two passes and be done with it?
> 
> Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Jason Gunthorpe 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> I think this choice makes sense: it allows embedding the wait state from
> the initial notifier call into the pass structure. Patch [6] shows this
> by attaching the issued TLB invalidation fences to the pass. Since a
> single notifier may be invoked multiple times with different ranges but
> the same seqno,

That should be explained, but also seems to be a bit of a different
issue..

If the design is really to only have two passes and this linked list
is about retaining state then there should not be so much freedom to
have more passes.

Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Matthew Brost 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe wrote:
> On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > I think this choice makes sense: it allows embedding the wait state from
> > the initial notifier call into the pass structure. Patch [6] shows this
> > by attaching the issued TLB invalidation fences to the pass. Since a
> > single notifier may be invoked multiple times with different ranges but
> > the same seqno,
> 
> That should be explained, but also seems to be a bit of a different
> issue..
> 
> If the design is really to only have two passes and this linked list
> is about retaining state then there should not be so much freedom to
> have more passes.

I’ll let Thomas weigh in on whether we really need more than two passes;
my feeling is that two passes are likely sufficient. It’s also worth
noting that the linked list has an added benefit: the notifier tree only
needs to be walked once (a small time-complexity win).

Matt

> 
> Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Jason Gunthorpe 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 09:44:01AM -0700, Matthew Brost wrote:
> On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe wrote:
> > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > > I think this choice makes sense: it allows embedding the wait state from
> > > the initial notifier call into the pass structure. Patch [6] shows this
> > > by attaching the issued TLB invalidation fences to the pass. Since a
> > > single notifier may be invoked multiple times with different ranges but
> > > the same seqno,
> > 
> > That should be explained, but also seems to be a bit of a different
> > issue..
> > 
> > If the design is really to only have two passes and this linked list
> > is about retaining state then there should not be so much freedom to
> > have more passes.
> 
> I’ll let Thomas weigh in on whether we really need more than two passes;
> my feeling is that two passes are likely sufficient. It’s also worth
> noting that the linked list has an added benefit: the notifier tree only
> needs to be walked once (a small time-complexity win).

You may end up keeping the linked list just with no way to add a third
pass.

Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Alistair Popple 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 01:46:55PM -0300, Jason Gunthorpe wrote:
> On Mon, Aug 18, 2025 at 09:44:01AM -0700, Matthew Brost wrote:
> > On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe wrote:
> > > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > > > I think this choice makes sense: it allows embedding the wait state from
> > > > the initial notifier call into the pass structure. Patch [6] shows this
> > > > by attaching the issued TLB invalidation fences to the pass. Since a
> > > > single notifier may be invoked multiple times with different ranges but
> > > > the same seqno,
> > > 
> > > That should be explained, but also seems to be a bit of a different
> > > issue..
> > > 
> > > If the design is really to only have two passes and this linked list
> > > is about retaining state then there should not be so much freedom to
> > > have more passes.
> > 
> > I’ll let Thomas weigh in on whether we really need more than two passes;
> > my feeling is that two passes are likely sufficient. It’s also worth
> > noting that the linked list has an added benefit: the notifier tree only
> > needs to be walked once (a small time-complexity win).
> 
> You may end up keeping the linked list just with no way to add a third
> pass.

It seems to me though that the linked list still adds unnecessary complexity.
I think this would all be much easier to follow if we just added two new
callbacks - say invalidate_start() and invalidate_end() (rough sketch below).

Admittedly that would still require the linked list (or something similar) to
retain the ability to hold/pass a context between the start and end callbacks.
That is a bit annoying; it's a pity we need to allocate memory in a
performance-sensitive path to effectively pass (at least in this case) a
single pointer. I can't think of any obvious solutions to that though.
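
To make the callback suggestion concrete, something like this is what I
mean (rough sketch only, not a worked-out proposal), with a void *context
to carry state from the start to the end callback:

	struct mmu_interval_notifier_ops {
		bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
				   const struct mmu_notifier_range *range,
				   unsigned long cur_seq);
		/* Start the invalidation, optionally handing back a context. */
		bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub,
					 const struct mmu_notifier_range *range,
					 unsigned long cur_seq,
					 void **context);
		/* Wait for / finish whatever invalidate_start() kicked off. */
		void (*invalidate_end)(struct mmu_interval_notifier *interval_sub,
				       const struct mmu_notifier_range *range,
				       unsigned long cur_seq,
				       void *context);
	};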

> Jason
> 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Thomas Hellström 1 month, 2 weeks ago
On Tue, 2025-08-19 at 19:55 +1000, Alistair Popple wrote:
> On Mon, Aug 18, 2025 at 01:46:55PM -0300, Jason Gunthorpe wrote:
> > On Mon, Aug 18, 2025 at 09:44:01AM -0700, Matthew Brost wrote:
> > > On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe wrote:
> > > > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > > > > I think this choice makes sense: it allows embedding the wait
> > > > > state from
> > > > > the initial notifier call into the pass structure. Patch [6]
> > > > > shows this
> > > > > by attaching the issued TLB invalidation fences to the pass.
> > > > > Since a
> > > > > single notifier may be invoked multiple times with different
> > > > > ranges but
> > > > > the same seqno,
> > > > 
> > > > That should be explained, but also seems to be a bit of a
> > > > different
> > > > issue..
> > > > 
> > > > If the design is really to only have two passes and this linked
> > > > list
> > > > is about retaining state then there should not be so much
> > > > freedom to
> > > > have more passes.
> > > 
> > > I’ll let Thomas weigh in on whether we really need more than two
> > > passes;
> > > my feeling is that two passes are likely sufficient. It’s also
> > > worth
> > > noting that the linked list has an added benefit: the notifier
> > > tree only
> > > needs to be walked once (a small time-complexity win).
> > 
> > You may end up keeping the linked list just with no way to add a
> > third
> > pass.
> 
> It seems to me though that linked list still adds unnecessary
> complexity. I
> think this would all be much easier to follow if we just added two
> new callbacks
> - invalidate_start() and invalidate_end() say.

One thing that the linked list avoids, though, is traversing the
interval tree twice. Traversing it twice is O(n*log(n)), whereas the
linked-list overhead is just O(n_2pass).

> 
> Admitedly that would still require the linked list (or something
> similar) to
> retain the ability to hold/pass a context between the start and end
> callbacks.
> Which is bit annoying, it's a pity we need to allocate memory in a
> performance
> sensitive path to effectively pass (at least in this case) a single
> pointer. I
> can't think of any obvious solutions to that though.

One idea is for any two-pass notifier implementation to use a small
pool. That would also to some extent mitigate the risk of out-of-memory
with GFP_NOWAIT.
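
Something like the below, sketch only with hypothetical my_* names, using
a small mempool set up at driver init:

	#include <linux/mempool.h>

	struct my_pass {
		struct mmu_interval_notifier_pass base;
		/* Driver wait state, e.g. invalidation fences, goes here. */
	};

	static mempool_t *my_pass_pool;

	static int my_driver_init(void)
	{
		/* Reserve a few pass objects up front for the notifier path. */
		my_pass_pool = mempool_create_kmalloc_pool(4, sizeof(struct my_pass));

		return my_pass_pool ? 0 : -ENOMEM;
	}

	/* First pass: falls back to the reserved elements if kmalloc fails. */
	static struct my_pass *my_pass_get(void)
	{
		return mempool_alloc(my_pass_pool, GFP_NOWAIT);
	}

	/* Second pass, once the wait has completed: */
	static void my_pass_put(struct my_pass *p)
	{
		mempool_free(p, my_pass_pool);
	}

With GFP_NOWAIT, mempool_alloc() doesn't sleep and dips into the reserved
elements when kmalloc fails, so the notifier only has to degrade to
single-pass operation once the reserve is exhausted as well.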

/Thomas


> 
> > Jason
> > 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Matthew Brost 1 month, 2 weeks ago
On Tue, Aug 19, 2025 at 01:33:40PM +0200, Thomas Hellström wrote:
> On Tue, 2025-08-19 at 19:55 +1000, Alistair Popple wrote:
> > On Mon, Aug 18, 2025 at 01:46:55PM -0300, Jason Gunthorpe wrote:
> > > On Mon, Aug 18, 2025 at 09:44:01AM -0700, Matthew Brost wrote:
> > > > On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe wrote:
> > > > > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > > > > > I think this choice makes sense: it allows embedding the wait
> > > > > > state from
> > > > > > the initial notifier call into the pass structure. Patch [6]
> > > > > > shows this
> > > > > > by attaching the issued TLB invalidation fences to the pass.
> > > > > > Since a
> > > > > > single notifier may be invoked multiple times with different
> > > > > > ranges but
> > > > > > the same seqno,
> > > > > 
> > > > > That should be explained, but also seems to be a bit of a
> > > > > different
> > > > > issue..
> > > > > 
> > > > > If the design is really to only have two passes and this linked
> > > > > list
> > > > > is about retaining state then there should not be so much
> > > > > freedom to
> > > > > have more passes.
> > > > 
> > > > I’ll let Thomas weigh in on whether we really need more than two
> > > > passes;
> > > > my feeling is that two passes are likely sufficient. It’s also
> > > > worth
> > > > noting that the linked list has an added benefit: the notifier
> > > > tree only
> > > > needs to be walked once (a small time-complexity win).
> > > 
> > > You may end up keeping the linked list just with no way to add a
> > > third
> > > pass.
> > 
> > It seems to me though that linked list still adds unnecessary
> > complexity. I
> > think this would all be much easier to follow if we just added two
> > new callbacks
> > - invalidate_start() and invalidate_end() say.
> 
> One thing that the linked list avoids, though, is traversing the
> interval tree two times. It has O(n*log(n)) whereas the linked list
> overhead is just O(n_2pass).
> 
> > 
> > Admitedly that would still require the linked list (or something
> > similar) to
> > retain the ability to hold/pass a context between the start and end
> > callbacks.
> > Which is bit annoying, it's a pity we need to allocate memory in a
> > performance
> > sensitive path to effectively pass (at least in this case) a single
> > pointer. I
> > can't think of any obvious solutions to that though.
> 
> One idea is for any two-pass notifier implementation to use a small
> pool. That would also to some extent mitigate the risk of out-of-memory
> with GFP_NOWAIT.
> 

I think we can attach a preallocated list entry to the driver-side
notifier state; then you’d only need to allocate (or block) if that
notifier is invoked more than once while a wait action (e.g., a TLB
invalidation) is outstanding. Multiple invocations are technically
possible, but in practice I’d expect them to be rare.
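
Something along these lines, sketch only with hypothetical my_* names,
with the common case served by an entry embedded in the driver-side
notifier (the second pass would clear prealloc_busy or kfree() the entry
as appropriate):

	struct my_notifier {
		struct mmu_interval_notifier notifier;
		struct my_pass prealloc;	/* Serves the common case. */
		bool prealloc_busy;		/* A wait is still outstanding. */
	};

	/* Called from the first pass under the driver's notifier lock. */
	static struct my_pass *my_pass_get(struct my_notifier *mn)
	{
		if (!mn->prealloc_busy) {
			mn->prealloc_busy = true;
			return &mn->prealloc;
		}

		/* Rare: re-invoked while the previous wait is still outstanding. */
		return kzalloc(sizeof(struct my_pass), GFP_NOWAIT);
	}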

I’m not sure how much of a win this is, though. On Intel hardware, TLB
invalidations are several orders of magnitude slower than the software
steps our notifiers perform. Ultimately, whether to allocate or
preallocate is a driver-side choice.

Matt

> /Thomas
> 
> 
> > 
> > > Jason
> > > 
> 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Thomas Hellström 1 month, 2 weeks ago
On Tue, 2025-08-19 at 08:35 -0700, Matthew Brost wrote:
> On Tue, Aug 19, 2025 at 01:33:40PM +0200, Thomas Hellström wrote:
> > On Tue, 2025-08-19 at 19:55 +1000, Alistair Popple wrote:
> > > On Mon, Aug 18, 2025 at 01:46:55PM -0300, Jason Gunthorpe wrote:
> > > > On Mon, Aug 18, 2025 at 09:44:01AM -0700, Matthew Brost wrote:
> > > > > On Mon, Aug 18, 2025 at 01:36:17PM -0300, Jason Gunthorpe
> > > > > wrote:
> > > > > > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost
> > > > > > wrote:
> > > > > > > I think this choice makes sense: it allows embedding the
> > > > > > > wait
> > > > > > > state from
> > > > > > > the initial notifier call into the pass structure. Patch
> > > > > > > [6]
> > > > > > > shows this
> > > > > > > by attaching the issued TLB invalidation fences to the
> > > > > > > pass.
> > > > > > > Since a
> > > > > > > single notifier may be invoked multiple times with
> > > > > > > different
> > > > > > > ranges but
> > > > > > > the same seqno,
> > > > > > 
> > > > > > That should be explained, but also seems to be a bit of a
> > > > > > different
> > > > > > issue..
> > > > > > 
> > > > > > If the design is really to only have two passes and this
> > > > > > linked
> > > > > > list
> > > > > > is about retaining state then there should not be so much
> > > > > > freedom to
> > > > > > have more passes.
> > > > > 
> > > > > I’ll let Thomas weigh in on whether we really need more than
> > > > > two
> > > > > passes;
> > > > > my feeling is that two passes are likely sufficient. It’s
> > > > > also
> > > > > worth
> > > > > noting that the linked list has an added benefit: the
> > > > > notifier
> > > > > tree only
> > > > > needs to be walked once (a small time-complexity win).
> > > > 
> > > > You may end up keeping the linked list just with no way to add
> > > > a
> > > > third
> > > > pass.
> > > 
> > > It seems to me though that linked list still adds unnecessary
> > > complexity. I
> > > think this would all be much easier to follow if we just added
> > > two
> > > new callbacks
> > > - invalidate_start() and invalidate_end() say.
> > 
> > One thing that the linked list avoids, though, is traversing the
> > interval tree two times. It has O(n*log(n)) whereas the linked list
> > overhead is just O(n_2pass).
> > 
> > > 
> > > Admitedly that would still require the linked list (or something
> > > similar) to
> > > retain the ability to hold/pass a context between the start and
> > > end
> > > callbacks.
> > > Which is bit annoying, it's a pity we need to allocate memory in
> > > a
> > > performance
> > > sensitive path to effectively pass (at least in this case) a
> > > single
> > > pointer. I
> > > can't think of any obvious solutions to that though.
> > 
> > One idea is for any two-pass notifier implementation to use a small
> > pool. That would also to some extent mitigate the risk of out-of-
> > memory
> > with GFP_NOWAIT.
> > 
> 
> I think we can attach a preallocated list entry to the driver-side
> notifier state; then you’d only need to allocate (or block) if that
> notifier is invoked more than once while a wait action (e.g., a TLB
> invalidation) is outstanding. Multiple invocations are technically
> possible, but in practice I’d expect them to be rare.
> 
> I’m not sure how much of a win this is, though. On Intel hardware,
> TLB
> invalidations are several orders of magnitude slower than the
> software
> steps our notifiers perform. Ultimately, whether to allocate or
> preallocate is a driver-side choice.

I agree we shouldn't enforce anything at this point. But if we envision
a situation where two-pass notifiers from multiple subsystems subscribe,
the GFP_NOWAIT memory might be exhausted by the notifiers called first.
That is a greedy behavior that might eventually cause serialization
anyway.

So to behave nicely towards other notifier subscriptions, an
implementation should ideally have something pre-allocated.

/Thomas


> 
> Matt
> 
> > /Thomas
> > 
> > 
> > > 
> > > > Jason
> > > > 
> > 
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Thomas Hellström 1 month, 2 weeks ago
On Mon, 2025-08-18 at 13:36 -0300, Jason Gunthorpe wrote:
> On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > I think this choice makes sense: it allows embedding the wait state
> > from
> > the initial notifier call into the pass structure. Patch [6] shows
> > this
> > by attaching the issued TLB invalidation fences to the pass. Since
> > a
> > single notifier may be invoked multiple times with different ranges
> > but
> > the same seqno,
> 
> That should be explained, but also seems to be a bit of a different
> issue..
> 
> If the design is really to only have two passes and this linked list
> is about retaining state then there should not be so much freedom to
> have more passes.

Actually the initial suggestion was two passes only. Then I thought I
saw a use-case for even three passes and added the multi-pass thing,
but I think it turned out we didn't have such a use-case. IMO we could
restrict it to two-pass. Matthew, that should be completely OK for the
SVM use-case, right?

/Thomas


> 
> Jason
Re: [RFC PATCH 1/6] mm/mmu_notifier: Allow multiple struct mmu_interval_notifier passes
Posted by Matthew Brost 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 06:42:36PM +0200, Thomas Hellström wrote:
> On Mon, 2025-08-18 at 13:36 -0300, Jason Gunthorpe wrote:
> > On Mon, Aug 18, 2025 at 09:25:20AM -0700, Matthew Brost wrote:
> > > I think this choice makes sense: it allows embedding the wait state
> > > from
> > > the initial notifier call into the pass structure. Patch [6] shows
> > > this
> > > by attaching the issued TLB invalidation fences to the pass. Since
> > > a
> > > single notifier may be invoked multiple times with different ranges
> > > but
> > > the same seqno,
> > 
> > That should be explained, but also seems to be a bit of a different
> > issue..
> > 
> > If the design is really to only have two passes and this linked list
> > is about retaining state then there should not be so much freedom to
> > have more passes.
> 
> Actually the initial suggestion was two passes only. Then I thought I
> saw a use-case for even three passes and added the multi-pass thing,
> but I think it turned out we didn't have such a use-case. IMO we could
> restrict it to two-pass. Matthew, that should be completely OK for the
> SVM use-case, right?
> 

Yea, I just replied that 2 passes should be sufficient.

Matt

> /Thomas
> 
> 
> > 
> > Jason
>