xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests to
all GTs in a tile mask and then immediately waits for them to complete
before returning. This is fine for the existing callers, but a
subsequent patch will need to defer the wait in order to overlap TLB
invalidations across multiple VMAs.
Introduce xe_tlb_inval_range_tilemask_submit() and
xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait
halves respectively. The batch of fences is carried in the new
xe_tlb_inval_batch structure. Remove xe_vm_range_tilemask_tlb_inval()
and convert all three call sites to the new API.
Assisted-by: GitHub Copilot:claude-sonnet-4.6
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/xe_svm.c | 6 +-
drivers/gpu/drm/xe/xe_tlb_inval.c | 82 +++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++
drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++
drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------
drivers/gpu/drm/xe/xe_vm.h | 3 -
drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++-
drivers/gpu/drm/xe/xe_vm_types.h | 1 +
8 files changed, 123 insertions(+), 67 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 002b6c22ad3f..6ea4972c2791 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -19,6 +19,7 @@
#include "xe_pt.h"
#include "xe_svm.h"
#include "xe_tile.h"
+#include "xe_tlb_inval.h"
#include "xe_ttm_vram_mgr.h"
#include "xe_vm.h"
#include "xe_vm_types.h"
@@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
const struct mmu_notifier_range *mmu_range)
{
struct xe_vm *vm = gpusvm_to_vm(gpusvm);
+ struct xe_tlb_inval_batch _batch;
struct xe_device *xe = vm->xe;
struct drm_gpusvm_range *r, *first;
struct xe_tile *tile;
@@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
xe_device_wmb(xe);
- err = xe_vm_range_tilemask_tlb_inval(vm, adj_start, adj_end, tile_mask);
+ err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid, adj_start, adj_end,
+ tile_mask, &_batch);
+ xe_tlb_inval_batch_wait(&_batch);
WARN_ON_ONCE(err);
range_notifier_event_end:
diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c b/drivers/gpu/drm/xe/xe_tlb_inval.c
index 933f30fb617d..343e37cfe715 100644
--- a/drivers/gpu/drm/xe/xe_tlb_inval.c
+++ b/drivers/gpu/drm/xe/xe_tlb_inval.c
@@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval)
guard(spinlock_irq)(&tlb_inval->pending_lock);
return list_is_singular(&tlb_inval->pending_fences);
}
+
+/**
+ * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB invalidation batch
+ * @batch: Batch of TLB invalidation fences to wait on
+ *
+ * Waits for every fence in @batch to signal, then resets @batch so it can be
+ * reused for a subsequent invalidation.
+ */
+void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch)
+{
+ struct xe_tlb_inval_fence *fence = &batch->fence[0];
+ unsigned int i;
+
+ for (i = 0; i < batch->num_fences; ++i)
+ xe_tlb_inval_fence_wait(fence++);
+
+ batch->num_fences = 0;
+}
+
+/**
+ * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations for an
+ * address range on a tile mask
+ * @xe: The xe device
+ * @asid: Address space ID
+ * @start: start address
+ * @end: end address
+ * @tile_mask: mask for which gt's issue tlb invalidation
+ * @batch: Batch of tlb invalidate fences
+ *
+ * Issue a range based TLB invalidation for gt's in tilemask
+ *
+ * Returns 0 for success, negative error code otherwise.
+ */
+int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid,
+ u64 start, u64 end, u8 tile_mask,
+ struct xe_tlb_inval_batch *batch)
+{
+ struct xe_tlb_inval_fence *fence = &batch->fence[0];
+ struct xe_tile *tile;
+ u32 fence_id = 0;
+ u8 id;
+ int err;
+
+ batch->num_fences = 0;
+ if (!tile_mask)
+ return 0;
+
+ for_each_tile(tile, xe, id) {
+ if (!(tile_mask & BIT(id)))
+ continue;
+
+ xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
+ &fence[fence_id], true);
+
+ err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
+ &fence[fence_id], start, end,
+ asid, NULL);
+ if (err)
+ goto wait;
+ ++fence_id;
+
+ if (!tile->media_gt)
+ continue;
+
+ xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
+ &fence[fence_id], true);
+
+ err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
+ &fence[fence_id], start, end,
+ asid, NULL);
+ if (err)
+ goto wait;
+ ++fence_id;
+ }
+
+wait:
+ batch->num_fences = fence_id;
+ if (err)
+ xe_tlb_inval_batch_wait(batch);
+
+ return err;
+}
diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h b/drivers/gpu/drm/xe/xe_tlb_inval.h
index 62089254fa23..a76b7823a5f2 100644
--- a/drivers/gpu/drm/xe/xe_tlb_inval.h
+++ b/drivers/gpu/drm/xe/xe_tlb_inval.h
@@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct xe_tlb_inval *tlb_inval, int seqno);
bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval);
+int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid,
+ u64 start, u64 end, u8 tile_mask,
+ struct xe_tlb_inval_batch *batch);
+
+void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch);
+
#endif /* _XE_TLB_INVAL_ */
diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
index 3b089f90f002..3d1797d186fd 100644
--- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
+++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
@@ -9,6 +9,8 @@
#include <linux/workqueue.h>
#include <linux/dma-fence.h>
+#include "xe_device_types.h"
+
struct drm_suballoc;
struct xe_tlb_inval;
@@ -132,4 +134,16 @@ struct xe_tlb_inval_fence {
ktime_t inval_time;
};
+/**
+ * struct xe_tlb_inval_batch - Batch of TLB invalidation fences
+ *
+ * Holds one fence per GT covered by a TLB invalidation request.
+ */
+struct xe_tlb_inval_batch {
+ /** @fence: per-GT TLB invalidation fences */
+ struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
+ /** @num_fences: number of valid entries in @fence */
+ unsigned int num_fences;
+};
+
#endif
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 548b0769b3ef..7f29d2b2972d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm)
dma_resv_unlock(xe_vm_resv(vm));
}
-/**
- * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this tilemask for an
- * address range
- * @vm: The VM
- * @start: start address
- * @end: end address
- * @tile_mask: mask for which gt's issue tlb invalidation
- *
- * Issue a range based TLB invalidation for gt's in tilemask
- *
- * Returns 0 for success, negative error code otherwise.
- */
-int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
- u64 end, u8 tile_mask)
-{
- struct xe_tlb_inval_fence
- fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
- struct xe_tile *tile;
- u32 fence_id = 0;
- u8 id;
- int err;
-
- if (!tile_mask)
- return 0;
-
- for_each_tile(tile, vm->xe, id) {
- if (!(tile_mask & BIT(id)))
- continue;
-
- xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
- &fence[fence_id], true);
-
- err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
- &fence[fence_id], start, end,
- vm->usm.asid, NULL);
- if (err)
- goto wait;
- ++fence_id;
-
- if (!tile->media_gt)
- continue;
-
- xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
- &fence[fence_id], true);
-
- err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
- &fence[fence_id], start, end,
- vm->usm.asid, NULL);
- if (err)
- goto wait;
- ++fence_id;
- }
-
-wait:
- for (id = 0; id < fence_id; ++id)
- xe_tlb_inval_fence_wait(&fence[id]);
-
- return err;
-}
-
/**
* xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
* @vma: VMA to invalidate
@@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
{
struct xe_device *xe = xe_vma_vm(vma)->xe;
struct xe_vm *vm = xe_vma_vm(vma);
+ struct xe_tlb_inval_batch _batch;
struct xe_tile *tile;
u8 tile_mask = 0;
int ret = 0;
@@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
xe_device_wmb(xe);
- ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma),
- xe_vma_end(vma), tile_mask);
+ ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
+ xe_vma_start(vma), xe_vma_end(vma),
+ tile_mask, &_batch);
/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
+ if (!ret)
+ xe_tlb_inval_batch_wait(&_batch);
+
return ret;
}
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index f849e369432b..62f4b6fec0bc 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
struct xe_svm_range *range);
-int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
- u64 end, u8 tile_mask);
-
int xe_vm_invalidate_vma(struct xe_vma *vma);
int xe_vm_validate_protected(struct xe_vm *vm);
diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
index 95bf53cc29e3..39717026e84f 100644
--- a/drivers/gpu/drm/xe/xe_vm_madvise.c
+++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
@@ -12,6 +12,7 @@
#include "xe_pat.h"
#include "xe_pt.h"
#include "xe_svm.h"
+#include "xe_tlb_inval.h"
struct xe_vmas_in_madvise_range {
u64 addr;
@@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);
+ struct xe_tlb_inval_batch batch;
+ int err;
if (!tile_mask)
return 0;
xe_device_wmb(vm->xe);
- return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask);
+ err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end,
+ tile_mask, &batch);
+ xe_tlb_inval_batch_wait(&batch);
+
+ return err;
}
static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 1f6f7e30e751..de6544165cfa 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -18,6 +18,7 @@
#include "xe_device_types.h"
#include "xe_pt_types.h"
#include "xe_range_fence.h"
+#include "xe_tlb_inval_types.h"
#include "xe_userptr.h"
struct drm_pagemap;
--
2.53.0
On Mon, Mar 02, 2026 at 05:32:47PM +0100, Thomas Hellström wrote:
> xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests to
> all GTs in a tile mask and then immediately waits for them to complete
> before returning. This is fine for the existing callers, but a
> subsequent patch will need to defer the wait in order to overlap TLB
> invalidations across multiple VMAs.
>
> Introduce xe_tlb_inval_range_tilemask_submit() and
> xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait
> halves respectively. The batch of fences is carried in the new
> xe_tlb_inval_batch structure. Remove xe_vm_range_tilemask_tlb_inval()
> and convert all three call sites to the new API.
>
Mostly nits...
> Assisted-by: GitHub Copilot:claude-sonnet-4.6
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
> drivers/gpu/drm/xe/xe_svm.c | 6 +-
> drivers/gpu/drm/xe/xe_tlb_inval.c | 82 +++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++
> drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++
> drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------
> drivers/gpu/drm/xe/xe_vm.h | 3 -
> drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++-
> drivers/gpu/drm/xe/xe_vm_types.h | 1 +
> 8 files changed, 123 insertions(+), 67 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 002b6c22ad3f..6ea4972c2791 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -19,6 +19,7 @@
> #include "xe_pt.h"
> #include "xe_svm.h"
> #include "xe_tile.h"
> +#include "xe_tlb_inval.h"
> #include "xe_ttm_vram_mgr.h"
> #include "xe_vm.h"
> #include "xe_vm_types.h"
> @@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
> const struct mmu_notifier_range *mmu_range)
> {
> struct xe_vm *vm = gpusvm_to_vm(gpusvm);
> + struct xe_tlb_inval_batch _batch;
> struct xe_device *xe = vm->xe;
> struct drm_gpusvm_range *r, *first;
> struct xe_tile *tile;
> @@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
>
> xe_device_wmb(xe);
>
> - err = xe_vm_range_tilemask_tlb_inval(vm, adj_start, adj_end, tile_mask);
> + err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid, adj_start, adj_end,
> + tile_mask, &_batch);
> + xe_tlb_inval_batch_wait(&_batch);
No need to call wait on an error but it is harmless.
So you could write it like this:
if (!WARN_ON_ONCE(err))
xe_tlb_inval_batch_wait(&_batch);
> WARN_ON_ONCE(err);
>
> range_notifier_event_end:
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c b/drivers/gpu/drm/xe/xe_tlb_inval.c
> index 933f30fb617d..343e37cfe715 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval.c
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval.c
> @@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval)
> guard(spinlock_irq)(&tlb_inval->pending_lock);
> return list_is_singular(&tlb_inval->pending_fences);
> }
> +
> +/**
> + * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB invalidation batch
> + * @batch: Batch of TLB invalidation fences to wait on
> + *
> + * Waits for every fence in @batch to signal, then resets @batch so it can be
> + * reused for a subsequent invalidation.
> + */
> +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch)
> +{
> + struct xe_tlb_inval_fence *fence = &batch->fence[0];
Would this be better:
s/&batch->fence[0]/batch->fence
Personal preference I guess.
> + unsigned int i;
> +
> + for (i = 0; i < batch->num_fences; ++i)
> + xe_tlb_inval_fence_wait(fence++);
> +
> + batch->num_fences = 0;
> +}
> +
> +/**
> + * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations for an
> + * address range on a tile mask
> + * @xe: The xe device
> + * @asid: Address space ID
> + * @start: start address
> + * @end: end address
> + * @tile_mask: mask for which gt's issue tlb invalidation
> + * @batch: Batch of tlb invalidate fences
> + *
> + * Issue a range based TLB invalidation for gt's in tilemask
> + *
Mention no need to wait on batch if this function returns an error?
> + * Returns 0 for success, negative error code otherwise.
> + */
> +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid,
> + u64 start, u64 end, u8 tile_mask,
> + struct xe_tlb_inval_batch *batch)
> +{
> + struct xe_tlb_inval_fence *fence = &batch->fence[0];
> + struct xe_tile *tile;
> + u32 fence_id = 0;
> + u8 id;
> + int err;
> +
> + batch->num_fences = 0;
> + if (!tile_mask)
> + return 0;
> +
> + for_each_tile(tile, xe, id) {
> + if (!(tile_mask & BIT(id)))
> + continue;
> +
> + xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
> + &fence[fence_id], true);
> +
> + err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
> + &fence[fence_id], start, end,
> + asid, NULL);
> + if (err)
> + goto wait;
> + ++fence_id;
> +
> + if (!tile->media_gt)
> + continue;
> +
> + xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
> + &fence[fence_id], true);
> +
> + err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
> + &fence[fence_id], start, end,
> + asid, NULL);
> + if (err)
> + goto wait;
> + ++fence_id;
> + }
> +
> +wait:
> + batch->num_fences = fence_id;
Should 'batch->num_fences' only get set on success?
> + if (err)
> + xe_tlb_inval_batch_wait(batch);
> +
> + return err;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h b/drivers/gpu/drm/xe/xe_tlb_inval.h
> index 62089254fa23..a76b7823a5f2 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval.h
> @@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct xe_tlb_inval *tlb_inval, int seqno);
>
> bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval);
>
> +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid,
> + u64 start, u64 end, u8 tile_mask,
> + struct xe_tlb_inval_batch *batch);
> +
> +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch);
> +
> #endif /* _XE_TLB_INVAL_ */
> diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> index 3b089f90f002..3d1797d186fd 100644
> --- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> +++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> @@ -9,6 +9,8 @@
> #include <linux/workqueue.h>
> #include <linux/dma-fence.h>
>
> +#include "xe_device_types.h"
> +
> struct drm_suballoc;
> struct xe_tlb_inval;
>
> @@ -132,4 +134,16 @@ struct xe_tlb_inval_fence {
> ktime_t inval_time;
> };
>
> +/**
> + * struct xe_tlb_inval_batch - Batch of TLB invalidation fences
> + *
> + * Holds one fence per GT covered by a TLB invalidation request.
> + */
> +struct xe_tlb_inval_batch {
> + /** @fence: per-GT TLB invalidation fences */
> + struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
> + /** @num_fences: number of valid entries in @fence */
> + unsigned int num_fences;
> +};
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 548b0769b3ef..7f29d2b2972d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm)
> dma_resv_unlock(xe_vm_resv(vm));
> }
>
> -/**
> - * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this tilemask for an
> - * address range
> - * @vm: The VM
> - * @start: start address
> - * @end: end address
> - * @tile_mask: mask for which gt's issue tlb invalidation
> - *
> - * Issue a range based TLB invalidation for gt's in tilemask
> - *
> - * Returns 0 for success, negative error code otherwise.
> - */
> -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> - u64 end, u8 tile_mask)
> -{
> - struct xe_tlb_inval_fence
> - fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
> - struct xe_tile *tile;
> - u32 fence_id = 0;
> - u8 id;
> - int err;
> -
> - if (!tile_mask)
> - return 0;
> -
> - for_each_tile(tile, vm->xe, id) {
> - if (!(tile_mask & BIT(id)))
> - continue;
> -
> - xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
> - &fence[fence_id], true);
> -
> - err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
> - &fence[fence_id], start, end,
> - vm->usm.asid, NULL);
> - if (err)
> - goto wait;
> - ++fence_id;
> -
> - if (!tile->media_gt)
> - continue;
> -
> - xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
> - &fence[fence_id], true);
> -
> - err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
> - &fence[fence_id], start, end,
> - vm->usm.asid, NULL);
> - if (err)
> - goto wait;
> - ++fence_id;
> - }
> -
> -wait:
> - for (id = 0; id < fence_id; ++id)
> - xe_tlb_inval_fence_wait(&fence[id]);
> -
> - return err;
> -}
> -
> /**
> * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
> * @vma: VMA to invalidate
> @@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
> {
> struct xe_device *xe = xe_vma_vm(vma)->xe;
> struct xe_vm *vm = xe_vma_vm(vma);
> + struct xe_tlb_inval_batch _batch;
Why not just 'batch'?
> struct xe_tile *tile;
> u8 tile_mask = 0;
> int ret = 0;
> @@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
>
> xe_device_wmb(xe);
>
> - ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma),
> - xe_vma_end(vma), tile_mask);
> + ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
> + xe_vma_start(vma), xe_vma_end(vma),
> + tile_mask, &_batch);
>
> /* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
> WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
>
> + if (!ret)
> + xe_tlb_inval_batch_wait(&_batch);
> +
Here we skip the wait on error, hence my suggestion to skip waits in
other code paths or at a minimum make the call semantics consistent.
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index f849e369432b..62f4b6fec0bc 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
> struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
> struct xe_svm_range *range);
>
> -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> - u64 end, u8 tile_mask);
> -
> int xe_vm_invalidate_vma(struct xe_vma *vma);
>
> int xe_vm_validate_protected(struct xe_vm *vm);
> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
> index 95bf53cc29e3..39717026e84f 100644
> --- a/drivers/gpu/drm/xe/xe_vm_madvise.c
> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> @@ -12,6 +12,7 @@
> #include "xe_pat.h"
> #include "xe_pt.h"
> #include "xe_svm.h"
> +#include "xe_tlb_inval.h"
>
> struct xe_vmas_in_madvise_range {
> u64 addr;
> @@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
> static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
> {
> u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);
> + struct xe_tlb_inval_batch batch;
> + int err;
>
> if (!tile_mask)
> return 0;
>
> xe_device_wmb(vm->xe);
>
> - return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask);
> + err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end,
> + tile_mask, &batch);
> + xe_tlb_inval_batch_wait(&batch);
No need to wait on error.
> +
> + return err;
> }
>
> static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index 1f6f7e30e751..de6544165cfa 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -18,6 +18,7 @@
> #include "xe_device_types.h"
> #include "xe_pt_types.h"
> #include "xe_range_fence.h"
> +#include "xe_tlb_inval_types.h"
> #include "xe_userptr.h"
>
> struct drm_pagemap;
> --
> 2.53.0
>
On Mon, 2026-03-02 at 11:06 -0800, Matthew Brost wrote:
> On Mon, Mar 02, 2026 at 05:32:47PM +0100, Thomas Hellström wrote:
> > xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests
> > to
> > all GTs in a tile mask and then immediately waits for them to
> > complete
> > before returning. This is fine for the existing callers, but a
> > subsequent patch will need to defer the wait in order to overlap
> > TLB
> > invalidations across multiple VMAs.
> >
> > Introduce xe_tlb_inval_range_tilemask_submit() and
> > xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait
> > halves respectively. The batch of fences is carried in the new
> > xe_tlb_inval_batch structure. Remove
> > xe_vm_range_tilemask_tlb_inval()
> > and convert all three call sites to the new API.
> >
>
> Mostly nits...
>
> > Assisted-by: GitHub Copilot:claude-sonnet-4.6
> > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_svm.c | 6 +-
> > drivers/gpu/drm/xe/xe_tlb_inval.c | 82
> > +++++++++++++++++++++++++
> > drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++
> > drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++
> > drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------
> > drivers/gpu/drm/xe/xe_vm.h | 3 -
> > drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++-
> > drivers/gpu/drm/xe/xe_vm_types.h | 1 +
> > 8 files changed, 123 insertions(+), 67 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_svm.c
> > b/drivers/gpu/drm/xe/xe_svm.c
> > index 002b6c22ad3f..6ea4972c2791 100644
> > --- a/drivers/gpu/drm/xe/xe_svm.c
> > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > @@ -19,6 +19,7 @@
> > #include "xe_pt.h"
> > #include "xe_svm.h"
> > #include "xe_tile.h"
> > +#include "xe_tlb_inval.h"
> > #include "xe_ttm_vram_mgr.h"
> > #include "xe_vm.h"
> > #include "xe_vm_types.h"
> > @@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm
> > *gpusvm,
> > const struct mmu_notifier_range
> > *mmu_range)
> > {
> > struct xe_vm *vm = gpusvm_to_vm(gpusvm);
> > + struct xe_tlb_inval_batch _batch;
> > struct xe_device *xe = vm->xe;
> > struct drm_gpusvm_range *r, *first;
> > struct xe_tile *tile;
> > @@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm
> > *gpusvm,
> >
> > xe_device_wmb(xe);
> >
> > - err = xe_vm_range_tilemask_tlb_inval(vm, adj_start,
> > adj_end, tile_mask);
> > + err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid,
> > adj_start, adj_end,
> > + tile_mask,
> > &_batch);
> > + xe_tlb_inval_batch_wait(&_batch);
>
> No need to call wait on an error but it is harmless.
>
> So you could write it like this:
>
> if (!WARN_ON_ONCE(err))
> xe_tlb_inval_batch_wait(&_batch);
Sure.
>
> > WARN_ON_ONCE(err);
> >
> > range_notifier_event_end:
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c
> > b/drivers/gpu/drm/xe/xe_tlb_inval.c
> > index 933f30fb617d..343e37cfe715 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval.c
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.c
> > @@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval
> > *tlb_inval)
> > guard(spinlock_irq)(&tlb_inval->pending_lock);
> > return list_is_singular(&tlb_inval->pending_fences);
> > }
> > +
> > +/**
> > + * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB
> > invalidation batch
> > + * @batch: Batch of TLB invalidation fences to wait on
> > + *
> > + * Waits for every fence in @batch to signal, then resets @batch
> > so it can be
> > + * reused for a subsequent invalidation.
> > + */
> > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch)
> > +{
> > + struct xe_tlb_inval_fence *fence = &batch->fence[0];
>
> Would this be better:
>
> s/&batch->fence[0]/batch->fence
>
> Personal preference I guess.
Yeah, I typically use the former to make it easier for
the reader to remember we're pointing to the first element of an array.
>
> > + unsigned int i;
> > +
> > + for (i = 0; i < batch->num_fences; ++i)
> > + xe_tlb_inval_fence_wait(fence++);
> > +
> > + batch->num_fences = 0;
> > +}
> > +
> > +/**
> > + * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations
> > for an
> > + * address range on a tile mask
> > + * @xe: The xe device
> > + * @asid: Address space ID
> > + * @start: start address
> > + * @end: end address
> > + * @tile_mask: mask for which gt's issue tlb invalidation
> > + * @batch: Batch of tlb invalidate fences
> > + *
> > + * Issue a range based TLB invalidation for gt's in tilemask
> > + *
>
> Mention no need to wait on batch if this function returns an error?
Sure.
>
> > + * Returns 0 for success, negative error code otherwise.
> > + */
> > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32
> > asid,
> > + u64 start, u64 end, u8
> > tile_mask,
> > + struct xe_tlb_inval_batch
> > *batch)
> > +{
> > + struct xe_tlb_inval_fence *fence = &batch->fence[0];
> > + struct xe_tile *tile;
> > + u32 fence_id = 0;
> > + u8 id;
> > + int err;
> > +
> > + batch->num_fences = 0;
> > + if (!tile_mask)
> > + return 0;
> > +
> > + for_each_tile(tile, xe, id) {
> > + if (!(tile_mask & BIT(id)))
> > + continue;
> > +
> > + xe_tlb_inval_fence_init(&tile->primary_gt-
> > >tlb_inval,
> > + &fence[fence_id], true);
> > +
> > + err = xe_tlb_inval_range(&tile->primary_gt-
> > >tlb_inval,
> > + &fence[fence_id], start,
> > end,
> > + asid, NULL);
> > + if (err)
> > + goto wait;
> > + ++fence_id;
> > +
> > + if (!tile->media_gt)
> > + continue;
> > +
> > + xe_tlb_inval_fence_init(&tile->media_gt-
> > >tlb_inval,
> > + &fence[fence_id], true);
> > +
> > + err = xe_tlb_inval_range(&tile->media_gt-
> > >tlb_inval,
> > + &fence[fence_id], start,
> > end,
> > + asid, NULL);
> > + if (err)
> > + goto wait;
> > + ++fence_id;
> > + }
> > +
> > +wait:
> > + batch->num_fences = fence_id;
>
> Should 'batch->num_fences' only get set on success?
We need it for the error wait below, after which it gets cleared.
>
> > + if (err)
> > + xe_tlb_inval_batch_wait(batch);
> > +
> > + return err;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h
> > b/drivers/gpu/drm/xe/xe_tlb_inval.h
> > index 62089254fa23..a76b7823a5f2 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval.h
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.h
> > @@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct
> > xe_tlb_inval *tlb_inval, int seqno);
> >
> > bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval);
> >
> > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32
> > asid,
> > + u64 start, u64 end, u8
> > tile_mask,
> > + struct xe_tlb_inval_batch
> > *batch);
> > +
> > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch);
> > +
> > #endif /* _XE_TLB_INVAL_ */
> > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > index 3b089f90f002..3d1797d186fd 100644
> > --- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > @@ -9,6 +9,8 @@
> > #include <linux/workqueue.h>
> > #include <linux/dma-fence.h>
> >
> > +#include "xe_device_types.h"
> > +
> > struct drm_suballoc;
> > struct xe_tlb_inval;
> >
> > @@ -132,4 +134,16 @@ struct xe_tlb_inval_fence {
> > ktime_t inval_time;
> > };
> >
> > +/**
> > + * struct xe_tlb_inval_batch - Batch of TLB invalidation fences
> > + *
> > + * Holds one fence per GT covered by a TLB invalidation request.
> > + */
> > +struct xe_tlb_inval_batch {
> > + /** @fence: per-GT TLB invalidation fences */
> > + struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE *
> > XE_MAX_GT_PER_TILE];
> > + /** @num_fences: number of valid entries in @fence */
> > + unsigned int num_fences;
> > +};
> > +
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > b/drivers/gpu/drm/xe/xe_vm.c
> > index 548b0769b3ef..7f29d2b2972d 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm)
> > dma_resv_unlock(xe_vm_resv(vm));
> > }
> >
> > -/**
> > - * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on
> > this tilemask for an
> > - * address range
> > - * @vm: The VM
> > - * @start: start address
> > - * @end: end address
> > - * @tile_mask: mask for which gt's issue tlb invalidation
> > - *
> > - * Issue a range based TLB invalidation for gt's in tilemask
> > - *
> > - * Returns 0 for success, negative error code otherwise.
> > - */
> > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> > - u64 end, u8 tile_mask)
> > -{
> > - struct xe_tlb_inval_fence
> > - fence[XE_MAX_TILES_PER_DEVICE *
> > XE_MAX_GT_PER_TILE];
> > - struct xe_tile *tile;
> > - u32 fence_id = 0;
> > - u8 id;
> > - int err;
> > -
> > - if (!tile_mask)
> > - return 0;
> > -
> > - for_each_tile(tile, vm->xe, id) {
> > - if (!(tile_mask & BIT(id)))
> > - continue;
> > -
> > - xe_tlb_inval_fence_init(&tile->primary_gt-
> > >tlb_inval,
> > - &fence[fence_id], true);
> > -
> > - err = xe_tlb_inval_range(&tile->primary_gt-
> > >tlb_inval,
> > - &fence[fence_id], start,
> > end,
> > - vm->usm.asid, NULL);
> > - if (err)
> > - goto wait;
> > - ++fence_id;
> > -
> > - if (!tile->media_gt)
> > - continue;
> > -
> > - xe_tlb_inval_fence_init(&tile->media_gt-
> > >tlb_inval,
> > - &fence[fence_id], true);
> > -
> > - err = xe_tlb_inval_range(&tile->media_gt-
> > >tlb_inval,
> > - &fence[fence_id], start,
> > end,
> > - vm->usm.asid, NULL);
> > - if (err)
> > - goto wait;
> > - ++fence_id;
> > - }
> > -
> > -wait:
> > - for (id = 0; id < fence_id; ++id)
> > - xe_tlb_inval_fence_wait(&fence[id]);
> > -
> > - return err;
> > -}
> > -
> > /**
> > * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without
> > a lock
> > * @vma: VMA to invalidate
> > @@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
> > {
> > struct xe_device *xe = xe_vma_vm(vma)->xe;
> > struct xe_vm *vm = xe_vma_vm(vma);
> > + struct xe_tlb_inval_batch _batch;
>
> Why not just 'batch'?
>
> > struct xe_tile *tile;
> > u8 tile_mask = 0;
> > int ret = 0;
> > @@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma
> > *vma)
> >
> > xe_device_wmb(xe);
> >
> > - ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma),
> > xe_vma_start(vma),
> > - xe_vma_end(vma),
> > tile_mask);
> > + ret = xe_tlb_inval_range_tilemask_submit(xe,
> > xe_vma_vm(vma)->usm.asid,
> > +
> > xe_vma_start(vma), xe_vma_end(vma),
> > + tile_mask,
> > &_batch);
> >
> > /* WRITE_ONCE pairs with READ_ONCE in
> > xe_vm_has_valid_gpu_mapping() */
> > WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
> >
> > + if (!ret)
> > + xe_tlb_inval_batch_wait(&_batch);
> > +
>
> Here we skip the wait on error, hence my suggestion to skip waits in
> other code paths or at a minimum make call semantics consistent.
Makes sense.
>
> > return ret;
> > }
> >
> > diff --git a/drivers/gpu/drm/xe/xe_vm.h
> > b/drivers/gpu/drm/xe/xe_vm.h
> > index f849e369432b..62f4b6fec0bc 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.h
> > +++ b/drivers/gpu/drm/xe/xe_vm.h
> > @@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct
> > xe_vm *vm,
> > struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
> > struct xe_svm_range *range);
> >
> > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> > - u64 end, u8 tile_mask);
> > -
> > int xe_vm_invalidate_vma(struct xe_vma *vma);
> >
> > int xe_vm_validate_protected(struct xe_vm *vm);
> > diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c
> > b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > index 95bf53cc29e3..39717026e84f 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_madvise.c
> > +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > @@ -12,6 +12,7 @@
> > #include "xe_pat.h"
> > #include "xe_pt.h"
> > #include "xe_svm.h"
> > +#include "xe_tlb_inval.h"
> >
> > struct xe_vmas_in_madvise_range {
> > u64 addr;
> > @@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct
> > xe_vm *vm, u64 start, u64 end)
> > static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64
> > start, u64 end)
> > {
> > u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start,
> > end);
> > + struct xe_tlb_inval_batch batch;
> > + int err;
> >
> > if (!tile_mask)
> > return 0;
> >
> > xe_device_wmb(vm->xe);
> >
> > - return xe_vm_range_tilemask_tlb_inval(vm, start, end,
> > tile_mask);
> > + err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm-
> > >usm.asid, start, end,
> > + tile_mask,
> > &batch);
> > + xe_tlb_inval_batch_wait(&batch);
>
> No need to wait on error.
Will fix
Thanks,
Thomas
>
> > +
> > + return err;
> > }
> >
> > static bool madvise_args_are_sane(struct xe_device *xe, const
> > struct drm_xe_madvise *args)
> > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> > b/drivers/gpu/drm/xe/xe_vm_types.h
> > index 1f6f7e30e751..de6544165cfa 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > @@ -18,6 +18,7 @@
> > #include "xe_device_types.h"
> > #include "xe_pt_types.h"
> > #include "xe_range_fence.h"
> > +#include "xe_tlb_inval_types.h"
> > #include "xe_userptr.h"
> >
> > struct drm_pagemap;
> > --
> > 2.53.0
> >
On Mon, Mar 02, 2026 at 10:29:22PM +0100, Thomas Hellström wrote:
> On Mon, 2026-03-02 at 11:06 -0800, Matthew Brost wrote:
> > On Mon, Mar 02, 2026 at 05:32:47PM +0100, Thomas Hellström wrote:
> > > xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests
> > > to
> > > all GTs in a tile mask and then immediately waits for them to
> > > complete
> > > before returning. This is fine for the existing callers, but a
> > > subsequent patch will need to defer the wait in order to overlap
> > > TLB
> > > invalidations across multiple VMAs.
> > >
> > > Introduce xe_tlb_inval_range_tilemask_submit() and
> > > xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait
> > > halves respectively. The batch of fences is carried in the new
> > > xe_tlb_inval_batch structure. Remove
> > > xe_vm_range_tilemask_tlb_inval()
> > > and convert all three call sites to the new API.
> > >
> >
> > Mostly nits...
> >
> > > Assisted-by: GitHub Copilot:claude-sonnet-4.6
> > > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_svm.c | 6 +-
> > > drivers/gpu/drm/xe/xe_tlb_inval.c | 82
> > > +++++++++++++++++++++++++
> > > drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++
> > > drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++
> > > drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------
> > > drivers/gpu/drm/xe/xe_vm.h | 3 -
> > > drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++-
> > > drivers/gpu/drm/xe/xe_vm_types.h | 1 +
> > > 8 files changed, 123 insertions(+), 67 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_svm.c
> > > b/drivers/gpu/drm/xe/xe_svm.c
> > > index 002b6c22ad3f..6ea4972c2791 100644
> > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > @@ -19,6 +19,7 @@
> > > #include "xe_pt.h"
> > > #include "xe_svm.h"
> > > #include "xe_tile.h"
> > > +#include "xe_tlb_inval.h"
> > > #include "xe_ttm_vram_mgr.h"
> > > #include "xe_vm.h"
> > > #include "xe_vm_types.h"
> > > @@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm
> > > *gpusvm,
> > > const struct mmu_notifier_range
> > > *mmu_range)
> > > {
> > > struct xe_vm *vm = gpusvm_to_vm(gpusvm);
> > > + struct xe_tlb_inval_batch _batch;
> > > struct xe_device *xe = vm->xe;
> > > struct drm_gpusvm_range *r, *first;
> > > struct xe_tile *tile;
> > > @@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm
> > > *gpusvm,
> > >
> > > xe_device_wmb(xe);
> > >
> > > - err = xe_vm_range_tilemask_tlb_inval(vm, adj_start,
> > > adj_end, tile_mask);
> > > + err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid,
> > > adj_start, adj_end,
> > > + tile_mask,
> > > &_batch);
> > > + xe_tlb_inval_batch_wait(&_batch);
> >
> > No need to call wait on an error but it is harmless.
> >
> > So you could write it like this:
> >
> > if (!WARN_ON_ONCE(err))
> > xe_tlb_inval_batch_wait(&_batch);
>
> Sure.
>
> >
> > > WARN_ON_ONCE(err);
> > >
> > > range_notifier_event_end:
> > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c
> > > b/drivers/gpu/drm/xe/xe_tlb_inval.c
> > > index 933f30fb617d..343e37cfe715 100644
> > > --- a/drivers/gpu/drm/xe/xe_tlb_inval.c
> > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.c
> > > @@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval
> > > *tlb_inval)
> > > guard(spinlock_irq)(&tlb_inval->pending_lock);
> > > return list_is_singular(&tlb_inval->pending_fences);
> > > }
> > > +
> > > +/**
> > > + * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB
> > > invalidation batch
> > > + * @batch: Batch of TLB invalidation fences to wait on
> > > + *
> > > + * Waits for every fence in @batch to signal, then resets @batch
> > > so it can be
> > > + * reused for a subsequent invalidation.
> > > + */
> > > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch)
> > > +{
> > > + struct xe_tlb_inval_fence *fence = &batch->fence[0];
> >
> > Would this be better:
> >
> > s/&batch->fence[0]/batch->fence
> >
> > Personal preference I guess.
>
> Yeah, I typically use the former to make it easier for
> the reader to remember we're pointing to the first element of an array.
>
Ok, fine with this. I know I have done it both ways more than once.
> >
> > > + unsigned int i;
> > > +
> > > + for (i = 0; i < batch->num_fences; ++i)
> > > + xe_tlb_inval_fence_wait(fence++);
> > > +
> > > + batch->num_fences = 0;
> > > +}
> > > +
> > > +/**
> > > + * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations
> > > for an
> > > + * address range on a tile mask
> > > + * @xe: The xe device
> > > + * @asid: Address space ID
> > > + * @start: start address
> > > + * @end: end address
> > > + * @tile_mask: mask for which gt's issue tlb invalidation
> > > + * @batch: Batch of tlb invalidate fences
> > > + *
> > > + * Issue a range based TLB invalidation for gt's in tilemask
> > > + *
> >
> > Mention no need to wait on batch if this function returns an error?
>
> Sure.
>
> >
> > > + * Returns 0 for success, negative error code otherwise.
> > > + */
> > > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32
> > > asid,
> > > + u64 start, u64 end, u8
> > > tile_mask,
> > > + struct xe_tlb_inval_batch
> > > *batch)
> > > +{
> > > + struct xe_tlb_inval_fence *fence = &batch->fence[0];
> > > + struct xe_tile *tile;
> > > + u32 fence_id = 0;
> > > + u8 id;
> > > + int err;
> > > +
> > > + batch->num_fences = 0;
> > > + if (!tile_mask)
> > > + return 0;
> > > +
> > > + for_each_tile(tile, xe, id) {
> > > + if (!(tile_mask & BIT(id)))
> > > + continue;
> > > +
> > > + xe_tlb_inval_fence_init(&tile->primary_gt-
> > > >tlb_inval,
> > > + &fence[fence_id], true);
> > > +
> > > + err = xe_tlb_inval_range(&tile->primary_gt-
> > > >tlb_inval,
> > > + &fence[fence_id], start,
> > > end,
> > > + asid, NULL);
> > > + if (err)
> > > + goto wait;
> > > + ++fence_id;
> > > +
> > > + if (!tile->media_gt)
> > > + continue;
> > > +
> > > + xe_tlb_inval_fence_init(&tile->media_gt-
> > > >tlb_inval,
> > > + &fence[fence_id], true);
> > > +
> > > + err = xe_tlb_inval_range(&tile->media_gt-
> > > >tlb_inval,
> > > + &fence[fence_id], start,
> > > end,
> > > + asid, NULL);
> > > + if (err)
> > > + goto wait;
> > > + ++fence_id;
> > > + }
> > > +
> > > +wait:
> > > + batch->num_fences = fence_id;
> >
> > Should 'batch->num_fences' only get set on success?
>
> We need it for the error wait below, after which it gets cleared.
>
Right, bad suggestion.
Matt
> >
> > > + if (err)
> > > + xe_tlb_inval_batch_wait(batch);
> > > +
> > > + return err;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h
> > > b/drivers/gpu/drm/xe/xe_tlb_inval.h
> > > index 62089254fa23..a76b7823a5f2 100644
> > > --- a/drivers/gpu/drm/xe/xe_tlb_inval.h
> > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.h
> > > @@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct
> > > xe_tlb_inval *tlb_inval, int seqno);
> > >
> > > bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval);
> > >
> > > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32
> > > asid,
> > > + u64 start, u64 end, u8
> > > tile_mask,
> > > + struct xe_tlb_inval_batch
> > > *batch);
> > > +
> > > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch);
> > > +
> > > #endif /* _XE_TLB_INVAL_ */
> > > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > > b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > > index 3b089f90f002..3d1797d186fd 100644
> > > --- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h
> > > @@ -9,6 +9,8 @@
> > > #include <linux/workqueue.h>
> > > #include <linux/dma-fence.h>
> > >
> > > +#include "xe_device_types.h"
> > > +
> > > struct drm_suballoc;
> > > struct xe_tlb_inval;
> > >
> > > @@ -132,4 +134,16 @@ struct xe_tlb_inval_fence {
> > > ktime_t inval_time;
> > > };
> > >
> > > +/**
> > > + * struct xe_tlb_inval_batch - Batch of TLB invalidation fences
> > > + *
> > > + * Holds one fence per GT covered by a TLB invalidation request.
> > > + */
> > > +struct xe_tlb_inval_batch {
> > > + /** @fence: per-GT TLB invalidation fences */
> > > + struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE *
> > > XE_MAX_GT_PER_TILE];
> > > + /** @num_fences: number of valid entries in @fence */
> > > + unsigned int num_fences;
> > > +};
> > > +
> > > #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_vm.c
> > > b/drivers/gpu/drm/xe/xe_vm.c
> > > index 548b0769b3ef..7f29d2b2972d 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > @@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm)
> > > dma_resv_unlock(xe_vm_resv(vm));
> > > }
> > >
> > > -/**
> > > - * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on
> > > this tilemask for an
> > > - * address range
> > > - * @vm: The VM
> > > - * @start: start address
> > > - * @end: end address
> > > - * @tile_mask: mask for which gt's issue tlb invalidation
> > > - *
> > > - * Issue a range based TLB invalidation for gt's in tilemask
> > > - *
> > > - * Returns 0 for success, negative error code otherwise.
> > > - */
> > > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> > > - u64 end, u8 tile_mask)
> > > -{
> > > - struct xe_tlb_inval_fence
> > > - fence[XE_MAX_TILES_PER_DEVICE *
> > > XE_MAX_GT_PER_TILE];
> > > - struct xe_tile *tile;
> > > - u32 fence_id = 0;
> > > - u8 id;
> > > - int err;
> > > -
> > > - if (!tile_mask)
> > > - return 0;
> > > -
> > > - for_each_tile(tile, vm->xe, id) {
> > > - if (!(tile_mask & BIT(id)))
> > > - continue;
> > > -
> > > - xe_tlb_inval_fence_init(&tile->primary_gt-
> > > >tlb_inval,
> > > - &fence[fence_id], true);
> > > -
> > > - err = xe_tlb_inval_range(&tile->primary_gt-
> > > >tlb_inval,
> > > - &fence[fence_id], start,
> > > end,
> > > - vm->usm.asid, NULL);
> > > - if (err)
> > > - goto wait;
> > > - ++fence_id;
> > > -
> > > - if (!tile->media_gt)
> > > - continue;
> > > -
> > > - xe_tlb_inval_fence_init(&tile->media_gt-
> > > >tlb_inval,
> > > - &fence[fence_id], true);
> > > -
> > > - err = xe_tlb_inval_range(&tile->media_gt-
> > > >tlb_inval,
> > > - &fence[fence_id], start,
> > > end,
> > > - vm->usm.asid, NULL);
> > > - if (err)
> > > - goto wait;
> > > - ++fence_id;
> > > - }
> > > -
> > > -wait:
> > > - for (id = 0; id < fence_id; ++id)
> > > - xe_tlb_inval_fence_wait(&fence[id]);
> > > -
> > > - return err;
> > > -}
> > > -
> > > /**
> > > * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without
> > > a lock
> > > * @vma: VMA to invalidate
> > > @@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
> > > {
> > > struct xe_device *xe = xe_vma_vm(vma)->xe;
> > > struct xe_vm *vm = xe_vma_vm(vma);
> > > + struct xe_tlb_inval_batch _batch;
> >
> > Why not just 'batch'?
> >
> > > struct xe_tile *tile;
> > > u8 tile_mask = 0;
> > > int ret = 0;
> > > @@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma
> > > *vma)
> > >
> > > xe_device_wmb(xe);
> > >
> > > - ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma),
> > > xe_vma_start(vma),
> > > - xe_vma_end(vma),
> > > tile_mask);
> > > + ret = xe_tlb_inval_range_tilemask_submit(xe,
> > > xe_vma_vm(vma)->usm.asid,
> > > +
> > > xe_vma_start(vma), xe_vma_end(vma),
> > > + tile_mask,
> > > &_batch);
> > >
> > > /* WRITE_ONCE pairs with READ_ONCE in
> > > xe_vm_has_valid_gpu_mapping() */
> > > WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
> > >
> > > + if (!ret)
> > > + xe_tlb_inval_batch_wait(&_batch);
> > > +
> >
> > Here we skip the wait on error, hence my suggestion to skip waits in
> > other code paths or at a minimum make call semantics consistent.
>
> Makes sense.
>
> >
> > > return ret;
> > > }
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_vm.h
> > > b/drivers/gpu/drm/xe/xe_vm.h
> > > index f849e369432b..62f4b6fec0bc 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm.h
> > > +++ b/drivers/gpu/drm/xe/xe_vm.h
> > > @@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct
> > > xe_vm *vm,
> > > struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
> > > struct xe_svm_range *range);
> > >
> > > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
> > > - u64 end, u8 tile_mask);
> > > -
> > > int xe_vm_invalidate_vma(struct xe_vma *vma);
> > >
> > > int xe_vm_validate_protected(struct xe_vm *vm);
> > > diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c
> > > b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > > index 95bf53cc29e3..39717026e84f 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm_madvise.c
> > > +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > > @@ -12,6 +12,7 @@
> > > #include "xe_pat.h"
> > > #include "xe_pt.h"
> > > #include "xe_svm.h"
> > > +#include "xe_tlb_inval.h"
> > >
> > > struct xe_vmas_in_madvise_range {
> > > u64 addr;
> > > @@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct
> > > xe_vm *vm, u64 start, u64 end)
> > > static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64
> > > start, u64 end)
> > > {
> > > u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start,
> > > end);
> > > + struct xe_tlb_inval_batch batch;
> > > + int err;
> > >
> > > if (!tile_mask)
> > > return 0;
> > >
> > > xe_device_wmb(vm->xe);
> > >
> > > - return xe_vm_range_tilemask_tlb_inval(vm, start, end,
> > > tile_mask);
> > > + err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm-
> > > >usm.asid, start, end,
> > > + tile_mask,
> > > &batch);
> > > + xe_tlb_inval_batch_wait(&batch);
> >
> > No need to wait on error.
>
> Will fix
>
> Thanks,
> Thomas
>
>
>
> >
> > > +
> > > + return err;
> > > }
> > >
> > > static bool madvise_args_are_sane(struct xe_device *xe, const
> > > struct drm_xe_madvise *args)
> > > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h
> > > b/drivers/gpu/drm/xe/xe_vm_types.h
> > > index 1f6f7e30e751..de6544165cfa 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > > @@ -18,6 +18,7 @@
> > > #include "xe_device_types.h"
> > > #include "xe_pt_types.h"
> > > #include "xe_range_fence.h"
> > > +#include "xe_tlb_inval_types.h"
> > > #include "xe_userptr.h"
> > >
> > > struct drm_pagemap;
> > > --
> > > 2.53.0
> > >
© 2016 - 2026 Red Hat, Inc.