From: Vlad Dogaru <vdogaru@nvidia.com>
Periodically check for unused action STE tables and free their
associated resources. In order to do this safely, add a per-queue lock
to synchronize the garbage collect work with regular operations on
steering rules.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../mlx5/core/steering/hws/action_ste_pool.c | 88 ++++++++++++++++++-
.../mlx5/core/steering/hws/action_ste_pool.h | 11 +++
.../mellanox/mlx5/core/steering/hws/context.h | 1 +
3 files changed, 96 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
index e5c453ec65b3..509e19ec101b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
@@ -160,6 +160,7 @@ hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem)
action_tbl->parent_elem = parent_elem;
INIT_LIST_HEAD(&action_tbl->list_node);
+ action_tbl->last_used = jiffies;
list_add(&action_tbl->list_node, &parent_elem->available);
parent_elem->log_sz = log_sz;
@@ -237,6 +238,8 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx,
enum mlx5hws_pool_optimize opt;
int err;
+ mutex_init(&pool->lock);
+
/* Rules which are added for both RX and TX must use the same action STE
* indices for both. If we were to use a single table, then RX-only and
* TX-only rules would waste the unused entries. Thus, we use separate
@@ -248,6 +251,7 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx,
opt);
if (err)
goto destroy_elems;
+ pool->elems[opt].parent_pool = pool;
}
return 0;
@@ -268,6 +272,58 @@ static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool)
hws_action_ste_pool_element_destroy(&pool->elems[opt]);
}
+static void hws_action_ste_pool_element_collect_stale(
+ struct mlx5hws_action_ste_pool_element *elem, struct list_head *cleanup)
+{
+ struct mlx5hws_action_ste_table *action_tbl, *p;
+ unsigned long expire_time, now;
+
+ expire_time = secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS);
+ now = jiffies;
+
+ list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) {
+ if (mlx5hws_pool_full(action_tbl->pool) &&
+ time_before(action_tbl->last_used + expire_time, now))
+ list_move(&action_tbl->list_node, cleanup);
+ }
+}
+
+static void hws_action_ste_table_cleanup_list(struct list_head *cleanup)
+{
+ struct mlx5hws_action_ste_table *action_tbl, *p;
+
+ list_for_each_entry_safe(action_tbl, p, cleanup, list_node)
+ hws_action_ste_table_destroy(action_tbl);
+}
+
+static void hws_action_ste_pool_cleanup(struct work_struct *work)
+{
+ enum mlx5hws_pool_optimize opt;
+ struct mlx5hws_context *ctx;
+ LIST_HEAD(cleanup);
+ int i;
+
+ ctx = container_of(work, struct mlx5hws_context,
+ action_ste_cleanup.work);
+
+ for (i = 0; i < ctx->queues; i++) {
+ struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i];
+
+ mutex_lock(&p->lock);
+ for (opt = MLX5HWS_POOL_OPTIMIZE_NONE;
+ opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++)
+ hws_action_ste_pool_element_collect_stale(
+ &p->elems[opt], &cleanup);
+ mutex_unlock(&p->lock);
+ }
+
+ hws_action_ste_table_cleanup_list(&cleanup);
+
+ schedule_delayed_work(&ctx->action_ste_cleanup,
+ secs_to_jiffies(
+ MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS));
+}
+
int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx)
{
struct mlx5hws_action_ste_pool *pool;
@@ -286,6 +342,12 @@ int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx)
ctx->action_ste_pool = pool;
+ INIT_DELAYED_WORK(&ctx->action_ste_cleanup,
+ hws_action_ste_pool_cleanup);
+ schedule_delayed_work(
+ &ctx->action_ste_cleanup,
+ secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS));
+
return 0;
free_pool:
@@ -301,6 +363,8 @@ void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx)
size_t queues = ctx->queues;
int i;
+ cancel_delayed_work_sync(&ctx->action_ste_cleanup);
+
for (i = 0; i < queues; i++)
hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]);
@@ -331,6 +395,7 @@ hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl,
return err;
chunk->action_tbl = action_tbl;
+ action_tbl->last_used = jiffies;
return 0;
}
@@ -347,6 +412,8 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
if (skip_rx && skip_tx)
return -EINVAL;
+ mutex_lock(&pool->lock);
+
elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx);
mlx5hws_dbg(elem->ctx,
@@ -363,26 +430,39 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
if (!found) {
action_tbl = hws_action_ste_table_alloc(elem);
- if (IS_ERR(action_tbl))
- return PTR_ERR(action_tbl);
+ if (IS_ERR(action_tbl)) {
+ err = PTR_ERR(action_tbl);
+ goto out;
+ }
err = hws_action_ste_table_chunk_alloc(action_tbl, chunk);
if (err)
- return err;
+ goto out;
}
if (mlx5hws_pool_empty(action_tbl->pool))
list_move(&action_tbl->list_node, &elem->full);
- return 0;
+ err = 0;
+
+out:
+ mutex_unlock(&pool->lock);
+
+ return err;
}
void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk)
{
+ struct mutex *lock = &chunk->action_tbl->parent_elem->parent_pool->lock;
+
mlx5hws_dbg(chunk->action_tbl->pool->ctx,
"Freeing action STEs offset %d order %d\n",
chunk->ste.offset, chunk->ste.order);
+
+ mutex_lock(lock);
mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste);
+ chunk->action_tbl->last_used = jiffies;
list_move(&chunk->action_tbl->list_node,
&chunk->action_tbl->parent_elem->available);
+ mutex_unlock(lock);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
index 2de660a63223..a8ba97359e31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
@@ -8,6 +8,9 @@
#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1
#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20
+#define MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS 300
+#define MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS 300
+
struct mlx5hws_action_ste_pool_element;
struct mlx5hws_action_ste_table {
@@ -19,10 +22,12 @@ struct mlx5hws_action_ste_table {
u32 rtc_0_id;
u32 rtc_1_id;
struct list_head list_node;
+ unsigned long last_used;
};
struct mlx5hws_action_ste_pool_element {
struct mlx5hws_context *ctx;
+ struct mlx5hws_action_ste_pool *parent_pool;
size_t log_sz; /* Size of the largest table so far. */
enum mlx5hws_pool_optimize opt;
struct list_head available;
@@ -33,6 +38,12 @@ struct mlx5hws_action_ste_pool_element {
* per queue.
*/
struct mlx5hws_action_ste_pool {
+ /* Protects the entire pool. We have one pool per queue and only one
+ * operation can be active per rule at a given time. Thus this lock
+ * protects solely against concurrent garbage collection and we expect
+ * very little contention.
+ */
+ struct mutex lock;
struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX];
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
index e987e93bbc6e..3f8938c73dc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
@@ -40,6 +40,7 @@ struct mlx5hws_context {
u32 pd_num;
struct mlx5hws_pool *stc_pool;
struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */
+ struct delayed_work action_ste_cleanup;
struct mlx5hws_context_common_res common_res;
struct mlx5hws_pattern_cache *pattern_cache;
struct mlx5hws_definer_cache *definer_cache;
--
2.31.1
On Tue, Apr 08, 2025 at 05:00:55PM +0300, Tariq Toukan wrote:
> From: Vlad Dogaru <vdogaru@nvidia.com>
>
> Periodically check for unused action STE tables and free their
> associated resources. In order to do this safely, add a per-queue lock
> to synchronize the garbage collect work with regular operations on
> steering rules.
>
> Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
> Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
> Reviewed-by: Mark Bloch <mbloch@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> .../mlx5/core/steering/hws/action_ste_pool.c | 88 ++++++++++++++++++-
> .../mlx5/core/steering/hws/action_ste_pool.h | 11 +++
> .../mellanox/mlx5/core/steering/hws/context.h | 1 +
> 3 files changed, 96 insertions(+), 4 deletions(-)
[...]
> +
> +static void hws_action_ste_pool_cleanup(struct work_struct *work)
> +{
> + enum mlx5hws_pool_optimize opt;
> + struct mlx5hws_context *ctx;
> + LIST_HEAD(cleanup);
> + int i;
> +
> + ctx = container_of(work, struct mlx5hws_context,
> + action_ste_cleanup.work);
> +
> + for (i = 0; i < ctx->queues; i++) {
> + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i];
> +
> + mutex_lock(&p->lock);
> + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE;
> + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++)
> + hws_action_ste_pool_element_collect_stale(
> + &p->elems[opt], &cleanup);
> + mutex_unlock(&p->lock);
> + }
As I understand, in the loop above all unused items are being collected
to remove them at the end of the function, using `hws_action_ste_table_cleanup_list()`.
I noticed that only the collecting of elements is protected with the mutex.
So I have a question: is it possible that in a very short period of time
(between `mutex_unlock()` and `hws_action_ste_table_cleanup_list()` calls),
the cleanup list can somehow be invalidated (by changing the STE state
in another thread)?
> +
> + hws_action_ste_table_cleanup_list(&cleanup);
> +
> + schedule_delayed_work(&ctx->action_ste_cleanup,
> + secs_to_jiffies(
> + MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS));
> +}
> +
[...]
Thanks,
Michal
On Thu, Apr 10, 2025 at 07:28:18PM +0200, Michal Kubiak wrote:
> On Tue, Apr 08, 2025 at 05:00:55PM +0300, Tariq Toukan wrote:
> > From: Vlad Dogaru <vdogaru@nvidia.com>
> >
> > Periodically check for unused action STE tables and free their
> > associated resources. In order to do this safely, add a per-queue lock
> > to synchronize the garbage collect work with regular operations on
> > steering rules.
> >
> > Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
> > Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
> > Reviewed-by: Mark Bloch <mbloch@nvidia.com>
> > Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> > ---
> > .../mlx5/core/steering/hws/action_ste_pool.c | 88 ++++++++++++++++++-
> > .../mlx5/core/steering/hws/action_ste_pool.h | 11 +++
> > .../mellanox/mlx5/core/steering/hws/context.h | 1 +
> > 3 files changed, 96 insertions(+), 4 deletions(-)
>
> [...]
>
> > +
> > +static void hws_action_ste_pool_cleanup(struct work_struct *work)
> > +{
> > + enum mlx5hws_pool_optimize opt;
> > + struct mlx5hws_context *ctx;
> > + LIST_HEAD(cleanup);
> > + int i;
> > +
> > + ctx = container_of(work, struct mlx5hws_context,
> > + action_ste_cleanup.work);
> > +
> > + for (i = 0; i < ctx->queues; i++) {
> > + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i];
> > +
> > + mutex_lock(&p->lock);
> > + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE;
> > + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++)
> > + hws_action_ste_pool_element_collect_stale(
> > + &p->elems[opt], &cleanup);
> > + mutex_unlock(&p->lock);
> > + }
>
> As I understand, in the loop above all unused items are being collected
> to remove them at the end of the function, using `hws_action_ste_table_cleanup_list()`.
>
> I noticed that only the collecting of elements is protected with the mutex.
> So I have a question: is it possible that in a very short period of time
> (between `mutex_unlock()` and `hws_action_ste_table_cleanup_list()` calls),
> the cleanup list can somehow be invalidated (by changing the STE state
> in another thread)?
An action_ste_table is either:
(a) in an action_ste_pool (indirectly, via a pool element); or
(b) in a cleanup list.
In situation (a), both the table's last_used timestamp and its offsets
are protected by the parent pool's mutex. The table can only be accessed
through its parent pool.
In situation (b), the table can only be accessed by its parent list, and
the only thing we do with it is free it.
There is only one transition, from state (a) to state (b), never the
other way around. This transition is done under the parent pool's mutex.
We only move tables to the cleanup list when all of their elements are
available, so there is no risk of a `chunk_free` call accessing a table
that's on a cleanup list: there are no chunks left to free.
I think this is airtight, but I'm happy to explore possible scenarios if
you have any in mind.
Thank you for the detailed review,
Vlad
On Thu, Apr 10, 2025 at 08:20:25PM +0200, Vlad Dogaru wrote:
> On Thu, Apr 10, 2025 at 07:28:18PM +0200, Michal Kubiak wrote:
> > On Tue, Apr 08, 2025 at 05:00:55PM +0300, Tariq Toukan wrote:
> > > From: Vlad Dogaru <vdogaru@nvidia.com>
> > >
> > > Periodically check for unused action STE tables and free their
> > > associated resources. In order to do this safely, add a per-queue lock
> > > to synchronize the garbage collect work with regular operations on
> > > steering rules.
> > >
> > > Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
> > > Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
> > > Reviewed-by: Mark Bloch <mbloch@nvidia.com>
> > > Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> > > ---
> > > .../mlx5/core/steering/hws/action_ste_pool.c | 88 ++++++++++++++++++-
> > > .../mlx5/core/steering/hws/action_ste_pool.h | 11 +++
> > > .../mellanox/mlx5/core/steering/hws/context.h | 1 +
> > > 3 files changed, 96 insertions(+), 4 deletions(-)
> >
> > [...]
> >
> > > +
> > > +static void hws_action_ste_pool_cleanup(struct work_struct *work)
> > > +{
> > > + enum mlx5hws_pool_optimize opt;
> > > + struct mlx5hws_context *ctx;
> > > + LIST_HEAD(cleanup);
> > > + int i;
> > > +
> > > + ctx = container_of(work, struct mlx5hws_context,
> > > + action_ste_cleanup.work);
> > > +
> > > + for (i = 0; i < ctx->queues; i++) {
> > > + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i];
> > > +
> > > + mutex_lock(&p->lock);
> > > + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE;
> > > + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++)
> > > + hws_action_ste_pool_element_collect_stale(
> > > + &p->elems[opt], &cleanup);
> > > + mutex_unlock(&p->lock);
> > > + }
> >
> > As I understand, in the loop above all unused items are being collected
> > to remove them at the end of the function, using `hws_action_ste_table_cleanup_list()`.
> >
> > I noticed that only the collecting of elements is protected with the mutex.
> > So I have a question: is it possible that in a very short period of time
> > (between `mutex_unlock()` and `hws_action_ste_table_cleanup_list()` calls),
> > the cleanup list can somehow be invalidated (by changing the STE state
> > in another thread)?
>
> An action_ste_table is either:
> (a) in an action_ste_pool (indirectly, via a pool element); or
> (b) in a cleanup list.
>
> In situation (a), both the table's last_used timestamp and its offsets
> are protected by the parent pool's mutex. The table can only be accessed
> through its parent pool.
>
> In situation (b), the table can only be accessed by its parent list, and
> the only thing we do with it is free it.
>
> There is only one transition, from state (a) to state (b), never the
> other way around. This transition is done under the parent pool's mutex.
>
> We only move tables to the cleanup list when all of their elements are
> available, so there is no risk of a `chunk_free` call accessing a table
> that's on a cleanup list: there are no chunks left to free.
>
> I think this is airtight, but I'm happy to explore possible scenarios if
> you have any in mind.
>
> Thank you for the detailed review,
> Vlad
Hi Vlad,
Thanks for your detailed explanation!
I was under the mistaken assumption that the `cleanup` list only had
some sort of reference to actual data. Now I see that the function
`hws_action_ste_pool_element_collect_stale` moves the real STE data
from one list to another, so only that function call should be protected
by the mutex.
Perhaps I should have inspected the implementation of
`hws_action_ste_pool_element_collect_stale()` more closely. :-)
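(For anyone else skimming the thread: list_move() both unlinks the entry
from the list it currently sits on and links it onto the new head, so after
the locked collect step the pool keeps no reference to the table. A trivial
sketch with hypothetical names:)

#include <linux/list.h>

struct demo_entry {
	struct list_head node;
};

static void demo_detach(struct demo_entry *e, struct list_head *cleanup)
{
	/* Removes 'e' from whatever list it is on and adds it to 'cleanup';
	 * the source list keeps no reference to it afterwards.
	 */
	list_move(&e->node, cleanup);
}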
Thanks,
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
PS. I saw your V2 - will take a look.