[v10] Support rate management on traffic classes in devlink and mlx5

[PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Tariq Toukan 8 months, 3 weeks ago

From: Carolina Jubran <cjubran@nvidia.com>

Test verifies that netdevsim correctly implements devlink ops callbacks
that set tc-bw on leaf or node rate object.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/netdevsim/dev.c                   | 43 ++++++++++++++++
 drivers/net/netdevsim/netdevsim.h             |  1 +
 .../drivers/net/netdevsim/devlink.sh          | 51 +++++++++++++++++++
 3 files changed, 95 insertions(+)

diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 3e0b61202f0c..b3647691060c 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -388,6 +388,17 @@ static const struct file_operations nsim_dev_rate_parent_fops = {
 	.owner = THIS_MODULE,
 };
 
+static void nsim_dev_tc_bw_debugfs_init(struct dentry *ddir, u32 *tc_bw)
+{
+	int i;
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+		char name[16];
+
+		snprintf(name, sizeof(name), "tc%d_bw", i);
+		debugfs_create_u32(name, 0400, ddir, &tc_bw[i]);
+	}
+}
 static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev,
 				      struct nsim_dev_port *nsim_dev_port)
 {
@@ -415,6 +426,8 @@ static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev,
 								 nsim_dev_port->ddir,
 								 &nsim_dev_port->parent_name,
 								 &nsim_dev_rate_parent_fops);
+		nsim_dev_tc_bw_debugfs_init(nsim_dev_port->ddir,
+					    nsim_dev_port->tc_bw);
 	}
 	debugfs_create_symlink("dev", nsim_dev_port->ddir, dev_link_name);
 
@@ -1172,6 +1185,19 @@ static int nsim_rate_bytes_to_units(char *name, u64 *rate, struct netlink_ext_ac
 	return 0;
 }
 
+static int nsim_leaf_tc_bw_set(struct devlink_rate *devlink_rate,
+			       void *priv, u32 *tc_bw,
+			       struct netlink_ext_ack *extack)
+{
+	struct nsim_dev_port *nsim_dev_port = priv;
+	int i;
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+		nsim_dev_port->tc_bw[i] = tc_bw[i];
+
+	return 0;
+}
+
 static int nsim_leaf_tx_share_set(struct devlink_rate *devlink_rate, void *priv,
 				  u64 tx_share, struct netlink_ext_ack *extack)
 {
@@ -1210,8 +1236,21 @@ struct nsim_rate_node {
 	char *parent_name;
 	u16 tx_share;
 	u16 tx_max;
+	u32 tc_bw[DEVLINK_RATE_TCS_MAX];
 };
 
+static int nsim_node_tc_bw_set(struct devlink_rate *devlink_rate, void *priv,
+			       u32 *tc_bw, struct netlink_ext_ack *extack)
+{
+	struct nsim_rate_node *nsim_node = priv;
+	int i;
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+		nsim_node->tc_bw[i] = tc_bw[i];
+
+	return 0;
+}
+
 static int nsim_node_tx_share_set(struct devlink_rate *devlink_rate, void *priv,
 				  u64 tx_share, struct netlink_ext_ack *extack)
 {
@@ -1264,6 +1303,8 @@ static int nsim_rate_node_new(struct devlink_rate *node, void **priv,
 						     &nsim_node->parent_name,
 						     &nsim_dev_rate_parent_fops);
 
+	nsim_dev_tc_bw_debugfs_init(nsim_node->ddir, nsim_node->tc_bw);
+
 	*priv = nsim_node;
 	return 0;
 }
@@ -1340,8 +1381,10 @@ static const struct devlink_ops nsim_dev_devlink_ops = {
 	.trap_policer_counter_get = nsim_dev_devlink_trap_policer_counter_get,
 	.rate_leaf_tx_share_set = nsim_leaf_tx_share_set,
 	.rate_leaf_tx_max_set = nsim_leaf_tx_max_set,
+	.rate_leaf_tc_bw_set = nsim_leaf_tc_bw_set,
 	.rate_node_tx_share_set = nsim_node_tx_share_set,
 	.rate_node_tx_max_set = nsim_node_tx_max_set,
+	.rate_node_tc_bw_set = nsim_node_tc_bw_set,
 	.rate_node_new = nsim_rate_node_new,
 	.rate_node_del = nsim_rate_node_del,
 	.rate_leaf_parent_set = nsim_rate_leaf_parent_set,
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index d04401f0bdf7..4ec6735e5af4 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -282,6 +282,7 @@ struct nsim_dev_port {
 	struct dentry *ddir;
 	struct dentry *rate_parent;
 	char *parent_name;
+	u32 tc_bw[DEVLINK_RATE_TCS_MAX];
 	struct netdevsim *ns;
 };
 
diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index b5ea2526f23c..09068d69d2cc 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -608,6 +608,44 @@ rate_attr_parent_check()
 	check_err $? "Unexpected parent attr value $api_value != $parent"
 }
 
+rate_attr_tc_bw_check()
+{
+	local handle=$1
+	local tc_bw=$2
+	local debug_file=$3
+
+	local tc_bw_str=""
+	for bw in $tc_bw; do
+		local tc=${bw%%:*}
+		local value=${bw##*:}
+		tc_bw_str="$tc_bw_str $tc:$value"
+	done
+	tc_bw_str=${tc_bw_str# }
+
+	rate_attr_set $handle tc-bw "$tc_bw_str"
+	check_err $? "Failed to set tc-bw values"
+
+	for bw in $tc_bw; do
+		local tc=${bw%%:*}
+		local value=${bw##*:}
+		local debug_value=$(cat $debug_file/tc${tc}_bw)
+		check_err $? "Failed to read tc-bw value from debugfs for tc$tc"
+		[ "$debug_value" == "$value" ]
+		check_err $? "Unexpected tc-bw debug value for tc$tc: $debug_value != $value"
+	done
+
+	for bw in $tc_bw; do
+		local tc=${bw%%:*}
+		local expected_value=${bw##*:}
+		local api_value=$(rate_attr_get $handle tc_$tc)
+		if [ "$api_value" = "null" ]; then
+			api_value=0
+		fi
+		[ "$api_value" == "$expected_value" ]
+		check_err $? "Unexpected tc-bw value for tc$tc: $api_value != $expected_value"
+	done
+}
+
 rate_node_add()
 {
 	local handle=$1
@@ -649,6 +687,13 @@ rate_test()
 		rate=$(($rate+100))
 	done
 
+	local tc_bw="0:0 1:40 2:0 3:0 4:0 5:0 6:60 7:0"
+	for r_obj in $leafs
+	do
+		rate_attr_tc_bw_check $r_obj "$tc_bw" \
+			$DEBUGFS_DIR/ports/${r_obj##*/}
+	done
+
 	local node1_name='group1'
 	local node1="$DL_HANDLE/$node1_name"
 	rate_node_add "$node1"
@@ -666,6 +711,12 @@ rate_test()
 	rate_attr_tx_rate_check $node1 tx_max $node_tx_max \
 		$DEBUGFS_DIR/rate_nodes/${node1##*/}/tx_max
 
+
+	local tc_bw="0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0"
+	rate_attr_tc_bw_check $node1 "$tc_bw" \
+		$DEBUGFS_DIR/rate_nodes/${node1##*/}
+
+
 	rate_node_del "$node1"
 	check_err $? "Failed to delete node $node1"
 	local num_nodes=`rate_nodes_get $DL_HANDLE | wc -w`
-- 
2.31.1

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Jakub Kicinski 8 months, 3 weeks ago

On Tue, 20 May 2025 21:38:03 +0300 Tariq Toukan wrote:
> Test verifies that netdevsim correctly implements devlink ops callbacks
> that set tc-bw on leaf or node rate object.

Please add a test that can actually validate a NIC HW.
The test probably needs to be in Python to use a remote endpoint,
and should live under tools/testing/../drivers/net/hw

We had a long conversation about what we expect from the API 
vs how your HW works. One of the test cases should confirm
the expected behavior, IOW fail on mlx5. Which is fine,
unlikely that any NIC will have 100% compliance. But at
least we will be documenting the expectations.

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Carolina Jubran 8 months, 3 weeks ago


On 21/05/2025 1:59, Jakub Kicinski wrote:
> On Tue, 20 May 2025 21:38:03 +0300 Tariq Toukan wrote:
>> Test verifies that netdevsim correctly implements devlink ops callbacks
>> that set tc-bw on leaf or node rate object.
> 
> Please add a test that can actually validate a NIC HW.
> The test probably needs to be in Python to use a remote endpoint,
> and should live under tools/testing/../drivers/net/hw
> 
> We had a long conversation about what we expect from the API
> vs how your HW works. One of the test cases should confirm
> the expected behavior, IOW fail on mlx5. Which is fine,
> unlikely that any NIC will have 100% compliance. But at
> least we will be documenting the expectations.

Working on this. Let me know if the netdevsim selftest should stay 
alongside the hardware tests, or be removed.

Thanks!

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Jakub Kicinski 8 months, 3 weeks ago

On Thu, 22 May 2025 00:05:32 +0300 Carolina Jubran wrote:
> On 21/05/2025 1:59, Jakub Kicinski wrote:
> > On Tue, 20 May 2025 21:38:03 +0300 Tariq Toukan wrote:  
> >> Test verifies that netdevsim correctly implements devlink ops callbacks
> >> that set tc-bw on leaf or node rate object.  
> > 
> > Please add a test that can actually validate a NIC HW.
> > The test probably needs to be in Python to use a remote endpoint,
> > and should live under tools/testing/../drivers/net/hw
> > 
> > We had a long conversation about what we expect from the API
> > vs how your HW works. One of the test cases should confirm
> > the expected behavior, IOW fail on mlx5. Which is fine,
> > unlikely that any NIC will have 100% compliance. But at
> > least we will be documenting the expectations.  
> 
> Working on this. Let me know if the netdevsim selftest should stay 
> alongside the hardware tests, or be removed.

I think it's nice to have. But please share a link to the patches which
add the support in the CLI next time. Can be a lore link if posted, or a
GH repo. 

I need to pull the pending patches into the CI so that the test can run.

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Tariq Toukan 8 months, 3 weeks ago


On 21/05/2025 1:59, Jakub Kicinski wrote:
> On Tue, 20 May 2025 21:38:03 +0300 Tariq Toukan wrote:
>> Test verifies that netdevsim correctly implements devlink ops callbacks
>> that set tc-bw on leaf or node rate object.
> 
> Please add a test that can actually validate a NIC HW.
> The test probably needs to be in Python to use a remote endpoint,
> and should live under tools/testing/../drivers/net/hw
> 
> We had a long conversation about what we expect from the API
> vs how your HW works. One of the test cases should confirm
> the expected behavior, IOW fail on mlx5. Which is fine,
> unlikely that any NIC will have 100% compliance. But at
> least we will be documenting the expectations.
> 

No problem with that, we'll add.

We could've saved this extra cycle if my questions [1] exactly about 
this topic weren't ignored.
Area is vague and not well defined. We can continue with the iterative 
guess and fix cycles, or alternatively get it clearly and formally defined.

[1] 
https://lore.kernel.org/all/98386cab-11c0-4f74-9925-8230af2e65c8@gmail.com/

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Jakub Kicinski 8 months, 3 weeks ago

On Wed, 21 May 2025 10:05:13 +0300 Tariq Toukan wrote:
> On 21/05/2025 1:59, Jakub Kicinski wrote:
> > On Tue, 20 May 2025 21:38:03 +0300 Tariq Toukan wrote:  
> >> Test verifies that netdevsim correctly implements devlink ops callbacks
> >> that set tc-bw on leaf or node rate object.  
> > 
> > Please add a test that can actually validate a NIC HW.
> > The test probably needs to be in Python to use a remote endpoint,
> > and should live under tools/testing/../drivers/net/hw
> > 
> > We had a long conversation about what we expect from the API
> > vs how your HW works. One of the test cases should confirm
> > the expected behavior, IOW fail on mlx5. Which is fine,
> > unlikely that any NIC will have 100% compliance. But at
> > least we will be documenting the expectations.
> 
> No problem with that, we'll add.
> 
> We could've saved this extra cycle if my questions [1] exactly about 
> this topic weren't ignored.
> Area is vague and not well defined. We can continue with the iterative 
> guess and fix cycles, or alternatively get it clearly and formally defined.

I started a couple of times on answering but my hands go a little limb
when I have to explain things so obvious like "testing is a crucial part
of software development" :S  I mean.. nvidia certainly tests their code,
so I'm not sure where the disconnect is. I had a short conversation with
Gal at some conference where he, AFAIU, was doubting that device testing
can be part of an open source project.

It certainly is not advantageous to companies to have to share their
test code. So when you ask me for details on the rules what I hear is
"how can we make sure we do as little as possible".

Broadly, any new uAPI should come with tests which exercise the
functionality. We started a decade or so ago with netdevsim tests
which just validate the API layer itself. That did not provide
sufficient validation of the real implementations, crucially it did 
not check whether shallow APIs (devlink) actually behave the same
when implemented by multiple vendors. So two years ago we built 
the Python harness to be able to write tests for NIC functionality.
That is the level of testing we expect now. Obviously there will always
be exceptions. For instance I was pushing for common tests for the time
sync code (DPLL etc.) but I was convinced by the experts that it's hard
and that they generally test with $x0,000 measurement equipment.
So fair, I guess that's too hard. But for BW shaping tests? 
IIRC mlxsw has qdisc offload tests for BW shaping upstream.

Re: [PATCH net-next V10 2/6] selftest: netdevsim: Add devlink rate tc-bw test

Posted by Gal Pressman 8 months, 2 weeks ago

On 21/05/2025 17:10, Jakub Kicinski wrote:
>> We could've saved this extra cycle if my questions [1] exactly about 
>> this topic weren't ignored.
>> Area is vague and not well defined. We can continue with the iterative 
>> guess and fix cycles, or alternatively get it clearly and formally defined.
> 
> I started a couple of times on answering but my hands go a little limb
> when I have to explain things so obvious like "testing is a crucial part
> of software development" :S

You're acting as if kernel testing is obvious, and the only way to test
the kernel is through selftests. The kernel and networking subsystem are
tested regardless of selftests, the fact that you don't see the tests
does not mean it doesn't happen.

We are not new contributors, plenty of uapi changes have been merged
without selftests, it's natural for us to try and understand this new
requirement.

> I mean.. nvidia certainly tests their code, so I'm not sure where the disconnect is. 

Absolutely!
Our testing and coverage are far more complex and valuable than what the
existing selftests provide.

We have testing infrastructure that predates these recent selftests
addition by years, and we don't plan on switching them to something
inferior.

Writing selftests will not replace our internal tests, we're wasting
extra cycles adapting our tests to yet another project, just for the
sake of having them in the kernel tree?

> I had a short conversation with
> Gal at some conference where he, AFAIU, was doubting that device testing
> can be part of an open source project.

What's your point?
Did that prevent me from being a top 10 netdev selftests contributor in
v6.15?

I don't care if tests are open source or not, I care about having good
tests that run often and report bugs and regressions.

> 
> It certainly is not advantageous to companies to have to share their
> test code. So when you ask me for details on the rules what I hear is
> "how can we make sure we do as little as possible".

No, I'm asking for predictability, we've been circling this point for
quite some time.

We shouldn't have to wait until v9 to guess whether a certain submission
requires the addition of a test.
We shouldn't submit a bug fix, to find out that it's blocked due to lack
of a test.

As an example, we have well documented coding style, so:
- You don't have to waste your time commenting on things that could have
been handled before submission.
- Rules are clear, comments don't rely on personal taste or mood.
- Developers learn, number of review iterations are reduced.

What's the difference with documenting requirements for tests? We can't
read your mind.

> 
> Broadly, any new uAPI should come with tests which exercise the
> functionality.

I'm fine with this definition (though I think it's too vague), can I add
it to maintainer-netdev.rst?