From: Carolina Jubran <cjubran@nvidia.com>
Introduce support for specifying bandwidth proportions between traffic
classes (TC) in the devlink-rate API. This new option allows users to
allocate bandwidth across multiple traffic classes in a single command.
This feature provides a more granular control over traffic management,
especially for scenarios requiring Enhanced Transmission Selection.
Users can now define a specific bandwidth share for each traffic class,
such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE).
Example:
DEV=pci/0000:08:00.0
$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \
tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0
$ devlink port function rate set $DEV/vfs_group \
tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0
Example usage with ynl:
./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
--do rate-set --json '{
"bus-name": "pci",
"dev-name": "0000:08:00.0",
"port-index": 1,
"rate-tc-bws": [
{"rate-tc-index": 0, "rate-tc-bw": 50},
{"rate-tc-index": 1, "rate-tc-bw": 50},
{"rate-tc-index": 2, "rate-tc-bw": 0},
{"rate-tc-index": 3, "rate-tc-bw": 0},
{"rate-tc-index": 4, "rate-tc-bw": 0},
{"rate-tc-index": 5, "rate-tc-bw": 0},
{"rate-tc-index": 6, "rate-tc-bw": 0},
{"rate-tc-index": 7, "rate-tc-bw": 0}
]
}'
./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
--do rate-get --json '{
"bus-name": "pci",
"dev-name": "0000:08:00.0",
"port-index": 1
}'
output for rate-get:
{'bus-name': 'pci',
'dev-name': '0000:08:00.0',
'port-index': 1,
'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0},
{'rate-tc-bw': 50, 'rate-tc-index': 1},
{'rate-tc-bw': 0, 'rate-tc-index': 2},
{'rate-tc-bw': 0, 'rate-tc-index': 3},
{'rate-tc-bw': 0, 'rate-tc-index': 4},
{'rate-tc-bw': 0, 'rate-tc-index': 5},
{'rate-tc-bw': 0, 'rate-tc-index': 6},
{'rate-tc-bw': 0, 'rate-tc-index': 7}],
'rate-tx-max': 0,
'rate-tx-priority': 0,
'rate-tx-share': 0,
'rate-tx-weight': 0,
'rate-type': 'leaf'}
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
Documentation/netlink/specs/devlink.yaml | 36 ++++-
.../networking/devlink/devlink-port.rst | 7 +
include/net/devlink.h | 9 ++
include/uapi/linux/devlink.h | 4 +
net/devlink/netlink_gen.c | 16 ++-
net/devlink/netlink_gen.h | 2 +
net/devlink/rate.c | 127 ++++++++++++++++++
7 files changed, 196 insertions(+), 5 deletions(-)
diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index bd9726269b4f..64b6aaa02047 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -202,6 +202,11 @@ definitions:
name: exception
-
name: control
+ -
+ name: devlink-rate-tc-index-max
+ header: net/devlink.h
+ type: const
+ value: 7
attribute-sets:
-
@@ -820,7 +825,26 @@ attribute-sets:
-
name: region-direct
type: flag
-
+ -
+ name: rate-tc-bws
+ type: nest
+ multi-attr: true
+ nested-attributes: dl-rate-tc-bws
+ -
+ name: rate-tc-index
+ type: u8
+ checks:
+ min: 0
+ max: devlink-rate-tc-index-max
+ -
+ name: rate-tc-bw
+ type: u32
+ doc: |
+ Specifies the bandwidth allocation for the Traffic Class as a
+ percentage.
+ checks:
+ min: 0
+ max: 100
-
name: dl-dev-stats
subset-of: devlink
@@ -1225,6 +1249,14 @@ attribute-sets:
-
name: flash
type: flag
+ -
+ name: dl-rate-tc-bws
+ subset-of: devlink
+ attributes:
+ -
+ name: rate-tc-index
+ -
+ name: rate-tc-bw
operations:
enum-model: directional
@@ -2150,6 +2182,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
+ - rate-tc-bws
-
name: rate-new
@@ -2170,6 +2203,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
+ - rate-tc-bws
-
name: rate-del
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 9d22d41a7cd1..bc3b41ac2d51 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -418,6 +418,13 @@ API allows to configure following rate object's parameters:
to all node children limits. ``tx_max`` is an upper limit for children.
``tx_share`` is a total bandwidth distributed among children.
+``tc_bw``
+ Allow users to set the bandwidth allocation per traffic class on rate
+ objects. This enables fine-grained QoS configurations by assigning specific
+ bandwidth percentages to different traffic classes. When applied to a
+ non-leaf node, tc_bw determines how bandwidth is shared among its child
+ elements.
+
``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case
nodes with the same priority form a WFQ subgroup in the sibling group
and arbitration among them is based on assigned weights.
diff --git a/include/net/devlink.h b/include/net/devlink.h
index b8783126c1ed..1b7fa11b5841 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -20,6 +20,7 @@
#include <uapi/linux/devlink.h>
#include <linux/xarray.h>
#include <linux/firmware.h>
+#include <linux/dcbnl.h>
struct devlink;
struct devlink_linecard;
@@ -99,6 +100,8 @@ struct devlink_port_attrs {
};
};
+#define DEVLINK_RATE_TC_INDEX_MAX (IEEE_8021QAZ_MAX_TCS - 1)
+
struct devlink_rate {
struct list_head list;
enum devlink_rate_type type;
@@ -118,6 +121,8 @@ struct devlink_rate {
u32 tx_priority;
u32 tx_weight;
+
+ u32 tc_bw[IEEE_8021QAZ_MAX_TCS];
};
struct devlink_port {
@@ -1482,6 +1487,8 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv,
+ u32 *tc_bw, struct netlink_ext_ack *extack);
int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@@ -1490,6 +1497,8 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv,
+ u32 *tc_bw, struct netlink_ext_ack *extack);
int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
struct netlink_ext_ack *extack);
int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9401aa343673..b3b538c67c34 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -614,6 +614,10 @@ enum devlink_attr {
DEVLINK_ATTR_REGION_DIRECT, /* flag */
+ DEVLINK_ATTR_RATE_TC_BWS, /* nested */
+ DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */
+ DEVLINK_ATTR_RATE_TC_BW, /* u32 */
+
/* Add new attributes above here, update the spec in
* Documentation/netlink/specs/devlink.yaml and re-generate
* net/devlink/netlink_gen.c.
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index f9786d51f68f..186f31522af0 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -9,6 +9,7 @@
#include "netlink_gen.h"
#include <uapi/linux/devlink.h>
+#include <net/devlink.h>
/* Common nested types */
const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = {
@@ -18,6 +19,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_
[DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
};
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = {
+ [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_RANGE(NLA_U8, 0, DEVLINK_RATE_TC_INDEX_MAX),
+ [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_RANGE(NLA_U32, 0, 100),
+};
+
const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
[DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
};
@@ -496,7 +502,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_
};
/* DEVLINK_CMD_RATE_SET - do */
-static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -505,10 +511,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_NEW - do */
-static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -517,6 +524,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_DEL - do */
@@ -1164,7 +1172,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_set_nl_policy,
- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
@@ -1174,7 +1182,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_new_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_new_nl_policy,
- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
index 8f2bd50ddf5e..e3558cf89be4 100644
--- a/net/devlink/netlink_gen.h
+++ b/net/devlink/netlink_gen.h
@@ -10,9 +10,11 @@
#include <net/genetlink.h>
#include <uapi/linux/devlink.h>
+#include <net/devlink.h>
/* Common nested types */
extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1];
extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
/* Ops table for devlink */
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 8828ffaf6cbc..aff5682aead7 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
return ERR_PTR(-EINVAL);
}
+static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
+{
+ struct nlattr *nla_tc_bw;
+ int i;
+
+ for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+ nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
+ if (!nla_tc_bw)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i]))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla_tc_bw);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nla_tc_bw);
+ return -EMSGSIZE;
+}
+
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid, u32 seq,
@@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
devlink_rate->parent->name))
goto nla_put_failure;
+ if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
+ goto nla_put_failure;
+
genlmsg_end(msg, hdr);
return 0;
@@ -316,6 +342,89 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
return 0;
}
+static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
+ unsigned long *bitmap, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_ATTR_MAX + 1];
+ u8 tc_index;
+
+ nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy,
+ extack);
+ if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_ATTR_RATE_TC_INDEX);
+ return -EINVAL;
+ }
+
+ tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]);
+
+ if (!tb[DEVLINK_ATTR_RATE_TC_BW]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_ATTR_RATE_TC_BW);
+ return -EINVAL;
+ }
+
+ if (test_and_set_bit(tc_index, bitmap)) {
+ NL_SET_ERR_MSG_FMT(extack, "Duplicate traffic class index specified (%u)",
+ tc_index);
+ return -EINVAL;
+ }
+
+ tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]);
+
+ return 0;
+}
+
+static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info)
+{
+ DECLARE_BITMAP(bitmap, IEEE_8021QAZ_MAX_TCS) = {};
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ int rem, err = -EOPNOTSUPP, i, total = 0;
+ u32 tc_bw[IEEE_8021QAZ_MAX_TCS] = {};
+ struct nlattr *attr;
+
+ nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ if (nla_type(attr) == DEVLINK_ATTR_RATE_TC_BWS) {
+ err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+ if (!test_bit(i, bitmap)) {
+ NL_SET_ERR_MSG_FMT(info->extack,
+ "Bandwidth values must be specified for all %u traffic classes",
+ IEEE_8021QAZ_MAX_TCS);
+ return -EINVAL;
+ }
+
+ total += tc_bw[i];
+ }
+
+ if (total && total != 100) {
+ NL_SET_ERR_MSG_FMT(info->extack,
+ "Sum of all traffic class bandwidth values must be 100, got %u",
+ total);
+ return -EINVAL;
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw,
+ info->extack);
+
+ if (err)
+ return err;
+
+ memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
+
+ return 0;
+}
+
static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
const struct devlink_ops *ops,
struct genl_info *info)
@@ -388,6 +497,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
return err;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
+ err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -423,6 +538,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the leafs");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_leaf_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the leafs");
+ return false;
+ }
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
@@ -449,6 +570,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the nodes");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_node_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the nodes");
+ return false;
+ }
} else {
WARN(1, "Unknown type of rate object");
return false;
--
2.31.1
On Tue, 6 May 2025 14:26:39 +0300 Tariq Toukan wrote: > diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml > index bd9726269b4f..64b6aaa02047 100644 > --- a/Documentation/netlink/specs/devlink.yaml > +++ b/Documentation/netlink/specs/devlink.yaml > @@ -202,6 +202,11 @@ definitions: > name: exception > - > name: control > + - > + name: devlink-rate-tc-index-max > + header: net/devlink.h > + type: const > + value: 7 The spec is for uAPI, but you're declaring a value which is only known to the kernel.. What is the thinking here? It can't stay as is, it breaks the build for user space, since there is no user space header under net/devlink.h -- pw-bot: cr
© 2016 - 2025 Red Hat, Inc.