From: Cosmin Ratiu <cratiu@nvidia.com>
This commit makes use of the building blocks previously added to
implement cross-device rate nodes.
A new 'supported_cross_device_rate_nodes' bool is added to devlink_ops
which lets drivers advertise support for cross-device rate objects.
If enabled and if there is a common shared devlink instance, then:
- all rate objects will be stored in the top-most common nested instance
and
- rate objects can have parents from other devices sharing the same
common instance.
The parent devlink from info->ctx is not locked, so none of its mutable
fields can be used. But parent setting only requires comparing devlink
pointer comparisons. Additionally, since the shared devlink is locked,
other rate operations cannot concurrently happen.
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../networking/devlink/devlink-port.rst | 2 +
include/net/devlink.h | 5 +
net/devlink/rate.c | 92 +++++++++++++++++--
3 files changed, 90 insertions(+), 9 deletions(-)
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 5e397798a402..976bc5ca0962 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -417,6 +417,8 @@ API allows to configure following rate object's parameters:
Parent node name. Parent node rate limits are considered as additional limits
to all node children limits. ``tx_max`` is an upper limit for children.
``tx_share`` is a total bandwidth distributed among children.
+ If the device supports cross-function scheduling, the parent can be from a
+ different function of the same underlying device.
``tc_bw``
Allow users to set the bandwidth allocation per traffic class on rate
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 3038af6ec017..8d5ad5d4f1d0 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1585,6 +1585,11 @@ struct devlink_ops {
struct devlink_rate *parent,
void *priv_child, void *priv_parent,
struct netlink_ext_ack *extack);
+ /* Indicates if cross-device rate nodes are supported.
+ * This also requires a shared common ancestor object all devices that
+ * could share rate nodes are nested in.
+ */
+ bool supported_cross_device_rate_nodes;
/**
* selftests_check() - queries if selftest is supported
* @devlink: devlink instance
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 1949746fab29..f243cccc95be 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -30,19 +30,53 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
return devlink_rate ?: ERR_PTR(-ENODEV);
}
+/* Repeatedly locks the nested-in devlink instances while cross device rate
+ * nodes are supported. Returns the devlink instance where rates should be
+ * stored.
+ */
static struct devlink *devl_rate_lock(struct devlink *devlink)
{
- return devlink;
+ struct devlink *rate_devlink = devlink;
+
+ while (rate_devlink->ops &&
+ rate_devlink->ops->supported_cross_device_rate_nodes) {
+ devlink = devlink_nested_in_get_lock(rate_devlink->rel);
+ if (!devlink)
+ break;
+ rate_devlink = devlink;
+ }
+ return rate_devlink;
}
+/* Variant of the above for when the nested-in devlink instances are already
+ * locked.
+ */
static struct devlink *
devl_get_rate_node_instance_locked(struct devlink *devlink)
{
- return devlink;
+ struct devlink *rate_devlink = devlink;
+
+ while (rate_devlink->ops &&
+ rate_devlink->ops->supported_cross_device_rate_nodes) {
+ devlink = devlink_nested_in_get_locked(rate_devlink->rel);
+ if (!devlink)
+ break;
+ rate_devlink = devlink;
+ }
+ return rate_devlink;
}
+/* Repeatedly unlocks the nested-in devlink instances of 'devlink' while cross
+ * device nodes are supported.
+ */
static void devl_rate_unlock(struct devlink *devlink)
{
+ if (!devlink || !devlink->ops ||
+ !devlink->ops->supported_cross_device_rate_nodes)
+ return;
+
+ devl_rate_unlock(devlink_nested_in_get_locked(devlink->rel));
+ devlink_nested_in_put_unlock(devlink->rel);
}
static struct devlink_rate *
@@ -120,6 +154,24 @@ static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
return -EMSGSIZE;
}
+static int devlink_nl_rate_parent_fill(struct sk_buff *msg,
+ struct devlink_rate *devlink_rate)
+{
+ struct devlink_rate *parent = devlink_rate->parent;
+ struct devlink *devlink = parent->devlink;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
+ parent->name))
+ return -EMSGSIZE;
+
+ if (devlink != devlink_rate->devlink &&
+ devlink_nl_put_nested_handle(msg, devlink_net(devlink), devlink,
+ DEVLINK_ATTR_PARENT_DEV))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid, u32 seq,
@@ -164,10 +216,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
devlink_rate->tx_weight))
goto nla_put_failure;
- if (devlink_rate->parent)
- if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
- devlink_rate->parent->name))
- goto nla_put_failure;
+ if (devlink_rate->parent &&
+ devlink_nl_rate_parent_fill(msg, devlink_rate))
+ goto nla_put_failure;
if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
goto nla_put_failure;
@@ -320,13 +371,14 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
struct genl_info *info,
struct nlattr *nla_parent)
{
- struct devlink *devlink = devlink_rate->devlink;
+ struct devlink *devlink = devlink_rate->devlink, *parent_devlink;
const char *parent_name = nla_data(nla_parent);
const struct devlink_ops *ops = devlink->ops;
size_t len = strlen(parent_name);
struct devlink_rate *parent;
int err = -EOPNOTSUPP;
+ parent_devlink = devlink_nl_ctx(info)->parent_devlink ? : devlink;
parent = devlink_rate->parent;
if (parent && !len) {
@@ -344,7 +396,13 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
refcount_dec(&parent->refcnt);
devlink_rate->parent = NULL;
} else if (len) {
- parent = devlink_rate_node_get_by_name(devlink, parent_name);
+ /* parent_devlink (when different than devlink) isn't locked,
+ * but the rate node devlink instance is, so nobody from the
+ * same group of devices sharing rates could change the used
+ * fields or unregister the parent.
+ */
+ parent = devlink_rate_node_get_by_name(parent_devlink,
+ parent_name);
if (IS_ERR(parent))
return -ENODEV;
@@ -625,7 +683,8 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct devlink *devlink = devlink_nl_ctx(info)->devlink;
+ struct devlink_nl_ctx *ctx = devlink_nl_ctx(info);
+ struct devlink *devlink = ctx->devlink;
struct devlink_rate *devlink_rate;
const struct devlink_ops *ops;
int err;
@@ -644,6 +703,14 @@ int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
+ if (ctx->parent_devlink && ctx->parent_devlink != devlink &&
+ !ops->supported_cross_device_rate_nodes) {
+ NL_SET_ERR_MSG(info->extack,
+ "Cross-device rate parents aren't supported");
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
err = devlink_nl_rate_set(devlink_rate, ops, info);
if (!err)
@@ -671,6 +738,13 @@ int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info)
if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
return -EOPNOTSUPP;
+ if (ctx->parent_devlink && ctx->parent_devlink != devlink &&
+ !ops->supported_cross_device_rate_nodes) {
+ NL_SET_ERR_MSG(info->extack,
+ "Cross-device rate parents aren't supported");
+ return -EOPNOTSUPP;
+ }
+
rate_devlink = devl_rate_lock(devlink);
rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
if (!IS_ERR(rate_node)) {
--
2.44.0
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
devlink: Allow rate node parents from other devlinks
This commit makes use of the building blocks previously added to
implement cross-device rate nodes.
> diff --git a/include/net/devlink.h b/include/net/devlink.h
> index 3038af6ec017e..8d5ad5d4f1d05 100644
> --- a/include/net/devlink.h
> +++ b/include/net/devlink.h
> @@ -1585,6 +1585,11 @@ struct devlink_ops {
> struct devlink_rate *parent,
> void *priv_child, void *priv_parent,
> struct netlink_ext_ack *extack);
> + /* Indicates if cross-device rate nodes are supported.
> + * This also requires a shared common ancestor object all devices that
> + * could share rate nodes are nested in.
> + */
> + bool supported_cross_device_rate_nodes;
When a device acting as a cross-device rate parent is destroyed,
devl_rate_nodes_destroy() iterates the shared rate_list to detach its
children. If it encounters a child rate node belonging to a different device,
it invokes the child driver's ops->rate_node_parent_set or
ops->rate_leaf_parent_set callbacks.
Since the cleanup path only holds the parent's devlink lock and the shared
root devlink lock, does this allow the child driver's rate operations to
execute concurrently with other operations on the child device without holding
the child's devlink instance lock?
[ ... ]
> diff --git a/net/devlink/rate.c b/net/devlink/rate.c
> index 1949746fab291..f243cccc95bee 100644
> --- a/net/devlink/rate.c
> +++ b/net/devlink/rate.c
> @@ -30,19 +30,53 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
> return devlink_rate ?: ERR_PTR(-ENODEV);
> }
>
> +/* Repeatedly locks the nested-in devlink instances while cross device rate
> + * nodes are supported. Returns the devlink instance where rates should be
> + * stored.
> + */
> static struct devlink *devl_rate_lock(struct devlink *devlink)
> {
> - return devlink;
> + struct devlink *rate_devlink = devlink;
> +
> + while (rate_devlink->ops &&
> + rate_devlink->ops->supported_cross_device_rate_nodes) {
> + devlink = devlink_nested_in_get_lock(rate_devlink->rel);
> + if (!devlink)
> + break;
> + rate_devlink = devlink;
> + }
> + return rate_devlink;
> }
[ ... ]
> +/* Repeatedly unlocks the nested-in devlink instances of 'devlink' while cross
> + * device nodes are supported.
> + */
> static void devl_rate_unlock(struct devlink *devlink)
> {
> + if (!devlink || !devlink->ops ||
> + !devlink->ops->supported_cross_device_rate_nodes)
> + return;
> +
> + devl_rate_unlock(devlink_nested_in_get_locked(devlink->rel));
> + devlink_nested_in_put_unlock(devlink->rel);
> }
If a parent devlink is not yet registered, devlink_nested_in_get_lock() can
return NULL in devl_rate_lock() and break the loop without taking a lock.
If another thread completes registering the parent devlink concurrently,
devlink_nested_in_get_locked() in devl_rate_unlock() will find it.
Could this lead to unlocking a mutex that was never acquired and dropping a
reference that was never taken, resulting in a use-after-free or lock
corruption?
[ ... ]
> @@ -320,13 +371,14 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
> struct genl_info *info,
> struct nlattr *nla_parent)
> {
> - struct devlink *devlink = devlink_rate->devlink;
> + struct devlink *devlink = devlink_rate->devlink, *parent_devlink;
> const char *parent_name = nla_data(nla_parent);
> const struct devlink_ops *ops = devlink->ops;
> size_t len = strlen(parent_name);
> struct devlink_rate *parent;
> int err = -EOPNOTSUPP;
>
> + parent_devlink = devlink_nl_ctx(info)->parent_devlink ? : devlink;
> parent = devlink_rate->parent;
>
> if (parent && !len) {
> @@ -344,7 +396,13 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
> refcount_dec(&parent->refcnt);
> devlink_rate->parent = NULL;
> } else if (len) {
> - parent = devlink_rate_node_get_by_name(devlink, parent_name);
> + /* parent_devlink (when different than devlink) isn't locked,
> + * but the rate node devlink instance is, so nobody from the
> + * same group of devices sharing rates could change the used
> + * fields or unregister the parent.
> + */
> + parent = devlink_rate_node_get_by_name(parent_devlink,
> + parent_name);
Is parent_devlink validated to ensure it shares the same common rate root
ancestor as devlink?
If an arbitrary unrelated parent_devlink is specified,
devlink_rate_node_get_by_name() could traverse an unrelated parent's
rate_list that was never locked by devl_rate_lock(), leading to a data race.
Additionally, if a rate node is found, its priv pointer is passed to the
target driver's ops->rate_node_parent_set. Could this cause the target driver
to cast a foreign priv pointer to its own private struct type, resulting in
type confusion and memory corruption?
On Mon, 2026-03-30 at 19:08 -0700, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> devlink: Allow rate node parents from other devlinks
>
> This commit makes use of the building blocks previously added to
> implement cross-device rate nodes.
>
> > diff --git a/include/net/devlink.h b/include/net/devlink.h
> > index 3038af6ec017e..8d5ad5d4f1d05 100644
> > --- a/include/net/devlink.h
> > +++ b/include/net/devlink.h
> > @@ -1585,6 +1585,11 @@ struct devlink_ops {
> > struct devlink_rate *parent,
> > void *priv_child, void
> > *priv_parent,
> > struct netlink_ext_ack
> > *extack);
> > + /* Indicates if cross-device rate nodes are supported.
> > + * This also requires a shared common ancestor object all
> > devices that
> > + * could share rate nodes are nested in.
> > + */
> > + bool supported_cross_device_rate_nodes;
>
> When a device acting as a cross-device rate parent is destroyed,
> devl_rate_nodes_destroy() iterates the shared rate_list to detach its
> children. If it encounters a child rate node belonging to a different
> device,
> it invokes the child driver's ops->rate_node_parent_set or
> ops->rate_leaf_parent_set callbacks.
>
> Since the cleanup path only holds the parent's devlink lock and the
> shared
> root devlink lock, does this allow the child driver's rate operations
> to
> execute concurrently with other operations on the child device
> without holding
> the child's devlink instance lock?
The rate_node lock protects changes to the rate hierarchy, even if
child devlink instance locks aren't held. The code works by locking
everything from the affected instance A up to the instance at the root
of the hierarchy R, and affects the hierarchy with R's instance lock
held. I think this can be made clearer in the code/comments though, so
Will tweak some things in the next version.
>
> [ ... ]
> > diff --git a/net/devlink/rate.c b/net/devlink/rate.c
> > index 1949746fab291..f243cccc95bee 100644
> > --- a/net/devlink/rate.c
> > +++ b/net/devlink/rate.c
> > @@ -30,19 +30,53 @@ devlink_rate_leaf_get_from_info(struct devlink
> > *devlink, struct genl_info *info)
> > return devlink_rate ?: ERR_PTR(-ENODEV);
> > }
> >
> > +/* Repeatedly locks the nested-in devlink instances while cross
> > device rate
> > + * nodes are supported. Returns the devlink instance where rates
> > should be
> > + * stored.
> > + */
> > static struct devlink *devl_rate_lock(struct devlink *devlink)
> > {
> > - return devlink;
> > + struct devlink *rate_devlink = devlink;
> > +
> > + while (rate_devlink->ops &&
> > + rate_devlink->ops-
> > >supported_cross_device_rate_nodes) {
> > + devlink = devlink_nested_in_get_lock(rate_devlink-
> > >rel);
> > + if (!devlink)
> > + break;
> > + rate_devlink = devlink;
> > + }
> > + return rate_devlink;
> > }
>
> [ ... ]
>
> > +/* Repeatedly unlocks the nested-in devlink instances of 'devlink'
> > while cross
> > + * device nodes are supported.
> > + */
> > static void devl_rate_unlock(struct devlink *devlink)
> > {
> > + if (!devlink || !devlink->ops ||
> > + !devlink->ops->supported_cross_device_rate_nodes)
> > + return;
> > +
> > + devl_rate_unlock(devlink_nested_in_get_locked(devlink-
> > >rel));
> > + devlink_nested_in_put_unlock(devlink->rel);
> > }
>
> If a parent devlink is not yet registered,
> devlink_nested_in_get_lock() can
> return NULL in devl_rate_lock() and break the loop without taking a
> lock.
>
> If another thread completes registering the parent devlink
> concurrently,
> devlink_nested_in_get_locked() in devl_rate_unlock() will find it.
>
> Could this lead to unlocking a mutex that was never acquired and
> dropping a
> reference that was never taken, resulting in a use-after-free or lock
> corruption?
A valid concern in theory, even though in practice there won't be
another thread registering a shared devlink instance that would race
with registration in this way.
I will make these lock/unlock helpers more robust in the next version.
>
> [ ... ]
> > @@ -320,13 +371,14 @@ devlink_nl_rate_parent_node_set(struct
> > devlink_rate *devlink_rate,
> > struct genl_info *info,
> > struct nlattr *nla_parent)
> > {
> > - struct devlink *devlink = devlink_rate->devlink;
> > + struct devlink *devlink = devlink_rate->devlink,
> > *parent_devlink;
> > const char *parent_name = nla_data(nla_parent);
> > const struct devlink_ops *ops = devlink->ops;
> > size_t len = strlen(parent_name);
> > struct devlink_rate *parent;
> > int err = -EOPNOTSUPP;
> >
> > + parent_devlink = devlink_nl_ctx(info)->parent_devlink ? :
> > devlink;
> > parent = devlink_rate->parent;
> >
> > if (parent && !len) {
> > @@ -344,7 +396,13 @@ devlink_nl_rate_parent_node_set(struct
> > devlink_rate *devlink_rate,
> > refcount_dec(&parent->refcnt);
> > devlink_rate->parent = NULL;
> > } else if (len) {
> > - parent = devlink_rate_node_get_by_name(devlink,
> > parent_name);
> > + /* parent_devlink (when different than devlink)
> > isn't locked,
> > + * but the rate node devlink instance is, so
> > nobody from the
> > + * same group of devices sharing rates could
> > change the used
> > + * fields or unregister the parent.
> > + */
> > + parent =
> > devlink_rate_node_get_by_name(parent_devlink,
> > +
> > parent_name);
>
> Is parent_devlink validated to ensure it shares the same common rate
> root
> ancestor as devlink?
>
> If an arbitrary unrelated parent_devlink is specified,
> devlink_rate_node_get_by_name() could traverse an unrelated parent's
> rate_list that was never locked by devl_rate_lock(), leading to a
> data race.
>
> Additionally, if a rate node is found, its priv pointer is passed to
> the
> target driver's ops->rate_node_parent_set. Could this cause the
> target driver
> to cast a foreign priv pointer to its own private struct type,
> resulting in
> type confusion and memory corruption?
Right, there's no common ancestor validation which could lead to
unpleasant results if one tries to set a rate parent across drivers
which support cross-esw.
I'll add it in the next version.
Cosmin.
© 2016 - 2026 Red Hat, Inc.