[libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities

Edan David posted 1 patch 6 years, 8 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/1503307193-41414-1-git-send-email-edand@mellanox.com
configure.ac                                      |  13 ++
docs/formatnode.html.in                           |   1 +
src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
src/util/virnetdev.h                              |   1 +
tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
6 files changed, 203 insertions(+), 1 deletion(-)
[libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by Edan David 6 years, 8 months ago
Adding functionality to libvirt that will allow querying the interface
for the availability of switchdev Offloading NIC capabilities.
The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
command to retrieve the swtichdev NIC feature,
Command example:  devlink dev eswitch show pci/0000:03:00.0
This feature is needed for Openstack so we can do a scheduling decision
if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
And select the appropriate hypervisors with the requested capability see [1].

[1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
---
 configure.ac                                      |  13 ++
 docs/formatnode.html.in                           |   1 +
 src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
 src/util/virnetdev.h                              |   1 +
 tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
 tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
 6 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index b12b7fa..c089798 100644
--- a/configure.ac
+++ b/configure.ac
@@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
     AC_CHECK_HEADERS([linux/btrfs.h])
 fi
 
+dnl
+dnl check for kernel headers required by devlink
+dnl
+if test "$with_linux" = "yes"; then
+    AC_CHECK_HEADERS([linux/devlink.h])
+    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],
+                   [AC_DEFINE([HAVE_DECL_DEVLINK],
+                              [1],
+                              [whether devlink declarations is available])],
+                   [],
+                   [[#include <linux/devlink.h>]])
+fi
+
 dnl Allow perl/python overrides
 AC_PATH_PROGS([PYTHON], [python2 python])
 if test -z "$PYTHON"; then
diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in
index 4d935b5..29244a8 100644
--- a/docs/formatnode.html.in
+++ b/docs/formatnode.html.in
@@ -227,6 +227,7 @@
                     <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
                     <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
                     <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
+                    <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd>
                 </dl>
               </dd>
               <dt><code>capability</code></dt>
diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c
index 51a6e42..fc7c961 100644
--- a/src/util/virnetdev.c
+++ b/src/util/virnetdev.c
@@ -59,6 +59,10 @@
 # include <net/if_dl.h>
 #endif
 
+#if HAVE_DECL_DEVLINK
+# include <linux/devlink.h>
+#endif
+
 #ifndef IFNAMSIZ
 # define IFNAMSIZ 16
 #endif
@@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
               "ntuple",
               "rxhash",
               "rdma",
-              "txudptnl")
+              "txudptnl",
+              "switchdev")
 
 #ifdef __linux__
 int
@@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
     return ret;
 }
 
+
 #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
 
 /**
@@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap,
 }
 
 
+#if HAVE_DECL_DEVLINK
+/**
+ * virNetDevPutExtraHeader
+ * reserve and prepare room for an extra header
+ * This function sets to zero the room that is required to put the extra
+ * header after the initial Netlink header. This function also increases
+ * the nlmsg_len field.
+ *
+ * @nlh: pointer to Netlink header
+ * @size: size of the extra header that we want to put
+ *
+ * Returns pointer to the start of the extended header
+ */
+static void *
+virNetDevPutExtraHeader(struct nlmsghdr *nlh,
+                        size_t size)
+{
+    char *ptr = (char *)nlh + nlh->nlmsg_len;
+    size_t len = NLMSG_ALIGN(size);
+    nlh->nlmsg_len += len;
+    memset(ptr, 0, len);
+    return ptr;
+}
+
+
+/**
+ * virNetDevGetFamilyId:
+ * This function supplies the devlink family id
+ *
+ * @family_name: the name of the family to query
+ *
+ * Returns family id or 0 on failure.
+ */
+static uint32_t
+virNetDevGetFamilyId(const char *family_name)
+{
+    struct nl_msg *nl_msg = NULL;
+    struct nlmsghdr *resp = NULL;
+    struct genlmsghdr* gmsgh = NULL;
+    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
+    unsigned int recvbuflen;
+    uint32_t family_id = 0;
+
+    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
+                                      NLM_F_REQUEST | NLM_F_ACK))) {
+        virReportOOMError();
+        goto cleanup;
+    }
+
+    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
+        goto cleanup;
+
+    gmsgh->cmd = CTRL_CMD_GETFAMILY;
+    gmsgh->version = DEVLINK_GENL_VERSION;
+
+    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("allocated netlink buffer is too small"));
+        goto cleanup;
+    }
+
+    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
+        goto cleanup;
+
+    if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("malformed netlink response message"));
+        goto cleanup;
+    }
+
+    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
+        goto cleanup;
+
+    family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
+
+ cleanup:
+    nlmsg_free(nl_msg);
+    VIR_FREE(resp);
+    return family_id;
+}
+
+
+/**
+ * virNetDevSwitchdevFeature
+ * This function checks for the availability of Switchdev feature
+ * and add it to bitmap
+ *
+ * @ifname: name of the interface
+ * @out: add Switchdev feature if exist to bitmap
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+virNetDevSwitchdevFeature(const char *ifname,
+                          virBitmapPtr *out)
+{
+    struct nl_msg *nl_msg = NULL;
+    struct nlmsghdr *resp = NULL;
+    unsigned int recvbuflen;
+    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
+    virPCIDevicePtr pci_device_ptr = NULL;
+    struct genlmsghdr* gmsgh = NULL;
+    const char *pci_name;
+    char *pfname = NULL;
+    int is_vf = -1;
+    int ret = -1;
+    uint32_t family_id;
+
+    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
+        return ret;
+
+    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
+        return ret;
+
+    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
+        goto cleanup;
+
+    if (!(nl_msg = nlmsg_alloc_simple(family_id,
+                                      NLM_F_REQUEST | NLM_F_ACK))) {
+        virReportOOMError();
+        goto cleanup;
+    }
+
+    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
+        goto cleanup;
+
+    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
+    gmsgh->version = DEVLINK_GENL_VERSION;
+
+    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
+                              virNetDevGetPCIDevice(ifname);
+    if (pci_device_ptr == NULL)
+        goto cleanup;
+
+    pci_name = virPCIDeviceGetName(pci_device_ptr);
+
+    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
+        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("allocated netlink buffer is too small"));
+        goto cleanup;
+    }
+
+    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
+        goto cleanup;
+
+    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("malformed netlink response message"));
+        goto cleanup;
+    }
+
+    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
+        *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
+        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
+    }
+
+    ret = 0;
+
+ cleanup:
+    nlmsg_free(nl_msg);
+    virPCIDeviceFree(pci_device_ptr);
+    VIR_FREE(resp);
+    VIR_FREE(pfname);
+    return ret;
+}
+#else
+static int
+virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
+                          virBitmapPtr *out ATTRIBUTE_UNUSED)
+{
+    return 0;
+}
+#endif
+
+
 # if HAVE_DECL_ETHTOOL_GFEATURES
 /**
  * virNetDevGFeatureAvailable
@@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
     if (virNetDevRDMAFeature(ifname, out) < 0)
         goto cleanup;
 
+    if (virNetDevSwitchdevFeature(ifname, out) < 0)
+        goto cleanup;
+
     ret = 0;
  cleanup:
     VIR_FORCE_CLOSE(fd);
diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h
index 9205c0e..71eaf45 100644
--- a/src/util/virnetdev.h
+++ b/src/util/virnetdev.h
@@ -112,6 +112,7 @@ typedef enum {
     VIR_NET_DEV_FEAT_RXHASH,
     VIR_NET_DEV_FEAT_RDMA,
     VIR_NET_DEV_FEAT_TXUDPTNL,
+    VIR_NET_DEV_FEAT_SWITCHDEV,
     VIR_NET_DEV_FEAT_LAST
 } virNetDevFeature;
 
diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
index d4c96e8..88252e6 100644
--- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
+++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
@@ -15,6 +15,7 @@
     <feature name='rxhash'/>
     <feature name='rdma'/>
     <feature name='txudptnl'/>
+    <feature name='switchdev'/>
     <capability type='80211'/>
   </capability>
 </device>
diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
index 71bf90e..f77dfcc 100644
--- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
+++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
@@ -15,6 +15,7 @@
     <feature name='rxhash'/>
     <feature name='rdma'/>
     <feature name='txudptnl'/>
+    <feature name='switchdev'/>
     <capability type='80203'/>
   </capability>
 </device>
-- 
2.1.4

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by John Ferlan 6 years, 8 months ago

On 08/21/2017 05:19 AM, Edan David wrote:
> Adding functionality to libvirt that will allow querying the interface
> for the availability of switchdev Offloading NIC capabilities.
> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
> command to retrieve the swtichdev NIC feature,
> Command example:  devlink dev eswitch show pci/0000:03:00.0
> This feature is needed for Openstack so we can do a scheduling decision
> if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
> And select the appropriate hypervisors with the requested capability see [1].
> 
> [1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
> ---
>  configure.ac                                      |  13 ++
>  docs/formatnode.html.in                           |   1 +
>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>  src/util/virnetdev.h                              |   1 +
>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>  6 files changed, 203 insertions(+), 1 deletion(-)
> 

Again - make check syntax-check had a failure because properly indent
"#if" to be "# if" (also "# else" and "# endif")

I can fix that easily before pushing though.

Reviewed-by: John Ferlan <jferlan@redhat.com>

I'll wait a day before pushing - just in case laine comes back from
chasing the eclipse and has a comment to make...

Tks -

John

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by Edan David 6 years, 8 months ago
Hi John,

For some reason I don't see these errors in my environment,
I would appreciate it if you could fix them before merging.
Thank you!!

-----Original Message-----
From: John Ferlan [mailto:jferlan@redhat.com] 
Sent: Wednesday, August 23, 2017 8:44 PM
To: Edan David <edand@mellanox.com>; libvir-list@redhat.com
Subject: Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities



On 08/21/2017 05:19 AM, Edan David wrote:
> Adding functionality to libvirt that will allow querying the interface 
> for the availability of switchdev Offloading NIC capabilities.
> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink 
> command to retrieve the swtichdev NIC feature, Command example:  
> devlink dev eswitch show pci/0000:03:00.0 This feature is needed for 
> Openstack so we can do a scheduling decision if the NIC is in Hardware 
> Offload (switchdev) or regular SR-IOV (legacy) mode.
> And select the appropriate hypervisors with the requested capability see [1].
> 
> [1] - 
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fspe
> cs.openstack.org%2Fopenstack%2Fnova-specs%2Fspecs%2Fpike%2Fapproved%2F
> enable-sriov-nic-features.html&data=02%7C01%7Cedand%40mellanox.com%7C7
> 58a68d1796c435a8af108d4ea4e889b%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0
> %7C0%7C636391070397000461&sdata=dbe8Z6P71ZEqC3u46YB1hkmmm8pQgYpWjjVxvB
> fu5Ko%3D&reserved=0
> ---
>  configure.ac                                      |  13 ++
>  docs/formatnode.html.in                           |   1 +
>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>  src/util/virnetdev.h                              |   1 +
>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>  6 files changed, 203 insertions(+), 1 deletion(-)
> 

Again - make check syntax-check had a failure because properly indent "#if" to be "# if" (also "# else" and "# endif")

I can fix that easily before pushing though.

Reviewed-by: John Ferlan <jferlan@redhat.com>

I'll wait a day before pushing - just in case laine comes back from chasing the eclipse and has a comment to make...

Tks -

John


--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by Laine Stump 6 years, 7 months ago
(Almost all of my comments result in "ok, no action needed". There are
just three items I would like to see changed (2 trivial, 1 also small
but Edan or John may think it prudent to re-test with the change before
pushing) - I marked those comments with [**].

On 08/21/2017 05:19 AM, Edan David wrote:
> Adding functionality to libvirt that will allow querying the interface
> for the availability of switchdev Offloading NIC capabilities.
> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
> command to retrieve the swtichdev NIC feature,
> Command example:  devlink dev eswitch show pci/0000:03:00.0
> This feature is needed for Openstack so we can do a scheduling decision
> if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
> And select the appropriate hypervisors with the requested capability see [1].
>
> [1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
> ---
>  configure.ac                                      |  13 ++
>  docs/formatnode.html.in                           |   1 +
>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>  src/util/virnetdev.h                              |   1 +
>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>  6 files changed, 203 insertions(+), 1 deletion(-)
>
> diff --git a/configure.ac b/configure.ac
> index b12b7fa..c089798 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>      AC_CHECK_HEADERS([linux/btrfs.h])
>  fi
>  
> +dnl
> +dnl check for kernel headers required by devlink
> +dnl
> +if test "$with_linux" = "yes"; then
> +    AC_CHECK_HEADERS([linux/devlink.h])
> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],

That's very ..... thorough, and potentially misleading if someone ever
wanted to use devlink to check for something other than switchdev (e.g.
[mythical feature X]) on some system that didn't have the proper stuff
defined for switchdev, but did have it for [other mythical feature X].
But that's all just hypothetical, so this is fine with me.


> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
> +                              [1],
> +                              [whether devlink declarations is available])],

[**]
s/is/are/

> +                   [],
> +                   [[#include <linux/devlink.h>]])
> +fi
> +
>  dnl Allow perl/python overrides
>  AC_PATH_PROGS([PYTHON], [python2 python])
>  if test -z "$PYTHON"; then
> diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in
> index 4d935b5..29244a8 100644
> --- a/docs/formatnode.html.in
> +++ b/docs/formatnode.html.in
> @@ -227,6 +227,7 @@
>                      <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
>                      <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
>                      <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
> +                    <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd>

pretty odd abbreviation. But it is what it is :-)

>                  </dl>
>                </dd>
>                <dt><code>capability</code></dt>
> diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c
> index 51a6e42..fc7c961 100644
> --- a/src/util/virnetdev.c
> +++ b/src/util/virnetdev.c
> @@ -59,6 +59,10 @@
>  # include <net/if_dl.h>
>  #endif
>  
> +#if HAVE_DECL_DEVLINK
> +# include <linux/devlink.h>
> +#endif
> +
>  #ifndef IFNAMSIZ
>  # define IFNAMSIZ 16
>  #endif
> @@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
>                "ntuple",
>                "rxhash",
>                "rdma",
> -              "txudptnl")
> +              "txudptnl",
> +              "switchdev")
>  
>  #ifdef __linux__
>  int
> @@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
>      return ret;
>  }
>  
> +

[**]
Added a spurious extra line.

>  #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
>  
>  /**
> @@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap,
>  }
>  
>  
> +#if HAVE_DECL_DEVLINK
> +/**
> + * virNetDevPutExtraHeader
> + * reserve and prepare room for an extra header
> + * This function sets to zero the room that is required to put the extra
> + * header after the initial Netlink header. This function also increases
> + * the nlmsg_len field.
> + *
> + * @nlh: pointer to Netlink header
> + * @size: size of the extra header that we want to put
> + *
> + * Returns pointer to the start of the extended header
> + */
> +static void *
> +virNetDevPutExtraHeader(struct nlmsghdr *nlh,
> +                        size_t size)
> +{
> +    char *ptr = (char *)nlh + nlh->nlmsg_len;
> +    size_t len = NLMSG_ALIGN(size);
> +    nlh->nlmsg_len += len;
> +    memset(ptr, 0, len);

[**]
I'm fairly confident this memset() is unnecessary. The buffer the "extra
header" is being added to is allocated with nlmsg_alloc_simple(); I
looked at the source for nlmsg_alloc_simple(), and it ends up calling
calloc(), which will initialize everything to 0 anyway (and any
important fields are set to some other value immediately after return
from virNetDevPutExtraHeader()). The reason the extra memset() bothers
me is that 1) we don't set memory allocated for netlink messages to 0
anywhere else in our netlink-related code so it's inconsistent, and 2)
having an extra memset(0) when it's not needed could end up becoming the
start of a "cargo cult" practice of calling memset(0) all over the place
just because we're paranoid - I'd rather avoid extra initialization if
it's redundant/superfluous.

I of course wouldn't want this to be pushed with such a change without
having it tested. I did make this change locally and ran it under gdb
with a breakpoint on virNetDevPutExtraHeader(), and saw that the entire
4096 bytes of *nl_msg is initialized to 0; it's up to you whether or not
you think it's necessary to re-test on a system that has a card that
support switchdev.

(alternately, John could push it as is, and I could send a separate
patch removing the memset(), which could be ACKed by Edan after he tries
it (or NACKed in the event that I'm completely wrong about it :-)

> +    return ptr;
> +}

I *was* going to say this doesn't need to be inside HAVE_DECL_DEVLINK,
but since it's defined as static, and only used by a function that is
itself inside HAVE_DECL_DEVLINK, this placement is correct.


> +
> +
> +/**
> + * virNetDevGetFamilyId:
> + * This function supplies the devlink family id
> + *
> + * @family_name: the name of the family to query
> + *
> + * Returns family id or 0 on failure.
> + */
> +static uint32_t
> +virNetDevGetFamilyId(const char *family_name)
> +{
> +    struct nl_msg *nl_msg = NULL;
> +    struct nlmsghdr *resp = NULL;
> +    struct genlmsghdr* gmsgh = NULL;
> +    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
> +    unsigned int recvbuflen;
> +    uint32_t family_id = 0;
> +
> +    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
> +        virReportOOMError();
> +        goto cleanup;
> +    }
> +
> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
> +        goto cleanup;
> +
> +    gmsgh->cmd = CTRL_CMD_GETFAMILY;
> +    gmsgh->version = DEVLINK_GENL_VERSION;
> +
> +    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("allocated netlink buffer is too small"));
> +        goto cleanup;
> +    }
> +
> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
> +        goto cleanup;
> +
> +    if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("malformed netlink response message"));
> +        goto cleanup;
> +    }
> +
> +    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
> +        goto cleanup;
> +
> +    family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
> +
> + cleanup:
> +    nlmsg_free(nl_msg);
> +    VIR_FREE(resp);
> +    return family_id;
> +}
> +
> +
> +/**
> + * virNetDevSwitchdevFeature
> + * This function checks for the availability of Switchdev feature
> + * and add it to bitmap
> + *
> + * @ifname: name of the interface
> + * @out: add Switchdev feature if exist to bitmap
> + *
> + * Returns 0 on success, -1 on failure.
> + */
> +static int
> +virNetDevSwitchdevFeature(const char *ifname,
> +                          virBitmapPtr *out)
> +{
> +    struct nl_msg *nl_msg = NULL;
> +    struct nlmsghdr *resp = NULL;
> +    unsigned int recvbuflen;
> +    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
> +    virPCIDevicePtr pci_device_ptr = NULL;
> +    struct genlmsghdr* gmsgh = NULL;
> +    const char *pci_name;
> +    char *pfname = NULL;
> +    int is_vf = -1;
> +    int ret = -1;
> +    uint32_t family_id;
> +
> +    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
> +        return ret;
> +
> +    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
> +        return ret;
> +
> +    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
> +        goto cleanup;
> +
> +    if (!(nl_msg = nlmsg_alloc_simple(family_id,
> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
> +        virReportOOMError();
> +        goto cleanup;
> +    }
> +
> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
> +        goto cleanup;
> +
> +    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
> +    gmsgh->version = DEVLINK_GENL_VERSION;
> +
> +    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
> +                              virNetDevGetPCIDevice(ifname);
> +    if (pci_device_ptr == NULL)
> +        goto cleanup;
> +
> +    pci_name = virPCIDeviceGetName(pci_device_ptr);
> +
> +    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
> +        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("allocated netlink buffer is too small"));
> +        goto cleanup;
> +    }
> +
> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
> +        goto cleanup;
> +
> +    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("malformed netlink response message"));
> +        goto cleanup;
> +    }
> +
> +    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
> +        *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
> +        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
> +    }


I won't pretend that I know all about these particular netlink
messages/attributes. I can say that the code all looks similar to our
other netlink code, and when I run it on my host that doesn't have cards
supporting switchdev, it does execute this code and reports no switchdev
capability in the nodedev-dumpxml output of my NICs; since I'm sure Edan
has done the same test on a system that *does* have switchdev-capable
NICs, I'd say that's adequate testing.

Based on that I give my ACK (with at least the two trivial [**] items
changed, preferably also memset() removed), or if you prefer:

Reviewed-by: Laine Stump <laine@laine.org>



> +
> +    ret = 0;
> +
> + cleanup:
> +    nlmsg_free(nl_msg);
> +    virPCIDeviceFree(pci_device_ptr);
> +    VIR_FREE(resp);
> +    VIR_FREE(pfname);
> +    return ret;
> +}
> +#else
> +static int
> +virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
> +                          virBitmapPtr *out ATTRIBUTE_UNUSED)
> +{
> +    return 0;
> +}
> +#endif
> +
> +
>  # if HAVE_DECL_ETHTOOL_GFEATURES
>  /**
>   * virNetDevGFeatureAvailable
> @@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
>      if (virNetDevRDMAFeature(ifname, out) < 0)
>          goto cleanup;
>  
> +    if (virNetDevSwitchdevFeature(ifname, out) < 0)
> +        goto cleanup;
> +
>      ret = 0;
>   cleanup:
>      VIR_FORCE_CLOSE(fd);
> diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h
> index 9205c0e..71eaf45 100644
> --- a/src/util/virnetdev.h
> +++ b/src/util/virnetdev.h
> @@ -112,6 +112,7 @@ typedef enum {
>      VIR_NET_DEV_FEAT_RXHASH,
>      VIR_NET_DEV_FEAT_RDMA,
>      VIR_NET_DEV_FEAT_TXUDPTNL,
> +    VIR_NET_DEV_FEAT_SWITCHDEV,
>      VIR_NET_DEV_FEAT_LAST
>  } virNetDevFeature;
>  
> diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
> index d4c96e8..88252e6 100644
> --- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
> +++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
> @@ -15,6 +15,7 @@
>      <feature name='rxhash'/>
>      <feature name='rdma'/>
>      <feature name='txudptnl'/>
> +    <feature name='switchdev'/>
>      <capability type='80211'/>
>    </capability>
>  </device>
> diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
> index 71bf90e..f77dfcc 100644
> --- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
> +++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
> @@ -15,6 +15,7 @@
>      <feature name='rxhash'/>
>      <feature name='rdma'/>
>      <feature name='txudptnl'/>
> +    <feature name='switchdev'/>
>      <capability type='80203'/>
>    </capability>
>  </device>


--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by John Ferlan 6 years, 7 months ago

On 09/14/2017 11:23 AM, Laine Stump wrote:
> (Almost all of my comments result in "ok, no action needed". There are
> just three items I would like to see changed (2 trivial, 1 also small
> but Edan or John may think it prudent to re-test with the change before
> pushing) - I marked those comments with [**].
> 
> On 08/21/2017 05:19 AM, Edan David wrote:
>> Adding functionality to libvirt that will allow querying the interface
>> for the availability of switchdev Offloading NIC capabilities.
>> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
>> command to retrieve the swtichdev NIC feature,
>> Command example:  devlink dev eswitch show pci/0000:03:00.0
>> This feature is needed for Openstack so we can do a scheduling decision
>> if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
>> And select the appropriate hypervisors with the requested capability see [1].
>>
>> [1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
>> ---
>>  configure.ac                                      |  13 ++
>>  docs/formatnode.html.in                           |   1 +
>>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>>  src/util/virnetdev.h                              |   1 +
>>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>>  6 files changed, 203 insertions(+), 1 deletion(-)
>>
>> diff --git a/configure.ac b/configure.ac
>> index b12b7fa..c089798 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>>      AC_CHECK_HEADERS([linux/btrfs.h])
>>  fi
>>  
>> +dnl
>> +dnl check for kernel headers required by devlink
>> +dnl
>> +if test "$with_linux" = "yes"; then
>> +    AC_CHECK_HEADERS([linux/devlink.h])
>> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],
> 
> That's very ..... thorough, and potentially misleading if someone ever
> wanted to use devlink to check for something other than switchdev (e.g.
> [mythical feature X]) on some system that didn't have the proper stuff
> defined for switchdev, but did have it for [other mythical feature X].
> But that's all just hypothetical, so this is fine with me.
> 
> 
>> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
>> +                              [1],
>> +                              [whether devlink declarations is available])],
> 
> [**]
> s/is/are/
> 

Yep - altered...

>> +                   [],
>> +                   [[#include <linux/devlink.h>]])
>> +fi
>> +
>>  dnl Allow perl/python overrides
>>  AC_PATH_PROGS([PYTHON], [python2 python])
>>  if test -z "$PYTHON"; then
>> diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in
>> index 4d935b5..29244a8 100644
>> --- a/docs/formatnode.html.in
>> +++ b/docs/formatnode.html.in
>> @@ -227,6 +227,7 @@
>>                      <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
>>                      <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
>>                      <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
>> +                    <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd>
> 
> pretty odd abbreviation. But it is what it is :-)
> >>                  </dl>
>>                </dd>
>>                <dt><code>capability</code></dt>
>> diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c
>> index 51a6e42..fc7c961 100644
>> --- a/src/util/virnetdev.c
>> +++ b/src/util/virnetdev.c
>> @@ -59,6 +59,10 @@
>>  # include <net/if_dl.h>
>>  #endif
>>  
>> +#if HAVE_DECL_DEVLINK
>> +# include <linux/devlink.h>
>> +#endif
>> +
>>  #ifndef IFNAMSIZ
>>  # define IFNAMSIZ 16
>>  #endif
>> @@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
>>                "ntuple",
>>                "rxhash",
>>                "rdma",
>> -              "txudptnl")
>> +              "txudptnl",
>> +              "switchdev")
>>  
>>  #ifdef __linux__
>>  int
>> @@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
>>      return ret;
>>  }
>>  
>> +
> 
> [**]
> Added a spurious extra line.
> 

Removed.

>>  #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
>>  
>>  /**
>> @@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap,
>>  }
>>  
>>  
>> +#if HAVE_DECL_DEVLINK
>> +/**
>> + * virNetDevPutExtraHeader
>> + * reserve and prepare room for an extra header
>> + * This function sets to zero the room that is required to put the extra
>> + * header after the initial Netlink header. This function also increases
>> + * the nlmsg_len field.
>> + *
>> + * @nlh: pointer to Netlink header
>> + * @size: size of the extra header that we want to put
>> + *
>> + * Returns pointer to the start of the extended header
>> + */
>> +static void *
>> +virNetDevPutExtraHeader(struct nlmsghdr *nlh,
>> +                        size_t size)
>> +{
>> +    char *ptr = (char *)nlh + nlh->nlmsg_len;
>> +    size_t len = NLMSG_ALIGN(size);
>> +    nlh->nlmsg_len += len;
>> +    memset(ptr, 0, len);
> 
> [**]
> I'm fairly confident this memset() is unnecessary. The buffer the "extra
> header" is being added to is allocated with nlmsg_alloc_simple(); I
> looked at the source for nlmsg_alloc_simple(), and it ends up calling
> calloc(), which will initialize everything to 0 anyway (and any
> important fields are set to some other value immediately after return
> from virNetDevPutExtraHeader()). The reason the extra memset() bothers
> me is that 1) we don't set memory allocated for netlink messages to 0
> anywhere else in our netlink-related code so it's inconsistent, and 2)
> having an extra memset(0) when it's not needed could end up becoming the
> start of a "cargo cult" practice of calling memset(0) all over the place
> just because we're paranoid - I'd rather avoid extra initialization if
> it's redundant/superfluous.
> 
> I of course wouldn't want this to be pushed with such a change without
> having it tested. I did make this change locally and ran it under gdb
> with a breakpoint on virNetDevPutExtraHeader(), and saw that the entire
> 4096 bytes of *nl_msg is initialized to 0; it's up to you whether or not
> you think it's necessary to re-test on a system that has a card that
> support switchdev.
> 
> (alternately, John could push it as is, and I could send a separate
> patch removing the memset(), which could be ACKed by Edan after he tries
> it (or NACKed in the event that I'm completely wrong about it :-)
> 

I saw this on my initial review and thought perhaps it was superfluous.
Tracing through various code to find the original calloc is impressive!
I didn't originally do anything with it because I didn't think it would
hurt, but I agree with you.

So I've removed it from my local copy of the patch that I'll use to push
on Monday. Perhaps by then Edan will see these messages and be able to
indicate one way or another whether it should be kept from a "retest"
POV.  Besides, if anything breaks in one of the many CI builds, I don't
want to spent weekend hours trying to clean up after it.


John

>> +    return ptr;
>> +}
> 
> I *was* going to say this doesn't need to be inside HAVE_DECL_DEVLINK,
> but since it's defined as static, and only used by a function that is
> itself inside HAVE_DECL_DEVLINK, this placement is correct.
> 
> 
>> +
>> +
>> +/**
>> + * virNetDevGetFamilyId:
>> + * This function supplies the devlink family id
>> + *
>> + * @family_name: the name of the family to query
>> + *
>> + * Returns family id or 0 on failure.
>> + */
>> +static uint32_t
>> +virNetDevGetFamilyId(const char *family_name)
>> +{
>> +    struct nl_msg *nl_msg = NULL;
>> +    struct nlmsghdr *resp = NULL;
>> +    struct genlmsghdr* gmsgh = NULL;
>> +    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
>> +    unsigned int recvbuflen;
>> +    uint32_t family_id = 0;
>> +
>> +    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>> +        virReportOOMError();
>> +        goto cleanup;
>> +    }
>> +
>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>> +        goto cleanup;
>> +
>> +    gmsgh->cmd = CTRL_CMD_GETFAMILY;
>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>> +
>> +    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("allocated netlink buffer is too small"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>> +        goto cleanup;
>> +
>> +    if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("malformed netlink response message"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
>> +        goto cleanup;
>> +
>> +    family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
>> +
>> + cleanup:
>> +    nlmsg_free(nl_msg);
>> +    VIR_FREE(resp);
>> +    return family_id;
>> +}
>> +
>> +
>> +/**
>> + * virNetDevSwitchdevFeature
>> + * This function checks for the availability of Switchdev feature
>> + * and add it to bitmap
>> + *
>> + * @ifname: name of the interface
>> + * @out: add Switchdev feature if exist to bitmap
>> + *
>> + * Returns 0 on success, -1 on failure.
>> + */
>> +static int
>> +virNetDevSwitchdevFeature(const char *ifname,
>> +                          virBitmapPtr *out)
>> +{
>> +    struct nl_msg *nl_msg = NULL;
>> +    struct nlmsghdr *resp = NULL;
>> +    unsigned int recvbuflen;
>> +    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
>> +    virPCIDevicePtr pci_device_ptr = NULL;
>> +    struct genlmsghdr* gmsgh = NULL;
>> +    const char *pci_name;
>> +    char *pfname = NULL;
>> +    int is_vf = -1;
>> +    int ret = -1;
>> +    uint32_t family_id;
>> +
>> +    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
>> +        return ret;
>> +
>> +    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
>> +        return ret;
>> +
>> +    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
>> +        goto cleanup;
>> +
>> +    if (!(nl_msg = nlmsg_alloc_simple(family_id,
>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>> +        virReportOOMError();
>> +        goto cleanup;
>> +    }
>> +
>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>> +        goto cleanup;
>> +
>> +    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>> +
>> +    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
>> +                              virNetDevGetPCIDevice(ifname);
>> +    if (pci_device_ptr == NULL)
>> +        goto cleanup;
>> +
>> +    pci_name = virPCIDeviceGetName(pci_device_ptr);
>> +
>> +    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
>> +        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("allocated netlink buffer is too small"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>> +        goto cleanup;
>> +
>> +    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("malformed netlink response message"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
>> +        *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
>> +        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
>> +    }
> 
> 
> I won't pretend that I know all about these particular netlink
> messages/attributes. I can say that the code all looks similar to our
> other netlink code, and when I run it on my host that doesn't have cards
> supporting switchdev, it does execute this code and reports no switchdev
> capability in the nodedev-dumpxml output of my NICs; since I'm sure Edan
> has done the same test on a system that *does* have switchdev-capable
> NICs, I'd say that's adequate testing.
> 
> Based on that I give my ACK (with at least the two trivial [**] items
> changed, preferably also memset() removed), or if you prefer:
> 
> Reviewed-by: Laine Stump <laine@laine.org>
> 
> 
> 
>> +
>> +    ret = 0;
>> +
>> + cleanup:
>> +    nlmsg_free(nl_msg);
>> +    virPCIDeviceFree(pci_device_ptr);
>> +    VIR_FREE(resp);
>> +    VIR_FREE(pfname);
>> +    return ret;
>> +}
>> +#else
>> +static int
>> +virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
>> +                          virBitmapPtr *out ATTRIBUTE_UNUSED)
>> +{
>> +    return 0;
>> +}
>> +#endif
>> +
>> +
>>  # if HAVE_DECL_ETHTOOL_GFEATURES
>>  /**
>>   * virNetDevGFeatureAvailable
>> @@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
>>      if (virNetDevRDMAFeature(ifname, out) < 0)
>>          goto cleanup;
>>  
>> +    if (virNetDevSwitchdevFeature(ifname, out) < 0)
>> +        goto cleanup;
>> +
>>      ret = 0;
>>   cleanup:
>>      VIR_FORCE_CLOSE(fd);
>> diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h
>> index 9205c0e..71eaf45 100644
>> --- a/src/util/virnetdev.h
>> +++ b/src/util/virnetdev.h
>> @@ -112,6 +112,7 @@ typedef enum {
>>      VIR_NET_DEV_FEAT_RXHASH,
>>      VIR_NET_DEV_FEAT_RDMA,
>>      VIR_NET_DEV_FEAT_TXUDPTNL,
>> +    VIR_NET_DEV_FEAT_SWITCHDEV,
>>      VIR_NET_DEV_FEAT_LAST
>>  } virNetDevFeature;
>>  
>> diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> index d4c96e8..88252e6 100644
>> --- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> +++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> @@ -15,6 +15,7 @@
>>      <feature name='rxhash'/>
>>      <feature name='rdma'/>
>>      <feature name='txudptnl'/>
>> +    <feature name='switchdev'/>
>>      <capability type='80211'/>
>>    </capability>
>>  </device>
>> diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> index 71bf90e..f77dfcc 100644
>> --- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> +++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> @@ -15,6 +15,7 @@
>>      <feature name='rxhash'/>
>>      <feature name='rdma'/>
>>      <feature name='txudptnl'/>
>> +    <feature name='switchdev'/>
>>      <capability type='80203'/>
>>    </capability>
>>  </device>
> 
> 

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by Edan David 6 years, 7 months ago
I removed the memset from virNetDevPutExtraHeader and tested on a card supporting switchdev.
It looks fine to me :)
Great review guys, thanks!


-----Original Message-----
From: John Ferlan [mailto:jferlan@redhat.com] 
Sent: Saturday, September 16, 2017 2:04 PM
To: Laine Stump <laine@laine.org>; libvir-list@redhat.com
Cc: Edan David <edand@mellanox.com>
Subject: Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities



On 09/14/2017 11:23 AM, Laine Stump wrote:
> (Almost all of my comments result in "ok, no action needed". There are 
> just three items I would like to see changed (2 trivial, 1 also small 
> but Edan or John may think it prudent to re-test with the change 
> before
> pushing) - I marked those comments with [**].
> 
> On 08/21/2017 05:19 AM, Edan David wrote:
>> Adding functionality to libvirt that will allow querying the 
>> interface for the availability of switchdev Offloading NIC capabilities.
>> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink 
>> command to retrieve the swtichdev NIC feature, Command example:  
>> devlink dev eswitch show pci/0000:03:00.0 This feature is needed for 
>> Openstack so we can do a scheduling decision if the NIC is in 
>> Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
>> And select the appropriate hypervisors with the requested capability see [1].
>>
>> [1] - 
>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fsp
>> ecs.openstack.org%2Fopenstack%2Fnova-specs%2Fspecs%2Fpike%2Fapproved%
>> 2Fenable-sriov-nic-features.html&data=02%7C01%7Cedand%40mellanox.com%
>> 7C1e30890a5c7a4f9f045f08d4fcf2b479%7Ca652971c7d2e4d9ba6a4d149256f461b
>> %7C0%7C0%7C636411566714987559&sdata=Ri340H6MLap3HpOMDrb%2FFk6D9agQjEh
>> C9cvR7%2FbV1Ls%3D&reserved=0
>> ---
>>  configure.ac                                      |  13 ++
>>  docs/formatnode.html.in                           |   1 +
>>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>>  src/util/virnetdev.h                              |   1 +
>>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>>  6 files changed, 203 insertions(+), 1 deletion(-)
>>
>> diff --git a/configure.ac b/configure.ac index b12b7fa..c089798 
>> 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>>      AC_CHECK_HEADERS([linux/btrfs.h])  fi
>>  
>> +dnl
>> +dnl check for kernel headers required by devlink dnl if test 
>> +"$with_linux" = "yes"; then
>> +    AC_CHECK_HEADERS([linux/devlink.h])
>> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, 
>> +DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, 
>> +DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, 
>> +DEVLINK_ESWITCH_MODE_SWITCHDEV],
> 
> That's very ..... thorough, and potentially misleading if someone ever 
> wanted to use devlink to check for something other than switchdev (e.g.
> [mythical feature X]) on some system that didn't have the proper stuff 
> defined for switchdev, but did have it for [other mythical feature X].
> But that's all just hypothetical, so this is fine with me.
> 
> 
>> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
>> +                              [1],
>> +                              [whether devlink declarations is 
>> + available])],
> 
> [**]
> s/is/are/
> 

Yep - altered...

>> +                   [],
>> +                   [[#include <linux/devlink.h>]]) fi
>> +
>>  dnl Allow perl/python overrides
>>  AC_PATH_PROGS([PYTHON], [python2 python])  if test -z "$PYTHON"; 
>> then diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in 
>> index 4d935b5..29244a8 100644
>> --- a/docs/formatnode.html.in
>> +++ b/docs/formatnode.html.in
>> @@ -227,6 +227,7 @@
>>                      <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
>>                      <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
>>                      
>> <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
>> +                    
>> + <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd
>> + >
> 
> pretty odd abbreviation. But it is what it is :-)
> >>                  </dl>
>>                </dd>
>>                <dt><code>capability</code></dt> diff --git 
>> a/src/util/virnetdev.c b/src/util/virnetdev.c index 51a6e42..fc7c961 
>> 100644
>> --- a/src/util/virnetdev.c
>> +++ b/src/util/virnetdev.c
>> @@ -59,6 +59,10 @@
>>  # include <net/if_dl.h>
>>  #endif
>>  
>> +#if HAVE_DECL_DEVLINK
>> +# include <linux/devlink.h>
>> +#endif
>> +
>>  #ifndef IFNAMSIZ
>>  # define IFNAMSIZ 16
>>  #endif
>> @@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
>>                "ntuple",
>>                "rxhash",
>>                "rdma",
>> -              "txudptnl")
>> +              "txudptnl",
>> +              "switchdev")
>>  
>>  #ifdef __linux__
>>  int
>> @@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
>>      return ret;
>>  }
>>  
>> +
> 
> [**]
> Added a spurious extra line.
> 

Removed.

>>  #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
>>  
>>  /**
>> @@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr 
>> bitmap,  }
>>  
>>  
>> +#if HAVE_DECL_DEVLINK
>> +/**
>> + * virNetDevPutExtraHeader
>> + * reserve and prepare room for an extra header
>> + * This function sets to zero the room that is required to put the 
>> +extra
>> + * header after the initial Netlink header. This function also 
>> +increases
>> + * the nlmsg_len field.
>> + *
>> + * @nlh: pointer to Netlink header
>> + * @size: size of the extra header that we want to put
>> + *
>> + * Returns pointer to the start of the extended header  */ static 
>> +void * virNetDevPutExtraHeader(struct nlmsghdr *nlh,
>> +                        size_t size) {
>> +    char *ptr = (char *)nlh + nlh->nlmsg_len;
>> +    size_t len = NLMSG_ALIGN(size);
>> +    nlh->nlmsg_len += len;
>> +    memset(ptr, 0, len);
> 
> [**]
> I'm fairly confident this memset() is unnecessary. The buffer the 
> "extra header" is being added to is allocated with 
> nlmsg_alloc_simple(); I looked at the source for nlmsg_alloc_simple(), 
> and it ends up calling calloc(), which will initialize everything to 0 
> anyway (and any important fields are set to some other value 
> immediately after return from virNetDevPutExtraHeader()). The reason 
> the extra memset() bothers me is that 1) we don't set memory allocated 
> for netlink messages to 0 anywhere else in our netlink-related code so 
> it's inconsistent, and 2) having an extra memset(0) when it's not 
> needed could end up becoming the start of a "cargo cult" practice of 
> calling memset(0) all over the place just because we're paranoid - I'd 
> rather avoid extra initialization if it's redundant/superfluous.
> 
> I of course wouldn't want this to be pushed with such a change without 
> having it tested. I did make this change locally and ran it under gdb 
> with a breakpoint on virNetDevPutExtraHeader(), and saw that the 
> entire
> 4096 bytes of *nl_msg is initialized to 0; it's up to you whether or 
> not you think it's necessary to re-test on a system that has a card 
> that support switchdev.
> 
> (alternately, John could push it as is, and I could send a separate 
> patch removing the memset(), which could be ACKed by Edan after he 
> tries it (or NACKed in the event that I'm completely wrong about it 
> :-)
> 

I saw this on my initial review and thought perhaps it was superfluous.
Tracing through various code to find the original calloc is impressive!
I didn't originally do anything with it because I didn't think it would hurt, but I agree with you.

So I've removed it from my local copy of the patch that I'll use to push on Monday. Perhaps by then Edan will see these messages and be able to indicate one way or another whether it should be kept from a "retest"
POV.  Besides, if anything breaks in one of the many CI builds, I don't want to spent weekend hours trying to clean up after it.


John

>> +    return ptr;
>> +}
> 
> I *was* going to say this doesn't need to be inside HAVE_DECL_DEVLINK, 
> but since it's defined as static, and only used by a function that is 
> itself inside HAVE_DECL_DEVLINK, this placement is correct.
> 
> 
>> +
>> +
>> +/**
>> + * virNetDevGetFamilyId:
>> + * This function supplies the devlink family id
>> + *
>> + * @family_name: the name of the family to query
>> + *
>> + * Returns family id or 0 on failure.
>> + */
>> +static uint32_t
>> +virNetDevGetFamilyId(const char *family_name) {
>> +    struct nl_msg *nl_msg = NULL;
>> +    struct nlmsghdr *resp = NULL;
>> +    struct genlmsghdr* gmsgh = NULL;
>> +    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
>> +    unsigned int recvbuflen;
>> +    uint32_t family_id = 0;
>> +
>> +    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>> +        virReportOOMError();
>> +        goto cleanup;
>> +    }
>> +
>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>> +        goto cleanup;
>> +
>> +    gmsgh->cmd = CTRL_CMD_GETFAMILY;
>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>> +
>> +    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("allocated netlink buffer is too small"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>> +        goto cleanup;
>> +
>> +    if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("malformed netlink response message"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
>> +        goto cleanup;
>> +
>> +    family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
>> +
>> + cleanup:
>> +    nlmsg_free(nl_msg);
>> +    VIR_FREE(resp);
>> +    return family_id;
>> +}
>> +
>> +
>> +/**
>> + * virNetDevSwitchdevFeature
>> + * This function checks for the availability of Switchdev feature
>> + * and add it to bitmap
>> + *
>> + * @ifname: name of the interface
>> + * @out: add Switchdev feature if exist to bitmap
>> + *
>> + * Returns 0 on success, -1 on failure.
>> + */
>> +static int
>> +virNetDevSwitchdevFeature(const char *ifname,
>> +                          virBitmapPtr *out) {
>> +    struct nl_msg *nl_msg = NULL;
>> +    struct nlmsghdr *resp = NULL;
>> +    unsigned int recvbuflen;
>> +    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
>> +    virPCIDevicePtr pci_device_ptr = NULL;
>> +    struct genlmsghdr* gmsgh = NULL;
>> +    const char *pci_name;
>> +    char *pfname = NULL;
>> +    int is_vf = -1;
>> +    int ret = -1;
>> +    uint32_t family_id;
>> +
>> +    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
>> +        return ret;
>> +
>> +    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
>> +        return ret;
>> +
>> +    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
>> +        goto cleanup;
>> +
>> +    if (!(nl_msg = nlmsg_alloc_simple(family_id,
>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>> +        virReportOOMError();
>> +        goto cleanup;
>> +    }
>> +
>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>> +        goto cleanup;
>> +
>> +    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>> +
>> +    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
>> +                              virNetDevGetPCIDevice(ifname);
>> +    if (pci_device_ptr == NULL)
>> +        goto cleanup;
>> +
>> +    pci_name = virPCIDeviceGetName(pci_device_ptr);
>> +
>> +    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
>> +        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("allocated netlink buffer is too small"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>> +        goto cleanup;
>> +
>> +    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> +                       _("malformed netlink response message"));
>> +        goto cleanup;
>> +    }
>> +
>> +    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
>> +        *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
>> +        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
>> +    }
> 
> 
> I won't pretend that I know all about these particular netlink 
> messages/attributes. I can say that the code all looks similar to our 
> other netlink code, and when I run it on my host that doesn't have 
> cards supporting switchdev, it does execute this code and reports no 
> switchdev capability in the nodedev-dumpxml output of my NICs; since 
> I'm sure Edan has done the same test on a system that *does* have 
> switchdev-capable NICs, I'd say that's adequate testing.
> 
> Based on that I give my ACK (with at least the two trivial [**] items 
> changed, preferably also memset() removed), or if you prefer:
> 
> Reviewed-by: Laine Stump <laine@laine.org>
> 
> 
> 
>> +
>> +    ret = 0;
>> +
>> + cleanup:
>> +    nlmsg_free(nl_msg);
>> +    virPCIDeviceFree(pci_device_ptr);
>> +    VIR_FREE(resp);
>> +    VIR_FREE(pfname);
>> +    return ret;
>> +}
>> +#else
>> +static int
>> +virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
>> +                          virBitmapPtr *out ATTRIBUTE_UNUSED) {
>> +    return 0;
>> +}
>> +#endif
>> +
>> +
>>  # if HAVE_DECL_ETHTOOL_GFEATURES
>>  /**
>>   * virNetDevGFeatureAvailable
>> @@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
>>      if (virNetDevRDMAFeature(ifname, out) < 0)
>>          goto cleanup;
>>  
>> +    if (virNetDevSwitchdevFeature(ifname, out) < 0)
>> +        goto cleanup;
>> +
>>      ret = 0;
>>   cleanup:
>>      VIR_FORCE_CLOSE(fd);
>> diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index 
>> 9205c0e..71eaf45 100644
>> --- a/src/util/virnetdev.h
>> +++ b/src/util/virnetdev.h
>> @@ -112,6 +112,7 @@ typedef enum {
>>      VIR_NET_DEV_FEAT_RXHASH,
>>      VIR_NET_DEV_FEAT_RDMA,
>>      VIR_NET_DEV_FEAT_TXUDPTNL,
>> +    VIR_NET_DEV_FEAT_SWITCHDEV,
>>      VIR_NET_DEV_FEAT_LAST
>>  } virNetDevFeature;
>>  
>> diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml 
>> b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> index d4c96e8..88252e6 100644
>> --- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> +++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>> @@ -15,6 +15,7 @@
>>      <feature name='rxhash'/>
>>      <feature name='rdma'/>
>>      <feature name='txudptnl'/>
>> +    <feature name='switchdev'/>
>>      <capability type='80211'/>
>>    </capability>
>>  </device>
>> diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml 
>> b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> index 71bf90e..f77dfcc 100644
>> --- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> +++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>> @@ -15,6 +15,7 @@
>>      <feature name='rxhash'/>
>>      <feature name='rdma'/>
>>      <feature name='txudptnl'/>
>> +    <feature name='switchdev'/>
>>      <capability type='80203'/>
>>    </capability>
>>  </device>
> 
> 

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by John Ferlan 6 years, 7 months ago

On 09/17/2017 05:32 AM, Edan David wrote:
> I removed the memset from virNetDevPutExtraHeader and tested on a card supporting switchdev.
> It looks fine to me :)
> Great review guys, thanks!
> 

Thanks for checking... This is now pushed.

Congrats on your first libvirt patch,

John

> 
> -----Original Message-----
> From: John Ferlan [mailto:jferlan@redhat.com] 
> Sent: Saturday, September 16, 2017 2:04 PM
> To: Laine Stump <laine@laine.org>; libvir-list@redhat.com
> Cc: Edan David <edand@mellanox.com>
> Subject: Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
> 
> 
> 
> On 09/14/2017 11:23 AM, Laine Stump wrote:
>> (Almost all of my comments result in "ok, no action needed". There are 
>> just three items I would like to see changed (2 trivial, 1 also small 
>> but Edan or John may think it prudent to re-test with the change 
>> before
>> pushing) - I marked those comments with [**].
>>
>> On 08/21/2017 05:19 AM, Edan David wrote:
>>> Adding functionality to libvirt that will allow querying the 
>>> interface for the availability of switchdev Offloading NIC capabilities.
>>> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink 
>>> command to retrieve the swtichdev NIC feature, Command example:  
>>> devlink dev eswitch show pci/0000:03:00.0 This feature is needed for 
>>> Openstack so we can do a scheduling decision if the NIC is in 
>>> Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
>>> And select the appropriate hypervisors with the requested capability see [1].
>>>
>>> [1] - 
>>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fsp
>>> ecs.openstack.org%2Fopenstack%2Fnova-specs%2Fspecs%2Fpike%2Fapproved%
>>> 2Fenable-sriov-nic-features.html&data=02%7C01%7Cedand%40mellanox.com%
>>> 7C1e30890a5c7a4f9f045f08d4fcf2b479%7Ca652971c7d2e4d9ba6a4d149256f461b
>>> %7C0%7C0%7C636411566714987559&sdata=Ri340H6MLap3HpOMDrb%2FFk6D9agQjEh
>>> C9cvR7%2FbV1Ls%3D&reserved=0
>>> ---
>>>  configure.ac                                      |  13 ++
>>>  docs/formatnode.html.in                           |   1 +
>>>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>>>  src/util/virnetdev.h                              |   1 +
>>>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>>>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>>>  6 files changed, 203 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/configure.ac b/configure.ac index b12b7fa..c089798 
>>> 100644
>>> --- a/configure.ac
>>> +++ b/configure.ac
>>> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>>>      AC_CHECK_HEADERS([linux/btrfs.h])  fi
>>>  
>>> +dnl
>>> +dnl check for kernel headers required by devlink dnl if test 
>>> +"$with_linux" = "yes"; then
>>> +    AC_CHECK_HEADERS([linux/devlink.h])
>>> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, 
>>> +DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, 
>>> +DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, 
>>> +DEVLINK_ESWITCH_MODE_SWITCHDEV],
>>
>> That's very ..... thorough, and potentially misleading if someone ever 
>> wanted to use devlink to check for something other than switchdev (e.g.
>> [mythical feature X]) on some system that didn't have the proper stuff 
>> defined for switchdev, but did have it for [other mythical feature X].
>> But that's all just hypothetical, so this is fine with me.
>>
>>
>>> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
>>> +                              [1],
>>> +                              [whether devlink declarations is 
>>> + available])],
>>
>> [**]
>> s/is/are/
>>
> 
> Yep - altered...
> 
>>> +                   [],
>>> +                   [[#include <linux/devlink.h>]]) fi
>>> +
>>>  dnl Allow perl/python overrides
>>>  AC_PATH_PROGS([PYTHON], [python2 python])  if test -z "$PYTHON"; 
>>> then diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in 
>>> index 4d935b5..29244a8 100644
>>> --- a/docs/formatnode.html.in
>>> +++ b/docs/formatnode.html.in
>>> @@ -227,6 +227,7 @@
>>>                      <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
>>>                      <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
>>>                      
>>> <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
>>> +                    
>>> + <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd
>>> + >
>>
>> pretty odd abbreviation. But it is what it is :-)
>>>>                  </dl>
>>>                </dd>
>>>                <dt><code>capability</code></dt> diff --git 
>>> a/src/util/virnetdev.c b/src/util/virnetdev.c index 51a6e42..fc7c961 
>>> 100644
>>> --- a/src/util/virnetdev.c
>>> +++ b/src/util/virnetdev.c
>>> @@ -59,6 +59,10 @@
>>>  # include <net/if_dl.h>
>>>  #endif
>>>  
>>> +#if HAVE_DECL_DEVLINK
>>> +# include <linux/devlink.h>
>>> +#endif
>>> +
>>>  #ifndef IFNAMSIZ
>>>  # define IFNAMSIZ 16
>>>  #endif
>>> @@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
>>>                "ntuple",
>>>                "rxhash",
>>>                "rdma",
>>> -              "txudptnl")
>>> +              "txudptnl",
>>> +              "switchdev")
>>>  
>>>  #ifdef __linux__
>>>  int
>>> @@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
>>>      return ret;
>>>  }
>>>  
>>> +
>>
>> [**]
>> Added a spurious extra line.
>>
> 
> Removed.
> 
>>>  #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
>>>  
>>>  /**
>>> @@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr 
>>> bitmap,  }
>>>  
>>>  
>>> +#if HAVE_DECL_DEVLINK
>>> +/**
>>> + * virNetDevPutExtraHeader
>>> + * reserve and prepare room for an extra header
>>> + * This function sets to zero the room that is required to put the 
>>> +extra
>>> + * header after the initial Netlink header. This function also 
>>> +increases
>>> + * the nlmsg_len field.
>>> + *
>>> + * @nlh: pointer to Netlink header
>>> + * @size: size of the extra header that we want to put
>>> + *
>>> + * Returns pointer to the start of the extended header  */ static 
>>> +void * virNetDevPutExtraHeader(struct nlmsghdr *nlh,
>>> +                        size_t size) {
>>> +    char *ptr = (char *)nlh + nlh->nlmsg_len;
>>> +    size_t len = NLMSG_ALIGN(size);
>>> +    nlh->nlmsg_len += len;
>>> +    memset(ptr, 0, len);
>>
>> [**]
>> I'm fairly confident this memset() is unnecessary. The buffer the 
>> "extra header" is being added to is allocated with 
>> nlmsg_alloc_simple(); I looked at the source for nlmsg_alloc_simple(), 
>> and it ends up calling calloc(), which will initialize everything to 0 
>> anyway (and any important fields are set to some other value 
>> immediately after return from virNetDevPutExtraHeader()). The reason 
>> the extra memset() bothers me is that 1) we don't set memory allocated 
>> for netlink messages to 0 anywhere else in our netlink-related code so 
>> it's inconsistent, and 2) having an extra memset(0) when it's not 
>> needed could end up becoming the start of a "cargo cult" practice of 
>> calling memset(0) all over the place just because we're paranoid - I'd 
>> rather avoid extra initialization if it's redundant/superfluous.
>>
>> I of course wouldn't want this to be pushed with such a change without 
>> having it tested. I did make this change locally and ran it under gdb 
>> with a breakpoint on virNetDevPutExtraHeader(), and saw that the 
>> entire
>> 4096 bytes of *nl_msg is initialized to 0; it's up to you whether or 
>> not you think it's necessary to re-test on a system that has a card 
>> that support switchdev.
>>
>> (alternately, John could push it as is, and I could send a separate 
>> patch removing the memset(), which could be ACKed by Edan after he 
>> tries it (or NACKed in the event that I'm completely wrong about it 
>> :-)
>>
> 
> I saw this on my initial review and thought perhaps it was superfluous.
> Tracing through various code to find the original calloc is impressive!
> I didn't originally do anything with it because I didn't think it would hurt, but I agree with you.
> 
> So I've removed it from my local copy of the patch that I'll use to push on Monday. Perhaps by then Edan will see these messages and be able to indicate one way or another whether it should be kept from a "retest"
> POV.  Besides, if anything breaks in one of the many CI builds, I don't want to spent weekend hours trying to clean up after it.
> 
> 
> John
> 
>>> +    return ptr;
>>> +}
>>
>> I *was* going to say this doesn't need to be inside HAVE_DECL_DEVLINK, 
>> but since it's defined as static, and only used by a function that is 
>> itself inside HAVE_DECL_DEVLINK, this placement is correct.
>>
>>
>>> +
>>> +
>>> +/**
>>> + * virNetDevGetFamilyId:
>>> + * This function supplies the devlink family id
>>> + *
>>> + * @family_name: the name of the family to query
>>> + *
>>> + * Returns family id or 0 on failure.
>>> + */
>>> +static uint32_t
>>> +virNetDevGetFamilyId(const char *family_name) {
>>> +    struct nl_msg *nl_msg = NULL;
>>> +    struct nlmsghdr *resp = NULL;
>>> +    struct genlmsghdr* gmsgh = NULL;
>>> +    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
>>> +    unsigned int recvbuflen;
>>> +    uint32_t family_id = 0;
>>> +
>>> +    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
>>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>>> +        virReportOOMError();
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>>> +        goto cleanup;
>>> +
>>> +    gmsgh->cmd = CTRL_CMD_GETFAMILY;
>>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>>> +
>>> +    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
>>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>>> +                       _("allocated netlink buffer is too small"));
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>>> +        goto cleanup;
>>> +
>>> +    if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
>>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>>> +                       _("malformed netlink response message"));
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
>>> +        goto cleanup;
>>> +
>>> +    family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
>>> +
>>> + cleanup:
>>> +    nlmsg_free(nl_msg);
>>> +    VIR_FREE(resp);
>>> +    return family_id;
>>> +}
>>> +
>>> +
>>> +/**
>>> + * virNetDevSwitchdevFeature
>>> + * This function checks for the availability of Switchdev feature
>>> + * and add it to bitmap
>>> + *
>>> + * @ifname: name of the interface
>>> + * @out: add Switchdev feature if exist to bitmap
>>> + *
>>> + * Returns 0 on success, -1 on failure.
>>> + */
>>> +static int
>>> +virNetDevSwitchdevFeature(const char *ifname,
>>> +                          virBitmapPtr *out) {
>>> +    struct nl_msg *nl_msg = NULL;
>>> +    struct nlmsghdr *resp = NULL;
>>> +    unsigned int recvbuflen;
>>> +    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
>>> +    virPCIDevicePtr pci_device_ptr = NULL;
>>> +    struct genlmsghdr* gmsgh = NULL;
>>> +    const char *pci_name;
>>> +    char *pfname = NULL;
>>> +    int is_vf = -1;
>>> +    int ret = -1;
>>> +    uint32_t family_id;
>>> +
>>> +    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
>>> +        return ret;
>>> +
>>> +    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
>>> +        return ret;
>>> +
>>> +    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
>>> +        goto cleanup;
>>> +
>>> +    if (!(nl_msg = nlmsg_alloc_simple(family_id,
>>> +                                      NLM_F_REQUEST | NLM_F_ACK))) {
>>> +        virReportOOMError();
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
>>> +        goto cleanup;
>>> +
>>> +    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
>>> +    gmsgh->version = DEVLINK_GENL_VERSION;
>>> +
>>> +    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
>>> +                              virNetDevGetPCIDevice(ifname);
>>> +    if (pci_device_ptr == NULL)
>>> +        goto cleanup;
>>> +
>>> +    pci_name = virPCIDeviceGetName(pci_device_ptr);
>>> +
>>> +    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
>>> +        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
>>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>>> +                       _("allocated netlink buffer is too small"));
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
>>> +        goto cleanup;
>>> +
>>> +    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
>>> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>>> +                       _("malformed netlink response message"));
>>> +        goto cleanup;
>>> +    }
>>> +
>>> +    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
>>> +        *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
>>> +        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
>>> +    }
>>
>>
>> I won't pretend that I know all about these particular netlink 
>> messages/attributes. I can say that the code all looks similar to our 
>> other netlink code, and when I run it on my host that doesn't have 
>> cards supporting switchdev, it does execute this code and reports no 
>> switchdev capability in the nodedev-dumpxml output of my NICs; since 
>> I'm sure Edan has done the same test on a system that *does* have 
>> switchdev-capable NICs, I'd say that's adequate testing.
>>
>> Based on that I give my ACK (with at least the two trivial [**] items 
>> changed, preferably also memset() removed), or if you prefer:
>>
>> Reviewed-by: Laine Stump <laine@laine.org>
>>
>>
>>
>>> +
>>> +    ret = 0;
>>> +
>>> + cleanup:
>>> +    nlmsg_free(nl_msg);
>>> +    virPCIDeviceFree(pci_device_ptr);
>>> +    VIR_FREE(resp);
>>> +    VIR_FREE(pfname);
>>> +    return ret;
>>> +}
>>> +#else
>>> +static int
>>> +virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
>>> +                          virBitmapPtr *out ATTRIBUTE_UNUSED) {
>>> +    return 0;
>>> +}
>>> +#endif
>>> +
>>> +
>>>  # if HAVE_DECL_ETHTOOL_GFEATURES
>>>  /**
>>>   * virNetDevGFeatureAvailable
>>> @@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
>>>      if (virNetDevRDMAFeature(ifname, out) < 0)
>>>          goto cleanup;
>>>  
>>> +    if (virNetDevSwitchdevFeature(ifname, out) < 0)
>>> +        goto cleanup;
>>> +
>>>      ret = 0;
>>>   cleanup:
>>>      VIR_FORCE_CLOSE(fd);
>>> diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index 
>>> 9205c0e..71eaf45 100644
>>> --- a/src/util/virnetdev.h
>>> +++ b/src/util/virnetdev.h
>>> @@ -112,6 +112,7 @@ typedef enum {
>>>      VIR_NET_DEV_FEAT_RXHASH,
>>>      VIR_NET_DEV_FEAT_RDMA,
>>>      VIR_NET_DEV_FEAT_TXUDPTNL,
>>> +    VIR_NET_DEV_FEAT_SWITCHDEV,
>>>      VIR_NET_DEV_FEAT_LAST
>>>  } virNetDevFeature;
>>>  
>>> diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml 
>>> b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>>> index d4c96e8..88252e6 100644
>>> --- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>>> +++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
>>> @@ -15,6 +15,7 @@
>>>      <feature name='rxhash'/>
>>>      <feature name='rdma'/>
>>>      <feature name='txudptnl'/>
>>> +    <feature name='switchdev'/>
>>>      <capability type='80211'/>
>>>    </capability>
>>>  </device>
>>> diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml 
>>> b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>>> index 71bf90e..f77dfcc 100644
>>> --- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>>> +++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
>>> @@ -15,6 +15,7 @@
>>>      <feature name='rxhash'/>
>>>      <feature name='rdma'/>
>>>      <feature name='txudptnl'/>
>>> +    <feature name='switchdev'/>
>>>      <capability type='80203'/>
>>>    </capability>
>>>  </device>
>>
>>

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by Ján Tomko 6 years, 7 months ago
On Thu, Sep 14, 2017 at 11:23:02AM -0400, Laine Stump wrote:
>(Almost all of my comments result in "ok, no action needed". There are
>just three items I would like to see changed (2 trivial, 1 also small
>but Edan or John may think it prudent to re-test with the change before
>pushing) - I marked those comments with [**].
>
>On 08/21/2017 05:19 AM, Edan David wrote:
>> Adding functionality to libvirt that will allow querying the interface
>> for the availability of switchdev Offloading NIC capabilities.
>> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
>> command to retrieve the swtichdev NIC feature,
>> Command example:  devlink dev eswitch show pci/0000:03:00.0
>> This feature is needed for Openstack so we can do a scheduling decision
>> if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
>> And select the appropriate hypervisors with the requested capability see [1].
>>
>> [1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
>> ---
>>  configure.ac                                      |  13 ++
>>  docs/formatnode.html.in                           |   1 +
>>  src/util/virnetdev.c                              | 187 +++++++++++++++++++++-
>>  src/util/virnetdev.h                              |   1 +
>>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>>  6 files changed, 203 insertions(+), 1 deletion(-)
>>

This patch fails to compile on a Gentoo machine with
sys-kernel/linux-headers-4.8:

util/virnetdev.c:3248:18: error: use of undeclared identifier 'DEVLINK_CMD_ESWITCH_GET'; did you mean 'DEVLINK_CMD_ESWITCH_MODE_GET'?
    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
                 ^~~~~~~~~~~~~~~~~~~~~~~
                 DEVLINK_CMD_ESWITCH_MODE_GET
/usr/include/linux/devlink.h:60:2: note: 'DEVLINK_CMD_ESWITCH_MODE_GET' declared here
        DEVLINK_CMD_ESWITCH_MODE_GET,
        ^
1 error generated.

>> diff --git a/configure.ac b/configure.ac
>> index b12b7fa..c089798 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>>      AC_CHECK_HEADERS([linux/btrfs.h])
>>  fi
>>
>> +dnl
>> +dnl check for kernel headers required by devlink
>> +dnl
>> +if test "$with_linux" = "yes"; then
>> +    AC_CHECK_HEADERS([linux/devlink.h])
>> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],
>
>That's very ..... thorough, and potentially misleading if someone ever
>wanted to use devlink to check for something other than switchdev (e.g.
>[mythical feature X]) on some system that didn't have the proper stuff
>defined for switchdev, but did have it for [other mythical feature X].
>But that's all just hypothetical, so this is fine with me.
>
>
>> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
>> +                              [1],
>> +                              [whether devlink declarations is available])],
>
>[**]
>s/is/are/
>

It seems the [action-if-found] is executed for every found symbol:

#define HAVE_DECL_DEVLINK_GENL_VERSION 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_GENL_NAME 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_ATTR_MAX 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_CMD_ESWITCH_GET 0
#define HAVE_DECL_DEVLINK_ATTR_BUS_NAME 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_ATTR_DEV_NAME 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_ATTR_ESWITCH_MODE 1
#define HAVE_DECL_DEVLINK 1
#define HAVE_DECL_DEVLINK_ESWITCH_MODE_SWITCHDEV 1
#define HAVE_DECL_DEVLINK 1

so this usage of AC_CHECK_DECLS is bogus.

Jan

>> +                   [],
>> +                   [[#include <linux/devlink.h>]])
>> +fi
>> +
>>  dnl Allow perl/python overrides
>>  AC_PATH_PROGS([PYTHON], [python2 python])
>>  if test -z "$PYTHON"; then
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCHv2] nodedev: add switchdev to NIC capabilities
Posted by John Ferlan 6 years, 7 months ago

On 09/18/2017 10:57 AM, Ján Tomko wrote:
> On Thu, Sep 14, 2017 at 11:23:02AM -0400, Laine Stump wrote:
>> (Almost all of my comments result in "ok, no action needed". There are
>> just three items I would like to see changed (2 trivial, 1 also small
>> but Edan or John may think it prudent to re-test with the change before
>> pushing) - I marked those comments with [**].
>>
>> On 08/21/2017 05:19 AM, Edan David wrote:
>>> Adding functionality to libvirt that will allow querying the interface
>>> for the availability of switchdev Offloading NIC capabilities.
>>> The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
>>> command to retrieve the swtichdev NIC feature,
>>> Command example:  devlink dev eswitch show pci/0000:03:00.0
>>> This feature is needed for Openstack so we can do a scheduling decision
>>> if the NIC is in Hardware Offload (switchdev) or regular SR-IOV
>>> (legacy) mode.
>>> And select the appropriate hypervisors with the requested capability
>>> see [1].
>>>
>>> [1] -
>>> https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html
>>>
>>> ---
>>>  configure.ac                                      |  13 ++
>>>  docs/formatnode.html.in                           |   1 +
>>>  src/util/virnetdev.c                              | 187
>>> +++++++++++++++++++++-
>>>  src/util/virnetdev.h                              |   1 +
>>>  tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml |   1 +
>>>  tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml |   1 +
>>>  6 files changed, 203 insertions(+), 1 deletion(-)
>>>
> 
> This patch fails to compile on a Gentoo machine with
> sys-kernel/linux-headers-4.8:
> 
> util/virnetdev.c:3248:18: error: use of undeclared identifier
> 'DEVLINK_CMD_ESWITCH_GET'; did you mean 'DEVLINK_CMD_ESWITCH_MODE_GET'?
>    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
>                 ^~~~~~~~~~~~~~~~~~~~~~~
>                 DEVLINK_CMD_ESWITCH_MODE_GET
> /usr/include/linux/devlink.h:60:2: note: 'DEVLINK_CMD_ESWITCH_MODE_GET'
> declared here
>        DEVLINK_CMD_ESWITCH_MODE_GET,
>        ^
> 1 error generated.
> 
>>> diff --git a/configure.ac b/configure.ac
>>> index b12b7fa..c089798 100644
>>> --- a/configure.ac
>>> +++ b/configure.ac
>>> @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
>>>      AC_CHECK_HEADERS([linux/btrfs.h])
>>>  fi
>>>
>>> +dnl
>>> +dnl check for kernel headers required by devlink
>>> +dnl
>>> +if test "$with_linux" = "yes"; then
>>> +    AC_CHECK_HEADERS([linux/devlink.h])
>>> +    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME,
>>> DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME,
>>> DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE,
>>> DEVLINK_ESWITCH_MODE_SWITCHDEV],
>>
>> That's very ..... thorough, and potentially misleading if someone ever
>> wanted to use devlink to check for something other than switchdev (e.g.
>> [mythical feature X]) on some system that didn't have the proper stuff
>> defined for switchdev, but did have it for [other mythical feature X].
>> But that's all just hypothetical, so this is fine with me.
>>
>>
>>> +                   [AC_DEFINE([HAVE_DECL_DEVLINK],
>>> +                              [1],
>>> +                              [whether devlink declarations is
>>> available])],
>>
>> [**]
>> s/is/are/
>>
> 
> It seems the [action-if-found] is executed for every found symbol:
> 
> #define HAVE_DECL_DEVLINK_GENL_VERSION 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_GENL_NAME 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_ATTR_MAX 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_CMD_ESWITCH_GET 0
> #define HAVE_DECL_DEVLINK_ATTR_BUS_NAME 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_ATTR_DEV_NAME 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_ATTR_ESWITCH_MODE 1
> #define HAVE_DECL_DEVLINK 1
> #define HAVE_DECL_DEVLINK_ESWITCH_MODE_SWITCHDEV 1
> #define HAVE_DECL_DEVLINK 1
> 
> so this usage of AC_CHECK_DECLS is bogus.
> 
> Jan
> 

So then the question becomes is it "safe enough" to only look for
HAVE_DECL_DEVLINK_CMD_ESWITCH_GET and assume the others are present or
does there have to be some sort of AC_CHECK_DECLS for every one of the
symbols that are going to be used and then some way to only allow that
code to be compiled if all are true?

I'd say the former - just check for CMD_ESWITCH_GET and assume the
others, but you know what can be said about assumptions... And this is
why I didn't want to push this on Saturday!  bad enough that I've been
in and out of the office today ;-)

John

>>> +                   [],
>>> +                   [[#include <linux/devlink.h>]])
>>> +fi
>>> +
>>>  dnl Allow perl/python overrides
>>>  AC_PATH_PROGS([PYTHON], [python2 python])
>>>  if test -z "$PYTHON"; then
> 
> 
> --
> libvir-list mailing list
> libvir-list@redhat.com
> https://www.redhat.com/mailman/listinfo/libvir-list
> 

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list