[RFC net-next v2 1/9] net: napi: Add napi_storage

Joe Damato posted 9 patches 2 months, 3 weeks ago
There is a newer version of this series
[RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 3 weeks ago
Add a persistent NAPI storage area for NAPI configuration to the core.
Drivers opt-in to setting the storage for a NAPI by passing an index
when calling netif_napi_add_storage.

napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted), and set to 0 when napi_enable is called.

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 .../networking/net_cachelines/net_device.rst  |  1 +
 include/linux/netdevice.h                     | 34 +++++++++++++++++++
 net/core/dev.c                                | 18 +++++++++-
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 22b07c814f4a..a82751c88d18 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -106,6 +106,7 @@ rx_handler_func_t*                  rx_handler              read_mostly
 void*                               rx_handler_data         read_mostly         -                   
 struct_netdev_queue*                ingress_queue           read_mostly         -                   
 struct_bpf_mprog_entry              tcx_ingress             -                   read_mostly         sch_handle_ingress
+struct napi_storage*                napi_storage            -                   read_mostly         napi_complete_done
 struct_nf_hook_entries*             nf_hooks_ingress                                                
 unsigned_char                       broadcast[32]                                                   
 struct_cpu_rmap*                    rx_cpu_rmap                                                     
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b47c00657bd0..54da1c800e65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,14 @@ struct gro_list {
  */
 #define GRO_HASH_BUCKETS	8
 
+/*
+ * Structure for per-NAPI storage
+ */
+struct napi_storage {
+	u64 gro_flush_timeout;
+	u32 defer_hard_irqs;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -377,6 +385,8 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	int			irq;
+	int			index;
+	struct napi_storage	*napi_storage;
 };
 
 enum {
@@ -2009,6 +2019,9 @@ enum netdev_reg_state {
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
  *
+ *	@napi_storage: An array of napi_storage structures containing per-NAPI
+ *		       settings.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2087,6 +2100,7 @@ struct net_device {
 #ifdef CONFIG_NET_XGRESS
 	struct bpf_mprog_entry __rcu *tcx_ingress;
 #endif
+	struct napi_storage	*napi_storage;
 	__cacheline_group_end(net_device_read_rx);
 
 	char			name[IFNAMSIZ];
@@ -2648,6 +2662,24 @@ netif_napi_add_tx_weight(struct net_device *dev,
 	netif_napi_add_weight(dev, napi, poll, weight);
 }
 
+/**
+ * netif_napi_add_storage - initialize a NAPI context and set storage area
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: the poll weight of this NAPI
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
+		       int (*poll)(struct napi_struct *, int), int weight,
+		       int index)
+{
+	napi->index = index;
+	napi->napi_storage = &dev->napi_storage[index];
+	netif_napi_add_weight(dev, napi, poll, weight);
+}
+
 /**
  * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
  * @dev:  network device
@@ -2683,6 +2715,8 @@ void __netif_napi_del(struct napi_struct *napi);
  */
 static inline void netif_napi_del(struct napi_struct *napi)
 {
+	napi->napi_storage = NULL;
+	napi->index = -1;
 	__netif_napi_del(napi);
 	synchronize_net();
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 22c3f14d9287..ca90e8cab121 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
 		if (n->dev->threaded && n->thread)
 			new |= NAPIF_STATE_THREADED;
 	} while (!try_cmpxchg(&n->state, &val, new));
+
+	if (n->napi_storage)
+		memset(n->napi_storage, 0, sizeof(*n->napi_storage));
 }
 EXPORT_SYMBOL(napi_enable);
 
@@ -11054,6 +11057,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
+	size_t napi_storage_sz;
+	unsigned int maxqs;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -11067,6 +11072,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
+	WARN_ON_ONCE(txqs != rxqs);
+	maxqs = max(txqs, rxqs);
+
 	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
 		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 	if (!dev)
@@ -11141,6 +11149,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (!dev->ethtool)
 		goto free_all;
 
+	napi_storage_sz = array_size(maxqs, sizeof(*dev->napi_storage));
+	dev->napi_storage = kvzalloc(napi_storage_sz, GFP_KERNEL_ACCOUNT);
+	if (!dev->napi_storage)
+		goto free_all;
+
 	strscpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
 	dev->group = INIT_NETDEV_GROUP;
@@ -11202,6 +11215,8 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	kvfree(dev->napi_storage);
+
 	ref_tracker_dir_exit(&dev->refcnt_tracker);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
@@ -11979,7 +11994,8 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_storage);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 112);
 }
 
 /*
-- 
2.25.1
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Jakub Kicinski 2 months, 3 weeks ago
On Sun,  8 Sep 2024 16:06:35 +0000 Joe Damato wrote:
> Add a persistent NAPI storage area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
> 
> napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.

>  enum {
> @@ -2009,6 +2019,9 @@ enum netdev_reg_state {
>   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
>   *		   where the clock is recovered.
>   *
> + *	@napi_storage: An array of napi_storage structures containing per-NAPI
> + *		       settings.

FWIW you can use inline kdoc, with the size of the struct it's easier
to find it. Also this doesn't need to be accessed from fastpath so you
can move it down.

> +/**
> + * netif_napi_add_storage - initialize a NAPI context and set storage area
> + * @dev: network device
> + * @napi: NAPI context
> + * @poll: polling function
> + * @weight: the poll weight of this NAPI
> + * @index: the NAPI index
> + */
> +static inline void
> +netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
> +		       int (*poll)(struct napi_struct *, int), int weight,
> +		       int index)
> +{
> +	napi->index = index;
> +	napi->napi_storage = &dev->napi_storage[index];
> +	netif_napi_add_weight(dev, napi, poll, weight);

You can drop the weight param, just pass NAPI_POLL_WEIGHT.

Then -- change netif_napi_add_weight() to prevent if from
calling napi_hash_add() if it has index >= 0

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 22c3f14d9287..ca90e8cab121 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
>  		if (n->dev->threaded && n->thread)
>  			new |= NAPIF_STATE_THREADED;
>  	} while (!try_cmpxchg(&n->state, &val, new));
> +
> +	if (n->napi_storage)
> +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));

And here inherit the settings and the NAPI ID from storage, then call
napi_hash_add(). napi_hash_add() will need a minor diff to use the
existing ID if already assigned.

And the inverse of that has to happen in napi_disable() (unhash, save
settings to storage), and __netif_napi_del() (don't unhash if it has
index).

I think that should work?
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 2 weeks ago
On Mon, Sep 09, 2024 at 04:40:39PM -0700, Jakub Kicinski wrote:
> On Sun,  8 Sep 2024 16:06:35 +0000 Joe Damato wrote:
> > Add a persistent NAPI storage area for NAPI configuration to the core.
> > Drivers opt-in to setting the storage for a NAPI by passing an index
> > when calling netif_napi_add_storage.
> > 
> > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> 
> >  enum {
> > @@ -2009,6 +2019,9 @@ enum netdev_reg_state {
> >   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
> >   *		   where the clock is recovered.
> >   *
> > + *	@napi_storage: An array of napi_storage structures containing per-NAPI
> > + *		       settings.
> 
> FWIW you can use inline kdoc, with the size of the struct it's easier
> to find it. Also this doesn't need to be accessed from fastpath so you
> can move it down.
> 
> > +/**
> > + * netif_napi_add_storage - initialize a NAPI context and set storage area
> > + * @dev: network device
> > + * @napi: NAPI context
> > + * @poll: polling function
> > + * @weight: the poll weight of this NAPI
> > + * @index: the NAPI index
> > + */
> > +static inline void
> > +netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
> > +		       int (*poll)(struct napi_struct *, int), int weight,
> > +		       int index)
> > +{
> > +	napi->index = index;
> > +	napi->napi_storage = &dev->napi_storage[index];
> > +	netif_napi_add_weight(dev, napi, poll, weight);
> 
> You can drop the weight param, just pass NAPI_POLL_WEIGHT.
> 
> Then -- change netif_napi_add_weight() to prevent if from
> calling napi_hash_add() if it has index >= 0
> 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 22c3f14d9287..ca90e8cab121 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> >  		if (n->dev->threaded && n->thread)
> >  			new |= NAPIF_STATE_THREADED;
> >  	} while (!try_cmpxchg(&n->state, &val, new));
> > +
> > +	if (n->napi_storage)
> > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));
> 
> And here inherit the settings and the NAPI ID from storage, then call
> napi_hash_add(). napi_hash_add() will need a minor diff to use the
> existing ID if already assigned.
> 
> And the inverse of that has to happen in napi_disable() (unhash, save
> settings to storage), and __netif_napi_del() (don't unhash if it has
> index).
> 
> I think that should work?

I made the changes you suggested above yesterday and I had also
renamed the struct to napi_config because I also liked that better
than storage.

I'll update the code to put the values in the napi_struct and copy
them over as you suggested in your other message.

That said: the copying thing is more of an optimization, so the
changes I have should be almost (?) working and adding that copying
of the values into the napi_struct should only be a performance
thing and not a functionality/correctness thing.

I say that because there's still a bug I'm trying to work out with
mlx5 and it's almost certainly in the codepath Stanislav pointed out
in his messages [1] [2]. Haven't had much time to debug it just yet,
but I am hoping to be able to debug it and submit another RFC before
the end of this week.

FWIW: I too have fallen down this code path in mlx5 in the past for
other reasons. It appears it is time to fall down it again.

[1]: https://lore.kernel.org/netdev/Ztjv-dgNFwFBnXwd@mini-arch/
[2]: https://lore.kernel.org/netdev/Zt94tXG_lzGLWo1w@mini-arch/
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 3 weeks ago
On Mon, Sep 09, 2024 at 04:40:39PM -0700, Jakub Kicinski wrote:
> On Sun,  8 Sep 2024 16:06:35 +0000 Joe Damato wrote:
> > Add a persistent NAPI storage area for NAPI configuration to the core.
> > Drivers opt-in to setting the storage for a NAPI by passing an index
> > when calling netif_napi_add_storage.
> > 
> > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> 
> >  enum {
> > @@ -2009,6 +2019,9 @@ enum netdev_reg_state {
> >   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
> >   *		   where the clock is recovered.
> >   *
> > + *	@napi_storage: An array of napi_storage structures containing per-NAPI
> > + *		       settings.
> 
> FWIW you can use inline kdoc, with the size of the struct it's easier
> to find it. Also this doesn't need to be accessed from fastpath so you
> can move it down.

OK. I figured since it was being deref'd in napi_complete_done
(where we previously read napi_defer_hard_irqs and
gro_flush_timeout) it needed to be in the fast path.

I'll move it down for the next RFC.

> > +/**
> > + * netif_napi_add_storage - initialize a NAPI context and set storage area
> > + * @dev: network device
> > + * @napi: NAPI context
> > + * @poll: polling function
> > + * @weight: the poll weight of this NAPI
> > + * @index: the NAPI index
> > + */
> > +static inline void
> > +netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
> > +		       int (*poll)(struct napi_struct *, int), int weight,
> > +		       int index)
> > +{
> > +	napi->index = index;
> > +	napi->napi_storage = &dev->napi_storage[index];
> > +	netif_napi_add_weight(dev, napi, poll, weight);
> 
> You can drop the weight param, just pass NAPI_POLL_WEIGHT.

OK will do.

> Then -- change netif_napi_add_weight() to prevent if from
> calling napi_hash_add() if it has index >= 0

OK.

> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 22c3f14d9287..ca90e8cab121 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> >  		if (n->dev->threaded && n->thread)
> >  			new |= NAPIF_STATE_THREADED;
> >  	} while (!try_cmpxchg(&n->state, &val, new));
> > +
> > +	if (n->napi_storage)
> > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));

OK, your comments below will probably make more sense to me after I
try implementing it, but I'll definitely have some questions.

> And here inherit the settings and the NAPI ID from storage, then call
> napi_hash_add(). napi_hash_add() will need a minor diff to use the
> existing ID if already assigned.

I don't think I realized we settled on the NAPI ID being persistent.
I'm not opposed to that, I just think I missed that part in the
previous conversation.

I'll give it a shot and see what the next RFC looks like.

> And the inverse of that has to happen in napi_disable() (unhash, save
> settings to storage), and __netif_napi_del() (don't unhash if it has
> index).
> 
> I think that should work?

Only one way to find out ;)

Separately: your comment about documenting rings to NAPIs... I am
not following that bit.

Is that a thing you meant should be documented for driver writers to
follow to reduce churn ?
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Jakub Kicinski 2 months, 3 weeks ago
On Tue, 10 Sep 2024 08:13:51 +0200 Joe Damato wrote:
> On Mon, Sep 09, 2024 at 04:40:39PM -0700, Jakub Kicinski wrote:
> > On Sun,  8 Sep 2024 16:06:35 +0000 Joe Damato wrote:  
> > > Add a persistent NAPI storage area for NAPI configuration to the core.
> > > Drivers opt-in to setting the storage for a NAPI by passing an index
> > > when calling netif_napi_add_storage.
> > > 
> > > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > > (after the NAPIs are deleted), and set to 0 when napi_enable is called.  
> >   
> > >  enum {
> > > @@ -2009,6 +2019,9 @@ enum netdev_reg_state {
> > >   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
> > >   *		   where the clock is recovered.
> > >   *
> > > + *	@napi_storage: An array of napi_storage structures containing per-NAPI
> > > + *		       settings.  
> > 
> > FWIW you can use inline kdoc, with the size of the struct it's easier
> > to find it. Also this doesn't need to be accessed from fastpath so you
> > can move it down.  
> 
> OK. I figured since it was being deref'd in napi_complete_done
> (where we previously read napi_defer_hard_irqs and
> gro_flush_timeout) it needed to be in the fast path.
> 
> I'll move it down for the next RFC.

Hm, fair point. In my mind I expected we still add the fast path fields
to NAPI instances. And the storage would only be there to stash that
information for the period of time when real NAPI instances are not
present (napi_disable() -> napi_enable() cycles).

But looking at napi_struct, all the cachelines seem full, anyway, so we
can as well split the info. No strong preference, feel free to keep as
is, then. But maybe rename from napi_storage to napi_config or such?

> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 22c3f14d9287..ca90e8cab121 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> > >  		if (n->dev->threaded && n->thread)
> > >  			new |= NAPIF_STATE_THREADED;
> > >  	} while (!try_cmpxchg(&n->state, &val, new));
> > > +
> > > +	if (n->napi_storage)
> > > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));  
> 
> OK, your comments below will probably make more sense to me after I
> try implementing it, but I'll definitely have some questions.
> 
> > And here inherit the settings and the NAPI ID from storage, then call
> > napi_hash_add(). napi_hash_add() will need a minor diff to use the
> > existing ID if already assigned.  
> 
> I don't think I realized we settled on the NAPI ID being persistent.
> I'm not opposed to that, I just think I missed that part in the
> previous conversation.
> 
> I'll give it a shot and see what the next RFC looks like.

The main reason to try to make NAPI ID persistent from the start is that
if it works we don't have to add index to the uAPI. I don't feel
strongly about it, if you or anyone else has arguments against / why
it won't work.

> > And the inverse of that has to happen in napi_disable() (unhash, save
> > settings to storage), and __netif_napi_del() (don't unhash if it has
> > index).
> > 
> > I think that should work?  
> 
> Only one way to find out ;)
> 
> Separately: your comment about documenting rings to NAPIs... I am
> not following that bit.
> 
> Is that a thing you meant should be documented for driver writers to
> follow to reduce churn ?

Which comment?
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 3 weeks ago
On Tue, Sep 10, 2024 at 07:52:17AM -0700, Jakub Kicinski wrote:
> On Tue, 10 Sep 2024 08:13:51 +0200 Joe Damato wrote:
> > On Mon, Sep 09, 2024 at 04:40:39PM -0700, Jakub Kicinski wrote:
> > > On Sun,  8 Sep 2024 16:06:35 +0000 Joe Damato wrote:  
> > > > Add a persistent NAPI storage area for NAPI configuration to the core.
> > > > Drivers opt-in to setting the storage for a NAPI by passing an index
> > > > when calling netif_napi_add_storage.
> > > > 
> > > > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > > > (after the NAPIs are deleted), and set to 0 when napi_enable is called.  
> > >   
> > > >  enum {
> > > > @@ -2009,6 +2019,9 @@ enum netdev_reg_state {
> > > >   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
> > > >   *		   where the clock is recovered.
> > > >   *
> > > > + *	@napi_storage: An array of napi_storage structures containing per-NAPI
> > > > + *		       settings.  
> > > 
> > > FWIW you can use inline kdoc, with the size of the struct it's easier
> > > to find it. Also this doesn't need to be accessed from fastpath so you
> > > can move it down.  
> > 
> > OK. I figured since it was being deref'd in napi_complete_done
> > (where we previously read napi_defer_hard_irqs and
> > gro_flush_timeout) it needed to be in the fast path.
> > 
> > I'll move it down for the next RFC.
> 
> Hm, fair point. In my mind I expected we still add the fast path fields
> to NAPI instances. And the storage would only be there to stash that
> information for the period of time when real NAPI instances are not
> present (napi_disable() -> napi_enable() cycles).

I see, I hadn't understood that part. It sounds like you were
thinking we'd stash the values in between whereas I thought we were
reading from the struct directly (hence the implementation of the
static inline wrappers).

I don't really have a preference one way or the other.

> But looking at napi_struct, all the cachelines seem full, anyway, so we
> can as well split the info. No strong preference, feel free to keep as
> is, then. But maybe rename from napi_storage to napi_config or such?

I can give that a shot for the next RFC and see how it looks. We can
always duplicate the fields in napi_struct and napi_config and copy
them over as you suggested above.

> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 22c3f14d9287..ca90e8cab121 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> > > >  		if (n->dev->threaded && n->thread)
> > > >  			new |= NAPIF_STATE_THREADED;
> > > >  	} while (!try_cmpxchg(&n->state, &val, new));
> > > > +
> > > > +	if (n->napi_storage)
> > > > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));  
> > 
> > OK, your comments below will probably make more sense to me after I
> > try implementing it, but I'll definitely have some questions.
> > 
> > > And here inherit the settings and the NAPI ID from storage, then call
> > > napi_hash_add(). napi_hash_add() will need a minor diff to use the
> > > existing ID if already assigned.  
> > 
> > I don't think I realized we settled on the NAPI ID being persistent.
> > I'm not opposed to that, I just think I missed that part in the
> > previous conversation.
> > 
> > I'll give it a shot and see what the next RFC looks like.
> 
> The main reason to try to make NAPI ID persistent from the start is that
> if it works we don't have to add index to the uAPI. I don't feel
> strongly about it, if you or anyone else has arguments against / why
> it won't work.

Yea, I think not exposing the index in the uAPI is probably a good
idea? Making the NAPI IDs persistent let's us avoid that so I can
give that a shot because it's easier from the user app perspective,
IMO.

> > > And the inverse of that has to happen in napi_disable() (unhash, save
> > > settings to storage), and __netif_napi_del() (don't unhash if it has
> > > index).
> > > 
> > > I think that should work?  
> > 
> > Only one way to find out ;)
> > 
> > Separately: your comment about documenting rings to NAPIs... I am
> > not following that bit.
> > 
> > Is that a thing you meant should be documented for driver writers to
> > follow to reduce churn ?
> 
> Which comment?

In this message:

https://lore.kernel.org/netdev/20240903124008.4793c087@kernel.org/

You mentioned this, which I interpreted as a thing that core needs
to solve for, but perhaps this intended as advice for drivers?

  Maybe it's enough to document how rings are distributed to NAPIs?
  
  First set of NAPIs should get allocated to the combined channels,
  then for remaining rx- and tx-only NAPIs they should be interleaved
  starting with rx?
  
  Example, asymmetric config: combined + some extra tx:
  
      combined        tx
   [0..#combined-1] [#combined..#combined+#tx-1]
  
  Split rx / tx - interleave:
  
   [0 rx0] [1 tx0] [2 rx1] [3 tx1] [4 rx2] [5 tx2] ...
  
  This would limit the churn when changing channel counts.
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Jakub Kicinski 2 months, 2 weeks ago
On Tue, 10 Sep 2024 18:10:42 +0200 Joe Damato wrote:
> On Tue, Sep 10, 2024 at 07:52:17AM -0700, Jakub Kicinski wrote:
> > Hm, fair point. In my mind I expected we still add the fast path fields
> > to NAPI instances. And the storage would only be there to stash that
> > information for the period of time when real NAPI instances are not
> > present (napi_disable() -> napi_enable() cycles).  
> 
> I see, I hadn't understood that part. It sounds like you were
> thinking we'd stash the values in between whereas I thought we were
> reading from the struct directly (hence the implementation of the
> static inline wrappers).
> 
> I don't really have a preference one way or the other.

Me neither. I suspect having the fields in napi_struct to be slightly
more optimal. No need for indirect accesses via napi->storage->$field,
and conditions to check if napi->storage is set. We can make sure we
populate the fields in NAPIs when they are created and when sysfs writes
happen. So slightly better fastpath locality at the expense of more
code on the control path keeping it in sync.

FWIW the more we discuss this the less I like the word "storage" :)
If you could sed -i 's/storage/config/' on the patches that'd great :)

> > > I don't think I realized we settled on the NAPI ID being persistent.
> > > I'm not opposed to that, I just think I missed that part in the
> > > previous conversation.
> > > 
> > > I'll give it a shot and see what the next RFC looks like.  
> > 
> > The main reason to try to make NAPI ID persistent from the start is that
> > if it works we don't have to add index to the uAPI. I don't feel
> > strongly about it, if you or anyone else has arguments against / why
> > it won't work.  
> 
> Yea, I think not exposing the index in the uAPI is probably a good
> idea? Making the NAPI IDs persistent let's us avoid that so I can
> give that a shot because it's easier from the user app perspective,
> IMO.

Right, basically we can always add it later. Removing it later won't
be possible :S

> > > Only one way to find out ;)
> > > 
> > > Separately: your comment about documenting rings to NAPIs... I am
> > > not following that bit.
> > > 
> > > Is that a thing you meant should be documented for driver writers to
> > > follow to reduce churn ?  
> > 
> > Which comment?  
> 
> In this message:
> 
> https://lore.kernel.org/netdev/20240903124008.4793c087@kernel.org/
> 
> You mentioned this, which I interpreted as a thing that core needs
> to solve for, but perhaps this intended as advice for drivers?
> 
>   Maybe it's enough to document how rings are distributed to NAPIs?
>   
>   First set of NAPIs should get allocated to the combined channels,
>   then for remaining rx- and tx-only NAPIs they should be interleaved
>   starting with rx?
>   
>   Example, asymmetric config: combined + some extra tx:
>   
>       combined        tx
>    [0..#combined-1] [#combined..#combined+#tx-1]
>   
>   Split rx / tx - interleave:
>   
>    [0 rx0] [1 tx0] [2 rx1] [3 tx1] [4 rx2] [5 tx2] ...
>   
>   This would limit the churn when changing channel counts.

Oh, yes. I'm not sure _where_ to document it. But if the driver supports
asymmetric ring count or dedicated Rx and Tx NAPIs - this is the
recommended way to distributing the rings to NAPIs, to, as you say,
limit the config churn / mismatch after ring count changes.
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 2 weeks ago
On Tue, Sep 10, 2024 at 05:10:48PM -0700, Jakub Kicinski wrote:
> On Tue, 10 Sep 2024 18:10:42 +0200 Joe Damato wrote:
> > On Tue, Sep 10, 2024 at 07:52:17AM -0700, Jakub Kicinski wrote:
> > > Hm, fair point. In my mind I expected we still add the fast path fields
> > > to NAPI instances. And the storage would only be there to stash that
> > > information for the period of time when real NAPI instances are not
> > > present (napi_disable() -> napi_enable() cycles).  
> > 
> > I see, I hadn't understood that part. It sounds like you were
> > thinking we'd stash the values in between whereas I thought we were
> > reading from the struct directly (hence the implementation of the
> > static inline wrappers).
> > 
> > I don't really have a preference one way or the other.
> 
> Me neither. I suspect having the fields in napi_struct to be slightly
> more optimal. No need for indirect accesses via napi->storage->$field,
> and conditions to check if napi->storage is set. We can make sure we
> populate the fields in NAPIs when they are created and when sysfs writes
> happen. So slightly better fastpath locality at the expense of more
> code on the control path keeping it in sync.
> 
> FWIW the more we discuss this the less I like the word "storage" :)
> If you could sed -i 's/storage/config/' on the patches that'd great :)

Yup, I actually did that yesterday. I also like config more.

Right now, I have:
  - struct napi_config
  - in the napi_struct its a struct napi_config *config
  - in the net_device its a struct napi_config *napi_config

I figured in the second case you are already de-refing through a
"napi_struct" so writing "napi->napi_config->..." seemed a bit wordy
compared to "napi->config->...".
 
> > > > I don't think I realized we settled on the NAPI ID being persistent.
> > > > I'm not opposed to that, I just think I missed that part in the
> > > > previous conversation.
> > > > 
> > > > I'll give it a shot and see what the next RFC looks like.  
> > > 
> > > The main reason to try to make NAPI ID persistent from the start is that
> > > if it works we don't have to add index to the uAPI. I don't feel
> > > strongly about it, if you or anyone else has arguments against / why
> > > it won't work.  
> > 
> > Yea, I think not exposing the index in the uAPI is probably a good
> > idea? Making the NAPI IDs persistent let's us avoid that so I can
> > give that a shot because it's easier from the user app perspective,
> > IMO.
> 
> Right, basically we can always add it later. Removing it later won't
> be possible :S

Yup, I totally get that.
 
> > > > Only one way to find out ;)
> > > > 
> > > > Separately: your comment about documenting rings to NAPIs... I am
> > > > not following that bit.
> > > > 
> > > > Is that a thing you meant should be documented for driver writers to
> > > > follow to reduce churn ?  
> > > 
> > > Which comment?  
> > 
> > In this message:
> > 
> > https://lore.kernel.org/netdev/20240903124008.4793c087@kernel.org/
> > 
> > You mentioned this, which I interpreted as a thing that core needs
> > to solve for, but perhaps this intended as advice for drivers?
> > 
> >   Maybe it's enough to document how rings are distributed to NAPIs?
> >   
> >   First set of NAPIs should get allocated to the combined channels,
> >   then for remaining rx- and tx-only NAPIs they should be interleaved
> >   starting with rx?
> >   
> >   Example, asymmetric config: combined + some extra tx:
> >   
> >       combined        tx
> >    [0..#combined-1] [#combined..#combined+#tx-1]
> >   
> >   Split rx / tx - interleave:
> >   
> >    [0 rx0] [1 tx0] [2 rx1] [3 tx1] [4 rx2] [5 tx2] ...
> >   
> >   This would limit the churn when changing channel counts.
> 
> Oh, yes. I'm not sure _where_ to document it. But if the driver supports
> asymmetric ring count or dedicated Rx and Tx NAPIs - this is the
> recommended way to distributing the rings to NAPIs, to, as you say,
> limit the config churn / mismatch after ring count changes.

OK, so that seems like it's related but separate from the changes I
am proposing via RFC, so I don't think I need to include that in my
next RFC.
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 3 weeks ago
On Sun, Sep 08, 2024 at 04:06:35PM +0000, Joe Damato wrote:
> Add a persistent NAPI storage area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
> 
> napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  .../networking/net_cachelines/net_device.rst  |  1 +
>  include/linux/netdevice.h                     | 34 +++++++++++++++++++
>  net/core/dev.c                                | 18 +++++++++-
>  3 files changed, 52 insertions(+), 1 deletion(-)
> 

[...]

> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
>  		if (n->dev->threaded && n->thread)
>  			new |= NAPIF_STATE_THREADED;
>  	} while (!try_cmpxchg(&n->state, &val, new));
> +
> +	if (n->napi_storage)
> +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));

This part is very obviously wrong ;)

I think when I was reading the other thread about resetting on
channel change [1] I got a bit confused.

Maybe what was intended was on napi_enable, do nothing / remove the
above code (which I suppose would give the persistence desired?).

But modify net/ethtool/channels.c to reset NAPIs to the global
(sysfs) settings? Not sure how to balance both persistence with
queue count changes in a way that makes sense for users of the API.

And, I didn't quite follow the bits about:
  1. The proposed ring to NAPI mapping
  2. The two step "takeover" which seemed to imply that we might
     pull napi_id into napi_storage? Or maybe I just read that part
     wrong?

I'll go back re-re-(re?)-read those emails to see if I can figure
out what the desired implementation is.

I'd welcome any/all feedback/clarifications on this part in
particular.

[1]: https://lore.kernel.org/netdev/20240903124008.4793c087@kernel.org/
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Stanislav Fomichev 2 months, 3 weeks ago
On 09/08, Joe Damato wrote:
> On Sun, Sep 08, 2024 at 04:06:35PM +0000, Joe Damato wrote:
> > Add a persistent NAPI storage area for NAPI configuration to the core.
> > Drivers opt-in to setting the storage for a NAPI by passing an index
> > when calling netif_napi_add_storage.
> > 
> > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
> >  .../networking/net_cachelines/net_device.rst  |  1 +
> >  include/linux/netdevice.h                     | 34 +++++++++++++++++++
> >  net/core/dev.c                                | 18 +++++++++-
> >  3 files changed, 52 insertions(+), 1 deletion(-)
> > 
> 
> [...]
> 
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> >  		if (n->dev->threaded && n->thread)
> >  			new |= NAPIF_STATE_THREADED;
> >  	} while (!try_cmpxchg(&n->state, &val, new));
> > +
> > +	if (n->napi_storage)
> > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));
> 
> This part is very obviously wrong ;)
> 
> I think when I was reading the other thread about resetting on
> channel change [1] I got a bit confused.
> 
> Maybe what was intended was on napi_enable, do nothing / remove the
> above code (which I suppose would give the persistence desired?).
> 
> But modify net/ethtool/channels.c to reset NAPIs to the global
> (sysfs) settings? Not sure how to balance both persistence with
> queue count changes in a way that makes sense for users of the API.
> 
> And, I didn't quite follow the bits about:
>   1. The proposed ring to NAPI mapping

[..]

>   2. The two step "takeover" which seemed to imply that we might
>      pull napi_id into napi_storage? Or maybe I just read that part
>      wrong?

Yes, the suggestion here is to drop patch #2 from your series and
keep napi_id as a user facing 'id' for the persistent storage. But,
obviously, this requires persistent napi_id(s) that survive device
resets.

The function that allocates new napi_id is napi_hash_add
from netif_napi_add_weight. So we can do either of the following:
1. Keep everything as is, but add the napi_rehash somewhere
   around napi_enable to 'takeover' previously allocated napi_id.
2. (preferred) Separate napi_hash_add out of netif_napi_add_weight.
   And have some new napi_hash_with_id(previous_napi_id) to expose it to the
   userspace. Then convert mlx5 to this new interface.
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Joe Damato 2 months, 3 weeks ago
On Mon, Sep 09, 2024 at 03:37:41PM -0700, Stanislav Fomichev wrote:
> On 09/08, Joe Damato wrote:
> > On Sun, Sep 08, 2024 at 04:06:35PM +0000, Joe Damato wrote:
> > > Add a persistent NAPI storage area for NAPI configuration to the core.
> > > Drivers opt-in to setting the storage for a NAPI by passing an index
> > > when calling netif_napi_add_storage.
> > > 
> > > napi_storage is allocated in alloc_netdev_mqs, freed in free_netdev
> > > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> > > 
> > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > ---
> > >  .../networking/net_cachelines/net_device.rst  |  1 +
> > >  include/linux/netdevice.h                     | 34 +++++++++++++++++++
> > >  net/core/dev.c                                | 18 +++++++++-
> > >  3 files changed, 52 insertions(+), 1 deletion(-)
> > > 
> > 
> > [...]
> > 
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -6719,6 +6719,9 @@ void napi_enable(struct napi_struct *n)
> > >  		if (n->dev->threaded && n->thread)
> > >  			new |= NAPIF_STATE_THREADED;
> > >  	} while (!try_cmpxchg(&n->state, &val, new));
> > > +
> > > +	if (n->napi_storage)
> > > +		memset(n->napi_storage, 0, sizeof(*n->napi_storage));
> > 
> > This part is very obviously wrong ;)
> > 
> > I think when I was reading the other thread about resetting on
> > channel change [1] I got a bit confused.
> > 
> > Maybe what was intended was on napi_enable, do nothing / remove the
> > above code (which I suppose would give the persistence desired?).
> > 
> > But modify net/ethtool/channels.c to reset NAPIs to the global
> > (sysfs) settings? Not sure how to balance both persistence with
> > queue count changes in a way that makes sense for users of the API.
> > 
> > And, I didn't quite follow the bits about:
> >   1. The proposed ring to NAPI mapping
> 
> [..]
> 
> >   2. The two step "takeover" which seemed to imply that we might
> >      pull napi_id into napi_storage? Or maybe I just read that part
> >      wrong?
> 
> Yes, the suggestion here is to drop patch #2 from your series and
> keep napi_id as a user facing 'id' for the persistent storage. But,
> obviously, this requires persistent napi_id(s) that survive device
> resets.
> 
> The function that allocates new napi_id is napi_hash_add
> from netif_napi_add_weight. So we can do either of the following:
> 1. Keep everything as is, but add the napi_rehash somewhere
>    around napi_enable to 'takeover' previously allocated napi_id.
> 2. (preferred) Separate napi_hash_add out of netif_napi_add_weight.
>    And have some new napi_hash_with_id(previous_napi_id) to expose it to the
>    userspace. Then convert mlx5 to this new interface.

Jakub is this what you were thinking too?

If this is the case, then the netlink code needs to be tweaked to
operate on NAPI IDs again (since they are persistent) instead of
ifindex + napi_storage index?

LMK.
Re: [RFC net-next v2 1/9] net: napi: Add napi_storage
Posted by Jakub Kicinski 2 months, 3 weeks ago
On Tue, 10 Sep 2024 08:16:05 +0200 Joe Damato wrote:
> > >   2. The two step "takeover" which seemed to imply that we might
> > >      pull napi_id into napi_storage? Or maybe I just read that part
> > >      wrong?  
> > 
> > Yes, the suggestion here is to drop patch #2 from your series and
> > keep napi_id as a user facing 'id' for the persistent storage. But,
> > obviously, this requires persistent napi_id(s) that survive device
> > resets.
> > 
> > The function that allocates new napi_id is napi_hash_add
> > from netif_napi_add_weight. So we can do either of the following:
> > 1. Keep everything as is, but add the napi_rehash somewhere
> >    around napi_enable to 'takeover' previously allocated napi_id.
> > 2. (preferred) Separate napi_hash_add out of netif_napi_add_weight.
> >    And have some new napi_hash_with_id(previous_napi_id) to expose it to the
> >    userspace. Then convert mlx5 to this new interface.  
> 
> Jakub is this what you were thinking too?
> 
> If this is the case, then the netlink code needs to be tweaked to
> operate on NAPI IDs again (since they are persistent) instead of
> ifindex + napi_storage index?

No strong preference on the driver facing API, TBH, your
netif_napi_add_storage() with some additional 'ifs' in the existing
functions should work.

Also no strong preference on the uAPI, avoiding adding new fields is
a little bit tempting. But if you think NAPI ID won't work or is less
clean - we can stick the index in.