drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++-------- 1 file changed, 21 insertions(+), 13 deletions(-)
From: Vitor Soares <vitor.soares@toradex.com>
When the mcp251xfd_start_xmit() function fails, the driver stops
processing messages, and the interrupt routine does not return,
running indefinitely even after killing the running application.
Error messages:
[ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
[ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
... and repeat forever.
The issue can be triggered when multiple devices share the same
SPI interface. And there is concurrent access to the bus.
The problem occurs because tx_ring->head increments even if
mcp251xfd_start_xmit() fails. Consequently, the driver skips one
TX package while still expecting a response in
mcp251xfd_handle_tefif_one().
This patch resolves the issue by decreasing tx_ring->head if
mcp251xfd_start_xmit() fails. With the fix, if we trigger the issue and
the err = -EBUSY, the driver returns NETDEV_TX_BUSY. The network stack
retries to transmit the message.
Otherwise, it prints an error and discards the message.
Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
Cc: stable@vger.kernel.org
Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
---
V2->V3:
- Add tx_dropped stats.
- netdev_sent_queue() only if can_put_echo_skb() succeed.
V1->V2:
- Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() == -EBUSY.
- Rework the commit message to address the change above.
- Change can_put_echo_skb() to be called after mcp251xfd_tx_obj_write() succeed.
Otherwise, we get Kernel NULL pointer dereference error.
drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++--------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
index 160528d3cc26..146c44e47c60 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
@@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
struct net_device *ndev)
{
struct mcp251xfd_priv *priv = netdev_priv(ndev);
+ struct net_device_stats *stats = &ndev->stats;
struct mcp251xfd_tx_ring *tx_ring = priv->tx;
struct mcp251xfd_tx_obj *tx_obj;
unsigned int frame_len;
@@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
- /* Stop queue if we occupy the complete TX FIFO */
tx_head = mcp251xfd_get_tx_head(tx_ring);
- tx_ring->head++;
- if (mcp251xfd_get_tx_free(tx_ring) == 0)
- netif_stop_queue(ndev);
-
frame_len = can_skb_get_frame_len(skb);
- err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
- if (!err)
- netdev_sent_queue(priv->ndev, frame_len);
+
+ tx_ring->head++;
err = mcp251xfd_tx_obj_write(priv, tx_obj);
- if (err)
- goto out_err;
+ if (err) {
+ tx_ring->head--;
- return NETDEV_TX_OK;
+ if (err == -EBUSY)
+ return NETDEV_TX_BUSY;
- out_err:
- netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
+ stats->tx_dropped++;
+
+ if (net_ratelimit())
+ netdev_err(priv->ndev,
+ "ERROR in %s: %d\n", __func__, err);
+ } else {
+ err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
+ if (!err)
+ netdev_sent_queue(priv->ndev, frame_len);
+
+ /* Stop queue if we occupy the complete TX FIFO */
+ if (mcp251xfd_get_tx_free(tx_ring) == 0)
+ netif_stop_queue(ndev);
+ }
return NETDEV_TX_OK;
}
--
2.34.1
On 11.03.2024 12:11:43, Vitor Soares wrote:
> From: Vitor Soares <vitor.soares@toradex.com>
>
> When the mcp251xfd_start_xmit() function fails, the driver stops
> processing messages, and the interrupt routine does not return,
> running indefinitely even after killing the running application.
>
> Error messages:
> [ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
> [ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
> ... and repeat forever.
>
> The issue can be triggered when multiple devices share the same
> SPI interface. And there is concurrent access to the bus.
>
> The problem occurs because tx_ring->head increments even if
> mcp251xfd_start_xmit() fails. Consequently, the driver skips one
> TX package while still expecting a response in
> mcp251xfd_handle_tefif_one().
>
> This patch resolves the issue by decreasing tx_ring->head if
> mcp251xfd_start_xmit() fails. With the fix, if we trigger the issue and
> the err = -EBUSY, the driver returns NETDEV_TX_BUSY. The network stack
> retries to transmit the message.
> Otherwise, it prints an error and discards the message.
>
> Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
> Cc: stable@vger.kernel.org
> Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
> ---
>
> V2->V3:
> - Add tx_dropped stats.
> - netdev_sent_queue() only if can_put_echo_skb() succeed.
>
> V1->V2:
> - Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() == -EBUSY.
> - Rework the commit message to address the change above.
> - Change can_put_echo_skb() to be called after mcp251xfd_tx_obj_write() succeed.
> Otherwise, we get Kernel NULL pointer dereference error.
>
> drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++--------
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> index 160528d3cc26..146c44e47c60 100644
> --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> @@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
> struct net_device *ndev)
> {
> struct mcp251xfd_priv *priv = netdev_priv(ndev);
> + struct net_device_stats *stats = &ndev->stats;
> struct mcp251xfd_tx_ring *tx_ring = priv->tx;
> struct mcp251xfd_tx_obj *tx_obj;
> unsigned int frame_len;
> @@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
> tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
> mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
>
> - /* Stop queue if we occupy the complete TX FIFO */
> tx_head = mcp251xfd_get_tx_head(tx_ring);
> - tx_ring->head++;
> - if (mcp251xfd_get_tx_free(tx_ring) == 0)
> - netif_stop_queue(ndev);
> -
> frame_len = can_skb_get_frame_len(skb);
> - err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
> - if (!err)
> - netdev_sent_queue(priv->ndev, frame_len);
> +
> + tx_ring->head++;
>
> err = mcp251xfd_tx_obj_write(priv, tx_obj);
> - if (err)
> - goto out_err;
> + if (err) {
> + tx_ring->head--;
>
> - return NETDEV_TX_OK;
> + if (err == -EBUSY)
> + return NETDEV_TX_BUSY;
>
> - out_err:
> - netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
> + stats->tx_dropped++;
> +
> + if (net_ratelimit())
> + netdev_err(priv->ndev,
> + "ERROR in %s: %d\n", __func__, err);
> + } else {
> + err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
> + if (!err)
> + netdev_sent_queue(priv->ndev, frame_len);
> +
> + /* Stop queue if we occupy the complete TX FIFO */
> + if (mcp251xfd_get_tx_free(tx_ring) == 0)
> + netif_stop_queue(ndev);
> + }
Once you have sent the TX objects to the chip, it can trigger a TX
complete IRQ. This is not very likely as the SPI sends asynchronously
and the xmit handler cannot sleep.
This means you have to call can_put_echo_skb() and stop the queue if
needed, _before_ you call mcp251xfd_tx_obj_write(). In case of an error,
you have to roll-back these.
regards,
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |
Hello Marc,
Thanks for your feedback.
On Tue, 2024-04-30 at 11:10 +0200, Marc Kleine-Budde wrote:
> On 11.03.2024 12:11:43, Vitor Soares wrote:
> > From: Vitor Soares <vitor.soares@toradex.com>
> >
> > When the mcp251xfd_start_xmit() function fails, the driver stops
> > processing messages, and the interrupt routine does not return,
> > running indefinitely even after killing the running application.
> >
> > Error messages:
> > [ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
> > [ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7,
> > tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
> > ... and repeat forever.
> >
> > The issue can be triggered when multiple devices share the same
> > SPI interface. And there is concurrent access to the bus.
> >
> > The problem occurs because tx_ring->head increments even if
> > mcp251xfd_start_xmit() fails. Consequently, the driver skips one
> > TX package while still expecting a response in
> > mcp251xfd_handle_tefif_one().
> >
> > This patch resolves the issue by decreasing tx_ring->head if
> > mcp251xfd_start_xmit() fails. With the fix, if we trigger the issue and
> > the err = -EBUSY, the driver returns NETDEV_TX_BUSY. The network stack
> > retries to transmit the message.
> > Otherwise, it prints an error and discards the message.
> >
> > Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
> > ---
> >
> > V2->V3:
> > - Add tx_dropped stats.
> > - netdev_sent_queue() only if can_put_echo_skb() succeed.
> >
> > V1->V2:
> > - Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() == -EBUSY.
> > - Rework the commit message to address the change above.
> > - Change can_put_echo_skb() to be called after mcp251xfd_tx_obj_write() succeed.
> > Otherwise, we get Kernel NULL pointer dereference error.
> >
> > drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++--------
> > 1 file changed, 21 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> > b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> > index 160528d3cc26..146c44e47c60 100644
> > --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> > +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> > @@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
> > struct net_device *ndev)
> > {
> > struct mcp251xfd_priv *priv = netdev_priv(ndev);
> > + struct net_device_stats *stats = &ndev->stats;
> > struct mcp251xfd_tx_ring *tx_ring = priv->tx;
> > struct mcp251xfd_tx_obj *tx_obj;
> > unsigned int frame_len;
> > @@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
> > tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
> > mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
> >
> > - /* Stop queue if we occupy the complete TX FIFO */
> > tx_head = mcp251xfd_get_tx_head(tx_ring);
> > - tx_ring->head++;
> > - if (mcp251xfd_get_tx_free(tx_ring) == 0)
> > - netif_stop_queue(ndev);
> > -
> > frame_len = can_skb_get_frame_len(skb);
> > - err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
> > - if (!err)
> > - netdev_sent_queue(priv->ndev, frame_len);
> > +
> > + tx_ring->head++;
> >
> > err = mcp251xfd_tx_obj_write(priv, tx_obj);
> > - if (err)
> > - goto out_err;
> > + if (err) {
> > + tx_ring->head--;
> >
> > - return NETDEV_TX_OK;
> > + if (err == -EBUSY)
> > + return NETDEV_TX_BUSY;
> >
> > - out_err:
> > - netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
> > + stats->tx_dropped++;
> > +
> > + if (net_ratelimit())
> > + netdev_err(priv->ndev,
> > + "ERROR in %s: %d\n", __func__, err);
> > + } else {
> > + err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
> > + if (!err)
> > + netdev_sent_queue(priv->ndev, frame_len);
> > +
> > + /* Stop queue if we occupy the complete TX FIFO */
> > + if (mcp251xfd_get_tx_free(tx_ring) == 0)
> > + netif_stop_queue(ndev);
> > + }
>
> Once you have sent the TX objects to the chip, it can trigger a TX
> complete IRQ. This is not very likely as the SPI sends asynchronously
> and the xmit handler cannot sleep.
>
> This means you have to call can_put_echo_skb() and stop the queue if
> needed, _before_ you call mcp251xfd_tx_obj_write(). In case of an error,
> you have to roll-back these.
Doing this way, I have to can_free_echo_skb() and it won't be possible
to return NETDEV_TX_BUSY.
Do you know how can we return NETDEV_TX_BUSY when spi_async() == EBUSY
to allow the network stack to retry to transmit the package?
Best regards,
Vitor Soares
>
> regards,
> Marc
>
Hello Mark,
On Mon, Mar 11, 2024 at 12:11:43PM +0000, Vitor Soares wrote:
> From: Vitor Soares <vitor.soares@toradex.com>
>
> When the mcp251xfd_start_xmit() function fails, the driver stops
> processing messages, and the interrupt routine does not return,
> running indefinitely even after killing the running application.
>
> Error messages:
> [ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
> [ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
> ... and repeat forever.
>
> The issue can be triggered when multiple devices share the same
> SPI interface. And there is concurrent access to the bus.
>
> The problem occurs because tx_ring->head increments even if
> mcp251xfd_start_xmit() fails. Consequently, the driver skips one
> TX package while still expecting a response in
> mcp251xfd_handle_tefif_one().
>
> This patch resolves the issue by decreasing tx_ring->head if
> mcp251xfd_start_xmit() fails. With the fix, if we trigger the issue and
> the err = -EBUSY, the driver returns NETDEV_TX_BUSY. The network stack
> retries to transmit the message.
> Otherwise, it prints an error and discards the message.
>
> Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
> Cc: stable@vger.kernel.org
> Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
Any other feedback on this? Just reaching out to be that this does not
fall through the cracks.
Francesco
The previous email misses the v3 on the subject, added on this one.
On Mon, 2024-03-11 at 12:11 +0000, Vitor Soares wrote:
> From: Vitor Soares <vitor.soares@toradex.com>
>
> When the mcp251xfd_start_xmit() function fails, the driver stops
> processing messages, and the interrupt routine does not return,
> running indefinitely even after killing the running application.
>
> Error messages:
> [ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit:
> -16
> [ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not
> empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0,
> tx_head=0x000017d3).
> ... and repeat forever.
>
> The issue can be triggered when multiple devices share the same
> SPI interface. And there is concurrent access to the bus.
>
> The problem occurs because tx_ring->head increments even if
> mcp251xfd_start_xmit() fails. Consequently, the driver skips one
> TX package while still expecting a response in
> mcp251xfd_handle_tefif_one().
>
> This patch resolves the issue by decreasing tx_ring->head if
> mcp251xfd_start_xmit() fails. With the fix, if we trigger the issue
> and
> the err = -EBUSY, the driver returns NETDEV_TX_BUSY. The network
> stack
> retries to transmit the message.
> Otherwise, it prints an error and discards the message.
>
> Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip
> MCP25xxFD SPI CAN")
> Cc: stable@vger.kernel.org
> Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
> ---
>
> V2->V3:
> - Add tx_dropped stats.
> - netdev_sent_queue() only if can_put_echo_skb() succeed.
>
> V1->V2:
> - Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() == -EBUSY.
> - Rework the commit message to address the change above.
> - Change can_put_echo_skb() to be called after
> mcp251xfd_tx_obj_write() succeed.
> Otherwise, we get Kernel NULL pointer dereference error.
>
> drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++------
> --
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> index 160528d3cc26..146c44e47c60 100644
> --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
> @@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff
> *skb,
> struct net_device *ndev)
> {
> struct mcp251xfd_priv *priv = netdev_priv(ndev);
> + struct net_device_stats *stats = &ndev->stats;
> struct mcp251xfd_tx_ring *tx_ring = priv->tx;
> struct mcp251xfd_tx_obj *tx_obj;
> unsigned int frame_len;
> @@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff
> *skb,
> tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
> mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
>
> - /* Stop queue if we occupy the complete TX FIFO */
> tx_head = mcp251xfd_get_tx_head(tx_ring);
> - tx_ring->head++;
> - if (mcp251xfd_get_tx_free(tx_ring) == 0)
> - netif_stop_queue(ndev);
> -
> frame_len = can_skb_get_frame_len(skb);
> - err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
> - if (!err)
> - netdev_sent_queue(priv->ndev, frame_len);
> +
> + tx_ring->head++;
>
> err = mcp251xfd_tx_obj_write(priv, tx_obj);
> - if (err)
> - goto out_err;
> + if (err) {
> + tx_ring->head--;
>
> - return NETDEV_TX_OK;
> + if (err == -EBUSY)
> + return NETDEV_TX_BUSY;
>
> - out_err:
> - netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
> + stats->tx_dropped++;
> +
> + if (net_ratelimit())
> + netdev_err(priv->ndev,
> + "ERROR in %s: %d\n", __func__,
> err);
> + } else {
> + err = can_put_echo_skb(skb, ndev, tx_head,
> frame_len);
> + if (!err)
> + netdev_sent_queue(priv->ndev, frame_len);
> +
> + /* Stop queue if we occupy the complete TX FIFO */
> + if (mcp251xfd_get_tx_free(tx_ring) == 0)
> + netif_stop_queue(ndev);
> + }
>
> return NETDEV_TX_OK;
> }
© 2016 - 2026 Red Hat, Inc.