This will unmask the following error interrupts from the MAC-PHY.
tx protocol error
rx buffer overflow error
loss of framing error
header error
The MAC-PHY will signal an error by setting the EXST bit in the receive
data footer which will then allow the host to read the STATUS0 register
to find the source of the error.
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
---
drivers/net/ethernet/oa_tc6.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
index 4b0f63c02c35..850765574d30 100644
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -18,6 +18,13 @@
#define OA_TC6_REG_STATUS0 0x0008
#define STATUS0_RESETC BIT(6) /* Reset Complete */
+/* Interrupt Mask Register #0 */
+#define OA_TC6_REG_INT_MASK0 0x000C
+#define INT_MASK0_HEADER_ERR_MASK BIT(5)
+#define INT_MASK0_LOSS_OF_FRAME_ERR_MASK BIT(4)
+#define INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK BIT(3)
+#define INT_MASK0_TX_PROTOCOL_ERR_MASK BIT(0)
+
/* Control command header */
#define OA_TC6_CTRL_HEADER_DATA_NOT_CTRL BIT(31)
#define OA_TC6_CTRL_HEADER_WRITE_NOT_READ BIT(29)
@@ -324,6 +331,23 @@ static int oa_tc6_sw_reset_macphy(struct oa_tc6 *tc6)
return oa_tc6_write_register(tc6, OA_TC6_REG_STATUS0, regval);
}
+static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
+{
+ u32 regval;
+ int ret;
+
+ ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
+ if (ret)
+ return ret;
+
+ regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
+ INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
+ INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
+ INT_MASK0_HEADER_ERR_MASK);
+
+ return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
+}
+
/**
* oa_tc6_init - allocates and initializes oa_tc6 structure.
* @spi: device with which data will be exchanged.
@@ -364,6 +388,13 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi)
return NULL;
}
+ ret = oa_tc6_unmask_macphy_error_interrupts(tc6);
+ if (ret) {
+ dev_err(&tc6->spi->dev,
+ "MAC-PHY error interrupts unmask failed: %d\n", ret);
+ return NULL;
+ }
+
return tc6;
}
EXPORT_SYMBOL_GPL(oa_tc6_init);
--
2.34.1
> +static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
> +{
> + u32 regval;
> + int ret;
> +
> + ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
> + if (ret)
> + return ret;
> +
> + regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
> + INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
> + INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
> + INT_MASK0_HEADER_ERR_MASK);
> +
> + return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
> +}
> +
This togheter with patch 11 works poorly for me. I get alot of kernel
output, dropped packets and lower performance.
Below is an example for a run when I curl a 10MB blob
time curl 20.0.0.55:8000/rdump -o dump -w '{%speed_download}'
% Total % Received % Xferd Average Speed Time Time Time Current
Dload U[ 387.944737] net_ratelimit: 38 callbacks suppressed
pload Total Spent Left Sp[ 387.944755] eth0: Receive buffer overflow error
eed
0 0 0 0 0 0 0 0 --:--:-- --:-[ 387.961424] eth0: Receive buffer overflow error
0 10.0M 0 2896 0 0 13031 0 0:13:24 --:--:-- 0:13:24 12986[ 388.204257] eth0: Receive buffer overflow error
[ 388.209848] eth0: Receive buffer overflow error
[ 388.240705] eth0: Receive buffer overflow error
[ 388.246205] eth0: Receive buffer overflow error
[ 388.360265] eth0: Receive buffer overflow error
[ 388.365755] eth0: Receive buffer overflow error
[ 388.371328] eth0: Receive buffer overflow error
[ 388.396937] eth0: Receive buffer overflow error
32 10.0M 32 3362k 0 0 826k 0 0:00:12 0:00:04 0:00:08 826k[ 392.950696] net_ratelimit: 84 callbacks suppressed
[ 392.950711] eth0: Receive buffer overflow error
41 10.0M 41 4259k 0 0 840k 0 0:00:12 0:00:05 0:00:07 878k[ 393.009785] eth0: Receive buffer overflow error
[ 393.016651] eth0: Receive buffer overflow error
[ 393.121278] eth0: Receive buffer overflow error
[ 393.126876] eth0: Receive buffer overflow error
[ 393.204983] eth0: Receive buffer overflow error
[ 393.210590] eth0: Receive buffer overflow error
[ 393.248977] eth0: Receive buffer overflow error
[ 393.254575] eth0: Receive buffer overflow error
[ 393.352949] eth0: Receive buffer overflow error
77 10.0M 77 7903k 0 0 870k 0 0:00:11 0:00:09 0:00:02 906k[ 397.956674] net_ratelimit: 88 callbacks suppressed
[ 397.956691] eth0: Receive buffer overflow error
[ 397.967182] eth0: Receive buffer overflow error
86 10.0M 86 8837k 0 0 877k 0 0:00:11 0:00:10 0:00:01 915k[ 398.048630] eth0: Receive buffer overflow error
[ 398.054171] eth0: Receive buffer overflow error
[ 398.092858] eth0: Receive buffer overflow error
[ 398.113975] eth0: Receive buffer overflow error
[ 398.177558] eth0: Receive buffer overflow error
[ 398.209782] eth0: Receive buffer overflow error
[ 398.272621] eth0: Receive buffer overflow error
[ 398.278306] eth0: Receive buffer overflow error
100 10.0M 100 10.0M 0 0 877k 0 0:00:11 0:00:11 --:--:-- 921k
{%speed_download}
real 0m11.681s
user 0m0.117s
sys 0m0.226s
I tried this patch
diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
index 9f17f3712137..bd7bd3ef6897 100644
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -615,21 +615,9 @@ static int oa_tc6_sw_reset_macphy(struct oa_tc6 *tc6)
return oa_tc6_write_register(tc6, OA_TC6_REG_STATUS0, regval);
}
-static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
+static int oa_tc6_disable_imask0_interrupts(struct oa_tc6 *tc6)
{
- u32 regval;
- int ret;
-
- ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
- if (ret)
- return ret;
-
- regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
- INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
- INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
- INT_MASK0_HEADER_ERR_MASK);
-
- return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
+ return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, (u32)-1);
}
static int oa_tc6_enable_data_transfer(struct oa_tc6 *tc6)
@@ -1234,7 +1222,7 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev)
return NULL;
}
- ret = oa_tc6_unmask_macphy_error_interrupts(tc6);
+ ret = oa_tc6_disable_imask0_interrupts(tc6);
if (ret) {
dev_err(&tc6->spi->dev,
"MAC-PHY error interrupts unmask failed: %d\n", ret);
Which results in no log spam, ~5-10% higher throughput and no dropped
packets when I look at /sys/class/net/eth0/statistics/rx_dropped
Wheras when I did an equivalent run with a 100MB file I got 1918
rx_dropped with interrupts enabled.
When I dig through wireshark I can see some retransmissions when
interrupts are disabled, but the network stack seems to handle that with
grace.
Maybe the decision on wheter to enable interrupts or not should be on
the mac driver level, not network framework level?
If this really is preferrable for some reason, could a module option be
added so that it does not require running a fork to disable interrupts?
There is also a interrupt mask 1 register that is not touched in this
patch series, and since things started working better for me with just
touching reg0 I didn't bother tweaking anything int reg1.
BR
Ramón
On Sat, Apr 27, 2024 at 09:52:15PM +0200, Ramón Nordin Rodriguez wrote:
> > +static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
> > +{
> > + u32 regval;
> > + int ret;
> > +
> > + ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
> > + if (ret)
> > + return ret;
> > +
> > + regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
> > + INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
> > + INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
> > + INT_MASK0_HEADER_ERR_MASK);
> > +
> > + return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
> > +}
> > +
>
> This togheter with patch 11 works poorly for me. I get alot of kernel
> output, dropped packets and lower performance.
> Below is an example for a run when I curl a 10MB blob
>
> time curl 20.0.0.55:8000/rdump -o dump -w '{%speed_download}'
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload U[ 387.944737] net_ratelimit: 38 callbacks suppressed
> pload Total Spent Left Sp[ 387.944755] eth0: Receive buffer overflow error
> eed
> 0 0 0 0 0 0 0 0 --:--:-- --:-[ 387.961424] eth0: Receive buffer overflow error
> 0 10.0M 0 2896 0 0 13031 0 0:13:24 --:--:-- 0:13:24 12986[ 388.204257] eth0: Receive buffer overflow error
> [ 388.209848] eth0: Receive buffer overflow error
How fast is your SPI bus? Faster than the link speed? Or slower?
It could be different behaviour is needed depending on the SPI bus
speed. If the SPI bus is faster than the link speed, by some margin,
the receiver buffer should not overflow, since the CPU can empty the
buffer faster than it fills.
If however, the SPI bus is slower than the link speed, there will be
buffer overflows, and a reliance on TCP backing off and slowing down.
The driver should not be spamming the log, since it is going to happen
and there is nothing that can be done about it.
> I tried this patch
>
> diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
> index 9f17f3712137..bd7bd3ef6897 100644
> --- a/drivers/net/ethernet/oa_tc6.c
> +++ b/drivers/net/ethernet/oa_tc6.c
> @@ -615,21 +615,9 @@ static int oa_tc6_sw_reset_macphy(struct oa_tc6 *tc6)
> return oa_tc6_write_register(tc6, OA_TC6_REG_STATUS0, regval);
> }
>
> -static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
> +static int oa_tc6_disable_imask0_interrupts(struct oa_tc6 *tc6)
> {
> - u32 regval;
> - int ret;
> -
> - ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
> - if (ret)
> - return ret;
> -
> - regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
> - INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
> - INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
> - INT_MASK0_HEADER_ERR_MASK);
> -
> - return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
> + return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, (u32)-1);
So this appears to be disabling all error interrupts?
This is maybe going too far. Overflow errors are going to happen if
you have a slow SPI bus. So i probably would not enable that. However,
are the other errors actually expected in normal usage? If not, leave
them enabled, because they might indicate real problems.
> Which results in no log spam, ~5-10% higher throughput and no dropped
> packets when I look at /sys/class/net/eth0/statistics/rx_dropped
You cannot trust rx_dropped because you just disabled the code which
increments it! The device is probably still dropping packets, and they
are no longer counted.
It could be the performance increase comes from two places:
1) Spending time and bus bandwidth dealing with the buffer overflow
interrupt
2) Printing out the serial port.
Please could you benchmark a few things:
1) Remove the printk("Receive buffer overflow error"), but otherwise
keep the code the same. That will give us an idea how much the serial
port matters.
2) Disable only the RX buffer overflow interrupt
3) Disable all error interrupts.
Andrew
> It could be the performance increase comes from two places:
>
> 1) Spending time and bus bandwidth dealing with the buffer overflow
> interrupt
>
> 2) Printing out the serial port.
>
> Please could you benchmark a few things:
>
> 1) Remove the printk("Receive buffer overflow error"), but otherwise
> keep the code the same. That will give us an idea how much the serial
> port matters.
>
> 2) Disable only the RX buffer overflow interrupt
>
> 3) Disable all error interrupts.
>
Test setup
- Server side -
PC with a lan8670 usb eval board running a http server that serves
a 10MB binary blob. The http server is just python3 -m http.server
- Client side -
iMX8mn board (quad core arm64) with lan8650 mac-phy (25MHz spi)
running curl to download the blob from the server and writing to
a ramfs (ddr4 1.xGHz, so should not be a bottleneck)
Below are a collection of samples of different test runs. All of the
test runs have run a minor patch for hw reset (else nothing works for me).
Using curl is not the most scientific approach here, but iperf has
not exposed any problems for me earlier with overflows.
So sticking with curl since it's easy and definetly gets the overflows.
n | name | min | avg | max | rx dropped | samples
1 | no mod | 827K | 846K | 891K | 945 | 5
2 | no log | 711K | 726K | 744K | 562 | 5
3 | less irq | 815K | 833K | 846K | N/A | 5
4 | no irq | 914K | 924K | 931K | N/A | 5
5 | simple | 857K | 868K | 879K | 615 | 5
Description of each scenario
1 - no mod
So this just runs a hw reset to get the chip working (described in earlier posts)
2 - no log
This scenario just removes the logging when handling the irq state
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -688,8 +688,6 @@ static int oa_tc6_process_extended_status(struct oa_tc6 *tc6)
if (FIELD_GET(STATUS0_RX_BUFFER_OVERFLOW_ERROR, value)) {
tc6->rx_buf_overflow = true;
oa_tc6_cleanup_ongoing_rx_skb(tc6);
- net_err_ratelimited("%s: Receive buffer overflow error\n",
- tc6->netdev->name);
return -EAGAIN;
}
if (FIELD_GET(STATUS0_TX_PROTOCOL_ERROR, value)) {
3 - less irq
This scenario disables the overflow interrupt but keeps the others
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -625,10 +625,10 @@ static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
return ret;
regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
- INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
INT_MASK0_HEADER_ERR_MASK);
+ regval |= INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK;
return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
}
4 - no irq
This scenario disables all imask0 interrupts with the following change
diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
index 9f17f3712137..88a9c6ccb37a 100644
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -624,12 +624,7 @@ static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
if (ret)
return ret;
- regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
- INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
- INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
- INT_MASK0_HEADER_ERR_MASK);
-
- return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
+ return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, (u32)-1);
}
static int oa_tc6_enable_data_transfer(struct oa_tc6 *tc6)
5 - simple
This keeps the interrupt but does not log or drop the socket buffer on irq
Moving the rx dropped increment here makes it more of a irq counter I guess,
so maybe not relevant.
diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
index 9f17f3712137..cbc20a725ad0 100644
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -687,10 +687,7 @@ static int oa_tc6_process_extended_status(struct oa_tc6 *tc6)
if (FIELD_GET(STATUS0_RX_BUFFER_OVERFLOW_ERROR, value)) {
tc6->rx_buf_overflow = true;
- oa_tc6_cleanup_ongoing_rx_skb(tc6);
- net_err_ratelimited("%s: Receive buffer overflow error\n",
- tc6->netdev->name);
- return -EAGAIN;
+ tc6->netdev->stats.rx_dropped++;
}
if (FIELD_GET(STATUS0_TX_PROTOCOL_ERROR, value)) {
netdev_err(tc6->netdev, "Transmit protocol error\n");
- postamble -
Removing the logging made things considerably slower which probably
indicates that there is a timing dependent behaviour in the driver.
I have a hard time explaining why there is a throughput difference
between scenario 3 and 4 since I did not get the logs that any of the
other interrupts happened.
Maybe the irq handling adds some unexpected context switching overhead.
My recommendation going forward would be to disable the rx buffer
overlow interrupt and removing any code related to handling of it.
R
> n | name | min | avg | max | rx dropped | samples > 1 | no mod | 827K | 846K | 891K | 945 | 5 > 2 | no log | 711K | 726K | 744K | 562 | 5 > 3 | less irq | 815K | 833K | 846K | N/A | 5 > 4 | no irq | 914K | 924K | 931K | N/A | 5 > 5 | simple | 857K | 868K | 879K | 615 | 5 That is odd. Side question: What CONFIG_HZ= do you have? 100, 250, 1000? Try 1000. I've seen problems where the driver wants to sleep for a short time, but the CONFIG_HZ value limits how short a time it can actually sleep. It ends up sleeping much longer than it wants. Andrew
> > n | name | min | avg | max | rx dropped | samples
> > 1 | no mod | 827K | 846K | 891K | 945 | 5
> > 2 | no log | 711K | 726K | 744K | 562 | 5
> > 3 | less irq | 815K | 833K | 846K | N/A | 5
> > 4 | no irq | 914K | 924K | 931K | N/A | 5
> > 5 | simple | 857K | 868K | 879K | 615 | 5
>
> That is odd.
>
> Side question: What CONFIG_HZ= do you have? 100, 250, 1000? Try
> 1000. I've seen problems where the driver wants to sleep for a short
> time, but the CONFIG_HZ value limits how short a time it can actually
> sleep. It ends up sleeping much longer than it wants.
>
I have been doing my best to abuse the link some more. In brief tweaking
CONFIG_HZ has some but limited effect.
Saturating the link with the rx buffer interrupt enabled breaks the driver.
Saturating the link with the rx buffer interrupt disabled has poor
performance.
The following scenario has been tested. Both ends of the link run:
* server.py
* client.py
One end is an arm64 quad core running at 1.2GHz with the lan8650 macphy.
The other end is an amd 3950x running the lan8670 usb eval board.
Both systems should be fast enough that running python should not be a
limiting factor.
-- The test code --
server.py
#!/bin/env python3
import socket
def serve(sock: socket.socket):
while True:
client, addr = sock.accept()
print(f'connection from: {addr}')
while len(client.recv(2048)) > 0:
pass
print('client disconnected')
client.close()
if __name__ == '__main__':
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(('0.0.0.0', 4040))
sock.listen(1)
serve(sock)
print("something went wrong")
client.py
#!/bin/env python3
import socket
import sys
if __name__ == '__main__':
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect((sys.argv[1], 4040))
while True:
sock.sendall(b'0'*2048)
-- test runs --
run 1 - all interrupts enabled
Time to failure:
1 min or less
Kernel output:
[ 94.361312] sched: RT throttling activated
top output:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
145 root -51 0 0 0 0 R 95.5 0.0 1:11.22 oa-tc6-spi-thread
link stats:
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
link/ether 32:c2:7e:22:93:99 brd ff:ff:ff:ff:ff:ff
RX: bytes packets errors dropped missed mcast
3371902 7186 0 48 0 0
RX errors: length crc frame fifo overrun
0 0 0 0 0
TX: bytes packets errors dropped carrier collsns
10341438 8071 0 0 0 0
TX errors: aborted fifo window heartbt transns
0 0 0 0 1
state:
Completly borked, can't ping in or out, bringing the interface down then up
has no effect.
There is no SPI clock and no interrupts generated by the mac-phy.
The worker thread seems to have live locked.
-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
run 2 - RX_BUFFER_OVERLOW interrupt disabled
state:
Runs just fine but the oa-tc6-spi-thread is consuming 10-20% cpu
Ping times have increased from 1-2ms to 8-35ms
-- additional notes --
When tweaking CONFIG_HZ I do get some changes in behaviour, the cpu
consumption stays stable at 20%+-2 with CONFIG_HZ=250, when increased to
CONFIG_HZ=1000 it jumps up and down between 10-20%.
I don't have access to a logic analyzer but my old oscilloscope is
almost reliable. I could confirm that the spi clock is indeed running at
the expected 25MHz, but I could observe some gaps of up to 320µs so
that's 8k spi cycles spent doing something else.
These gaps were observed on the SPI clock and the macphy interrupt was
active for the same ammount of time(though this was measured independently
and not on the same trigger).
I've been drinking way to much coffe, so soldering is not gonna happen
today (shaky hands), but if it helps I can solder wires to attach both
probes to confirm that the gap in the SPI clock happens at the same time
or not as the interrupt is active.
I'd be keen on hearing what Microchips plans to address. If tracking
down performance issues is a priority I'll probably not spend any time
on it, if not then I'll definetly dig into it more.
Let me know if anything is unclear or if I can help out with anything
specific.
R
Hi Ramon,
On 01/05/24 11:59 pm, Ramón Nordin Rodriguez wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
>
>>> n | name | min | avg | max | rx dropped | samples
>>> 1 | no mod | 827K | 846K | 891K | 945 | 5
>>> 2 | no log | 711K | 726K | 744K | 562 | 5
>>> 3 | less irq | 815K | 833K | 846K | N/A | 5
>>> 4 | no irq | 914K | 924K | 931K | N/A | 5
>>> 5 | simple | 857K | 868K | 879K | 615 | 5
>>
>> That is odd.
>>
>> Side question: What CONFIG_HZ= do you have? 100, 250, 1000? Try
>> 1000. I've seen problems where the driver wants to sleep for a short
>> time, but the CONFIG_HZ value limits how short a time it can actually
>> sleep. It ends up sleeping much longer than it wants.
>>
>
> I have been doing my best to abuse the link some more. In brief tweaking
> CONFIG_HZ has some but limited effect.
> Saturating the link with the rx buffer interrupt enabled breaks the driver.
> Saturating the link with the rx buffer interrupt disabled has poor
> performance.
>
> The following scenario has been tested. Both ends of the link run:
> * server.py
> * client.py
>
> One end is an arm64 quad core running at 1.2GHz with the lan8650 macphy.
> The other end is an amd 3950x running the lan8670 usb eval board.
> Both systems should be fast enough that running python should not be a
> limiting factor.
>
> -- The test code --
> server.py
> #!/bin/env python3
> import socket
>
> def serve(sock: socket.socket):
> while True:
> client, addr = sock.accept()
> print(f'connection from: {addr}')
> while len(client.recv(2048)) > 0:
> pass
> print('client disconnected')
> client.close()
>
> if __name__ == '__main__':
> sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
> sock.bind(('0.0.0.0', 4040))
> sock.listen(1)
> serve(sock)
> print("something went wrong")
>
> client.py
> #!/bin/env python3
> import socket
> import sys
>
> if __name__ == '__main__':
> sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> sock.connect((sys.argv[1], 4040))
>
> while True:
> sock.sendall(b'0'*2048)
>
> -- test runs --
> run 1 - all interrupts enabled
> Time to failure:
> 1 min or less
>
> Kernel output:
> [ 94.361312] sched: RT throttling activated
>
> top output:
> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> 145 root -51 0 0 0 0 R 95.5 0.0 1:11.22 oa-tc6-spi-thread
>
> link stats:
> 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
> link/ether 32:c2:7e:22:93:99 brd ff:ff:ff:ff:ff:ff
> RX: bytes packets errors dropped missed mcast
> 3371902 7186 0 48 0 0
> RX errors: length crc frame fifo overrun
> 0 0 0 0 0
> TX: bytes packets errors dropped carrier collsns
> 10341438 8071 0 0 0 0
> TX errors: aborted fifo window heartbt transns
> 0 0 0 0 1
> state:
> Completly borked, can't ping in or out, bringing the interface down then up
> has no effect.
> There is no SPI clock and no interrupts generated by the mac-phy.
> The worker thread seems to have live locked.
>
> -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
> run 2 - RX_BUFFER_OVERLOW interrupt disabled
>
> state:
> Runs just fine but the oa-tc6-spi-thread is consuming 10-20% cpu
> Ping times have increased from 1-2ms to 8-35ms
>
>
> -- additional notes --
> When tweaking CONFIG_HZ I do get some changes in behaviour, the cpu
> consumption stays stable at 20%+-2 with CONFIG_HZ=250, when increased to
> CONFIG_HZ=1000 it jumps up and down between 10-20%.
>
> I don't have access to a logic analyzer but my old oscilloscope is
> almost reliable. I could confirm that the spi clock is indeed running at
> the expected 25MHz, but I could observe some gaps of up to 320µs so
> that's 8k spi cycles spent doing something else.
> These gaps were observed on the SPI clock and the macphy interrupt was
> active for the same ammount of time(though this was measured independently
> and not on the same trigger).
> I've been drinking way to much coffe, so soldering is not gonna happen
> today (shaky hands), but if it helps I can solder wires to attach both
> probes to confirm that the gap in the SPI clock happens at the same time
> or not as the interrupt is active.
>
> I'd be keen on hearing what Microchips plans to address. If tracking
> down performance issues is a priority I'll probably not spend any time
> on it, if not then I'll definetly dig into it more.
I tried to reproduce the issue in my setup with your above applications.
But surprisingly I couldn't reproduce the issue you are facing.
One end is Raspberry Pi 4 with lan8651 MAC-PHY and the other end is
Raspberry Pi 4 with EVB-LAN8670-USB Stick.
lan8651 MAC-PHY side:
---------------------
pi@raspberrypi:~/lan865x/v4_mainline $ python3 server.py
connection from: ('192.168.5.101', 46608)
Top output:
-----------
top - 14:28:14 up 1:14, 4 users, load average: 0.68, 0.67, 0.63
Tasks: 201 total, 1 running, 200 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.2 us, 1.1 sy, 0.0 ni, 98.7 id, 0.0 wa, 0.0 hi, 0.1 si,
0.0 st
MiB Mem : 7810.0 total, 7110.5 free, 392.3 used, 432.4
buff/cache
MiB Swap: 100.0 total, 100.0 free, 0.0 used. 7417.7 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+
COMMAND
7219 root -51 0 0 0 0 S 2.7 0.0 1:13.11
oa-tc6-spi-thread
35307 pi 20 0 16280 9216 5248 S 0.7 0.1 0:16.53
python3
88831 pi 20 0 11728 4864 2816 R 0.3 0.1 0:00.22
top
89819 vnc 20 0 2320 384 384 S 0.3 0.0 0:00.04
sh
1 root 20 0 168652 11580 8436 S 0.0 0.1 0:03.85
systemd
2 root 20 0 0 0 0 S 0.0 0.0 0:00.01
kthreadd
3 root 20 0 0 0 0 S 0.0 0.0 0:00.00
pool_workqueue_release
4 root 0 -20 0 0 0 I 0.0 0.0 0:00.00
kworker/R-rcu_g
5 root 0 -20 0 0 0 I 0.0 0.0 0:00.00
kworker/R-rcu_p
6 root 0 -20 0 0 0 I 0.0 0.0 0:00.00
kworker/R-slub_
7 root 0 -20 0 0 0 I 0.0 0.0 0:00.00
kworker/R-netns
11 root 20 0 0 0 0 I 0.0 0.0 0:00.03
kworker/u8:0-ext4-rsv-conversion
12 root 0 -20 0 0 0 I 0.0 0.0 0:00.00
kworker/R-mm_pe
ifconfig output:
----------------
eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 192.168.5.100 netmask 255.255.255.0 broadcast 0.0.0.0
ether 04:05:06:01:02:03 txqueuelen 1000 (Ethernet)
RX packets 1589879 bytes 2391045582 (2.2 GiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 1022419 bytes 71954905 (68.6 MiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
device interrupt 55
dmesg output:
-------------
pi@raspberrypi:~/lan865x/v4_mainline $ dmesg
[ 234.019968] LAN865X Rev.B0 Internal Phy spi0.0:00: attached PHY
driver (mii_bus:phy_addr=spi0.0:00, irq=POLL)
[ 234.069387] lan865x spi0.0 eth1: Link is Up - 10Mbps/Half - flow
control off
EVB-LAN8670-USB stick side:
---------------------------
pi@pv-rpi-tp2:~/microchip/t1s-usb $ python3 client.py 192.168.5.100
ifconfig output:
----------------
eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 192.168.5.101 netmask 255.255.255.0 broadcast 192.168.5.255
ether 00:1e:c0:d1:ca:39 txqueuelen 1000 (Ethernet)
RX packets 1034606 bytes 58330335 (55.6 MiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 1722400 bytes 2628199197 (2.4 GiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
Let me know if you want to try anything more.
Best regards,
Parthiban V
>
> Let me know if anything is unclear or if I can help out with anything
> specific.
>
> R
> I tried to reproduce the issue in my setup with your above applications. > But surprisingly I couldn't reproduce the issue you are facing. > > One end is Raspberry Pi 4 with lan8651 MAC-PHY and the other end is > Raspberry Pi 4 with EVB-LAN8670-USB Stick. > > lan8651 MAC-PHY side: > --------------------- > Did you run both client and server on both ends? The previous tests I have done have only sent traffic in one direction, which did not lead to a live lock. But both sending and receiving as fast as possible in both directions trigger the problems on my end. R
On 02/05/24 3:49 pm, Ramón Nordin Rodriguez wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
>
>> I tried to reproduce the issue in my setup with your above applications.
>> But surprisingly I couldn't reproduce the issue you are facing.
>>
>> One end is Raspberry Pi 4 with lan8651 MAC-PHY and the other end is
>> Raspberry Pi 4 with EVB-LAN8670-USB Stick.
>>
>> lan8651 MAC-PHY side:
>> ---------------------
>>
>
> Did you run both client and server on both ends? The previous tests I
> have done have only sent traffic in one direction, which did not lead to
> a live lock.
> But both sending and receiving as fast as possible in both directions
> trigger the problems on my end.
OK, today I executed both the client and server on both the sides. It is
running more than an hour, it didn't crash. But there are some "Receive
buffer overflow error" time to time in the dmesg. I don't know whether
this is an expected behavior with this type of stress test.
top output:
-----------
pi@raspberrypi:~/lan865x/v4_mainline $ top
top - 19:47:07 up 54 min, 5 users, load average: 1.23, 1.20, 1.14
Tasks: 205 total, 1 running, 204 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.3 us, 7.5 sy, 0.0 ni, 92.1 id, 0.0 wa, 0.0 hi, 0.2 si,
0.0 st
MiB Mem : 7810.0 total, 7013.5 free, 402.6 used, 518.0
buff/cache
MiB Swap: 100.0 total, 100.0 free, 0.0 used. 7407.5 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+
COMMAND
7218 root -51 0 0 0 0 D 28.6 0.0 13:51.12
oa-tc6-spi-thread
63853 vnc 20 0 2320 384 384 S 0.7 0.0 0:00.23
sh
65090 pi 20 0 11728 4992 2816 R 0.7 0.1 0:00.04
top
17 root 20 0 0 0 0 I 0.3 0.0 0:01.74
rcu_preempt
27 root 20 0 0 0 0 S 0.3 0.0 0:16.67
ksoftirqd/2
268 root 20 0 50024 16132 15108 S 0.3 0.2 0:02.02
systemd-journal
956 pi 20 0 699196 37896 29160 S 0.3 0.5 0:03.60
wireplumber
7328 pi 20 0 16280 9216 5248 S 0.3 0.1 0:07.61
python3
1 root 20 0 168692 11340 8320 S 0.0 0.1 0:03.67
systemd
2 root 20 0 0 0 0 S 0.0 0.0 0:00.01
kthreadd
ifconfig output:
----------------
eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 192.168.5.100 netmask 255.255.255.0 broadcast 0.0.0.0
ether 04:05:06:01:02:03 txqueuelen 1000 (Ethernet)
RX packets 1944541 bytes 909191168 (867.0 MiB)
RX errors 0 dropped 2 overruns 0 frame 0
TX packets 2402855 bytes 3129162624 (2.9 GiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
device interrupt 55
dmesg output:
-------------
[ 285.482275] LAN865X Rev.B0 Internal Phy spi0.0:00: attached PHY
driver (mii_bus:phy_addr=spi0.0:00, irq=POLL)
[ 285.534760] lan865x spi0.0 eth1: Link is Up - 10Mbps/Half - flow
control off
[ 341.466221] eth1: Receive buffer overflow error
[ 345.730222] eth1: Receive buffer overflow error
[ 345.891126] eth1: Receive buffer overflow error
[ 346.074220] eth1: Receive buffer overflow error
[ 354.374223] eth1: Receive buffer overflow error
[ 354.375422] eth1: Receive buffer overflow error
[ 355.786219] eth1: Receive buffer overflow error
[ 356.898221] eth1: Receive buffer overflow error
[ 357.782219] eth1: Receive buffer overflow error
[ 357.783194] eth1: Receive buffer overflow error
[ 365.446224] eth1: Receive buffer overflow error
[ 387.722221] eth1: Receive buffer overflow error
[ 387.790218] eth1: Receive buffer overflow error
[ 387.950224] eth1: Receive buffer overflow error
[ 387.951199] eth1: Receive buffer overflow error
[ 388.790219] eth1: Receive buffer overflow error
[ 389.010220] eth1: Receive buffer overflow error
[ 391.066221] eth1: Receive buffer overflow error
[ 409.746225] eth1: Receive buffer overflow error
[ 442.835278] eth1: Receive buffer overflow error
[ 447.410222] eth1: Receive buffer overflow error
[ 447.510220] eth1: Receive buffer overflow error
[ 447.758219] eth1: Receive buffer overflow error
[ 448.066221] eth1: Receive buffer overflow error
[ 449.590220] eth1: Receive buffer overflow error
[ 450.686221] eth1: Receive buffer overflow error
[ 451.238222] eth1: Receive buffer overflow error
[ 451.490220] eth1: Receive buffer overflow error
[ 472.254221] eth1: Receive buffer overflow error
[ 473.302222] eth1: Receive buffer overflow error
[ 474.070220] eth1: Receive buffer overflow error
[ 475.262220] eth1: Receive buffer overflow error
[ 475.362223] eth1: Receive buffer overflow error
[ 477.362224] eth1: Receive buffer overflow error
[ 501.918221] eth1: Receive buffer overflow error
[ 505.510219] eth1: Receive buffer overflow error
[ 516.466220] eth1: Receive buffer overflow error
[ 600.830221] eth1: Receive buffer overflow error
[ 675.614219] eth1: Receive buffer overflow error
[ 676.318223] eth1: Receive buffer overflow error
[ 676.594223] eth1: Receive buffer overflow error
[ 678.702219] eth1: Receive buffer overflow error
[ 679.102219] eth1: Receive buffer overflow error
[ 679.642218] eth1: Receive buffer overflow error
[ 679.894218] eth1: Receive buffer overflow error
[ 679.906224] eth1: Receive buffer overflow error
[ 682.529386] eth1: Receive buffer overflow error
[ 682.786221] eth1: Receive buffer overflow error
[ 703.374220] eth1: Receive buffer overflow error
[ 703.462219] eth1: Receive buffer overflow error
[ 716.606222] eth1: Receive buffer overflow error
[ 717.134219] eth1: Receive buffer overflow error
[ 722.838219] eth1: Receive buffer overflow error
[ 801.522220] eth1: Receive buffer overflow error
[ 871.946226] eth1: Receive buffer overflow error
[ 1304.190220] eth1: Receive buffer overflow error
[ 1307.734221] eth1: Receive buffer overflow error
[ 1309.738224] eth1: Receive buffer overflow error
[ 1605.222219] eth1: Receive buffer overflow error
[ 1739.927121] eth1: Receive buffer overflow error
[ 2240.002213] eth1: Receive buffer overflow error
[ 2241.355378] eth1: Receive buffer overflow error
[ 2300.374218] eth1: Receive buffer overflow error
[ 2332.850214] eth1: Receive buffer overflow error
[ 2347.322215] eth1: Receive buffer overflow error
[ 2377.702216] eth1: Receive buffer overflow error
[ 2393.942214] eth1: Receive buffer overflow error
[ 2404.986214] eth1: Receive buffer overflow error
[ 2407.538216] eth1: Receive buffer overflow error
[ 2407.539190] eth1: Receive buffer overflow error
[ 2411.266213] eth1: Receive buffer overflow error
[ 2421.542220] eth1: Receive buffer overflow error
[ 2713.142214] eth1: Receive buffer overflow error
[ 3401.226215] eth1: Receive buffer overflow error
[ 3494.394214] eth1: Receive buffer overflow error
[ 3511.850214] eth1: Receive buffer overflow error
Best regards,
Parthiban V
> R
>
> [ 285.482275] LAN865X Rev.B0 Internal Phy spi0.0:00: attached PHY > driver (mii_bus:phy_addr=spi0.0:00, irq=POLL) > [ 285.534760] lan865x spi0.0 eth1: Link is Up - 10Mbps/Half - flow > control off > [ 341.466221] eth1: Receive buffer overflow error > [ 345.730222] eth1: Receive buffer overflow error > [ 345.891126] eth1: Receive buffer overflow error > [ 346.074220] eth1: Receive buffer overflow error Generally we only log real errors. Is a receive buffer overflow a real error? I would say not. But it would be good to count it. There was also the open question, what exactly does a receive buffer overflow mean? The spec says: 9.2.8.11 RXBOE Receive Buffer Overflow Error. When set, this bit indicates that the receive buffer (from the network) has overflowed and receive frame data was lost. Which is a bit ambiguous. I would hope it means the receiver has dropped the packet. It will not be passed to the host. But other than maybe count it, i don't think there is anything to do. But Ramón was suggesting you actually drop the frame currently be transferred over the SPI bus? Andrew
> I don't have access to a logic analyzer but my old oscilloscope is > almost reliable. I could confirm that the spi clock is indeed running at > the expected 25MHz, but I could observe some gaps of up to 320µs so > that's 8k spi cycles spent doing something else. > These gaps were observed on the SPI clock and the macphy interrupt was > active for the same ammount of time(though this was measured independently > and not on the same trigger). > I've been drinking way to much coffe, so soldering is not gonna happen > today (shaky hands), but if it helps I can solder wires to attach both > probes to confirm that the gap in the SPI clock happens at the same time > or not as the interrupt is active. What i expect you will see is that the interrupt line goes active. A while later there is an SPI bus transfer to fetch the interrupt status, and when that transfer completes the interrupt should go inactive. But you will need your second channel to confirm this. One question would be, is 320µs reasonable for an interrupt, and one SPI transfer? I would then expect a number of back to back transfers, each around 64 bytes in size, as it gets the received frame. The gaps between those transfers will be interesting. Andrew
> Kernel output: > [ 94.361312] sched: RT throttling activated > > top output: > PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND > 145 root -51 0 0 0 0 R 95.5 0.0 1:11.22 oa-tc6-spi-thread > > link stats: > 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 > link/ether 32:c2:7e:22:93:99 brd ff:ff:ff:ff:ff:ff > RX: bytes packets errors dropped missed mcast > 3371902 7186 0 48 0 0 > RX errors: length crc frame fifo overrun > 0 0 0 0 0 > TX: bytes packets errors dropped carrier collsns > 10341438 8071 0 0 0 0 > TX errors: aborted fifo window heartbt transns > 0 0 0 0 1 > state: > Completly borked, can't ping in or out, bringing the interface down then up > has no effect. Completely borked is not good. > I'd be keen on hearing what Microchips plans to address. If tracking > down performance issues is a priority I'll probably not spend any time > on it, if not then I'll definetly dig into it more. Borked ! = performance issue. This should be addressed. Lets see if the python scripts are enough to be able to reproduce it. We should not rule out differences in SPI drivers. So it might be you need to debug this, if Microchip cannot reproduce it. I would also suggest you enable the "kernel hacking" debug features for finding deadlocks, bad spinlocks, stuck RT tasks etc. That might give some clues. Andrew
On Sun, Apr 28, 2024 at 04:59:40PM +0200, Andrew Lunn wrote: > > n | name | min | avg | max | rx dropped | samples > > 1 | no mod | 827K | 846K | 891K | 945 | 5 > > 2 | no log | 711K | 726K | 744K | 562 | 5 > > 3 | less irq | 815K | 833K | 846K | N/A | 5 > > 4 | no irq | 914K | 924K | 931K | N/A | 5 > > 5 | simple | 857K | 868K | 879K | 615 | 5 > > That is odd. > > Side question: What CONFIG_HZ= do you have? 100, 250, 1000? Try > 1000. I've seen problems where the driver wants to sleep for a short > time, but the CONFIG_HZ value limits how short a time it can actually > sleep. It ends up sleeping much longer than it wants. > Good catch, had it set to 250. I'll rebuild with CONFIG_HZ=1000 and rerun the tests.
> How fast is your SPI bus? Faster than the link speed? Or slower?
>
> It could be different behaviour is needed depending on the SPI bus
> speed. If the SPI bus is faster than the link speed, by some margin,
> the receiver buffer should not overflow, since the CPU can empty the
> buffer faster than it fills.
I'm running at 25MHz, I'm guessing that should translate to fast enough
for the 10MBit half duplex link.
But I'm not sure how the spi clock translates to bps here.
>
> If however, the SPI bus is slower than the link speed, there will be
> buffer overflows, and a reliance on TCP backing off and slowing down.
> The driver should not be spamming the log, since it is going to happen
> and there is nothing that can be done about it.
>
I agree, I think the print could be a DBG if deemed necessary, but there
is also the dropped counter to look at.
> > I tried this patch
> >
> > diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
> > index 9f17f3712137..bd7bd3ef6897 100644
> > --- a/drivers/net/ethernet/oa_tc6.c
> > +++ b/drivers/net/ethernet/oa_tc6.c
> > @@ -615,21 +615,9 @@ static int oa_tc6_sw_reset_macphy(struct oa_tc6 *tc6)
> > return oa_tc6_write_register(tc6, OA_TC6_REG_STATUS0, regval);
> > }
> >
> > -static int oa_tc6_unmask_macphy_error_interrupts(struct oa_tc6 *tc6)
> > +static int oa_tc6_disable_imask0_interrupts(struct oa_tc6 *tc6)
> > {
> > - u32 regval;
> > - int ret;
> > -
> > - ret = oa_tc6_read_register(tc6, OA_TC6_REG_INT_MASK0, ®val);
> > - if (ret)
> > - return ret;
> > -
> > - regval &= ~(INT_MASK0_TX_PROTOCOL_ERR_MASK |
> > - INT_MASK0_RX_BUFFER_OVERFLOW_ERR_MASK |
> > - INT_MASK0_LOSS_OF_FRAME_ERR_MASK |
> > - INT_MASK0_HEADER_ERR_MASK);
> > -
> > - return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, regval);
> > + return oa_tc6_write_register(tc6, OA_TC6_REG_INT_MASK0, (u32)-1);
>
> So this appears to be disabling all error interrupts?
Yes, and I think you are right in that it's an overcorrection. There is
a secondary interrupt mask register as well that is not touched by the
driver, so that's left at whatever the chip defaults to on reset.
>
> This is maybe going too far. Overflow errors are going to happen if
> you have a slow SPI bus. So i probably would not enable that. However,
> are the other errors actually expected in normal usage? If not, leave
> them enabled, because they might indicate real problems.
I'm guessing you are right and that the others actually would be
meningful to log.
There is a nested question here as well, and that is wheter to keep or
drop the code that drops the rx buffer on overflow interrupt.
I think not dropping the full buffer could be one of the reasons for the
perf change.
>
> > Which results in no log spam, ~5-10% higher throughput and no dropped
> > packets when I look at /sys/class/net/eth0/statistics/rx_dropped
>
> You cannot trust rx_dropped because you just disabled the code which
> increments it! The device is probably still dropping packets, and they
> are no longer counted.
I'll curb my enthusiasm a bit :)
>
> It could be the performance increase comes from two places:
>
> 1) Spending time and bus bandwidth dealing with the buffer overflow
> interrupt
>
> 2) Printing out the serial port.
>
I think it's possible that the buffer cleanup triggered after the
overflow interrupt hits could be cause 3) here, but just a guess.
> Please could you benchmark a few things:
>
> 1) Remove the printk("Receive buffer overflow error"), but otherwise
> keep the code the same. That will give us an idea how much the serial
> port matters.
>
> 2) Disable only the RX buffer overflow interrupt
>
> 3) Disable all error interrupts.
>
Thanks for layout it out so clearly. I'll run through the scenarios!
R
On Sat, Apr 27, 2024 at 11:55:13PM +0200, Ramón Nordin Rodriguez wrote: > > How fast is your SPI bus? Faster than the link speed? Or slower? > > > > It could be different behaviour is needed depending on the SPI bus > > speed. If the SPI bus is faster than the link speed, by some margin, > > the receiver buffer should not overflow, since the CPU can empty the > > buffer faster than it fills. > > I'm running at 25MHz, I'm guessing that should translate to fast enough > for the 10MBit half duplex link. > But I'm not sure how the spi clock translates to bps here. That seems plenty fast. Maybe you can get a bus pirate or similar sniffing the bus. Maybe there are big gaps between the transfers for some reason? Or the interrupt controller is very slow? > I'm guessing you are right and that the others actually would be > meningful to log. > There is a nested question here as well, and that is wheter to keep or > drop the code that drops the rx buffer on overflow interrupt. > I think not dropping the full buffer could be one of the reasons for the > perf change. You need to look careful at what a buffer overflow means, as written in the standard. Does it mean a chunk has been dropped from the frame currently being transferred over the SPI bus? If so, you need to drop the frame, because it is missing 64 bytes somewhere. That could happen if the device has very minimal buffering and does cut through. So the frame goes straight to the SPI bus while it is still being received from the line. Or the device could have sufficient buffers to hold a few full frames. It has run out of such buffers while receiving, and so dropped the frame. You never see that frame over SPI because it has already been discarded. If so, linux should not be dropping anything, the device already has. Given your 25Mhz bus speed, i think there at least two things wrong here. Dropping frames is too noise, and potentially doing it wrong. I also think there is something not optimal in your SPI master, because 25MHz should not have trouble with 10Mbps line speed. Andrew
On Thu, Apr 18, 2024 at 06:26:41PM +0530, Parthiban Veerasooran wrote:
> This will unmask the following error interrupts from the MAC-PHY.
> tx protocol error
> rx buffer overflow error
> loss of framing error
> header error
> The MAC-PHY will signal an error by setting the EXST bit in the receive
> data footer which will then allow the host to read the STATUS0 register
> to find the source of the error.
>
> Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Andrew
© 2016 - 2026 Red Hat, Inc.