drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
Utilize kernel prefetch methods for faster cache line access.
This change boosts driver performance,
allowing the CPU to handle about 5% more packets/sec.
Signed-off-by: Elad Yifee <eladwf@gmail.com>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 0cc2dd85652f..1a0704166103 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1963,6 +1963,7 @@ static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring,
if (!prog)
goto out;
+ prefetchw(xdp->data_hard_start);
act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
@@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
rxd = ring->dma + idx * eth->soc->rx.desc_size;
data = ring->data[idx];
-
+ prefetch(rxd);
if (!mtk_rx_get_desc(eth, &trxd, rxd))
break;
@@ -2105,6 +2106,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
if (ret != XDP_PASS)
goto skip_rx;
+ net_prefetch(xdp.data_meta);
skb = build_skb(data, PAGE_SIZE);
if (unlikely(!skb)) {
page_pool_put_full_page(ring->page_pool,
@@ -2113,6 +2115,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
goto skip_rx;
}
+ prefetchw(skb->data);
skb_reserve(skb, xdp.data - xdp.data_hard_start);
skb_put(skb, xdp.data_end - xdp.data);
skb_mark_for_recycle(skb);
@@ -2143,6 +2146,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
dma_unmap_single(eth->dma_dev, ((u64)trxd.rxd1 | addr64),
ring->buf_size, DMA_FROM_DEVICE);
+ net_prefetch(data);
skb = build_skb(data, ring->frag_size);
if (unlikely(!skb)) {
netdev->stats.rx_dropped++;
@@ -2150,6 +2154,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
goto skip_rx;
}
+ prefetchw(skb->data);
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
skb_put(skb, pktlen);
}
--
2.45.2
On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote: > Utilize kernel prefetch methods for faster cache line access. > This change boosts driver performance, > allowing the CPU to handle about 5% more packets/sec. Nit: It'd be great to see before/after numbers and/or an explanation of how you measured this in the commit message. > Signed-off-by: Elad Yifee <eladwf@gmail.com> > --- > drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 ++++++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c > index 0cc2dd85652f..1a0704166103 100644 > --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c > +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c > @@ -1963,6 +1963,7 @@ static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring, > if (!prog) > goto out; > > + prefetchw(xdp->data_hard_start); Is there any reason to mix net_prefetch (as you have below) with prefetch and prefetchw ? IMHO: you should consider using net_prefetch and net_prefetchw everywhere instead of using both in your code. > act = bpf_prog_run_xdp(prog, xdp); > switch (act) { > case XDP_PASS: > @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); > rxd = ring->dma + idx * eth->soc->rx.desc_size; > data = ring->data[idx]; > - > + prefetch(rxd); Maybe net_prefetch instead, as mentioned above? > if (!mtk_rx_get_desc(eth, &trxd, rxd)) > break; > > @@ -2105,6 +2106,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > if (ret != XDP_PASS) > goto skip_rx; > > + net_prefetch(xdp.data_meta); > skb = build_skb(data, PAGE_SIZE); > if (unlikely(!skb)) { > page_pool_put_full_page(ring->page_pool, > @@ -2113,6 +2115,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > goto skip_rx; > } > > + prefetchw(skb->data); Maybe net_prefetchw instead, as mentioned above? 
> skb_reserve(skb, xdp.data - xdp.data_hard_start); > skb_put(skb, xdp.data_end - xdp.data); > skb_mark_for_recycle(skb); > @@ -2143,6 +2146,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > dma_unmap_single(eth->dma_dev, ((u64)trxd.rxd1 | addr64), > ring->buf_size, DMA_FROM_DEVICE); > > + net_prefetch(data); > skb = build_skb(data, ring->frag_size); > if (unlikely(!skb)) { > netdev->stats.rx_dropped++; > @@ -2150,6 +2154,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > goto skip_rx; > } > > + prefetchw(skb->data); Maybe net_prefetchw instead, as mentioned above?
On Mon, Jul 22, 2024 at 7:17 PM Joe Damato <jdamato@fastly.com> wrote: > > On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote: > > Utilize kernel prefetch methods for faster cache line access. > > This change boosts driver performance, > > allowing the CPU to handle about 5% more packets/sec. > > Nit: It'd be great to see before/after numbers and/or an explanation of > how you measured this in the commit message. Sure, I'll add iperf3 results in the next version. > Is there any reason to mix net_prefetch (as you have below) with > prefetch and prefetchw ? > > IMHO: you should consider using net_prefetch and net_prefetchw > everywhere instead of using both in your code. You are right, honestly I didn't notice it exists. I'll replace all prefetchw with net_prefetchw. > > @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > > idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); > > rxd = ring->dma + idx * eth->soc->rx.desc_size; > > data = ring->data[idx]; > > - > > + prefetch(rxd); > > Maybe net_prefetch instead, as mentioned above? This is the only case where I think prefetch should be used since it's only the descriptor. Thank you for your suggestions
On Mon, Jul 22, 2024 at 09:04:06PM +0300, Elad Yifee wrote: > On Mon, Jul 22, 2024 at 7:17 PM Joe Damato <jdamato@fastly.com> wrote: > > > > On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote: > > > Utilize kernel prefetch methods for faster cache line access. > > > This change boosts driver performance, > > > allowing the CPU to handle about 5% more packets/sec. > > > > Nit: It'd be great to see before/after numbers and/or an explanation of > > how you measured this in the commit message. > Sure, I'll add iperf3 results in the next version. Thanks, that'd be helpful! [...] > > > @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, > > > idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); > > > rxd = ring->dma + idx * eth->soc->rx.desc_size; > > > data = ring->data[idx]; > > > - > > > + prefetch(rxd); > > > > Maybe net_prefetch instead, as mentioned above? > This is the only case where I think prefetch should be used since it's > only the descriptor. I think you are implying that the optimization in the case of L1_CACHE_BYTES < 128 is unnecessary because the mtk_rx_dma_v2 descriptors will be too far (i * eth->soc->rx.desc_size) apart to get any benefit from prefetching more data? If my understanding is correct, then yes: I agree. > Thank you for your suggestions No problem!
© 2016 - 2024 Red Hat, Inc.