[PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation

Xin Zhao posted 2 patches 1 month ago
There is a newer version of this series
[PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
Posted by Xin Zhao 1 month ago
In a system with high real-time requirements, the timeout mechanism of
ordinary timers with jiffies granularity is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use hrtimer instead of timer
to help compensate for the shortcomings in real-time performance.
In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
enough, with fluctuations reaching over 8ms (on a system with HZ=250).
This is unacceptable in some high real-time systems that require timely
processing of network packets. By replacing it with hrtimer, if a timeout
of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
3 ms.

Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
Changes in v8:
- Simplify the logic related to setting timeout.

Changes in v7:
- Only update the hrtimer expire time within the hrtimer callback.

Changes in v1:
- Do not add another config for the current changes.

---
 net/packet/af_packet.c | 79 +++++++++---------------------------------
 net/packet/diag.c      |  2 +-
 net/packet/internal.h  |  6 ++--
 3 files changed, 20 insertions(+), 67 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d4eb4a4fe..3e3bb4216 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
 		struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
 		struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
 	return proto;
 }
 
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-	timer_delete_sync(&pkc->retire_blk_timer);
-}
-
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
 		struct sk_buff_head *rb_queue)
 {
 	struct tpacket_kbdq_core *pkc;
 
 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
-	spin_lock_bh(&rb_queue->lock);
-	pkc->delete_blk_timer = 1;
-	spin_unlock_bh(&rb_queue->lock);
-
-	prb_del_retire_blk_timer(pkc);
-}
-
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
-{
-	struct tpacket_kbdq_core *pkc;
-
-	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-		    0);
-	pkc->retire_blk_timer.expires = jiffies;
+	hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
 	p1->version = po->tp_version;
 	po->stats.stats3.tp_freeze_q_cnt = 0;
 	if (req_u->req3.tp_retire_blk_tov)
-		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+		p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
 	else
-		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
-						req_u->req3.tp_block_size);
-	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+						req_u->req3.tp_block_size));
 	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 	rwlock_init(&p1->blk_fill_in_prog_lock);
 
 	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 	prb_init_ft_ops(p1, req_u);
-	prb_setup_retire_blk_timer(po);
+	hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+	hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+		      HRTIMER_MODE_REL_SOFT);
 	prb_open_block(p1, pbd);
 }
 
-/*  Do NOT update the last_blk_num first.
- *  Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-	mod_timer(&pkc->retire_blk_timer,
-			jiffies + pkc->tov_in_jiffies);
-}
-
 /*
  * Timer logic:
  * 1) We refresh the timer only when we open a block.
@@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
  * prb_calc_retire_blk_tmo() calculates the tmo.
  *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
 	struct packet_sock *po =
 		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 	frozen = prb_queue_frozen(pkc);
 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
-	if (unlikely(pkc->delete_blk_timer))
-		goto out;
-
 	/* We only need to plug the race when the block is partially filled.
 	 * tpacket_rcv:
 	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 	}
 
 	if (!frozen) {
-		if (!BLOCK_NUM_PKTS(pbd)) {
-			/* An empty block. Just refresh the timer. */
-			goto refresh_timer;
+		if (BLOCK_NUM_PKTS(pbd)) {
+			/* Not an empty block. Need retire the block. */
+			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+			prb_dispatch_next_block(pkc, po);
 		}
-		prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
-		if (!prb_dispatch_next_block(pkc, po))
-			goto refresh_timer;
-		else
-			goto out;
 	} else {
 		/* Case 1. Queue was frozen because user-space was
 		 * lagging behind.
 		 */
-		if (prb_curr_blk_in_use(pbd)) {
-			/*
-			 * Ok, user-space is still behind.
-			 * So just refresh the timer.
-			 */
-			goto refresh_timer;
-		} else {
+		if (!prb_curr_blk_in_use(pbd)) {
 			/* Case 2. queue was frozen,user-space caught up,
 			 * now the link went idle && the timer fired.
 			 * We don't have a block to close.So we open this
@@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 			 * Thawing/timer-refresh is a side effect.
 			 */
 			prb_open_block(pkc, pbd);
-			goto out;
 		}
 	}
 
-refresh_timer:
-	_prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+	hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
 	spin_unlock(&po->sk.sk_receive_queue.lock);
+	return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 
 	prb_thaw_queue(pkc1);
-	_prb_refresh_rx_retire_blk_timer(pkc1);
 
 	smp_wmb();
 }
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 6ce1dcc28..c8f43e0c1 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
 	pdr.pdr_frame_nr = ring->frame_max + 1;
 
 	if (ver > TPACKET_V2) {
-		pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+		pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
 		pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
 		pdr.pdr_features = ring->prb_bdqc.feature_req_word;
 	} else {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index d367b9f93..f8cfd9213 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
 	unsigned int	feature_req_word;
 	unsigned int	hdrlen;
 	unsigned char	reset_pending_on_curr_blk;
-	unsigned char   delete_blk_timer;
 	unsigned short	kactive_blk_num;
 	unsigned short	blk_sizeof_priv;
 
@@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
 	/* Default is set to 8ms */
 #define DEFAULT_PRB_RETIRE_TOV	(8)
 
-	unsigned short  retire_blk_tov;
+	ktime_t		interval_ktime;
 	unsigned short  version;
-	unsigned long	tov_in_jiffies;
 
 	/* timer to retire an outstanding block */
-	struct timer_list retire_blk_timer;
+	struct hrtimer  retire_blk_timer;
 };
 
 struct pgv {
-- 
2.34.1
Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
Posted by Jason Xing 1 month ago
On Sun, Aug 31, 2025 at 6:09 PM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> In a system with high real-time requirements, the timeout mechanism of
> ordinary timers with jiffies granularity is insufficient to meet the
> demands for real-time performance. Meanwhile, the optimization of CPU
> usage with af_packet is quite significant. Use hrtimer instead of timer
> to help compensate for the shortcomings in real-time performance.
> In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> This is unacceptable in some high real-time systems that require timely
> processing of network packets. By replacing it with hrtimer, if a timeout
> of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> 3 ms.
>
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> ---
> Changes in v8:
> - Simplify the logic related to setting timeout.
>
> Changes in v7:
> - Only update the hrtimer expire time within the hrtimer callback.
>
> Changes in v1:
> - Do not add another config for the current changes.
>
> ---
>  net/packet/af_packet.c | 79 +++++++++---------------------------------
>  net/packet/diag.c      |  2 +-
>  net/packet/internal.h  |  6 ++--
>  3 files changed, 20 insertions(+), 67 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d4eb4a4fe..3e3bb4216 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
>  static int prb_queue_frozen(struct tpacket_kbdq_core *);
>  static void prb_open_block(struct tpacket_kbdq_core *,
>                 struct tpacket_block_desc *);
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
>  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
>  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
>                 struct tpacket3_hdr *);
> @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
>         return proto;
>  }
>
> -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -       timer_delete_sync(&pkc->retire_blk_timer);
> -}
> -
>  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
>                 struct sk_buff_head *rb_queue)
>  {
>         struct tpacket_kbdq_core *pkc;
>
>         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -
> -       spin_lock_bh(&rb_queue->lock);
> -       pkc->delete_blk_timer = 1;
> -       spin_unlock_bh(&rb_queue->lock);
> -
> -       prb_del_retire_blk_timer(pkc);
> -}
> -
> -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> -{
> -       struct tpacket_kbdq_core *pkc;
> -
> -       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> -                   0);
> -       pkc->retire_blk_timer.expires = jiffies;
> +       hrtimer_cancel(&pkc->retire_blk_timer);
>  }
>
>  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
>         p1->version = po->tp_version;
>         po->stats.stats3.tp_freeze_q_cnt = 0;
>         if (req_u->req3.tp_retire_blk_tov)
> -               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> +               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
>         else
> -               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> -                                               req_u->req3.tp_block_size);
> -       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> +               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> +                                               req_u->req3.tp_block_size));
>         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
>         rwlock_init(&p1->blk_fill_in_prog_lock);
>
>         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
>         prb_init_ft_ops(p1, req_u);
> -       prb_setup_retire_blk_timer(po);
> +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> +                     HRTIMER_MODE_REL_SOFT);

You expect to see it start at the setsockopt phase? Even if it's far
from the real use of recv at the moment.

>         prb_open_block(p1, pbd);
>  }
>
> -/*  Do NOT update the last_blk_num first.
> - *  Assumes sk_buff_head lock is held.
> - */
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -       mod_timer(&pkc->retire_blk_timer,
> -                       jiffies + pkc->tov_in_jiffies);
> -}
> -
>  /*
>   * Timer logic:
>   * 1) We refresh the timer only when we open a block.
> @@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
>   * prb_calc_retire_blk_tmo() calculates the tmo.
>   *
>   */
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
>  {
>         struct packet_sock *po =
>                 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> @@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>         frozen = prb_queue_frozen(pkc);
>         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
>
> -       if (unlikely(pkc->delete_blk_timer))
> -               goto out;
> -
>         /* We only need to plug the race when the block is partially filled.
>          * tpacket_rcv:
>          *              lock(); increment BLOCK_NUM_PKTS; unlock()
> @@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>         }
>
>         if (!frozen) {
> -               if (!BLOCK_NUM_PKTS(pbd)) {
> -                       /* An empty block. Just refresh the timer. */
> -                       goto refresh_timer;
> +               if (BLOCK_NUM_PKTS(pbd)) {
> +                       /* Not an empty block. Need retire the block. */
> +                       prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> +                       prb_dispatch_next_block(pkc, po);
>                 }
> -               prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> -               if (!prb_dispatch_next_block(pkc, po))
> -                       goto refresh_timer;
> -               else
> -                       goto out;
>         } else {
>                 /* Case 1. Queue was frozen because user-space was
>                  * lagging behind.
>                  */
> -               if (prb_curr_blk_in_use(pbd)) {
> -                       /*
> -                        * Ok, user-space is still behind.
> -                        * So just refresh the timer.
> -                        */
> -                       goto refresh_timer;
> -               } else {
> +               if (!prb_curr_blk_in_use(pbd)) {
>                         /* Case 2. queue was frozen,user-space caught up,
>                          * now the link went idle && the timer fired.
>                          * We don't have a block to close.So we open this
> @@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>                          * Thawing/timer-refresh is a side effect.
>                          */
>                         prb_open_block(pkc, pbd);
> -                       goto out;
>                 }
>         }
>
> -refresh_timer:
> -       _prb_refresh_rx_retire_blk_timer(pkc);
> -
> -out:
> +       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
>         spin_unlock(&po->sk.sk_receive_queue.lock);
> +       return HRTIMER_RESTART;
>  }
>
>  static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
>         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
>
>         prb_thaw_queue(pkc1);
> -       _prb_refresh_rx_retire_blk_timer(pkc1);

Could you say more on why you remove this here and only reset/update
the expiry time in the timer handler? Probably I missed something
appearing in the previous long discussion.

>
>         smp_wmb();
>  }
> diff --git a/net/packet/diag.c b/net/packet/diag.c
> index 6ce1dcc28..c8f43e0c1 100644
> --- a/net/packet/diag.c
> +++ b/net/packet/diag.c
> @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
>         pdr.pdr_frame_nr = ring->frame_max + 1;
>
>         if (ver > TPACKET_V2) {
> -               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
> +               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
>                 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
>                 pdr.pdr_features = ring->prb_bdqc.feature_req_word;
>         } else {
> diff --git a/net/packet/internal.h b/net/packet/internal.h
> index d367b9f93..f8cfd9213 100644
> --- a/net/packet/internal.h
> +++ b/net/packet/internal.h
> @@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
>         unsigned int    feature_req_word;
>         unsigned int    hdrlen;
>         unsigned char   reset_pending_on_curr_blk;
> -       unsigned char   delete_blk_timer;
>         unsigned short  kactive_blk_num;
>         unsigned short  blk_sizeof_priv;
>
> @@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
>         /* Default is set to 8ms */
>  #define DEFAULT_PRB_RETIRE_TOV (8)
>
> -       unsigned short  retire_blk_tov;
> +       ktime_t         interval_ktime;
>         unsigned short  version;
> -       unsigned long   tov_in_jiffies;
>
>         /* timer to retire an outstanding block */
> -       struct timer_list retire_blk_timer;
> +       struct hrtimer  retire_blk_timer;
>  };

The whole structure needs a new organization?

Before:
        /* size: 152, cachelines: 3, members: 22 */
        /* sum members: 144, holes: 2, sum holes: 8 */
        /* paddings: 1, sum paddings: 4 */
        /* last cacheline: 24 bytes */
After:
        /* size: 176, cachelines: 3, members: 19 */
        /* sum members: 163, holes: 4, sum holes: 13 */
        /* paddings: 1, sum paddings: 4 */
        /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
        /* last cacheline: 48 bytes */

Thanks,
Jason

>
>  struct pgv {
> --
> 2.34.1
>
>
Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
Posted by Jason Xing 1 month ago
On Tue, Sep 2, 2025 at 11:43 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Sun, Aug 31, 2025 at 6:09 PM Xin Zhao <jackzxcui1989@163.com> wrote:
> >
> > In a system with high real-time requirements, the timeout mechanism of
> > ordinary timers with jiffies granularity is insufficient to meet the
> > demands for real-time performance. Meanwhile, the optimization of CPU
> > usage with af_packet is quite significant. Use hrtimer instead of timer
> > to help compensate for the shortcomings in real-time performance.
> > In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> > enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> > This is unacceptable in some high real-time systems that require timely
> > processing of network packets. By replacing it with hrtimer, if a timeout
> > of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> > 3 ms.
> >
> > Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> > ---
> > Changes in v8:
> > - Simplify the logic related to setting timeout.
> >
> > Changes in v7:
> > - Only update the hrtimer expire time within the hrtimer callback.
> >
> > Changes in v1:
> > - Do not add another config for the current changes.
> >
> > ---
> >  net/packet/af_packet.c | 79 +++++++++---------------------------------
> >  net/packet/diag.c      |  2 +-
> >  net/packet/internal.h  |  6 ++--
> >  3 files changed, 20 insertions(+), 67 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index d4eb4a4fe..3e3bb4216 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
> >  static int prb_queue_frozen(struct tpacket_kbdq_core *);
> >  static void prb_open_block(struct tpacket_kbdq_core *,
> >                 struct tpacket_block_desc *);
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
> >  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
> >  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
> >                 struct tpacket3_hdr *);
> > @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
> >         return proto;
> >  }
> >
> > -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > -       timer_delete_sync(&pkc->retire_blk_timer);
> > -}
> > -
> >  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
> >                 struct sk_buff_head *rb_queue)
> >  {
> >         struct tpacket_kbdq_core *pkc;
> >
> >         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > -
> > -       spin_lock_bh(&rb_queue->lock);
> > -       pkc->delete_blk_timer = 1;

One more review comment from my side: the removal of delete_blk_timer
deserves a clarification in the commit message.

> > -       spin_unlock_bh(&rb_queue->lock);
> > -
> > -       prb_del_retire_blk_timer(pkc);
> > -}
> > -
> > -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> > -{
> > -       struct tpacket_kbdq_core *pkc;
> > -
> > -       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > -       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > -                   0);
> > -       pkc->retire_blk_timer.expires = jiffies;
> > +       hrtimer_cancel(&pkc->retire_blk_timer);
> >  }
> >
> >  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> > @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
> >         p1->version = po->tp_version;
> >         po->stats.stats3.tp_freeze_q_cnt = 0;
> >         if (req_u->req3.tp_retire_blk_tov)
> > -               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> > +               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
> >         else
> > -               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > -                                               req_u->req3.tp_block_size);
> > -       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > +               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > +                                               req_u->req3.tp_block_size));
> >         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
> >         rwlock_init(&p1->blk_fill_in_prog_lock);
> >
> >         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
> >         prb_init_ft_ops(p1, req_u);
> > -       prb_setup_retire_blk_timer(po);
> > +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> > +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> > +                     HRTIMER_MODE_REL_SOFT);
>
> You expect to see it start at the setsockopt phase? Even if it's far
> from the real use of recv at the moment.
>
> >         prb_open_block(p1, pbd);
> >  }
> >
> > -/*  Do NOT update the last_blk_num first.
> > - *  Assumes sk_buff_head lock is held.
> > - */
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > -       mod_timer(&pkc->retire_blk_timer,
> > -                       jiffies + pkc->tov_in_jiffies);
> > -}
> > -
> >  /*
> >   * Timer logic:
> >   * 1) We refresh the timer only when we open a block.
> > @@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> >   * prb_calc_retire_blk_tmo() calculates the tmo.
> >   *
> >   */
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
> >  {
> >         struct packet_sock *po =
> >                 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> > @@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >         frozen = prb_queue_frozen(pkc);
> >         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
> >
> > -       if (unlikely(pkc->delete_blk_timer))
> > -               goto out;
> > -
> >         /* We only need to plug the race when the block is partially filled.
> >          * tpacket_rcv:
> >          *              lock(); increment BLOCK_NUM_PKTS; unlock()
> > @@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >         }
> >
> >         if (!frozen) {
> > -               if (!BLOCK_NUM_PKTS(pbd)) {
> > -                       /* An empty block. Just refresh the timer. */
> > -                       goto refresh_timer;
> > +               if (BLOCK_NUM_PKTS(pbd)) {
> > +                       /* Not an empty block. Need retire the block. */
> > +                       prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > +                       prb_dispatch_next_block(pkc, po);
> >                 }
> > -               prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > -               if (!prb_dispatch_next_block(pkc, po))
> > -                       goto refresh_timer;
> > -               else
> > -                       goto out;
> >         } else {
> >                 /* Case 1. Queue was frozen because user-space was
> >                  * lagging behind.
> >                  */
> > -               if (prb_curr_blk_in_use(pbd)) {
> > -                       /*
> > -                        * Ok, user-space is still behind.
> > -                        * So just refresh the timer.
> > -                        */
> > -                       goto refresh_timer;
> > -               } else {
> > +               if (!prb_curr_blk_in_use(pbd)) {
> >                         /* Case 2. queue was frozen,user-space caught up,
> >                          * now the link went idle && the timer fired.
> >                          * We don't have a block to close.So we open this
> > @@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >                          * Thawing/timer-refresh is a side effect.
> >                          */
> >                         prb_open_block(pkc, pbd);
> > -                       goto out;
> >                 }
> >         }
> >
> > -refresh_timer:
> > -       _prb_refresh_rx_retire_blk_timer(pkc);
> > -
> > -out:
> > +       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
> >         spin_unlock(&po->sk.sk_receive_queue.lock);
> > +       return HRTIMER_RESTART;
> >  }
> >
> >  static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> > @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
> >         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
> >
> >         prb_thaw_queue(pkc1);
> > -       _prb_refresh_rx_retire_blk_timer(pkc1);
>
> Could you say more on why you remove this here and only reset/update
> the expiry time in the timer handler? Probably I missed something
> appearing in the previous long discussion.

I am gradually coming to understand the thinking behind this modification.
You're trying to move the timer operation out of prb_open_block() and
then spread it into each caller.

Did you perhaps miss the following call trace?
packet_current_rx_frame() -> __packet_lookup_frame_in_block() ->
prb_open_block() -> _prb_refresh_rx_retire_blk_timer()

May I ask why you are introducing so many changes like this instead of
leaving it as-is?

Thanks,
Jason

>
> >
> >         smp_wmb();
> >  }
> > diff --git a/net/packet/diag.c b/net/packet/diag.c
> > index 6ce1dcc28..c8f43e0c1 100644
> > --- a/net/packet/diag.c
> > +++ b/net/packet/diag.c
> > @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
> >         pdr.pdr_frame_nr = ring->frame_max + 1;
> >
> >         if (ver > TPACKET_V2) {
> > -               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
> > +               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
> >                 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
> >                 pdr.pdr_features = ring->prb_bdqc.feature_req_word;
> >         } else {
> > diff --git a/net/packet/internal.h b/net/packet/internal.h
> > index d367b9f93..f8cfd9213 100644
> > --- a/net/packet/internal.h
> > +++ b/net/packet/internal.h
> > @@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
> >         unsigned int    feature_req_word;
> >         unsigned int    hdrlen;
> >         unsigned char   reset_pending_on_curr_blk;
> > -       unsigned char   delete_blk_timer;
> >         unsigned short  kactive_blk_num;
> >         unsigned short  blk_sizeof_priv;
> >
> > @@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
> >         /* Default is set to 8ms */
> >  #define DEFAULT_PRB_RETIRE_TOV (8)
> >
> > -       unsigned short  retire_blk_tov;
> > +       ktime_t         interval_ktime;
> >         unsigned short  version;
> > -       unsigned long   tov_in_jiffies;
> >
> >         /* timer to retire an outstanding block */
> > -       struct timer_list retire_blk_timer;
> > +       struct hrtimer  retire_blk_timer;
> >  };
>
> The whole structure needs a new organization?
>
> Before:
>         /* size: 152, cachelines: 3, members: 22 */
>         /* sum members: 144, holes: 2, sum holes: 8 */
>         /* paddings: 1, sum paddings: 4 */
>         /* last cacheline: 24 bytes */
> After:
>         /* size: 176, cachelines: 3, members: 19 */
>         /* sum members: 163, holes: 4, sum holes: 13 */
>         /* paddings: 1, sum paddings: 4 */
>         /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
>         /* last cacheline: 48 bytes */
>
> Thanks,
> Jason
>
> >
> >  struct pgv {
> > --
> > 2.34.1
> >
> >
Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
Posted by Willem de Bruijn 1 month ago
Xin Zhao wrote:
> In a system with high real-time requirements, the timeout mechanism of
> ordinary timers with jiffies granularity is insufficient to meet the
> demands for real-time performance. Meanwhile, the optimization of CPU
> usage with af_packet is quite significant. Use hrtimer instead of timer
> to help compensate for the shortcomings in real-time performance.
> In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> This is unacceptable in some high real-time systems that require timely
> processing of network packets. By replacing it with hrtimer, if a timeout
> of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> 3 ms.
> 
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>

Tiny style point that is probably not worth respinning for.

Otherwise

Reviewed-by: Willem de Bruijn <willemb@google.com>



> ---
> Changes in v8:
> - Simplify the logic related to setting timeout.
> 
> Changes in v7:
> - Only update the hrtimer expire time within the hrtimer callback.
> 
> Changes in v1:
> - Do not add another config for the current changes.
> 
> ---
>  net/packet/af_packet.c | 79 +++++++++---------------------------------
>  net/packet/diag.c      |  2 +-
>  net/packet/internal.h  |  6 ++--
>  3 files changed, 20 insertions(+), 67 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d4eb4a4fe..3e3bb4216 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
>  static int prb_queue_frozen(struct tpacket_kbdq_core *);
>  static void prb_open_block(struct tpacket_kbdq_core *,
>  		struct tpacket_block_desc *);
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
>  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
>  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
>  		struct tpacket3_hdr *);
> @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
>  	return proto;
>  }
>  
> -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -	timer_delete_sync(&pkc->retire_blk_timer);
> -}
> -
>  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
>  		struct sk_buff_head *rb_queue)
>  {
>  	struct tpacket_kbdq_core *pkc;
>  
>  	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -
> -	spin_lock_bh(&rb_queue->lock);
> -	pkc->delete_blk_timer = 1;
> -	spin_unlock_bh(&rb_queue->lock);
> -
> -	prb_del_retire_blk_timer(pkc);
> -}
> -
> -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> -{
> -	struct tpacket_kbdq_core *pkc;
> -
> -	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> -		    0);
> -	pkc->retire_blk_timer.expires = jiffies;
> +	hrtimer_cancel(&pkc->retire_blk_timer);
>  }
>  
>  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
>  	p1->version = po->tp_version;
>  	po->stats.stats3.tp_freeze_q_cnt = 0;
>  	if (req_u->req3.tp_retire_blk_tov)
> -		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> +		p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
>  	else
> -		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> -						req_u->req3.tp_block_size);
> -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> +		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> +						req_u->req3.tp_block_size));

req_u is not aligned with the line above.