[PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup

Posted by Simon Schippers 1 month ago
Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
and wake the corresponding netdev subqueue when consuming an entry frees
space in the underlying ptr_ring.

Stopping the netdev queue when the ptr_ring is full will be introduced
in an upcoming commit.

Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
 drivers/net/tap.c | 23 ++++++++++++++++++++++-
 drivers/net/tun.c | 25 +++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1197f245e873..2442cf7ac385 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	return ret ? ret : total;
 }
 
+static void *tap_ring_consume(struct tap_queue *q)
+{
+	struct ptr_ring *ring = &q->ring;
+	struct net_device *dev;
+	void *ptr;
+
+	spin_lock(&ring->consumer_lock);
+
+	ptr = __ptr_ring_consume(ring);
+	if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
+		rcu_read_lock();
+		dev = rcu_dereference(q->tap)->dev;
+		netif_wake_subqueue(dev, q->queue_index);
+		rcu_read_unlock();
+	}
+
+	spin_unlock(&ring->consumer_lock);
+
+	return ptr;
+}
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
@@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
 					TASK_INTERRUPTIBLE);
 
 		/* Read frames from the queue */
-		skb = ptr_ring_consume(&q->ring);
+		skb = tap_ring_consume(q);
 		if (skb)
 			break;
 		if (noblock) {
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8192740357a0..7148f9a844a4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
+static void *tun_ring_consume(struct tun_file *tfile)
+{
+	struct ptr_ring *ring = &tfile->tx_ring;
+	struct net_device *dev;
+	void *ptr;
+
+	spin_lock(&ring->consumer_lock);
+
+	ptr = __ptr_ring_consume(ring);
+	if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
+		rcu_read_lock();
+		dev = rcu_dereference(tfile->tun)->dev;
+		netif_wake_subqueue(dev, tfile->queue_index);
+		rcu_read_unlock();
+	}
+
+	spin_unlock(&ring->consumer_lock);
+
+	return ptr;
+}
+
 static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	void *ptr = NULL;
 	int error = 0;
 
-	ptr = ptr_ring_consume(&tfile->tx_ring);
+	ptr = tun_ring_consume(tfile);
 	if (ptr)
 		goto out;
 	if (noblock) {
@@ -2131,7 +2152,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		ptr = ptr_ring_consume(&tfile->tx_ring);
+		ptr = tun_ring_consume(tfile);
 		if (ptr)
 			break;
 		if (signal_pending(current)) {
-- 
2.43.0
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 1 month ago
On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> [...]
>
> +static void *tun_ring_consume(struct tun_file *tfile)
> +{
> +       struct ptr_ring *ring = &tfile->tx_ring;
> +       struct net_device *dev;
> +       void *ptr;
> +
> +       spin_lock(&ring->consumer_lock);
> +
> +       ptr = __ptr_ring_consume(ring);
> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {

I guess it's the "bug" I mentioned in the previous patch that leads to
the check of __ptr_ring_consume_created_space() here. If so, that's
another argument for tweaking the current API.
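
If so, one possible tweak (purely illustrative; the name below is made
up, not existing API) would be to fold the space check into the consume
call itself:

static inline void *__ptr_ring_consume_check(struct ptr_ring *r,
					     bool *created_space)
{
	void *ptr = __ptr_ring_consume(r);

	/* Report whether this consume opened space visible to the
	 * producer, so callers do not need a second helper call.
	 */
	*created_space = ptr && __ptr_ring_consume_created_space(r, 1);
	return ptr;
}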

> +               rcu_read_lock();
> +               dev = rcu_dereference(tfile->tun)->dev;
> +               netif_wake_subqueue(dev, tfile->queue_index);

This would cause the producer TX_SOFTIRQ to run on the same cpu which
I'm not sure is what we want.
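
For context, this is roughly the wake path (paraphrased and trimmed
from net/core/dev.c in mainline, so treat it as a sketch): the softirq
is raised on whichever CPU calls the wake, i.e. the consumer's CPU.

/* Paraphrased from net/core/dev.c; not part of this patch. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
		__netif_schedule(dev_queue->qdisc);
}

static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	/* Raised on the local (consumer) CPU, not the producer's. */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}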

> +               rcu_read_unlock();
> +       }

Btw, this function duplicates a lot of the logic of tap_ring_consume();
we should consider merging the two.
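
A rough, untested sketch of what a shared helper might look like
(ring_consume_wake() and ring_get_dev_t are made-up names; each driver
would pass its own net_device lookup):

typedef struct net_device *(*ring_get_dev_t)(void *ctx);

static void *ring_consume_wake(struct ptr_ring *ring, void *ctx,
			       ring_get_dev_t get_dev, u16 queue_index)
{
	struct net_device *dev;
	void *ptr;

	spin_lock(&ring->consumer_lock);

	ptr = __ptr_ring_consume(ring);
	if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
		rcu_read_lock();
		/* tun and tap resolve the net_device differently, so
		 * the caller supplies the lookup; it may return NULL
		 * if the device is already gone.
		 */
		dev = get_dev(ctx);
		if (dev)
			netif_wake_subqueue(dev, queue_index);
		rcu_read_unlock();
	}

	spin_unlock(&ring->consumer_lock);

	return ptr;
}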

> +
> +       spin_unlock(&ring->consumer_lock);
> +
> +       return ptr;
> +}
> +
>  static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>  {
>         DECLARE_WAITQUEUE(wait, current);
>         void *ptr = NULL;
>         int error = 0;
>
> -       ptr = ptr_ring_consume(&tfile->tx_ring);
> +       ptr = tun_ring_consume(tfile);

I'm not sure having a separate patch like this helps. For example,
it will introduce a performance regression.

>
> [...]

Thanks
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 month ago
On 1/8/26 04:38, Jason Wang wrote:
> [...]
>
>> +               rcu_read_lock();
>> +               dev = rcu_dereference(tfile->tun)->dev;
>> +               netif_wake_subqueue(dev, tfile->queue_index);
> 
> This would cause the producer TX_SOFTIRQ to run on the same cpu which
> I'm not sure is what we want.

What else would you suggest calling to wake the queue?

> 
>> +               rcu_read_unlock();
>> +       }
> 
> Btw, this function duplicates a lot of the logic of tap_ring_consume();
> we should consider merging the two.

Yes, it is largely the same approach, but it would require accessing the
net_device each time.

> [...]
>
>> -       ptr = ptr_ring_consume(&tfile->tx_ring);
>> +       ptr = tun_ring_consume(tfile);
> 
> I'm not sure having a separate patch like this helps. For example,
> it will introduce a performance regression.

I ran benchmarks for the whole patch set with noqueue (where the queue is
not stopped to preserve the old behavior), as described in the cover
letter, and observed no performance regression. This leads me to conclude
that there is no performance impact because of this patch when the queue
is not stopped.

> [...]
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 1 month ago
On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> [...]
>
> >> +               rcu_read_lock();
> >> +               dev = rcu_dereference(tfile->tun)->dev;
> >> +               netif_wake_subqueue(dev, tfile->queue_index);
> >
> > This would cause the producer TX_SOFTIRQ to run on the same cpu which
> > I'm not sure is what we want.
>
> What else would you suggest calling to wake the queue?

I don't have a good method in mind; I just want to point out its implications.

>
> >
> >> +               rcu_read_unlock();
> >> +       }
> >
> > Btw, this function duplicates a lot of the logic of tap_ring_consume();
> > we should consider merging the two.
>
> Yes, it is largely the same approach, but it would require accessing the
> net_device each time.

The problem is that, at least for TUN, the socket is only loosely
coupled with the netdev: the netdev can go away while the socket still
exists. That's why vhost only talks to the socket, not the netdev. If
we really want to go this way, we should at least check the existence
of tun->dev here first.
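
As a minimal, untested sketch of that check (assuming
rcu_dereference(tfile->tun) yields NULL once the device is detached):

	ptr = __ptr_ring_consume(ring);
	if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
		struct tun_struct *tun;

		rcu_read_lock();
		tun = rcu_dereference(tfile->tun);
		/* The netdev may be gone while the socket, and thus
		 * the ring, is still alive; only wake the subqueue if
		 * the device is still attached.
		 */
		if (tun)
			netif_wake_subqueue(tun->dev, tfile->queue_index);
		rcu_read_unlock();
	}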

>
> [...]
>
> >> -       ptr = ptr_ring_consume(&tfile->tx_ring);
> >> +       ptr = tun_ring_consume(tfile);
> >
> > I'm not sure having a separate patch like this helps. For example,
> > it will introduce a performance regression.
>
> I ran benchmarks for the whole patch set with noqueue (where the queue is
> not stopped to preserve the old behavior), as described in the cover
> letter, and observed no performance regression. This leads me to conclude
> that there is no performance impact because of this patch when the queue
> is not stopped.

Have you run a benchmark per patch? It might just be that the
regression is not obvious. But at the least this patch introduces more
atomic operations, or it might just be that TUN doesn't support
bursting, so pktgen can't reach its best PPS.

Thanks


Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 2 weeks, 5 days ago
On 1/9/26 07:02, Jason Wang wrote:
> [...]
>
>>>> +               rcu_read_lock();
>>>> +               dev = rcu_dereference(tfile->tun)->dev;
>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
>>>
>>> This would cause the producer TX_SOFTIRQ to run on the same cpu which
>>> I'm not sure is what we want.
>>
>> What else would you suggest calling to wake the queue?
> 
> I don't have a good method in mind; I just want to point out its implications.

I have to admit I'm a bit stuck at this point, particularly with this
aspect.

What is the correct way to pass the producer CPU ID to the consumer?
Would it make sense to store smp_processor_id() in the tfile inside
tun_net_xmit(), or should it instead be stored in the skb (similar to the
XDP bit)? In the latter case, my concern is that this information may
already be significantly outdated by the time it is used.

Based on that, my idea would be for the consumer to wake the producer by
invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
smp_call_function_single().
Is this a reasonable approach?
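
As a rough, untested sketch of that idea (producer_cpu and
tun_wake_queue() are made-up names, not existing API):

/* In tun_net_xmit(), before producing into the ptr_ring: */
WRITE_ONCE(tfile->producer_cpu, smp_processor_id());

/* IPI callback; runs on the recorded producer CPU. */
static void tun_wake_queue(void *data)
{
	struct tun_file *tfile = data;
	struct tun_struct *tun;

	rcu_read_lock();
	tun = rcu_dereference(tfile->tun);
	if (tun)
		netif_wake_subqueue(tun->dev, tfile->queue_index);
	rcu_read_unlock();
}

/* Consumer side, replacing the direct netif_wake_subqueue() call: */
smp_call_function_single(READ_ONCE(tfile->producer_cpu),
			 tun_wake_queue, tfile, 0);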

More generally, would triggering TX_SOFTIRQ on the consumer CPU be
considered a deal-breaker for the patch set?

Thanks!

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 2 weeks, 4 days ago
On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> [...]
>
> I have to admit I'm a bit stuck at this point, particularly with this
> aspect.
>
> What is the correct way to pass the producer CPU ID to the consumer?
> Would it make sense to store smp_processor_id() in the tfile inside
> tun_net_xmit(), or should it instead be stored in the skb (similar to the
> XDP bit)? In the latter case, my concern is that this information may
> already be significantly outdated by the time it is used.
>
> Based on that, my idea would be for the consumer to wake the producer by
> invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
> smp_call_function_single().
> Is this a reasonable approach?

I'm not sure, but it would introduce costs like IPIs.

>
> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
> considered a deal-breaker for the patch set?

It depends on whether or not it affects performance, especially when
vhost is pinned.

Thanks

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 2 weeks, 3 days ago
On Thu, Jan 22, 2026 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
>
> [...]
>
> >
> > More generally, would triggering TX_SOFTIRQ on the consumer CPU be
> > considered a deal-breaker for the patch set?
>
> It depends on whether or not it affects performance, especially when
> vhost is pinned.

I meant we can benchmark to see the impact. For example, pin vhost to
a specific CPU and then try to see the impact of the TX_SOFTIRQ.

Thanks
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 2 weeks, 3 days ago
On 1/23/26 04:05, Jason Wang wrote:
> [...]
>
>>>
>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
>>> considered a deal-breaker for the patch set?
>>
>> It depends on whether or not it affects performance, especially when
>> vhost is pinned.
> 
> I meant we can benchmark to see the impact. For example, pin vhost to
> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
> 
> Thanks
> 

I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
for both the stock and patched versions. The benchmarks were run with
the full patch series applied, since testing only patches 1-3 would not
be meaningful - the queue is never stopped in that case, so no
TX_SOFTIRQ is triggered.

Compared to the non-pinned CPU benchmarks in the cover letter,
performance is lower for pktgen with a single thread but higher with
four threads. The results show no regression for the patched version,
with even slight performance improvements observed:

+-------------------------+-----------+----------------+
| pktgen benchmarks to    | Stock     | Patched with   |
| Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
| 100M packets            |           |                |
| vhost pinned to core 0  |           |                |
+-----------+-------------+-----------+----------------+
| TAP       | Transmitted | 452 Kpps  | 454 Kpps       |
|  +        +-------------+-----------+----------------+
| vhost-net | Lost        | 1154 Kpps | 0              |
+-----------+-------------+-----------+----------------+

+-------------------------+-----------+----------------+
| pktgen benchmarks to    | Stock     | Patched with   |
| Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
| 100M packets            |           |                |
| vhost pinned to core 0  |           |                |
| *4 threads*             |           |                |
+-----------+-------------+-----------+----------------+
| TAP       | Transmitted | 71 Kpps   | 79 Kpps        |
|  +        +-------------+-----------+----------------+
| vhost-net | Lost        | 1527 Kpps | 0              |
+-----------+-------------+-----------+----------------+

+------------------------+-------------+----------------+
| iperf3 TCP benchmarks  | Stock       | Patched with   |
| to Debian VM 120s      |             | fq_codel qdisc |
| vhost pinned to core 0 |             |                |
+------------------------+-------------+----------------+
| TAP                    | 22.0 Gbit/s | 22.0 Gbit/s    |
|  +                     |             |                |
| vhost-net              |             |                |
+------------------------+-------------+----------------+

+---------------------------+-------------+----------------+
| iperf3 TCP benchmarks     | Stock       | Patched with   |
| to Debian VM 120s         |             | fq_codel qdisc |
| vhost pinned to core 0    |             |                |
| *4 iperf3 client threads* |             |                |
+---------------------------+-------------+----------------+
| TAP                       | 21.4 Gbit/s | 21.5 Gbit/s    |
|  +                        |             |                |
| vhost-net                 |             |                |
+---------------------------+-------------+----------------+
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 week, 5 days ago
On 1/23/26 10:54, Simon Schippers wrote:
> [...]
>
> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
> for both the stock and patched versions. The benchmarks were run with
> the full patch series applied, since testing only patches 1-3 would not
> be meaningful - the queue is never stopped in that case, so no
> TX_SOFTIRQ is triggered.
> 
> Compared to the non-pinned CPU benchmarks in the cover letter,
> performance is lower for pktgen with a single thread but higher with
> four threads. The results show no regression for the patched version,
> with even slight performance improvements observed:
> 
> [...]

What are your thoughts on this?

Thanks!

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 1 week, 5 days ago
On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> On 1/23/26 10:54, Simon Schippers wrote:
> > On 1/23/26 04:05, Jason Wang wrote:
> >> [...]
> >
> > I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
> > for both the stock and patched versions. The benchmarks were run with
> > the full patch series applied, since testing only patches 1-3 would not
> > be meaningful - the queue is never stopped in that case, so no
> > TX_SOFTIRQ is triggered.
> >
> > Compared to the non-pinned CPU benchmarks in the cover letter,
> > performance is lower for pktgen with a single thread but higher with
> > four threads. The results show no regression for the patched version,
> > with even slight performance improvements observed:
> >
> > [...]

The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
the guest, or an XDP program that does XDP_DROP in the guest.
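
Something like this (untested sketch, assuming clang/libbpf; adjust
the interface name) is enough to drop every frame before the guest
stack runs, so that only the host TX path is measured:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_drop_all(struct xdp_md *ctx)
{
	/* Unconditionally drop every received frame. */
	return XDP_DROP;
}

char _license[] SEC("license") = "GPL";

Build with "clang -O2 -g -target bpf -c xdp_drop.c -o xdp_drop.o" and
attach in the guest with e.g.
"ip link set dev eth0 xdpgeneric obj xdp_drop.o sec xdp".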

> > [...]
>
> What are your thoughts on this?
>
> Thanks!
>
>

Thanks
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 week, 5 days ago
On 1/28/26 08:03, Jason Wang wrote:
> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>> [...]
> 
> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
> the guest, or an XDP program that does XDP_DROP in the guest.

I forgot to mention that these PPS values are per thread.
So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 326 Kpps,
respectively. For packet loss, that comes out to 1527 Kpps * 4 =
6108 Kpps and 0, respectively.

Sorry about that!

The pktgen benchmarks with a single thread look fine, right?

I'll still look into using an XDP program that does XDP_DROP in the
guest.

Thanks!

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 1 week, 4 days ago
On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> On 1/28/26 08:03, Jason Wang wrote:
> > [...]
> >
> > The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
> > the guest, or an XDP program that does XDP_DROP in the guest.
>
> I forgot to mention that these PPS values are per thread.
> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 326 Kpps,
> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
> 6108 Kpps and 0, respectively.
>
> Sorry about that!
>
> The pktgen benchmarks with a single thread look fine, right?

Still looks very low. E.g., in a run of pktgen here (using
pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
I can get 1 Mpps.

>
> I'll still look into using an XDP program that does XDP_DROP in the
> guest.
>
> Thanks!

Thanks

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 week, 4 days ago
On 1/29/26 02:14, Jason Wang wrote:
> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>>
>> On 1/28/26 08:03, Jason Wang wrote:
>>> [...]
>>
>> The pktgen benchmarks with a single thread look fine, right?
> 
> Still looks very low. E.g., in a run of pktgen here (using
> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
> I can get 1 Mpps.

Keep in mind that I am using an older CPU (i5-6300HQ). For the
single-threaded tests I always used pktgen_sample01_simple.sh, and for
the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.

Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
though the same parameters work fine for sample01 and sample02):

samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 \
    -m 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
/samples/pktgen/functions.sh: line 79: echo: write error: Operation not
supported
ERROR: Write error(1) occurred
cmd: "burst 32 > /proc/net/pktgen/tap0@0"

...and I do not know what I am doing wrong, even after looking at
Documentation/networking/pktgen.rst. Every burst size except 1 fails.
Any clues?

Thanks!

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 1 week, 3 days ago
On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> On 1/29/26 02:14, Jason Wang wrote:
> > [...]
>
> Keep in mind that I am using an older CPU (i5-6300HQ). For the
> single-threaded tests I always used pktgen_sample01_simple.sh, and for
> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
>
> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
> though the same parameters work fine for sample01 and sample02):
>
> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 \
>     -m 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
> supported
> ERROR: Write error(1) occurred
> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
>
> ...and I do not know what I am doing wrong, even after looking at
> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
> Any clues?

Please use -b 0. For reference, I'm on an Intel(R) Core(TM) i7-8650U
CPU @ 1.90GHz.
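
I.e., with your parameters that would be:

samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 \
    -m 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000 -b 0

As for the write error: if I read pktgen.c correctly, burst > 1 in
start_xmit mode requires the device to advertise IFF_TX_SKB_SHARING,
which tun/tap does not, so pktgen rejects any larger burst on a tap
device. So it is not something you are doing wrong.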

Another thing I can think of is to disable

1) mitigations in both guest and host
2) any kernel debug features in both host and guest
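
(For 1), booting host and guest with mitigations=off on the kernel
command line should be enough to check whether the mitigations are
what is costing you.)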

Thanks

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 week ago
On 1/30/26 02:51, Jason Wang wrote:
> On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>>
>> On 1/29/26 02:14, Jason Wang wrote:
>>> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>
>>>> On 1/28/26 08:03, Jason Wang wrote:
>>>>> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>
>>>>>> On 1/23/26 10:54, Simon Schippers wrote:
>>>>>>> On 1/23/26 04:05, Jason Wang wrote:
>>>>>>>> On Thu, Jan 22, 2026 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>> On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>
>>>>>>>>>> On 1/9/26 07:02, Jason Wang wrote:
>>>>>>>>>>> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 1/8/26 04:38, Jason Wang wrote:
>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
>>>>>>>>>>>>>> and wake the corresponding netdev subqueue when consuming an entry frees
>>>>>>>>>>>>>> space in the underlying ptr_ring.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Stopping of the netdev queue when the ptr_ring is full will be introduced
>>>>>>>>>>>>>> in an upcoming commit.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  drivers/net/tap.c | 23 ++++++++++++++++++++++-
>>>>>>>>>>>>>>  drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>>>>>>>>>>>>  2 files changed, 45 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>>>>>>>>>>>>>> index 1197f245e873..2442cf7ac385 100644
>>>>>>>>>>>>>> --- a/drivers/net/tap.c
>>>>>>>>>>>>>> +++ b/drivers/net/tap.c
>>>>>>>>>>>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
>>>>>>>>>>>>>>         return ret ? ret : total;
>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +static void *tap_ring_consume(struct tap_queue *q)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +       struct ptr_ring *ring = &q->ring;
>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>> +               dev = rcu_dereference(q->tap)->dev;
>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, q->queue_index);
>>>>>>>>>>>>>> +               rcu_read_unlock();
>>>>>>>>>>>>>> +       }
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       spin_unlock(&ring->consumer_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       return ptr;
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>  static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>                            struct iov_iter *to,
>>>>>>>>>>>>>>                            int noblock, struct sk_buff *skb)
>>>>>>>>>>>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>                                         TASK_INTERRUPTIBLE);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>                 /* Read frames from the queue */
>>>>>>>>>>>>>> -               skb = ptr_ring_consume(&q->ring);
>>>>>>>>>>>>>> +               skb = tap_ring_consume(q);
>>>>>>>>>>>>>>                 if (skb)
>>>>>>>>>>>>>>                         break;
>>>>>>>>>>>>>>                 if (noblock) {
>>>>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>>>>> index 8192740357a0..7148f9a844a4 100644
>>>>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>>>>>>>>>>>>>         return total;
>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +static void *tun_ring_consume(struct tun_file *tfile)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +       struct ptr_ring *ring = &tfile->tx_ring;
>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>
>>>>>>>>>>>>> I guess it's the "bug" I mentioned in the previous patch that leads to
>>>>>>>>>>>>> the check of __ptr_ring_consume_created_space() here. If that's true,
>>>>>>>>>>>>> it's another call to tweak the current API.
>>>>>>>>>>>>>
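(As an illustration, such a tweak might look like the sketch below; this is
hypothetical, not an existing ptr_ring API, and it only folds the
__ptr_ring_consume_created_space() check used by this patch into the consume
itself.)

/* Hypothetical helper: consume one entry and report whether doing so
 * made new space visible to the producer. Caller must hold the
 * consumer lock, as with __ptr_ring_consume().
 */
static inline void *__ptr_ring_consume_created(struct ptr_ring *r,
					       bool *created_space)
{
	void *ptr = __ptr_ring_consume(r);

	*created_space = ptr && __ptr_ring_consume_created_space(r, 1);
	return ptr;
}
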
>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>> +               dev = rcu_dereference(tfile->tun)->dev;
>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
>>>>>>>>>>>>>
>>>>>>>>>>>>> This would cause the producer's TX_SOFTIRQ to run on the same CPU,
>>>>>>>>>>>>> which I'm not sure is what we want.
>>>>>>>>>>>>
>>>>>>>>>>>> What else would you suggest calling to wake the queue?
>>>>>>>>>>>
>>>>>>>>>>> I don't have a good method in mind; I just want to point out the implications.
>>>>>>>>>>
>>>>>>>>>> I have to admit I'm a bit stuck at this point, particularly with this
>>>>>>>>>> aspect.
>>>>>>>>>>
>>>>>>>>>> What is the correct way to pass the producer CPU ID to the consumer?
>>>>>>>>>> Would it make sense to store smp_processor_id() in the tfile inside
>>>>>>>>>> tun_net_xmit(), or should it instead be stored in the skb (similar to the
>>>>>>>>>> XDP bit)? In the latter case, my concern is that this information may
>>>>>>>>>> already be significantly outdated by the time it is used.
>>>>>>>>>>
>>>>>>>>>> Based on that, my idea would be for the consumer to wake the producer by
>>>>>>>>>> invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
>>>>>>>>>> smp_call_function_single().
>>>>>>>>>> Is this a reasonable approach?
>>>>>>>>>
>>>>>>>>> I'm not sure but it would introduce costs like IPI.
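(To make the trade-off concrete, a minimal sketch of the IPI idea follows;
tun_wake_queue() and the producer_cpu field are hypothetical names from this
discussion, not existing tun code, while smp_call_function_single() is the
kernel's generic cross-CPU call, whose interrupt is exactly the IPI cost
mentioned above.)

/* Runs on the producer's CPU to wake the stopped subqueue there. */
static void tun_wake_queue(void *data)
{
	struct tun_file *tfile = data;
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(tfile->tun)->dev;
	netif_wake_subqueue(dev, tfile->queue_index);
	rcu_read_unlock();
}

/* Producer side, in tun_net_xmit(): record which CPU filled the ring
 * (producer_cpu would be a new tun_file field).
 */
WRITE_ONCE(tfile->producer_cpu, smp_processor_id());

/* Consumer side, instead of calling netif_wake_subqueue() locally: */
smp_call_function_single(READ_ONCE(tfile->producer_cpu),
			 tun_wake_queue, tfile, 0);
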
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
>>>>>>>>>> considered a deal-breaker for the patch set?
>>>>>>>>>
>>>>>>>>> It depends on whether or not it affects performance,
>>>>>>>>> especially when vhost is pinned.
>>>>>>>>
>>>>>>>> I meant we can benchmark to see the impact. For example, pin vhost to
>>>>>>>> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>>
>>>>>>>
>>>>>>> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
>>>>>>> for both the stock and patched versions. The benchmarks were run with
>>>>>>> the full patch series applied, since testing only patches 1-3 would not
>>>>>>> be meaningful - the queue is never stopped in that case, so no
>>>>>>> TX_SOFTIRQ is triggered.
>>>>>>>
>>>>>>> Compared to the non-pinned CPU benchmarks in the cover letter,
>>>>>>> performance is lower for pktgen with a single thread but higher with
>>>>>>> four threads. The results show no regression for the patched version,
>>>>>>> with even slight performance improvements observed:
>>>>>>>
>>>>>>> +-------------------------+-----------+----------------+
>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>> | 100M packets            |           |                |
>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>> | TAP       | Transmitted | 452 Kpps  | 454 Kpps       |
>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>> | vhost-net | Lost        | 1154 Kpps | 0              |
>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>
>>>>>>> +-------------------------+-----------+----------------+
>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>> | 100M packets            |           |                |
>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>> | *4 threads*             |           |                |
>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>> | TAP       | Transmitted | 71 Kpps   | 79 Kpps        |
>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>> | vhost-net | Lost        | 1527 Kpps | 0              |
>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>
>>>>> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
>>>>> the guest or an XDP program that does XDP_DROP in the guest.
>>>>
>>>> I forgot to mention that these PPS values are per thread.
>>>> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 316 Kpps,
>>>> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
>>>> 6108 Kpps and 0, respectively.
>>>>
>>>> Sorry about that!
>>>>
>>>> The pktgen benchmarks with a single thread look fine, right?
>>>
>>> Still looks very low. E.g., I just did a run of pktgen (using
>>> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
>>> and I can get 1 Mpps.
>>
>> Keep in mind that I am using an older CPU (i5-6300HQ). For the
>> single-threaded tests I always used pktgen_sample01_simple.sh, and for
>> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
>>
>> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
>> though the same parameters work fine for sample01 and sample02):
>>
>> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 -m
>> 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
>> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
>> supported
>> ERROR: Write error(1) occurred
>> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
>>
>> ...and I do not know what I am doing wrong, even after looking at
>> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
>> Any clues?
> 
> Please use -b 0; I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.

I tried using "-b 0", and while it worked, there was no noticeable
performance improvement.

> 
> Another thing I can think of is to disable
> 
> 1) mitigations in both guest and host
> 2) any kernel debug features in both host and guest

I also rebuilt the kernel with everything disabled under
"Kernel hacking", but that didn’t make any difference either.

Because of this, I ran "pktgen_sample01_simple.sh" and
"pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
with very similar performance between the stock and patched kernels.

Personally, I think the hardware is to blame for the low performance.

Thanks!

> 
> Thanks
> 
>>
>> Thanks!
>>
>>>
>>>>
>>>> I'll still look into using an XDP program that does XDP_DROP in the
>>>> guest.
>>>>
>>>> Thanks!
>>>
>>> Thanks
>>>
>>>>
>>>>>
>>>>>>>
>>>>>>> +------------------------+-------------+----------------+
>>>>>>> | iperf3 TCP benchmarks  | Stock       | Patched with   |
>>>>>>> | to Debian VM 120s      |             | fq_codel qdisc |
>>>>>>> | vhost pinned to core 0 |             |                |
>>>>>>> +------------------------+-------------+----------------+
>>>>>>> | TAP                    | 22.0 Gbit/s | 22.0 Gbit/s    |
>>>>>>> |  +                     |             |                |
>>>>>>> | vhost-net              |             |                |
>>>>>>> +------------------------+-------------+----------------+
>>>>>>>
>>>>>>> +---------------------------+-------------+----------------+
>>>>>>> | iperf3 TCP benchmarks     | Stock       | Patched with   |
>>>>>>> | to Debian VM 120s         |             | fq_codel qdisc |
>>>>>>> | vhost pinned to core 0    |             |                |
>>>>>>> | *4 iperf3 client threads* |             |                |
>>>>>>> +---------------------------+-------------+----------------+
>>>>>>> | TAP                       | 21.4 Gbit/s | 21.5 Gbit/s    |
>>>>>>> |  +                        |             |                |
>>>>>>> | vhost-net                 |             |                |
>>>>>>> +---------------------------+-------------+----------------+
>>>>>>
>>>>>> What are your thoughts on this?
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 6 days, 11 hours ago
On Mon, Feb 2, 2026 at 4:19 AM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> [...]
> >
> > Please use -b 0; I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.
>
> I tried using "-b 0", and while it worked, there was no noticeable
> performance improvement.
>
> >
> > Another thing I can think of is to disable
> >
> > 1) mitigations in both guest and host
> > 2) any kernel debug features in both host and guest
>
> I also rebuilt the kernel with everything disabled under
> "Kernel hacking", but that didn’t make any difference either.
>
> Because of this, I ran "pktgen_sample01_simple.sh" and
> "pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
> results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
> with very similar performance between the stock and patched kernels.
>
> Personally, I think the hardware is to blame for the low performance.

Let's double confirm this by:

1) making sure pktgen is using 100% of a CPU
2) checking that perf doesn't show anything strange for the pktgen thread

Thanks

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 5 days ago
On 2/3/26 04:48, Jason Wang wrote:
> [...]
>> Personally, I think the hardware is to blame for the low performance.
> 
> Let's double confirm this by:
>
> 1) making sure pktgen is using 100% of a CPU
> 2) checking that perf doesn't show anything strange for the pktgen thread
> 
> Thanks
> 

I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
100-second perf stat measurement covering all kpktgend threads.

Across all configurations, a single CPU was fully utilized.

Apart from that, the patched variants show a higher branch frequency and
a slightly increased number of context switches.


The detailed results are provided below:

Processor: Ryzen 5 5600X

pktgen command:
sudo perf stat samples/pktgen/pktgen_sample01_simple.sh -i tap0 -m
52:54:00:12:34:56 -d 10.0.0.2 -n 10000000000

perf stat command:
sudo perf stat --timeout 100000 -p $(pgrep kpktgend | tr '\n' ,) -o X.txt


Results:
Stock TAP:
            46.997      context-switches                 #    467,2 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
        100.587,69 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
     8.491.586.483      branch-misses                    #     10,9 %  branch_miss_rate         (50,24%)
    77.734.761.406      branches                         #    772,8 M/sec  branch_frequency     (66,85%)
   382.420.291.585      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
   377.612.185.141      instructions                     #      1,0 instructions  insn_per_cycle  (66,85%)
    84.012.185.936      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)

     100,100414494 seconds time elapsed


Stock TAP+vhost-net:
            47.087      context-switches                 #    468,1 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
        100.594,09 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
     8.034.703.613      branch-misses                    #     11,1 %  branch_miss_rate         (50,24%)
    72.477.989.922      branches                         #    720,5 M/sec  branch_frequency     (66,86%)
   382.218.276.832      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
   349.555.577.281      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
    83.917.644.262      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)

     100,100520402 seconds time elapsed


Patched TAP:
            47.862      context-switches                 #    475,8 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
        100.589,30 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
     9.337.258.794      branch-misses                    #      9,4 %  branch_miss_rate         (50,19%)
    99.518.421.676      branches                         #    989,4 M/sec  branch_frequency     (66,85%)
   382.508.244.894      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
   312.582.270.975      instructions                     #      0,8 instructions  insn_per_cycle  (66,85%)
    76.338.503.984      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,39%)

     100,101262454 seconds time elapsed


Patched TAP+vhost-net:
            47.892      context-switches                 #    476,1 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
        100.581,95 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
     9.083.588.313      branch-misses                    #     10,1 %  branch_miss_rate         (50,28%)
    90.300.124.712      branches                         #    897,8 M/sec  branch_frequency     (66,85%)
   382.374.510.376      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
   340.089.181.199      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
    78.151.408.955      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,31%)

     100,101212911 seconds time elapsed

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 4 days, 11 hours ago
On Wed, Feb 4, 2026 at 11:44 PM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> [...]
>
> I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
> 100-second perf stat measurement covering all kpktgend threads.
>
> Across all configurations, a single CPU was fully utilized.
>
> Apart from that, the patched variants show a higher branch frequency and
> a slightly increased number of context switches.
>
> [...]

Thanks for sharing. I have more questions:

1) The number of CPUs and vCPUs
2) Whether you pin the vhost or vCPU threads
3) What perf top looks like (or perf top -p $pid_of_vhost)

>
> >>
> >> Thanks!
> >>
> >>>
> >>> Thanks
> >>>
> >>>>
> >>>> Thanks!
> >>>>
> >>>>>
> >>>>>>
> >>>>>> I'll still look into using an XDP program that does XDP_DROP in the
> >>>>>> guest.
> >>>>>>
> >>>>>> Thanks!
> >>>>>
> >>>>> Thanks
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>>>
> >>>>>>>>> +------------------------+-------------+----------------+
> >>>>>>>>> | iperf3 TCP benchmarks  | Stock       | Patched with   |
> >>>>>>>>> | to Debian VM 120s      |             | fq_codel qdisc |
> >>>>>>>>> | vhost pinned to core 0 |             |                |
> >>>>>>>>> +------------------------+-------------+----------------+
> >>>>>>>>> | TAP                    | 22.0 Gbit/s | 22.0 Gbit/s    |
> >>>>>>>>> |  +                     |             |                |
> >>>>>>>>> | vhost-net              |             |                |
> >>>>>>>>> +------------------------+-------------+----------------+
> >>>>>>>>>
> >>>>>>>>> +---------------------------+-------------+----------------+
> >>>>>>>>> | iperf3 TCP benchmarks     | Stock       | Patched with   |
> >>>>>>>>> | to Debian VM 120s         |             | fq_codel qdisc |
> >>>>>>>>> | vhost pinned to core 0    |             |                |
> >>>>>>>>> | *4 iperf3 client threads* |             |                |
> >>>>>>>>> +---------------------------+-------------+----------------+
> >>>>>>>>> | TAP                       | 21.4 Gbit/s | 21.5 Gbit/s    |
> >>>>>>>>> |  +                        |             |                |
> >>>>>>>>> | vhost-net                 |             |                |
> >>>>>>>>> +---------------------------+-------------+----------------+
> >>>>>>>>
> >>>>>>>> What are your thoughts on this?
> >>>>>>>>
> >>>>>>>> Thanks!
> >>>>>>>>
> >>>>>>>>
> >>>>>>>
> >>>>>>> Thanks
> >>>>>>>
> >>>>>>
> >>>>>
> >>>>
> >>>
> >>
> >
>
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 3 days, 17 hours ago
On 2/5/26 04:59, Jason Wang wrote:
> On Wed, Feb 4, 2026 at 11:44 PM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>>
>> On 2/3/26 04:48, Jason Wang wrote:
>>> On Mon, Feb 2, 2026 at 4:19 AM Simon Schippers
>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>
>>>> On 1/30/26 02:51, Jason Wang wrote:
>>>>> On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>
>>>>>> On 1/29/26 02:14, Jason Wang wrote:
>>>>>>> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>
>>>>>>>> On 1/28/26 08:03, Jason Wang wrote:
>>>>>>>>> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>
>>>>>>>>>> On 1/23/26 10:54, Simon Schippers wrote:
>>>>>>>>>>> On 1/23/26 04:05, Jason Wang wrote:
>>>>>>>>>>>> On Thu, Jan 22, 2026 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 1/9/26 07:02, Jason Wang wrote:
>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 1/8/26 04:38, Jason Wang wrote:
>>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
>>>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
>>>>>>>>>>>>>>>>>> and wake the corresponding netdev subqueue when consuming an entry frees
>>>>>>>>>>>>>>>>>> space in the underlying ptr_ring.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Stopping of the netdev queue when the ptr_ring is full will be introduced
>>>>>>>>>>>>>>>>>> in an upcoming commit.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>>>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>  drivers/net/tap.c | 23 ++++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>  drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>>>>>>>>>>>>>>>>  2 files changed, 45 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>> index 1197f245e873..2442cf7ac385 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/net/tap.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>         return ret ? ret : total;
>>>>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> +static void *tap_ring_consume(struct tap_queue *q)
>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &q->ring;
>>>>>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(q->tap)->dev;
>>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, q->queue_index);
>>>>>>>>>>>>>>>>>> +               rcu_read_unlock();
>>>>>>>>>>>>>>>>>> +       }
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       spin_unlock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       return ptr;
>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>  static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>                            struct iov_iter *to,
>>>>>>>>>>>>>>>>>>                            int noblock, struct sk_buff *skb)
>>>>>>>>>>>>>>>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>                                         TASK_INTERRUPTIBLE);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>                 /* Read frames from the queue */
>>>>>>>>>>>>>>>>>> -               skb = ptr_ring_consume(&q->ring);
>>>>>>>>>>>>>>>>>> +               skb = tap_ring_consume(q);
>>>>>>>>>>>>>>>>>>                 if (skb)
>>>>>>>>>>>>>>>>>>                         break;
>>>>>>>>>>>>>>>>>>                 if (noblock) {
>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>> index 8192740357a0..7148f9a844a4 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>>>>>>>>>>>>>>>>>         return total;
>>>>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> +static void *tun_ring_consume(struct tun_file *tfile)
>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &tfile->tx_ring;
>>>>>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I guess it's the "bug" I mentioned in the previous patch that leads to
>>>>>>>>>>>>>>>>> the check of __ptr_ring_consume_created_space() here. If it's true,
> >>>>>>>>>>>>>>>>> that's another call to tweak the current API.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(tfile->tun)->dev;
>>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> This would cause the producer TX_SOFTIRQ to run on the same CPU, which
>>>>>>>>>>>>>>>>> I'm not sure is what we want.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> What else would you suggest calling to wake the queue?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I don't have a good method in my mind, just want to point out its implications.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I have to admit I'm a bit stuck at this point, particularly with this
>>>>>>>>>>>>>> aspect.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> What is the correct way to pass the producer CPU ID to the consumer?
>>>>>>>>>>>>>> Would it make sense to store smp_processor_id() in the tfile inside
>>>>>>>>>>>>>> tun_net_xmit(), or should it instead be stored in the skb (similar to the
>>>>>>>>>>>>>> XDP bit)? In the latter case, my concern is that this information may
>>>>>>>>>>>>>> already be significantly outdated by the time it is used.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Based on that, my idea would be for the consumer to wake the producer by
>>>>>>>>>>>>>> invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
>>>>>>>>>>>>>> smp_call_function_single().
>>>>>>>>>>>>>> Is this a reasonable approach?
>>>>>>>>>>>>>
>>>>>>>>>>>>> I'm not sure but it would introduce costs like IPI.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
>>>>>>>>>>>>>> considered a deal-breaker for the patch set?
>>>>>>>>>>>>>
>>>>>>>>>>>>> It depends on whether or not it has effects on the performance.
>>>>>>>>>>>>> Especially when vhost is pinned.
>>>>>>>>>>>>
>>>>>>>>>>>> I meant we can benchmark to see the impact. For example, pin vhost to
> >>>>>>>>>>>> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
>>>>>>>>>>> for both the stock and patched versions. The benchmarks were run with
>>>>>>>>>>> the full patch series applied, since testing only patches 1-3 would not
>>>>>>>>>>> be meaningful - the queue is never stopped in that case, so no
>>>>>>>>>>> TX_SOFTIRQ is triggered.
>>>>>>>>>>>
>>>>>>>>>>> Compared to the non-pinned CPU benchmarks in the cover letter,
>>>>>>>>>>> performance is lower for pktgen with a single thread but higher with
>>>>>>>>>>> four threads. The results show no regression for the patched version,
>>>>>>>>>>> with even slight performance improvements observed:
>>>>>>>>>>>
>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>>>>>> | 100M packets            |           |                |
>>>>>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>> | TAP       | Transmitted | 452 Kpps  | 454 Kpps       |
>>>>>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>>>>>> | vhost-net | Lost        | 1154 Kpps | 0              |
>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>
>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>>>>>> | 100M packets            |           |                |
>>>>>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>>>>>> | *4 threads*             |           |                |
>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>> | TAP       | Transmitted | 71 Kpps   | 79 Kpps        |
>>>>>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>>>>>> | vhost-net | Lost        | 1527 Kpps | 0              |
>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>
>>>>>>>>> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
> >>>>>>>>> the guest or an XDP program that does XDP_DROP in the guest.
>>>>>>>>
>>>>>>>> I forgot to mention that these PPS values are per thread.
> >>>>>>>> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 316 Kpps,
> >>>>>>>> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
> >>>>>>>> 6108 Kpps and 0, respectively.
>>>>>>>>
>>>>>>>> Sorry about that!
>>>>>>>>
>>>>>>>> The pktgen benchmarks with a single thread look fine, right?
>>>>>>>
> >>>>>>> Still looks very low. E.g., I just did a run of pktgen (using
> >>>>>>> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
> >>>>>>> and I can get 1 Mpps.
>>>>>>
>>>>>> Keep in mind that I am using an older CPU (i5-6300HQ). For the
>>>>>> single-threaded tests I always used pktgen_sample01_simple.sh, and for
>>>>>> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
>>>>>>
>>>>>> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
>>>>>> though the same parameters work fine for sample01 and sample02):
>>>>>>
>>>>>> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 -m
>>>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
>>>>>> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
>>>>>> supported
>>>>>> ERROR: Write error(1) occurred
>>>>>> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
>>>>>>
>>>>>> ...and I do not know what I am doing wrong, even after looking at
>>>>>> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
>>>>>> Any clues?
>>>>>
> >>>>> Please use -b 0, and I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.
>>>>
>>>> I tried using "-b 0", and while it worked, there was no noticeable
>>>> performance improvement.
>>>>
>>>>>
>>>>> Another thing I can think of is to disable
>>>>>
>>>>> 1) mitigations in both guest and host
>>>>> 2) any kernel debug features in both host and guest
>>>>
>>>> I also rebuilt the kernel with everything disabled under
>>>> "Kernel hacking", but that didn’t make any difference either.
>>>>
>>>> Because of this, I ran "pktgen_sample01_simple.sh" and
>>>> "pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
>>>> results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
>>>> with very similar performance between the stock and patched kernels.
>>>>
> >>>> Personally, I think the low performance is due to the hardware.
>>>
>>> Let's double confirm this by:
>>>
>>> 1) make sure pktgen is using 100% CPU
> >>> 2) Perf doesn't show anything strange for the pktgen thread
>>>
>>> Thanks
>>>
>>
>> I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
> >> 100-second perf stat measurement covering all kpktgend threads.
>>
>> Across all configurations, a single CPU was fully utilized.
>>
>> Apart from that, the patched variants show a higher branch frequency and
>> a slightly increased number of context switches.
>>
>>
>> The detailed results are provided below:
>>
>> Processor: Ryzen 5 5600X
>>
>> pktgen command:
>> sudo perf stat samples/pktgen/pktgen_sample01_simple.sh -i tap0 -m
>> 52:54:00:12:34:56 -d 10.0.0.2 -n 10000000000
>>
>> perf stat command:
>> sudo perf stat --timeout 100000 -p $(pgrep kpktgend | tr '\n' ,) -o X.txt
>>
>>
>> Results:
>> Stock TAP:
>>             46.997      context-switches                 #    467,2 cs/sec  cs_per_second
>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>         100.587,69 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>      8.491.586.483      branch-misses                    #     10,9 %  branch_miss_rate         (50,24%)
>>     77.734.761.406      branches                         #    772,8 M/sec  branch_frequency     (66,85%)
>>    382.420.291.585      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>    377.612.185.141      instructions                     #      1,0 instructions  insn_per_cycle  (66,85%)
>>     84.012.185.936      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
>>
>>      100,100414494 seconds time elapsed
>>
>>
>> Stock TAP+vhost-net:
>>             47.087      context-switches                 #    468,1 cs/sec  cs_per_second
>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>         100.594,09 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>      8.034.703.613      branch-misses                    #     11,1 %  branch_miss_rate         (50,24%)
>>     72.477.989.922      branches                         #    720,5 M/sec  branch_frequency     (66,86%)
>>    382.218.276.832      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>    349.555.577.281      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
>>     83.917.644.262      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
>>
>>      100,100520402 seconds time elapsed
>>
>>
>> Patched TAP:
>>             47.862      context-switches                 #    475,8 cs/sec  cs_per_second
>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>         100.589,30 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>      9.337.258.794      branch-misses                    #      9,4 %  branch_miss_rate         (50,19%)
>>     99.518.421.676      branches                         #    989,4 M/sec  branch_frequency     (66,85%)
>>    382.508.244.894      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>    312.582.270.975      instructions                     #      0,8 instructions  insn_per_cycle  (66,85%)
>>     76.338.503.984      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,39%)
>>
>>      100,101262454 seconds time elapsed
>>
>>
>> Patched TAP+vhost-net:
>>             47.892      context-switches                 #    476,1 cs/sec  cs_per_second
>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>         100.581,95 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>      9.083.588.313      branch-misses                    #     10,1 %  branch_miss_rate         (50,28%)
>>     90.300.124.712      branches                         #    897,8 M/sec  branch_frequency     (66,85%)
>>    382.374.510.376      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>    340.089.181.199      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
>>     78.151.408.955      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,31%)
>>
>>      100,101212911 seconds time elapsed
> 
> Thanks for sharing. I have more questions:
> 
> 1) The number of CPUs and vCPUs

QEMU runs with a single vCPU, and my host system is now a Ryzen 5 5600X
with 6 cores and 12 threads.
This is my command for TAP+vhost-net:

sudo qemu-system-x86_64 -hda debian.qcow2
-netdev tap,id=mynet0,ifname=tap0,script=no,downscript=no,vhost=on
-device virtio-net-pci,netdev=mynet0 -m 1024 -enable-kvm

For TAP only, it is the same but without vhost=on.
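
That is, the TAP-only invocation looks like:

sudo qemu-system-x86_64 -hda debian.qcow2
-netdev tap,id=mynet0,ifname=tap0,script=no,downscript=no
-device virtio-net-pci,netdev=mynet0 -m 1024 -enable-kvm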

> 2) Whether you pin the vhost or vCPU threads

Not in the previously shown benchmark. I pinned vhost in other benchmarks,
but since there is only a minor PPS difference, I omitted them for the sake
of simplicity.
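
Where it was pinned, that was done as in the earlier benchmarks, along
the lines of:

sudo taskset -p -c 0 "$vhost_pid"   # migrate the vhost thread to core 0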

> 3) What does perf top look like, or perf top -p $pid_of_vhost

The perf reports for the pid_of_vhost from pktgen_sample01_simple.sh
with TAP+vhost-net (not pinned, pktgen single queue, fq_codel) are shown
below. I cannot see a huge difference between stock and patched.
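
The PIDs below were picked up the same way as in the perf stat command
above, e.g. (a sketch that assumes a single VM, so pgrep matches exactly
one vhost thread):

vhost_pid=$(pgrep vhost)                      # the vhost-<qemu pid> kernel thread
kpktgend_pids=$(pgrep kpktgend | tr '\n' ,)   # comma list of pktgen worker threads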

I also included perf reports for the kpktgend PIDs. I find them more
interesting because tun_net_xmit shows less overhead for the patched
version; I assume that is due to the netdev queue being stopped.

I have now benchmarked pretty much all possible combinations (with a
script) of TAP/TAP+vhost-net, single/multi-queue pktgen, vhost
pinned/not pinned, with/without -b 0, fq_codel/noqueue... all of that
with perf records.
I could share them if you want, but I feel this is getting out of hand.


Stock:
sudo perf record -p "$vhost_pid"
...
# Overhead  Command          Shared Object               Symbol                                    
# ........  ...............  ..........................  ..........................................
#
     5.97%  vhost-4874       [kernel.kallsyms]           [k] _copy_to_iter
     2.68%  vhost-4874       [kernel.kallsyms]           [k] tun_do_read
     2.23%  vhost-4874       [kernel.kallsyms]           [k] native_write_msr
     1.93%  vhost-4874       [kernel.kallsyms]           [k] __check_object_size
     1.61%  vhost-4874       [kernel.kallsyms]           [k] __slab_free.isra.0
     1.56%  vhost-4874       [kernel.kallsyms]           [k] __get_user_nocheck_2
     1.54%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_zero
     1.45%  vhost-4874       [kernel.kallsyms]           [k] kmem_cache_free
     1.43%  vhost-4874       [kernel.kallsyms]           [k] tun_recvmsg
     1.24%  vhost-4874       [kernel.kallsyms]           [k] sk_skb_reason_drop
     1.12%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_safe_ret
     1.07%  vhost-4874       [kernel.kallsyms]           [k] native_read_msr
     0.76%  vhost-4874       [kernel.kallsyms]           [k] simple_copy_to_iter
     0.75%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_return_thunk
     0.69%  vhost-4874       [vhost]                     [k] 0x0000000000002e70
     0.59%  vhost-4874       [kernel.kallsyms]           [k] skb_release_data
     0.59%  vhost-4874       [kernel.kallsyms]           [k] __skb_datagram_iter
     0.53%  vhost-4874       [vhost]                     [k] 0x0000000000002e5f
     0.51%  vhost-4874       [kernel.kallsyms]           [k] slab_update_freelist.isra.0
     0.46%  vhost-4874       [kernel.kallsyms]           [k] kfree_skbmem
     0.44%  vhost-4874       [kernel.kallsyms]           [k] skb_copy_datagram_iter
     0.43%  vhost-4874       [kernel.kallsyms]           [k] skb_free_head
     0.37%  qemu-system-x86  [unknown]                   [k] 0xffffffffba898b1b
     0.35%  vhost-4874       [vhost]                     [k] 0x0000000000002e6b
     0.33%  vhost-4874       [vhost_net]                 [k] 0x000000000000357d
     0.28%  vhost-4874       [kernel.kallsyms]           [k] __check_heap_object
     0.27%  vhost-4874       [vhost_net]                 [k] 0x00000000000035f3
     0.26%  vhost-4874       [vhost_net]                 [k] 0x00000000000030f6
     0.26%  vhost-4874       [kernel.kallsyms]           [k] __virt_addr_valid
     0.24%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_advance
     0.22%  vhost-4874       [kernel.kallsyms]           [k] perf_event_update_userpage
     0.22%  vhost-4874       [kernel.kallsyms]           [k] check_stack_object
     0.19%  qemu-system-x86  [unknown]                   [k] 0xffffffffba2a68cd
     0.19%  vhost-4874       [kernel.kallsyms]           [k] dequeue_entities
     0.19%  vhost-4874       [vhost_net]                 [k] 0x0000000000003237
     0.18%  vhost-4874       [vhost_net]                 [k] 0x0000000000003550
     0.18%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_del
     0.18%  vhost-4874       [vhost_net]                 [k] 0x00000000000034a0
     0.17%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_disable_all
     0.16%  vhost-4874       [vhost_net]                 [k] 0x0000000000003523
     0.16%  vhost-4874       [kernel.kallsyms]           [k] amd_pmu_addr_offset
...


sudo perf record -p "$kpktgend_pids":
...
# Overhead  Command      Shared Object      Symbol                                         
# ........  ...........  .................  ...............................................
#
    10.98%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
    10.45%  kpktgend_0   [kernel.kallsyms]  [k] memset
     8.40%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     6.31%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     3.13%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_safe_ret
     2.40%  kpktgend_0   [kernel.kallsyms]  [k] sk_skb_reason_drop
     2.11%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_return_thunk
     1.76%  kpktgend_0   [kernel.kallsyms]  [k] __netdev_alloc_skb
     1.74%  kpktgend_0   [kernel.kallsyms]  [k] __get_random_u32_below
     1.67%  kpktgend_0   [kernel.kallsyms]  [k] kmalloc_reserve
     1.61%  kpktgend_0   [pktgen]           [k] 0x0000000000003305
     1.57%  kpktgend_0   [pktgen]           [k] 0x00000000000032ff
     1.56%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     1.49%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_free
     1.48%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     1.39%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     1.12%  kpktgend_0   [pktgen]           [k] 0x0000000000003334
     1.09%  kpktgend_0   [pktgen]           [k] 0x000000000000332a
     0.99%  kpktgend_0   [pktgen]           [k] 0x0000000000003116
     0.92%  kpktgend_0   [kernel.kallsyms]  [k] skb_release_data
     0.91%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     0.88%  kpktgend_0   [pktgen]           [k] 0x0000000000004121
     0.77%  kpktgend_0   [pktgen]           [k] 0x0000000000003427
     0.75%  kpktgend_0   [pktgen]           [k] 0x0000000000004337
     0.70%  kpktgend_0   [pktgen]           [k] 0x00000000000021b9
     0.68%  kpktgend_0   [pktgen]           [k] 0x0000000000002447
     0.68%  kpktgend_0   [pktgen]           [k] 0x0000000000003919
     0.65%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     0.63%  kpktgend_0   [kernel.kallsyms]  [k] skb_free_head
     0.63%  kpktgend_0   [kernel.kallsyms]  [k] kfree_skbmem
     0.61%  kpktgend_0   [pktgen]           [k] 0x0000000000003257
     0.60%  kpktgend_0   [pktgen]           [k] 0x000000000000243a
     0.59%  kpktgend_0   [pktgen]           [k] 0x000000000000413d
     0.58%  kpktgend_0   [pktgen]           [k] 0x00000000000040eb
     0.58%  kpktgend_0   [pktgen]           [k] 0x000000000000435f
     0.51%  kpktgend_0   [kernel.kallsyms]  [k] _raw_spin_lock
     0.50%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     0.45%  kpktgend_0   [pktgen]           [k] 0x000000000000330d
     0.45%  kpktgend_0   [pktgen]           [k] 0x0000000000004124
     0.44%  kpktgend_0   [pktgen]           [k] 0x000000000000433c
     0.43%  kpktgend_0   [pktgen]           [k] 0x0000000000003111


====================================================================
Patched: 
sudo perf record -p "$vhost_pid"
...
# Overhead  Command          Shared Object               Symbol                                    
# ........  ...............  ..........................  ..........................................
#
     5.85%  vhost-7042       [kernel.kallsyms]           [k] _copy_to_iter
     2.75%  vhost-7042       [kernel.kallsyms]           [k] tun_do_read
     2.37%  vhost-7042       [kernel.kallsyms]           [k] __check_object_size
     2.28%  vhost-7042       [kernel.kallsyms]           [k] native_write_msr
     1.74%  vhost-7042       [kernel.kallsyms]           [k] __slab_free.isra.0
     1.61%  vhost-7042       [kernel.kallsyms]           [k] iov_iter_zero
     1.54%  vhost-7042       [kernel.kallsyms]           [k] kmem_cache_free
     1.53%  vhost-7042       [kernel.kallsyms]           [k] tun_recvmsg
     1.33%  vhost-7042       [kernel.kallsyms]           [k] __get_user_nocheck_2
     1.28%  vhost-7042       [kernel.kallsyms]           [k] sk_skb_reason_drop
     1.09%  vhost-7042       [kernel.kallsyms]           [k] native_read_msr
     1.04%  vhost-7042       [kernel.kallsyms]           [k] srso_alias_safe_ret
     0.92%  vhost-7042       [kernel.kallsyms]           [k] simple_copy_to_iter
     0.84%  vhost-7042       [kernel.kallsyms]           [k] skb_release_data
     0.75%  vhost-7042       [kernel.kallsyms]           [k] srso_alias_return_thunk
     0.72%  vhost-7042       [kernel.kallsyms]           [k] __skb_datagram_iter
     0.70%  vhost-7042       [vhost]                     [k] 0x0000000000002e70
     0.53%  vhost-7042       [vhost]                     [k] 0x0000000000002e5f
     0.52%  vhost-7042       [kernel.kallsyms]           [k] slab_update_freelist.isra.0
     0.45%  vhost-7042       [kernel.kallsyms]           [k] skb_free_head
     0.44%  vhost-7042       [kernel.kallsyms]           [k] skb_copy_datagram_iter
     0.44%  vhost-7042       [kernel.kallsyms]           [k] kfree_skbmem
     0.34%  vhost-7042       [vhost_net]                 [k] 0x00000000000033e6
     0.33%  vhost-7042       [kernel.kallsyms]           [k] iov_iter_advance
     0.33%  vhost-7042       [vhost]                     [k] 0x0000000000002e6b
     0.31%  qemu-system-x86  [unknown]                   [k] 0xffffffffaa898b1b
     0.28%  vhost-7042       [vhost_net]                 [k] 0x00000000000033b9
     0.27%  vhost-7042       [vhost_net]                 [k] 0x000000000000345c
     0.27%  vhost-7042       [vhost_net]                 [k] 0x00000000000035c6
     0.27%  vhost-7042       [kernel.kallsyms]           [k] __check_heap_object
     0.25%  vhost-7042       [kernel.kallsyms]           [k] perf_event_update_userpage
     0.23%  vhost-7042       [kernel.kallsyms]           [k] __virt_addr_valid
     0.19%  vhost-7042       [kernel.kallsyms]           [k] x86_pmu_disable_all
...


sudo perf record -p "$kpktgend_pids":
...
# Overhead  Command      Shared Object      Symbol                                   
# ........  ...........  .................  .........................................
#
     5.98%  kpktgend_0   [pktgen]           [k] 0x0000000000003305
     5.94%  kpktgend_0   [pktgen]           [k] 0x00000000000032ff
     5.93%  kpktgend_0   [kernel.kallsyms]  [k] memset
     5.13%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
     5.00%  kpktgend_0   [pktgen]           [k] 0x000000000000330d
     4.68%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     4.22%  kpktgend_0   [pktgen]           [k] 0x0000000000003334
     3.51%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_safe_ret
     3.46%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     2.57%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_return_thunk
     2.49%  kpktgend_0   [pktgen]           [k] 0x000000000000332a
     2.02%  kpktgend_0   [pktgen]           [k] 0x0000000000003927
     1.94%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     1.92%  kpktgend_0   [pktgen]           [k] 0x000000000000332f
     1.83%  kpktgend_0   [pktgen]           [k] 0x0000000000003116
     1.65%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     1.51%  kpktgend_0   [pktgen]           [k] 0x00000000000032fd
     1.35%  kpktgend_0   [pktgen]           [k] 0x00000000000030bd
     1.35%  kpktgend_0   [pktgen]           [k] 0x0000000000003919
     1.20%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_lock
     1.14%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     1.06%  kpktgend_0   [kernel.kallsyms]  [k] kthread_should_stop
     0.89%  kpktgend_0   [kernel.kallsyms]  [k] kmalloc_reserve
     0.88%  kpktgend_0   [kernel.kallsyms]  [k] __get_random_u32_below
     0.83%  kpktgend_0   [kernel.kallsyms]  [k] __netdev_alloc_skb
     0.83%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     0.74%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     0.72%  kpktgend_0   [pktgen]           [k] 0x00000000000030c5
     0.71%  kpktgend_0   [pktgen]           [k] 0x00000000000030c1
     0.70%  kpktgend_0   [pktgen]           [k] 0x00000000000030ce
     0.68%  kpktgend_0   [pktgen]           [k] 0x00000000000030d1
     0.68%  kpktgend_0   [pktgen]           [k] 0x000000000000391e
     0.63%  kpktgend_0   [pktgen]           [k] 0x000000000000311f
     0.62%  kpktgend_0   [pktgen]           [k] 0x000000000000312c
     0.61%  kpktgend_0   [pktgen]           [k] 0x0000000000003131
     0.61%  kpktgend_0   [pktgen]           [k] 0x0000000000003124
     0.57%  kpktgend_0   [pktgen]           [k] 0x0000000000003111
     0.56%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     0.55%  kpktgend_0   [pktgen]           [k] 0x00000000000030b8
     0.44%  kpktgend_0   [pktgen]           [k] 0x0000000000004337
     0.43%  kpktgend_0   [pktgen]           [k] 0x0000000000004121

Thanks :)

> 
>>
>>>>
>>>> Thanks!
>>>>
>>>>>
>>>>> Thanks
>>>>>
>>>>>>
>>>>>> Thanks!
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> I'll still look into using an XDP program that does XDP_DROP in the
>>>>>>>> guest.
>>>>>>>>
>>>>>>>> Thanks!
>>>>>>>
>>>>>>> Thanks
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> +------------------------+-------------+----------------+
>>>>>>>>>>> | iperf3 TCP benchmarks  | Stock       | Patched with   |
>>>>>>>>>>> | to Debian VM 120s      |             | fq_codel qdisc |
>>>>>>>>>>> | vhost pinned to core 0 |             |                |
>>>>>>>>>>> +------------------------+-------------+----------------+
>>>>>>>>>>> | TAP                    | 22.0 Gbit/s | 22.0 Gbit/s    |
>>>>>>>>>>> |  +                     |             |                |
>>>>>>>>>>> | vhost-net              |             |                |
>>>>>>>>>>> +------------------------+-------------+----------------+
>>>>>>>>>>>
>>>>>>>>>>> +---------------------------+-------------+----------------+
>>>>>>>>>>> | iperf3 TCP benchmarks     | Stock       | Patched with   |
>>>>>>>>>>> | to Debian VM 120s         |             | fq_codel qdisc |
>>>>>>>>>>> | vhost pinned to core 0    |             |                |
>>>>>>>>>>> | *4 iperf3 client threads* |             |                |
>>>>>>>>>>> +---------------------------+-------------+----------------+
>>>>>>>>>>> | TAP                       | 21.4 Gbit/s | 21.5 Gbit/s    |
>>>>>>>>>>> |  +                        |             |                |
>>>>>>>>>>> | vhost-net                 |             |                |
>>>>>>>>>>> +---------------------------+-------------+----------------+
>>>>>>>>>>
>>>>>>>>>> What are your thoughts on this?
>>>>>>>>>>
>>>>>>>>>> Thanks!
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Jason Wang 3 days, 12 hours ago
On Fri, Feb 6, 2026 at 6:28 AM Simon Schippers
<simon.schippers@tu-dortmund.de> wrote:
>
> On 2/5/26 04:59, Jason Wang wrote:
> > On Wed, Feb 4, 2026 at 11:44 PM Simon Schippers
> > <simon.schippers@tu-dortmund.de> wrote:
> >>
> >> On 2/3/26 04:48, Jason Wang wrote:
> >>> On Mon, Feb 2, 2026 at 4:19 AM Simon Schippers
> >>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>
> >>>> On 1/30/26 02:51, Jason Wang wrote:
> >>>>> On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
> >>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>
> >>>>>> On 1/29/26 02:14, Jason Wang wrote:
> >>>>>>> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
> >>>>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>>>
> >>>>>>>> On 1/28/26 08:03, Jason Wang wrote:
> >>>>>>>>> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
> >>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On 1/23/26 10:54, Simon Schippers wrote:
> >>>>>>>>>>> On 1/23/26 04:05, Jason Wang wrote:
> >>>>>>>>>>>> On Thu, Jan 22, 2026 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
> >>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> On 1/9/26 07:02, Jason Wang wrote:
> >>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
> >>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> On 1/8/26 04:38, Jason Wang wrote:
> >>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
> >>>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
> >>>>>>>>>>>>>>>>>> and wake the corresponding netdev subqueue when consuming an entry frees
> >>>>>>>>>>>>>>>>>> space in the underlying ptr_ring.
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Stopping of the netdev queue when the ptr_ring is full will be introduced
> >>>>>>>>>>>>>>>>>> in an upcoming commit.
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> >>>>>>>>>>>>>>>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> >>>>>>>>>>>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> >>>>>>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>>>>>  drivers/net/tap.c | 23 ++++++++++++++++++++++-
> >>>>>>>>>>>>>>>>>>  drivers/net/tun.c | 25 +++++++++++++++++++++++--
> >>>>>>>>>>>>>>>>>>  2 files changed, 45 insertions(+), 3 deletions(-)
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> >>>>>>>>>>>>>>>>>> index 1197f245e873..2442cf7ac385 100644
> >>>>>>>>>>>>>>>>>> --- a/drivers/net/tap.c
> >>>>>>>>>>>>>>>>>> +++ b/drivers/net/tap.c
> >>>>>>>>>>>>>>>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
> >>>>>>>>>>>>>>>>>>         return ret ? ret : total;
> >>>>>>>>>>>>>>>>>>  }
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> +static void *tap_ring_consume(struct tap_queue *q)
> >>>>>>>>>>>>>>>>>> +{
> >>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &q->ring;
> >>>>>>>>>>>>>>>>>> +       struct net_device *dev;
> >>>>>>>>>>>>>>>>>> +       void *ptr;
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
> >>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
> >>>>>>>>>>>>>>>>>> +               rcu_read_lock();
> >>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(q->tap)->dev;
> >>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, q->queue_index);
> >>>>>>>>>>>>>>>>>> +               rcu_read_unlock();
> >>>>>>>>>>>>>>>>>> +       }
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       spin_unlock(&ring->consumer_lock);
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       return ptr;
> >>>>>>>>>>>>>>>>>> +}
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>>  static ssize_t tap_do_read(struct tap_queue *q,
> >>>>>>>>>>>>>>>>>>                            struct iov_iter *to,
> >>>>>>>>>>>>>>>>>>                            int noblock, struct sk_buff *skb)
> >>>>>>>>>>>>>>>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
> >>>>>>>>>>>>>>>>>>                                         TASK_INTERRUPTIBLE);
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>>                 /* Read frames from the queue */
> >>>>>>>>>>>>>>>>>> -               skb = ptr_ring_consume(&q->ring);
> >>>>>>>>>>>>>>>>>> +               skb = tap_ring_consume(q);
> >>>>>>>>>>>>>>>>>>                 if (skb)
> >>>>>>>>>>>>>>>>>>                         break;
> >>>>>>>>>>>>>>>>>>                 if (noblock) {
> >>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>>>>>>>>>>>>>>>>> index 8192740357a0..7148f9a844a4 100644
> >>>>>>>>>>>>>>>>>> --- a/drivers/net/tun.c
> >>>>>>>>>>>>>>>>>> +++ b/drivers/net/tun.c
> >>>>>>>>>>>>>>>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
> >>>>>>>>>>>>>>>>>>         return total;
> >>>>>>>>>>>>>>>>>>  }
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> +static void *tun_ring_consume(struct tun_file *tfile)
> >>>>>>>>>>>>>>>>>> +{
> >>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &tfile->tx_ring;
> >>>>>>>>>>>>>>>>>> +       struct net_device *dev;
> >>>>>>>>>>>>>>>>>> +       void *ptr;
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
> >>>>>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
> >>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> I guess it's the "bug" I mentioned in the previous patch that leads to
> >>>>>>>>>>>>>>>>> the check of __ptr_ring_consume_created_space() here. If it's true,
> >>>>>>>>>>>>>>>>> that's another call to tweak the current API.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> +               rcu_read_lock();
> >>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(tfile->tun)->dev;
> >>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> This would cause the producer TX_SOFTIRQ to run on the same CPU, which
> >>>>>>>>>>>>>>>>> I'm not sure is what we want.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> What else would you suggest calling to wake the queue?
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> I don't have a good method in my mind, just want to point out its implications.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> I have to admit I'm a bit stuck at this point, particularly with this
> >>>>>>>>>>>>>> aspect.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> What is the correct way to pass the producer CPU ID to the consumer?
> >>>>>>>>>>>>>> Would it make sense to store smp_processor_id() in the tfile inside
> >>>>>>>>>>>>>> tun_net_xmit(), or should it instead be stored in the skb (similar to the
> >>>>>>>>>>>>>> XDP bit)? In the latter case, my concern is that this information may
> >>>>>>>>>>>>>> already be significantly outdated by the time it is used.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Based on that, my idea would be for the consumer to wake the producer by
> >>>>>>>>>>>>>> invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
> >>>>>>>>>>>>>> smp_call_function_single().
> >>>>>>>>>>>>>> Is this a reasonable approach?
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I'm not sure but it would introduce costs like IPI.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
> >>>>>>>>>>>>>> considered a deal-breaker for the patch set?
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> It depends on whether or not it has effects on the performance.
> >>>>>>>>>>>>> Especially when vhost is pinned.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I meant we can benchmark to see the impact. For example, pin vhost to
> >>>>>>>>>>>> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Thanks
> >>>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
> >>>>>>>>>>> for both the stock and patched versions. The benchmarks were run with
> >>>>>>>>>>> the full patch series applied, since testing only patches 1-3 would not
> >>>>>>>>>>> be meaningful - the queue is never stopped in that case, so no
> >>>>>>>>>>> TX_SOFTIRQ is triggered.
> >>>>>>>>>>>
> >>>>>>>>>>> Compared to the non-pinned CPU benchmarks in the cover letter,
> >>>>>>>>>>> performance is lower for pktgen with a single thread but higher with
> >>>>>>>>>>> four threads. The results show no regression for the patched version,
> >>>>>>>>>>> with even slight performance improvements observed:
> >>>>>>>>>>>
> >>>>>>>>>>> +-------------------------+-----------+----------------+
> >>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
> >>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
> >>>>>>>>>>> | 100M packets            |           |                |
> >>>>>>>>>>> | vhost pinned to core 0  |           |                |
> >>>>>>>>>>> +-----------+-------------+-----------+----------------+
> >>>>>>>>>>> | TAP       | Transmitted | 452 Kpps  | 454 Kpps       |
> >>>>>>>>>>> |  +        +-------------+-----------+----------------+
> >>>>>>>>>>> | vhost-net | Lost        | 1154 Kpps | 0              |
> >>>>>>>>>>> +-----------+-------------+-----------+----------------+
> >>>>>>>>>>>
> >>>>>>>>>>> +-------------------------+-----------+----------------+
> >>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
> >>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
> >>>>>>>>>>> | 100M packets            |           |                |
> >>>>>>>>>>> | vhost pinned to core 0  |           |                |
> >>>>>>>>>>> | *4 threads*             |           |                |
> >>>>>>>>>>> +-----------+-------------+-----------+----------------+
> >>>>>>>>>>> | TAP       | Transmitted | 71 Kpps   | 79 Kpps        |
> >>>>>>>>>>> |  +        +-------------+-----------+----------------+
> >>>>>>>>>>> | vhost-net | Lost        | 1527 Kpps | 0              |
> >>>>>>>>>>> +-----------+-------------+-----------+----------------+
> >>>>>>>>>
> >>>>>>>>> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
> >>>>>>>>> the guest or an XDP program that does XDP_DROP in the guest.
> >>>>>>>>
> >>>>>>>> I forgot to mention that these PPS values are per thread.
> >>>>>>>> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 316 Kpps,
> >>>>>>>> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
> >>>>>>>> 6108 Kpps and 0, respectively.
> >>>>>>>>
> >>>>>>>> Sorry about that!
> >>>>>>>>
> >>>>>>>> The pktgen benchmarks with a single thread look fine, right?
> >>>>>>>
> >>>>>>> Still looks very low. E.g., I just did a run of pktgen (using
> >>>>>>> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
> >>>>>>> and I can get 1 Mpps.
> >>>>>>
> >>>>>> Keep in mind that I am using an older CPU (i5-6300HQ). For the
> >>>>>> single-threaded tests I always used pktgen_sample01_simple.sh, and for
> >>>>>> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
> >>>>>>
> >>>>>> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
> >>>>>> though the same parameters work fine for sample01 and sample02):
> >>>>>>
> >>>>>> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 -m
> >>>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
> >>>>>> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
> >>>>>> supported
> >>>>>> ERROR: Write error(1) occurred
> >>>>>> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
> >>>>>>
> >>>>>> ...and I do not know what I am doing wrong, even after looking at
> >>>>>> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
> >>>>>> Any clues?
> >>>>>
> >>>>> Please use -b 0, and I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.
> >>>>
> >>>> I tried using "-b 0", and while it worked, there was no noticeable
> >>>> performance improvement.
> >>>>
> >>>>>
> >>>>> Another thing I can think of is to disable
> >>>>>
> >>>>> 1) mitigations in both guest and host
> >>>>> 2) any kernel debug features in both host and guest
> >>>>
> >>>> I also rebuilt the kernel with everything disabled under
> >>>> "Kernel hacking", but that didn’t make any difference either.
> >>>>
> >>>> Because of this, I ran "pktgen_sample01_simple.sh" and
> >>>> "pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
> >>>> results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
> >>>> with very similar performance between the stock and patched kernels.
> >>>>
> >>>> Personally, I think the low performance is due to the hardware.
> >>>
> >>> Let's double confirm this by:
> >>>
> >>> 1) make sure pktgen is using 100% CPU
> >>> 2) Perf doesn't show anything strange for the pktgen thread
> >>>
> >>> Thanks
> >>>
> >>
> >> I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
> >> 100-second perf stat measurement covering all kpktgend threads.
> >>
> >> Across all configurations, a single CPU was fully utilized.
> >>
> >> Apart from that, the patched variants show a higher branch frequency and
> >> a slightly increased number of context switches.
> >>
> >>
> >> The detailed results are provided below:
> >>
> >> Processor: Ryzen 5 5600X
> >>
> >> pktgen command:
> >> sudo perf stat samples/pktgen/pktgen_sample01_simple.sh -i tap0 -m
> >> 52:54:00:12:34:56 -d 10.0.0.2 -n 10000000000
> >>
> >> perf stat command:
> >> sudo perf stat --timeout 100000 -p $(pgrep kpktgend | tr '\n' ,) -o X.txt
> >>
> >>
> >> Results:
> >> Stock TAP:
> >>             46.997      context-switches                 #    467,2 cs/sec  cs_per_second
> >>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
> >>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
> >>         100.587,69 msec task-clock                       #      1,0 CPUs  CPUs_utilized
> >>      8.491.586.483      branch-misses                    #     10,9 %  branch_miss_rate         (50,24%)
> >>     77.734.761.406      branches                         #    772,8 M/sec  branch_frequency     (66,85%)
> >>    382.420.291.585      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
> >>    377.612.185.141      instructions                     #      1,0 instructions  insn_per_cycle  (66,85%)
> >>     84.012.185.936      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
> >>
> >>      100,100414494 seconds time elapsed
> >>
> >>
> >> Stock TAP+vhost-net:
> >>             47.087      context-switches                 #    468,1 cs/sec  cs_per_second
> >>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
> >>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
> >>         100.594,09 msec task-clock                       #      1,0 CPUs  CPUs_utilized
> >>      8.034.703.613      branch-misses                    #     11,1 %  branch_miss_rate         (50,24%)
> >>     72.477.989.922      branches                         #    720,5 M/sec  branch_frequency     (66,86%)
> >>    382.218.276.832      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
> >>    349.555.577.281      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
> >>     83.917.644.262      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
> >>
> >>      100,100520402 seconds time elapsed
> >>
> >>
> >> Patched TAP:
> >>             47.862      context-switches                 #    475,8 cs/sec  cs_per_second
> >>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
> >>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
> >>         100.589,30 msec task-clock                       #      1,0 CPUs  CPUs_utilized
> >>      9.337.258.794      branch-misses                    #      9,4 %  branch_miss_rate         (50,19%)
> >>     99.518.421.676      branches                         #    989,4 M/sec  branch_frequency     (66,85%)
> >>    382.508.244.894      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
> >>    312.582.270.975      instructions                     #      0,8 instructions  insn_per_cycle  (66,85%)
> >>     76.338.503.984      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,39%)
> >>
> >>      100,101262454 seconds time elapsed
> >>
> >>
> >> Patched TAP+vhost-net:
> >>             47.892      context-switches                 #    476,1 cs/sec  cs_per_second
> >>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
> >>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
> >>         100.581,95 msec task-clock                       #      1,0 CPUs  CPUs_utilized
> >>      9.083.588.313      branch-misses                    #     10,1 %  branch_miss_rate         (50,28%)
> >>     90.300.124.712      branches                         #    897,8 M/sec  branch_frequency     (66,85%)
> >>    382.374.510.376      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
> >>    340.089.181.199      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
> >>     78.151.408.955      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,31%)
> >>
> >>      100,101212911 seconds time elapsed
> >
> > Thanks for sharing. I have more questions:
> >
> > 1) The number of CPUs and vCPUs
>
> QEMU runs with a single vCPU, and my host system is now a Ryzen 5 5600X
> with 6 cores and 12 threads.
> This is my command for TAP+vhost-net:
>
> sudo qemu-system-x86_64 -hda debian.qcow2
> -netdev tap,id=mynet0,ifname=tap0,script=no,downscript=no,vhost=on
> -device virtio-net-pci,netdev=mynet0 -m 1024 -enable-kvm
>
> For TAP only, it is the same but without vhost=on.
>
> > 2) Whether you pin the vhost or vCPU threads
>
> Not in the previously shown benchmark. I pinned vhost in other benchmarks,
> but since there is only a minor PPS difference, I omitted them for the sake
> of simplicity.
>
> > 3) What does perf top look like, or perf top -p $pid_of_vhost
>
> The perf reports for the pid_of_vhost from pktgen_sample01_simple.sh
> with TAP+vhost-net (not pinned, pktgen single queue, fq_codel) are shown
> below. I cannot see a huge difference between stock and patched.
>
> I also included perf reports for the kpktgend PIDs. I find them more
> interesting because tun_net_xmit shows less overhead for the patched
> version; I assume that is due to the netdev queue being stopped.
>
> I have now benchmarked pretty much all possible combinations (with a
> script) of TAP/TAP+vhost-net, single/multi-queue pktgen, vhost
> pinned/not pinned, with/without -b 0, fq_codel/noqueue... all of that
> with perf records.
> I could share them if you want, but I feel this is getting out of hand.
>
>
> Stock:
> sudo perf record -p "$vhost_pid"
> ...
> # Overhead  Command          Shared Object               Symbol
> # ........  ...............  ..........................  ..........................................
> #
>      5.97%  vhost-4874       [kernel.kallsyms]           [k] _copy_to_iter
>      2.68%  vhost-4874       [kernel.kallsyms]           [k] tun_do_read
>      2.23%  vhost-4874       [kernel.kallsyms]           [k] native_write_msr
>      1.93%  vhost-4874       [kernel.kallsyms]           [k] __check_object_size

Let's disable CONFIG_HARDENED_USERCOPY and retry.
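
E.g. (a sketch; scripts/config is the helper in the kernel source tree):

grep HARDENED_USERCOPY /boot/config-"$(uname -r)"    # check the running kernel's config
scripts/config --file .config -d HARDENED_USERCOPY   # disable it before rebuilding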

>      1.61%  vhost-4874       [kernel.kallsyms]           [k] __slab_free.isra.0
>      1.56%  vhost-4874       [kernel.kallsyms]           [k] __get_user_nocheck_2
>      1.54%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_zero
>      1.45%  vhost-4874       [kernel.kallsyms]           [k] kmem_cache_free
>      1.43%  vhost-4874       [kernel.kallsyms]           [k] tun_recvmsg
>      1.24%  vhost-4874       [kernel.kallsyms]           [k] sk_skb_reason_drop
>      1.12%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_safe_ret
>      1.07%  vhost-4874       [kernel.kallsyms]           [k] native_read_msr
>      0.76%  vhost-4874       [kernel.kallsyms]           [k] simple_copy_to_iter
>      0.75%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_return_thunk
>      0.69%  vhost-4874       [vhost]                     [k] 0x0000000000002e70
>      0.59%  vhost-4874       [kernel.kallsyms]           [k] skb_release_data
>      0.59%  vhost-4874       [kernel.kallsyms]           [k] __skb_datagram_iter
>      0.53%  vhost-4874       [vhost]                     [k] 0x0000000000002e5f
>      0.51%  vhost-4874       [kernel.kallsyms]           [k] slab_update_freelist.isra.0
>      0.46%  vhost-4874       [kernel.kallsyms]           [k] kfree_skbmem
>      0.44%  vhost-4874       [kernel.kallsyms]           [k] skb_copy_datagram_iter
>      0.43%  vhost-4874       [kernel.kallsyms]           [k] skb_free_head
>      0.37%  qemu-system-x86  [unknown]                   [k] 0xffffffffba898b1b
>      0.35%  vhost-4874       [vhost]                     [k] 0x0000000000002e6b
>      0.33%  vhost-4874       [vhost_net]                 [k] 0x000000000000357d
>      0.28%  vhost-4874       [kernel.kallsyms]           [k] __check_heap_object
>      0.27%  vhost-4874       [vhost_net]                 [k] 0x00000000000035f3
>      0.26%  vhost-4874       [vhost_net]                 [k] 0x00000000000030f6
>      0.26%  vhost-4874       [kernel.kallsyms]           [k] __virt_addr_valid
>      0.24%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_advance
>      0.22%  vhost-4874       [kernel.kallsyms]           [k] perf_event_update_userpage
>      0.22%  vhost-4874       [kernel.kallsyms]           [k] check_stack_object
>      0.19%  qemu-system-x86  [unknown]                   [k] 0xffffffffba2a68cd
>      0.19%  vhost-4874       [kernel.kallsyms]           [k] dequeue_entities
>      0.19%  vhost-4874       [vhost_net]                 [k] 0x0000000000003237
>      0.18%  vhost-4874       [vhost_net]                 [k] 0x0000000000003550
>      0.18%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_del
>      0.18%  vhost-4874       [vhost_net]                 [k] 0x00000000000034a0
>      0.17%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_disable_all
>      0.16%  vhost-4874       [vhost_net]                 [k] 0x0000000000003523
>      0.16%  vhost-4874       [kernel.kallsyms]           [k] amd_pmu_addr_offset
> ...
>
>
> sudo perf record -p "$kpktgend_pids":
> ...
> # Overhead  Command      Shared Object      Symbol
> # ........  ...........  .................  ...............................................
> #
>     10.98%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
>     10.45%  kpktgend_0   [kernel.kallsyms]  [k] memset
>      8.40%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
>      6.31%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
>      3.13%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_safe_ret
>      2.40%  kpktgend_0   [kernel.kallsyms]  [k] sk_skb_reason_drop
>      2.11%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_return_thunk

This is a hint that SRSO mitigation is enabled.

Have you disabled CPU_MITIGATIONS via either Kconfig or the kernel command
line (mitigations=off) for both host and guest?

Thanks
Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 21 hours ago
On 2/6/26 04:21, Jason Wang wrote:
> On Fri, Feb 6, 2026 at 6:28 AM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>>
>> On 2/5/26 04:59, Jason Wang wrote:
>>> On Wed, Feb 4, 2026 at 11:44 PM Simon Schippers
>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>
>>>> On 2/3/26 04:48, Jason Wang wrote:
>>>>> On Mon, Feb 2, 2026 at 4:19 AM Simon Schippers
>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>
>>>>>> On 1/30/26 02:51, Jason Wang wrote:
>>>>>>> On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>
>>>>>>>> On 1/29/26 02:14, Jason Wang wrote:
>>>>>>>>> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>
>>>>>>>>>> On 1/28/26 08:03, Jason Wang wrote:
>>>>>>>>>>> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 1/23/26 10:54, Simon Schippers wrote:
>>>>>>>>>>>>> On 1/23/26 04:05, Jason Wang wrote:
>>>>>>>>>>>>>> On Thu, Jan 22, 2026 at 1:35 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 1/9/26 07:02, Jason Wang wrote:
>>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
>>>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On 1/8/26 04:38, Jason Wang wrote:
>>>>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
>>>>>>>>>>>>>>>>>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
>>>>>>>>>>>>>>>>>>>> and wake the corresponding netdev subqueue when consuming an entry frees
>>>>>>>>>>>>>>>>>>>> space in the underlying ptr_ring.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Stopping of the netdev queue when the ptr_ring is full will be introduced
>>>>>>>>>>>>>>>>>>>> in an upcoming commit.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>>>>>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>>>>>>>>>>>>>>>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>  drivers/net/tap.c | 23 ++++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>  drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>>>>>>>>>>>>>>>>>>  2 files changed, 45 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>> index 1197f245e873..2442cf7ac385 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>         return ret ? ret : total;
>>>>>>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> +static void *tap_ring_consume(struct tap_queue *q)
>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &q->ring;
>>>>>>>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(q->tap)->dev;
>>>>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, q->queue_index);
>>>>>>>>>>>>>>>>>>>> +               rcu_read_unlock();
>>>>>>>>>>>>>>>>>>>> +       }
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       spin_unlock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       return ptr;
>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>  static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>                            struct iov_iter *to,
>>>>>>>>>>>>>>>>>>>>                            int noblock, struct sk_buff *skb)
>>>>>>>>>>>>>>>>>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>                                         TASK_INTERRUPTIBLE);
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>                 /* Read frames from the queue */
>>>>>>>>>>>>>>>>>>>> -               skb = ptr_ring_consume(&q->ring);
>>>>>>>>>>>>>>>>>>>> +               skb = tap_ring_consume(q);
>>>>>>>>>>>>>>>>>>>>                 if (skb)
>>>>>>>>>>>>>>>>>>>>                         break;
>>>>>>>>>>>>>>>>>>>>                 if (noblock) {
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>> index 8192740357a0..7148f9a844a4 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>>>>>>>>>>>>>>>>>>>         return total;
>>>>>>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> +static void *tun_ring_consume(struct tun_file *tfile)
>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>> +       struct ptr_ring *ring = &tfile->tx_ring;
>>>>>>>>>>>>>>>>>>>> +       struct net_device *dev;
>>>>>>>>>>>>>>>>>>>> +       void *ptr;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +       ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I guess it's the "bug" I mentioned in the previous patch that leads to
>>>>>>>>>>>>>>>>>>> the check of __ptr_ring_consume_created_space() here. If it's true,
>>>>>>>>>>>>>>>>>>> it's another call to tweak the current API.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> +               rcu_read_lock();
>>>>>>>>>>>>>>>>>>>> +               dev = rcu_dereference(tfile->tun)->dev;
>>>>>>>>>>>>>>>>>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> This would cause the producer TX_SOFTIRQ to run on the same CPU, which
>>>>>>>>>>>>>>>>>>> I'm not sure is what we want.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> What else would you suggest calling to wake the queue?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I don't have a good method in my mind, just want to point out its implications.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I have to admit I'm a bit stuck at this point, particularly with this
>>>>>>>>>>>>>>>> aspect.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> What is the correct way to pass the producer CPU ID to the consumer?
>>>>>>>>>>>>>>>> Would it make sense to store smp_processor_id() in the tfile inside
>>>>>>>>>>>>>>>> tun_net_xmit(), or should it instead be stored in the skb (similar to the
>>>>>>>>>>>>>>>> XDP bit)? In the latter case, my concern is that this information may
>>>>>>>>>>>>>>>> already be significantly outdated by the time it is used.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Based on that, my idea would be for the consumer to wake the producer by
>>>>>>>>>>>>>>>> invoking a new function (e.g., tun_wake_queue()) on the producer CPU via
>>>>>>>>>>>>>>>> smp_call_function_single().
>>>>>>>>>>>>>>>> Is this a reasonable approach?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I'm not sure but it would introduce costs like IPI.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
>>>>>>>>>>>>>>>> considered a deal-breaker for the patch set?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> It depends on whether or not it affects the performance,
>>>>>>>>>>>>>>> especially when vhost is pinned.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I meant we can benchmark to see the impact. For example, pin vhost to
>>>>>>>>>>>>>> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
>>>>>>>>>>>>> for both the stock and patched versions. The benchmarks were run with
>>>>>>>>>>>>> the full patch series applied, since testing only patches 1-3 would not
>>>>>>>>>>>>> be meaningful - the queue is never stopped in that case, so no
>>>>>>>>>>>>> TX_SOFTIRQ is triggered.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Compared to the non-pinned CPU benchmarks in the cover letter,
>>>>>>>>>>>>> performance is lower for pktgen with a single thread but higher with
>>>>>>>>>>>>> four threads. The results show no regression for the patched version,
>>>>>>>>>>>>> with even slight performance improvements observed:
>>>>>>>>>>>>>
>>>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>>>>>>>> | 100M packets            |           |                |
>>>>>>>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>> | TAP       | Transmitted | 452 Kpps  | 454 Kpps       |
>>>>>>>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>>>>>>>> | vhost-net | Lost        | 1154 Kpps | 0              |
>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>>
>>>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>>>> | pktgen benchmarks to    | Stock     | Patched with   |
>>>>>>>>>>>>> | Debian VM, i5 6300HQ,   |           | fq_codel qdisc |
>>>>>>>>>>>>> | 100M packets            |           |                |
>>>>>>>>>>>>> | vhost pinned to core 0  |           |                |
>>>>>>>>>>>>> | *4 threads*             |           |                |
>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>> | TAP       | Transmitted | 71 Kpps   | 79 Kpps        |
>>>>>>>>>>>>> |  +        +-------------+-----------+----------------+
>>>>>>>>>>>>> | vhost-net | Lost        | 1527 Kpps | 0              |
>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>
>>>>>>>>>>> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
>>>>>>>>>>> the guest or an XDP program that does XDP_DROP in the guest.
>>>>>>>>>>
>>>>>>>>>> I forgot to mention that these PPS values are per thread.
>>>>>>>>>> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 316 Kpps,
>>>>>>>>>> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
>>>>>>>>>> 6108 Kpps and 0, respectively.
>>>>>>>>>>
>>>>>>>>>> Sorry about that!
>>>>>>>>>>
>>>>>>>>>> The pktgen benchmarks with a single thread look fine, right?
>>>>>>>>>
>>>>>>>>> Still looks very low. E.g., I just did a run of pktgen (using
>>>>>>>>> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
>>>>>>>>> and I can get 1 Mpps.
>>>>>>>>
>>>>>>>> Keep in mind that I am using an older CPU (i5-6300HQ). For the
>>>>>>>> single-threaded tests I always used pktgen_sample01_simple.sh, and for
>>>>>>>> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
>>>>>>>>
>>>>>>>> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
>>>>>>>> though the same parameters work fine for sample01 and sample02):
>>>>>>>>
>>>>>>>> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 -m
>>>>>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
>>>>>>>> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
>>>>>>>> supported
>>>>>>>> ERROR: Write error(1) occurred
>>>>>>>> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
>>>>>>>>
>>>>>>>> ...and I do not know what I am doing wrong, even after looking at
>>>>>>>> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
>>>>>>>> Any clues?
>>>>>>>
>>>>>>> Please use -b 0. I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.
>>>>>>
>>>>>> I tried using "-b 0", and while it worked, there was no noticeable
>>>>>> performance improvement.
>>>>>>
>>>>>>>
>>>>>>> Another thing I can think of is to disable
>>>>>>>
>>>>>>> 1) mitigations in both guest and host
>>>>>>> 2) any kernel debug features in both host and guest
>>>>>>
>>>>>> I also rebuilt the kernel with everything disabled under
>>>>>> "Kernel hacking", but that didn’t make any difference either.
>>>>>>
>>>>>> Because of this, I ran "pktgen_sample01_simple.sh" and
>>>>>> "pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
>>>>>> results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
>>>>>> with very similar performance between the stock and patched kernels.
>>>>>>
>>>>>> Personally, I think the hardware is to blame for the low performance.
>>>>>
>>>>> Let's double confirm this by:
>>>>>
>>>>> 1) make sure pktgen is using 100% CPU
>>>>> 2) Perf doesn't show anything strange for pktgen thread
>>>>>
>>>>> Thanks
>>>>>
>>>>
>>>> I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
>>>> 100-second perf stat measurement covering all kpktgend threads.
>>>>
>>>> Across all configurations, a single CPU was fully utilized.
>>>>
>>>> Apart from that, the patched variants show a higher branch frequency and
>>>> a slightly increased number of context switches.
>>>>
>>>>
>>>> The detailed results are provided below:
>>>>
>>>> Processor: Ryzen 5 5600X
>>>>
>>>> pktgen command:
>>>> sudo perf stat samples/pktgen/pktgen_sample01_simple.sh -i tap0 -m
>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 10000000000
>>>>
>>>> perf stat command:
>>>> sudo perf stat --timeout 100000 -p $(pgrep kpktgend | tr '\n' ,) -o X.txt
>>>>
>>>>
>>>> Results:
>>>> Stock TAP:
>>>>             46.997      context-switches                 #    467,2 cs/sec  cs_per_second
>>>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>>>         100.587,69 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>>>      8.491.586.483      branch-misses                    #     10,9 %  branch_miss_rate         (50,24%)
>>>>     77.734.761.406      branches                         #    772,8 M/sec  branch_frequency     (66,85%)
>>>>    382.420.291.585      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>>>    377.612.185.141      instructions                     #      1,0 instructions  insn_per_cycle  (66,85%)
>>>>     84.012.185.936      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
>>>>
>>>>      100,100414494 seconds time elapsed
>>>>
>>>>
>>>> Stock TAP+vhost-net:
>>>>             47.087      context-switches                 #    468,1 cs/sec  cs_per_second
>>>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>>>         100.594,09 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>>>      8.034.703.613      branch-misses                    #     11,1 %  branch_miss_rate         (50,24%)
>>>>     72.477.989.922      branches                         #    720,5 M/sec  branch_frequency     (66,86%)
>>>>    382.218.276.832      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>>>    349.555.577.281      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
>>>>     83.917.644.262      stalled-cycles-frontend          #     0,22 frontend_cycles_idle        (66,35%)
>>>>
>>>>      100,100520402 seconds time elapsed
>>>>
>>>>
>>>> Patched TAP:
>>>>             47.862      context-switches                 #    475,8 cs/sec  cs_per_second
>>>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>>>         100.589,30 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>>>      9.337.258.794      branch-misses                    #      9,4 %  branch_miss_rate         (50,19%)
>>>>     99.518.421.676      branches                         #    989,4 M/sec  branch_frequency     (66,85%)
>>>>    382.508.244.894      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>>>    312.582.270.975      instructions                     #      0,8 instructions  insn_per_cycle  (66,85%)
>>>>     76.338.503.984      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,39%)
>>>>
>>>>      100,101262454 seconds time elapsed
>>>>
>>>>
>>>> Patched TAP+vhost-net:
>>>>             47.892      context-switches                 #    476,1 cs/sec  cs_per_second
>>>>                  0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
>>>>                  0      page-faults                      #      0,0 faults/sec  page_faults_per_second
>>>>         100.581,95 msec task-clock                       #      1,0 CPUs  CPUs_utilized
>>>>      9.083.588.313      branch-misses                    #     10,1 %  branch_miss_rate         (50,28%)
>>>>     90.300.124.712      branches                         #    897,8 M/sec  branch_frequency     (66,85%)
>>>>    382.374.510.376      cpu-cycles                       #      3,8 GHz  cycles_frequency       (66,85%)
>>>>    340.089.181.199      instructions                     #      0,9 instructions  insn_per_cycle  (66,85%)
>>>>     78.151.408.955      stalled-cycles-frontend          #     0,20 frontend_cycles_idle        (66,31%)
>>>>
>>>>      100,101212911 seconds time elapsed
>>>
>>> Thanks for sharing. I have more questions:
>>>
>>> 1) The number of CPU and vCPUs
>>
>> QEMU runs with a single core, and my host system is now a Ryzen 5 5600X
>> with 6 cores and 12 threads.
>> This is my command for TAP+vhost-net:
>>
>> sudo qemu-system-x86_64 -hda debian.qcow2
>> -netdev tap,id=mynet0,ifname=tap0,script=no,downscript=no,vhost=on
>> -device virtio-net-pci,netdev=mynet0 -m 1024 -enable-kvm
>>
>> For TAP only it is the same but without vhost=on.
>>
>>> 2) If you pin vhost or vCPU threads
>>
>> Not in the previously shown benchmark. I pinned vhost in other
>> benchmarks, but since there is only a minor PPS difference I omitted
>> them for the sake of simplicity.
>>
>>> 3) what does perf top looks like or perf top -p $pid_of_vhost
>>
>> The perf reports for the pid_of_vhost from pktgen_sample01_simple.sh
>> with TAP+vhost-net (not pinned, pktgen single queue, fq_codel) are shown
>> below. I cannot see a huge difference between stock and patched.
>>
>> Also I included perf reports from the pktgen_pids. I find them more
>> interesting because tun_net_xmit shows less overhead for the patched
>> version. I assume that is due to the stopped netdev queue.
>>
>> I have now benchmarked pretty much all possible combinations (with a
>> script) of TAP/TAP+vhost-net, single/multi-queue pktgen, vhost
>> pinned/not pinned, with/without -b 0, fq_codel/noqueue... All of that
>> with perf records.
>> I could share them if you want, but I feel this is getting out of hand.
>>
>>
>> Stock:
>> sudo perf record -p "$vhost_pid"
>> ...
>> # Overhead  Command          Shared Object               Symbol
>> # ........  ...............  ..........................  ..........................................
>> #
>>      5.97%  vhost-4874       [kernel.kallsyms]           [k] _copy_to_iter
>>      2.68%  vhost-4874       [kernel.kallsyms]           [k] tun_do_read
>>      2.23%  vhost-4874       [kernel.kallsyms]           [k] native_write_msr
>>      1.93%  vhost-4874       [kernel.kallsyms]           [k] __check_object_size
> 
> Let's disable CONFIG_HARDENED_USERCOPY and retry.
> 
>>      1.61%  vhost-4874       [kernel.kallsyms]           [k] __slab_free.isra.0
>>      1.56%  vhost-4874       [kernel.kallsyms]           [k] __get_user_nocheck_2
>>      1.54%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_zero
>>      1.45%  vhost-4874       [kernel.kallsyms]           [k] kmem_cache_free
>>      1.43%  vhost-4874       [kernel.kallsyms]           [k] tun_recvmsg
>>      1.24%  vhost-4874       [kernel.kallsyms]           [k] sk_skb_reason_drop
>>      1.12%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_safe_ret
>>      1.07%  vhost-4874       [kernel.kallsyms]           [k] native_read_msr
>>      0.76%  vhost-4874       [kernel.kallsyms]           [k] simple_copy_to_iter
>>      0.75%  vhost-4874       [kernel.kallsyms]           [k] srso_alias_return_thunk
>>      0.69%  vhost-4874       [vhost]                     [k] 0x0000000000002e70
>>      0.59%  vhost-4874       [kernel.kallsyms]           [k] skb_release_data
>>      0.59%  vhost-4874       [kernel.kallsyms]           [k] __skb_datagram_iter
>>      0.53%  vhost-4874       [vhost]                     [k] 0x0000000000002e5f
>>      0.51%  vhost-4874       [kernel.kallsyms]           [k] slab_update_freelist.isra.0
>>      0.46%  vhost-4874       [kernel.kallsyms]           [k] kfree_skbmem
>>      0.44%  vhost-4874       [kernel.kallsyms]           [k] skb_copy_datagram_iter
>>      0.43%  vhost-4874       [kernel.kallsyms]           [k] skb_free_head
>>      0.37%  qemu-system-x86  [unknown]                   [k] 0xffffffffba898b1b
>>      0.35%  vhost-4874       [vhost]                     [k] 0x0000000000002e6b
>>      0.33%  vhost-4874       [vhost_net]                 [k] 0x000000000000357d
>>      0.28%  vhost-4874       [kernel.kallsyms]           [k] __check_heap_object
>>      0.27%  vhost-4874       [vhost_net]                 [k] 0x00000000000035f3
>>      0.26%  vhost-4874       [vhost_net]                 [k] 0x00000000000030f6
>>      0.26%  vhost-4874       [kernel.kallsyms]           [k] __virt_addr_valid
>>      0.24%  vhost-4874       [kernel.kallsyms]           [k] iov_iter_advance
>>      0.22%  vhost-4874       [kernel.kallsyms]           [k] perf_event_update_userpage
>>      0.22%  vhost-4874       [kernel.kallsyms]           [k] check_stack_object
>>      0.19%  qemu-system-x86  [unknown]                   [k] 0xffffffffba2a68cd
>>      0.19%  vhost-4874       [kernel.kallsyms]           [k] dequeue_entities
>>      0.19%  vhost-4874       [vhost_net]                 [k] 0x0000000000003237
>>      0.18%  vhost-4874       [vhost_net]                 [k] 0x0000000000003550
>>      0.18%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_del
>>      0.18%  vhost-4874       [vhost_net]                 [k] 0x00000000000034a0
>>      0.17%  vhost-4874       [kernel.kallsyms]           [k] x86_pmu_disable_all
>>      0.16%  vhost-4874       [vhost_net]                 [k] 0x0000000000003523
>>      0.16%  vhost-4874       [kernel.kallsyms]           [k] amd_pmu_addr_offset
>> ...
>>
>>
>> sudo perf record -p "$kpktgend_pids":
>> ...
>> # Overhead  Command      Shared Object      Symbol
>> # ........  ...........  .................  ...............................................
>> #
>>     10.98%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
>>     10.45%  kpktgend_0   [kernel.kallsyms]  [k] memset
>>      8.40%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
>>      6.31%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
>>      3.13%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_safe_ret
>>      2.40%  kpktgend_0   [kernel.kallsyms]  [k] sk_skb_reason_drop
>>      2.11%  kpktgend_0   [kernel.kallsyms]  [k] srso_alias_return_thunk
> 
> This is a hint that SRSO mitigation is enabled.
> 
> Have you disabled CPU_MITIGATIONS via either Kconfig or the kernel command
> line (mitigations=off) for both host and guest?
> 
> Thanks
> 

Both of your suggested changes really boosted the performance, especially
for TAP.

I disabled SRSO mitigation with spec_rstack_overflow=off and went from
"Mitigation: Safe RET" to "Vulnerable" on the host. The VM showed "Not
affected" but I applied spec_rstack_overflow=off anyway.

Here are some new benchmarks for pktgen_sample01_simple.sh:
(I also have others available and can share them if you want.)

+-------------------------+-----------+----------------+
| pktgen benchmarks to    | Stock     | Patched with   |
| Debian VM, R5 5600X,    |           | fq_codel qdisc |
| 100M packets            |           |                |
| CPU not pinned          |           |                |
+-----------+-------------+-----------+----------------+
| TAP       | Transmitted | 1330 Kpps | 1033 Kpps      |
|           +-------------+-----------+----------------+
|           | Lost        | 3895 Kpps | 0              |
+-----------+-------------+-----------+----------------+
| TAP       | Transmitted | 1408 Kpps | 1420 Kpps      |
|  +        +-------------+-----------+----------------+
| vhost-net | Lost        | 3712 Kpps | 0              |
+-----------+-------------+-----------+----------------+

I do not understand why there is a regression for TAP but not for
TAP+vhost-net...


The perf report of pktgen and perf stat for TAP & TAP+vhost-net are
below. I also included perf reports & perf stats of vhost for
TAP+vhost-net.

=========================================================================

TAP stock:
perf report of pktgen:

# Overhead  Command      Shared Object      Symbol                                        
# ........  ...........  .................  ..............................................
#
    22.39%  kpktgend_0   [kernel.kallsyms]  [k] memset
    10.59%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     7.56%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     5.74%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
     4.76%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_free
     3.23%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     2.55%  kpktgend_0   [pktgen]           [k] 0x0000000000003255
     2.49%  kpktgend_0   [pktgen]           [k] 0x000000000000324f
     2.48%  kpktgend_0   [pktgen]           [k] 0x000000000000325d
     2.44%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     2.21%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     1.46%  kpktgend_0   [kernel.kallsyms]  [k] sk_skb_reason_drop
     1.36%  kpktgend_0   [kernel.kallsyms]  [k] ip_send_check
     1.17%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     1.09%  kpktgend_0   [kernel.kallsyms]  [k] _raw_spin_lock
     1.01%  kpktgend_0   [kernel.kallsyms]  [k] kmalloc_reserve
     0.85%  kpktgend_0   [kernel.kallsyms]  [k] skb_release_data
     0.83%  kpktgend_0   [kernel.kallsyms]  [k] __netdev_alloc_skb
     0.71%  kpktgend_0   [pktgen]           [k] 0x000000000000324d
     0.68%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     0.64%  kpktgend_0   [kernel.kallsyms]  [k] skb_tx_error
     0.59%  kpktgend_0   [kernel.kallsyms]  [k] __get_random_u32_below
     0.58%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     0.51%  kpktgend_0   [pktgen]           [k] 0x000000000000422e
     0.50%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_lock
     0.48%  kpktgend_0   [kernel.kallsyms]  [k] _get_random_bytes
     0.46%  kpktgend_0   [pktgen]           [k] 0x0000000000004220
     0.46%  kpktgend_0   [pktgen]           [k] 0x0000000000004229
     0.45%  kpktgend_0   [kernel.kallsyms]  [k] skb_release_head_state
     0.44%  kpktgend_0   [pktgen]           [k] 0x000000000000211d
...


perf stat of pktgen:
 Performance counter stats for process id '4740,4741,4742,4743,4744,4745,4746,4747,4748,4749,4750,4751,4752,4753,4754,4755,4756,4757,4758,4759,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769,4770,4771,4772,4773,4774,4775,4776,4777,4778,4779,4780,4781,4782,4783,4784,4785,4786,4787':

            35.436      context-switches                 #    469,7 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
         75.443,67 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
       548.187.113      branch-misses                    #      0,5 %  branch_miss_rate         (50,18%)
   119.270.991.801      branches                         #   1580,9 M/sec  branch_frequency     (66,79%)
   347.803.953.690      cpu-cycles                       #      4,6 GHz  cycles_frequency       (66,79%)
   689.142.448.524      instructions                     #      2,0 instructions  insn_per_cycle  (66,79%)
    11.063.715.152      stalled-cycles-frontend          #     0,03 frontend_cycles_idle        (66,43%)

      75,698467362 seconds time elapsed


=========================================================================

TAP patched:
perf report of pktgen:

# Overhead  Command      Shared Object      Symbol                                        
# ........  ...........  .................  ..............................................
#
    16.18%  kpktgend_0   [pktgen]           [k] 0x0000000000003255
    16.11%  kpktgend_0   [pktgen]           [k] 0x000000000000324f
    16.10%  kpktgend_0   [pktgen]           [k] 0x000000000000325d
     4.78%  kpktgend_0   [kernel.kallsyms]  [k] memset
     4.54%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     2.62%  kpktgend_0   [pktgen]           [k] 0x000000000000324d
     2.42%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     1.89%  kpktgend_0   [kernel.kallsyms]  [k] kthread_should_stop
     1.77%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     1.66%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     1.53%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_lock
     1.44%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
     1.42%  kpktgend_0   [kernel.kallsyms]  [k] __cond_resched
     0.91%  kpktgend_0   [pktgen]           [k] 0x0000000000003877
     0.91%  kpktgend_0   [pktgen]           [k] 0x0000000000003284
     0.89%  kpktgend_0   [pktgen]           [k] 0x000000000000327f
     0.75%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     0.64%  kpktgend_0   [pktgen]           [k] 0x0000000000003061
     0.61%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     0.57%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     0.52%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     0.48%  kpktgend_0   [pktgen]           [k] 0x000000000000326d
     0.47%  kpktgend_0   [pktgen]           [k] 0x0000000000003265
     0.47%  kpktgend_0   [pktgen]           [k] 0x0000000000003864
     0.45%  kpktgend_0   [pktgen]           [k] 0x0000000000003008
     0.35%  kpktgend_0   [pktgen]           [k] 0x000000000000449b
     0.34%  kpktgend_0   [pktgen]           [k] 0x0000000000003242
     0.32%  kpktgend_0   [pktgen]           [k] 0x00000000000030a6
     0.32%  kpktgend_0   [pktgen]           [k] 0x000000000000308b
     0.32%  kpktgend_0   [pktgen]           [k] 0x0000000000003869
     0.31%  kpktgend_0   [pktgen]           [k] 0x00000000000030c2
...

perf stat of pktgen:

 Performance counter stats for process id '3257,3258,3259,3260,3261,3262,3263,3264,3265,3266,3267,3268,3269,3270,3271,3272,3273,3274,3275,3276,3277,3278,3279,3280,3281,3282,3283,3284,3285,3286,3287,3288,3289,3290,3291,3292,3293,3294,3295,3296,3297,3298,3299,3300,3301,3302,3303,3304':

            45.545      context-switches                 #    468,9 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
         97.130,77 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
       237.212.098      branch-misses                    #      0,1 %  branch_miss_rate         (50,12%)
   172.088.418.840      branches                         #   1771,7 M/sec  branch_frequency     (66,78%)
   447.219.346.605      cpu-cycles                       #      4,6 GHz  cycles_frequency       (66,79%)
   619.203.459.603      instructions                     #      1,4 instructions  insn_per_cycle  (66,79%)
     5.821.044.711      stalled-cycles-frontend          #     0,01 frontend_cycles_idle        (66,48%)

      97,353332168 seconds time elapsed

=========================================================================

TAP+vhost-net stock:

perf report of pktgen:

# Overhead  Command      Shared Object      Symbol                                        
# ........  ...........  .................  ..............................................
#
    22.25%  kpktgend_0   [kernel.kallsyms]  [k] memset
    10.73%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     7.69%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     5.71%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
     4.66%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_free
     3.20%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     2.50%  kpktgend_0   [pktgen]           [k] 0x000000000000325d
     2.48%  kpktgend_0   [pktgen]           [k] 0x0000000000003255
     2.45%  kpktgend_0   [pktgen]           [k] 0x000000000000324f
     2.41%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     2.22%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     1.44%  kpktgend_0   [kernel.kallsyms]  [k] sk_skb_reason_drop
     1.34%  kpktgend_0   [kernel.kallsyms]  [k] ip_send_check
     1.22%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     1.06%  kpktgend_0   [kernel.kallsyms]  [k] _raw_spin_lock
     1.04%  kpktgend_0   [kernel.kallsyms]  [k] kmalloc_reserve
     0.85%  kpktgend_0   [kernel.kallsyms]  [k] skb_release_data
     0.83%  kpktgend_0   [kernel.kallsyms]  [k] __netdev_alloc_skb
     0.72%  kpktgend_0   [pktgen]           [k] 0x000000000000324d
     0.70%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     0.62%  kpktgend_0   [kernel.kallsyms]  [k] skb_tx_error
     0.61%  kpktgend_0   [kernel.kallsyms]  [k] __get_random_u32_below
     0.60%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     0.52%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_lock
     0.47%  kpktgend_0   [kernel.kallsyms]  [k] _get_random_bytes
     0.47%  kpktgend_0   [pktgen]           [k] 0x000000000000422e
     0.45%  kpktgend_0   [pktgen]           [k] 0x0000000000004229
     0.44%  kpktgend_0   [pktgen]           [k] 0x0000000000004220
     0.43%  kpktgend_0   [kernel.kallsyms]  [k] skb_release_head_state
     0.42%  kpktgend_0   [kernel.kallsyms]  [k] netdev_core_stats_inc
     0.42%  kpktgend_0   [pktgen]           [k] 0x0000000000002119
...

perf stat of pktgen:

 Performance counter stats for process id '4740,4741,4742,4743,4744,4745,4746,4747,4748,4749,4750,4751,4752,4753,4754,4755,4756,4757,4758,4759,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769,4770,4771,4772,4773,4774,4775,4776,4777,4778,4779,4780,4781,4782,4783,4784,4785,4786,4787':

            34.830      context-switches                 #    489,0 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
         71.224,77 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
       506.905.400      branch-misses                    #      0,5 %  branch_miss_rate         (50,15%)
   110.207.563.428      branches                         #   1547,3 M/sec  branch_frequency     (66,78%)
   324.745.594.771      cpu-cycles                       #      4,6 GHz  cycles_frequency       (66,77%)
   635.181.893.816      instructions                     #      2,0 instructions  insn_per_cycle  (66,77%)
    10.450.586.633      stalled-cycles-frontend          #     0,03 frontend_cycles_idle        (66,46%)

      71,547831150 seconds time elapsed


perf report of vhost:

# Overhead  Command          Shared Object               Symbol                                          
# ........  ...............  ..........................  ................................................
#
     8.66%  vhost-14592      [kernel.kallsyms]           [k] _copy_to_iter
     2.76%  vhost-14592      [kernel.kallsyms]           [k] native_write_msr
     2.57%  vhost-14592      [kernel.kallsyms]           [k] __get_user_nocheck_2
     2.03%  vhost-14592      [kernel.kallsyms]           [k] iov_iter_zero
     1.21%  vhost-14592      [kernel.kallsyms]           [k] native_read_msr
     0.89%  vhost-14592      [kernel.kallsyms]           [k] kmem_cache_free
     0.85%  vhost-14592      [kernel.kallsyms]           [k] __slab_free.isra.0
     0.84%  vhost-14592      [vhost]                     [k] 0x0000000000002e3a
     0.83%  vhost-14592      [kernel.kallsyms]           [k] tun_do_read
     0.74%  vhost-14592      [kernel.kallsyms]           [k] tun_recvmsg
     0.72%  vhost-14592      [kernel.kallsyms]           [k] slab_update_freelist.isra.0
     0.49%  vhost-14592      [vhost]                     [k] 0x0000000000002e29
     0.45%  vhost-14592      [vhost]                     [k] 0x0000000000002e35
     0.43%  qemu-system-x86  [unknown]                   [k] 0xffffffffb5298b1b
     0.26%  vhost-14592      [kernel.kallsyms]           [k] __skb_datagram_iter
     0.24%  vhost-14592      [kernel.kallsyms]           [k] skb_release_data
     0.24%  qemu-system-x86  [unknown]                   [k] 0xffffffffb4ca68cd
     0.24%  vhost-14592      [kernel.kallsyms]           [k] iov_iter_advance
     0.22%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eb79c
     0.22%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba58
     0.14%  vhost-14592      [kernel.kallsyms]           [k] sk_skb_reason_drop
     0.14%  vhost-14592      [kernel.kallsyms]           [k] amd_pmu_addr_offset
     0.13%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba54
     0.13%  vhost-14592      [kernel.kallsyms]           [k] skb_free_head
     0.12%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba50
     0.12%  vhost-14592      [kernel.kallsyms]           [k] skb_release_head_state
     0.10%  qemu-system-x86  [kernel.kallsyms]           [k] native_write_msr
     0.09%  vhost-14592      [kernel.kallsyms]           [k] event_sched_out
     0.09%  vhost-14592      [kernel.kallsyms]           [k] x86_pmu_del
     0.09%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eb798
     0.09%  vhost-14592      [kernel.kallsyms]           [k] put_cpu_partial
...


perf stat of vhost:

 Performance counter stats for process id '14592':

         1.576.207      context-switches                 #  15070,7 cs/sec  cs_per_second     
               459      cpu-migrations                   #      4,4 migrations/sec  migrations_per_second
                 2      page-faults                      #      0,0 faults/sec  page_faults_per_second
        104.587,77 msec task-clock                       #      1,5 CPUs  CPUs_utilized       
       401.899.188      branch-misses                    #      0,2 %  branch_miss_rate         (49,91%)
   174.642.296.972      branches                         #   1669,8 M/sec  branch_frequency     (66,71%)
   453.598.103.128      cpu-cycles                       #      4,3 GHz  cycles_frequency       (66,98%)
   957.886.719.689      instructions                     #      2,1 instructions  insn_per_cycle  (66,77%)
    11.834.633.090      stalled-cycles-frontend          #     0,03 frontend_cycles_idle        (66,54%)

      71,561336447 seconds time elapsed


=========================================================================

TAP+vhost-net patched:

perf report of pktgen:

# Overhead  Command      Shared Object      Symbol                                        
# ........  ...........  .................  ..............................................
#
    16.83%  kpktgend_0   [pktgen]           [k] 0x000000000000324f
    16.81%  kpktgend_0   [pktgen]           [k] 0x0000000000003255
    16.74%  kpktgend_0   [pktgen]           [k] 0x000000000000325d
     5.96%  kpktgend_0   [kernel.kallsyms]  [k] memset
     3.87%  kpktgend_0   [kernel.kallsyms]  [k] __local_bh_enable_ip
     2.87%  kpktgend_0   [kernel.kallsyms]  [k] __alloc_skb
     1.77%  kpktgend_0   [kernel.kallsyms]  [k] kmem_cache_alloc_node_noprof
     1.72%  kpktgend_0   [pktgen]           [k] 0x000000000000324d
     1.68%  kpktgend_0   [kernel.kallsyms]  [k] tun_net_xmit
     1.63%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_unlock
     1.56%  kpktgend_0   [kernel.kallsyms]  [k] kthread_should_stop
     1.41%  kpktgend_0   [kernel.kallsyms]  [k] __rcu_read_lock
     1.19%  kpktgend_0   [kernel.kallsyms]  [k] __cond_resched
     0.83%  kpktgend_0   [kernel.kallsyms]  [k] chacha_permute
     0.79%  kpktgend_0   [pktgen]           [k] 0x000000000000327f
     0.78%  kpktgend_0   [pktgen]           [k] 0x0000000000003284
     0.77%  kpktgend_0   [pktgen]           [k] 0x0000000000003877
     0.69%  kpktgend_0   [kernel.kallsyms]  [k] sock_def_readable
     0.66%  kpktgend_0   [kernel.kallsyms]  [k] get_random_u32
     0.56%  kpktgend_0   [kernel.kallsyms]  [k] skb_put
     0.54%  kpktgend_0   [pktgen]           [k] 0x0000000000003061
     0.41%  kpktgend_0   [pktgen]           [k] 0x0000000000003864
     0.41%  kpktgend_0   [pktgen]           [k] 0x0000000000003265
     0.40%  kpktgend_0   [pktgen]           [k] 0x0000000000003008
     0.39%  kpktgend_0   [pktgen]           [k] 0x000000000000326d
     0.37%  kpktgend_0   [kernel.kallsyms]  [k] ip_send_check
     0.36%  kpktgend_0   [pktgen]           [k] 0x000000000000422e
     0.32%  kpktgend_0   [pktgen]           [k] 0x0000000000004220
     0.30%  kpktgend_0   [pktgen]           [k] 0x0000000000004229
     0.29%  kpktgend_0   [kernel.kallsyms]  [k] kmalloc_reserve
     0.28%  kpktgend_0   [kernel.kallsyms]  [k] _raw_spin_lock
...

perf stat of pktgen:

 Performance counter stats for process id '3257,3258,3259,3260,3261,3262,3263,3264,3265,3266,3267,3268,3269,3270,3271,3272,3273,3274,3275,3276,3277,3278,3279,3280,3281,3282,3283,3284,3285,3286,3287,3288,3289,3290,3291,3292,3293,3294,3295,3296,3297,3298,3299,3300,3301,3302,3303,3304':

            34.525      context-switches                 #    489,1 cs/sec  cs_per_second     
                 0      cpu-migrations                   #      0,0 migrations/sec  migrations_per_second
                 0      page-faults                      #      0,0 faults/sec  page_faults_per_second
         70.593,02 msec task-clock                       #      1,0 CPUs  CPUs_utilized       
       225.587.357      branch-misses                    #      0,2 %  branch_miss_rate         (50,15%)
   135.486.264.836      branches                         #   1919,3 M/sec  branch_frequency     (66,77%)
   324.131.813.682      cpu-cycles                       #      4,6 GHz  cycles_frequency       (66,77%)
   501.960.610.999      instructions                     #      1,5 instructions  insn_per_cycle  (66,77%)
     2.689.294.657      stalled-cycles-frontend          #     0,01 frontend_cycles_idle        (66,46%)

      70,928052784 seconds time elapsed


perf report of vhost:

# Overhead  Command          Shared Object               Symbol                                          
# ........  ...............  ..........................  ................................................
#
     8.95%  vhost-12220      [kernel.kallsyms]           [k] _copy_to_iter
     4.03%  vhost-12220      [kernel.kallsyms]           [k] native_write_msr
     2.44%  vhost-12220      [kernel.kallsyms]           [k] __get_user_nocheck_2
     2.12%  vhost-12220      [kernel.kallsyms]           [k] iov_iter_zero
     1.74%  vhost-12220      [kernel.kallsyms]           [k] native_read_msr
     0.92%  vhost-12220      [kernel.kallsyms]           [k] kmem_cache_free
     0.87%  vhost-12220      [vhost]                     [k] 0x0000000000002e3a
     0.86%  vhost-12220      [kernel.kallsyms]           [k] __slab_free.isra.0
     0.82%  vhost-12220      [kernel.kallsyms]           [k] tun_recvmsg
     0.82%  vhost-12220      [kernel.kallsyms]           [k] tun_do_read
     0.73%  vhost-12220      [kernel.kallsyms]           [k] slab_update_freelist.isra.0
     0.51%  vhost-12220      [vhost]                     [k] 0x0000000000002e29
     0.47%  vhost-12220      [vhost]                     [k] 0x0000000000002e35
     0.40%  qemu-system-x86  [unknown]                   [k] 0xffffffff97e98b1b
     0.28%  vhost-12220      [kernel.kallsyms]           [k] __skb_datagram_iter
     0.26%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba58
     0.24%  vhost-12220      [kernel.kallsyms]           [k] iov_iter_advance
     0.22%  qemu-system-x86  [unknown]                   [k] 0xffffffff978a68cd
     0.22%  vhost-12220      [kernel.kallsyms]           [k] skb_release_data
     0.21%  vhost-12220      [kernel.kallsyms]           [k] amd_pmu_addr_offset
     0.19%  vhost-12220      [kernel.kallsyms]           [k] tun_ring_consume_batched
     0.18%  vhost-12220      [kernel.kallsyms]           [k] __rcu_read_unlock
     0.14%  vhost-12220      [kernel.kallsyms]           [k] sk_skb_reason_drop
     0.13%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eb79c
     0.13%  vhost-12220      [kernel.kallsyms]           [k] skb_release_head_state
     0.13%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba54
     0.13%  vhost-12220      [kernel.kallsyms]           [k] psi_group_change
     0.13%  qemu-system-x86  qemu-system-x86_64          [.] 0x00000000008eba50
     0.11%  vhost-12220      [kernel.kallsyms]           [k] skb_free_head
     0.10%  vhost-12220      [kernel.kallsyms]           [k] __update_load_avg_cfs_rq
     0.10%  vhost-12220      [kernel.kallsyms]           [k] update_load_avg
...


perf stat of vhost:

 Performance counter stats for process id '12220':

         2.841.331      context-switches                 #  26120,3 cs/sec  cs_per_second     
             1.902      cpu-migrations                   #     17,5 migrations/sec  migrations_per_second
                 2      page-faults                      #      0,0 faults/sec  page_faults_per_second
        108.778,75 msec task-clock                       #      1,5 CPUs  CPUs_utilized       
       422.032.153      branch-misses                    #      0,2 %  branch_miss_rate         (49,95%)
   177.051.281.496      branches                         #   1627,6 M/sec  branch_frequency     (66,59%)
   458.977.136.165      cpu-cycles                       #      4,2 GHz  cycles_frequency       (66,47%)
   968.869.747.208      instructions                     #      2,1 instructions  insn_per_cycle  (66,70%)
    12.748.378.886      stalled-cycles-frontend          #     0,03 frontend_cycles_idle        (66,76%)

      70,946778111 seconds time elapsed

Re: [PATCH net-next v7 3/9] tun/tap: add ptr_ring consume helper with netdev queue wakeup
Posted by Simon Schippers 1 month ago
On 1/9/26 07:02, Jason Wang wrote:
> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
> <simon.schippers@tu-dortmund.de> wrote:
>>
>> On 1/8/26 04:38, Jason Wang wrote:
>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
>>> <simon.schippers@tu-dortmund.de> wrote:
>>>>
>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
>>>> and wake the corresponding netdev subqueue when consuming an entry frees
>>>> space in the underlying ptr_ring.
>>>>
>>>> Stopping of the netdev queue when the ptr_ring is full will be introduced
>>>> in an upcoming commit.
>>>>
>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>> ---
>>>>  drivers/net/tap.c | 23 ++++++++++++++++++++++-
>>>>  drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>>  2 files changed, 45 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>>>> index 1197f245e873..2442cf7ac385 100644
>>>> --- a/drivers/net/tap.c
>>>> +++ b/drivers/net/tap.c
>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
>>>>         return ret ? ret : total;
>>>>  }
>>>>
>>>> +static void *tap_ring_consume(struct tap_queue *q)
>>>> +{
>>>> +       struct ptr_ring *ring = &q->ring;
>>>> +       struct net_device *dev;
>>>> +       void *ptr;
>>>> +
>>>> +       spin_lock(&ring->consumer_lock);
>>>> +
>>>> +       ptr = __ptr_ring_consume(ring);
>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>> +               rcu_read_lock();
>>>> +               dev = rcu_dereference(q->tap)->dev;
>>>> +               netif_wake_subqueue(dev, q->queue_index);
>>>> +               rcu_read_unlock();
>>>> +       }
>>>> +
>>>> +       spin_unlock(&ring->consumer_lock);
>>>> +
>>>> +       return ptr;
>>>> +}
>>>> +
>>>>  static ssize_t tap_do_read(struct tap_queue *q,
>>>>                            struct iov_iter *to,
>>>>                            int noblock, struct sk_buff *skb)
>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>>>                                         TASK_INTERRUPTIBLE);
>>>>
>>>>                 /* Read frames from the queue */
>>>> -               skb = ptr_ring_consume(&q->ring);
>>>> +               skb = tap_ring_consume(q);
>>>>                 if (skb)
>>>>                         break;
>>>>                 if (noblock) {
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index 8192740357a0..7148f9a844a4 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>>>         return total;
>>>>  }
>>>>
>>>> +static void *tun_ring_consume(struct tun_file *tfile)
>>>> +{
>>>> +       struct ptr_ring *ring = &tfile->tx_ring;
>>>> +       struct net_device *dev;
>>>> +       void *ptr;
>>>> +
>>>> +       spin_lock(&ring->consumer_lock);
>>>> +
>>>> +       ptr = __ptr_ring_consume(ring);
>>>> +       if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>
>>> I guess it's the "bug" I mentioned in the previous patch that leads to
>>> the check of __ptr_ring_consume_created_space() here. If it's true,
>>> it's another call to tweak the current API.
>>>
>>>> +               rcu_read_lock();
>>>> +               dev = rcu_dereference(tfile->tun)->dev;
>>>> +               netif_wake_subqueue(dev, tfile->queue_index);
>>>
>>> This would cause the producer TX_SOFTIRQ to run on the same CPU, which
>>> I'm not sure is what we want.
>>
>> What else would you suggest calling to wake the queue?
> 
> I don't have a good method in my mind, just want to point out its implications.

Okay :)
> 
>>
>>>
>>>> +               rcu_read_unlock();
>>>> +       }
>>>
>>> Btw, this function duplicates a lot of logic of tap_ring_consume(); we
>>> should consider merging the logic.
>>
>> Yes, it is largely the same approach, but it would require accessing the
>> net_device each time.
> 
> The problem is that, at least for TUN, the socket is loosely coupled
> with the netdev. It means the netdev can go away while the socket
> might still exist. That's why vhost only talks to the socket, not the
> netdev. If we really want to go this way here, we should at least
> check the existence of tun->dev first.

You are right, I missed that.
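
A minimal, untested sketch of what that check could look like on the
TUN side (the tap side would need the same treatment with q->tap):

	struct tun_struct *tun;

	rcu_read_lock();
	tun = rcu_dereference(tfile->tun);
	/* The netdev can be gone while the socket still exists, so only
	 * wake the subqueue when the tun device is still around.
	 */
	if (tun)
		netif_wake_subqueue(tun->dev, tfile->queue_index);
	rcu_read_unlock();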

> 
>>
>>>
>>>> +
>>>> +       spin_unlock(&ring->consumer_lock);
>>>> +
>>>> +       return ptr;
>>>> +}
>>>> +
>>>>  static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>>>  {
>>>>         DECLARE_WAITQUEUE(wait, current);
>>>>         void *ptr = NULL;
>>>>         int error = 0;
>>>>
>>>> -       ptr = ptr_ring_consume(&tfile->tx_ring);
>>>> +       ptr = tun_ring_consume(tfile);
>>>
>>> I'm not sure having a separate patch like this helps. For example,
>>> it will introduce a performance regression.
>>
>> I ran benchmarks for the whole patch set with noqueue (where the queue is
>> not stopped to preserve the old behavior), as described in the cover
>> letter, and observed no performance regression. This leads me to conclude
>> that there is no performance impact from this patch when the queue is
>> not stopped.
> 
> Have you run a benchmark per patch? Or it might just be because the
> regression is not obvious. But at least this patch would introduce
> more atomic operations, or it might just be because the TUN doesn't
> support burst so pktgen can't reach the best PPS.

No, I haven't. I see your point that this patch adds an additional
atomic test_and_clear_bit() (which will always return false without a
queue stop), and I should test that.
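
For clarity, the consumer-side pattern being discussed is roughly the
following (just a sketch; TUN_QUEUE_STOPPED and the queue_flags word
are made-up names, not the ones used in the series):

	/* The producer sets the bit when it stops the queue in
	 * tun_net_xmit(); the consumer clears it and wakes the queue.
	 * Without a queue stop the bit is never set, so
	 * test_and_clear_bit() always returns false, but it is still
	 * an atomic RMW on every consume.
	 */
	if (test_and_clear_bit(TUN_QUEUE_STOPPED, &tfile->queue_flags))
		netif_wake_subqueue(dev, tfile->queue_index);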

> 
> Thanks
> 
> 
>>
>>>
>>>>         if (ptr)
>>>>                 goto out;
>>>>         if (noblock) {
>>>> @@ -2131,7 +2152,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>>>
>>>>         while (1) {
>>>>                 set_current_state(TASK_INTERRUPTIBLE);
>>>> -               ptr = ptr_ring_consume(&tfile->tx_ring);
>>>> +               ptr = tun_ring_consume(tfile);
>>>>                 if (ptr)
>>>>                         break;
>>>>                 if (signal_pending(current)) {
>>>> --
>>>> 2.43.0
>>>>
>>>
>>> Thanks
>>>
>>
>