From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 2FAA4C4167B
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:18 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1346987AbjLELfJ (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:09 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41280 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347022AbjLELex (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:53 -0500
Received: from szxga03-in.huawei.com (szxga03-in.huawei.com [45.249.212.189])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 05DE9D42;
        Tue,  5 Dec 2023 03:34:52 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.53])
        by szxga03-in.huawei.com (SkyGuard) with ESMTP id 4SkyxY2tr3z14L6L;
        Tue,  5 Dec 2023 19:29:53 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:50 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        Alexander Duyck <alexander.duyck@gmail.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Eric Dumazet <edumazet@google.com>, <linux-mm@kvack.org>
Subject: [PATCH net-next 1/6] mm/page_alloc: modify page_frag_alloc_align() to
 accept align as an argument
Date: Tue, 5 Dec 2023 19:34:39 +0800
Message-ID: <20231205113444.63015-2-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

napi_alloc_frag_align() and netdev_alloc_frag_align() accept
align as an argument, and they are thin wrappers around the
__napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs
doing the align and align_mask conversion, in order to call
page_frag_alloc_align() directly.

As __napi_alloc_frag_align() and __netdev_alloc_frag_align()
APIs are only used by the above thin wrappers, it seems that
it makes more sense to remove align and align_mask conversion
and call page_frag_alloc_align() directly. By doing that, we
can also avoid the confusion between napi_alloc_frag_align()
accepting align as an argument and page_frag_alloc_align()
accepting align_mask as an argument when they both have the
'align' suffix.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 include/linux/gfp.h    |  4 ++--
 include/linux/skbuff.h | 22 ++++------------------
 mm/page_alloc.c        |  6 ++++--
 net/core/skbuff.c      | 14 +++++++-------
 4 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..bbd75976541e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -314,12 +314,12 @@ struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 extern void *page_frag_alloc_align(struct page_frag_cache *nc,
 				   unsigned int fragsz, gfp_t gfp_mask,
-				   unsigned int align_mask);
+				   unsigned int align);
=20
 static inline void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask)
 {
-	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return page_frag_alloc_align(nc, fragsz, gfp_mask, 1);
 }
=20
 extern void page_frag_free(void *addr);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b370eb8d70f7..095747c500b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3194,7 +3194,7 @@ static inline void skb_queue_purge(struct sk_buff_hea=
d *list)
 unsigned int skb_rbtree_purge(struct rb_root *root);
 void skb_errqueue_purge(struct sk_buff_head *list);
=20
-void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_ma=
sk);
+void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align);
=20
 /**
  * netdev_alloc_frag - allocate a page fragment
@@ -3205,14 +3205,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz,=
 unsigned int align_mask);
  */
 static inline void *netdev_alloc_frag(unsigned int fragsz)
 {
-	return __netdev_alloc_frag_align(fragsz, ~0u);
-}
-
-static inline void *netdev_alloc_frag_align(unsigned int fragsz,
-					    unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __netdev_alloc_frag_align(fragsz, -align);
+	return netdev_alloc_frag_align(fragsz, 1);
 }
=20
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int le=
ngth,
@@ -3272,18 +3265,11 @@ static inline void skb_free_frag(void *addr)
 	page_frag_free(addr);
 }
=20
-void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask=
);
+void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align);
=20
 static inline void *napi_alloc_frag(unsigned int fragsz)
 {
-	return __napi_alloc_frag_align(fragsz, ~0u);
-}
-
-static inline void *napi_alloc_frag_align(unsigned int fragsz,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __napi_alloc_frag_align(fragsz, -align);
+	return napi_alloc_frag_align(fragsz, 1);
 }
=20
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37ca4f4b62bf..9a16305cf985 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4718,12 +4718,14 @@ EXPORT_SYMBOL(__page_frag_cache_drain);
=20
 void *page_frag_alloc_align(struct page_frag_cache *nc,
 		      unsigned int fragsz, gfp_t gfp_mask,
-		      unsigned int align_mask)
+		      unsigned int align)
 {
 	unsigned int size =3D PAGE_SIZE;
 	struct page *page;
 	int offset;
=20
+	WARN_ON_ONCE(!is_power_of_2(align));
+
 	if (unlikely(!nc->va)) {
 refill:
 		page =3D __page_frag_cache_refill(nc, gfp_mask);
@@ -4782,7 +4784,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n=
c,
 	}
=20
 	nc->pagecnt_bias--;
-	offset &=3D align_mask;
+	offset &=3D -align;
 	nc->offset =3D offset;
=20
 	return nc->va + offset;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b157efea5dea..b98d1da4004a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -291,17 +291,17 @@ void napi_get_frags_check(struct napi_struct *napi)
 	local_bh_enable();
 }
=20
-void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align)
 {
 	struct napi_alloc_cache *nc =3D this_cpu_ptr(&napi_alloc_cache);
=20
 	fragsz =3D SKB_DATA_ALIGN(fragsz);
=20
-	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
 }
-EXPORT_SYMBOL(__napi_alloc_frag_align);
+EXPORT_SYMBOL(napi_alloc_frag_align);
=20
-void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_ma=
sk)
+void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align)
 {
 	void *data;
=20
@@ -309,18 +309,18 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, =
unsigned int align_mask)
 	if (in_hardirq() || irqs_disabled()) {
 		struct page_frag_cache *nc =3D this_cpu_ptr(&netdev_alloc_cache);
=20
-		data =3D page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+		data =3D page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align);
 	} else {
 		struct napi_alloc_cache *nc;
=20
 		local_bh_disable();
 		nc =3D this_cpu_ptr(&napi_alloc_cache);
-		data =3D page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask=
);
+		data =3D page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
 		local_bh_enable();
 	}
 	return data;
 }
-EXPORT_SYMBOL(__netdev_alloc_frag_align);
+EXPORT_SYMBOL(netdev_alloc_frag_align);
=20
 static struct sk_buff *napi_skb_cache_get(void)
 {
--=20
2.33.0
From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 887F3C4167B
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:24 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1442143AbjLELfP (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:15 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41336 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347038AbjLELey (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:54 -0500
Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 25498D51;
        Tue,  5 Dec 2023 03:34:54 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.57])
        by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4Skz2G66sczWjJk;
        Tue,  5 Dec 2023 19:33:58 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:51 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        Alexander Duyck <alexander.duyck@gmail.com>,
        "Michael S. Tsirkin" <mst@redhat.com>,
        Jason Wang <jasowang@redhat.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Eric Dumazet <edumazet@google.com>, <kvm@vger.kernel.org>,
        <virtualization@lists.linux.dev>, <linux-mm@kvack.org>
Subject: [PATCH net-next 2/6] page_frag: unify gfp bit for order 3 page
 allocation
Date: Tue, 5 Dec 2023 19:34:40 +0800
Message-ID: <20231205113444.63015-3-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Currently there seem to be three page frag implementations
which all try to allocate an order 3 page and, if that fails,
fall back to allocating an order 0 page. Each of them allows
the order 3 page allocation to fail under certain conditions
by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in by the other implementations;
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
skb_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c     | 4 ++--
 net/core/sock.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net=
 *net, unsigned int sz,
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
+					  __GFP_NORETRY | __GFP_NOMEMALLOC,
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct p=
age_frag_cache *nc,
 	gfp_t gfp =3D gfp_mask;
=20
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |=3D __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
+	gfp_mask =3D (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page =3D alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
 	nc->size =3D page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..4efa9cae4b0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct pag=
e_frag *pfrag, gfp_t gfp)
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
+					  __GFP_NORETRY | __GFP_NOMEMALLOC,
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
--=20
2.33.0
From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id A60DFC4167B
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:28 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1346894AbjLELfU (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:20 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:58552 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1346995AbjLELey (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:54 -0500
Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7BB39172A;
        Tue,  5 Dec 2023 03:34:54 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55])
        by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4SkyyH27zdzFr77;
        Tue,  5 Dec 2023 19:30:31 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:52 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        Alexander Duyck <alexander.duyck@gmail.com>,
        Andrew Morton <akpm@linux-foundation.org>, <linux-mm@kvack.org>
Subject: [PATCH net-next 3/6] mm/page_alloc: use initial zero offset for
 page_frag_alloc_align()
Date: Tue, 5 Dec 2023 19:34:41 +0800
Message-ID: <20231205113444.63015-4-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The next patch is about to use page_frag_alloc_align() to
replace vhost_net_page_frag_refill(). The main difference
between those two page frag implementations is whether an
initial zero offset is used or not.

It seems more natural to use an initial zero offset, as it
may enable more correct cache prefetching and skb frag
coalescing in the networking code, so change
page_frag_alloc_align() to use an initial zero offset.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 mm/page_alloc.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1f0b36dd81b5..083e0c38fb62 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4720,7 +4720,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n=
c,
 		      unsigned int fragsz, gfp_t gfp_mask,
 		      unsigned int align)
 {
-	unsigned int size =3D PAGE_SIZE;
+	unsigned int size;
 	struct page *page;
 	int offset;
=20
@@ -4732,10 +4732,6 @@ void *page_frag_alloc_align(struct page_frag_cache *=
nc,
 		if (!page)
 			return NULL;
=20
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size =3D nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -4744,11 +4740,18 @@ void *page_frag_alloc_align(struct page_frag_cache =
*nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc =3D page_is_pfmemalloc(page);
 		nc->pagecnt_bias =3D PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset =3D size;
+		nc->offset =3D 0;
 	}
=20
-	offset =3D nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* if size can vary use size else just use PAGE_SIZE */
+	size =3D nc->size;
+#else
+	size =3D PAGE_SIZE;
+#endif
+
+	offset =3D ALIGN(nc->offset, align);
+	if (unlikely(offset + fragsz > size)) {
 		page =3D virt_to_page(nc->va);
=20
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
@@ -4759,17 +4762,13 @@ void *page_frag_alloc_align(struct page_frag_cache =
*nc,
 			goto refill;
 		}
=20
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size =3D nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
=20
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias =3D PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset =3D size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset =3D 0;
+		if (unlikely(fragsz > size)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -4784,8 +4783,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n=
c,
 	}
=20
 	nc->pagecnt_bias--;
-	offset &=3D -align;
-	nc->offset =3D offset;
+	nc->offset =3D offset + fragsz;
=20
 	return nc->va + offset;
 }
--=20
2.33.0
From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id ED2C5C46CA3
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:31 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1442171AbjLELfX (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:23 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41374 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347050AbjLELe4 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:56 -0500
Received: from szxga01-in.huawei.com (szxga01-in.huawei.com [45.249.212.187])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 821D8D53;
        Tue,  5 Dec 2023 03:34:55 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54])
        by szxga01-in.huawei.com (SkyGuard) with ESMTP id 4Skyyz1RZQzrVDt;
        Tue,  5 Dec 2023 19:31:07 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:53 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        "Michael S. Tsirkin" <mst@redhat.com>,
        Jason Wang <jasowang@redhat.com>,
        Alexei Starovoitov <ast@kernel.org>,
        Daniel Borkmann <daniel@iogearbox.net>,
        Jesper Dangaard Brouer <hawk@kernel.org>,
        John Fastabend <john.fastabend@gmail.com>,
        <kvm@vger.kernel.org>, <virtualization@lists.linux.dev>,
        <bpf@vger.kernel.org>
Subject: [PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()
Date: Tue, 5 Dec 2023 19:34:42 +0800
Message-ID: <20231205113444.63015-5-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frag using page_frag_alloc_align().

The added benefit is not only unifying the page frag
implementations a little, but also about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 93 ++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
 	unsigned tx_zcopy_err;
 	/* Flush in progress. Protected by tx vq lock. */
 	bool tx_flush;
-	/* Private page frag */
-	struct page_frag page_frag;
-	/* Refcount bias of page frag */
-	int refcnt_bias;
+	/* Private page frag cache */
+	struct page_frag_cache pf_cache;
 };
=20
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, s=
ize_t total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
=20
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int=
 sz,
-				       struct page_frag *pfrag, gfp_t gfp)
-{
-	if (pfrag->page) {
-		if (pfrag->offset + sz <=3D pfrag->size)
-			return true;
-		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-	}
-
-	pfrag->offset =3D 0;
-	net->refcnt_bias =3D 0;
-	if (SKB_FRAG_PAGE_ORDER) {
-		/* Avoid direct reclaim but allow kswapd to wake */
-		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
-					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY | __GFP_NOMEMALLOC,
-					  SKB_FRAG_PAGE_ORDER);
-		if (likely(pfrag->page)) {
-			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-			goto done;
-		}
-	}
-	pfrag->page =3D alloc_page(gfp);
-	if (likely(pfrag->page)) {
-		pfrag->size =3D PAGE_SIZE;
-		goto done;
-	}
-	return false;
-
-done:
-	net->refcnt_bias =3D USHRT_MAX;
-	page_ref_add(pfrag->page, USHRT_MAX - 1);
-	return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
=20
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtque=
ue *nvq,
 	struct vhost_net *net =3D container_of(vq->dev, struct vhost_net,
 					     dev);
 	struct socket *sock =3D vhost_vq_get_backend(vq);
-	struct page_frag *alloc_frag =3D &net->page_frag;
 	struct virtio_net_hdr *gso;
 	struct xdp_buff *xdp =3D &nvq->xdp[nvq->batched_xdp];
 	struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtque=
ue *nvq,
 	int sock_hlen =3D nvq->sock_hlen;
 	void *buf;
 	int copied;
+	int ret;
=20
 	if (unlikely(len < nvq->sock_hlen))
 		return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtq=
ueue *nvq,
 		return -ENOSPC;
=20
 	buflen +=3D SKB_DATA_ALIGN(len + pad);
-	alloc_frag->offset =3D ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-						 alloc_frag, GFP_KERNEL)))
+	buf =3D page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+				    SMP_CACHE_BYTES);
+	if (unlikely(!buf))
 		return -ENOMEM;
=20
-	buf =3D (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied =3D copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset +
-				     offsetof(struct tun_xdp_hdr, gso),
-				     sock_hlen, from);
-	if (copied !=3D sock_hlen)
-		return -EFAULT;
+	copied =3D copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+				sock_hlen, from);
+	if (copied !=3D sock_hlen) {
+		ret =3D -EFAULT;
+		goto err;
+	}
=20
 	hdr =3D buf;
 	gso =3D &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtq=
ueue *nvq,
 			       vhost16_to_cpu(vq, gso->csum_start) +
 			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
=20
-		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
-			return -EINVAL;
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len) {
+			ret =3D -EINVAL;
+			goto err;
+		}
 	}
=20
 	len -=3D sock_hlen;
-	copied =3D copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + pad,
-				     len, from);
-	if (copied !=3D len)
-		return -EFAULT;
+	copied =3D copy_from_iter(buf + pad, len, from);
+	if (copied !=3D len) {
+		ret =3D -EFAULT;
+		goto err;
+	}
=20
 	xdp_init_buff(xdp, buflen, NULL);
 	xdp_prepare_buff(xdp, buf, pad, len, true);
 	hdr->buflen =3D buflen;
=20
-	--net->refcnt_bias;
-	alloc_frag->offset +=3D buflen;
-
 	++nvq->batched_xdp;
=20
 	return 0;
+
+err:
+	page_frag_free(buf);
+	return ret;
 }
=20
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
@@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct=
 file *f)
 			vqs[VHOST_NET_VQ_RX]);
=20
 	f->private_data =3D n;
-	n->page_frag.page =3D NULL;
-	n->refcnt_bias =3D 0;
+	n->pf_cache.va =3D NULL;
=20
 	return 0;
 }
@@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, str=
uct file *f)
 	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
 	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
 	kfree(n->dev.vqs);
-	if (n->page_frag.page)
-		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
+	if (n->pf_cache.va)
+		__page_frag_cache_drain(virt_to_head_page(n->pf_cache.va),
+					n->pf_cache.pagecnt_bias);
 	kvfree(n);
 	return 0;
 }
--=20
2.33.0
From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 48917C4167B
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:35 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1347005AbjLELf0 (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:26 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:58374 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347055AbjLELe4 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:56 -0500
Received: from szxga08-in.huawei.com (szxga08-in.huawei.com [45.249.212.255])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9C8A012C;
        Tue,  5 Dec 2023 03:34:57 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54])
        by szxga08-in.huawei.com (SkyGuard) with ESMTP id 4Skyz163TSz1Q65P;
        Tue,  5 Dec 2023 19:31:09 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:55 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        Jeroen de Borst <jeroendb@google.com>,
        Praveen Kaligineedi <pkaligineedi@google.com>,
        Shailend Chand <shailend@google.com>,
        Eric Dumazet <edumazet@google.com>,
        Felix Fietkau <nbd@nbd.name>, John Crispin <john@phrozen.org>,
        Sean Wang <sean.wang@mediatek.com>,
        Mark Lee <Mark-MC.Lee@mediatek.com>,
        Lorenzo Bianconi <lorenzo@kernel.org>,
        Matthias Brugger <matthias.bgg@gmail.com>,
        AngeloGioacchino Del Regno
        <angelogioacchino.delregno@collabora.com>,
        Keith Busch <kbusch@kernel.org>, Jens Axboe <axboe@kernel.dk>,
        Christoph Hellwig <hch@lst.de>,
        Sagi Grimberg <sagi@grimberg.me>,
        Chaitanya Kulkarni <kch@nvidia.com>,
        "Michael S. Tsirkin" <mst@redhat.com>,
        Jason Wang <jasowang@redhat.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        <linux-arm-kernel@lists.infradead.org>,
        <linux-mediatek@lists.infradead.org>,
        <linux-nvme@lists.infradead.org>, <kvm@vger.kernel.org>,
        <virtualization@lists.linux.dev>, <linux-mm@kvack.org>
Subject: [PATCH net-next 5/6] net: introduce page_frag_cache_drain()
Date: Tue, 5 Dec 2023 19:34:43 +0800
Message-ID: <20231205113444.63015-6-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

When draining a page_frag_cache, most users perform
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++---------
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---------------
 drivers/nvme/host/tcp.c                    |  7 +------
 drivers/nvme/target/tcp.c                  |  4 +---
 drivers/vhost/net.c                        |  4 +---
 include/linux/gfp.h                        |  2 ++
 mm/page_alloc.c                            | 10 ++++++++++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ether=
net/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *pri=
v)
=20
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-	struct page_frag_cache *nc;
 	int i;
=20
-	for (i =3D 0; i < priv->rx_cfg.num_queues; i++) {
-		nc =3D &priv->rx[i].page_cache;
-		if (nc->va) {
-			__page_frag_cache_drain(virt_to_page(nc->va),
-						nc->pagecnt_bias);
-			nc->va =3D NULL;
-		}
-	}
+	for (i =3D 0; i < priv->rx_cfg.num_queues; i++)
+		page_frag_cache_drain(&priv->rx[i].page_cache);
 }
=20
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ether=
net/mediatek/mtk_wed_wo.c
index 7ffbd4fca881..df0a3ceaf59b 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk=
_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *=
q)
 {
-	struct page *page;
 	int i;
=20
 	for (i =3D 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, stru=
ct mtk_wed_wo_queue *q)
 		entry->buf =3D NULL;
 	}
=20
-	if (!q->cache.va)
-		return;
-
-	page =3D virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
=20
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *=
q)
 {
-	struct page *page;
-
 	for (;;) {
 		void *buf =3D mtk_wed_wo_dequeue(wo, q, NULL, true);
=20
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struc=
t mtk_wed_wo_queue *q)
 		skb_free_frag(buf);
 	}
=20
-	if (!q->cache.va)
-		return;
-
-	page =3D virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
=20
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d79811cfa0ce..1c85e1398e4e 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_c=
trl *ctrl)
=20
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-	struct page *page;
 	struct nvme_tcp_ctrl *ctrl =3D to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue =3D &ctrl->queues[qid];
 	unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nc=
trl, int qid)
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
=20
-	if (queue->pf_cache.va) {
-		page =3D virt_to_head_page(queue->pf_cache.va);
-		__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-		queue->pf_cache.va =3D NULL;
-	}
+	page_frag_cache_drain(&queue->pf_cache);
=20
 	noreclaim_flag =3D memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4cc27856aa8f..11237557cfc5 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct=
 nvmet_tcp_queue *queue)
=20
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-	struct page *page;
 	struct nvmet_tcp_queue *queue =3D
 		container_of(w, struct nvmet_tcp_queue, release_work);
=20
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct work_=
struct *w)
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
 	ida_free(&nvmet_tcp_queue_ida, queue->idx);
-	page =3D virt_to_head_page(queue->pf_cache.va);
-	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
+	page_frag_cache_drain(&queue->pf_cache);
 	kfree(queue);
 }
=20
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 805e11d598e4..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1386,9 +1386,7 @@ static int vhost_net_release(struct inode *inode, str=
uct file *f)
 	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
 	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
 	kfree(n->dev.vqs);
-	if (n->pf_cache.va)
-		__page_frag_cache_drain(virt_to_head_page(n->pf_cache.va),
-					n->pf_cache.pagecnt_bias);
+	page_frag_cache_drain(&n->pf_cache);
 	kvfree(n);
 	return 0;
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index bbd75976541e..03ba079655d3 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -316,6 +316,8 @@ extern void *page_frag_alloc_align(struct page_frag_cac=
he *nc,
 				   unsigned int fragsz, gfp_t gfp_mask,
 				   unsigned int align);
=20
+void page_frag_cache_drain(struct page_frag_cache *nc);
+
 static inline void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 083e0c38fb62..5a0e68edcb05 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4716,6 +4716,16 @@ void __page_frag_cache_drain(struct page *page, unsi=
gned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
=20
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va =3D NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
 void *page_frag_alloc_align(struct page_frag_cache *nc,
 		      unsigned int fragsz, gfp_t gfp_mask,
 		      unsigned int align)
--=20
2.33.0
From nobody Fri Sep 20 07:00:01 2024
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 990C3C4167B
	for <linux-kernel@archiver.kernel.org>; Tue,  5 Dec 2023 11:35:38 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377024AbjLELf3 (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 5 Dec 2023 06:35:29 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41444 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347069AbjLELe5 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 5 Dec 2023 06:34:57 -0500
Received: from szxga01-in.huawei.com (szxga01-in.huawei.com [45.249.212.187])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A5426D5A;
        Tue,  5 Dec 2023 03:34:58 -0800 (PST)
Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.56])
        by szxga01-in.huawei.com (SkyGuard) with ESMTP id 4Skyz22Ct0zrVFD;
        Tue,  5 Dec 2023 19:31:10 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 5 Dec 2023 19:34:56 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
        Yunsheng Lin <linyunsheng@huawei.com>,
        "Michael S. Tsirkin" <mst@redhat.com>,
        Jason Wang <jasowang@redhat.com>,
        Xuan Zhuo <xuanzhuo@linux.alibaba.com>,
        <virtualization@lists.linux.dev>
Subject: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test
Date: Tue, 5 Dec 2023 19:34:44 +0800
Message-ID: <20231205113444.63015-7-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20231205113444.63015-1-linyunsheng@huawei.com>
References: <20231205113444.63015-1-linyunsheng@huawei.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Originating-IP: [10.69.192.56]
X-ClientProxiedBy: dggems702-chm.china.huawei.com (10.3.19.179) To
 dggpemm500005.china.huawei.com (7.185.36.74)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/virtio/Makefile         |   8 +-
 tools/virtio/vhost_net_test.c | 441 ++++++++++++++++++++++++++++++++++
 2 files changed, 446 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
=20
 try-run =3D $(shell set -e;		\
 	if ($(1)) >/dev/null 2>&1;	\
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=3Dclean
=20
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-	${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-              vhost_test/Module.symvers vhost_test/modules.order *.d
+	${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+              vhost_test/.*.cmd vhost_test/Module.symvers \
+              vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index 000000000000..7e7b7aba3668
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <getopt.h>
+#include <limits.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <linux/virtio_types.h>
+#include <linux/vhost.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
+
+#define RANDOM_BATCH -1
+
+static int tun_alloc(void)
+{
+	struct ifreq ifr;
+	int fd, e;
+
+	fd =3D open("/dev/net/tun", O_RDWR);
+	if (fd < 0) {
+		perror("Cannot open /dev/net/tun");
+		return fd;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	ifr.ifr_flags =3D IFF_TUN | IFF_NO_PI;
+	strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
+
+	e =3D ioctl(fd, TUNSETIFF, (void *) &ifr);
+	if (e < 0) {
+		perror("ioctl[TUNSETIFF]");
+		close(fd);
+		return e;
+	}
+
+	return fd;
+}
+
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+	int kick;
+	int call;
+	int num;
+	int idx;
+	void *ring;
+	/* copy used for control */
+	struct vring vring;
+	struct virtqueue *vq;
+};
+
+struct vdev_info {
+	struct virtio_device vdev;
+	int control;
+	struct pollfd fds[1];
+	struct vq_info vqs[1];
+	int nvqs;
+	void *buf;
+	size_t buf_size;
+	struct vhost_memory *mem;
+};
+
+static struct vhost_vring_file no_backend =3D { .index =3D 1, .fd =3D -1 },
+				     backend =3D { .index =3D 1, .fd =3D 1 };
+static const struct vhost_vring_state null_state =3D {};
+
+bool vq_notify(struct virtqueue *vq)
+{
+	struct vq_info *info =3D vq->priv;
+	unsigned long long v =3D 1;
+	int r;
+	r =3D write(info->kick, &v, sizeof v);
+	assert(r =3D=3D sizeof v);
+	return true;
+}
+
+void vq_callback(struct virtqueue *vq)
+{
+}
+
+
+void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+	struct vhost_vring_state state =3D { .index =3D info->idx };
+	struct vhost_vring_file file =3D { .index =3D info->idx };
+	unsigned long long features =3D dev->vdev.features;
+	struct vhost_vring_addr addr =3D {
+		.index =3D info->idx,
+		.desc_user_addr =3D (uint64_t)(unsigned long)info->vring.desc,
+		.avail_user_addr =3D (uint64_t)(unsigned long)info->vring.avail,
+		.used_user_addr =3D (uint64_t)(unsigned long)info->vring.used,
+	};
+	int r;
+	r =3D ioctl(dev->control, VHOST_SET_FEATURES, &features);
+	assert(r >=3D 0);
+	state.num =3D info->vring.num;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+	assert(r >=3D 0);
+	state.num =3D 0;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+	assert(r >=3D 0);
+	r =3D ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+	assert(r >=3D 0);
+	file.fd =3D info->kick;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+	assert(r >=3D 0);
+	file.fd =3D info->call;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+	assert(r >=3D 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *=
vdev)
+{
+	if (info->vq)
+		vring_del_virtqueue(info->vq);
+
+	memset(info->ring, 0, vring_size(num, 4096));
+	vring_init(&info->vring, num, info->ring, 4096);
+	info->vq =3D vring_new_virtqueue(info->idx, num, 4096, vdev, true, false,
+				       info->ring, vq_notify, vq_callback, "test");
+	assert(info->vq);
+	info->vq->priv =3D info;
+}
+
+static void vq_info_add(struct vdev_info *dev, int num)
+{
+	struct vq_info *info =3D &dev->vqs[dev->nvqs];
+	int r;
+
+	/* use VHOST_NET_VQ_TX for testing */
+	info->idx =3D 1;
+	info->kick =3D eventfd(0, EFD_NONBLOCK);
+	info->call =3D eventfd(0, EFD_NONBLOCK);
+	r =3D posix_memalign(&info->ring, 4096, vring_size(num, 4096));
+	assert(r >=3D 0);
+	vq_reset(info, num, &dev->vdev);
+	vhost_vq_setup(dev, info);
+	dev->fds[0].fd =3D info->call;
+	dev->fds[0].events =3D POLLIN;
+	dev->nvqs++;
+}
+
+static void vdev_info_init(struct vdev_info* dev, unsigned long long featu=
res)
+{
+	int r;
+	memset(dev, 0, sizeof *dev);
+	dev->vdev.features =3D features;
+	INIT_LIST_HEAD(&dev->vdev.vqs);
+	spin_lock_init(&dev->vdev.vqs_list_lock);
+	dev->buf_size =3D 1024;
+	dev->buf =3D malloc(dev->buf_size);
+	assert(dev->buf);
+	dev->control =3D open("/dev/vhost-net", O_RDWR);
+	assert(dev->control >=3D 0);
+	r =3D ioctl(dev->control, VHOST_SET_OWNER, NULL);
+	assert(r >=3D 0);
+	dev->mem =3D malloc(offsetof(struct vhost_memory, regions) +
+			  sizeof dev->mem->regions[0]);
+	assert(dev->mem);
+	memset(dev->mem, 0, offsetof(struct vhost_memory, regions) +
+                          sizeof dev->mem->regions[0]);
+	dev->mem->nregions =3D 1;
+	dev->mem->regions[0].guest_phys_addr =3D (long)dev->buf;
+	dev->mem->regions[0].userspace_addr =3D (long)dev->buf;
+	dev->mem->regions[0].memory_size =3D dev->buf_size;
+	r =3D ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+	assert(r >=3D 0);
+}
+
+/* TODO: this is pretty bad: we get a cache line bounce
+ * for the wait queue on poll and another one on read,
+ * plus the read which is there just to clear the
+ * current state. */
+static void wait_for_interrupt(struct vdev_info *dev)
+{
+	int i;
+	unsigned long long val;
+	poll(dev->fds, dev->nvqs, -1);
+	for (i =3D 0; i < dev->nvqs; ++i)
+		if (dev->fds[i].revents & POLLIN) {
+			read(dev->fds[i].fd, &val, sizeof val);
+		}
+}
+
+static void run_test(struct vdev_info *dev, struct vq_info *vq,
+		     bool delayed, int batch, int reset_n, int bufs)
+{
+	struct scatterlist sl;
+	long started =3D 0, completed =3D 0, next_reset =3D reset_n;
+	long completed_before, started_before;
+	int r;
+	unsigned int len;
+	long long spurious =3D 0;
+	const bool random_batch =3D batch =3D=3D RANDOM_BATCH;
+
+	r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend);
+	assert(!r);
+
+	if (!reset_n) {
+		next_reset =3D INT_MAX;
+	}
+
+	for (;;) {
+		virtqueue_disable_cb(vq->vq);
+		completed_before =3D completed;
+		started_before =3D started;
+		do {
+			const bool reset =3D completed > next_reset;
+			if (random_batch)
+				batch =3D (random() % vq->vring.num) + 1;
+
+			while (started < bufs &&
+			       (started - completed) < batch) {
+				sg_init_one(&sl, dev->buf, dev->buf_size);
+				r =3D virtqueue_add_outbuf(vq->vq, &sl, 1,
+							 dev->buf + started,
+							 GFP_ATOMIC);
+				if (unlikely(r !=3D 0)) {
+					if (r =3D=3D -ENOSPC &&
+					    started > started_before)
+						r =3D 0;
+					else
+						r =3D -1;
+					break;
+				}
+
+				++started;
+
+				if (unlikely(!virtqueue_kick(vq->vq))) {
+					r =3D -1;
+					break;
+				}
+			}
+
+			if (started >=3D bufs)
+				r =3D -1;
+
+			if (reset) {
+				r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND,
+					  &no_backend);
+				assert(!r);
+			}
+
+			/* Flush out completed bufs if any */
+			while (virtqueue_get_buf(vq->vq, &len)) {
+				++completed;
+				r =3D 0;
+			}
+
+			if (reset) {
+				struct vhost_vring_state s =3D { .index =3D 0 };
+
+				vq_reset(vq, vq->vring.num, &dev->vdev);
+
+				r =3D ioctl(dev->control, VHOST_GET_VRING_BASE,
+					  &s);
+				assert(!r);
+
+				s.num =3D 0;
+				r =3D ioctl(dev->control, VHOST_SET_VRING_BASE,
+					  &null_state);
+				assert(!r);
+
+				r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND,
+					  &backend);
+				assert(!r);
+
+				started =3D completed;
+				while (completed > next_reset)
+					next_reset +=3D completed;
+			}
+		} while (r =3D=3D 0);
+		if (completed =3D=3D completed_before && started =3D=3D started_before)
+			++spurious;
+		assert(completed <=3D bufs);
+		assert(started <=3D bufs);
+		if (completed =3D=3D bufs)
+			break;
+		if (delayed) {
+			if (virtqueue_enable_cb_delayed(vq->vq))
+				wait_for_interrupt(dev);
+		} else {
+			if (virtqueue_enable_cb(vq->vq))
+				wait_for_interrupt(dev);
+		}
+	}
+	fprintf(stderr,
+		"spurious wakeups: 0x%llx started=3D0x%lx completed=3D0x%lx\n",
+		spurious, started, completed);
+}
+
+const char optstring[] =3D "h";
+const struct option longopts[] =3D {
+	{
+		.name =3D "help",
+		.val =3D 'h',
+	},
+	{
+		.name =3D "event-idx",
+		.val =3D 'E',
+	},
+	{
+		.name =3D "no-event-idx",
+		.val =3D 'e',
+	},
+	{
+		.name =3D "indirect",
+		.val =3D 'I',
+	},
+	{
+		.name =3D "no-indirect",
+		.val =3D 'i',
+	},
+	{
+		.name =3D "virtio-1",
+		.val =3D '1',
+	},
+	{
+		.name =3D "no-virtio-1",
+		.val =3D '0',
+	},
+	{
+		.name =3D "delayed-interrupt",
+		.val =3D 'D',
+	},
+	{
+		.name =3D "no-delayed-interrupt",
+		.val =3D 'd',
+	},
+	{
+		.name =3D "buf-num",
+		.val =3D 'n',
+		.has_arg =3D required_argument,
+	},
+	{
+		.name =3D "batch",
+		.val =3D 'b',
+		.has_arg =3D required_argument,
+	},
+	{
+		.name =3D "reset",
+		.val =3D 'r',
+		.has_arg =3D optional_argument,
+	},
+	{
+	}
+};
+
+static void help(int status)
+{
+	fprintf(stderr, "Usage: virtio_test [--help]"
+		" [--no-indirect]"
+		" [--no-event-idx]"
+		" [--no-virtio-1]"
+		" [--delayed-interrupt]"
+		" [--batch=3Drandom/N]"
+		" [--reset=3DN]"
+		"\n");
+
+	exit(status);
+}
+
+int main(int argc, char **argv)
+{
+	struct vdev_info dev;
+	unsigned long long features =3D (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+		(1ULL << VIRTIO_RING_F_EVENT_IDX) | (1ULL << VIRTIO_F_VERSION_1);
+	long batch =3D 1, reset =3D 0, nbufs =3D 0x100000;
+	int o;
+	bool delayed =3D false;
+
+	for (;;) {
+		o =3D getopt_long(argc, argv, optstring, longopts, NULL);
+		switch (o) {
+		case -1:
+			goto done;
+		case '?':
+			help(2);
+		case 'e':
+			features &=3D ~(1ULL << VIRTIO_RING_F_EVENT_IDX);
+			break;
+		case 'h':
+			help(0);
+		case 'i':
+			features &=3D ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC);
+			break;
+		case '0':
+			features &=3D ~(1ULL << VIRTIO_F_VERSION_1);
+			break;
+		case 'D':
+			delayed =3D true;
+			break;
+		case 'b':
+			if (0 =3D=3D strcmp(optarg, "random")) {
+				batch =3D RANDOM_BATCH;
+			} else {
+				batch =3D strtol(optarg, NULL, 10);
+				assert(batch > 0);
+				assert(batch < (long)INT_MAX + 1);
+			}
+			break;
+		case 'r':
+			if (!optarg) {
+				reset =3D 1;
+			} else {
+				reset =3D strtol(optarg, NULL, 10);
+				assert(reset > 0);
+				assert(reset < (long)INT_MAX + 1);
+			}
+			break;
+		case 'n':
+			nbufs =3D strtol(optarg, NULL, 10);
+			assert(nbufs > 0);
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+
+done:
+	backend.fd =3D tun_alloc();
+	assert(backend.fd >=3D 0);
+	vdev_info_init(&dev, features);
+	vq_info_add(&dev, 256);
+	run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs);
+	return 0;
+}
--=20
2.33.0