From nobody Sun Feb  8 16:51:59 2026
Received: from szxga04-in.huawei.com (szxga04-in.huawei.com [45.249.212.190])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 39EEE664A2;
	Tue, 30 Jan 2024 11:38:00 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=45.249.212.190
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706614683; cv=none;
 b=ehF/s2V1vUEtEoADb6zrZ6W2jn2Ry/LoMyKwKb+g+kSlq3MRElqv5zIIVtyhMhkZ9hldlN16O0udLrikl2vFM9tiDPnb2O6HvWC/6dT7Fu1A0GJt6XoYgu3itR+uwNiaDAbVx94Xva+4F6jFToLncq2hHV8vpfprCynQJ85UKRE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706614683; c=relaxed/simple;
	bh=zqm6RABGSsvRS43KAGKzzihR7za65HBbv0dnlTU9Ess=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=lMBwyY16rvSq5OnG4QM9WLcDQ3Qua5FNhYjzUsumr++PecWY2CIrTn+05vjSZ0+pEyhwBjhSqVevL32tUpsV+HQX59JjNr2RcW+FzAfwFMlG/DriFPU2MMMLyMojIiMHtU2XH8ISmeWqGprNTs54UnR/EHMjvT4PSxruFJCgBgc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.190
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.88.214])
	by szxga04-in.huawei.com (SkyGuard) with ESMTP id 4TPNQx3VyMz29km6;
	Tue, 30 Jan 2024 19:36:09 +0800 (CST)
Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74])
	by mail.maildlp.com (Postfix) with ESMTPS id 19FB01A016B;
	Tue, 30 Jan 2024 19:37:59 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 30 Jan 2024 19:37:58 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Yunsheng Lin
	<linyunsheng@huawei.com>, Alexander Duyck <alexander.duyck@gmail.com>, Andrew
 Morton <akpm@linux-foundation.org>, Eric Dumazet <edumazet@google.com>,
	<linux-mm@kvack.org>
Subject: [PATCH net-next v4 1/5] mm/page_alloc: modify page_frag_alloc_align()
 to accept align as an argument
Date: Tue, 30 Jan 2024 19:37:06 +0800
Message-ID: <20240130113710.34511-2-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20240130113710.34511-1-linyunsheng@huawei.com>
References: <20240130113710.34511-1-linyunsheng@huawei.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To
 dggpemm500005.china.huawei.com (7.185.36.74)
Content-Type: text/plain; charset="utf-8"

napi_alloc_frag_align() and netdev_alloc_frag_align() accept
align as an argument, and they are thin wrappers around the
__napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs
doing the alignment checking and align mask conversion, in order
to call page_frag_alloc_align() directly. The intention here is
to keep the alignment checking and the alignmask conversion in an
inline wrapper to avoid those kinds of operations during execution
time, since they can usually be handled during compile time.

We are going to use page_frag_alloc_align() in vhost_net.c, which
needs the same kind of alignment checking and alignmask conversion,
so split up page_frag_alloc_align() into an inline wrapper doing the
above operations, and add __page_frag_alloc_align() which is passed
the align mask the original function expected, as suggested by
Alexander.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 include/linux/gfp.h | 15 +++++++++++----
 mm/page_alloc.c     |  8 ++++----
 net/core/skbuff.c   |  6 +++---
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..28aea17fa59b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -312,14 +312,21 @@ extern void free_pages(unsigned long addr, unsigned i=
nt order);
=20
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-extern void *page_frag_alloc_align(struct page_frag_cache *nc,
-				   unsigned int fragsz, gfp_t gfp_mask,
-				   unsigned int align_mask);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fra=
gsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
=20
 static inline void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask)
 {
-	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
 }
=20
 extern void page_frag_free(void *addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..c0f7e67c4250 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4708,9 +4708,9 @@ void __page_frag_cache_drain(struct page *page, unsig=
ned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
=20
-void *page_frag_alloc_align(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask,
-		      unsigned int align_mask)
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
 {
 	unsigned int size =3D PAGE_SIZE;
 	struct page *page;
@@ -4779,7 +4779,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n=
c,
=20
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_align);
=20
 /*
  * Frees a page fragment allocated out of either a compound or order 0 pag=
e.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edbbef563d4d..bc8f3858bc1c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -297,7 +297,7 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsi=
gned int align_mask)
=20
 	fragsz =3D SKB_DATA_ALIGN(fragsz);
=20
-	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
 }
 EXPORT_SYMBOL(__napi_alloc_frag_align);
=20
@@ -309,13 +309,13 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, =
unsigned int align_mask)
 	if (in_hardirq() || irqs_disabled()) {
 		struct page_frag_cache *nc =3D this_cpu_ptr(&netdev_alloc_cache);
=20
-		data =3D page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+		data =3D __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
 	} else {
 		struct napi_alloc_cache *nc;
=20
 		local_bh_disable();
 		nc =3D this_cpu_ptr(&napi_alloc_cache);
-		data =3D page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask=
);
+		data =3D __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_ma=
sk);
 		local_bh_enable();
 	}
 	return data;
--=20
2.33.0
From nobody Sun Feb  8 16:51:59 2026
Received: from szxga07-in.huawei.com (szxga07-in.huawei.com [45.249.212.35])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 05E2B67731;
	Tue, 30 Jan 2024 11:38:02 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=45.249.212.35
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706614685; cv=none;
 b=ZyJxfagfvnPmb5IS+TMtX/Nm6lH05mCtls+YSJpTcrnYfa4yvGeGZCzwpDZV/FmVqA1Xa3zgEc0UF4DQvq7vE3mGs3CsG0A2dymZstLuefxpBwMoYIQePI9/NBnVTPq+Wf4yex3JFjXu0gnTUF90ujrES9KTwSGwAovUCvBYCyQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706614685; c=relaxed/simple;
	bh=iIw9m8E+34rmyeAQdxJf1gG5mn3NGG9OKYzHsTrTvN4=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=Xo2Q3SAiB+uB3k0hj/Gs7EJw/36JCzoMV8jAwGiiqcLobSTGfpFtc0/7/nvxa6NXRBOPp7HWifWm8CK0+BlRRC3aB389LCzxiY4dNcxfe0N1i+gwNzs1rx4v6QXZEj9hX4vGuhFm2CKvqo9zXTtXzjC9r04nkVUUynn3azFw23Y=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.35
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.88.214])
	by szxga07-in.huawei.com (SkyGuard) with ESMTP id 4TPNQz0vJ8z1Q8H5;
	Tue, 30 Jan 2024 19:36:11 +0800 (CST)
Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74])
	by mail.maildlp.com (Postfix) with ESMTPS id 1039C1A016B;
	Tue, 30 Jan 2024 19:38:01 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 30 Jan 2024 19:38:00 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Yunsheng Lin
	<linyunsheng@huawei.com>, Alexander Duyck <alexanderduyck@fb.com>, Alexander
 Duyck <alexander.duyck@gmail.com>, "Michael S. Tsirkin" <mst@redhat.com>,
	Jason Wang <jasowang@redhat.com>, Andrew Morton <akpm@linux-foundation.org>,
	Eric Dumazet <edumazet@google.com>, <kvm@vger.kernel.org>,
	<virtualization@lists.linux.dev>, <linux-mm@kvack.org>
Subject: [PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page
 allocation
Date: Tue, 30 Jan 2024 19:37:07 +0800
Message-ID: <20240130113710.34511-3-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20240130113710.34511-1-linyunsheng@huawei.com>
References: <20240130113710.34511-1-linyunsheng@huawei.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To
 dggpemm500005.china.huawei.com (7.185.36.74)
Content-Type: text/plain; charset="utf-8"

Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails, they
fall back to allocating an order 0 page. Each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ
between the implementations: __GFP_NOMEMALLOC is
or'd to forbid access to emergency memory reserves in
__page_frag_cache_refill(), but it is not or'd in the other
implementations; __GFP_DIRECT_RECLAIM is masked off to avoid
direct reclaim in skb_page_frag_refill(), but it is not
masked off in __page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c     | 4 ++--
 net/core/sock.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net=
 *net, unsigned int sz,
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
+					  __GFP_NORETRY | __GFP_NOMEMALLOC,
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct p=
age_frag_cache *nc,
 	gfp_t gfp =3D gfp_mask;
=20
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |=3D __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
+	gfp_mask =3D (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page =3D alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
 	nc->size =3D page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 88bf810394a5..8289a3d8c375 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2919,7 +2919,7 @@ bool skb_page_frag_refill(unsigned int sz, struct pag=
e_frag *pfrag, gfp_t gfp)
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
+					  __GFP_NORETRY | __GFP_NOMEMALLOC,
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
--=20
2.33.0
From nobody Sun Feb  8 16:51:59 2026
Received: from szxga05-in.huawei.com (szxga05-in.huawei.com [45.249.212.191])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D9F836A344;
	Tue, 30 Jan 2024 11:38:11 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=45.249.212.191
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706614694; cv=none;
 b=e0IH8EiqpYQZRl1zo1IaeT90NokxG5LyRsnsXQ1ENc6v6rTazpb8sK3MCsoYEkC1+hRzEbau8vK5fY4hQIrtAETFwV6P3nE9hYOfDRsY0amQSPF9LfK4Vg9v33sbLJ0K0guKWFT2r3S4/sJQymd957OPFc8XrjkIcGiruNy2yx4=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706614694; c=relaxed/simple;
	bh=Po9UmepGah+wDcjtjlXv76oDnLy8VqpUY9/GZvXhthQ=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=CmakfbUSPhQTlM1spPW5B1k8xJnI8LRL7L5oY4+fD7/CwX6scuTSK4fR8bLyXK3nuO0ZjnBhcgUZBkgdOtVXlbhy+lMWwHPa8d2OyoRN+Q3E2/ItNiH1YpKlBowKBuTLQ/tNjR42Z7CbdKlvmWA6et7aU/iz3deFKoHLFwuV4l0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.191
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.162.112])
	by szxga05-in.huawei.com (SkyGuard) with ESMTP id 4TPNR22P6rz1gy1H;
	Tue, 30 Jan 2024 19:36:14 +0800 (CST)
Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74])
	by mail.maildlp.com (Postfix) with ESMTPS id B48431404D7;
	Tue, 30 Jan 2024 19:38:03 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 30 Jan 2024 19:38:03 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Yunsheng Lin
	<linyunsheng@huawei.com>, Jason Wang <jasowang@redhat.com>, Alexander Duyck
	<alexanderduyck@fb.com>, Jeroen de Borst <jeroendb@google.com>, Praveen
 Kaligineedi <pkaligineedi@google.com>, Shailend Chand <shailend@google.com>,
	Eric Dumazet <edumazet@google.com>, Felix Fietkau <nbd@nbd.name>, Sean Wang
	<sean.wang@mediatek.com>, Mark Lee <Mark-MC.Lee@mediatek.com>, Lorenzo
 Bianconi <lorenzo@kernel.org>, Matthias Brugger <matthias.bgg@gmail.com>,
	AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>, Keith
 Busch <kbusch@kernel.org>, Jens Axboe <axboe@kernel.dk>, Christoph Hellwig
	<hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>, Chaitanya Kulkarni
	<kch@nvidia.com>, Andrew Morton <akpm@linux-foundation.org>,
	<linux-arm-kernel@lists.infradead.org>, <linux-mediatek@lists.infradead.org>,
	<linux-nvme@lists.infradead.org>, <linux-mm@kvack.org>
Subject: [PATCH net-next v4 3/5] net: introduce page_frag_cache_drain()
Date: Tue, 30 Jan 2024 19:37:08 +0800
Message-ID: <20240130113710.34511-4-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20240130113710.34511-1-linyunsheng@huawei.com>
References: <20240130113710.34511-1-linyunsheng@huawei.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To
 dggpemm500005.china.huawei.com (7.185.36.74)
Content-Type: text/plain; charset="utf-8"

When draining a page_frag_cache, most users perform
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++---------
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---------------
 drivers/nvme/host/tcp.c                    |  7 +------
 drivers/nvme/target/tcp.c                  |  4 +---
 include/linux/gfp.h                        |  1 +
 mm/page_alloc.c                            | 10 ++++++++++
 6 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ether=
net/google/gve/gve_main.c
index db6d9ae7cd78..dec6458bb8d7 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1276,17 +1276,10 @@ static void gve_unreg_xdp_info(struct gve_priv *pri=
v)
=20
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-	struct page_frag_cache *nc;
 	int i;
=20
-	for (i =3D 0; i < priv->rx_cfg.num_queues; i++) {
-		nc =3D &priv->rx[i].page_cache;
-		if (nc->va) {
-			__page_frag_cache_drain(virt_to_page(nc->va),
-						nc->pagecnt_bias);
-			nc->va =3D NULL;
-		}
-	}
+	for (i =3D 0; i < priv->rx_cfg.num_queues; i++)
+		page_frag_cache_drain(&priv->rx[i].page_cache);
 }
=20
 static void gve_qpls_get_curr_alloc_cfg(struct gve_priv *priv,
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ether=
net/mediatek/mtk_wed_wo.c
index d58b07e7e123..7063c78bd35f 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk=
_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *=
q)
 {
-	struct page *page;
 	int i;
=20
 	for (i =3D 0; i < q->n_desc; i++) {
@@ -301,19 +300,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, stru=
ct mtk_wed_wo_queue *q)
 		entry->buf =3D NULL;
 	}
=20
-	if (!q->cache.va)
-		return;
-
-	page =3D virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
=20
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *=
q)
 {
-	struct page *page;
-
 	for (;;) {
 		void *buf =3D mtk_wed_wo_dequeue(wo, q, NULL, true);
=20
@@ -323,12 +315,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struc=
t mtk_wed_wo_queue *q)
 		skb_free_frag(buf);
 	}
=20
-	if (!q->cache.va)
-		return;
-
-	page =3D virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
=20
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d058d990532b..22e1fb9c9c0f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_c=
trl *ctrl)
=20
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-	struct page *page;
 	struct nvme_tcp_ctrl *ctrl =3D to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue =3D &ctrl->queues[qid];
 	unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nc=
trl, int qid)
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
=20
-	if (queue->pf_cache.va) {
-		page =3D virt_to_head_page(queue->pf_cache.va);
-		__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-		queue->pf_cache.va =3D NULL;
-	}
+	page_frag_cache_drain(&queue->pf_cache);
=20
 	noreclaim_flag =3D memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 6a1e6bb80062..56224dc59f17 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1591,7 +1591,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct=
 nvmet_tcp_queue *queue)
=20
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-	struct page *page;
 	struct nvmet_tcp_queue *queue =3D
 		container_of(w, struct nvmet_tcp_queue, release_work);
=20
@@ -1615,8 +1614,7 @@ static void nvmet_tcp_release_queue_work(struct work_=
struct *w)
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
 	ida_free(&nvmet_tcp_queue_ida, queue->idx);
-	page =3D virt_to_head_page(queue->pf_cache.va);
-	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
+	page_frag_cache_drain(&queue->pf_cache);
 	kfree(queue);
 }
=20
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 28aea17fa59b..6cef1c241180 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -311,6 +311,7 @@ extern void __free_pages(struct page *page, unsigned in=
t order);
 extern void free_pages(unsigned long addr, unsigned int order);
=20
 struct page_frag_cache;
+void page_frag_cache_drain(struct page_frag_cache *nc);
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fra=
gsz,
 			      gfp_t gfp_mask, unsigned int align_mask);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 636145c29f70..06aa1ebbd21c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4699,6 +4699,16 @@ static struct page *__page_frag_cache_refill(struct =
page_frag_cache *nc,
 	return page;
 }
=20
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va =3D NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
 void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
 	VM_BUG_ON_PAGE(page_ref_count(page) =3D=3D 0, page);
--=20
2.33.0
From nobody Sun Feb  8 16:51:59 2026
Received: from szxga04-in.huawei.com (szxga04-in.huawei.com [45.249.212.190])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7CC906A038;
	Tue, 30 Jan 2024 11:38:07 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=45.249.212.190
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706614690; cv=none;
 b=XMgQQz18IqsFDyGfVSgre1feaxzGx8druoOO/selqM7sjNKI3+KALCa4d1+nNqaGTprWZloQxMQiDT6ll3BEGQNltxiVq/XkG8EYFBFzWu/oIPnWQKqCAEysAMJG8JqQgePICp2vav9p4+1pOmOrHqp8COrhXaoZI9BWBISvBFY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706614690; c=relaxed/simple;
	bh=EkOVPmyG8AwI1ztH4WPEnwo5N1dm4jflvATq8fvXt7Q=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=lkWF3TPHzRDgTYJCZaBPxzWtRy1Z9NTCtjsYJd1/H19my2/SuB/106fi75LKl7lUvNVROeVlvFqUh/Lh3ErweL7oWVO1KpXhu2YrDOYdacSxh07xCzErhBGL1wdhh1YwC1yrotBA2mDjcRoNEp/M/cbR+SvsMhoAkMM9Vj2b58U=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.190
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.162.112])
	by szxga04-in.huawei.com (SkyGuard) with ESMTP id 4TPNS23wNyz1xmr9;
	Tue, 30 Jan 2024 19:37:06 +0800 (CST)
Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74])
	by mail.maildlp.com (Postfix) with ESMTPS id 76D191404FF;
	Tue, 30 Jan 2024 19:38:05 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 30 Jan 2024 19:38:05 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Yunsheng Lin
	<linyunsheng@huawei.com>, Jason Wang <jasowang@redhat.com>, "Michael S.
 Tsirkin" <mst@redhat.com>, Alexei Starovoitov <ast@kernel.org>, Daniel
 Borkmann <daniel@iogearbox.net>, Jesper Dangaard Brouer <hawk@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>, <kvm@vger.kernel.org>,
	<virtualization@lists.linux.dev>, <bpf@vger.kernel.org>
Subject: [PATCH net-next v4 4/5] vhost/net: remove
 vhost_net_page_frag_refill()
Date: Tue, 30 Jan 2024 19:37:09 +0800
Message-ID: <20240130113710.34511-5-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20240130113710.34511-1-linyunsheng@huawei.com>
References: <20240130113710.34511-1-linyunsheng@huawei.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To
 dggpemm500005.china.huawei.com (7.185.36.74)
Content-Type: text/plain; charset="utf-8"

The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frag using page_frag_alloc_align().

The added benefit is not only unifying the page frag
implementation a little, but also about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 91 ++++++++++++++-------------------------------
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
 	unsigned tx_zcopy_err;
 	/* Flush in progress. Protected by tx vq lock. */
 	bool tx_flush;
-	/* Private page frag */
-	struct page_frag page_frag;
-	/* Refcount bias of page frag */
-	int refcnt_bias;
+	/* Private page frag cache */
+	struct page_frag_cache pf_cache;
 };
=20
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, s=
ize_t total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
=20
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int=
 sz,
-				       struct page_frag *pfrag, gfp_t gfp)
-{
-	if (pfrag->page) {
-		if (pfrag->offset + sz <=3D pfrag->size)
-			return true;
-		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-	}
-
-	pfrag->offset =3D 0;
-	net->refcnt_bias =3D 0;
-	if (SKB_FRAG_PAGE_ORDER) {
-		/* Avoid direct reclaim but allow kswapd to wake */
-		pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
-					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY | __GFP_NOMEMALLOC,
-					  SKB_FRAG_PAGE_ORDER);
-		if (likely(pfrag->page)) {
-			pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-			goto done;
-		}
-	}
-	pfrag->page =3D alloc_page(gfp);
-	if (likely(pfrag->page)) {
-		pfrag->size =3D PAGE_SIZE;
-		goto done;
-	}
-	return false;
-
-done:
-	net->refcnt_bias =3D USHRT_MAX;
-	page_ref_add(pfrag->page, USHRT_MAX - 1);
-	return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
=20
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtque=
ue *nvq,
 	struct vhost_net *net =3D container_of(vq->dev, struct vhost_net,
 					     dev);
 	struct socket *sock =3D vhost_vq_get_backend(vq);
-	struct page_frag *alloc_frag =3D &net->page_frag;
 	struct virtio_net_hdr *gso;
 	struct xdp_buff *xdp =3D &nvq->xdp[nvq->batched_xdp];
 	struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtque=
ue *nvq,
 	int sock_hlen =3D nvq->sock_hlen;
 	void *buf;
 	int copied;
+	int ret;
=20
 	if (unlikely(len < nvq->sock_hlen))
 		return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtq=
ueue *nvq,
 		return -ENOSPC;
=20
 	buflen +=3D SKB_DATA_ALIGN(len + pad);
-	alloc_frag->offset =3D ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-						 alloc_frag, GFP_KERNEL)))
+	buf =3D page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+				    SMP_CACHE_BYTES);
+	if (unlikely(!buf))
 		return -ENOMEM;
=20
-	buf =3D (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied =3D copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset +
-				     offsetof(struct tun_xdp_hdr, gso),
-				     sock_hlen, from);
-	if (copied !=3D sock_hlen)
-		return -EFAULT;
+	copied =3D copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+				sock_hlen, from);
+	if (copied !=3D sock_hlen) {
+		ret =3D -EFAULT;
+		goto err;
+	}
=20
 	hdr =3D buf;
 	gso =3D &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtq=
ueue *nvq,
 			       vhost16_to_cpu(vq, gso->csum_start) +
 			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
=20
-		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
-			return -EINVAL;
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len) {
+			ret =3D -EINVAL;
+			goto err;
+		}
 	}
=20
 	len -=3D sock_hlen;
-	copied =3D copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + pad,
-				     len, from);
-	if (copied !=3D len)
-		return -EFAULT;
+	copied =3D copy_from_iter(buf + pad, len, from);
+	if (copied !=3D len) {
+		ret =3D -EFAULT;
+		goto err;
+	}
=20
 	xdp_init_buff(xdp, buflen, NULL);
 	xdp_prepare_buff(xdp, buf, pad, len, true);
 	hdr->buflen =3D buflen;
=20
-	--net->refcnt_bias;
-	alloc_frag->offset +=3D buflen;
-
 	++nvq->batched_xdp;
=20
 	return 0;
+
+err:
+	page_frag_free(buf);
+	return ret;
 }
=20
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
@@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct=
 file *f)
 			vqs[VHOST_NET_VQ_RX]);
=20
 	f->private_data =3D n;
-	n->page_frag.page =3D NULL;
-	n->refcnt_bias =3D 0;
+	n->pf_cache.va =3D NULL;
=20
 	return 0;
 }
@@ -1422,8 +1386,7 @@ static int vhost_net_release(struct inode *inode, str=
uct file *f)
 	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
 	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
 	kfree(n->dev.vqs);
-	if (n->page_frag.page)
-		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
+	page_frag_cache_drain(&n->pf_cache);
 	kvfree(n);
 	return 0;
 }
--=20
2.33.0
From nobody Sun Feb  8 16:51:59 2026
Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 547466A03A;
	Tue, 30 Jan 2024 11:38:10 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=45.249.212.188
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706614692; cv=none;
 b=m4+gmicF33lb2RS+cooG1iLnus8dd9Teu/4dH88p78OiOYtCWwzP45hsun3OExAUxQD2iQ2eEZ/XKv27o2wxHk62/tWK/xTr/W4p6NPsTev5UmKNzuwSVxd1GsfU3AFMLHHhUqpaSsaFw5GmwhnkO1WIfmJh45GkNo9r4MUqNrQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706614692; c=relaxed/simple;
	bh=cexFbN7tkIS6apl/vMvEeGyzs3s1WmkbCvYFJX01Uew=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=jbaui82C6ZhG0szUi1nimgyt5d9s3b8R1n+31BX3c5Wl7tueWi7x0XQXDuYwTbjVoLl9x3V54+NgCDMuXVnQctE377HUd57dtNvbkCAFybvLjuaVaTTeQRfjyWjXhDbyIO17GE1oN1/+JHSo+OSRkew7IbpLvjSVEEE0PhjESwA=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.188
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.88.194])
	by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4TPNSk0wTTzLqKg;
	Tue, 30 Jan 2024 19:37:42 +0800 (CST)
Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74])
	by mail.maildlp.com (Postfix) with ESMTPS id F00C91400FF;
	Tue, 30 Jan 2024 19:38:07 +0800 (CST)
Received: from localhost.localdomain (10.69.192.56) by
 dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.35; Tue, 30 Jan 2024 19:38:07 +0800
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>
CC: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Yunsheng Lin
	<linyunsheng@huawei.com>, "Michael S. Tsirkin" <mst@redhat.com>, Jason Wang
	<jasowang@redhat.com>, Xuan Zhuo <xuanzhuo@linux.alibaba.com>,
	<virtualization@lists.linux.dev>
Subject: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test
Date: Tue, 30 Jan 2024 19:37:10 +0800
Message-ID: <20240130113710.34511-6-linyunsheng@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20240130113710.34511-1-linyunsheng@huawei.com>
References: <20240130113710.34511-1-linyunsheng@huawei.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To
 dggpemm500005.china.huawei.com (7.185.36.74)
Content-Type: text/plain; charset="utf-8"

Introduce vhost_net_test, based on virtio_test, to test vhost_net
changes in the kernel.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/virtio/.gitignore       |   1 +
 tools/virtio/Makefile         |   8 +-
 tools/virtio/vhost_net_test.c | 576 ++++++++++++++++++++++++++++++++++
 3 files changed, 582 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
=20
 try-run =3D $(shell set -e;		\
 	if ($(1)) >/dev/null 2>&1;	\
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=3Dclean
=20
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-	${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-              vhost_test/Module.symvers vhost_test/modules.order *.d
+	${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+              vhost_test/.*.cmd vhost_test/Module.symvers \
+              vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index 000000000000..e336792a0d77
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <getopt.h>
+#include <limits.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <linux/virtio_types.h>
+#include <linux/vhost.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
+#include <linux/in.h>
+#include <linux/if_packet.h>
+#include <netinet/ether.h>
+
+#define RANDOM_BATCH	-1
+#define HDR_LEN		12
+#define TEST_BUF_LEN	256
+#define TEST_PTYPE	ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+	int kick;
+	int call;
+	int idx;
+	long started;
+	long completed;
+	struct pollfd fds;
+	void *ring;
+	/* copy used for control */
+	struct vring vring;
+	struct virtqueue *vq;
+};
+
+struct vdev_info {
+	struct virtio_device vdev;
+	int control;
+	struct vq_info vqs[2];
+	int nvqs;
+	void *buf;
+	size_t buf_size;
+	char *test_buf;
+	char *res_buf;
+	struct vhost_memory *mem;
+	int sock;
+	int ifindex;
+	unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+	struct ifreq ifr;
+	int len =3D HDR_LEN;
+	int fd, e;
+
+	fd =3D open("/dev/net/tun", O_RDWR);
+	if (fd < 0) {
+		perror("Cannot open /dev/net/tun");
+		return fd;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	ifr.ifr_flags =3D IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+	snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+	e =3D ioctl(fd, TUNSETIFF, &ifr);
+	if (e < 0) {
+		perror("ioctl[TUNSETIFF]");
+		close(fd);
+		return e;
+	}
+
+	e =3D ioctl(fd, TUNSETVNETHDRSZ, &len);
+	if (e < 0) {
+		perror("ioctl[TUNSETVNETHDRSZ]");
+		close(fd);
+		return e;
+	}
+
+	e =3D ioctl(fd, SIOCGIFHWADDR, &ifr);
+	if (e < 0) {
+		perror("ioctl[SIOCGIFHWADDR]");
+		close(fd);
+		return e;
+	}
+
+	memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+	return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+	struct ifreq ifr;
+
+	dev->sock =3D socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+	assert(dev->sock !=3D -1);
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+	assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >=3D 0);
+
+	dev->ifindex =3D ifr.ifr_ifindex;
+
+	/* Set the flags that bring the device up */
+	assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >=3D 0);
+	ifr.ifr_flags |=3D (IFF_UP | IFF_RUNNING);
+	assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >=3D 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+	char *sendbuf =3D dev->test_buf + HDR_LEN;
+	struct sockaddr_ll saddrll =3D {0};
+	int sockfd =3D dev->sock;
+	int ret;
+
+	saddrll.sll_family =3D PF_PACKET;
+	saddrll.sll_ifindex =3D dev->ifindex;
+	saddrll.sll_halen =3D ETH_ALEN;
+	saddrll.sll_protocol =3D htons(TEST_PTYPE);
+
+	ret =3D sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
+		     (struct sockaddr *)&saddrll,
+		     sizeof(struct sockaddr_ll));
+	assert(ret >=3D 0);
+}
+
+static bool vq_notify(struct virtqueue *vq)
+{
+	struct vq_info *info =3D vq->priv;
+	unsigned long long v =3D 1;
+	int r;
+
+	r =3D write(info->kick, &v, sizeof(v));
+	assert(r =3D=3D sizeof(v));
+
+	return true;
+}
+
+static void vq_callback(struct virtqueue *vq)
+{
+}
+
+static void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+	struct vhost_vring_addr addr =3D {
+		.index =3D info->idx,
+		.desc_user_addr =3D (uint64_t)(unsigned long)info->vring.desc,
+		.avail_user_addr =3D (uint64_t)(unsigned long)info->vring.avail,
+		.used_user_addr =3D (uint64_t)(unsigned long)info->vring.used,
+	};
+	struct vhost_vring_state state =3D { .index =3D info->idx };
+	struct vhost_vring_file file =3D { .index =3D info->idx };
+	int r;
+
+	state.num =3D info->vring.num;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+	assert(r >=3D 0);
+
+	state.num =3D 0;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+	assert(r >=3D 0);
+
+	r =3D ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+	assert(r >=3D 0);
+
+	file.fd =3D info->kick;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+	assert(r >=3D 0);
+
+	file.fd =3D info->call;
+	r =3D ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+	assert(r >=3D 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *=
vdev)
+{
+	if (info->vq)
+		vring_del_virtqueue(info->vq);
+
+	memset(info->ring, 0, vring_size(num, 4096));
+	vring_init(&info->vring, num, info->ring, 4096);
+	info->vq =3D vring_new_virtqueue(info->idx, num, 4096, vdev, true, false,
+				       info->ring, vq_notify, vq_callback, "test");
+	assert(info->vq);
+	info->vq->priv =3D info;
+}
+
+static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd)
+{
+	struct vhost_vring_file backend =3D { .index =3D idx, .fd =3D fd };
+	struct vq_info *info =3D &dev->vqs[idx];
+	int r;
+
+	info->idx =3D idx;
+	info->kick =3D eventfd(0, EFD_NONBLOCK);
+	info->call =3D eventfd(0, EFD_NONBLOCK);
+	r =3D posix_memalign(&info->ring, 4096, vring_size(num, 4096));
+	assert(!r);	/* returns 0 or a positive errno, never negative */
+	vq_reset(info, num, &dev->vdev);
+	vhost_vq_setup(dev, info);
+	info->fds.fd =3D info->call;	/* polled by wait_for_interrupt() */
+	info->fds.events =3D POLLIN;
+
+	r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend);
+	assert(!r);
+}
+
+static void vdev_info_init(struct vdev_info *dev, unsigned long long featu=
res)
+{
+	struct ether_header *eh;
+	int i, r;
+
+	dev->vdev.features =3D features;
+	INIT_LIST_HEAD(&dev->vdev.vqs);
+	spin_lock_init(&dev->vdev.vqs_list_lock);
+
+	dev->buf_size =3D (HDR_LEN + TEST_BUF_LEN) * 2;
+	dev->buf =3D malloc(dev->buf_size);
+	assert(dev->buf);
+	dev->test_buf =3D dev->buf;
+	dev->res_buf =3D dev->test_buf + HDR_LEN + TEST_BUF_LEN;
+
+	memset(dev->test_buf, 0, HDR_LEN + TEST_BUF_LEN);
+	eh =3D (struct ether_header *)(dev->test_buf + HDR_LEN);
+	eh->ether_type =3D htons(TEST_PTYPE);
+	memcpy(eh->ether_dhost, dev->mac, ETHER_ADDR_LEN);
+	memcpy(eh->ether_shost, dev->mac, ETHER_ADDR_LEN);
+
+	for (i =3D sizeof(*eh); i < TEST_BUF_LEN; i++)
+		dev->test_buf[i + HDR_LEN] =3D (char)i;
+
+	dev->control =3D open("/dev/vhost-net", O_RDWR);
+	assert(dev->control >=3D 0);
+
+	r =3D ioctl(dev->control, VHOST_SET_OWNER, NULL);
+	assert(r >=3D 0);
+
+	dev->mem =3D malloc(offsetof(struct vhost_memory, regions) +
+			  sizeof(dev->mem->regions[0]));
+	assert(dev->mem);
+	memset(dev->mem, 0, offsetof(struct vhost_memory, regions) +
+	       sizeof(dev->mem->regions[0]));
+	dev->mem->nregions =3D 1;
+	dev->mem->regions[0].guest_phys_addr =3D (long)dev->buf;
+	dev->mem->regions[0].userspace_addr =3D (long)dev->buf;
+	dev->mem->regions[0].memory_size =3D dev->buf_size;
+
+	r =3D ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+	assert(r >=3D 0);
+
+	r =3D ioctl(dev->control, VHOST_SET_FEATURES, &features);
+	assert(r >=3D 0);
+
+	dev->nvqs =3D 2;
+}
+
+static void wait_for_interrupt(struct vq_info *vq)
+{
+	unsigned long long val;
+
+	poll(&vq->fds, 1, -1);
+
+	if (vq->fds.revents & POLLIN)
+		read(vq->fds.fd, &val, sizeof(val));
+}
+
+static void verify_res_buf(char *res_buf)
+{
+	int i;
+
+	for (i =3D ETHER_HDR_LEN; i < TEST_BUF_LEN; i++)
+		assert(res_buf[i] =3D=3D (char)i);
+}
+
+static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
+			bool delayed, int batch, int bufs)
+{
+	const bool random_batch =3D batch =3D=3D RANDOM_BATCH;
+	long long spurious =3D 0;
+	struct scatterlist sl;
+	unsigned int len;
+	int r;
+
+	for (;;) {
+		long started_before =3D vq->started;
+		long completed_before =3D vq->completed;
+
+		virtqueue_disable_cb(vq->vq);
+		do {
+			if (random_batch)
+				batch =3D (random() % vq->vring.num) + 1;
+
+			while (vq->started < bufs &&
+			       (vq->started - vq->completed) < batch) {
+				sg_init_one(&sl, dev->test_buf, HDR_LEN + TEST_BUF_LEN);
+				r =3D virtqueue_add_outbuf(vq->vq, &sl, 1,
+							 dev->test_buf + vq->started,
+							 GFP_ATOMIC);
+				if (unlikely(r !=3D 0)) {
+					if (r =3D=3D -ENOSPC &&
+					    vq->started > started_before)
+						r =3D 0;
+					else
+						r =3D -1;
+					break;
+				}
+
+				++vq->started;
+
+				if (unlikely(!virtqueue_kick(vq->vq))) {
+					r =3D -1;
+					break;
+				}
+			}
+
+			if (vq->started >=3D bufs)
+				r =3D -1;
+
+			/* Flush out completed bufs if any */
+			while (virtqueue_get_buf(vq->vq, &len)) {
+				int n;
+
+				n =3D recvfrom(dev->sock, dev->res_buf, TEST_BUF_LEN, 0, NULL, NULL);
+				assert(n =3D=3D TEST_BUF_LEN);
+				verify_res_buf(dev->res_buf);
+
+				++vq->completed;
+				r =3D 0;
+			}
+		} while (r =3D=3D 0);
+
+		if (vq->completed =3D=3D completed_before && vq->started =3D=3D started_=
before)
+			++spurious;
+
+		assert(vq->completed <=3D bufs);
+		assert(vq->started <=3D bufs);
+		if (vq->completed =3D=3D bufs)
+			break;
+
+		if (delayed) {
+			if (virtqueue_enable_cb_delayed(vq->vq))
+				wait_for_interrupt(vq);
+		} else {
+			if (virtqueue_enable_cb(vq->vq))
+				wait_for_interrupt(vq);
+		}
+	}
+	printf("TX spurious wakeups: 0x%llx started=3D0x%lx completed=3D0x%lx\n",
+	       spurious, vq->started, vq->completed);
+}
+
+static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
+			bool delayed, int batch, int bufs)
+{
+	const bool random_batch =3D batch =3D=3D RANDOM_BATCH;
+	long long spurious =3D 0;
+	struct scatterlist sl;
+	unsigned int len;
+	int r;
+
+	for (;;) {
+		long started_before =3D vq->started;
+		long completed_before =3D vq->completed;
+
+		do {
+			if (random_batch)
+				batch =3D (random() % vq->vring.num) + 1;
+
+			while (vq->started < bufs &&
+			       (vq->started - vq->completed) < batch) {
+				sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN);
+
+				r =3D virtqueue_add_inbuf(vq->vq, &sl, 1,
+							dev->res_buf + vq->started,
+							GFP_ATOMIC);
+				if (unlikely(r !=3D 0)) {
+					if (r =3D=3D -ENOSPC &&
+					    vq->started > started_before)
+						r =3D 0;
+					else
+						r =3D -1;
+					break;
+				}
+
+				++vq->started;
+
+				vdev_send_packet(dev);
+
+				if (unlikely(!virtqueue_kick(vq->vq))) {
+					r =3D -1;
+					break;
+				}
+			}
+
+			if (vq->started >=3D bufs)
+				r =3D -1;
+
+			/* Flush out completed bufs if any */
+			while (virtqueue_get_buf(vq->vq, &len)) {
+				struct ether_header *eh;
+
+				eh =3D (struct ether_header *)(dev->res_buf + HDR_LEN);
+
+				/* tun netdev is up and running, ignore the
+				 * non-TEST_PTYPE packet.
+				 */
+				if (eh->ether_type !=3D htons(TEST_PTYPE)) {
+					++vq->completed;
+					r =3D 0;
+					continue;
+				}
+
+				assert(len =3D=3D TEST_BUF_LEN + HDR_LEN);
+				verify_res_buf(dev->res_buf + HDR_LEN);
+
+				++vq->completed;
+				r =3D 0;
+			}
+		} while (r =3D=3D 0);
+		if (vq->completed =3D=3D completed_before && vq->started =3D=3D started_=
before)
+			++spurious;
+
+		assert(vq->completed <=3D bufs);
+		assert(vq->started <=3D bufs);
+		if (vq->completed =3D=3D bufs)
+			break;
+	}
+
+	printf("RX spurious wakeups: 0x%llx started=3D0x%lx completed=3D0x%lx\n",
+	       spurious, vq->started, vq->completed);
+}
+
+static const char optstring[] =3D "h";
+static const struct option longopts[] =3D {
+	{
+		.name =3D "help",
+		.val =3D 'h',
+	},
+	{
+		.name =3D "event-idx",
+		.val =3D 'E',
+	},
+	{
+		.name =3D "no-event-idx",
+		.val =3D 'e',
+	},
+	{
+		.name =3D "indirect",
+		.val =3D 'I',
+	},
+	{
+		.name =3D "no-indirect",
+		.val =3D 'i',
+	},
+	{
+		.name =3D "virtio-1",
+		.val =3D '1',
+	},
+	{
+		.name =3D "no-virtio-1",
+		.val =3D '0',
+	},
+	{
+		.name =3D "delayed-interrupt",
+		.val =3D 'D',
+	},
+	{
+		.name =3D "no-delayed-interrupt",
+		.val =3D 'd',
+	},
+	{
+		.name =3D "buf-num",
+		.val =3D 'n',
+		.has_arg =3D required_argument,
+	},
+	{
+		.name =3D "batch",
+		.val =3D 'b',
+		.has_arg =3D required_argument,
+	},
+	{
+	}
+};
+
+static void help(int status)
+{
+	fprintf(stderr, "Usage: vhost_net_test [--help]"
+		" [--no-indirect]"
+		" [--no-event-idx]"
+		" [--no-virtio-1]"
+		" [--delayed-interrupt]"
+		" [--buf-num=3DN]"
+		" [--batch=3Drandom/N]"
+		"\n");
+
+	exit(status);	/* does not return to the caller */
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long long features =3D (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+		(1ULL << VIRTIO_RING_F_EVENT_IDX) | (1ULL << VIRTIO_F_VERSION_1);
+	long batch =3D 1, nbufs =3D 0x100000;
+	struct vdev_info dev;
+	bool delayed =3D false;
+	int o, fd;
+
+	for (;;) {
+		o =3D getopt_long(argc, argv, optstring, longopts, NULL);
+		switch (o) {
+		case -1:
+			goto done;
+		case '?':
+			help(2);	/* help() calls exit(), no fall-through */
+		case 'e':
+			features &=3D ~(1ULL << VIRTIO_RING_F_EVENT_IDX);
+			break;
+		case 'h':
+			help(0);	/* help() calls exit(), no fall-through */
+		case 'i':
+			features &=3D ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC);
+			break;
+		case '0':
+			features &=3D ~(1ULL << VIRTIO_F_VERSION_1);
+			break;
+		case 'D':
+			delayed =3D true;
+			break;
+		case 'b':
+			if (!strcmp(optarg, "random")) {
+				batch =3D RANDOM_BATCH;
+			} else {
+				batch =3D strtol(optarg, NULL, 10);
+				assert(batch > 0);
+				assert(batch < (long)INT_MAX + 1);
+			}
+			break;
+		case 'n':	/* handed to the tests as int bufs: bound it */
+			nbufs =3D strtol(optarg, NULL, 10);
+			assert(nbufs > 0 && nbufs <=3D INT_MAX);
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+
+done:
+	memset(&dev, 0, sizeof(dev));
+
+	fd =3D tun_alloc(&dev);
+	assert(fd >=3D 0);
+
+	vdev_info_init(&dev, features);
+	vq_info_add(&dev, 0, 256, fd);
+	vq_info_add(&dev, 1, 256, fd);
+	vdev_create_socket(&dev);
+
+	run_rx_test(&dev, &dev.vqs[0], delayed, batch, nbufs);
+	run_tx_test(&dev, &dev.vqs[1], delayed, batch, nbufs);
+
+	return 0;
+}
--=20
2.33.0