From: Yunsheng Lin
Subject: [PATCH net-next v7 1/6] page_pool: frag API support for 32-bit arch with 64-bit DMA
Date: Wed, 16 Aug 2023 18:01:08 +0800
Message-ID: <20230816100113.41034-2-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>

Currently page_pool_alloc_frag() is not supported on 32-bit arches with
64-bit DMA, because pp_frag_count and dma_addr_upper overlap in
'struct page' on those arches. Such configurations seem to be quite
common, see [1], which means a driver may need to handle the fallback
itself when using the frag API.

To simplify the driver's work when using the frag API, this patch
allows page_pool_alloc_frag() to call page_pool_alloc_pages() and
return whole pages on those arches. Add a PP_FLAG_PAGE_SPLIT_IN_DRIVER
flag in order to fail the page_pool creation for 32-bit arches with
64-bit DMA when the driver tries to do the page splitting itself.

Note that this may aggravate the truesize underestimate problem for
skb, as there is no page splitting for those pages. If a driver needs
an accurate truesize, it may calculate it according to the frag size,
the page order and whether PAGE_POOL_DMA_USE_PP_FRAG_COUNT is true. We
may provide a helper for that if it turns out to be helpful.

1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/
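Such a helper does not exist yet; the sketch below only illustrates the calculation the paragraph above describes, and page_pool_frag_truesize() is a hypothetical name, not part of this series:

	/* Hypothetical helper, not part of this series: estimate the truesize
	 * of a buffer obtained from page_pool_alloc_frag().  On 32-bit arches
	 * with 64-bit DMA the pool hands out whole (possibly high-order)
	 * pages, so the truesize is the full page size; otherwise it is the
	 * cache-line-aligned frag size.
	 */
	static inline unsigned int page_pool_frag_truesize(struct page_pool *pool,
							   unsigned int size)
	{
		if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
			return PAGE_SIZE << pool->p.order;

		return ALIGN(size, dma_get_cache_alignment());
	}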
Signed-off-by: Yunsheng Lin
CC: Lorenzo Bianconi
CC: Alexander Duyck
CC: Liang Chen
CC: Alexander Lobakin
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  3 +-
 include/net/page_pool/helpers.h               | 38 +++++++++++++++--
 include/net/page_pool/types.h                 | 42 ++++++++++++-------
 net/core/page_pool.c                          | 15 +++----
 4 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bc9d5a5bea01..ec9c5a8cbda6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -834,7 +834,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		struct page_pool_params pp_params = { 0 };

 		pp_params.order = 0;
-		pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG;
+		pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
+				  PP_FLAG_PAGE_SPLIT_IN_DRIVER;
 		pp_params.pool_size = pool_size;
 		pp_params.nid = node;
 		pp_params.dev = rq->pdev;
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 94231533a369..cb18de55f239 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -29,8 +29,12 @@
 #ifndef _NET_PAGE_POOL_HELPERS_H
 #define _NET_PAGE_POOL_HELPERS_H

+#include <linux/dma-mapping.h>
 #include <net/page_pool/types.h>

+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+		(sizeof(dma_addr_t) > sizeof(unsigned long))
+
 #ifdef CONFIG_PAGE_POOL_STATS
 int page_pool_ethtool_stats_get_count(void);
 u8 *page_pool_ethtool_stats_get_strings(u8 *data);
@@ -73,6 +77,29 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }

+static inline struct page *page_pool_alloc_frag(struct page_pool *pool,
+						unsigned int *offset,
+						unsigned int size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+
+	size = ALIGN(size, dma_get_cache_alignment());
+
+	if (WARN_ON(size > max_size))
+		return NULL;
+
+	/* Don't allow page splitting and allocate one big frag
+	 * for 32-bit arch with 64-bit DMA, corresponding to
+	 * the checking in page_pool_is_last_frag().
+	 */
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
+		*offset = 0;
+		return page_pool_alloc_pages(pool, gfp);
+	}
+
+	return __page_pool_alloc_frag(pool, offset, size, gfp);
+}
+
 static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
 						    unsigned int *offset,
 						    unsigned int size)
@@ -134,8 +161,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
 static inline bool page_pool_is_last_frag(struct page_pool *pool,
 					  struct page *page)
 {
-	/* If fragments aren't enabled or count is 0 we were the last user */
+	/* We assume we are the last frag user that is still holding
+	 * on to the page if:
+	 * 1. Fragments aren't enabled.
+	 * 2. We are running in 32-bit arch with 64-bit DMA.
+	 * 3. page_pool_defrag_page() indicate we are the last user.
+	 */
 	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+	       PAGE_POOL_DMA_USE_PP_FRAG_COUNT ||
 	       (page_pool_defrag_page(page, 1) == 0);
 }

@@ -197,9 +230,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }

-#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
-		(sizeof(dma_addr_t) > sizeof(unsigned long))
-
 /**
  * page_pool_get_dma_addr() - Retrieve the stored DMA address.
 * @page:	page allocated from a page pool
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 887e7946a597..079337c42aa6 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -6,21 +6,29 @@
 #include <linux/dma-direction.h>
 #include <linux/ptr_ring.h>

-#define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
-					* map/unmap
-					*/
-#define PP_FLAG_DMA_SYNC_DEV	BIT(1) /* If set all pages that the driver gets
-					* from page_pool will be
-					* DMA-synced-for-device according to
-					* the length provided by the device
-					* driver.
-					* Please note DMA-sync-for-CPU is still
-					* device driver responsibility
-					*/
-#define PP_FLAG_PAGE_FRAG	BIT(2) /* for page frag feature */
+/* Should page_pool do the DMA map/unmap */
+#define PP_FLAG_DMA_MAP		BIT(0)
+
+/* If set all pages that the driver gets from page_pool will be
+ * DMA-synced-for-device according to the length provided by the device driver.
+ * Please note DMA-sync-for-CPU is still device driver responsibility
+ */
+#define PP_FLAG_DMA_SYNC_DEV	BIT(1)
+
+/* for page frag feature */
+#define PP_FLAG_PAGE_FRAG	BIT(2)
+
+/* If set driver will do the page splitting itself. This is used to fail the
+ * page_pool creation because there is overlap issue between pp_frag_count and
+ * dma_addr_upper in 'struct page' for some arches with
+ * PAGE_POOL_DMA_USE_PP_FRAG_COUNT being true.
+ */
+#define PP_FLAG_PAGE_SPLIT_IN_DRIVER	BIT(3)
+
 #define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
 				 PP_FLAG_DMA_SYNC_DEV |\
-				 PP_FLAG_PAGE_FRAG)
+				 PP_FLAG_PAGE_FRAG |\
+				 PP_FLAG_PAGE_SPLIT_IN_DRIVER)

 /*
  * Fast allocation side cache array/stack
@@ -45,7 +53,8 @@ struct pp_alloc_cache {

 /**
  * struct page_pool_params - page pool parameters
- * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG
+ * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG,
+ *		PP_FLAG_PAGE_SPLIT_IN_DRIVER
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
@@ -183,8 +192,9 @@ struct page_pool {
 };

 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
-struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
-				  unsigned int size, gfp_t gfp);
+struct page *__page_pool_alloc_frag(struct page_pool *pool,
+				    unsigned int *offset, unsigned int size,
+				    gfp_t gfp);
 struct page_pool *page_pool_create(const struct page_pool_params *params);

 struct xdp_mem_info;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..7d5f0512aa13 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -14,7 +14,6 @@
 #include <net/xdp.h>

 #include <linux/dma-direction.h>
-#include <linux/dma-mapping.h>
 #include
 #include <linux/mm.h> /* for put_page() */
 #include
@@ -212,7 +211,7 @@ static int page_pool_init(struct page_pool *pool,
 	}

 	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
-	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+	    pool->p.flags & PP_FLAG_PAGE_SPLIT_IN_DRIVER)
 		return -EINVAL;

 #ifdef CONFIG_PAGE_POOL_STATS
@@ -737,18 +736,16 @@ static void page_pool_free_frag(struct page_pool *pool)
 	page_pool_return_page(pool, page);
 }

-struct page *page_pool_alloc_frag(struct page_pool *pool,
-				  unsigned int *offset,
-				  unsigned int size, gfp_t gfp)
+struct page *__page_pool_alloc_frag(struct page_pool *pool,
+				    unsigned int *offset,
+				    unsigned int size, gfp_t gfp)
 {
 	unsigned int max_size = PAGE_SIZE << pool->p.order;
 	struct page *page = pool->frag_page;

-	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-		    size > max_size))
+	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG)))
 		return NULL;

-	size = ALIGN(size, dma_get_cache_alignment());
 	*offset = pool->frag_offset;

 	if (page && *offset + size > max_size) {
@@ -781,7 +778,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
 	alloc_stat_inc(pool, fast);
 	return page;
 }
-EXPORT_SYMBOL(page_pool_alloc_frag);
+EXPORT_SYMBOL(__page_pool_alloc_frag);

 static void page_pool_empty_ring(struct page_pool *pool)
 {
-- 
2.33.0

From: Yunsheng Lin
Subject: [PATCH net-next v7 2/6] page_pool: unify frag_count handling in page_pool_is_last_frag()
Date: Wed, 16 Aug 2023 18:01:09 +0800
Message-ID: <20230816100113.41034-3-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>

Currently, when page_pool_create() is called with the PP_FLAG_PAGE_FRAG
flag, page_pool_alloc_pages() may only be called under the following
constraints:
1. page_pool_fragment_page() needs to be called to set up
   page->pp_frag_count immediately.
2. page_pool_defrag_page() often needs to be called to drain
   page->pp_frag_count when no user is holding on to that page any
   longer.

Those constraints exist in order to support splitting a page into
multiple frags, and they carry some overhead in the form of cache line
dirtying/bouncing and atomic updates. The constraints are unavoidable
when a page really needs to be split into more than one frag, but there
are also cases where we would like to avoid them and their overhead,
namely when a page can't be split because it only holds one big frag as
requested by the user. The relevant use cases are:

use case 1: allocate a page without page splitting.
use case 2: allocate a page with page splitting.
use case 3: allocate a page with or without page splitting, depending
            on the frag size.

Currently page pool only provides the page_pool_alloc_pages() and
page_pool_alloc_frag() APIs, which enable use cases 1 and 2 separately;
combining them to enable use case 3 is not possible because of the
per-page_pool flag PP_FLAG_PAGE_FRAG. So, in order to allow allocating
an unsplit page without the overhead of a split page while still
allowing split pages, we need to remove the per-page_pool flag check in
page_pool_is_last_frag(). As best as I can tell, there are two ways to
do that:

1. Add a per-page flag/bit to indicate whether a page is split, which
   means we might need to update that flag/bit every time the page is
   recycled, dirtying the cache line of 'struct page' for use case 1.

2. Unify the page->pp_frag_count handling for both split and unsplit
   pages by assuming that every page in the page pool starts out split
   into one big frag.

As page pool already supports use case 1 without dirtying the cache
line of 'struct page' whenever a page is recycled, as we need to
support use case 3 with minimal overhead (in particular without adding
any noticeable overhead for use case 1), and as we already optimize
page_pool_defrag_page() by not updating pp_frag_count for the last frag
user, this patch chooses the second method and unifies the
pp_frag_count handling; the driver-side pattern this results in is
sketched below.
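For illustration, constraints 1 and 2 above translate into roughly the following pattern when a driver splits a pool page itself (a minimal sketch, not part of the patch; error handling omitted):

	struct page *page;
	long nr = 4;	/* illustrative: split the page into 4 frags */

	/* Constraint 1: take all frag references right after allocating. */
	page = page_pool_alloc_pages(pool, GFP_ATOMIC);
	page_pool_fragment_page(page, nr);

	/* ... hand out the nr sub-buffers of the page ... */

	/* Constraint 2: each consumer drops one reference; whoever sees the
	 * count hit zero is the last user and recycles the page.
	 */
	if (page_pool_defrag_page(page, 1) == 0)
		page_pool_put_defragged_page(pool, page, -1, false);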
Signed-off-by: Yunsheng Lin
CC: Lorenzo Bianconi
CC: Alexander Duyck
CC: Liang Chen
CC: Alexander Lobakin
---
 include/net/page_pool/helpers.h | 54 +++++++++++++++++++++++----------
 net/core/page_pool.c            | 10 +++++-
 2 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index cb18de55f239..19e8ba056868 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -134,7 +134,8 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
  */
 static inline void page_pool_fragment_page(struct page *page, long nr)
 {
-	atomic_long_set(&page->pp_frag_count, nr);
+	if (!PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
+		atomic_long_set(&page->pp_frag_count, nr);
 }

 static inline long page_pool_defrag_page(struct page *page, long nr)
@@ -142,33 +143,54 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
 	long ret;

 	/* If nr == pp_frag_count then we have cleared all remaining
-	 * references to the page. No need to actually overwrite it, instead
-	 * we can leave this to be overwritten by the calling function.
+	 * references to the page:
+	 * 1. 'n == 1': no need to actually overwrite it.
+	 * 2. 'n != 1': overwrite it with one, which is the rare case
+	 *    for frag draining.
 	 *
-	 * The main advantage to doing this is that an atomic_read is
-	 * generally a much cheaper operation than an atomic update,
-	 * especially when dealing with a page that may be partitioned
-	 * into only 2 or 3 pieces.
+	 * The main advantage to doing this is that not only we avoid a
+	 * atomic update, as an atomic_read is generally a much cheaper
+	 * operation than an atomic update, especially when dealing with
+	 * a page that may be partitioned into only 2 or 3 pieces; but
+	 * also unify the frag and non-frag handling by ensuring all
+	 * pages have been split into one big frag initially, and only
+	 * overwrite it when the page is split into more than one frag.
 	 */
-	if (atomic_long_read(&page->pp_frag_count) == nr)
+	if (atomic_long_read(&page->pp_frag_count) == nr) {
+		/* As we have ensured nr is always one for constant case
+		 * using the BUILD_BUG_ON(), only need to handle the
+		 * non-constant case here for frag count draining, which
+		 * is a rare case.
+		 */
+		BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
+		if (!__builtin_constant_p(nr))
+			atomic_long_set(&page->pp_frag_count, 1);
+
 		return 0;
+	}

 	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
 	WARN_ON(ret < 0);
+
+	/* We are the last user here too, reset frag count back to 1 to
+	 * ensure all pages have been split into one big frag initially,
+	 * this should be the rare case when the last two frag users call
+	 * page_pool_defrag_page() currently.
+	 */
+	if (unlikely(!ret))
+		atomic_long_set(&page->pp_frag_count, 1);
+
 	return ret;
 }

-static inline bool page_pool_is_last_frag(struct page_pool *pool,
-					  struct page *page)
+static inline bool page_pool_is_last_frag(struct page *page)
 {
 	/* We assume we are the last frag user that is still holding
 	 * on to the page if:
-	 * 1. Fragments aren't enabled.
-	 * 2. We are running in 32-bit arch with 64-bit DMA.
-	 * 3. page_pool_defrag_page() indicate we are the last user.
+	 * 1. We are running in 32-bit arch with 64-bit DMA.
+	 * 2. page_pool_defrag_page() indicate we are the last user.
 	 */
-	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-	       PAGE_POOL_DMA_USE_PP_FRAG_COUNT ||
+	return PAGE_POOL_DMA_USE_PP_FRAG_COUNT ||
 	       (page_pool_defrag_page(page, 1) == 0);
 }

@@ -194,7 +216,7 @@ static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	if (!page_pool_is_last_frag(pool, page))
+	if (!page_pool_is_last_frag(page))
 		return;

 	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 7d5f0512aa13..386e6d791e90 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -371,6 +371,14 @@ static void page_pool_set_pp_info(struct page_pool *pool,
 {
 	page->pp = pool;
 	page->pp_magic |= PP_SIGNATURE;
+
+	/* Ensuring all pages have been split into one big frag initially:
+	 * page_pool_set_pp_info() is only called once for every page when it
+	 * is allocated from the page allocator and page_pool_fragment_page()
+	 * is dirtying the same cache line as the page->pp_magic above, so
+	 * the overhead is negligible.
+	 */
+	page_pool_fragment_page(page, 1);
 	if (pool->p.init_callback)
 		pool->p.init_callback(page, pool->p.init_arg);
 }
@@ -667,7 +675,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		struct page *page = virt_to_head_page(data[i]);

 		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_frag(pool, page))
+		if (!page_pool_is_last_frag(page))
 			continue;

 		page = __page_pool_put_page(pool, page, -1, false);
-- 
2.33.0

From: Yunsheng Lin
Subject: [PATCH net-next v7 3/6] page_pool: remove PP_FLAG_PAGE_FRAG
Date: Wed, 16 Aug 2023 18:01:10 +0800
Message-ID: <20230816100113.41034-4-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>

PP_FLAG_PAGE_FRAG is no longer needed now that the pp_frag_count
handling is unified and page_pool_alloc_frag() is supported on 32-bit
arches with 64-bit DMA, so remove it.
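After this change, a pool used for frag allocation is created like any other pool; the setup below is a minimal sketch with illustrative field values, not taken from the patch:

	struct page_pool_params pp_params = {
		/* No frag-related flag is needed any more: the frag API
		 * works with any pool.  PP_FLAG_PAGE_SPLIT_IN_DRIVER is only
		 * for pools whose pages the driver splits by hand.
		 */
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,
		.pool_size	= 256,			/* illustrative */
		.nid		= NUMA_NO_NODE,
		.dev		= &pdev->dev,		/* illustrative */
	};
	struct page_pool *pool = page_pool_create(&pp_params);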
Signed-off-by: Yunsheng Lin
CC: Lorenzo Bianconi
CC: Alexander Duyck
CC: Liang Chen
CC: Alexander Lobakin
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c                | 2 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c          | 3 +--
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 2 +-
 drivers/net/wireless/mediatek/mt76/mac80211.c            | 2 +-
 include/net/page_pool/types.h                            | 8 ++------
 net/core/page_pool.c                                     | 3 ---
 net/core/skbuff.c                                        | 2 +-
 7 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7be917a8da48..60b699be0d9b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3249,8 +3249,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 	pp.napi = &rxr->bnapi->napi;
 	pp.dev = &bp->pdev->dev;
 	pp.dma_dir = DMA_BIDIRECTIONAL;
-	if (PAGE_SIZE > BNXT_RX_PAGE_SIZE)
-		pp.flags |= PP_FLAG_PAGE_FRAG;

 	rxr->page_pool = page_pool_create(&pp);
 	if (IS_ERR(rxr->page_pool)) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index eac2d0573241..ff0c219365f1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4926,8 +4926,7 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv)
 static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
 {
 	struct page_pool_params pp_params = {
-		.flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG |
-				PP_FLAG_DMA_SYNC_DEV,
+		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 		.order = hns3_page_order(ring),
 		.pool_size = ring->desc_num * hns3_buf_size(ring) /
 				(PAGE_SIZE << hns3_page_order(ring)),
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index dce3cea00032..edc6acebf369 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -1433,7 +1433,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
 		return 0;
 	}

-	pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP;
+	pp_params.flags = PP_FLAG_DMA_MAP;
 	pp_params.pool_size = numptrs;
 	pp_params.nid = NUMA_NO_NODE;
 	pp_params.dev = pfvf->dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index d158320bc15d..fe7cc67b7ee2 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -566,7 +566,7 @@ int mt76_create_page_pool(struct mt76_dev *dev, struct mt76_queue *q)
 {
 	struct page_pool_params pp_params = {
 		.order = 0,
-		.flags = PP_FLAG_PAGE_FRAG,
+		.flags = 0,
 		.nid = NUMA_NO_NODE,
 		.dev = dev->dma_dev,
 	};
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 079337c42aa6..4775cf95edb7 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -15,19 +15,15 @@
  */
 #define PP_FLAG_DMA_SYNC_DEV	BIT(1)

-/* for page frag feature */
-#define PP_FLAG_PAGE_FRAG	BIT(2)
-
 /* If set driver will do the page splitting itself. This is used to fail the
  * page_pool creation because there is overlap issue between pp_frag_count and
  * dma_addr_upper in 'struct page' for some arches with
  * PAGE_POOL_DMA_USE_PP_FRAG_COUNT being true.
  */
-#define PP_FLAG_PAGE_SPLIT_IN_DRIVER	BIT(3)
+#define PP_FLAG_PAGE_SPLIT_IN_DRIVER	BIT(2)

 #define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
 				 PP_FLAG_DMA_SYNC_DEV |\
-				 PP_FLAG_PAGE_FRAG |\
 				 PP_FLAG_PAGE_SPLIT_IN_DRIVER)

 /*
@@ -53,7 +49,7 @@ struct pp_alloc_cache {

 /**
  * struct page_pool_params - page pool parameters
- * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG,
+ * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV,
  *		PP_FLAG_PAGE_SPLIT_IN_DRIVER
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 386e6d791e90..d3b8efe98d5e 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -751,9 +751,6 @@ struct page *__page_pool_alloc_frag(struct page_pool *pool,
 	unsigned int max_size = PAGE_SIZE << pool->p.order;
 	struct page *page = pool->frag_page;

-	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG)))
-		return NULL;
-
 	*offset = pool->frag_offset;

 	if (page && *offset + size > max_size) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 33fdf04d4334..4b90b6ed10b2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5709,7 +5709,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	/* In general, avoid mixing page_pool and non-page_pool allocated
 	 * pages within the same SKB. Additionally avoid dealing with clones
 	 * with page_pool pages, in case the SKB is using page_pool fragment
-	 * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+	 * references (page_pool_alloc_frag()). Since we only take full page
 	 * references for cloned SKBs at the moment that would result in
 	 * inconsistent reference counts.
 	 * In theory we could take full references if @from is cloned and
-- 
2.33.0

From: Yunsheng Lin
Subject: [PATCH net-next v7 4/6] page_pool: introduce page_pool[_cache]_alloc() API
Date: Wed, 16 Aug 2023 18:01:11 +0800
Message-ID: <20230816100113.41034-5-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>
Currently page pool supports the following use cases:

use case 1: allocate a page without page splitting, using the
            page_pool_alloc_pages() API, when the driver knows that the
            memory it needs is always larger than half of the page
            allocated from the pool.

use case 2: allocate a page frag with page splitting, using the
            page_pool_alloc_frag() API, when the driver knows that the
            memory it needs is always smaller than or equal to half of
            the page allocated from the pool.

There are emerging use cases [1] & [2] that mix the two: the driver
does not know the size of the memory it needs beforehand, so it may use
something like the below to allocate memory with the least memory
utilization and performance penalty:

	if (size << 1 > max_size)
		page = page_pool_alloc_pages();
	else
		page = page_pool_alloc_frag();

To save drivers from open-coding the above, add the
page_pool[_cache]_alloc() API to support this use case, and report the
size of the memory actually allocated back to the driver through
'*size', in order to avoid exacerbating the truesize underestimate
problem.

Also rename page_pool_free(), which is used in the destroy process, to
__page_pool_destroy() to avoid confusion with the newly added API.

1. https://lore.kernel.org/all/d3ae6bd3537fbce379382ac6a42f67e22f27ece2.1683896626.git.lorenzo@kernel.org/
2. https://lore.kernel.org/all/20230526054621.18371-3-liangchen.linux@gmail.com/
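For orientation, a driver rx path might consume the new API roughly as follows (a minimal sketch; rx_alloc_buf() and the surrounding names are illustrative, not from the patch):

	/* Sketch: get an rx buffer of at most 'len' bytes.  The pool decides
	 * whether to hand back a frag or a whole page; '*size' comes back as
	 * the size actually reserved and '*offset' as the buffer's offset
	 * within the page.
	 */
	static void *rx_alloc_buf(struct page_pool *pool, unsigned int len,
				  struct page **pagep, unsigned int *offset,
				  unsigned int *size)
	{
		*size = len;
		*pagep = page_pool_dev_alloc(pool, offset, size);
		if (!*pagep)
			return NULL;

		return page_address(*pagep) + *offset;
	}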
Signed-off-by: Yunsheng Lin
CC: Lorenzo Bianconi
CC: Alexander Duyck
CC: Liang Chen
CC: Alexander Lobakin
---
 include/net/page_pool/helpers.h | 70 +++++++++++++++++++++++++++++++++
 net/core/page_pool.c            |  4 +-
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 19e8ba056868..b920224f6584 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -109,6 +109,70 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
 	return page_pool_alloc_frag(pool, offset, size, gfp);
 }

+static inline struct page *page_pool_alloc(struct page_pool *pool,
+					   unsigned int *offset,
+					   unsigned int *size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+	struct page *page;
+
+	*size = ALIGN(*size, dma_get_cache_alignment());
+
+	if (WARN_ON(*size > max_size))
+		return NULL;
+
+	if ((*size << 1) > max_size || PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
+		*size = max_size;
+		*offset = 0;
+		return page_pool_alloc_pages(pool, gfp);
+	}
+
+	page = __page_pool_alloc_frag(pool, offset, *size, gfp);
+	if (unlikely(!page))
+		return NULL;
+
+	/* There is very likely not enough space for another frag, so append the
+	 * remaining size to the current frag to avoid truesize underestimate
+	 * problem.
+	 */
+	if (pool->frag_offset + *size > max_size) {
+		*size = max_size - *offset;
+		pool->frag_offset = max_size;
+	}
+
+	return page;
+}
+
+static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
+					       unsigned int *offset,
+					       unsigned int *size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc(pool, offset, size, gfp);
+}
+
+static inline void *page_pool_cache_alloc(struct page_pool *pool,
+					  unsigned int *size, gfp_t gfp)
+{
+	unsigned int offset;
+	struct page *page;
+
+	page = page_pool_alloc(pool, &offset, size, gfp);
+	if (unlikely(!page))
+		return NULL;
+
+	return page_address(page) + offset;
+}
+
+static inline void *page_pool_dev_cache_alloc(struct page_pool *pool,
+					      unsigned int *size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_cache_alloc(pool, size, gfp);
+}
+
 /**
  * page_pool_get_dma_dir() - Retrieve the stored DMA direction.
  * @pool:	pool from which page was allocated
@@ -252,6 +316,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }

+static inline void page_pool_cache_free(struct page_pool *pool, void *data,
+					bool allow_direct)
+{
+	page_pool_put_page(pool, virt_to_head_page(data), -1, allow_direct);
+}
+
 /**
  * page_pool_get_dma_addr() - Retrieve the stored DMA address.
  * @page:	page allocated from a page pool
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index d3b8efe98d5e..6f970fc7a0a7 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -800,7 +800,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
 	}
 }

-static void page_pool_free(struct page_pool *pool)
+static void __page_pool_destroy(struct page_pool *pool)
 {
 	if (pool->disconnect)
 		pool->disconnect(pool);
@@ -851,7 +851,7 @@ static int page_pool_release(struct page_pool *pool)
 	page_pool_scrub(pool);
 	inflight = page_pool_inflight(pool);
 	if (!inflight)
-		page_pool_free(pool);
+		__page_pool_destroy(pool);

 	return inflight;
 }
-- 
2.33.0

From: Yunsheng Lin
Subject: [PATCH net-next v7 5/6] page_pool: update document about frag API
Date: Wed, 16 Aug 2023 18:01:12 +0800
Message-ID: <20230816100113.41034-6-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>

As more drivers begin to use the frag API, update the documentation so
that driver authors can decide which allocation API fits their driver.
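The decision rule spelled out in the updated kernel-doc below boils down to the following; a short sketch using the APIs from the earlier patches of this series (variable names are illustrative):

	struct page *page;
	unsigned int offset, size = len;	/* len: runtime-known size */

	/* 1. Allocation always bigger than half a page: no splitting. */
	page = page_pool_dev_alloc_pages(pool);

	/* 2. Allocation always fits in half a page: page splitting. */
	page = page_pool_dev_alloc_frag(pool, &offset, size);

	/* 3. Size only known at runtime: let the pool decide; 'size' is
	 *    updated to the size actually reserved.
	 */
	page = page_pool_dev_alloc(pool, &offset, &size);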
Page splitting enables memory saving and= thus + * avoids TLB/cache miss for data access, but there also is some cost to + * implement page splitting, mainly some cache line dirtying/bouncing for + * 'struct page' and atomic operation for page->pp_frag_count. * * API keeps track of in-flight pages, in order to let API user know * when it is safe to free a page_pool object. Thus, API users @@ -100,6 +115,17 @@ static inline struct page *page_pool_alloc_frag(struct= page_pool *pool, return __page_pool_alloc_frag(pool, offset, size, gfp); } =20 +/** + * page_pool_dev_alloc_frag() - allocate a page frag. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: requested size + * + * Get a page frag from the page allocator or page_pool caches. + * + * Return: + * Returns allocated page frag, otherwise return NULL. + */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) @@ -143,6 +169,17 @@ static inline struct page *page_pool_alloc(struct page= _pool *pool, return page; } =20 +/** + * page_pool_dev_alloc() - allocate a page or a page frag. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: in as the requested size, out as the allocated size + * + * Get a page or a page frag from the page allocator or page_pool caches. + * + * Return: + * Returns a page or a page frag, otherwise return NULL. + */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) @@ -165,6 +202,16 @@ static inline void *page_pool_cache_alloc(struct page_= pool *pool, return page_address(page) + offset; } =20 +/** + * page_pool_dev_cache_alloc() - allocate a cache. + * @pool: pool from which to allocate + * @size: in as the requested size, out as the allocated size + * + * Get a cache from the page allocator or page_pool caches. + * + * Return: + * Returns the addr for the allocated cache, otherwise return NULL. + */ static inline void *page_pool_dev_cache_alloc(struct page_pool *pool, unsigned int *size) { @@ -316,6 +363,14 @@ static inline void page_pool_recycle_direct(struct pag= e_pool *pool, page_pool_put_full_page(pool, page, true); } =20 +/** + * page_pool_cache_free() - free a cache into the page_pool + * @pool: pool from which cache was allocated + * @data: addr of cache to be free + * @allow_direct: freed by the consumer, allow lockless caching + * + * Free a cache allocated from page_pool_dev_cache_alloc(). 
+ */
 static inline void page_pool_cache_free(struct page_pool *pool, void *data,
 					bool allow_direct)
 {
-- 
2.33.0

From: Yunsheng Lin
Subject: [PATCH net-next v7 6/6] net: veth: use newly added page pool API for veth with xdp
Date: Wed, 16 Aug 2023 18:01:13 +0800
Message-ID: <20230816100113.41034-7-linyunsheng@huawei.com>
In-Reply-To: <20230816100113.41034-1-linyunsheng@huawei.com>

Use the newly added page_pool[_cache]_alloc() API to allocate memory
with the least memory utilization and performance penalty.

Signed-off-by: Yunsheng Lin
CC: Lorenzo Bianconi
CC: Alexander Duyck
CC: Liang Chen
CC: Alexander Lobakin
---
 drivers/net/veth.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 953f6d8f8db0..f9bb79d89a84 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -736,10 +736,11 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
 	    skb_shinfo(skb)->nr_frags ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		u32 size, len, max_head_size, off;
+		u32 size, len, max_head_size, off, truesize, page_offset;
 		struct sk_buff *nskb;
 		struct page *page;
 		int i, head_off;
+		void *data;

 		/* We need a private copy of the skb and data buffers since
 		 * the ebpf program can modify it. We segment the original skb
@@ -752,14 +753,17 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
 			goto drop;

+		size = min_t(u32, skb->len, max_head_size);
+		truesize = SKB_HEAD_ALIGN(size) + VETH_XDP_HEADROOM;
+
 		/* Allocate skb head */
-		page = page_pool_dev_alloc_pages(rq->page_pool);
-		if (!page)
+		data = page_pool_dev_cache_alloc(rq->page_pool, &truesize);
+		if (!data)
 			goto drop;

-		nskb = napi_build_skb(page_address(page), PAGE_SIZE);
+		nskb = napi_build_skb(data, truesize);
 		if (!nskb) {
-			page_pool_put_full_page(rq->page_pool, page, true);
+			page_pool_cache_free(rq->page_pool, data, true);
 			goto drop;
 		}

@@ -767,7 +771,6 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 		skb_copy_header(nskb, skb);
 		skb_mark_for_recycle(nskb);

-		size = min_t(u32, skb->len, max_head_size);
 		if (skb_copy_bits(skb, 0, nskb->data, size)) {
 			consume_skb(nskb);
 			goto drop;
@@ -782,14 +785,18 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 		len = skb->len - off;

 		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
-			page = page_pool_dev_alloc_pages(rq->page_pool);
+			size = min_t(u32, len, PAGE_SIZE);
+			truesize = size;
+
+			page = page_pool_dev_alloc(rq->page_pool, &page_offset,
+						   &truesize);
 			if (!page) {
 				consume_skb(nskb);
 				goto drop;
 			}

-			size = min_t(u32, len, PAGE_SIZE);
-			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
+			skb_add_rx_frag(nskb, i, page, page_offset, size,
+					truesize);
 			if (skb_copy_bits(skb, off, page_address(page),
 					  size)) {
 				consume_skb(nskb);
-- 
2.33.0