From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 9F0E0C4167B for ; Fri, 1 Dec 2023 12:02:23 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378680AbjLAMCP (ORCPT ); Fri, 1 Dec 2023 07:02:15 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42210 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378625AbjLAMCM (ORCPT ); Fri, 1 Dec 2023 07:02:12 -0500 Received: from szxga01-in.huawei.com (szxga01-in.huawei.com [45.249.212.187]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EDA64D48; Fri, 1 Dec 2023 04:02:17 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga01-in.huawei.com (SkyGuard) with ESMTP id 4ShWmV5SyqzsRSL; Fri, 1 Dec 2023 19:58:34 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:16 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , Andrew Morton , Eric Dumazet , Subject: [PATCH RFC 1/6] mm/page_alloc: modify page_frag_alloc_align() to accept align as an argument Date: Fri, 1 Dec 2023 20:02:02 +0800 Message-ID: <20231201120208.15080-2-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" napi_alloc_frag_align() and netdev_alloc_frag_align() accept align as an argument, and they are thin wrappers around the __napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs, doing the align to align_mask conversion in order to call page_frag_alloc_align() directly. As __napi_alloc_frag_align() and __netdev_alloc_frag_align() are only used by the above thin wrappers, it makes more sense to remove the align to align_mask conversion and call page_frag_alloc_align() directly. Doing so also avoids the confusion between napi_alloc_frag_align() accepting align as an argument and page_frag_alloc_align() accepting align_mask as an argument while both have the 'align' suffix.
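
For illustration only (a minimal userspace sketch, not part of the patch): the change relies on the fact that, for a power-of-two align, the mask the old wrappers passed down was simply -align, so masking the offset with -align inside page_frag_alloc_align() yields the same rounded-down offset the old align_mask parameter produced, and align == 1 (the new default used by page_frag_alloc()) behaves like the old ~0u mask.

    /* standalone sketch, not kernel code */
    #include <assert.h>

    static unsigned int round_down_to(unsigned int offset, unsigned int align)
    {
            /* for power-of-two align, -align == ~(align - 1) */
            return offset & -align;
    }

    int main(void)
    {
            assert(round_down_to(100, 64) == 64);   /* same result as offset & ~0x3f */
            assert(round_down_to(100, 1) == 100);   /* align == 1 keeps the offset as-is */
            return 0;
    }
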
Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- include/linux/gfp.h | 4 ++-- include/linux/skbuff.h | 22 ++++------------------ mm/page_alloc.c | 6 ++++-- net/core/skbuff.c | 14 +++++++------- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..bbd75976541e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -314,12 +314,12 @@ struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask); + unsigned int align); =20 static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); + return page_frag_alloc_align(nc, fragsz, gfp_mask, 1); } =20 extern void page_frag_free(void *addr); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..c27ed5ab6557 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3182,7 +3182,7 @@ static inline void skb_queue_purge(struct sk_buff_hea= d *list) unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); =20 -void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_ma= sk); +void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align); =20 /** * netdev_alloc_frag - allocate a page fragment @@ -3193,14 +3193,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz,= unsigned int align_mask); */ static inline void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag_align(fragsz, ~0u); -} - -static inline void *netdev_alloc_frag_align(unsigned int fragsz, - unsigned int align) -{ - WARN_ON_ONCE(!is_power_of_2(align)); - return __netdev_alloc_frag_align(fragsz, -align); + return netdev_alloc_frag_align(fragsz, 1); } =20 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int le= ngth, @@ -3260,18 +3253,11 @@ static inline void skb_free_frag(void *addr) page_frag_free(addr); } =20 -void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask= ); +void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align); =20 static inline void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag_align(fragsz, ~0u); -} - -static inline void *napi_alloc_frag_align(unsigned int fragsz, - unsigned int align) -{ - WARN_ON_ONCE(!is_power_of_2(align)); - return __napi_alloc_frag_align(fragsz, -align); + return napi_alloc_frag_align(fragsz, 1); } =20 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ca4f4b62bf..9a16305cf985 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4718,12 +4718,14 @@ EXPORT_SYMBOL(__page_frag_cache_drain); =20 void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) + unsigned int align) { unsigned int size =3D PAGE_SIZE; struct page *page; int offset; =20 + WARN_ON_ONCE(!is_power_of_2(align)); + if (unlikely(!nc->va)) { refill: page =3D __page_frag_cache_refill(nc, gfp_mask); @@ -4782,7 +4784,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n= c, } =20 nc->pagecnt_bias--; - offset &=3D align_mask; + offset &=3D -align; nc->offset =3D offset; =20 return nc->va + offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b157efea5dea..b98d1da4004a 100644 --- a/net/core/skbuff.c +++ 
b/net/core/skbuff.c @@ -291,17 +291,17 @@ void napi_get_frags_check(struct napi_struct *napi) local_bh_enable(); } =20 -void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { struct napi_alloc_cache *nc =3D this_cpu_ptr(&napi_alloc_cache); =20 fragsz =3D SKB_DATA_ALIGN(fragsz); =20 - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align); } -EXPORT_SYMBOL(__napi_alloc_frag_align); +EXPORT_SYMBOL(napi_alloc_frag_align); =20 -void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_ma= sk) +void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { void *data; =20 @@ -309,18 +309,18 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, = unsigned int align_mask) if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc =3D this_cpu_ptr(&netdev_alloc_cache); =20 - data =3D page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + data =3D page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align); } else { struct napi_alloc_cache *nc; =20 local_bh_disable(); nc =3D this_cpu_ptr(&napi_alloc_cache); - data =3D page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask= ); + data =3D page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align); local_bh_enable(); } return data; } -EXPORT_SYMBOL(__netdev_alloc_frag_align); +EXPORT_SYMBOL(netdev_alloc_frag_align); =20 static struct sk_buff *napi_skb_cache_get(void) { --=20 2.33.0 From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 03732C4167B for ; Fri, 1 Dec 2023 12:02:25 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378690AbjLAMCR (ORCPT ); Fri, 1 Dec 2023 07:02:17 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54536 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378677AbjLAMCO (ORCPT ); Fri, 1 Dec 2023 07:02:14 -0500 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7A03A1B4; Fri, 1 Dec 2023 04:02:20 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWlq02qgzShJS; Fri, 1 Dec 2023 19:57:59 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:18 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , "Michael S. 
Tsirkin" , Jason Wang , Andrew Morton , Eric Dumazet , , , Subject: [PATCH RFC 2/6] page_frag: unify gfp bit for order 3 page allocation Date: Fri, 1 Dec 2023 20:02:03 +0800 Message-ID: <20231201120208.15080-3-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Currently there seems to be three page frag implementions which all try to allocate order 3 page, if that fails, it then fail back to allocate order 0 page, and each of them all allow order 3 page allocation to fail under certain condition by using specific gfp bits. The gfp bits for order 3 page allocation are different between different implementation, __GFP_NOMEMALLOC is or'd to forbid access to emergency reserves memory for __page_frag_cache_refill(), but it is not or'd in other implementions, __GFP_DIRECT_RECLAIM is xor'd to avoid direct reclaim in skb_page_frag_refill(), but it is not xor'd in __page_frag_cache_refill(). This patch unifies the gfp bits used between different implementions by or'ing __GFP_NOMEMALLOC and xor'ing __GFP_DIRECT_RECLAIM for order 3 page allocation to avoid possible pressure for mm. Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- drivers/vhost/net.c | 2 +- mm/page_alloc.c | 4 ++-- net/core/sock.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f2ed7167c848..e574e21cc0ca 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net= *net, unsigned int sz, /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NORETRY | __GFP_NOMEMALLOC, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9a16305cf985..1f0b36dd81b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct p= age_frag_cache *nc, gfp_t gfp =3D gfp_mask; =20 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - gfp_mask |=3D __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; + gfp_mask =3D (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page =3D alloc_pages_node(NUMA_NO_NODE, gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER); nc->size =3D page ? 
PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; diff --git a/net/core/sock.c b/net/core/sock.c index fef349dd72fa..4efa9cae4b0d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct pag= e_frag *pfrag, gfp_t gfp) /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NORETRY | __GFP_NOMEMALLOC, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER; --=20 2.33.0 From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CBE5EC4167B for ; Fri, 1 Dec 2023 12:02:32 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378700AbjLAMCX (ORCPT ); Fri, 1 Dec 2023 07:02:23 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54574 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378688AbjLAMCQ (ORCPT ); Fri, 1 Dec 2023 07:02:16 -0500 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2529A170E; Fri, 1 Dec 2023 04:02:22 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.56]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWqx2WS9zWhx4; Fri, 1 Dec 2023 20:01:33 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:20 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , Andrew Morton , Subject: [PATCH RFC 3/6] mm/page_alloc: use initial zero offset for page_frag_alloc_align() Date: Fri, 1 Dec 2023 20:02:04 +0800 Message-ID: <20231201120208.15080-4-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The next patch is about to use page_frag_alloc_align() to replace vhost_net_page_frag_refill(); the main difference between those two page frag implementations is whether an initial zero offset is used or not. It seems more natural to use an initial zero offset, as it may enable more correct cache prefetching and skb frag coalescing in networking, so change it to use an initial zero offset.
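
To make the behavioural change concrete, here is a rough userspace model (a simplified sketch under stated assumptions, not the kernel code: no refcounting, fixed 4K cache): instead of starting at the end of the page and masking the offset downwards, the cache now starts at offset zero, aligns each allocation up, and refills once offset + fragsz would cross the end of the cache.

    /* simplified model of the new forward-growing offset logic */
    #include <stdio.h>

    #define CACHE_SIZE 4096u
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    struct frag_cache_model {
            unsigned int offset;            /* next free byte, starts at 0 */
    };

    static int frag_alloc_model(struct frag_cache_model *nc, unsigned int fragsz,
                                unsigned int align, unsigned int *out)
    {
            unsigned int offset = ALIGN_UP(nc->offset, align);

            if (offset + fragsz > CACHE_SIZE)
                    return -1;              /* the real code refills the cache here */

            *out = offset;
            nc->offset = offset + fragsz;
            return 0;
    }

    int main(void)
    {
            struct frag_cache_model nc = { 0 };
            unsigned int off;

            frag_alloc_model(&nc, 100, 64, &off);   /* off == 0 */
            frag_alloc_model(&nc, 100, 64, &off);   /* off == 128 */
            printf("second frag at offset %u\n", off);
            return 0;
    }
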
Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- mm/page_alloc.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f0b36dd81b5..083e0c38fb62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4720,7 +4720,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n= c, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) { - unsigned int size =3D PAGE_SIZE; + unsigned int size; struct page *page; int offset; =20 @@ -4732,10 +4732,6 @@ void *page_frag_alloc_align(struct page_frag_cache *= nc, if (!page) return NULL; =20 -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size =3D nc->size; -#endif /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ @@ -4744,11 +4740,18 @@ void *page_frag_alloc_align(struct page_frag_cache = *nc, /* reset page count bias and offset to start of new frag */ nc->pfmemalloc =3D page_is_pfmemalloc(page); nc->pagecnt_bias =3D PAGE_FRAG_CACHE_MAX_SIZE + 1; - nc->offset =3D size; + nc->offset =3D 0; } =20 - offset =3D nc->offset - fragsz; - if (unlikely(offset < 0)) { +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size =3D nc->size; +#else + size =3D PAGE_SIZE; +#endif + + offset =3D ALIGN(nc->offset, align); + if (unlikely(offset + fragsz > size)) { page =3D virt_to_page(nc->va); =20 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) @@ -4759,17 +4762,13 @@ void *page_frag_alloc_align(struct page_frag_cache = *nc, goto refill; } =20 -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size =3D nc->size; -#endif /* OK, page count is 0, we can safely set it */ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); =20 /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias =3D PAGE_FRAG_CACHE_MAX_SIZE + 1; - offset =3D size - fragsz; - if (unlikely(offset < 0)) { + offset =3D 0; + if (unlikely(fragsz > size)) { /* * The caller is trying to allocate a fragment * with fragsz > PAGE_SIZE but the cache isn't big @@ -4784,8 +4783,7 @@ void *page_frag_alloc_align(struct page_frag_cache *n= c, } =20 nc->pagecnt_bias--; - offset &=3D -align; - nc->offset =3D offset; + nc->offset =3D offset + fragsz; =20 return nc->va + offset; } --=20 2.33.0 From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A7AF1C4167B for ; Fri, 1 Dec 2023 12:02:36 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378713AbjLAMC1 (ORCPT ); Fri, 1 Dec 2023 07:02:27 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:49916 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378685AbjLAMCW (ORCPT ); Fri, 1 Dec 2023 07:02:22 -0500 Received: from szxga03-in.huawei.com (szxga03-in.huawei.com [45.249.212.189]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 876AA171B; Fri, 1 Dec 2023 04:02:24 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga03-in.huawei.com (SkyGuard) with ESMTP id 4ShWlF1GCdzMnPl; Fri, 1 Dec 2023 19:57:29 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with 
Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:22 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , "Michael S. Tsirkin" , Jason Wang , Alexei Starovoitov , Daniel Borkmann , Jesper Dangaard Brouer , John Fastabend , , , Subject: [PATCH RFC 4/6] vhost/net: remove vhost_net_page_frag_refill() Date: Fri, 1 Dec 2023 20:02:05 +0800 Message-ID: <20231201120208.15080-5-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The page frag in vhost_net_page_frag_refill() uses the 'struct page_frag' from skb_page_frag_refill(), but it's implementation is similar to page_frag_alloc_align() now. This patch removes vhost_net_page_frag_refill() by using 'struct page_frag_cache' instead of 'struct page_frag', and allocating frag using page_frag_alloc_align(). The added benefit is that not only unifying the page frag implementation a little, but also having about 0.5% performance boost testing by using the vhost_net_test introduced in the last patch. Signed-off-by: Yunsheng Lin --- drivers/vhost/net.c | 93 ++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e574e21cc0ca..805e11d598e4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -141,10 +141,8 @@ struct vhost_net { unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. 
*/ bool tx_flush; - /* Private page frag */ - struct page_frag page_frag; - /* Refcount bias of page frag */ - int refcnt_bias; + /* Private page frag cache */ + struct page_frag_cache pf_cache; }; =20 static unsigned vhost_net_zcopy_mask __read_mostly; @@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, s= ize_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } =20 -static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int= sz, - struct page_frag *pfrag, gfp_t gfp) -{ - if (pfrag->page) { - if (pfrag->offset + sz <=3D pfrag->size) - return true; - __page_frag_cache_drain(pfrag->page, net->refcnt_bias); - } - - pfrag->offset =3D 0; - net->refcnt_bias =3D 0; - if (SKB_FRAG_PAGE_ORDER) { - /* Avoid direct reclaim but allow kswapd to wake */ - pfrag->page =3D alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC, - SKB_FRAG_PAGE_ORDER); - if (likely(pfrag->page)) { - pfrag->size =3D PAGE_SIZE << SKB_FRAG_PAGE_ORDER; - goto done; - } - } - pfrag->page =3D alloc_page(gfp); - if (likely(pfrag->page)) { - pfrag->size =3D PAGE_SIZE; - goto done; - } - return false; - -done: - net->refcnt_bias =3D USHRT_MAX; - page_ref_add(pfrag->page, USHRT_MAX - 1); - return true; -} - #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) =20 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, @@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtque= ue *nvq, struct vhost_net *net =3D container_of(vq->dev, struct vhost_net, dev); struct socket *sock =3D vhost_vq_get_backend(vq); - struct page_frag *alloc_frag =3D &net->page_frag; struct virtio_net_hdr *gso; struct xdp_buff *xdp =3D &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; @@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtque= ue *nvq, int sock_hlen =3D nvq->sock_hlen; void *buf; int copied; + int ret; =20 if (unlikely(len < nvq->sock_hlen)) return -EFAULT; @@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtq= ueue *nvq, return -ENOSPC; =20 buflen +=3D SKB_DATA_ALIGN(len + pad); - alloc_frag->offset =3D ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); - if (unlikely(!vhost_net_page_frag_refill(net, buflen, - alloc_frag, GFP_KERNEL))) + buf =3D page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, + SMP_CACHE_BYTES); + if (unlikely(!buf)) return -ENOMEM; =20 - buf =3D (char *)page_address(alloc_frag->page) + alloc_frag->offset; - copied =3D copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + - offsetof(struct tun_xdp_hdr, gso), - sock_hlen, from); - if (copied !=3D sock_hlen) - return -EFAULT; + copied =3D copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), + sock_hlen, from); + if (copied !=3D sock_hlen) { + ret =3D -EFAULT; + goto err; + } =20 hdr =3D buf; gso =3D &hdr->gso; @@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtq= ueue *nvq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); =20 - if (vhost16_to_cpu(vq, gso->hdr_len) > len) - return -EINVAL; + if (vhost16_to_cpu(vq, gso->hdr_len) > len) { + ret =3D -EINVAL; + goto err; + } } =20 len -=3D sock_hlen; - copied =3D copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + pad, - len, from); - if (copied !=3D len) - return -EFAULT; + copied =3D copy_from_iter(buf + pad, len, from); + if (copied !=3D len) { + ret =3D -EFAULT; + goto err; + } =20 xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen =3D buflen; =20 - 
--net->refcnt_bias; - alloc_frag->offset +=3D buflen; - ++nvq->batched_xdp; =20 return 0; + +err: + page_frag_free(buf); + return ret; } =20 static void handle_tx_copy(struct vhost_net *net, struct socket *sock) @@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct= file *f) vqs[VHOST_NET_VQ_RX]); =20 f->private_data =3D n; - n->page_frag.page =3D NULL; - n->refcnt_bias =3D 0; + n->pf_cache.va =3D NULL; =20 return 0; } @@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, str= uct file *f) kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); - if (n->page_frag.page) - __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); + if (n->pf_cache.va) + __page_frag_cache_drain(virt_to_head_page(n->pf_cache.va), + n->pf_cache.pagecnt_bias); kvfree(n); return 0; } --=20 2.33.0 From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E1EEFC10DCE for ; Fri, 1 Dec 2023 12:02:41 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378699AbjLAMCd (ORCPT ); Fri, 1 Dec 2023 07:02:33 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:49916 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378712AbjLAMC1 (ORCPT ); Fri, 1 Dec 2023 07:02:27 -0500 Received: from szxga08-in.huawei.com (szxga08-in.huawei.com [45.249.212.255]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1C6B41720; Fri, 1 Dec 2023 04:02:32 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54]) by szxga08-in.huawei.com (SkyGuard) with ESMTP id 4ShWmn704Gz1P91G; Fri, 1 Dec 2023 19:58:49 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:30 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Jeroen de Borst , Praveen Kaligineedi , Shailend Chand , Eric Dumazet , Felix Fietkau , John Crispin , Sean Wang , Mark Lee , Lorenzo Bianconi , Matthias Brugger , AngeloGioacchino Del Regno , Keith Busch , Jens Axboe , Christoph Hellwig , Sagi Grimberg , Chaitanya Kulkarni , "Michael S. Tsirkin" , Jason Wang , Andrew Morton , , , , , , Subject: [PATCH RFC 5/6] net: introduce page_frag_cache_drain() Date: Fri, 1 Dec 2023 20:02:06 +0800 Message-ID: <20231201120208.15080-6-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" When draining a page_frag_cache, most user are doing the similar steps, so introduce an API to avoid code duplication. 
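
As a usage sketch (the driver structure and function below are hypothetical; only page_frag_cache_drain() itself comes from this patch), a teardown path that previously open-coded the NULL check, virt_to_head_page() and __page_frag_cache_drain() collapses to a single call:

    #include <linux/gfp.h>
    #include <linux/mm_types.h>

    struct foo_priv {                       /* hypothetical driver state */
            struct page_frag_cache pf_cache;
    };

    static void foo_teardown(struct foo_priv *priv)
    {
            /*
             * Replaces the repeated pattern:
             *   if (priv->pf_cache.va) {
             *           __page_frag_cache_drain(virt_to_head_page(priv->pf_cache.va),
             *                                   priv->pf_cache.pagecnt_bias);
             *           priv->pf_cache.va = NULL;
             *   }
             */
            page_frag_cache_drain(&priv->pf_cache);
    }
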
Signed-off-by: Yunsheng Lin --- drivers/net/ethernet/google/gve/gve_main.c | 11 ++--------- drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++--------------- drivers/nvme/host/tcp.c | 7 +------ drivers/nvme/target/tcp.c | 4 +--- drivers/vhost/net.c | 4 +--- include/linux/gfp.h | 2 ++ mm/page_alloc.c | 10 ++++++++++ 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ether= net/google/gve/gve_main.c index 619bf63ec935..d976190b0f4d 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *pri= v) =20 static void gve_drain_page_cache(struct gve_priv *priv) { - struct page_frag_cache *nc; int i; =20 - for (i =3D 0; i < priv->rx_cfg.num_queues; i++) { - nc =3D &priv->rx[i].page_cache; - if (nc->va) { - __page_frag_cache_drain(virt_to_page(nc->va), - nc->pagecnt_bias); - nc->va =3D NULL; - } - } + for (i =3D 0; i < priv->rx_cfg.num_queues; i++) + page_frag_cache_drain(&priv->rx[i].page_cache); } =20 static int gve_open(struct net_device *dev) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ether= net/mediatek/mtk_wed_wo.c index 7ffbd4fca881..df0a3ceaf59b 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c @@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk= _wed_wo_queue *q) static void mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *= q) { - struct page *page; int i; =20 for (i =3D 0; i < q->n_desc; i++) { @@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, stru= ct mtk_wed_wo_queue *q) entry->buf =3D NULL; } =20 - if (!q->cache.va) - return; - - page =3D virt_to_page(q->cache.va); - __page_frag_cache_drain(page, q->cache.pagecnt_bias); - memset(&q->cache, 0, sizeof(q->cache)); + page_frag_cache_drain(&q->cache); } =20 static void mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *= q) { - struct page *page; - for (;;) { void *buf =3D mtk_wed_wo_dequeue(wo, q, NULL, true); =20 @@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struc= t mtk_wed_wo_queue *q) skb_free_frag(buf); } =20 - if (!q->cache.va) - return; - - page =3D virt_to_page(q->cache.va); - __page_frag_cache_drain(page, q->cache.pagecnt_bias); - memset(&q->cache, 0, sizeof(q->cache)); + page_frag_cache_drain(&q->cache); } =20 static void diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 89661a9cf850..8d4f4a06f9d9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1338,7 +1338,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_c= trl *ctrl) =20 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) { - struct page *page; struct nvme_tcp_ctrl *ctrl =3D to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue =3D &ctrl->queues[qid]; unsigned int noreclaim_flag; @@ -1349,11 +1348,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nc= trl, int qid) if (queue->hdr_digest || queue->data_digest) nvme_tcp_free_crypto(queue); =20 - if (queue->pf_cache.va) { - page =3D virt_to_head_page(queue->pf_cache.va); - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); - queue->pf_cache.va =3D NULL; - } + page_frag_cache_drain(&queue->pf_cache); =20 noreclaim_flag =3D memalloc_noreclaim_save(); /* ->sock will be released by fput() */ diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 
92b74d0b8686..f9a553d70a61 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct= nvmet_tcp_queue *queue) =20 static void nvmet_tcp_release_queue_work(struct work_struct *w) { - struct page *page; struct nvmet_tcp_queue *queue =3D container_of(w, struct nvmet_tcp_queue, release_work); =20 @@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct work_= struct *w) if (queue->hdr_digest || queue->data_digest) nvmet_tcp_free_crypto(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); - page =3D virt_to_head_page(queue->pf_cache.va); - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); + page_frag_cache_drain(&queue->pf_cache); kfree(queue); } =20 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 805e11d598e4..4b2fcb228a0a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1386,9 +1386,7 @@ static int vhost_net_release(struct inode *inode, str= uct file *f) kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); - if (n->pf_cache.va) - __page_frag_cache_drain(virt_to_head_page(n->pf_cache.va), - n->pf_cache.pagecnt_bias); + page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bbd75976541e..03ba079655d3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -316,6 +316,8 @@ extern void *page_frag_alloc_align(struct page_frag_cac= he *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align); =20 +void page_frag_cache_drain(struct page_frag_cache *nc); + static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 083e0c38fb62..5a0e68edcb05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4716,6 +4716,16 @@ void __page_frag_cache_drain(struct page *page, unsi= gned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); =20 +void page_frag_cache_drain(struct page_frag_cache *nc) +{ + if (!nc->va) + return; + + __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); + nc->va =3D NULL; +} +EXPORT_SYMBOL(page_frag_cache_drain); + void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) --=20 2.33.0 From nobody Thu Nov 14 04:58:37 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 33559C4167B for ; Fri, 1 Dec 2023 12:02:52 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1378685AbjLAMCn (ORCPT ); Fri, 1 Dec 2023 07:02:43 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:50026 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1378703AbjLAMCd (ORCPT ); Fri, 1 Dec 2023 07:02:33 -0500 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BF0F11712; Fri, 1 Dec 2023 04:02:36 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWrC6DqNzWhy8; Fri, 1 Dec 2023 20:01:47 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, 
cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:34 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , "Michael S. Tsirkin" , Jason Wang , Xuan Zhuo , Subject: [PATCH RFC 6/6] tools: virtio: introduce vhost_net_test Date: Fri, 1 Dec 2023 20:02:07 +0800 Message-ID: <20231201120208.15080-7-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Originating-IP: [10.69.192.56] X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" introduce vhost_net_test basing on virtio_test to test vhost_net changing in the kernel. Signed-off-by: Yunsheng Lin --- tools/virtio/Makefile | 8 +- tools/virtio/vhost_net_test.c | 441 ++++++++++++++++++++++++++++++++++ 2 files changed, 446 insertions(+), 3 deletions(-) create mode 100644 tools/virtio/vhost_net_test.c diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d128925980e0..e25e99c1c3b7 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 all: test mod -test: virtio_test vringh_test +test: virtio_test vringh_test vhost_net_test virtio_test: virtio_ring.o virtio_test.o vringh_test: vringh_test.o vringh.o virtio_ring.o +vhost_net_test: virtio_ring.o vhost_net_test.o =20 try-run =3D $(shell set -e; \ if ($(1)) >/dev/null 2>&1; \ @@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=3Dclean =20 .PHONY: all test mod clean vhost oot oot-clean oot-build clean: - ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ - vhost_test/Module.symvers vhost_test/modules.order *.d + ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \ + vhost_test/.*.cmd vhost_test/Module.symvers \ + vhost_test/modules.order *.d -include *.d diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c new file mode 100644 index 000000000000..7e7b7aba3668 --- /dev/null +++ b/tools/virtio/vhost_net_test.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_BATCH -1 + +static int tun_alloc(void) +{ + struct ifreq ifr; + int fd, e; + + fd =3D open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("Cannot open /dev/net/tun"); + return fd; + } + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags =3D IFF_TUN | IFF_NO_PI; + strncpy(ifr.ifr_name, "tun0", IFNAMSIZ); + + e =3D ioctl(fd, TUNSETIFF, (void *) &ifr); + if (e < 0) { + perror("ioctl[TUNSETIFF]"); + close(fd); + return e; + } + + return fd; +} + +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + +struct vq_info { + int kick; + int call; + int num; + int idx; + void *ring; + /* copy used for control */ + struct vring vring; + struct virtqueue *vq; +}; + +struct vdev_info { + struct virtio_device vdev; + int control; + struct pollfd fds[1]; + struct vq_info vqs[1]; + int nvqs; + void *buf; + size_t buf_size; + struct vhost_memory *mem; +}; + +static struct vhost_vring_file no_backend =3D { .index =3D 1, .fd =3D -1 }, + backend =3D { .index =3D 1, .fd =3D 1 }; +static 
const struct vhost_vring_state null_state =3D {}; + +bool vq_notify(struct virtqueue *vq) +{ + struct vq_info *info =3D vq->priv; + unsigned long long v =3D 1; + int r; + r =3D write(info->kick, &v, sizeof v); + assert(r =3D=3D sizeof v); + return true; +} + +void vq_callback(struct virtqueue *vq) +{ +} + + +void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info) +{ + struct vhost_vring_state state =3D { .index =3D info->idx }; + struct vhost_vring_file file =3D { .index =3D info->idx }; + unsigned long long features =3D dev->vdev.features; + struct vhost_vring_addr addr =3D { + .index =3D info->idx, + .desc_user_addr =3D (uint64_t)(unsigned long)info->vring.desc, + .avail_user_addr =3D (uint64_t)(unsigned long)info->vring.avail, + .used_user_addr =3D (uint64_t)(unsigned long)info->vring.used, + }; + int r; + r =3D ioctl(dev->control, VHOST_SET_FEATURES, &features); + assert(r >=3D 0); + state.num =3D info->vring.num; + r =3D ioctl(dev->control, VHOST_SET_VRING_NUM, &state); + assert(r >=3D 0); + state.num =3D 0; + r =3D ioctl(dev->control, VHOST_SET_VRING_BASE, &state); + assert(r >=3D 0); + r =3D ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); + assert(r >=3D 0); + file.fd =3D info->kick; + r =3D ioctl(dev->control, VHOST_SET_VRING_KICK, &file); + assert(r >=3D 0); + file.fd =3D info->call; + r =3D ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + assert(r >=3D 0); +} + +static void vq_reset(struct vq_info *info, int num, struct virtio_device *= vdev) +{ + if (info->vq) + vring_del_virtqueue(info->vq); + + memset(info->ring, 0, vring_size(num, 4096)); + vring_init(&info->vring, num, info->ring, 4096); + info->vq =3D vring_new_virtqueue(info->idx, num, 4096, vdev, true, false, + info->ring, vq_notify, vq_callback, "test"); + assert(info->vq); + info->vq->priv =3D info; +} + +static void vq_info_add(struct vdev_info *dev, int num) +{ + struct vq_info *info =3D &dev->vqs[dev->nvqs]; + int r; + + /* use VHOST_NET_VQ_TX for testing */ + info->idx =3D 1; + info->kick =3D eventfd(0, EFD_NONBLOCK); + info->call =3D eventfd(0, EFD_NONBLOCK); + r =3D posix_memalign(&info->ring, 4096, vring_size(num, 4096)); + assert(r >=3D 0); + vq_reset(info, num, &dev->vdev); + vhost_vq_setup(dev, info); + dev->fds[0].fd =3D info->call; + dev->fds[0].events =3D POLLIN; + dev->nvqs++; +} + +static void vdev_info_init(struct vdev_info* dev, unsigned long long featu= res) +{ + int r; + memset(dev, 0, sizeof *dev); + dev->vdev.features =3D features; + INIT_LIST_HEAD(&dev->vdev.vqs); + spin_lock_init(&dev->vdev.vqs_list_lock); + dev->buf_size =3D 1024; + dev->buf =3D malloc(dev->buf_size); + assert(dev->buf); + dev->control =3D open("/dev/vhost-net", O_RDWR); + assert(dev->control >=3D 0); + r =3D ioctl(dev->control, VHOST_SET_OWNER, NULL); + assert(r >=3D 0); + dev->mem =3D malloc(offsetof(struct vhost_memory, regions) + + sizeof dev->mem->regions[0]); + assert(dev->mem); + memset(dev->mem, 0, offsetof(struct vhost_memory, regions) + + sizeof dev->mem->regions[0]); + dev->mem->nregions =3D 1; + dev->mem->regions[0].guest_phys_addr =3D (long)dev->buf; + dev->mem->regions[0].userspace_addr =3D (long)dev->buf; + dev->mem->regions[0].memory_size =3D dev->buf_size; + r =3D ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); + assert(r >=3D 0); +} + +/* TODO: this is pretty bad: we get a cache line bounce + * for the wait queue on poll and another one on read, + * plus the read which is there just to clear the + * current state. 
*/ +static void wait_for_interrupt(struct vdev_info *dev) +{ + int i; + unsigned long long val; + poll(dev->fds, dev->nvqs, -1); + for (i =3D 0; i < dev->nvqs; ++i) + if (dev->fds[i].revents & POLLIN) { + read(dev->fds[i].fd, &val, sizeof val); + } +} + +static void run_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int batch, int reset_n, int bufs) +{ + struct scatterlist sl; + long started =3D 0, completed =3D 0, next_reset =3D reset_n; + long completed_before, started_before; + int r; + unsigned int len; + long long spurious =3D 0; + const bool random_batch =3D batch =3D=3D RANDOM_BATCH; + + r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend); + assert(!r); + + if (!reset_n) { + next_reset =3D INT_MAX; + } + + for (;;) { + virtqueue_disable_cb(vq->vq); + completed_before =3D completed; + started_before =3D started; + do { + const bool reset =3D completed > next_reset; + if (random_batch) + batch =3D (random() % vq->vring.num) + 1; + + while (started < bufs && + (started - completed) < batch) { + sg_init_one(&sl, dev->buf, dev->buf_size); + r =3D virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->buf + started, + GFP_ATOMIC); + if (unlikely(r !=3D 0)) { + if (r =3D=3D -ENOSPC && + started > started_before) + r =3D 0; + else + r =3D -1; + break; + } + + ++started; + + if (unlikely(!virtqueue_kick(vq->vq))) { + r =3D -1; + break; + } + } + + if (started >=3D bufs) + r =3D -1; + + if (reset) { + r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND, + &no_backend); + assert(!r); + } + + /* Flush out completed bufs if any */ + while (virtqueue_get_buf(vq->vq, &len)) { + ++completed; + r =3D 0; + } + + if (reset) { + struct vhost_vring_state s =3D { .index =3D 0 }; + + vq_reset(vq, vq->vring.num, &dev->vdev); + + r =3D ioctl(dev->control, VHOST_GET_VRING_BASE, + &s); + assert(!r); + + s.num =3D 0; + r =3D ioctl(dev->control, VHOST_SET_VRING_BASE, + &null_state); + assert(!r); + + r =3D ioctl(dev->control, VHOST_NET_SET_BACKEND, + &backend); + assert(!r); + + started =3D completed; + while (completed > next_reset) + next_reset +=3D completed; + } + } while (r =3D=3D 0); + if (completed =3D=3D completed_before && started =3D=3D started_before) + ++spurious; + assert(completed <=3D bufs); + assert(started <=3D bufs); + if (completed =3D=3D bufs) + break; + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(dev); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(dev); + } + } + fprintf(stderr, + "spurious wakeups: 0x%llx started=3D0x%lx completed=3D0x%lx\n", + spurious, started, completed); +} + +const char optstring[] =3D "h"; +const struct option longopts[] =3D { + { + .name =3D "help", + .val =3D 'h', + }, + { + .name =3D "event-idx", + .val =3D 'E', + }, + { + .name =3D "no-event-idx", + .val =3D 'e', + }, + { + .name =3D "indirect", + .val =3D 'I', + }, + { + .name =3D "no-indirect", + .val =3D 'i', + }, + { + .name =3D "virtio-1", + .val =3D '1', + }, + { + .name =3D "no-virtio-1", + .val =3D '0', + }, + { + .name =3D "delayed-interrupt", + .val =3D 'D', + }, + { + .name =3D "no-delayed-interrupt", + .val =3D 'd', + }, + { + .name =3D "buf-num", + .val =3D 'n', + .has_arg =3D required_argument, + }, + { + .name =3D "batch", + .val =3D 'b', + .has_arg =3D required_argument, + }, + { + .name =3D "reset", + .val =3D 'r', + .has_arg =3D optional_argument, + }, + { + } +}; + +static void help(int status) +{ + fprintf(stderr, "Usage: virtio_test [--help]" + " [--no-indirect]" + " [--no-event-idx]" + " [--no-virtio-1]" + " 
[--delayed-interrupt]" + " [--batch=3Drandom/N]" + " [--reset=3DN]" + "\n"); + + exit(status); +} + +int main(int argc, char **argv) +{ + struct vdev_info dev; + unsigned long long features =3D (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | (1ULL << VIRTIO_F_VERSION_1); + long batch =3D 1, reset =3D 0, nbufs =3D 0x100000; + int o; + bool delayed =3D false; + + for (;;) { + o =3D getopt_long(argc, argv, optstring, longopts, NULL); + switch (o) { + case -1: + goto done; + case '?': + help(2); + case 'e': + features &=3D ~(1ULL << VIRTIO_RING_F_EVENT_IDX); + break; + case 'h': + help(0); + case 'i': + features &=3D ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); + break; + case '0': + features &=3D ~(1ULL << VIRTIO_F_VERSION_1); + break; + case 'D': + delayed =3D true; + break; + case 'b': + if (0 =3D=3D strcmp(optarg, "random")) { + batch =3D RANDOM_BATCH; + } else { + batch =3D strtol(optarg, NULL, 10); + assert(batch > 0); + assert(batch < (long)INT_MAX + 1); + } + break; + case 'r': + if (!optarg) { + reset =3D 1; + } else { + reset =3D strtol(optarg, NULL, 10); + assert(reset > 0); + assert(reset < (long)INT_MAX + 1); + } + break; + case 'n': + nbufs =3D strtol(optarg, NULL, 10); + assert(nbufs > 0); + break; + default: + assert(0); + break; + } + } + +done: + backend.fd =3D tun_alloc(); + assert(backend.fd >=3D 0); + vdev_info_init(&dev, features); + vq_info_add(&dev, 256); + run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs); + return 0; +} --=20 2.33.0
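
For reference, a typical way to build and run the new selftest (assuming a checked-out kernel tree and root privileges, since the test opens /dev/vhost-net and creates a tun0 device) would be:

    cd tools/virtio
    make vhost_net_test
    ./vhost_net_test --batch=random --buf-num=65536

where the long options follow the longopts table above (--batch, --buf-num, --reset, --help, plus the feature toggles such as --no-event-idx and --no-indirect).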