From nobody Fri Sep 20 04:04:16 2024 Received: from szxga04-in.huawei.com (szxga04-in.huawei.com [45.249.212.190]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 69CC6823BC for ; Thu, 28 Mar 2024 13:41:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=45.249.212.190 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1711633272; cv=none; b=uZJaac1DXD+3J3xcGr81zC/p5rpspCgTUnHMT3NM0VFO5gXluh8febN0OXJjQg0mCix8PJmiYUwN5o8nHfRbdj4rq698pnEKUdfo36baBu2Q0T4Pa6Afk5XrESXmY0oaht5Gd1Gd2p2srls2jjh0K+k4IagzerfPMM6Qy9FpPtc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1711633272; c=relaxed/simple; bh=G1s28Ul5IpWN4uFA7dMKP+hw7F2xOgJDOxMyjAlfcgU=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=fnRkY0lSReE1A9i4mJ6jJqD28eHT698Ww57ssmbiwz+qKQBUMShltRWbc9bgluzjrRTATqj7uLNCF/HTHhrTjpnSpcfs9f3OQ8Q9N2zeTuoIpircNldwPoYHxEEB62WP9EFD5RrQr1+n1JVZ83NhN4EWCxTFDJL88y2tG+5y134= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com; spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.190 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.19.88.214]) by szxga04-in.huawei.com (SkyGuard) with ESMTP id 4V54PG2BpJz29lRD; Thu, 28 Mar 2024 21:38:26 +0800 (CST) Received: from dggpemm500005.china.huawei.com (unknown [7.185.36.74]) by mail.maildlp.com (Postfix) with ESMTPS id CE73C1A016F; Thu, 28 Mar 2024 21:41:06 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.1.2507.35; Thu, 28 Mar 2024 21:41:06 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Willem de Bruijn , Jason Wang , Eric Dumazet , Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot , Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , John Fastabend , Jakub Sitnicki , David Ahern , Matthieu Baerts , Mat Martineau , Geliang Tang , Boris Pismenny , , Subject: [PATCH RFC 10/10] net: replace page_frag with page_frag_cache Date: Thu, 28 Mar 2024 21:38:39 +0800 Message-ID: <20240328133839.13620-11-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20240328133839.13620-1-linyunsheng@huawei.com> References: <20240328133839.13620-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-ClientProxiedBy: dggems705-chm.china.huawei.com (10.3.19.182) To dggpemm500005.china.huawei.com (7.185.36.74) Content-Type: text/plain; charset="utf-8" Use the newly introduced prepare/commit API to replace page_frag with page_frag_cache for sk_page_frag(). Signed-off-by: Yunsheng Lin --- drivers/net/tun.c | 36 ++++++----- include/linux/sched.h | 5 +- include/net/sock.h | 7 ++- kernel/exit.c | 3 +- kernel/fork.c | 2 +- net/core/skbuff.c | 29 +++++---- net/core/skmsg.c | 24 +++++--- net/core/sock.c | 24 ++++---- net/ipv4/ip_output.c | 37 ++++++----- net/ipv4/tcp.c | 33 +++++----- net/ipv4/tcp_output.c | 30 +++++---- net/ipv6/ip6_output.c | 37 ++++++----- net/kcm/kcmsock.c | 28 ++++----- net/mptcp/protocol.c | 72 ++++++++++++++-------- net/tls/tls_device.c | 139 ++++++++++++++++++++++++------------------ 15 files changed, 290 insertions(+), 216 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552..a1c9b51e146b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1598,10 +1598,11 @@ static bool tun_can_build_skb(struct tun_struct *tu= n, struct tun_file *tfile, } =20 static struct sk_buff *__tun_build_skb(struct tun_file *tfile, - struct page_frag *alloc_frag, char *buf, + struct page_frag_cache *alloc_frag, + char *buf, unsigned int offset, int buflen, int len, int pad) { - struct sk_buff *skb =3D build_skb(buf, buflen); + struct sk_buff *skb =3D build_skb(buf + offset, buflen); =20 if (!skb) return ERR_PTR(-ENOMEM); @@ -1609,9 +1610,7 @@ static struct sk_buff *__tun_build_skb(struct tun_fil= e *tfile, skb_reserve(skb, pad); skb_put(skb, len); skb_set_owner_w(skb, tfile->socket.sk); - - get_page(alloc_frag->page); - alloc_frag->offset +=3D buflen; + page_frag_alloc_commit(alloc_frag, offset, buflen); =20 return skb; } @@ -1660,9 +1659,10 @@ static struct sk_buff *tun_build_skb(struct tun_stru= ct *tun, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { - struct page_frag *alloc_frag =3D ¤t->task_frag; + struct page_frag_cache *alloc_frag =3D ¤t->task_frag; struct bpf_prog *xdp_prog; int buflen =3D SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + unsigned int offset, size; char *buf; size_t copied; int pad =3D TUN_RX_PAD; @@ -1675,14 +1675,13 @@ static struct sk_buff *tun_build_skb(struct tun_str= uct *tun, buflen +=3D SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); =20 - alloc_frag->offset =3D ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); - if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) + size =3D buflen; + buf =3D page_frag_alloc_prepare_align(alloc_frag, &offset, &size, + SMP_CACHE_BYTES, GFP_KERNEL); + if (unlikely(!buf)) return ERR_PTR(-ENOMEM); =20 - buf =3D (char *)page_address(alloc_frag->page) + alloc_frag->offset; - copied =3D copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + pad, - len, from); + copied =3D copy_from_iter(buf + offset + pad, len, from); if (copied !=3D len) return ERR_PTR(-EFAULT); =20 @@ -1692,8 +1691,8 @@ static struct sk_buff *tun_build_skb(struct tun_struc= t *tun, */ if (hdr->gso_type || !xdp_prog) { *skb_xdp =3D 1; - return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, - pad); + return __tun_build_skb(tfile, alloc_frag, buf, offset, buflen, + len, pad); } =20 *skb_xdp =3D 0; @@ -1710,13 +1709,12 @@ static struct sk_buff *tun_build_skb(struct tun_str= uct *tun, =20 act =3D bpf_prog_run_xdp(xdp_prog, &xdp); if (act =3D=3D XDP_REDIRECT || act =3D=3D XDP_TX) { - get_page(alloc_frag->page); - alloc_frag->offset +=3D buflen; + page_frag_alloc_commit(alloc_frag, offset, buflen); } err =3D tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act =3D=3D XDP_REDIRECT || act =3D=3D XDP_TX) - put_page(alloc_frag->page); + page_frag_free_va(buf); goto out; } =20 @@ -1731,8 +1729,8 @@ static struct sk_buff *tun_build_skb(struct tun_struc= t *tun, rcu_read_unlock(); local_bh_enable(); =20 - return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); - + return __tun_build_skb(tfile, alloc_frag, buf, offset, buflen, len, + pad); out: rcu_read_unlock(); local_bh_enable(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c2abbc587b4..436642be0867 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -45,6 +45,7 @@ #include #include #include +#include #include =20 /* task_struct member predeclarations (sorted alphabetically): */ @@ -1338,7 +1339,7 @@ struct task_struct { /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; =20 - struct page_frag task_frag; + struct page_frag_cache task_frag; =20 #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; diff --git a/include/net/sock.h b/include/net/sock.h index 20df93699b60..9d8fb8df1ce9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -461,7 +461,7 @@ struct sock { struct sk_buff_head sk_write_queue; u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ - struct page_frag sk_frag; + struct page_frag_cache sk_frag; struct timer_list sk_timer; =20 unsigned long sk_pacing_rate; /* bytes per second */ @@ -2560,7 +2560,7 @@ static inline void sk_stream_moderate_sndbuf(struct s= ock *sk) * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ -static inline struct page_frag *sk_page_frag(struct sock *sk) +static inline struct page_frag_cache *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return ¤t->task_frag; @@ -2568,7 +2568,8 @@ static inline struct page_frag *sk_page_frag(struct s= ock *sk) return &sk->sk_frag; } =20 -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +void *sk_page_frag_alloc_prepare(struct sock *sk, struct page_frag_cache *= pfrag, + unsigned int *size, unsigned int *offset); =20 /* * Default write policy as shown to user space via poll/select/SIGIO diff --git a/kernel/exit.c b/kernel/exit.c index 41a12630cbbc..8203275fd5ff 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -913,8 +913,7 @@ void __noreturn do_exit(long code) if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); =20 - if (tsk->task_frag.page) - put_page(tsk->task_frag.page); + page_frag_cache_drain(&tsk->task_frag); =20 exit_task_stack_account(tsk); =20 diff --git a/kernel/fork.c b/kernel/fork.c index 39a5046c2f0b..8e5abc30c47a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1158,10 +1158,10 @@ static struct task_struct *dup_task_struct(struct t= ask_struct *orig, int node) tsk->btrace_seq =3D 0; #endif tsk->splice_pipe =3D NULL; - tsk->task_frag.page =3D NULL; tsk->wake_q.next =3D NULL; tsk->worker_private =3D NULL; =20 + page_frag_cache_init(&tsk->task_frag); kcov_task_init(tsk); kmsan_task_create(tsk); kmap_local_fork(tsk); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e6bc0dec7463..88da8c52a121 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3017,23 +3017,24 @@ static void sock_spd_release(struct splice_pipe_des= c *spd, unsigned int i) put_page(spd->pages[i]); } =20 -static struct page *linear_to_page(struct page *page, unsigned int *len, - unsigned int *offset, - struct sock *sk) +static struct page *linear_to_page(struct page_frag_cache *pfrag, + struct page *page, unsigned int *offset, + unsigned int *len, struct sock *sk) { - struct page_frag *pfrag =3D sk_page_frag(sk); + unsigned int new_len, new_offset; + void *va; =20 - if (!sk_page_frag_refill(sk, pfrag)) + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &new_offset, &new_len); + if (!va) return NULL; =20 - *len =3D min_t(unsigned int, *len, pfrag->size - pfrag->offset); + *len =3D min_t(unsigned int, *len, new_len); =20 - memcpy(page_address(pfrag->page) + pfrag->offset, + memcpy(va + new_offset, page_address(page) + *offset, *len); - *offset =3D pfrag->offset; - pfrag->offset +=3D *len; + *offset =3D new_offset; =20 - return pfrag->page; + return virt_to_page(va); } =20 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, @@ -3055,19 +3056,23 @@ static bool spd_fill_page(struct splice_pipe_desc *= spd, bool linear, struct sock *sk) { + struct page_frag_cache *pfrag =3D sk_page_frag(sk); + if (unlikely(spd->nr_pages =3D=3D MAX_SKB_FRAGS)) return true; =20 if (linear) { - page =3D linear_to_page(page, len, &offset, sk); + page =3D linear_to_page(pfrag, page, &offset, len, sk); if (!page) return true; } if (spd_can_coalesce(spd, page, offset)) { spd->partial[spd->nr_pages - 1].len +=3D *len; + page_frag_alloc_commit_noref(pfrag, offset, *len); return false; } - get_page(page); + + page_frag_alloc_commit(pfrag, offset, *len); spd->pages[spd->nr_pages] =3D page; spd->partial[spd->nr_pages].len =3D *len; spd->partial[spd->nr_pages].offset =3D offset; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4d75ef9d24bf..b843083bdd4e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -27,23 +27,26 @@ static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, = int elem_first_coalesce) int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { - struct page_frag *pfrag =3D sk_page_frag(sk); + struct page_frag_cache *pfrag =3D sk_page_frag(sk); u32 osize =3D msg->sg.size; int ret =3D 0; =20 len -=3D msg->sg.size; while (len > 0) { + unsigned int frag_offset, frag_len; struct scatterlist *sge; - u32 orig_offset; + struct page *page; int use, i; + void *va; =20 - if (!sk_page_frag_refill(sk, pfrag)) { + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &frag_offset, + &frag_len); + if (!va) { ret =3D -ENOMEM; goto msg_trim; } =20 - orig_offset =3D pfrag->offset; - use =3D min_t(int, len, pfrag->size - orig_offset); + use =3D min_t(int, len, frag_len); if (!sk_wmem_schedule(sk, use)) { ret =3D -ENOMEM; goto msg_trim; @@ -53,10 +56,12 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, i= nt len, sk_msg_iter_var_prev(i); sge =3D &msg->sg.data[i]; =20 + page =3D virt_to_page(va); if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && - sg_page(sge) =3D=3D pfrag->page && - sge->offset + sge->length =3D=3D orig_offset) { + sg_page(sge) =3D=3D page && + sge->offset + sge->length =3D=3D frag_offset) { sge->length +=3D use; + page_frag_alloc_commit_noref(pfrag, frag_offset, use); } else { if (sk_msg_full(msg)) { ret =3D -ENOSPC; @@ -65,14 +70,13 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, i= nt len, =20 sge =3D &msg->sg.data[msg->sg.end]; sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); + sg_set_page(sge, page, use, frag_offset); + page_frag_alloc_commit(pfrag, frag_offset, use); sk_msg_iter_next(msg, end); } =20 sk_mem_charge(sk, use); msg->sg.size +=3D use; - pfrag->offset +=3D use; len -=3D use; } =20 diff --git a/net/core/sock.c b/net/core/sock.c index 43bf3818c19e..e10247783ada 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2191,10 +2191,7 @@ static void __sk_destruct(struct rcu_head *head) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); =20 - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page =3D NULL; - } + page_frag_cache_drain(&sk->sk_frag); =20 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); @@ -2936,16 +2933,21 @@ bool skb_page_frag_refill(unsigned int sz, struct p= age_frag *pfrag, gfp_t gfp) } EXPORT_SYMBOL(skb_page_frag_refill); =20 -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +void *sk_page_frag_alloc_prepare(struct sock *sk, struct page_frag_cache *= pfrag, + unsigned int *offset, unsigned int *size) { - if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) - return true; + void *va; + + *size =3D 32U; + va =3D page_frag_alloc_prepare(pfrag, offset, size, sk->sk_allocation); + if (likely(va)) + return va; =20 sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); - return false; + return NULL; } -EXPORT_SYMBOL(sk_page_frag_refill); +EXPORT_SYMBOL(sk_page_frag_alloc_prepare); =20 void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) @@ -3479,8 +3481,8 @@ void sock_init_data_uid(struct socket *sock, struct s= ock *sk, kuid_t uid) sk->sk_error_report =3D sock_def_error_report; sk->sk_destruct =3D sock_def_destruct; =20 - sk->sk_frag.page =3D NULL; - sk->sk_frag.offset =3D 0; + page_frag_cache_init(&sk->sk_frag); + sk->sk_peek_off =3D -1; =20 sk->sk_peer_pid =3D NULL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1fe794967211..2e96bf6935e1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -952,7 +952,7 @@ static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, - struct page_frag *pfrag, + struct page_frag_cache *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -1228,31 +1228,40 @@ static int __ip_append_data(struct sock *sk, wmem_alloc_delta +=3D copy; } else if (!zc) { int i =3D skb_shinfo(skb)->nr_frags; + unsigned int frag_offset, frag_size; + struct page *page; + void *va; =20 err =3D -ENOMEM; - if (!sk_page_frag_refill(sk, pfrag)) + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &frag_offset, + &frag_size); + if (!va) goto error; =20 + page =3D virt_to_page(va); skb_zcopy_downgrade_managed(skb); - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { + copy =3D min_t(int, copy, frag_size); + if (getfrag(from, va + frag_offset, offset, copy, + skb->len, skb) < 0) + goto error_efault; + + if (!skb_can_coalesce(skb, i, page, frag_offset)) { err =3D -EMSGSIZE; if (i =3D=3D MAX_SKB_FRAGS) goto error; =20 - __skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, 0); + __skb_fill_page_desc(skb, i, page, frag_offset, + copy); skb_shinfo(skb)->nr_frags =3D ++i; - get_page(pfrag->page); + page_frag_alloc_commit(pfrag, frag_offset, + copy); + } else { + skb_frag_size_add( + &skb_shinfo(skb)->frags[i - 1], copy); + page_frag_alloc_commit_noref(pfrag, frag_offset, + copy); } - copy =3D min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, - page_address(pfrag->page) + pfrag->offset, - offset, copy, skb->len, skb) < 0) - goto error_efault; =20 - pfrag->offset +=3D copy; - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb_len_add(skb, copy); wmem_alloc_delta +=3D copy; } else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d20b62d52171..7f01d1a1e0ee 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1177,13 +1177,17 @@ int tcp_sendmsg_locked(struct sock *sk, struct msgh= dr *msg, size_t size) if (zc =3D=3D 0) { bool merge =3D true; int i =3D skb_shinfo(skb)->nr_frags; - struct page_frag *pfrag =3D sk_page_frag(sk); + struct page_frag_cache *pfrag =3D sk_page_frag(sk); + unsigned int offset, size; + struct page *page; + void *va; =20 - if (!sk_page_frag_refill(sk, pfrag)) + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &offset, &size); + if (!va) goto wait_for_space; =20 - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { + page =3D virt_to_page(va); + if (!skb_can_coalesce(skb, i, page, offset)) { if (i >=3D READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; @@ -1191,7 +1195,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr= *msg, size_t size) merge =3D false; } =20 - copy =3D min_t(int, copy, pfrag->size - pfrag->offset); + copy =3D min_t(int, copy, size); =20 if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) @@ -1203,22 +1207,19 @@ int tcp_sendmsg_locked(struct sock *sk, struct msgh= dr *msg, size_t size) if (!copy) goto wait_for_space; =20 - err =3D skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, - pfrag->page, - pfrag->offset, - copy); + err =3D skb_copy_to_va_nocache(sk, &msg->msg_iter, skb, + va + offset, copy); if (err) goto do_error; =20 /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + page_frag_alloc_commit_noref(pfrag, offset, copy); } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - page_ref_inc(pfrag->page); + skb_fill_page_desc(skb, i, page, offset, copy); + page_frag_alloc_commit(pfrag, offset, copy); } - pfrag->offset +=3D copy; } else if (zc =3D=3D MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb @@ -3099,11 +3100,7 @@ int tcp_disconnect(struct sock *sk, int flags) =20 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); =20 - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page =3D NULL; - sk->sk_frag.offset =3D 0; - } + page_frag_cache_drain(&sk->sk_frag); sk_error_report(sk); return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3167ad96567..b41293981066 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3927,9 +3927,12 @@ static int tcp_send_syn_data(struct sock *sk, struct= sk_buff *syn) struct inet_connection_sock *icsk =3D inet_csk(sk); struct tcp_sock *tp =3D tcp_sk(sk); struct tcp_fastopen_request *fo =3D tp->fastopen_req; - struct page_frag *pfrag =3D sk_page_frag(sk); + struct page_frag_cache *pfrag =3D sk_page_frag(sk); + unsigned int offset, size; struct sk_buff *syn_data; int space, err =3D 0; + struct page *page; + void *va; =20 tp->rx_opt.mss_clamp =3D tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) @@ -3948,30 +3951,33 @@ static int tcp_send_syn_data(struct sock *sk, struc= t sk_buff *syn) =20 space =3D min_t(size_t, space, fo->size); =20 - if (space && - !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), - pfrag, sk->sk_allocation)) - goto fallback; + if (space) { + size =3D min_t(size_t, space, PAGE_SIZE); + va =3D page_frag_alloc_prepare(pfrag, &offset, &size, + sk->sk_allocation); + if (!va) + goto fallback; + } + syn_data =3D tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { - space =3D min_t(size_t, space, pfrag->size - pfrag->offset); + space =3D min_t(size_t, space, size); space =3D tcp_wmem_schedule(sk, space); } if (space) { - space =3D copy_page_from_iter(pfrag->page, pfrag->offset, - space, &fo->data->msg_iter); + space =3D _copy_from_iter(va + offset, space, + &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } - skb_fill_page_desc(syn_data, 0, pfrag->page, - pfrag->offset, space); - page_ref_inc(pfrag->page); - pfrag->offset +=3D space; + page =3D virt_to_page(va); + skb_fill_page_desc(syn_data, 0, page, offset, space); + page_frag_alloc_commit(pfrag, offset, space); skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b9dd3a66e423..c95dcb04c3b2 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1404,7 +1404,7 @@ static int __ip6_append_data(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork_full, struct inet6_cork *v6_cork, - struct page_frag *pfrag, + struct page_frag_cache *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, @@ -1745,32 +1745,41 @@ static int __ip6_append_data(struct sock *sk, copy =3D err; wmem_alloc_delta +=3D copy; } else if (!zc) { + unsigned int frag_offset, frag_size; int i =3D skb_shinfo(skb)->nr_frags; + struct page *page; + void *va; =20 err =3D -ENOMEM; - if (!sk_page_frag_refill(sk, pfrag)) + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &frag_offset, + &frag_size); + if (!va) goto error; =20 + page =3D virt_to_page(va); skb_zcopy_downgrade_managed(skb); - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { + copy =3D min_t(int, copy, frag_size); + if (getfrag(from, va + frag_offset, offset, copy, + skb->len, skb) < 0) + goto error_efault; + + if (!skb_can_coalesce(skb, i, page, frag_offset)) { err =3D -EMSGSIZE; if (i =3D=3D MAX_SKB_FRAGS) goto error; =20 - __skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, 0); + __skb_fill_page_desc(skb, i, page, frag_offset, + copy); skb_shinfo(skb)->nr_frags =3D ++i; - get_page(pfrag->page); + page_frag_alloc_commit(pfrag, frag_offset, + copy); + } else { + skb_frag_size_add( + &skb_shinfo(skb)->frags[i - 1], copy); + page_frag_alloc_commit_noref(pfrag, frag_offset, + copy); } - copy =3D min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, - page_address(pfrag->page) + pfrag->offset, - offset, copy, skb->len, skb) < 0) - goto error_efault; =20 - pfrag->offset +=3D copy; - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len +=3D copy; skb->data_len +=3D copy; skb->truesize +=3D copy; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 2f191e50d4fc..5e64ae32f760 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -803,13 +803,17 @@ static int kcm_sendmsg(struct socket *sock, struct ms= ghdr *msg, size_t len) while (msg_data_left(msg)) { bool merge =3D true; int i =3D skb_shinfo(skb)->nr_frags; - struct page_frag *pfrag =3D sk_page_frag(sk); + struct page_frag_cache *pfrag =3D sk_page_frag(sk); + unsigned int offset, size; + struct page *page; + void *va; =20 - if (!sk_page_frag_refill(sk, pfrag)) + va =3D sk_page_frag_alloc_prepare(sk, pfrag, &offset, &size); + if (!va) goto wait_for_memory; =20 - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { + page =3D virt_to_page(va); + if (!skb_can_coalesce(skb, i, page, offset)) { if (i =3D=3D MAX_SKB_FRAGS) { struct sk_buff *tskb; =20 @@ -850,15 +854,12 @@ static int kcm_sendmsg(struct socket *sock, struct ms= ghdr *msg, size_t len) if (head !=3D skb) head->truesize +=3D copy; } else { - copy =3D min_t(int, msg_data_left(msg), - pfrag->size - pfrag->offset); + copy =3D min_t(int, msg_data_left(msg), size); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; =20 - err =3D skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, - pfrag->page, - pfrag->offset, - copy); + err =3D skb_copy_to_va_nocache(sk, &msg->msg_iter, skb, + va + offset, copy); if (err) goto out_error; =20 @@ -866,13 +867,12 @@ static int kcm_sendmsg(struct socket *sock, struct ms= ghdr *msg, size_t len) if (merge) { skb_frag_size_add( &skb_shinfo(skb)->frags[i - 1], copy); + page_frag_alloc_commit_noref(pfrag, offset, copy); } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - get_page(pfrag->page); + skb_fill_page_desc(skb, i, page, offset, copy); + page_frag_alloc_commit(pfrag, offset, copy); } =20 - pfrag->offset +=3D copy; } =20 copied +=3D copy; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3a1967bc7bad..7253e4950feb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -957,17 +957,16 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, } =20 /* we can append data to the given data frag if: - * - there is space available in the backing page_frag - * - the data frag tail matches the current page_frag free offset + * - the data frag tail matches the current page and offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, - const struct page_frag *pfrag, + const struct page *page, + const unsigned int offset, const struct mptcp_data_frag *df) { - return df && pfrag->page =3D=3D df->page && - pfrag->size - pfrag->offset > 0 && - pfrag->offset =3D=3D (df->offset + df->data_len) && + return df && page =3D=3D df->page && + offset =3D=3D (df->offset + df->data_len) && df->data_seq + df->data_len =3D=3D msk->write_seq; } =20 @@ -1082,30 +1081,35 @@ static void mptcp_enter_memory_pressure(struct sock= *sk) /* ensure we get enough memory for the frag hdr, beyond some minimal amoun= t of * data */ -static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfra= g) +static void *mptcp_page_frag_alloc_prepare(struct sock *sk, + struct page_frag_cache *pfrag, + unsigned int *offset, + unsigned int *size) { - if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), - pfrag, sk->sk_allocation))) - return true; + void *va; + + va =3D page_frag_alloc_prepare(pfrag, offset, size, sk->sk_allocation); + if (likely(va)) + return va; =20 mptcp_enter_memory_pressure(sk); - return false; + return NULL; } =20 static struct mptcp_data_frag * -mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfra= g, - int orig_offset) +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page *page, + unsigned int orig_offset) { int offset =3D ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; =20 - dfrag =3D (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); + dfrag =3D (struct mptcp_data_frag *)(page_to_virt(page) + offset); dfrag->data_len =3D 0; dfrag->data_seq =3D msk->write_seq; dfrag->overhead =3D offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset =3D offset + sizeof(struct mptcp_data_frag); dfrag->already_sent =3D 0; - dfrag->page =3D pfrag->page; + dfrag->page =3D page; =20 return dfrag; } @@ -1788,7 +1792,7 @@ static u32 mptcp_send_limit(const struct sock *sk) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk =3D mptcp_sk(sk); - struct page_frag *pfrag; + struct page_frag_cache *pfrag; size_t copied =3D 0; int ret =3D 0; long timeo; @@ -1827,9 +1831,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msg= hdr *msg, size_t len) while (msg_data_left(msg)) { int total_ts, frag_truesize =3D 0; struct mptcp_data_frag *dfrag; - bool dfrag_collapsed; - size_t psize, offset; + bool dfrag_collapsed =3D false; + unsigned int offset, size; + struct page *page; + size_t psize; u32 copy_limit; + void *va; =20 /* ensure fitting the notsent_lowat() constraint */ copy_limit =3D mptcp_send_limit(sk); @@ -1840,12 +1847,23 @@ static int mptcp_sendmsg(struct sock *sk, struct ms= ghdr *msg, size_t len) * page allocator */ dfrag =3D mptcp_pending_tail(sk); - dfrag_collapsed =3D mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + size =3D 32U; + va =3D mptcp_page_frag_alloc_prepare(sk, pfrag, &offset, &size); + if (!va) + goto wait_for_memory; + + page =3D virt_to_page(va); + dfrag_collapsed =3D mptcp_frag_can_collapse_to(msk, page, offset, + dfrag); if (!dfrag_collapsed) { - if (!mptcp_page_frag_refill(sk, pfrag)) + size =3D 32U + sizeof(struct mptcp_data_frag); + va =3D mptcp_page_frag_alloc_prepare(sk, pfrag, &offset, + &size); + if (!va) goto wait_for_memory; =20 - dfrag =3D mptcp_carve_data_frag(msk, pfrag, pfrag->offset); + page =3D virt_to_page(va); + dfrag =3D mptcp_carve_data_frag(msk, page, offset); frag_truesize =3D dfrag->overhead; } =20 @@ -1853,8 +1871,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msgh= dr *msg, size_t len) * memory accounting will prevent execessive memory usage * anyway */ - offset =3D dfrag->offset + dfrag->data_len; - psize =3D pfrag->size - offset; + psize =3D size - frag_truesize; psize =3D min_t(size_t, psize, msg_data_left(msg)); psize =3D min_t(size_t, psize, copy_limit); total_ts =3D psize + frag_truesize; @@ -1863,7 +1880,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msgh= dr *msg, size_t len) goto wait_for_memory; =20 ret =3D do_copy_data_nocache(sk, psize, &msg->msg_iter, - page_address(dfrag->page) + offset); + va + dfrag->offset + dfrag->data_len); if (ret) goto do_error; =20 @@ -1872,7 +1889,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msgh= dr *msg, size_t len) copied +=3D psize; dfrag->data_len +=3D psize; frag_truesize +=3D psize; - pfrag->offset +=3D frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); =20 /* charge data on mptcp pending queue to the msk socket @@ -1880,11 +1896,15 @@ static int mptcp_sendmsg(struct sock *sk, struct ms= ghdr *msg, size_t len) */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { - get_page(dfrag->page); + page_frag_alloc_commit(pfrag, offset, frag_truesize); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); + } else { + page_frag_alloc_commit_noref(pfrag, offset, + frag_truesize); } + pr_debug("msk=3D%p dfrag at seq=3D%llu len=3D%u sent=3D%u new=3D%d", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf8ed36b1ad6..af9cf86b727a 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -255,25 +255,43 @@ static void tls_device_resync_tx(struct sock *sk, str= uct tls_context *tls_ctx, clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); } =20 -static void tls_append_frag(struct tls_record_info *record, - struct page_frag *pfrag, - int size) +static void tls_append_pfrag(struct tls_record_info *record, + struct page_frag_cache *pfrag, struct page *page, + unsigned int offset, unsigned int size) { skb_frag_t *frag; =20 frag =3D &record->frags[record->num_frags - 1]; - if (skb_frag_page(frag) =3D=3D pfrag->page && - skb_frag_off(frag) + skb_frag_size(frag) =3D=3D pfrag->offset) { + if (skb_frag_page(frag) =3D=3D page && + skb_frag_off(frag) + skb_frag_size(frag) =3D=3D offset) { skb_frag_size_add(frag, size); + page_frag_alloc_commit_noref(pfrag, offset, size); } else { ++frag; - skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, - size); + skb_frag_fill_page_desc(frag, page, offset, size); ++record->num_frags; - get_page(pfrag->page); + page_frag_alloc_commit(pfrag, offset, size); + } + + record->len +=3D size; +} + +static void tls_append_page(struct tls_record_info *record, struct page *p= age, + unsigned int offset, unsigned int size) +{ + skb_frag_t *frag; + + frag =3D &record->frags[record->num_frags - 1]; + if (skb_frag_page(frag) =3D=3D page && + skb_frag_off(frag) + skb_frag_size(frag) =3D=3D offset) { + skb_frag_size_add(frag, size); + } else { + ++frag; + skb_frag_fill_page_desc(frag, page, offset, size); + ++record->num_frags; + get_page(page); } =20 - pfrag->offset +=3D size; record->len +=3D size; } =20 @@ -314,11 +332,13 @@ static int tls_push_record(struct sock *sk, static void tls_device_record_close(struct sock *sk, struct tls_context *ctx, struct tls_record_info *record, - struct page_frag *pfrag, + struct page_frag_cache *pfrag, unsigned char record_type) { struct tls_prot_info *prot =3D &ctx->prot_info; - struct page_frag dummy_tag_frag; + unsigned int offset, size; + struct page *page; + void *va; =20 /* append tag * device will fill in the tag, we just need to append a placeholder @@ -326,13 +346,14 @@ static void tls_device_record_close(struct sock *sk, * increases frag count) * if we can't allocate memory now use the dummy page */ - if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && - !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { - dummy_tag_frag.page =3D dummy_page; - dummy_tag_frag.offset =3D 0; - pfrag =3D &dummy_tag_frag; + size =3D prot->tag_size; + va =3D page_frag_alloc_prepare(pfrag, &offset, &size, sk->sk_allocation); + if (unlikely(!va)) { + tls_append_page(record, dummy_page, 0, prot->tag_size); + } else { + page =3D virt_to_page(va); + tls_append_pfrag(record, pfrag, page, offset, prot->tag_size); } - tls_append_frag(record, pfrag, prot->tag_size); =20 /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), @@ -340,23 +361,34 @@ static void tls_device_record_close(struct sock *sk, record_type); } =20 -static int tls_create_new_record(struct tls_offload_context_tx *offload_ct= x, - struct page_frag *pfrag, +static int tls_create_new_record(struct sock *sk, + struct tls_offload_context_tx *offload_ctx, + struct page_frag_cache *pfrag, size_t prepend_size) { struct tls_record_info *record; + unsigned int offset, size; + struct page *page; skb_frag_t *frag; + void *va; + + size =3D prepend_size; + va =3D page_frag_alloc_prepare(pfrag, &offset, &size, sk->sk_allocation); + if (!va) { + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return -ENOMEM; + } =20 record =3D kmalloc(sizeof(*record), GFP_KERNEL); if (!record) return -ENOMEM; =20 + page =3D virt_to_page(va); frag =3D &record->frags[0]; - skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, - prepend_size); + skb_frag_fill_page_desc(frag, page, offset, prepend_size); =20 - get_page(pfrag->page); - pfrag->offset +=3D prepend_size; + page_frag_alloc_commit(pfrag, offset, prepend_size); =20 record->num_frags =3D 1; record->len =3D prepend_size; @@ -364,33 +396,21 @@ static int tls_create_new_record(struct tls_offload_c= ontext_tx *offload_ctx, return 0; } =20 -static int tls_do_allocation(struct sock *sk, - struct tls_offload_context_tx *offload_ctx, - struct page_frag *pfrag, - size_t prepend_size) +static void *tls_do_allocation(struct sock *sk, + struct tls_offload_context_tx *offload_ctx, + struct page_frag_cache *pfrag, + size_t prepend_size, unsigned int *offset, + unsigned int *size) { - int ret; - if (!offload_ctx->open_record) { - if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, - sk->sk_allocation))) { - READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); - sk_stream_moderate_sndbuf(sk); - return -ENOMEM; - } + int ret; =20 - ret =3D tls_create_new_record(offload_ctx, pfrag, prepend_size); + ret =3D tls_create_new_record(sk, offload_ctx, pfrag, prepend_size); if (ret) - return ret; - - if (pfrag->size > pfrag->offset) - return 0; + return NULL; } =20 - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; - - return 0; + return sk_page_frag_alloc_prepare(sk, pfrag, offset, size); } =20 static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter = *i) @@ -427,8 +447,8 @@ static int tls_push_data(struct sock *sk, struct tls_prot_info *prot =3D &tls_ctx->prot_info; struct tls_offload_context_tx *ctx =3D tls_offload_ctx_tx(tls_ctx); struct tls_record_info *record; + struct page_frag_cache *pfrag; int tls_push_record_flags; - struct page_frag *pfrag; size_t orig_size =3D size; u32 max_open_record_len; bool more =3D false; @@ -465,8 +485,13 @@ static int tls_push_data(struct sock *sk, max_open_record_len =3D TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { - rc =3D tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); - if (unlikely(rc)) { + unsigned int frag_offset, frag_size; + struct page *page; + void *va; + + va =3D tls_do_allocation(sk, ctx, pfrag, prot->prepend_size, + &frag_offset, &frag_size); + if (unlikely(!va)) { rc =3D sk_stream_wait_memory(sk, &timeo); if (!rc) continue; @@ -494,8 +519,7 @@ static int tls_push_data(struct sock *sk, =20 copy =3D min_t(size_t, size, max_open_record_len - record->len); if (copy && (flags & MSG_SPLICE_PAGES)) { - struct page_frag zc_pfrag; - struct page **pages =3D &zc_pfrag.page; + struct page **pages =3D &page; size_t off; =20 rc =3D iov_iter_extract_pages(iter, &pages, @@ -507,24 +531,23 @@ static int tls_push_data(struct sock *sk, } copy =3D rc; =20 - if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { + if (WARN_ON_ONCE(!sendpage_ok(page))) { iov_iter_revert(iter, copy); rc =3D -EIO; goto handle_error; } =20 - zc_pfrag.offset =3D off; - zc_pfrag.size =3D copy; - tls_append_frag(record, &zc_pfrag, copy); + tls_append_page(record, page, off, copy); } else if (copy) { - copy =3D min_t(size_t, copy, pfrag->size - pfrag->offset); + copy =3D min_t(size_t, copy, frag_size); =20 - rc =3D tls_device_copy_data(page_address(pfrag->page) + - pfrag->offset, copy, - iter); + rc =3D tls_device_copy_data(va + frag_offset, copy, iter); if (rc) goto handle_error; - tls_append_frag(record, pfrag, copy); + + page =3D virt_to_page(va); + tls_append_pfrag(record, pfrag, page, frag_offset, + copy); } =20 size -=3D copy; --=20 2.33.0