From nobody Wed Dec 17 10:21:24 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7AD2222B59D for ; Fri, 21 Mar 2025 16:14:27 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573669; cv=none; b=A7WnNVRKvG/0DxaGinrHuPj0imhsl4NZbct06tmGUiJoXMLt39vfVG+U+88CiHWlbUr6ndS9lsHZmYy8SwLHTRw4OsvWYGQXpohSNrKIQCBC4IAbzHKhlN+89+QTegK5WI7EIi4/tC/M4GKmZObIG0fiPgaA4UP5sqC3N2kIm+4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573669; c=relaxed/simple; bh=9ndzdhte+6JNVy/8URnQXZLHzsej+iACNbrr7M5jc8Y=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=pFkcx9LoerdpJ+OObtjE4EMji8IpfQQTYpI61lAeNfJrmEonZMR2Go7HhNfLPkh4h1ovP9hRosYJVpLJ1MsZnJJopFRn4gJ82y+37lku4qIqSx2Z13dLJu5Nx60QylSImm3df/44mLcSLvCqBW86biSk6qdi3ldu5QwQxNtF3y8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=ReVVc8x7; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="ReVVc8x7" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1742573666; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: 
in-reply-to:in-reply-to:references:references; bh=D0wLDVf11WSl4C3Uw20envF+F+Pk9F9IBIa6lbPk+B4=; b=ReVVc8x7glk9F3twFW+Tb76dMivjikFAN0Ad1aW82a4XQw4RPKaA8p2fldsYEM5nKvQGop nSMHmI6SbvegcCHcL1BMriHIYIF9oACb+/krWjRaI5zvY7lVwIDWP5JnMrF992g/ujY64c RWTC6UvTZptQ660AmzQH8iR4UaQOCl4= Received: from mx-prod-mc-06.mail-002.prod.us-west-2.aws.redhat.com (ec2-35-165-154-97.us-west-2.compute.amazonaws.com [35.165.154.97]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-584-y0Bk-TPyNnulgNwPsh9IvA-1; Fri, 21 Mar 2025 12:14:22 -0400 X-MC-Unique: y0Bk-TPyNnulgNwPsh9IvA-1 X-Mimecast-MFC-AGG-ID: y0Bk-TPyNnulgNwPsh9IvA_1742573659 Received: from mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com (mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.15]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mx-prod-mc-06.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS id CFE271800875; Fri, 21 Mar 2025 16:14:18 +0000 (UTC) Received: from warthog.procyon.org.com (unknown [10.42.28.61]) by mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP id 7507E1955BFE; Fri, 21 Mar 2025 16:14:15 +0000 (UTC) From: David Howells To: Leon Romanovsky Cc: David Howells , Christian Brauner , Matthew Wilcox , Chuck Lever , Steve French , Ilya Dryomov , netfs@lists.linux.dev, linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Subject: [RFC PATCH 1/4] iov_iter: Move ITER_DISCARD and ITER_XARRAY iteration out-of-line Date: Fri, 21 Mar 2025 16:14:01 +0000 Message-ID: <20250321161407.3333724-2-dhowells@redhat.com> In-Reply-To: <20250321161407.3333724-1-dhowells@redhat.com> References: <20250321161407.3333724-1-dhowells@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: 
List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.0 on 10.30.177.15 Content-Type: text/plain; charset="utf-8" Move ITER_DISCARD and ITER_XARRAY iteration out-of-line in preparation of adding other iteration types which will also be out-of-line. Signed-off-by: David Howells --- include/linux/iov_iter.h | 77 +++----------------------------------- lib/iov_iter.c | 81 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 72 deletions(-) diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index c4aa58032faf..0c47933df517 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -17,6 +17,9 @@ typedef size_t (*iov_step_f)(void *iter_base, size_t prog= ress, size_t len, typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, siz= e_t len, void *priv, void *priv2); =20 +size_t __iterate_and_advance2(struct iov_iter *iter, size_t len, void *pri= v, + void *priv2, iov_ustep_f ustep, iov_step_f step); + /* * Handle ITER_UBUF. */ @@ -195,72 +198,6 @@ size_t iterate_folioq(struct iov_iter *iter, size_t le= n, void *priv, void *priv2 return progress; } =20 -/* - * Handle ITER_XARRAY. 
- */ -static __always_inline -size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void = *priv2, - iov_step_f step) -{ - struct folio *folio; - size_t progress =3D 0; - loff_t start =3D iter->xarray_start + iter->iov_offset; - pgoff_t index =3D start / PAGE_SIZE; - XA_STATE(xas, iter->xarray, index); - - rcu_read_lock(); - xas_for_each(&xas, folio, ULONG_MAX) { - size_t remain, consumed, offset, part, flen; - - if (xas_retry(&xas, folio)) - continue; - if (WARN_ON(xa_is_value(folio))) - break; - if (WARN_ON(folio_test_hugetlb(folio))) - break; - - offset =3D offset_in_folio(folio, start + progress); - flen =3D min(folio_size(folio) - offset, len); - - while (flen) { - void *base =3D kmap_local_folio(folio, offset); - - part =3D min_t(size_t, flen, - PAGE_SIZE - offset_in_page(offset)); - remain =3D step(base, progress, part, priv, priv2); - kunmap_local(base); - - consumed =3D part - remain; - progress +=3D consumed; - len -=3D consumed; - - if (remain || len =3D=3D 0) - goto out; - flen -=3D consumed; - offset +=3D consumed; - } - } - -out: - rcu_read_unlock(); - iter->iov_offset +=3D progress; - iter->count -=3D progress; - return progress; -} - -/* - * Handle ITER_DISCARD. - */ -static __always_inline -size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void= *priv2, - iov_step_f step) -{ - size_t progress =3D len; - - iter->count -=3D progress; - return progress; -} - /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. 
@@ -306,9 +243,7 @@ size_t iterate_and_advance2(struct iov_iter *iter, size= _t len, void *priv, return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); - if (iov_iter_is_xarray(iter)) - return iterate_xarray(iter, len, priv, priv2, step); - return iterate_discard(iter, len, priv, priv2, step); + return __iterate_and_advance2(iter, len, priv, priv2, ustep, step); } =20 /** @@ -370,9 +305,7 @@ size_t iterate_and_advance_kernel(struct iov_iter *iter= , size_t len, void *priv, return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); - if (iov_iter_is_xarray(iter)) - return iterate_xarray(iter, len, priv, priv2, step); - return iterate_discard(iter, len, priv, priv2, step); + return __iterate_and_advance2(iter, len, priv, priv2, NULL, step); } =20 #endif /* _LINUX_IOV_ITER_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 65f550cb5081..33a8746e593e 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1927,3 +1927,84 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); + +/* + * Handle ITER_XARRAY. 
+ */ +static __always_inline +size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void = *priv2, + iov_step_f step) +{ + struct folio *folio; + size_t progress =3D 0; + loff_t start =3D iter->xarray_start + iter->iov_offset; + pgoff_t index =3D start / PAGE_SIZE; + XA_STATE(xas, iter->xarray, index); + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + size_t remain, consumed, offset, part, flen; + + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + offset =3D offset_in_folio(folio, start + progress); + flen =3D min(folio_size(folio) - offset, len); + + while (flen) { + void *base =3D kmap_local_folio(folio, offset); + + part =3D min_t(size_t, flen, + PAGE_SIZE - offset_in_page(offset)); + remain =3D step(base, progress, part, priv, priv2); + kunmap_local(base); + + consumed =3D part - remain; + progress +=3D consumed; + len -=3D consumed; + + if (remain || len =3D=3D 0) + goto out; + flen -=3D consumed; + offset +=3D consumed; + } + } + +out: + rcu_read_unlock(); + iter->iov_offset +=3D progress; + iter->count -=3D progress; + return progress; +} + +/* + * Handle ITER_DISCARD. + */ +static __always_inline +size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void= *priv2, + iov_step_f step) +{ + size_t progress =3D len; + + iter->count -=3D progress; + return progress; +} + +/* + * Out of line iteration for iterator types that don't need such fast hand= ling. 
+ */ +size_t __iterate_and_advance2(struct iov_iter *iter, size_t len, void *pri= v, + void *priv2, iov_ustep_f ustep, iov_step_f step) +{ + if (iov_iter_is_discard(iter)) + return iterate_discard(iter, len, priv, priv2, step); + if (iov_iter_is_xarray(iter)) + return iterate_xarray(iter, len, priv, priv2, step); + WARN_ON(1); + return 0; +} +EXPORT_SYMBOL(__iterate_and_advance2); From nobody Wed Dec 17 10:21:24 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8FC6722B8CF for ; Fri, 21 Mar 2025 16:14:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.129.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573671; cv=none; b=Hiz42LD4geVQ7LDYCdu1KZaaWgHvASBPaCUawtguPBWLGbkGXmnQWKcxsX09vyqUkRAlIdIlULdDCGqNw9Gk0IfYQWN4l9ouO0KAG/v0XJL5viBzNrOBDZXm3HaRH57DS34AQfdZ9KBHlH6smOQQOn7hmFYkUVHQ2Atc5v9tYyE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573671; c=relaxed/simple; bh=l2b1GcwPJAEPhOwxBQ1cF95qrnUO7ROHoDw/IdBu/Z0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=OQ8dFvwhdm00Am8SqCe3hYNkHrUqDpUpQzu45Ajmj8AmVtuwjfO+gZWnihinF2ky+0/4EUIdKji7VzCOp/V/wONdQvqhU2jF8qP5OQVF2YmIz8RK2/+b7S++/j1/q4INlZFkYzK5l+YSdhiJWVavpU2DRrCIRtFqGQWzDcuhGU4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=hcX6hpdX; arc=none smtp.client-ip=170.10.129.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: 
smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="hcX6hpdX" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1742573668; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=HNZpwpQRDsQTXPWBbnZ/kr6+Jl8CRL/WggEaD7n3AVo=; b=hcX6hpdXEZP9prw7fNEM2kmCcD76tKAbr0EJRVzuTtYs/MWk9yEXV2s3/oflBMMQ/8WpxE erYM3+yeQqq7Y71qLZeoVdG2+Tw3t8OvPPPUgd4OjfOI2yvZuO9ZRQAPfOgEwgV0S6IT2v MndLIeCLybTa4BMg+UPE1VSDMJMuj9Q= Received: from mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com (ec2-35-165-154-97.us-west-2.compute.amazonaws.com [35.165.154.97]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-173-JD6l5jh6OGyM8ciJnzstOA-1; Fri, 21 Mar 2025 12:14:26 -0400 X-MC-Unique: JD6l5jh6OGyM8ciJnzstOA-1 X-Mimecast-MFC-AGG-ID: JD6l5jh6OGyM8ciJnzstOA_1742573665 Received: from mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com (mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.12]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS id A39031809CA5; Fri, 21 Mar 2025 16:14:24 +0000 (UTC) Received: from warthog.procyon.org.com (unknown [10.42.28.61]) by mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP id 1FA2B19373C4; Fri, 21 Mar 2025 16:14:19 +0000 (UTC) From: David Howells To: Leon Romanovsky Cc: David Howells , Christian Brauner , Matthew Wilcox , Chuck Lever , Steve French , Ilya Dryomov , netfs@lists.linux.dev, linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, Trond Myklebust Subject: [RFC 
PATCH 2/4] iov_iter: Add an iterator-of-iterators Date: Fri, 21 Mar 2025 16:14:02 +0000 Message-ID: <20250321161407.3333724-3-dhowells@redhat.com> In-Reply-To: <20250321161407.3333724-1-dhowells@redhat.com> References: <20250321161407.3333724-1-dhowells@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.0 on 10.30.177.12 Content-Type: text/plain; charset="utf-8" Add a new I/O iterator type, ITER_ITERLIST, that allows iteration over a series of I/O iterators, provided the iterators are all the same direction (all ITER_SOURCE or all ITER_DEST) and none of them are themselves ITER_ITERLIST (this function is recursive). To make reversion possible, I've added an 'orig_count' member into the iov_iter struct so that reversion of an ITER_ITERLIST can know when to move backwards through the iter list. It might make more sense to make the iterator list element, say: struct itervec { struct iov_iter iter; size_t orig_count; }; rather than expanding struct iov_iter itself and have iov_iter_iterlist() set vec[i].orig_count from vec[i].iter->count. Also, for the moment, I've only permitted its use with source iterators (e.g. sendmsg). To use this, you allocate an array of iterators and point the list iterator at it, e.g.: struct iov_iter iters[3]; struct msghdr msg; iov_iter_bvec(&iters[0], ITER_SOURCE, &head_bv, 1, sizeof(marker) + head->iov_len); iov_iter_xarray(&iters[1], ITER_SOURCE, xdr->pages, xdr->page_fpos, xdr->page_len); iov_iter_kvec(&iters[2], ITER_SOURCE, &tail_kv, 1, tail->iov_len); iov_iter_iterlist(&msg.msg_iter, ITER_SOURCE, iters, 3, size); This can be used by network filesystem protocols, such as sunrpc, to glue a header and a trailer on to some data to form a message and then dump the entire message onto the socket in a single go. [!] 
Note: I'm not entirely sure that this is a good idea: the problem is that it's reasonably common practice to copy an iterator by direct assignment - and that works for the existing iterators... but not this one. With the iterator-of-iterators, the list of iterators has to be modified if we recurse. It's probably fine just for calling sendmsg() from network filesystems, but I'm not 100% sure of that. Suggested-by: Trond Myklebust Signed-off-by: David Howells --- include/linux/uio.h | 15 +++++ lib/iov_iter.c | 158 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 8ada84e85447..59a586333e1b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -29,6 +29,7 @@ enum iter_type { ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, + ITER_ITERLIST, }; =20 #define ITER_SOURCE 1 // =3D=3D WRITE @@ -71,6 +72,7 @@ struct iov_iter { const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; + struct iov_iterlist *iterlist; }; size_t count; }; @@ -82,6 +84,11 @@ struct iov_iter { }; }; =20 +struct iov_iterlist { + struct iov_iter iter; + size_t orig_count; +}; + typedef __u16 uio_meta_flags_t; =20 struct uio_meta { @@ -149,6 +156,11 @@ static inline bool iov_iter_is_xarray(const struct iov= _iter *i) return iov_iter_type(i) =3D=3D ITER_XARRAY; } =20 +static inline bool iov_iter_is_iterlist(const struct iov_iter *i) +{ + return iov_iter_type(i) =3D=3D ITER_ITERLIST; +} + static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? 
WRITE : READ; @@ -302,6 +314,9 @@ void iov_iter_folio_queue(struct iov_iter *i, unsigned = int direction, unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xa= rray *xarray, loff_t start, size_t count); +void iov_iter_iterlist(struct iov_iter *i, unsigned int direction, + struct iov_iterlist *iterlist, unsigned long nr_segs, + size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 33a8746e593e..1d9190abfeb5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -578,6 +578,19 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -=3D size; + } else if (iov_iter_is_iterlist(i)) { + i->count -=3D size; + for (;;) { + size_t part =3D umin(size, i->iterlist->iter.count); + + if (part > 0) + iov_iter_advance(&i->iterlist->iter, part); + size -=3D part; + if (!size) + break; + i->iterlist++; + i->nr_segs--; + } } } EXPORT_SYMBOL(iov_iter_advance); @@ -608,6 +621,23 @@ static void iov_iter_folioq_revert(struct iov_iter *i,= size_t unroll) i->folioq =3D folioq; } =20 +static void iov_iter_revert_iterlist(struct iov_iter *i, size_t unroll) +{ + for (;;) { + struct iov_iterlist *il =3D i->iterlist; + + size_t part =3D umin(unroll, il->orig_count - il->iter.count); + + if (part > 0) + iov_iter_revert(&il->iter, part); + unroll -=3D part; + if (!unroll) + break; + i->iterlist--; + i->nr_segs++; + } +} + void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) @@ -617,6 +647,8 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) i->count +=3D unroll; if (unlikely(iov_iter_is_discard(i))) return; + if (unlikely(iov_iter_is_iterlist(i))) + return iov_iter_revert_iterlist(i, unroll); if (unroll 
<=3D i->iov_offset) { i->iov_offset -=3D unroll; return; @@ -663,6 +695,8 @@ EXPORT_SYMBOL(iov_iter_revert); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { + if (iov_iter_is_iterlist(i)) + i =3D &i->iterlist->iter; if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); @@ -787,6 +821,41 @@ void iov_iter_discard(struct iov_iter *i, unsigned int= direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); =20 +/** + * iov_iter_iterlist - Initialise an I/O iterator that is a list of iterat= ors + * @iter: The iterator to initialise. + * @direction: The direction of the transfer. + * @iterlist: The list of iterators + * @nr_segs: The number of elements in the list + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that walks over an array of other iterators. It= 's + * only available as a source iterator (for WRITE) and none of the iterato= rs in + * the array can be of ITER_ITERLIST type to prevent infinite recursion. 
+ */ +void iov_iter_iterlist(struct iov_iter *iter, unsigned int direction, + struct iov_iterlist *iterlist, unsigned long nr_segs, + size_t count) +{ + unsigned long i; + + BUG_ON(direction !=3D WRITE); + for (i =3D 0; i < nr_segs; i++) { + BUG_ON(iterlist[i].iter.iter_type =3D=3D ITER_ITERLIST); + BUG_ON(iterlist[i].iter.data_source !=3D direction); + iterlist[i].orig_count =3D iterlist[i].iter.count; + } + + *iter =3D (struct iov_iter){ + .iter_type =3D ITER_ITERLIST, + .data_source =3D true, + .count =3D count, + .iterlist =3D iterlist, + .nr_segs =3D nr_segs, + }; +} +EXPORT_SYMBOL(iov_iter_iterlist); + static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr= _mask, unsigned len_mask) { @@ -947,6 +1016,15 @@ unsigned long iov_iter_alignment(const struct iov_ite= r *i) if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; =20 + if (iov_iter_is_iterlist(i)) { + unsigned long align =3D 0; + unsigned int j; + + for (j =3D 0; j < i->nr_segs; j++) + align |=3D iov_iter_alignment(&i->iterlist[j].iter); + return align; + } + return 0; } EXPORT_SYMBOL(iov_iter_alignment); @@ -1206,6 +1284,18 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov= _iter *i, return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); + if (iov_iter_is_iterlist(i)) { + ssize_t size; + + while (!i->iterlist->iter.count) { + i->iterlist++; + i->nr_segs--; + } + size =3D __iov_iter_get_pages_alloc(&i->iterlist->iter, + pages, maxsize, maxpages, start); + i->count -=3D size; + return size; + } return -EFAULT; } =20 @@ -1274,6 +1364,21 @@ static int bvec_npages(const struct iov_iter *i, int= maxpages) return npages; } =20 +static int iterlist_npages(const struct iov_iter *i, int maxpages) +{ + const struct iov_iterlist *p; + ssize_t size =3D i->count; + int npages =3D 0; + + for (p =3D i->iterlist; size; p++) { + size -=3D p->iter.count; + npages +=3D 
iov_iter_npages(&p->iter, maxpages - npages); + if (unlikely(npages >=3D maxpages)) + return maxpages; + } + return npages; +} + int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) @@ -1298,6 +1403,8 @@ int iov_iter_npages(const struct iov_iter *i, int max= pages) int npages =3D DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } + if (iov_iter_is_iterlist(i)) + return iterlist_npages(i, maxpages); return 0; } EXPORT_SYMBOL(iov_iter_npages); @@ -1309,11 +1416,14 @@ const void *dup_iter(struct iov_iter *new, struct i= ov_iter *old, gfp_t flags) return new->bvec =3D kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); - else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) + if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov =3D kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); + if (WARN_ON_ONCE(iov_iter_is_iterlist(old))) + /* Don't allow dup'ing of iterlist as the cleanup is complicated */ + return NULL; return NULL; } EXPORT_SYMBOL(dup_iter); @@ -1924,6 +2034,23 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); + if (iov_iter_is_iterlist(i)) { + ssize_t size; + + while (i->nr_segs && !i->iterlist->iter.count) { + i->iterlist++; + i->nr_segs--; + } + if (!i->nr_segs) { + WARN_ON_ONCE(i->count); + return 0; + } + size =3D iov_iter_extract_pages(&i->iterlist->iter, + pages, maxsize, maxpages, + extraction_flags, offset0); + i->count -=3D size; + return size; + } return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); @@ -1994,6 +2121,33 @@ size_t iterate_discard(struct iov_iter *iter, size_t= len, void *priv, void *priv return progress; } =20 +/* + * Handle iteration over ITER_ITERLIST. 
+ */ +static size_t iterate_iterlist(struct iov_iter *iter, size_t len, void *pr= iv, void *priv2, + iov_ustep_f ustep, iov_step_f step) +{ + struct iov_iterlist *p =3D iter->iterlist; + size_t progress =3D 0; + + do { + size_t consumed; + + consumed =3D iterate_and_advance2(&p->iter, len, priv, priv2, ustep, ste= p); + + len -=3D consumed; + progress +=3D consumed; + if (p->iter.count) + break; + p++; + } while (len); + + iter->nr_segs -=3D p - iter->iterlist; + iter->iterlist =3D p; + iter->count -=3D progress; + return progress; +} + /* * Out of line iteration for iterator types that don't need such fast hand= ling. */ @@ -2004,6 +2158,8 @@ size_t __iterate_and_advance2(struct iov_iter *iter, = size_t len, void *priv, return iterate_discard(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); + if (iov_iter_is_iterlist(iter)) + return iterate_iterlist(iter, len, priv, priv2, ustep, step); WARN_ON(1); return 0; } From nobody Wed Dec 17 10:21:24 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4E8622D784 for ; Fri, 21 Mar 2025 16:14:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.129.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573675; cv=none; b=Ji8z08RIklyJGXkjWwVTPIFbgRCk74ZVilY/ZRL5BU3hX8oAXBU/7KOdVDA8sZJRToHEsxBwFJSzg6GhkVbgjgbMTbxzNArVHCEK1Tp2yzMP80MLM18HV+T64zXyL8S6sxVoVgLtrqncLqmgIJoLnsdQNr1fGdfaMPwHpAD3HyY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573675; c=relaxed/simple; bh=GqaOwhB3Z9lrdXIil/wS6IEs78AoHI886w0ijBcF1Nw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=r+mr5b1JtqS5EUO8kWAanLalX4/NnvSLt/COblnZ3usN3w16EbmLFnHHvzpsINPxP6MNuOw0oPgRwXaGQThbNjeAGF0lcfDUU6b5gBgSaNHGRKLQFpM4d/YdQSfyp6/JJgFhf7R9JlJmWmsmJdzFXNwInqI9YJpCP5HEmlWJDXE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=WZVSNQWJ; arc=none smtp.client-ip=170.10.129.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="WZVSNQWJ" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1742573673; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=xO8vXcEEPHEwzKEymAYWWkxy3bveqdMiRNK7ViMq3/8=; b=WZVSNQWJ1GYhTWoBObPT5sFEajJSQQYJn7hzqEyxt6Ko6k/ZGfDbtpuyKDstRGPmRNC4u/ zn/JhFP1hn+yyRabwovyCNm8EmUm/3LiF9M5IT76GY1DYSfaFf/jFjI0bhMb+Imc0ZnY3H pyTgni72H0gt5HtNbbqjimFoEFOITGk= Received: from mx-prod-mc-04.mail-002.prod.us-west-2.aws.redhat.com (ec2-54-186-198-63.us-west-2.compute.amazonaws.com [54.186.198.63]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-167-J_hiXevqOZKYdauQ2n8oLw-1; Fri, 21 Mar 2025 12:14:31 -0400 X-MC-Unique: J_hiXevqOZKYdauQ2n8oLw-1 X-Mimecast-MFC-AGG-ID: J_hiXevqOZKYdauQ2n8oLw_1742573670 Received: from mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com (mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.15]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) 
(No client certificate requested) by mx-prod-mc-04.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS id C2B541903081; Fri, 21 Mar 2025 16:14:29 +0000 (UTC) Received: from warthog.procyon.org.com (unknown [10.42.28.61]) by mx-prod-int-02.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP id 075461955BFE; Fri, 21 Mar 2025 16:14:25 +0000 (UTC) From: David Howells To: Leon Romanovsky Cc: David Howells , Christian Brauner , Matthew Wilcox , Chuck Lever , Steve French , Ilya Dryomov , netfs@lists.linux.dev, linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Subject: [RFC PATCH 3/4] iov_iter: Add a scatterlist iterator type Date: Fri, 21 Mar 2025 16:14:03 +0000 Message-ID: <20250321161407.3333724-4-dhowells@redhat.com> In-Reply-To: <20250321161407.3333724-1-dhowells@redhat.com> References: <20250321161407.3333724-1-dhowells@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.0 on 10.30.177.15 Content-Type: text/plain; charset="utf-8" Add an iterator type that can iterate over a scatterlist. This can be used as a bridge to help convert things that take scatterlists into things that take I/O iterators. 
Signed-off-by: David Howells --- include/linux/uio.h | 12 ++ lib/iov_iter.c | 315 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 321 insertions(+), 6 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 59a586333e1b..0e50f4af6877 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -12,6 +12,7 @@ =20 struct page; struct folio_queue; +struct scatterlist; =20 typedef unsigned int __bitwise iov_iter_extraction_t; =20 @@ -30,6 +31,7 @@ enum iter_type { ITER_XARRAY, ITER_DISCARD, ITER_ITERLIST, + ITER_SCATTERLIST, }; =20 #define ITER_SOURCE 1 // =3D=3D WRITE @@ -46,6 +48,7 @@ struct iov_iter { bool nofault; bool data_source; size_t iov_offset; + size_t orig_count; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type @@ -73,11 +76,13 @@ struct iov_iter { struct xarray *xarray; void __user *ubuf; struct iov_iterlist *iterlist; + struct scatterlist *sglist; }; size_t count; }; }; union { + struct scatterlist *sglist_head; unsigned long nr_segs; u8 folioq_slot; loff_t xarray_start; @@ -161,6 +166,11 @@ static inline bool iov_iter_is_iterlist(const struct i= ov_iter *i) return iov_iter_type(i) =3D=3D ITER_ITERLIST; } =20 +static inline bool iov_iter_is_scatterlist(const struct iov_iter *i) +{ + return iov_iter_type(i) =3D=3D ITER_SCATTERLIST; +} + static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? 
WRITE : READ; @@ -317,6 +327,8 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int d= irection, struct xarray * void iov_iter_iterlist(struct iov_iter *i, unsigned int direction, struct iov_iterlist *iterlist, unsigned long nr_segs, size_t count); +void iov_iter_scatterlist(struct iov_iter *i, unsigned int direction, + struct scatterlist *sglist, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1d9190abfeb5..ed9859af3c5d 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -562,6 +562,26 @@ static void iov_iter_folioq_advance(struct iov_iter *i= , size_t size) i->folioq =3D folioq; } =20 +static void iov_iter_scatterlist_advance(struct iov_iter *i, size_t size) +{ + struct scatterlist *sg; + + if (!i->count) + return; + i->count -=3D size; + + size +=3D i->iov_offset; + + for (sg =3D i->sglist; sg; sg_next(sg)) { + if (likely(size < sg->length)) + break; + size -=3D sg->length; + } + WARN_ON(!sg && size > 0); + i->iov_offset =3D size; + i->sglist =3D sg; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) @@ -591,6 +611,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size) i->iterlist++; i->nr_segs--; } + } else if (iov_iter_is_scatterlist(i)) { + iov_iter_scatterlist_advance(i, size); } } EXPORT_SYMBOL(iov_iter_advance); @@ -638,6 +660,15 @@ static void iov_iter_revert_iterlist(struct iov_iter *= i, size_t unroll) } } =20 +static void iov_iter_revert_scatterlist(struct iov_iter *i) +{ + size_t skip =3D i->orig_count - i->count; + + i->sglist =3D i->sglist_head; + i->count =3D i->orig_count; + iov_iter_advance(i, skip); +} + void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) @@ -649,6 +680,8 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) return; if 
(unlikely(iov_iter_is_iterlist(i))) return iov_iter_revert_iterlist(i, unroll); + if (unlikely(iov_iter_is_scatterlist(i))) + return iov_iter_revert_scatterlist(i); if (unroll <=3D i->iov_offset) { i->iov_offset -=3D unroll; return; @@ -706,6 +739,8 @@ size_t iov_iter_single_seg_count(const struct iov_iter = *i) if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); + if (unlikely(iov_iter_is_scatterlist(i))) + return !i->sglist ? 0 : umin(i->count, i->sglist->length - i->iov_offset= ); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); @@ -856,6 +891,33 @@ void iov_iter_iterlist(struct iov_iter *iter, unsigned= int direction, } EXPORT_SYMBOL(iov_iter_iterlist); =20 +/** + * iov_iter_scatterlist - Initialise an I/O iterator for a scatterlist cha= in + * @iter: The iterator to initialise. + * @direction: The direction of the transfer. + * @sglist: The head of the scatterlist + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that walks over a scatterlist. Because scatterl= ists + * can be chained and have no back pointers, reversion requires starting a= gain + * at the beginning and counting forwards. 
+ */ +void iov_iter_scatterlist(struct iov_iter *iter, unsigned int direction, + struct scatterlist *sglist, size_t count) +{ + WARN_ON(direction & ~(READ | WRITE)); + *iter =3D (struct iov_iter){ + .iter_type =3D ITER_SCATTERLIST, + .data_source =3D direction, + .sglist =3D sglist, + .sglist_head =3D sglist, + .iov_offset =3D 0, + .count =3D count, + .orig_count =3D count, + }; +} +EXPORT_SYMBOL(iov_iter_scatterlist); + static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr= _mask, unsigned len_mask) { @@ -994,6 +1056,26 @@ static unsigned long iov_iter_alignment_bvec(const st= ruct iov_iter *i) return res; } =20 +static unsigned long iov_iter_alignment_scatterlist(const struct iov_iter = *i) +{ + struct scatterlist *sg; + unsigned skip =3D i->iov_offset; + unsigned res =3D 0; + size_t size =3D i->count; + + for (sg =3D i->sglist; sg; sg =3D sg_next(sg)) { + size_t len =3D sg->length - skip; + res |=3D (unsigned long)sg->offset + skip; + if (len > size) + len =3D size; + res |=3D len; + size -=3D len; + skip =3D 0; + } while (size); + + return res; +} + unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { @@ -1024,6 +1106,8 @@ unsigned long iov_iter_alignment(const struct iov_ite= r *i) align |=3D iov_iter_alignment(&i->iterlist[j].iter); return align; } + if (iov_iter_is_scatterlist(i)) + return iov_iter_alignment_scatterlist(i); =20 return 0; } @@ -1058,13 +1142,8 @@ unsigned long iov_iter_gap_alignment(const struct io= v_iter *i) } EXPORT_SYMBOL(iov_iter_gap_alignment); =20 -static int want_pages_array(struct page ***res, size_t size, - size_t start, unsigned int maxpages) +static int __want_pages_array(struct page ***res, unsigned int count) { - unsigned int count =3D DIV_ROUND_UP(size + start, PAGE_SIZE); - - if (count > maxpages) - count =3D maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res =3D kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); @@ -1074,6 +1153,16 
@@ static int want_pages_array(struct page ***res, size_t size,
 	return count;
 }
 
+static int want_pages_array(struct page ***res, size_t size,
+			    size_t start, unsigned int maxpages)
+{
+	size_t count = DIV_ROUND_UP(size + start, PAGE_SIZE);
+
+	if (count > maxpages)
+		count = maxpages;
+	return __want_pages_array(res, count);
+}
+
 static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
 				     struct page ***ppages, size_t maxsize,
 				     unsigned maxpages, size_t *_start_offset)
@@ -1186,6 +1275,63 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
 	return maxsize;
 }
 
+/*
+ * Find the page, and the offset within it, at which an ITER_SCATTERLIST
+ * iterator currently points, clamping *size to what is left of the current
+ * segment.  Returns NULL if the iterator has no current segment.
+ */
+static struct page *first_scatterlist_segment(const struct iov_iter *i,
+					      size_t *size, size_t *start)
+{
+	struct scatterlist *sg = i->sglist;
+	struct page *page;
+	size_t skip = i->iov_offset, len;
+
+	if (!sg)
+		return NULL;
+
+	len = sg->length - skip;
+	if (*size > len)
+		*size = len;
+	skip += sg->offset;
+	page = sg_page(sg) + skip / PAGE_SIZE;
+	*start = skip % PAGE_SIZE;
+	return page;
+}
+
+/*
+ * Get references on the pages at the front of an ITER_SCATTERLIST iterator
+ * and advance it past them.  Only the current segment is considered, so the
+ * amount returned may be less than maxsize.
+ */
+static ssize_t iter_scatterlist_get_pages(struct iov_iter *i,
+					  struct page ***pages, size_t maxsize,
+					  unsigned maxpages, size_t *start)
+{
+	struct scatterlist *sg = i->sglist;
+	struct page **p, *page;
+	unsigned int n;
+
+	page = first_scatterlist_segment(i, &maxsize, start);
+	if (!page)
+		return -EFAULT;
+	n = want_pages_array(pages, maxsize, *start, maxpages);
+	if (!n)
+		return -ENOMEM;
+	p = *pages;
+	for (unsigned int k = 0; k < n; k++)
+		get_page(p[k] = page + k);
+	maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
+	i->count -= maxsize;
+	i->iov_offset += maxsize;
+	/* Step on to the next list entry once this one has been used up. */
+	if (i->iov_offset == sg->length) {
+		i->iov_offset = 0;
+		i->sglist = sg_next(sg);
+	}
+	return maxsize;
+}
+
 /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
 {
@@ -1296,6 +1442,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		i->count -= size;
 		return size;
 	}
+	if (iov_iter_is_scatterlist(i))
+		return
iter_scatterlist_get_pages(i, pages, maxsize, maxpages, start);
 	return -EFAULT;
 }
 
@@ -1379,6 +1516,29 @@ static int iterlist_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 
+/*
+ * Count the pages spanned by the data remaining in an ITER_SCATTERLIST
+ * iterator, capped at maxpages.
+ */
+static int scatterlist_npages(const struct iov_iter *i, int maxpages)
+{
+	struct scatterlist *sg;
+	size_t skip = i->iov_offset, size = i->count;
+	int npages = 0;
+
+	for (sg = i->sglist; sg && size; sg = sg_next(sg)) {
+		unsigned offs = (sg->offset + skip) % PAGE_SIZE;
+		size_t len = umin(sg->length - skip, size);
+
+		size -= len;
+		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
+		if (unlikely(npages > maxpages))
+			return maxpages;
+		skip = 0;
+	}
+	return npages;
+}
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
@@ -1405,6 +1565,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	}
 	if (iov_iter_is_iterlist(i))
 		return iterlist_npages(i, maxpages);
+	if (iov_iter_is_scatterlist(i))
+		return scatterlist_npages(i, maxpages);
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_npages);
@@ -1792,6 +1954,116 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
 	return maxsize;
 }
 
+/*
+ * Count the number of virtually contiguous pages in a scatterlist iterator
+ * from the current point.  A run can only carry on into the next segment if
+ * the previous one ended on a page boundary and the next one starts on one.
+ */
+static size_t count_scatterlist_contig_pages(const struct iov_iter *i,
+					     size_t maxpages, size_t maxsize)
+{
+	struct scatterlist *sg;
+	size_t npages = 0;
+	size_t skip = i->iov_offset, size = umin(i->count, maxsize);
+
+	for (sg = i->sglist; sg && size; sg = sg_next(sg)) {
+		size_t offs = (sg->offset + skip) % PAGE_SIZE;
+		size_t part = umin(sg->length - skip, size);
+
+		if (!part)
+			break;
+		/* A later segment that starts mid-page would leave a hole. */
+		if (npages && offs)
+			break;
+		size -= part;
+		npages += DIV_ROUND_UP(offs + part, PAGE_SIZE);
+		if (unlikely(npages > maxpages))
+			return maxpages;
+		if (((offs + part) % PAGE_SIZE) != 0)
+			break;
+		skip = 0;
+	}
+	return npages;
+}
+
+/*
+ * Extract a list of contiguous pages from an ITER_SCATTERLIST iterator.  This
+ * does not get references on the pages, nor does it get a pin on them.
+ *
+ * Returns the number of bytes covered by the extracted pages, 0 if there is
+ * nothing left to extract, or -ENOMEM if no page array could be obtained.
+ * *offset0 is set to the offset of the data within the first page.
+ */
+static ssize_t iov_iter_extract_scatterlist_pages(struct iov_iter *i,
+						  struct page ***pages, size_t maxsize,
+						  unsigned int maxpages,
+						  iov_iter_extraction_t extraction_flags,
+						  size_t *offset0)
+{
+	struct scatterlist *sg = i->sglist;
+	struct page **p;
+	size_t npages, skip, size = 0;
+	int nr = 0;
+
+	if (!sg)
+		return 0;
+
+	/* Step over any segments that have already been fully consumed. */
+	while (skip = i->iov_offset,
+	       skip == sg->length) {
+		sg = sg_next(sg);
+		i->sglist = sg;
+		i->iov_offset = 0;
+		if (!sg)
+			return 0;
+	}
+
+	npages = count_scatterlist_contig_pages(i, maxpages, maxsize);
+	if (!npages)
+		return 0;
+
+	maxpages = __want_pages_array(pages, npages);
+	if (!maxpages)
+		return -ENOMEM;
+	*offset0 = (sg->offset + skip) & ~PAGE_MASK;
+	p = *pages;
+
+	for (sg = i->sglist; sg; sg = sg_next(sg)) {
+		struct page *page = sg_page(sg);
+		size_t part = umin(sg->length - skip, maxsize);
+		size_t off = sg->offset + skip;
+
+		if (!part)
+			break;
+
+		page += off / PAGE_SIZE;
+		off %= PAGE_SIZE;
+
+		do {
+			size_t chunk = umin(part, PAGE_SIZE - off);
+
+			p[nr++] = page;
+			page++;
+			maxpages--;
+			maxsize -= chunk;
+			size += chunk;
+			skip += chunk;
+			part -= chunk;
+			off = 0;
+		} while (part && maxsize && maxpages);
+
+		/* Stop unless this segment ran up to a page boundary. */
+		if (((sg->offset + skip + part) % PAGE_SIZE) != 0)
+			break;
+		if (!maxsize || !maxpages)
+			break;
+		skip = 0;
+	}
+
+	iov_iter_advance(i, size);
+	return size;
+}
+
 /*
  * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
  * This does not get references on the pages, nor does it get a pin on them.
@@ -2051,6 +2310,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
 		i->count -= size;
 		return size;
 	}
+	if (iov_iter_is_scatterlist(i))
+		return iov_iter_extract_scatterlist_pages(i, pages, maxsize,
+							  maxpages, extraction_flags,
+							  offset0);
 	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
@@ -2148,6 +2411,47 @@ static size_t iterate_iterlist(struct iov_iter *iter, size_t len, void *priv, vo
 	return progress;
 }
 
+/*
+ * Handle iteration over ITER_SCATTERLIST.  Each span handed to the step
+ * function lies within a single page, which is mapped with kmap_local_page()
+ * for the duration of the call.  Stops early if the step function leaves
+ * anything unconsumed or the list runs out before len does.
+ */
+static size_t iterate_scatterlist(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+				  iov_step_f step)
+{
+	struct scatterlist *sg = iter->sglist;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	while (len && sg) {
+		struct page *page = sg_page(sg);
+		size_t remain, consumed;
+		size_t offset = sg->offset + skip, part;
+		void *kaddr = kmap_local_page(page + offset / PAGE_SIZE);
+
+		part = min3(len,
+			    (size_t)(sg->length - skip),
+			    (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
+		remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
+		kunmap_local(kaddr);
+		consumed = part - remain;
+		len -= consumed;
+		progress += consumed;
+		skip += consumed;
+		if (skip >= sg->length) {
+			skip = 0;
+			sg = sg_next(sg);
+		}
+		if (remain)
+			break;
+	}
+
+	iter->sglist = sg;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
 /*
  * Out of line iteration for iterator types that don't need such fast handling.
*/ @@ -2160,6 +2461,8 @@ size_t __iterate_and_advance2(struct iov_iter *iter, = size_t len, void *priv, return iterate_xarray(iter, len, priv, priv2, step); if (iov_iter_is_iterlist(iter)) return iterate_iterlist(iter, len, priv, priv2, ustep, step); + if (iov_iter_is_scatterlist(iter)) + return iterate_scatterlist(iter, len, priv, priv2, step); WARN_ON(1); return 0; } From nobody Wed Dec 17 10:21:24 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 421CC22DFA5 for ; Fri, 21 Mar 2025 16:14:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573686; cv=none; b=rA2okusg6LJfvIiJoNJahYzhE8AAtnmxaovZxcDWB+VPYKpbSgsMVz18lcqGxU7PH1UvN/hWCmJKOCBXJO56XOqvdKOjE6rEt7QFydYZS36gMODjKasGYmRBxJa46mCvnG0NMN/E+FFpBii6KjKkczJ4rQYDKoUNgoykBUzIKqQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1742573686; c=relaxed/simple; bh=QJbi407Gq8VLLgi9E0vdiZ8uX6C0oRfqhu+9aRZPznQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=aE1PBbPaaSb6QyzuJBJrCsBhb9ymC0IzDpl2dFfmzVUgdB0HT+B9A3v1k97hav6VTzwfN41sF7g4Jf0ZEzpRCdwy7tu9R3UNC0fKOgGqhMllNUaH77v0tH1UVM60BSJ8b4PcPymw8v4QAWkOSS7abm3WLsCOMA15yOfCzacwzsA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=HabGai7G; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: 
smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="HabGai7G" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1742573682; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ZSC5YwicpO03gYExoUotpiHs2EHAFn+TU50pqasUPmU=; b=HabGai7GfQgOh2CFTJWPaOqVh8LOD9JM14mKMs75kiV4u+agWMX+kqHPL2kjNSDHhycAb1 pPGgrJ0pprfGB3OjTJCCDNpQi+ExbTQ/PkbA0WLX+yfZKDuQLdBrfeth7/uaMgVG2++NOg Xhl8Gt4nkBGcchq93ROSHuj2VS3y0aE= Received: from mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com (ec2-35-165-154-97.us-west-2.compute.amazonaws.com [35.165.154.97]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-208-Ev37cor8PDCv9N0Af2ivaQ-1; Fri, 21 Mar 2025 12:14:37 -0400 X-MC-Unique: Ev37cor8PDCv9N0Af2ivaQ-1 X-Mimecast-MFC-AGG-ID: Ev37cor8PDCv9N0Af2ivaQ_1742573674 Received: from mx-prod-int-06.mail-002.prod.us-west-2.aws.redhat.com (mx-prod-int-06.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.93]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS id 3680C1801A07; Fri, 21 Mar 2025 16:14:34 +0000 (UTC) Received: from warthog.procyon.org.com (unknown [10.42.28.61]) by mx-prod-int-06.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP id ED872180174E; Fri, 21 Mar 2025 16:14:30 +0000 (UTC) From: David Howells To: Leon Romanovsky Cc: David Howells , Christian Brauner , Matthew Wilcox , Chuck Lever , Steve French , Ilya Dryomov , netfs@lists.linux.dev, linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Subject: [RFC PATCH 4/4] 
iov_iter: Add a scatterlist iterator type [INCOMPLETE] Date: Fri, 21 Mar 2025 16:14:04 +0000 Message-ID: <20250321161407.3333724-5-dhowells@redhat.com> In-Reply-To: <20250321161407.3333724-1-dhowells@redhat.com> References: <20250321161407.3333724-1-dhowells@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.30.177.93 Content-Type: text/plain; charset="utf-8" Add an iterator type that can iterate over a socket buffer. [!] Note this is not yet completely implemented and won't compile. Signed-off-by: David Howells --- include/linux/uio.h | 10 ++++ lib/iov_iter.c | 121 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/include/linux/uio.h b/include/linux/uio.h index 0e50f4af6877..87d6ba660489 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -13,6 +13,7 @@ struct page; struct folio_queue; struct scatterlist; +struct sk_buff; =20 typedef unsigned int __bitwise iov_iter_extraction_t; =20 @@ -32,6 +33,7 @@ enum iter_type { ITER_DISCARD, ITER_ITERLIST, ITER_SCATTERLIST, + ITER_SKBUFF, }; =20 #define ITER_SOURCE 1 // =3D=3D WRITE @@ -77,6 +79,7 @@ struct iov_iter { void __user *ubuf; struct iov_iterlist *iterlist; struct scatterlist *sglist; + const struct sk_buff *skb; }; size_t count; }; @@ -171,6 +174,11 @@ static inline bool iov_iter_is_scatterlist(const struc= t iov_iter *i) return iov_iter_type(i) =3D=3D ITER_SCATTERLIST; } =20 +static inline bool iov_iter_is_skbuff(const struct iov_iter *i) +{ + return iov_iter_type(i) =3D=3D ITER_SKBUFF; +} + static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? 
WRITE : READ; @@ -329,6 +337,8 @@ void iov_iter_iterlist(struct iov_iter *i, unsigned int= direction, size_t count); void iov_iter_scatterlist(struct iov_iter *i, unsigned int direction, struct scatterlist *sglist, size_t count); +void iov_iter_skbuff(struct iov_iter *i, unsigned int direction, + const struct sk_buff *skb, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index ed9859af3c5d..01215316d272 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -12,6 +12,7 @@ #include #include #include +#include =20 static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, @@ -918,6 +919,29 @@ void iov_iter_scatterlist(struct iov_iter *iter, unsig= ned int direction, } EXPORT_SYMBOL(iov_iter_scatterlist); =20 +/** + * iov_iter_skbuff - Initialise an I/O iterator for a socket buffer + * @iter: The iterator to initialise. + * @direction: The direction of the transfer. + * @skb: The socket buffer + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that walks over a socket buffer. 
+ */
+void iov_iter_skbuff(struct iov_iter *iter, unsigned int direction,
+		     const struct sk_buff *skb, size_t count)
+{
+	WARN_ON(direction & ~(READ | WRITE));
+	*iter = (struct iov_iter){
+		.iter_type = ITER_SKBUFF,
+		.data_source = direction,
+		.skb = skb,
+		.iov_offset = 0,
+		.count = count,
+	};
+}
+EXPORT_SYMBOL(iov_iter_skbuff);
+
 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 				   unsigned len_mask)
 {
@@ -2314,6 +2338,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
 		return iov_iter_extract_scatterlist_pages(i, pages, maxsize,
 							  maxpages, extraction_flags,
 							  offset0);
+	if (iov_iter_is_skbuff(i))
+		return iov_iter_extract_skbuff_pages(i, pages, maxsize,
+						     maxpages, extraction_flags,
+						     offset0);
 	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
@@ -2449,6 +2477,119 @@ static size_t iterate_scatterlist(struct iov_iter *iter, size_t len, void *priv,
 	return progress;
 }
 
+/* State carried through the (possibly recursive) walk over an sk_buff. */
+struct skbuff_iter_ctx {
+	iov_step_f	step;		/* Step function to call on each span */
+	size_t		progress;	/* Bytes the step function has consumed */
+	void		*priv;		/* Passed through to the step function */
+	void		*priv2;		/* Likewise */
+};
+
+/*
+ * Apply ctx->step to up to @len bytes of @skb starting @offset bytes in,
+ * visiting the linear head, then the page frags, then the frag list.
+ *
+ * Returns true if every span offered to the step function was consumed in
+ * full, false if the walk must stop because the step function came up short
+ * or the frag list is nested too deeply.  The amount actually processed is
+ * accumulated in ctx->progress.
+ */
+static bool iterate_skbuff_frag(const struct sk_buff *skb, struct skbuff_iter_ctx *ctx,
+				size_t offset, size_t len, int recursion_level)
+{
+	struct sk_buff *frag_iter;
+	size_t skip = offset, part, remain, consumed;
+
+	if (unlikely(recursion_level >= 24))
+		return false;
+
+	part = skb_headlen(skb);
+	if (skip < part) {
+		part = umin(part - skip, len);
+		remain = ctx->step(skb->data + skip, ctx->progress, part,
+				   ctx->priv, ctx->priv2);
+		consumed = part - remain;
+		ctx->progress += consumed;
+		len -= consumed;
+		if (remain)
+			return false;
+		if (!len)
+			return true;
+		skip = 0;
+	} else {
+		skip -= part;
+	}
+
+	for (int i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		size_t fsize = skb_frag_size(frag);
+
+		if (skip >= fsize) {
+			skip -= fsize;
+			continue;
+		}
+
+		/*
+		 * NOTE(review): skb_frag_address() needs the frag's page to
+		 * be mapped already; highmem frags would presumably want
+		 * mapping page by page as iterate_scatterlist() does - confirm.
+		 */
+		part = umin(fsize - skip, len);
+		remain = ctx->step(skb_frag_address(frag) + skip,
+				   ctx->progress, part, ctx->priv, ctx->priv2);
+		consumed = part - remain;
+		ctx->progress += consumed;
+		len -= consumed;
+		if (remain)
+			return false;
+		if (!len)
+			return true;
+		skip = 0;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		size_t fsize = frag_iter->len;
+		size_t before = ctx->progress;
+
+		if (skip >= fsize) {
+			skip -= fsize;
+			continue;
+		}
+
+		/* The offset handed down is relative to the child skb. */
+		part = umin(fsize - skip, len);
+		if (!iterate_skbuff_frag(frag_iter, ctx, skip, part,
+					 recursion_level + 1))
+			return false;
+		len -= ctx->progress - before;
+		if (!len)
+			return true;
+		skip = 0;
+	}
+	return true;
+}
+
+/*
+ * Handle iteration over ITER_SKBUFF.  Modelled on __skb_to_sgvec().  The
+ * iterator is advanced by however much the step function consumed.
+ */
+static size_t iterate_skbuff(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+			     iov_step_f step)
+{
+	struct skbuff_iter_ctx ctx = {
+		.step		= step,
+		.progress	= 0,
+		.priv		= priv,
+		.priv2		= priv2,
+	};
+
+	iterate_skbuff_frag(iter->skb, &ctx, iter->iov_offset, len, 0);
+
+	iter->iov_offset += ctx.progress;
+	iter->count -= ctx.progress;
+	return ctx.progress;
+}
+
 /*
  * Out of line iteration for iterator types that don't need such fast handling.
  */
@@ -2463,6 +2604,8 @@ size_t __iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
 		return iterate_iterlist(iter, len, priv, priv2, ustep, step);
 	if (iov_iter_is_scatterlist(iter))
 		return iterate_scatterlist(iter, len, priv, priv2, step);
+	if (iov_iter_is_skbuff(iter))
+		return iterate_skbuff(iter, len, priv, priv2, step);
 	WARN_ON(1);
 	return 0;
 }