From: Keith Busch <kbusch@kernel.org>
Instead of ensuring each vector is block size aligned while constructing
the bio, just ensure the entire size is aligned after it's built. This
makes it more flexible to accepting device valid io vectors that would
otherwise get rejected by alignment checks.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 60 +++++++++++++++++++++++++++++++++++------------------
1 file changed, 40 insertions(+), 20 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 92c512e876c8d..9ecd546c5b077 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1227,13 +1227,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
extraction_flags |= ITER_ALLOW_P2PDMA;
- /*
- * Each segment in the iov is required to be a block size multiple.
- * However, we may not be able to get the entire segment if it spans
- * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
- * result to ensure the bio's total size is correct. The remainder of
- * the iov data will be picked up in the next bio iteration.
- */
size = iov_iter_extract_pages(iter, &pages,
UINT_MAX - bio->bi_iter.bi_size,
nr_pages, extraction_flags, &offset);
@@ -1241,18 +1234,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return size ? size : -EFAULT;
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-
- if (bio->bi_bdev) {
- size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
- iov_iter_revert(iter, trim);
- size -= trim;
- }
-
- if (unlikely(!size)) {
- ret = -EFAULT;
- goto out;
- }
-
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
@@ -1297,6 +1278,43 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return ret;
}
+static inline void bio_revert(struct bio *bio, unsigned int nbytes)
+{
+ bio->bi_iter.bi_size -= nbytes;
+
+ while (nbytes) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (nbytes < bv->bv_len) {
+ bv->bv_len -= nbytes;
+ return;
+ }
+
+ bio_release_page(bio, bv->bv_page);
+ bio->bi_vcnt--;
+ nbytes -= bv->bv_len;
+ }
+}
+
+static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ size_t nbytes;
+
+ if (!bdev)
+ return 0;
+
+ nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
+ if (!nbytes)
+ return 0;
+
+ bio_revert(bio, nbytes);
+ iov_iter_revert(iter, nbytes);
+ if (!bio->bi_iter.bi_size)
+ return -EFAULT;
+ return 0;
+}
+
/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
@@ -1336,7 +1354,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- return bio->bi_vcnt ? 0 : ret;
+ if (bio->bi_vcnt)
+ return bio_align_to_lbs(bio, iter);
+ return ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
--
2.47.3
On 8/5/25 16:11, Keith Busch wrote: > From: Keith Busch <kbusch@kernel.org> > > Instead of ensuring each vector is block size aligned while constructing > the bio, just ensure the entire size is aligned after it's built. This > makes it more flexible to accepting device valid io vectors that would > otherwise get rejected by alignment checks. > > Signed-off-by: Keith Busch <kbusch@kernel.org> > --- > block/bio.c | 60 +++++++++++++++++++++++++++++++++++------------------ > 1 file changed, 40 insertions(+), 20 deletions(-) > Reviewed-by: Hannes Reinecke <hare@suse.de> Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Tue, Aug 05, 2025 at 07:11:18AM -0700, Keith Busch wrote: > +static inline void bio_revert(struct bio *bio, unsigned int nbytes) Can you add a little comment explaining what this code does? The name suggast it is similar to iov_iter_revert, but I'm not sure how similar it is intended to be. The direct poking into bi_io_vec suggest it can only be used by the I/O submitter, and the use of bio_release_page suggests it is closely tied to to bio_iov_iter_get_pages despite the very generic name. > +static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter) > +{ > + struct block_device *bdev = bio->bi_bdev; > + size_t nbytes; > + > + if (!bdev) > + return 0; So this is something horribly Kent put in where he failed to deal with review feedback and just fed his stuff to Linus, and I think we need to fix it when touching this code again. Assuming we want to support bio_iov_iter_get_pages on bios without bi_bdev set we simplify can't rely in looking at bdev_logical_block_size here, but instead need to pass it explicitly. Which honestly doesn't sound to bad, just add an explicit argument for the required alignment to bio_iov_iter_get_pages instead of trying to derive it. Which is actually going to be useful to reduce duplicate checks for file systems the require > LBA size alignment as well. > - return bio->bi_vcnt ? 0 : ret; > + if (bio->bi_vcnt) > + return bio_align_to_lbs(bio, iter); > + return ret; Nit, this would flow a little more easily by moving the error / exceptional case into the branch, i.e. if (!bio->bi_vcnt) return ret; return bio_align_to_lbs(bio, iter);
© 2016 - 2025 Red Hat, Inc.