[PATCH 2/7] block: align the bio after building it

Keith Busch posted 7 patches 2 months ago
[PATCH 2/7] block: align the bio after building it
Posted by Keith Busch 2 months ago
From: Keith Busch <kbusch@kernel.org>

Ensure the entire size is aligned after it's built instead of ensuring
each vector is block size aligned while constructing it. This makes it
more flexible to accepting device valid vectors that would otherwise get
rejected by overzealous alignment checks.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c | 58 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 92c512e876c8d..c050903e1be0c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1227,13 +1227,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
 		extraction_flags |= ITER_ALLOW_P2PDMA;
 
-	/*
-	 * Each segment in the iov is required to be a block size multiple.
-	 * However, we may not be able to get the entire segment if it spans
-	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
-	 * result to ensure the bio's total size is correct. The remainder of
-	 * the iov data will be picked up in the next bio iteration.
-	 */
 	size = iov_iter_extract_pages(iter, &pages,
 				      UINT_MAX - bio->bi_iter.bi_size,
 				      nr_pages, extraction_flags, &offset);
@@ -1241,18 +1234,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		return size ? size : -EFAULT;
 
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-
-	if (bio->bi_bdev) {
-		size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
-		iov_iter_revert(iter, trim);
-		size -= trim;
-	}
-
-	if (unlikely(!size)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
 	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
 		struct page *page = pages[i];
 		struct folio *folio = page_folio(page);
@@ -1297,6 +1278,44 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return ret;
 }
 
+static inline void bio_revert(struct bio *bio, unsigned int nbytes)
+{
+	bio->bi_iter.bi_size -= nbytes;
+
+	while (nbytes) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (nbytes < bv->bv_len) {
+			bv->bv_len -= nbytes;
+			return;
+		}
+
+		bio_release_page(bio, bv->bv_page);
+		bio->bi_vcnt--;
+		nbytes -= bv->bv_len;
+       }
+}
+
+static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
+{
+	struct block_device *bdev = bio->bi_bdev;
+	size_t nbytes;
+
+	if (!bdev)
+		return 0;
+
+	nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
+	if (!nbytes)
+		return 0;
+
+	bio_revert(bio, nbytes);
+	iov_iter_revert(iter, nbytes);
+	if (!bio->bi_iter.bi_size)
+		return -EFAULT;
+
+	return 0;
+}
+
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1336,6 +1355,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
+	ret = bio_align_to_lbs(bio, iter);
 	return bio->bi_vcnt ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
-- 
2.47.3
Re: [PATCH 2/7] block: align the bio after building it
Posted by Hannes Reinecke 2 months ago
On 8/2/25 01:47, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Ensure the entire size is aligned after it's built instead of ensuring
> each vector is block size aligned while constructing it. This makes it
> more flexible to accepting device valid vectors that would otherwise get
> rejected by overzealous alignment checks.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/bio.c | 58 +++++++++++++++++++++++++++++++++++------------------
>   1 file changed, 39 insertions(+), 19 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 92c512e876c8d..c050903e1be0c 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1227,13 +1227,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>   	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
>   		extraction_flags |= ITER_ALLOW_P2PDMA;
>   
> -	/*
> -	 * Each segment in the iov is required to be a block size multiple.
> -	 * However, we may not be able to get the entire segment if it spans
> -	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
> -	 * result to ensure the bio's total size is correct. The remainder of
> -	 * the iov data will be picked up in the next bio iteration.
> -	 */
>   	size = iov_iter_extract_pages(iter, &pages,
>   				      UINT_MAX - bio->bi_iter.bi_size,
>   				      nr_pages, extraction_flags, &offset);
> @@ -1241,18 +1234,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>   		return size ? size : -EFAULT;
>   
>   	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> -
> -	if (bio->bi_bdev) {
> -		size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
> -		iov_iter_revert(iter, trim);
> -		size -= trim;
> -	}
> -
> -	if (unlikely(!size)) {
> -		ret = -EFAULT;
> -		goto out;
> -	}
> -
>   	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
>   		struct page *page = pages[i];
>   		struct folio *folio = page_folio(page);
> @@ -1297,6 +1278,44 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>   	return ret;
>   }
>   
> +static inline void bio_revert(struct bio *bio, unsigned int nbytes)
> +{
> +	bio->bi_iter.bi_size -= nbytes;
> +
> +	while (nbytes) {
> +		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> +
> +		if (nbytes < bv->bv_len) {
> +			bv->bv_len -= nbytes;
> +			return;
> +		}
> +
> +		bio_release_page(bio, bv->bv_page);
> +		bio->bi_vcnt--;
> +		nbytes -= bv->bv_len;
> +       }
> +}
> +
> +static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
> +{
> +	struct block_device *bdev = bio->bi_bdev;
> +	size_t nbytes;
> +
> +	if (!bdev)
> +		return 0;
> +
> +	nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
> +	if (!nbytes)
> +		return 0;
> +
> +	bio_revert(bio, nbytes);
> +	iov_iter_revert(iter, nbytes);
> +	if (!bio->bi_iter.bi_size)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
>   /**
>    * bio_iov_iter_get_pages - add user or kernel pages to a bio
>    * @bio: bio to add pages to
> @@ -1336,6 +1355,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>   		ret = __bio_iov_iter_get_pages(bio, iter);
>   	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
>   
> +	ret = bio_align_to_lbs(bio, iter);
>   	return bio->bi_vcnt ? 0 : ret;

Wouldn't that cause the error from bio_align_to_lba() to be ignored
if bio->bi_vcnt is greater than 0?

>   }
>   EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

Cheers,

Hsnnes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH 2/7] block: align the bio after building it
Posted by Keith Busch 2 months ago
On Mon, Aug 04, 2025 at 08:54:00AM +0200, Hannes Reinecke wrote:
> On 8/2/25 01:47, Keith Busch wrote:
> > +static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
> > +{
> > +	struct block_device *bdev = bio->bi_bdev;
> > +	size_t nbytes;
> > +
> > +	if (!bdev)
> > +		return 0;
> > +
> > +	nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
> > +	if (!nbytes)
> > +		return 0;
> > +
> > +	bio_revert(bio, nbytes);
> > +	iov_iter_revert(iter, nbytes);
> > +	if (!bio->bi_iter.bi_size)
> > +		return -EFAULT;
> > +
> > +	return 0;
> > +}
> > +
> >   /**
> >    * bio_iov_iter_get_pages - add user or kernel pages to a bio
> >    * @bio: bio to add pages to
> > @@ -1336,6 +1355,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >   		ret = __bio_iov_iter_get_pages(bio, iter);
> >   	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
> > +	ret = bio_align_to_lbs(bio, iter);
> >   	return bio->bi_vcnt ? 0 : ret;
> 
> Wouldn't that cause the error from bio_align_to_lba() to be ignored
> if bio->bi_vcnt is greater than 0?

That returns an error only if the alignment reduces the size to 0, so
there would be a bug somewhere if bi_vcnt is not also 0 in that case.
Re: [PATCH 2/7] block: align the bio after building it
Posted by Keith Busch 2 months ago
On Mon, Aug 04, 2025 at 08:08:38AM -0600, Keith Busch wrote:
> On Mon, Aug 04, 2025 at 08:54:00AM +0200, Hannes Reinecke wrote:
> > On 8/2/25 01:47, Keith Busch wrote:
> > > +static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
> > > +{
> > > +	struct block_device *bdev = bio->bi_bdev;
> > > +	size_t nbytes;
> > > +
> > > +	if (!bdev)
> > > +		return 0;
> > > +
> > > +	nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
> > > +	if (!nbytes)
> > > +		return 0;
> > > +
> > > +	bio_revert(bio, nbytes);
> > > +	iov_iter_revert(iter, nbytes);
> > > +	if (!bio->bi_iter.bi_size)
> > > +		return -EFAULT;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >   /**
> > >    * bio_iov_iter_get_pages - add user or kernel pages to a bio
> > >    * @bio: bio to add pages to
> > > @@ -1336,6 +1355,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >   		ret = __bio_iov_iter_get_pages(bio, iter);
> > >   	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
> > > +	ret = bio_align_to_lbs(bio, iter);
> > >   	return bio->bi_vcnt ? 0 : ret;
> > 
> > Wouldn't that cause the error from bio_align_to_lba() to be ignored
> > if bio->bi_vcnt is greater than 0?
> 
> That returns an error only if the alignment reduces the size to 0, so
> there would be a bug somewhere if bi_vcnt is not also 0 in that case.

But there is definitely a problem the other-way-around: if bi_vcnt is
already 0 because the first vector was a bad address, then my patch here
is mistakenly overriding 'ret' to indicate success when it wasn't.
Re: [PATCH 2/7] block: align the bio after building it
Posted by Hannes Reinecke 2 months ago
On 8/4/25 16:08, Keith Busch wrote:
> On Mon, Aug 04, 2025 at 08:54:00AM +0200, Hannes Reinecke wrote:
>> On 8/2/25 01:47, Keith Busch wrote:
>>> +static int bio_align_to_lbs(struct bio *bio, struct iov_iter *iter)
>>> +{
>>> +	struct block_device *bdev = bio->bi_bdev;
>>> +	size_t nbytes;
>>> +
>>> +	if (!bdev)
>>> +		return 0;
>>> +
>>> +	nbytes = bio->bi_iter.bi_size & (bdev_logical_block_size(bdev) - 1);
>>> +	if (!nbytes)
>>> +		return 0;
>>> +
>>> +	bio_revert(bio, nbytes);
>>> +	iov_iter_revert(iter, nbytes);
>>> +	if (!bio->bi_iter.bi_size)
>>> +		return -EFAULT;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>    /**
>>>     * bio_iov_iter_get_pages - add user or kernel pages to a bio
>>>     * @bio: bio to add pages to
>>> @@ -1336,6 +1355,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>>    		ret = __bio_iov_iter_get_pages(bio, iter);
>>>    	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
>>> +	ret = bio_align_to_lbs(bio, iter);
>>>    	return bio->bi_vcnt ? 0 : ret;
>>
>> Wouldn't that cause the error from bio_align_to_lba() to be ignored
>> if bio->bi_vcnt is greater than 0?
> 
> That returns an error only if the alignment reduces the size to 0, so
> there would be a bug somewhere if bi_vcnt is not also 0 in that case.

It would, but we wouldn't be seeing it as 'ret' would be obscured
if 'bio->bi_vcnt' continues to be greater than zero and 'ret' is set.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich