drivers/scsi/scsi_transport_sas.c | 40 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-)
From: Ionut Nechita <ionut.nechita@windriver.com>
sas_host_setup() unconditionally sets shost->opt_sectors from
dma_opt_mapping_size(). When the IOMMU is disabled or in passthrough
mode and no DMA ops provide an opt_mapping_size callback,
dma_opt_mapping_size() returns min(dma_max_mapping_size(), SIZE_MAX)
which equals dma_max_mapping_size() — a hard upper bound, not an
optimization hint.
On a Dell PowerEdge R750 with mpt3sas (Broadcom SAS3816, FW 33.15.00.00)
and intel_iommu=off the following values are observed:
dma_opt_mapping_size() = dma_max_mapping_size() (no real hint)
shost->max_sectors = 32767
opt_sectors = min(32767, huge >> 9) = 32767
optimal_io_size = 32767 << 9 = 16776704
→ round_down(16776704, 4096) = 16773120
The SAS disk (SAMSUNG MZILT800HBHQ0D3) does not report an
Optimal Transfer Length in VPD page B0, so sdkp->opt_xfer_blocks remains 0.
sd_revalidate_disk() then uses min_not_zero(0, opt_sectors) = opt_sectors,
propagating the bogus value into the block device's optimal_io_size
(visible as OPT-IO = 16773120 in lsblk --topology).
mkfs.xfs picks up optimal_io_size and minimum_io_size and computes:
swidth = 16773120 / 4096 = 4095
sunit = 8192 / 4096 = 2
Since 4095 % 2 != 0, XFS rejects the geometry:
SB stripe unit sanity check failed
This makes it impossible to create XFS filesystems (e.g. for
/var/lib/docker) during system bootstrap.
Fix this by introducing a sas_dma_opt_sectors() helper that only returns
a non-zero opt_sectors when dma_opt_mapping_size() is strictly less than
dma_max_mapping_size(), indicating a genuine DMA optimization constraint
from an IOMMU or DMA ops backend. The helper also rounds the value down
to a power of two so that filesystem geometry calculations always produce
clean results. When the two DMA values are equal, no backend provided a
real hint, so opt_sectors stays at 0 ("no preference").
A WARN_ONCE guards against dma_opt_mapping_size() returning a value
larger than dma_max_mapping_size(), which would indicate a driver bug.
The return value uses min_t(unsigned int, ...) to avoid any potential
overflow when shifting the size_t opt value down to sectors.
Fixes: 4cbfca5f7750 ("scsi: scsi_transport_sas: cap shost opt_sectors according to DMA optimal limit")
Cc: stable@vger.kernel.org
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
drivers/scsi/scsi_transport_sas.c | 40 +++++++++++++++++++++++++++----
1 file changed, 36 insertions(+), 4 deletions(-)
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 12124f9d5ccd0..696627b6fe2c3 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/err.h>
+#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/blkdev.h>
@@ -222,6 +223,38 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
* SAS host attributes
*/
+/**
+ * sas_dma_opt_sectors - derive opt_sectors from DMA optimal mapping size
+ * @dma_dev: device to query DMA parameters for
+ * @max_sectors: upper bound from the host adapter
+ *
+ * When the DMA layer reports a genuine optimization constraint (i.e.
+ * dma_opt_mapping_size() < dma_max_mapping_size()), convert it to a
+ * sector count, round it down to a power of two so that filesystem
+ * geometry calculations stay sane, and cap it at @max_sectors.
+ *
+ * When the two values are equal no backend provided a real hint and
+ * the function returns 0 ("no preference").
+ */
+static unsigned int sas_dma_opt_sectors(struct device *dma_dev,
+ unsigned int max_sectors)
+{
+ size_t opt = dma_opt_mapping_size(dma_dev);
+ size_t max = dma_max_mapping_size(dma_dev);
+
+ if (WARN_ONCE(opt > max,
+ "dma_opt_mapping_size (%zu) > dma_max_mapping_size (%zu)\n",
+ opt, max))
+ return 0;
+
+ if (opt == max)
+ return 0;
+
+ opt = rounddown_pow_of_two(opt);
+
+ return min_t(unsigned int, opt >> SECTOR_SHIFT, max_sectors);
+}
+
static int sas_host_setup(struct transport_container *tc, struct device *dev,
struct device *cdev)
{
@@ -239,10 +272,9 @@ static int sas_host_setup(struct transport_container *tc, struct device *dev,
dev_printk(KERN_ERR, dev, "fail to a bsg device %d\n",
shost->host_no);
- if (dma_dev->dma_mask) {
- shost->opt_sectors = min_t(unsigned int, shost->max_sectors,
- dma_opt_mapping_size(dma_dev) >> SECTOR_SHIFT);
- }
+ if (dma_dev->dma_mask)
+ shost->opt_sectors =
+ sas_dma_opt_sectors(dma_dev, shost->max_sectors);
return 0;
}
--
2.53.0
On 3/19/26 17:39, Ionut Nechita (Wind River) wrote:
> +static unsigned int sas_dma_opt_sectors(struct device *dma_dev,
> + unsigned int max_sectors)
> +{
> + size_t opt = dma_opt_mapping_size(dma_dev);
> + size_t max = dma_max_mapping_size(dma_dev);
> +
> + if (WARN_ONCE(opt > max,
> + "dma_opt_mapping_size (%zu) > dma_max_mapping_size (%zu)\n",
> + opt, max))
> + return 0;
> +
> + if (opt == max)
> + return 0;
Why return 0 ? This is a valid case, so this should get through the alignment below.
> +
> + opt = rounddown_pow_of_two(opt);
> +
> + return min_t(unsigned int, opt >> SECTOR_SHIFT, max_sectors);
> +}
> +
--
Damien Le Moal
Western Digital Research
On Wed, 19 Mar 2026 11:07:00 +0000, Damien Le Moal wrote:
> Why return 0 ? This is a valid case, so this should get through the
> alignment below.
Hi Damien,
Thanks for the review.
The opt == max case is specifically the bug this patch fixes.
When the IOMMU is disabled or in passthrough mode and no DMA ops
provide an opt_mapping_size callback, dma_opt_mapping_size() falls
back to min(SIZE_MAX, dma_max_mapping_size()), which equals
dma_max_mapping_size(). So opt == max.
If we let that value through, rounddown_pow_of_two() produces a
huge power-of-two, and min_t() caps it at max_sectors (32767).
That gives opt_sectors = 32767, which is exactly the bogus value
that breaks mkfs.xfs:
swidth = 16773120 / 4096 = 4095
sunit = 8192 / 4096 = 2
4095 % 2 != 0 -> "SB stripe unit sanity check failed"
The key insight (from Robin Murphy's v1 review) is that when no
backend provides a real optimization constraint, the DMA core
returns the largest efficient size == the largest size. That is
correct DMA semantics, but it means opt == max signals "no
preference", not "the optimal size happens to equal the maximum".
Returning 0 in that case means "no preference", which leaves
opt_sectors at 0 and lets the disk's own geometry (or lack
thereof) determine the I/O size.
Regarding the Cc list: noted, I will trim it for v5 if needed.
Thanks,
Ionut
On Thu, 2026-03-19 at 22:43 +0200, Ionut Nechita (Wind River) wrote:
> On Wed, 19 Mar 2026 11:07:00 +0000, Damien Le Moal wrote:
> > Why return 0 ? This is a valid case, so this should get through the
> > alignment below.
>
> Hi Damien,
>
> Thanks for the review.
>
> The opt == max case is specifically the bug this patch fixes.
>
> When the IOMMU is disabled or in passthrough mode and no DMA ops
> provide an opt_mapping_size callback, dma_opt_mapping_size() falls
> back to min(SIZE_MAX, dma_max_mapping_size()), which equals
> dma_max_mapping_size(). So opt == max.
>
> If we let that value through, rounddown_pow_of_two() produces a
> huge power-of-two, and min_t() caps it at max_sectors (32767).
> That gives opt_sectors = 32767, which is exactly the bogus value
> that breaks mkfs.xfs:
>
> swidth = 16773120 / 4096 = 4095
> sunit = 8192 / 4096 = 2
> 4095 % 2 != 0 -> "SB stripe unit sanity check failed"
So if max_sectors is usually 32767 and this breaks xfs, why the final line:
> + return min_t(unsigned int, opt >> SECTOR_SHIFT,
> max_sectors);
because there are surely situations where the above max_sectors (32767)
comes back as the minimum or are you assuming opt >> SECTOR_SHIFT is
always less than max_sectors, in which case there's no need for min_t?
Additionally, I note that the new AI code review:
https://sashiko.dev/#/patchset/20260319083954.21056-1-ionut.nechita%40windriver.com
worries that if opt comes back as its "don't care" zero value then
rounddown_pow_of_two(opt) returns a bogus value.
Regards,
James
On 3/20/26 05:43, Ionut Nechita (Wind River) wrote:
> On Wed, 19 Mar 2026 11:07:00 +0000, Damien Le Moal wrote:
>> Why return 0 ? This is a valid case, so this should get through the
>> alignment below.
>
> Hi Damien,
>
> Thanks for the review.
>
> The opt == max case is specifically the bug this patch fixes.
>
> When the IOMMU is disabled or in passthrough mode and no DMA ops
> provide an opt_mapping_size callback, dma_opt_mapping_size() falls
> back to min(SIZE_MAX, dma_max_mapping_size()), which equals
> dma_max_mapping_size(). So opt == max.
>
> If we let that value through, rounddown_pow_of_two() produces a
> huge power-of-two, and min_t() caps it at max_sectors (32767).
> That gives opt_sectors = 32767, which is exactly the bogus value
> that breaks mkfs.xfs:
>
> swidth = 16773120 / 4096 = 4095
> sunit = 8192 / 4096 = 2
> 4095 % 2 != 0 -> "SB stripe unit sanity check failed"
>
> The key insight (from Robin Murphy's v1 review) is that when no
> backend provides a real optimization constraint, the DMA core
> returns the largest efficient size == the largest size. That is
> correct DMA semantics, but it means opt == max signals "no
> preference", not "the optimal size happens to equal the maximum".
>
> Returning 0 in that case means "no preference", which leaves
> opt_sectors at 0 and lets the disk's own geometry (or lack
> thereof) determine the I/O size.
Thanks for re-explaining this. The code needs to have all this explanation
as a comment so that we do not trip on this again.
>
> Regarding the Cc list: noted, I will trim it for v5 if needed.
>
> Thanks,
> Ionut
--
Damien Le Moal
Western Digital Research
On 3/19/26 17:39, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
>
> sas_host_setup() unconditionally sets shost->opt_sectors from
> dma_opt_mapping_size(). When the IOMMU is disabled or in passthrough
> mode and no DMA ops provide an opt_mapping_size callback,
> dma_opt_mapping_size() returns min(dma_max_mapping_size(), SIZE_MAX)
> which equals dma_max_mapping_size() — a hard upper bound, not an
> optimization hint.
Please reduce the distribution list. This is now a SCSI patch; it has nothing
to do with IOMMU or NVMe.
>
> On a Dell PowerEdge R750 with mpt3sas (Broadcom SAS3816, FW 33.15.00.00)
> and intel_iommu=off the following values are observed:
>
> dma_opt_mapping_size() = dma_max_mapping_size() (no real hint)
> shost->max_sectors = 32767
> opt_sectors = min(32767, huge >> 9) = 32767
> optimal_io_size = 32767 << 9 = 16776704
> → round_down(16776704, 4096) = 16773120
>
> The SAS disk (SAMSUNG MZILT800HBHQ0D3) do not report an
> Optimal Transfer Length in VPD page B0,so sdkp->opt_xfer_blocks remains 0.
> sd_revalidate_disk() then uses min_not_zero(0, opt_sectors) = opt_sectors,
> propagating the bogus value into the block device's optimal_io_size
> (visible as OPT-IO = 16773120 in lsblk --topology).
>
> mkfs.xfs picks up optimal_io_size and minimum_io_size and computes:
>
> swidth = 16773120 / 4096 = 4095
> sunit = 8192 / 4096 = 2
>
> Since 4095 % 2 != 0, XFS rejects the geometry:
>
> SB stripe unit sanity check failed
>
> This makes it impossible to create XFS filesystems (e.g. for
> /var/lib/docker) during system bootstrap.
>
> Fix this by introducing a sas_dma_opt_sectors() helper that only returns
> a non-zero opt_sectors when dma_opt_mapping_size() is strictly less than
> dma_max_mapping_size(), indicating a genuine DMA optimization constraint
> from an IOMMU or DMA ops backend. The helper also rounds the value down
> to a power of two so that filesystem geometry calculations always produce
> clean results. When the two DMA values are equal, no backend provided a
> real hint, so opt_sectors stays at 0 ("no preference").
>
> A WARN_ONCE guards against dma_opt_mapping_size() returning a value
> larger than dma_max_mapping_size(), which would indicate a driver bug.
> The return value uses min_t(unsigned int, ...) to avoid any potential
> overflow when shifting the size_t opt value down to sectors.
>
> Fixes: 4cbfca5f7750 ("scsi: scsi_transport_sas: cap shost opt_sectors according to DMA optimal limit")
> Cc: stable@vger.kernel.org
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
> drivers/scsi/scsi_transport_sas.c | 40 +++++++++++++++++++++++++++----
> 1 file changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
> index 12124f9d5ccd0..696627b6fe2c3 100644
> --- a/drivers/scsi/scsi_transport_sas.c
> +++ b/drivers/scsi/scsi_transport_sas.c
> @@ -27,6 +27,7 @@
> #include <linux/module.h>
> #include <linux/jiffies.h>
> #include <linux/err.h>
> +#include <linux/log2.h>
> #include <linux/slab.h>
> #include <linux/string.h>
> #include <linux/blkdev.h>
> @@ -222,6 +223,38 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
> * SAS host attributes
> */
>
> +/**
> + * sas_dma_opt_sectors - derive opt_sectors from DMA optimal mapping size
> + * @dma_dev: device to query DMA parameters for
> + * @max_sectors: upper bound from the host adapter
> + *
> + * When the DMA layer reports a genuine optimization constraint (i.e.
> + * dma_opt_mapping_size() < dma_max_mapping_size()), convert it to a
> + * sector count, round it down to a power of two so that filesystem
> + * geometry calculations stay sane, and cap it at @max_sectors.
> + *
> + * When the two values are equal no backend provided a real hint and
> + * the function returns 0 ("no preference").
> + */
> +static unsigned int sas_dma_opt_sectors(struct device *dma_dev,
> + unsigned int max_sectors)
> +{
> + size_t opt = dma_opt_mapping_size(dma_dev);
> + size_t max = dma_max_mapping_size(dma_dev);
> +
> + if (WARN_ONCE(opt > max,
> + "dma_opt_mapping_size (%zu) > dma_max_mapping_size (%zu)\n",
> + opt, max))
> + return 0;
> +
> + if (opt == max)
> + return 0;
> +
> + opt = rounddown_pow_of_two(opt);
> +
> + return min_t(unsigned int, opt >> SECTOR_SHIFT, max_sectors);
> +}
> +
> static int sas_host_setup(struct transport_container *tc, struct device *dev,
> struct device *cdev)
> {
> @@ -239,10 +272,9 @@ static int sas_host_setup(struct transport_container *tc, struct device *dev,
> dev_printk(KERN_ERR, dev, "fail to a bsg device %d\n",
> shost->host_no);
>
> - if (dma_dev->dma_mask) {
> - shost->opt_sectors = min_t(unsigned int, shost->max_sectors,
> - dma_opt_mapping_size(dma_dev) >> SECTOR_SHIFT);
> - }
> + if (dma_dev->dma_mask)
> + shost->opt_sectors =
> + sas_dma_opt_sectors(dma_dev, shost->max_sectors);
>
> return 0;
> }
--
Damien Le Moal
Western Digital Research
© 2016 - 2026 Red Hat, Inc.