Allow stacked devices to support atomic writes by aggregating the minimum
capability of all bottom devices.
Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which
have been enabled to support atomic writes.
Some things to note on the implementation:
- For simplicity, all bottom devices must have same atomic write boundary
value (if any)
- The atomic write boundary must be a power-of-2 already, but this
restriction could be relaxed. Furthermore, it is now required that the
chunk sectors for a top device must be aligned with this boundary.
- If a bottom device atomic write unit min/max are not aligned with the
top device chunk sectors, the top device atomic write unit min/max are
reduced to a value which works for the chunk sectors.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
block/blk-settings.c | 89 ++++++++++++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 4 ++
2 files changed, 93 insertions(+)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1642e65a6521..6a900ef86e5a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -496,6 +496,93 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb
return sectors;
}
+static void blk_stack_atomic_writes_limits(struct queue_limits *t, struct queue_limits *b)
+{
+ if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED))
+ goto unsupported;
+
+ if (!b->atomic_write_unit_min)
+ goto unsupported;
+
+ /*
+ * If atomic_write_hw_max is set, we have already stacked 1x bottom
+ * device, so check for compliance.
+ */
+ if (t->atomic_write_hw_max) {
+ /* We're not going to support different boundary sizes.. yet */
+ if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary)
+ goto unsupported;
+
+ /* Can't support this */
+ if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max)
+ goto unsupported;
+
+ /* Or this */
+ if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min)
+ goto unsupported;
+
+ t->atomic_write_hw_max = min(t->atomic_write_hw_max,
+ b->atomic_write_hw_max);
+ t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min,
+ b->atomic_write_hw_unit_min);
+ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+ b->atomic_write_hw_unit_max);
+ return;
+ }
+
+ /* Check first bottom device limits */
+ if (!b->atomic_write_hw_boundary)
+ goto check_unit;
+ /*
+ * Ensure atomic write boundary is aligned with chunk sectors. Stacked
+ * devices store chunk sectors in t->io_min.
+ */
+ if (b->atomic_write_hw_boundary > t->io_min &&
+ b->atomic_write_hw_boundary % t->io_min)
+ goto unsupported;
+ else if (t->io_min > b->atomic_write_hw_boundary &&
+ t->io_min % b->atomic_write_hw_boundary)
+ goto unsupported;
+
+ t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
+
+check_unit:
+ if (t->io_min <= SECTOR_SIZE) {
+ /* No chunk sectors, so use bottom device values directly */
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+ t->atomic_write_hw_max = b->atomic_write_hw_max;
+ return;
+ }
+
+ /*
+ * Find values for limits which work for chunk size.
+ * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
+ * size (t->io_min), as chunk size is not restricted to a power-of-2.
+ * So we need to find highest power-of-2 which works for the chunk
+ * size.
+ * As an example scenario, we could have b->unit_max = 16K and
+ * t->io_min = 24K. For this case, reduce t->unit_max to a value
+ * aligned with both limits, i.e. 8K in this example.
+ */
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ while (t->io_min % t->atomic_write_hw_unit_max)
+ t->atomic_write_hw_unit_max /= 2;
+
+ t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+ t->atomic_write_hw_unit_max);
+ t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+
+ return;
+
+unsupported:
+ t->atomic_write_hw_max = 0;
+ t->atomic_write_hw_unit_max = 0;
+ t->atomic_write_hw_unit_min = 0;
+ t->atomic_write_hw_boundary = 0;
+ t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED;
+}
+
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
@@ -656,6 +743,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
+ blk_stack_atomic_writes_limits(t, b);
+
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d0a52ed05e60..bcd78634f6f2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t;
#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
((__force blk_features_t)(1u << 15))
+/* stacked device can/does support atomic writes */
+#define BLK_FEAT_ATOMIC_WRITES_STACKED \
+ ((__force blk_features_t)(1u << 16))
+
/*
* Flags automatically inherited when stacking limits.
*/
--
2.31.1
On Wed, Oct 30, 2024 at 09:49:09AM +0000, John Garry wrote:
> Allow stacked devices to support atomic writes by aggregating the minimum
> capability of all bottom devices.
>
> Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which
> have been enabled to support atomic writes.
>
> Some things to note on the implementation:
> - For simplicity, all bottom devices must have same atomic write boundary
> value (if any)
> - The atomic write boundary must be a power-of-2 already, but this
> restriction could be relaxed. Furthermore, it is now required that the
> chunk sectors for a top device must be aligned with this boundary.
> - If a bottom device atomic write unit min/max are not aligned with the
> top device chunk sectors, the top device atomic write unit min/max are
> reduced to a value which works for the chunk sectors.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> block/blk-settings.c | 89 ++++++++++++++++++++++++++++++++++++++++++
> include/linux/blkdev.h | 4 ++
> 2 files changed, 93 insertions(+)
>
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 1642e65a6521..6a900ef86e5a 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -496,6 +496,93 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb
> return sectors;
> }
>
> +static void blk_stack_atomic_writes_limits(struct queue_limits *t, struct queue_limits *b)
Avoid the overly long line here.
> + if (t->atomic_write_hw_max) {
Maybe split this branch and the code for when it is not set into
separate helpers to keep the function to a size where it can be
easily understood?
> + /* Check first bottom device limits */
> + if (!b->atomic_write_hw_boundary)
> + goto check_unit;
> + /*
> + * Ensure atomic write boundary is aligned with chunk sectors. Stacked
> + * devices store chunk sectors in t->io_min.
> + */
> + if (b->atomic_write_hw_boundary > t->io_min &&
> + b->atomic_write_hw_boundary % t->io_min)
> + goto unsupported;
> + else if (t->io_min > b->atomic_write_hw_boundary &&
No need for the else here.
> + t->io_min % b->atomic_write_hw_boundary)
> + goto unsupported;
> +
> + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
> +
> +check_unit:
Maybe instead of the check_unit goto just move the checks between the
goto above and this into a branch?
Otherwise this looks conceptually fine to me.
On 30/10/2024 13:50, Christoph Hellwig wrote:
>>
>> +static void blk_stack_atomic_writes_limits(struct queue_limits *t, struct queue_limits *b)
> Avoid the overly long line here.
sure
>
>> + if (t->atomic_write_hw_max) {
> Maybe split this branch and the code for when it is not set into
> separate helpers to keep the function to a size where it can be
> easily understood?
I was trying to reduce indentation, but it does read a bit messy now, so
I can try break into a smaller function.
>
>> + /* Check first bottom device limits */
>> + if (!b->atomic_write_hw_boundary)
>> + goto check_unit;
>> + /*
>> + * Ensure atomic write boundary is aligned with chunk sectors. Stacked
>> + * devices store chunk sectors in t->io_min.
>> + */
>> + if (b->atomic_write_hw_boundary > t->io_min &&
>> + b->atomic_write_hw_boundary % t->io_min)
>> + goto unsupported;
>> + else if (t->io_min > b->atomic_write_hw_boundary &&
> No need for the else here.
>
>> + t->io_min % b->atomic_write_hw_boundary)
>> + goto unsupported;
>> +
>> + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
>> +
>> +check_unit:
> Maybe instead of the check_unit goto just move the checks between the
> goto above and this into a branch?
I'm not sure, but I can try to avoid using the "goto check_unit" just to
skip code.
>
> Otherwise this looks conceptually fine to me.
ok, thanks!
© 2016 - 2026 Red Hat, Inc.