Add support for the rotalloc allocation policy as a new mount
option. The policy rotates the starting block group for new allocations.
Changes:
- fs/ext4/ext4.h
rotalloc policy declared, extend sb with cursor, vector & lock
- fs/ext4/mballoc.h
expose allocator functions for vectoring in super.c
- fs/ext4/super.c
parse rotalloc mnt opt, init cursor, lock and allocator vector
- fs/ext4/mballoc.c
add rotalloc allocator, vectored allocator call in new_blocks
The policy is selected via a mount option and does not change the
on-disk format or default allocation behavior. It preserves existing
allocation heuristics within a block group while distributing
allocations across block groups in a deterministic sequential manner.
The rotating allocator is implemented as a separate allocation path
selected at mount time. This avoids conditional branches in the regular
allocator and keeps allocation policies isolated.
This also allows the rotating allocator to evolve independently in the
future without increasing complexity in the regular allocator.
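As a rough illustration only (toy names, standalone userspace C, not the
kernel code in the diff below), the mount-time vectoring works like this:

/*
 * Minimal userspace sketch of the mount-time vectoring described above.
 * This is NOT the kernel code; all names (toy_sb_info, toy_fill_super,
 * ...) are illustrative. The point is that the policy is chosen once at
 * "mount" time, so the allocation call site carries no per-call branch.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_alloc_ctx {
	unsigned int goal_group;
};

struct toy_sb_info {
	unsigned int rotalloc_cursor;
	int (*new_blocks)(struct toy_sb_info *sbi, struct toy_alloc_ctx *ac);
};

/* default policy: leave the caller's goal group untouched */
static int toy_regular_allocator(struct toy_sb_info *sbi,
				 struct toy_alloc_ctx *ac)
{
	(void)sbi;
	(void)ac;
	return 0;
}

/* rotalloc policy: start scanning at the mount-wide cursor */
static int toy_rotating_allocator(struct toy_sb_info *sbi,
				  struct toy_alloc_ctx *ac)
{
	ac->goal_group = sbi->rotalloc_cursor;
	return 0;
}

/* "mount time": select the allocator vector exactly once */
static void toy_fill_super(struct toy_sb_info *sbi, bool rotalloc)
{
	sbi->rotalloc_cursor = 0;
	sbi->new_blocks = rotalloc ? toy_rotating_allocator
				   : toy_regular_allocator;
}

int main(void)
{
	struct toy_sb_info sbi;
	struct toy_alloc_ctx ac = { .goal_group = 42 };

	toy_fill_super(&sbi, true);
	sbi.new_blocks(&sbi, &ac);	/* no policy branch at the call site */
	printf("starting group: %u\n", ac.goal_group);
	return 0;
}

With the patch applied, the policy is opted into per mount, e.g.
"mount -o rotalloc <device> <mountpoint>"; without the option the vector
points at the regular allocator and behavior is unchanged.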
The policy was tested locally on v6.18.6 stable with the new mount
option "rotalloc" enabled, and confirmed working as described.
Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
---
fs/ext4/ext4.h | 8 +++
fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
fs/ext4/mballoc.h | 3 +
fs/ext4/super.c | 18 +++++-
4 files changed, 175 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56112f201cac..cbbb7c05d7a2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -229,6 +229,9 @@ struct ext4_allocation_request {
unsigned int flags;
};
+/* expose rotalloc allocator argument pointer type */
+struct ext4_allocation_context;
+
/*
* Logical to physical block mapping, used by ext4_map_blocks()
*
@@ -1230,6 +1233,7 @@ struct ext4_inode_info {
* Mount flags set via mount options or defaults
*/
#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
+#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -1559,6 +1563,10 @@ struct ext4_sb_info {
unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
unsigned int s_def_mount_opt2;
+ /* Rotalloc cursor, lock & new_blocks allocator vector */
+ unsigned int s_rotalloc_cursor;
+ spinlock_t s_rotalloc_lock;
+ int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
kuid_t s_resuid;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 56d50fd3310b..74f79652c674 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
* stop the scan and use it immediately
*
* * If free extent found is smaller than goal, then keep retrying
- * upto a max of sbi->s_mb_max_to_scan times (default 200). After
+ * up to a max of sbi->s_mb_max_to_scan times (default 200). After
* that stop scanning and use whatever we have.
*
* * If free extent found is bigger than goal, then keep retrying
- * upto a max of sbi->s_mb_min_to_scan times (default 10) before
+ * up to a max of sbi->s_mb_min_to_scan times (default 10) before
* stopping the scan and using the extent.
*
*
@@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
return ret;
}
-static noinline_for_stack int
+noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t i;
@@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* is greater than equal to the sbi_s_mb_order2_reqs
* You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
* We also support searching for power-of-two requests only for
- * requests upto maximum buddy size we have constructed.
+ * requests up to maximum buddy size we have constructed.
*/
if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
if (is_power_of_2(ac->ac_g_ex.fe_len))
@@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
return err;
}
+/* Rotating allocator (rotalloc mount option) */
+noinline_for_stack int
+ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
+{
+ ext4_group_t i, goal;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+
+ BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+ /* Set the goal from s_rotalloc_cursor */
+ spin_lock(&sbi->s_rotalloc_lock);
+ goal = sbi->s_rotalloc_cursor;
+ spin_unlock(&sbi->s_rotalloc_lock);
+ ac->ac_g_ex.fe_group = goal;
+
+ /* first, try the goal */
+ err = ext4_mb_find_by_goal(ac, &e4b);
+ if (err || ac->ac_status == AC_STATUS_FOUND)
+ goto out;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ goto out;
+
+ /*
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
+ * so that we try exact allocation using buddy.
+ */
+ i = fls(ac->ac_g_ex.fe_len);
+ ac->ac_2order = 0;
+ /*
+ * We search using buddy data only if the order of the request
+ * is greater than equal to the sbi_s_mb_order2_reqs
+ * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
+ * We also support searching for power-of-two requests only for
+ * requests up to maximum buddy size we have constructed.
+ */
+ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
+ if (is_power_of_2(ac->ac_g_ex.fe_len))
+ ac->ac_2order = array_index_nospec(i - 1,
+ MB_NUM_ORDERS(sb));
+ }
+
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
+ }
+
+ /*
+ * Let's just scan groups to find more-less suitable blocks We
+ * start with CR_GOAL_LEN_FAST, unless it is power of 2
+ * aligned, in which case let's do that faster approach first.
+ */
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ if (ac->ac_2order)
+ ac->ac_criteria = CR_POWER2_ALIGNED;
+
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+
+ /* Be sure to start scanning with goal from s_rotalloc_cursor! */
+ ac->ac_g_ex.fe_group = goal;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
+
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+
+ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+ !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ */
+ ext4_mb_try_best_found(ac, &e4b);
+ if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
+ /*
+ * Someone more lucky has already allocated it.
+ * The only thing we can do is just take first
+ * found block(s)
+ */
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ ac->ac_flags |= EXT4_MB_HINT_FIRST;
+ ac->ac_criteria = CR_ANY_FREE;
+ goto repeat;
+ }
+ }
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
+out:
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
+
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, ac->ac_criteria, err);
+
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
+
+ if (!err) {
+ /* Finally, if no errors, set the cursor to the best group! */
+ goal = ac->ac_b_ex.fe_group;
+ spin_lock(&sbi->s_rotalloc_lock);
+ sbi->s_rotalloc_cursor = goal;
+ spin_unlock(&sbi->s_rotalloc_lock);
+ }
+
+ return err;
+}
+
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = pde_data(file_inode(seq->file));
@@ -6314,7 +6452,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto errout;
repeat:
/* allocate space in core */
- *errp = ext4_mb_regular_allocator(ac);
+ /*
+ * Use the vectored allocator instead of the fixed
+ * ext4_mb_regular_allocator(ac) function
+ */
+ *errp = sbi->s_mb_new_blocks(ac);
/*
* pa allocated above is added to grp->bb_prealloc_list only
* when we were able to allocate some block i.e. when
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 15a049f05d04..309190ce05ae 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -270,4 +270,7 @@ ext4_mballoc_query_range(
ext4_mballoc_query_range_fn formatter,
void *priv);
+/* Expose rotating & regular allocators for vectoring */
+int ext4_mb_rotating_allocator(struct ext4_allocation_context *ac);
+int ext4_mb_regular_allocator(struct ext4_allocation_context *ac);
#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 87205660c5d0..f53501bbfb4b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1673,7 +1673,7 @@ enum {
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
- Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_rotalloc,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
@@ -1797,6 +1797,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_u32 ("init_itable", Opt_init_itable),
fsparam_flag ("init_itable", Opt_init_itable),
fsparam_flag ("noinit_itable", Opt_noinit_itable),
+ fsparam_flag ("rotalloc", Opt_rotalloc),
#ifdef CONFIG_EXT4_DEBUG
fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
@@ -1878,6 +1879,7 @@ static const struct mount_opts {
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ {Opt_rotalloc, EXT4_MOUNT_ROTALLOC, MOPT_SET},
{Opt_dax_type, 0, MOPT_EXT4_ONLY},
{Opt_journal_dev, 0, MOPT_NO_EXT2},
{Opt_journal_path, 0, MOPT_NO_EXT2},
@@ -2264,6 +2266,9 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->s_li_wait_mult = result.uint_32;
ctx->spec |= EXT4_SPEC_s_li_wait_mult;
return 0;
+ case Opt_rotalloc:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_ROTALLOC);
+ return 0;
case Opt_max_dir_size_kb:
ctx->s_max_dir_size_kb = result.uint_32;
ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
@@ -5512,6 +5517,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
}
+ /*
+ * Initialize rotalloc cursor, lock and
+ * vector new_blocks to the rotating or regular allocator
+ */
+ sbi->s_rotalloc_cursor = 0;
+ spin_lock_init(&sbi->s_rotalloc_lock);
+ if (test_opt(sb, ROTALLOC))
+ sbi->s_mb_new_blocks = ext4_mb_rotating_allocator;
+ else
+ sbi->s_mb_new_blocks = ext4_mb_regular_allocator;
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
--
2.52.0
On 2026-02-04 11:31, Mario Lohajner wrote:
> Add support for the rotalloc allocation policy as a new mount
> option. Policy rotates the starting block group for new allocations.
>
> Changes:
> - fs/ext4/ext4.h
> rotalloc policy dedlared, extend sb with cursor, vector & lock
>
> - fs/ext4/mballoc.h
> expose allocator functions for vectoring in super.c
>
> - fs/ext4/super.c
> parse rotalloc mnt opt, init cursor, lock and allocator vector
>
> - fs/ext4/mballoc.c
> add rotalloc allocator, vectored allocator call in new_blocks
>
> The policy is selected via a mount option and does not change the
> on-disk format or default allocation behavior. It preserves existing
> allocation heuristics within a block group while distributing
> allocations across block groups in a deterministic sequential manner.
>
> The rotating allocator is implemented as a separate allocation path
> selected at mount time. This avoids conditional branches in the regular
> allocator and keeps allocation policies isolated.
> This also allows the rotating allocator to evolve independently in the
> future without increasing complexity in the regular allocator.
>
> The policy was tested using v6.18.6 stable locally with the new mount
> option "rotalloc" enabled, confirmed working as desribed!
>
> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
> ---
> fs/ext4/ext4.h | 8 +++
> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/ext4/mballoc.h | 3 +
> fs/ext4/super.c | 18 +++++-
> 4 files changed, 175 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56112f201cac..cbbb7c05d7a2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
> unsigned int flags;
> };
>
> +/* expose rotalloc allocator argument pointer type */
> +struct ext4_allocation_context;
> +
> /*
> * Logical to physical block mapping, used by ext4_map_blocks()
> *
> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
> * Mount flags set via mount options or defaults
> */
> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
> unsigned long s_mount_flags;
> unsigned int s_def_mount_opt;
> unsigned int s_def_mount_opt2;
> + /* Rotalloc cursor, lock & new_blocks allocator vector */
> + unsigned int s_rotalloc_cursor;
> + spinlock_t s_rotalloc_lock;
> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
> ext4_fsblk_t s_sb_block;
> atomic64_t s_resv_clusters;
> kuid_t s_resuid;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 56d50fd3310b..74f79652c674 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
> * stop the scan and use it immediately
> *
> * * If free extent found is smaller than goal, then keep retrying
> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
> * that stop scanning and use whatever we have.
> *
> * * If free extent found is bigger than goal, then keep retrying
> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
> * stopping the scan and using the extent.
> *
> *
> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
> return ret;
> }
>
> -static noinline_for_stack int
> +noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> ext4_group_t i;
> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> * is greater than equal to the sbi_s_mb_order2_reqs
> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> * We also support searching for power-of-two requests only for
> - * requests upto maximum buddy size we have constructed.
> + * requests up to maximum buddy size we have constructed.
> */
> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> if (is_power_of_2(ac->ac_g_ex.fe_len))
> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> return err;
> }
>
> +/* Rotating allocator (rotalloc mount option) */
> +noinline_for_stack int
> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
> +{
> + ext4_group_t i, goal;
> + int err = 0;
> + struct super_block *sb = ac->ac_sb;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_buddy e4b;
> +
> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> + /* Set the goal from s_rotalloc_cursor */
> + spin_lock(&sbi->s_rotalloc_lock);
> + goal = sbi->s_rotalloc_cursor;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + ac->ac_g_ex.fe_group = goal;
> +
> + /* first, try the goal */
> + err = ext4_mb_find_by_goal(ac, &e4b);
> + if (err || ac->ac_status == AC_STATUS_FOUND)
> + goto out;
> +
> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> + goto out;
> +
> + /*
> + * ac->ac_2order is set only if the fe_len is a power of 2
> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
> + * so that we try exact allocation using buddy.
> + */
> + i = fls(ac->ac_g_ex.fe_len);
> + ac->ac_2order = 0;
> + /*
> + * We search using buddy data only if the order of the request
> + * is greater than equal to the sbi_s_mb_order2_reqs
> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> + * We also support searching for power-of-two requests only for
> + * requests up to maximum buddy size we have constructed.
> + */
> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> + if (is_power_of_2(ac->ac_g_ex.fe_len))
> + ac->ac_2order = array_index_nospec(i - 1,
> + MB_NUM_ORDERS(sb));
> + }
> +
> + /* if stream allocation is enabled, use global goal */
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
> +
> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
> + ac->ac_g_ex.fe_start = -1;
> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
Rotating block allocation looks a lot like stream allocation—they both
pick up from where the last successful allocation left off.
I noticed that the stream allocation's global goal is now split up.
Is there an advantage to keeping it as a single goal?
Alternatively, do you see any downsides to this split in your use case?
Hello Baokun Li,
This response was originally intended for Andreas.
I'm sending you the full copy to provide context for your query,
rather than writing a separate response.
Yes, the main motive for this allocator is flash wear leveling,
but it is not strictly a wear leveling mechanism, and it is not named
as such for a reason.
Wear leveling may (or may not) exist at the device/hardware level.
The goal of this policy is not to "fix" that.
This policy helps avoid allocation hotspots at mount start by
distributing allocations sequentially across the entire mount,
not just a file or allocation stream.
At the block/group allocation level, the file system is fairly stochastic
and timing-sensitive. Rather than providing raw benchmark data, I prefer
to explain the design analytically:
The vectored separation of the new allocator ensures that the performance
of the regular allocator is maintained (literally unchanged).
The overhead of the new rotating allocator is minimal and occurs outside
the "hot loop": the cursor is read once at the start and updated only upon
successful allocation, which is negligible with respect to IO latency.
Because allocations proceed sequentially, latency is comparable to
or better than the regular allocator.
Having separated allocators increases maintainability and independence
with minimal (virtually no) overhead.
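As a toy illustration of that claim (standalone userspace C with a
pthread mutex standing in for the spinlock; not the kernel code itself):

/*
 * The shared cursor is read under a lock once at entry and written back
 * once on success; the (potentially long) scan in between runs without
 * holding the lock.
 */
#include <pthread.h>
#include <stdio.h>

#define TOY_NGROUPS 16U

static pthread_mutex_t cursor_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int rotalloc_cursor;

/* pretend scan: succeed in the group right after the starting goal */
static int toy_scan_groups(unsigned int goal, unsigned int *found)
{
	*found = (goal + 1) % TOY_NGROUPS;
	return 0;
}

static int toy_rotating_alloc(unsigned int *group)
{
	unsigned int goal, found;
	int err;

	pthread_mutex_lock(&cursor_lock);
	goal = rotalloc_cursor;		/* read cursor, then drop the lock */
	pthread_mutex_unlock(&cursor_lock);

	err = toy_scan_groups(goal, &found);	/* "hot loop", lock not held */
	if (err)
		return err;

	pthread_mutex_lock(&cursor_lock);
	rotalloc_cursor = found;	/* advance cursor on success only */
	pthread_mutex_unlock(&cursor_lock);

	*group = found;
	return 0;
}

int main(void)
{
	unsigned int g;

	for (int i = 0; i < 4; i++)
		if (!toy_rotating_alloc(&g))
			printf("allocation %d placed in group %u\n", i, g);
	return 0;
}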
This policy benefits workloads with frequent large or small allocations,
while keeping file fragmentation and slack space minimal.
It is a conscious trade-off: sacrificing locality in favor of reinforced
sequentiality.
Of course, this is not optimal for classic HDDs, but NVMe drives behave
differently.
For this reason, the policy is optional per mount, turned off by default,
and can be toggled at mount time.
Best regards,
Mario
On 04. 02. 2026. 07:29, Baokun Li wrote:
> On 2026-02-04 11:31, Mario Lohajner wrote:
>> Add support for the rotalloc allocation policy as a new mount
>> option. Policy rotates the starting block group for new allocations.
>>
>> Changes:
>> - fs/ext4/ext4.h
>> rotalloc policy dedlared, extend sb with cursor, vector & lock
>>
>> - fs/ext4/mballoc.h
>> expose allocator functions for vectoring in super.c
>>
>> - fs/ext4/super.c
>> parse rotalloc mnt opt, init cursor, lock and allocator vector
>>
>> - fs/ext4/mballoc.c
>> add rotalloc allocator, vectored allocator call in new_blocks
>>
>> The policy is selected via a mount option and does not change the
>> on-disk format or default allocation behavior. It preserves existing
>> allocation heuristics within a block group while distributing
>> allocations across block groups in a deterministic sequential manner.
>>
>> The rotating allocator is implemented as a separate allocation path
>> selected at mount time. This avoids conditional branches in the regular
>> allocator and keeps allocation policies isolated.
>> This also allows the rotating allocator to evolve independently in the
>> future without increasing complexity in the regular allocator.
>>
>> The policy was tested using v6.18.6 stable locally with the new mount
>> option "rotalloc" enabled, confirmed working as desribed!
>>
>> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
>> ---
>> fs/ext4/ext4.h | 8 +++
>> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
>> fs/ext4/mballoc.h | 3 +
>> fs/ext4/super.c | 18 +++++-
>> 4 files changed, 175 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 56112f201cac..cbbb7c05d7a2 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
>> unsigned int flags;
>> };
>>
>> +/* expose rotalloc allocator argument pointer type */
>> +struct ext4_allocation_context;
>> +
>> /*
>> * Logical to physical block mapping, used by ext4_map_blocks()
>> *
>> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
>> * Mount flags set via mount options or defaults
>> */
>> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
>> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
>> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
>> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
>> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
>> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
>> unsigned long s_mount_flags;
>> unsigned int s_def_mount_opt;
>> unsigned int s_def_mount_opt2;
>> + /* Rotalloc cursor, lock & new_blocks allocator vector */
>> + unsigned int s_rotalloc_cursor;
>> + spinlock_t s_rotalloc_lock;
>> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
>> ext4_fsblk_t s_sb_block;
>> atomic64_t s_resv_clusters;
>> kuid_t s_resuid;
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 56d50fd3310b..74f79652c674 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
>> * stop the scan and use it immediately
>> *
>> * * If free extent found is smaller than goal, then keep retrying
>> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
>> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
>> * that stop scanning and use whatever we have.
>> *
>> * * If free extent found is bigger than goal, then keep retrying
>> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
>> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
>> * stopping the scan and using the extent.
>> *
>> *
>> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
>> return ret;
>> }
>>
>> -static noinline_for_stack int
>> +noinline_for_stack int
>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> {
>> ext4_group_t i;
>> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> * is greater than equal to the sbi_s_mb_order2_reqs
>> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>> * We also support searching for power-of-two requests only for
>> - * requests upto maximum buddy size we have constructed.
>> + * requests up to maximum buddy size we have constructed.
>> */
>> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>> if (is_power_of_2(ac->ac_g_ex.fe_len))
>> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> return err;
>> }
>>
>> +/* Rotating allocator (rotalloc mount option) */
>> +noinline_for_stack int
>> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
>> +{
>> + ext4_group_t i, goal;
>> + int err = 0;
>> + struct super_block *sb = ac->ac_sb;
>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>> + struct ext4_buddy e4b;
>> +
>> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>> +
>> + /* Set the goal from s_rotalloc_cursor */
>> + spin_lock(&sbi->s_rotalloc_lock);
>> + goal = sbi->s_rotalloc_cursor;
>> + spin_unlock(&sbi->s_rotalloc_lock);
>> + ac->ac_g_ex.fe_group = goal;
>> +
>> + /* first, try the goal */
>> + err = ext4_mb_find_by_goal(ac, &e4b);
>> + if (err || ac->ac_status == AC_STATUS_FOUND)
>> + goto out;
>> +
>> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
>> + goto out;
>> +
>> + /*
>> + * ac->ac_2order is set only if the fe_len is a power of 2
>> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
>> + * so that we try exact allocation using buddy.
>> + */
>> + i = fls(ac->ac_g_ex.fe_len);
>> + ac->ac_2order = 0;
>> + /*
>> + * We search using buddy data only if the order of the request
>> + * is greater than equal to the sbi_s_mb_order2_reqs
>> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>> + * We also support searching for power-of-two requests only for
>> + * requests up to maximum buddy size we have constructed.
>> + */
>> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>> + if (is_power_of_2(ac->ac_g_ex.fe_len))
>> + ac->ac_2order = array_index_nospec(i - 1,
>> + MB_NUM_ORDERS(sb));
>> + }
>> +
>> + /* if stream allocation is enabled, use global goal */
>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
>> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
>> +
>> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
>> + ac->ac_g_ex.fe_start = -1;
>> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
> Rotating block allocation looks a lot like stream allocation—they both
> pick up from where the last successful allocation left off.
>
> I noticed that the stream allocation's global goal is now split up.
> Is there an advantage to keeping it as a single goal?
> Alternatively, do you see any downsides to this split in your use case?
>
>
On 2026-02-04 19:06, Mario Lohajner wrote:
> Hello Baokun Li,
>
> This response was originally intended for Andreas.
> I'm sending you the full copy to provide context for your query,
> rather than writing a separate response.
>
> Yes, the main motive for this allocator is flash wear leveling,
> but it is not strictly a wear leveling mechanism, and it is not named
> as such for a reason.
> Wear leveling may (or may not) exist at the device/hardware level.
> The goal of this policy is not to "fix" that.
>
As Ted mentioned in another thread, wear leveling is media-dependent.
Most drivers can handle wear leveling effectively enough just via the
discard command.
If you are using UFS, F2FS might be a solid choice. However, for raw
NAND flash, UBIFS (via UBI) or JFFS2 would be more appropriate.
A single global goal would cause severe contention in multi-CPU
scenarios, which is precisely why the stream allocation goal was split
into multiple ones.
Furthermore, constantly overriding the inode goal leads to significant
file fragmentation, as it often misses opportunities for extent merging.
If we truly want to implement ext4_mb_rotating_allocator, we should strip
out inode goal, stream allocation, and optimize_scan, rather than simply
cloning ext4_mb_regular_allocator and forcing a goal setting.
Cheers,
Baokun
> This policy helps avoid allocation hotspots at mount start by
> distributing allocations sequentially across the entire mount,
> not just a file or allocation stream.
>
> At the block/group allocation level, the file system is fairly stochastic
> and timing-sensitive. Rather than providing raw benchmark data, I prefer
> to explain the design analytically:
> The vectored separation of the new allocator ensures that the performance
> of the regular allocator is maintained (literally unchanged).
> The overhead of the new rotating allocator is minimal and occurs outside
> of the "hot loop":
> the cursor is retrieved early at the start, updated upon successful
> allocation,
> and is negligible with respect to IO latency.
> Because allocations proceed sequentially, latency is comparable to
> or better than the regular allocator.
> Having separated allocators increases maintainability and independence
> with minimal (virtually no) overhead.
>
> This policy benefits workloads with frequent large or small allocations,
> while keeping file fragmentation and slack space minimal.
> It is a conscious trade-off: sacrificing locality in favor of reinforced
> sequentiality.
> Of course, this is not optimal for classic HDDs, but NVMe drives behave
> differently.
> For this reason, the policy is optional per mount, turned off by default,
> and can be toggled at mount time.
>
> Best regards,
> Mario
>
> On 04. 02. 2026. 07:29, Baokun Li wrote:
>> On 2026-02-04 11:31, Mario Lohajner wrote:
>>> Add support for the rotalloc allocation policy as a new mount
>>> option. Policy rotates the starting block group for new allocations.
>>>
>>> Changes:
>>> - fs/ext4/ext4.h
>>> rotalloc policy dedlared, extend sb with cursor, vector & lock
>>>
>>> - fs/ext4/mballoc.h
>>> expose allocator functions for vectoring in super.c
>>>
>>> - fs/ext4/super.c
>>> parse rotalloc mnt opt, init cursor, lock and allocator vector
>>>
>>> - fs/ext4/mballoc.c
>>> add rotalloc allocator, vectored allocator call in new_blocks
>>>
>>> The policy is selected via a mount option and does not change the
>>> on-disk format or default allocation behavior. It preserves existing
>>> allocation heuristics within a block group while distributing
>>> allocations across block groups in a deterministic sequential manner.
>>>
>>> The rotating allocator is implemented as a separate allocation path
>>> selected at mount time. This avoids conditional branches in the regular
>>> allocator and keeps allocation policies isolated.
>>> This also allows the rotating allocator to evolve independently in the
>>> future without increasing complexity in the regular allocator.
>>>
>>> The policy was tested using v6.18.6 stable locally with the new mount
>>> option "rotalloc" enabled, confirmed working as desribed!
>>>
>>> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
>>> ---
>>> fs/ext4/ext4.h | 8 +++
>>> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
>>> fs/ext4/mballoc.h | 3 +
>>> fs/ext4/super.c | 18 +++++-
>>> 4 files changed, 175 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 56112f201cac..cbbb7c05d7a2 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
>>> unsigned int flags;
>>> };
>>> +/* expose rotalloc allocator argument pointer type */
>>> +struct ext4_allocation_context;
>>> +
>>> /*
>>> * Logical to physical block mapping, used by ext4_map_blocks()
>>> *
>>> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
>>> * Mount flags set via mount options or defaults
>>> */
>>> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
>>> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc
>>> policy/allocator */
>>> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with
>>> directory's group */
>>> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
>>> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on
>>> errors */
>>> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
>>> unsigned long s_mount_flags;
>>> unsigned int s_def_mount_opt;
>>> unsigned int s_def_mount_opt2;
>>> + /* Rotalloc cursor, lock & new_blocks allocator vector */
>>> + unsigned int s_rotalloc_cursor;
>>> + spinlock_t s_rotalloc_lock;
>>> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
>>> ext4_fsblk_t s_sb_block;
>>> atomic64_t s_resv_clusters;
>>> kuid_t s_resuid;
>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>> index 56d50fd3310b..74f79652c674 100644
>>> --- a/fs/ext4/mballoc.c
>>> +++ b/fs/ext4/mballoc.c
>>> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct
>>> ext4_allocation_context *ac,
>>> * stop the scan and use it immediately
>>> *
>>> * * If free extent found is smaller than goal, then keep retrying
>>> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
>>> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
>>> * that stop scanning and use whatever we have.
>>> *
>>> * * If free extent found is bigger than goal, then keep retrying
>>> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
>>> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
>>> * stopping the scan and using the extent.
>>> *
>>> *
>>> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct
>>> ext4_allocation_context *ac,
>>> return ret;
>>> }
>>> -static noinline_for_stack int
>>> +noinline_for_stack int
>>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>>> {
>>> ext4_group_t i;
>>> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct
>>> ext4_allocation_context *ac)
>>> * is greater than equal to the sbi_s_mb_order2_reqs
>>> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>> * We also support searching for power-of-two requests only for
>>> - * requests upto maximum buddy size we have constructed.
>>> + * requests up to maximum buddy size we have constructed.
>>> */
>>> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>> if (is_power_of_2(ac->ac_g_ex.fe_len))
>>> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct
>>> ext4_allocation_context *ac)
>>> return err;
>>> }
>>> +/* Rotating allocator (rotalloc mount option) */
>>> +noinline_for_stack int
>>> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
>>> +{
>>> + ext4_group_t i, goal;
>>> + int err = 0;
>>> + struct super_block *sb = ac->ac_sb;
>>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>>> + struct ext4_buddy e4b;
>>> +
>>> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>>> +
>>> + /* Set the goal from s_rotalloc_cursor */
>>> + spin_lock(&sbi->s_rotalloc_lock);
>>> + goal = sbi->s_rotalloc_cursor;
>>> + spin_unlock(&sbi->s_rotalloc_lock);
>>> + ac->ac_g_ex.fe_group = goal;
>>> +
>>> + /* first, try the goal */
>>> + err = ext4_mb_find_by_goal(ac, &e4b);
>>> + if (err || ac->ac_status == AC_STATUS_FOUND)
>>> + goto out;
>>> +
>>> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
>>> + goto out;
>>> +
>>> + /*
>>> + * ac->ac_2order is set only if the fe_len is a power of 2
>>> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
>>> + * so that we try exact allocation using buddy.
>>> + */
>>> + i = fls(ac->ac_g_ex.fe_len);
>>> + ac->ac_2order = 0;
>>> + /*
>>> + * We search using buddy data only if the order of the request
>>> + * is greater than equal to the sbi_s_mb_order2_reqs
>>> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>> + * We also support searching for power-of-two requests only for
>>> + * requests up to maximum buddy size we have constructed.
>>> + */
>>> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>> + if (is_power_of_2(ac->ac_g_ex.fe_len))
>>> + ac->ac_2order = array_index_nospec(i - 1,
>>> + MB_NUM_ORDERS(sb));
>>> + }
>>> +
>>> + /* if stream allocation is enabled, use global goal */
>>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
>>> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
>>> +
>>> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
>>> + ac->ac_g_ex.fe_start = -1;
>>> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
>> Rotating block allocation looks a lot like stream allocation—they both
>> pick up from where the last successful allocation left off.
>>
>> I noticed that the stream allocation's global goal is now split up.
>> Is there an advantage to keeping it as a single goal?
>> Alternatively, do you see any downsides to this split in your use case?
>>
>>
Let me briefly restate the intent, focusing on the fundamentals.
Rotalloc is not wear leveling (and is intentionally not named as such).
It is an allocation policy whose goal is to reduce allocation hotspots by
enforcing mount-wide sequential allocation. Wear leveling, if any,
remains a device/firmware concern and is explicitly out of scope.
While WL motivated part of this work,
the main added value of this patch is allocator separation.
The policy indirection (aka vectored allocator) allows allocation
strategies that are orthogonal to the regular allocator to operate
outside the hot path, preserving existing heuristics and improving
maintainability. Rotalloc is one such case; when disabled (by default),
there is literally no behavioral change.
The cursor is global because the policy is global (KISS applied here :-).
It operates at the mount level, not per inode, stream, or CPU.
Splitting the cursor would still require keeping all cursors in sync;
once any allocator diverges, it either has to scan more blocks or
sequentiality collapses back into locality and hotspots reappear.
A single cursor is therefore intentional.
The rotating allocator itself is a working prototype.
It was written with minimal diff and clarity in mind to make the policy
reviewable. Refinements and simplifications are expected and welcome.
Regarding discard/trim: while discard prepares blocks for reuse and
signals that a block is free, it does not implement wear leveling by
itself. Rotalloc operates at a higher layer; by promoting sequentiality,
it reduces block/group allocation hotspots regardless of underlying
device behavior.
Since it is not in line with the current allocator goals, it is
implemented as an optional policy.
Best regards,
Mario
PS: thank you for acknowledging that there are workloads and scenarios
where this method is worthwhile :-).
On 05. 02. 2026. 04:52, Baokun Li wrote:
> On 2026-02-04 19:06, Mario Lohajner wrote:
>> Hello Baokun Li,
>>
>> This response was originally intended for Andreas.
>> I'm sending you the full copy to provide context for your query,
>> rather than writing a separate response.
>>
>> Yes, the main motive for this allocator is flash wear leveling,
>> but it is not strictly a wear leveling mechanism, and it is not named
>> as such for a reason.
>> Wear leveling may (or may not) exist at the device/hardware level.
>> The goal of this policy is not to "fix" that.
>>
> As Ted mentioned in another thread, wear leveling is media-dependent.
> Most drivers can handle wear leveling effectively enough just via the
> discard command.
>
> If you are using UFS, F2FS might be a solid choice. However, for raw
> NAND flash, UBIFS (via UBI) or JFFS2 would be more appropriate.
>
> A single global goal would cause severe contention in multi-CPU
> scenarios, which is precisely why the stream allocation goal was split
> into multiple ones.
>
> Furthermore, constantly overriding the inode goal leads to significant
> file fragmentation, as it often misses opportunities for extent merging.
>
> If we truly want to implement ext4_mb_rotating_allocator, we should strip
> out inode goal, stream allocation, and optimize_scan, rather than simply
> cloning ext4_mb_regular_allocator and forcing a goal setting.
>
>
> Cheers,
> Baokun
>
>> This policy helps avoid allocation hotspots at mount start by
>> distributing allocations sequentially across the entire mount,
>> not just a file or allocation stream.
>>
>> At the block/group allocation level, the file system is fairly stochastic
>> and timing-sensitive. Rather than providing raw benchmark data, I prefer
>> to explain the design analytically:
>> The vectored separation of the new allocator ensures that the performance
>> of the regular allocator is maintained (literally unchanged).
>> The overhead of the new rotating allocator is minimal and occurs outside
>> of the "hot loop":
>> the cursor is retrieved early at the start, updated upon successful
>> allocation,
>> and is negligible with respect to IO latency.
>> Because allocations proceed sequentially, latency is comparable to
>> or better than the regular allocator.
>> Having separated allocators increases maintainability and independence
>> with minimal (virtually no) overhead.
>>
>> This policy benefits workloads with frequent large or small allocations,
>> while keeping file fragmentation and slack space minimal.
>> It is a conscious trade-off: sacrificing locality in favor of reinforced
>> sequentiality.
>> Of course, this is not optimal for classic HDDs, but NVMe drives behave
>> differently.
>> For this reason, the policy is optional per mount, turned off by default,
>> and can be toggled at mount time.
>>
>> Best regards,
>> Mario
>>
>> On 04. 02. 2026. 07:29, Baokun Li wrote:
>>> On 2026-02-04 11:31, Mario Lohajner wrote:
>>>> Add support for the rotalloc allocation policy as a new mount
>>>> option. Policy rotates the starting block group for new allocations.
>>>>
>>>> Changes:
>>>> - fs/ext4/ext4.h
>>>> rotalloc policy dedlared, extend sb with cursor, vector & lock
>>>>
>>>> - fs/ext4/mballoc.h
>>>> expose allocator functions for vectoring in super.c
>>>>
>>>> - fs/ext4/super.c
>>>> parse rotalloc mnt opt, init cursor, lock and allocator vector
>>>>
>>>> - fs/ext4/mballoc.c
>>>> add rotalloc allocator, vectored allocator call in new_blocks
>>>>
>>>> The policy is selected via a mount option and does not change the
>>>> on-disk format or default allocation behavior. It preserves existing
>>>> allocation heuristics within a block group while distributing
>>>> allocations across block groups in a deterministic sequential manner.
>>>>
>>>> The rotating allocator is implemented as a separate allocation path
>>>> selected at mount time. This avoids conditional branches in the regular
>>>> allocator and keeps allocation policies isolated.
>>>> This also allows the rotating allocator to evolve independently in the
>>>> future without increasing complexity in the regular allocator.
>>>>
>>>> The policy was tested using v6.18.6 stable locally with the new mount
>>>> option "rotalloc" enabled, confirmed working as desribed!
>>>>
>>>> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
>>>> ---
>>>> fs/ext4/ext4.h | 8 +++
>>>> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
>>>> fs/ext4/mballoc.h | 3 +
>>>> fs/ext4/super.c | 18 +++++-
>>>> 4 files changed, 175 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>> index 56112f201cac..cbbb7c05d7a2 100644
>>>> --- a/fs/ext4/ext4.h
>>>> +++ b/fs/ext4/ext4.h
>>>> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
>>>> unsigned int flags;
>>>> };
>>>> +/* expose rotalloc allocator argument pointer type */
>>>> +struct ext4_allocation_context;
>>>> +
>>>> /*
>>>> * Logical to physical block mapping, used by ext4_map_blocks()
>>>> *
>>>> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
>>>> * Mount flags set via mount options or defaults
>>>> */
>>>> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
>>>> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc
>>>> policy/allocator */
>>>> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with
>>>> directory's group */
>>>> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
>>>> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on
>>>> errors */
>>>> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
>>>> unsigned long s_mount_flags;
>>>> unsigned int s_def_mount_opt;
>>>> unsigned int s_def_mount_opt2;
>>>> + /* Rotalloc cursor, lock & new_blocks allocator vector */
>>>> + unsigned int s_rotalloc_cursor;
>>>> + spinlock_t s_rotalloc_lock;
>>>> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
>>>> ext4_fsblk_t s_sb_block;
>>>> atomic64_t s_resv_clusters;
>>>> kuid_t s_resuid;
>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>> index 56d50fd3310b..74f79652c674 100644
>>>> --- a/fs/ext4/mballoc.c
>>>> +++ b/fs/ext4/mballoc.c
>>>> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct
>>>> ext4_allocation_context *ac,
>>>> * stop the scan and use it immediately
>>>> *
>>>> * * If free extent found is smaller than goal, then keep retrying
>>>> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
>>>> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
>>>> * that stop scanning and use whatever we have.
>>>> *
>>>> * * If free extent found is bigger than goal, then keep retrying
>>>> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
>>>> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
>>>> * stopping the scan and using the extent.
>>>> *
>>>> *
>>>> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct
>>>> ext4_allocation_context *ac,
>>>> return ret;
>>>> }
>>>> -static noinline_for_stack int
>>>> +noinline_for_stack int
>>>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>>>> {
>>>> ext4_group_t i;
>>>> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct
>>>> ext4_allocation_context *ac)
>>>> * is greater than equal to the sbi_s_mb_order2_reqs
>>>> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>>> * We also support searching for power-of-two requests only for
>>>> - * requests upto maximum buddy size we have constructed.
>>>> + * requests up to maximum buddy size we have constructed.
>>>> */
>>>> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>>> if (is_power_of_2(ac->ac_g_ex.fe_len))
>>>> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct
>>>> ext4_allocation_context *ac)
>>>> return err;
>>>> }
>>>> +/* Rotating allocator (rotalloc mount option) */
>>>> +noinline_for_stack int
>>>> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
>>>> +{
>>>> + ext4_group_t i, goal;
>>>> + int err = 0;
>>>> + struct super_block *sb = ac->ac_sb;
>>>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>>>> + struct ext4_buddy e4b;
>>>> +
>>>> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>>>> +
>>>> + /* Set the goal from s_rotalloc_cursor */
>>>> + spin_lock(&sbi->s_rotalloc_lock);
>>>> + goal = sbi->s_rotalloc_cursor;
>>>> + spin_unlock(&sbi->s_rotalloc_lock);
>>>> + ac->ac_g_ex.fe_group = goal;
>>>> +
>>>> + /* first, try the goal */
>>>> + err = ext4_mb_find_by_goal(ac, &e4b);
>>>> + if (err || ac->ac_status == AC_STATUS_FOUND)
>>>> + goto out;
>>>> +
>>>> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
>>>> + goto out;
>>>> +
>>>> + /*
>>>> + * ac->ac_2order is set only if the fe_len is a power of 2
>>>> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
>>>> + * so that we try exact allocation using buddy.
>>>> + */
>>>> + i = fls(ac->ac_g_ex.fe_len);
>>>> + ac->ac_2order = 0;
>>>> + /*
>>>> + * We search using buddy data only if the order of the request
>>>> + * is greater than equal to the sbi_s_mb_order2_reqs
>>>> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>>> + * We also support searching for power-of-two requests only for
>>>> + * requests up to maximum buddy size we have constructed.
>>>> + */
>>>> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>>> + if (is_power_of_2(ac->ac_g_ex.fe_len))
>>>> + ac->ac_2order = array_index_nospec(i - 1,
>>>> + MB_NUM_ORDERS(sb));
>>>> + }
>>>> +
>>>> + /* if stream allocation is enabled, use global goal */
>>>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
>>>> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
>>>> +
>>>> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
>>>> + ac->ac_g_ex.fe_start = -1;
>>>> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
>>> Rotating block allocation looks a lot like stream allocation—they both
>>> pick up from where the last successful allocation left off.
>>>
>>> I noticed that the stream allocation's global goal is now split up.
>>> Is there an advantage to keeping it as a single goal?
>>> Alternatively, do you see any downsides to this split in your use case?
>>>
>>>
On Feb 5, 2026, at 05:23, Mario Lohajner <mario_lohajner@rocketmail.com> wrote:
>
> Let me briefly restate the intent, focusing on the fundamentals.
>
> Rotalloc is not wear leveling (and is intentionally not named as such).
> It is a allocation policy whose goal is to reduce allocation hotspots by
> enforcing mount-wide sequential allocation. Wear leveling, if any,
> remains a device/firmware concern and is explicitly out of scope.
> While WL motivated part of this work,
>
> the main added value of this patch is allocator separation.
> The policy indirection (aka vectored allocator) allows allocation
> strategies that are orthogonal to the regular allocator to operate
> outside the hot path, preserving existing heuristics and improving
> maintainability. Rotalloc is one such case; when disabled (by default),
> there is literally no behavioral change.
>
> The cursor is global because the policy is global (KSS applied here :-).
> It operates at the mount level, not per inode, stream, or CPU.
> Splitting the cursor would still require keeping all cursors in sync;
> once any allocator diverges, it either has to scan more blocks or
> sequentiality collapses back into locality and hotspots reappear.
> A single cursor is therefore intentional.
>
> The rotating allocator itself is a working prototype.
> It was written with minimal diff and clarity in mind to make the policy
> reviewable. Refinements and simplifications are expected and welcome.
>
> Regarding discard/trim: while discard prepares blocks for reuse and
> signals that a block is free, it does not implement wear leveling by
> itself. Rotalloc operates at a higher layer; by promoting sequentiality,
> it reduces block/group allocation hotspots regardless of underlying
> device behavior.
I think there are two main reasons why a round-robin allocation policy is of
interest:
- maximizing the time between block group re-use also maximizes the time
during which blocks can be freed in a group, reducing fragmentation
- this also improves the ability of fstrim to process larger segments
of free storage compared to "-o discard" which is sub-optimal when
processing recently-freed blocks (which may be small/unaligned compared
to the erase blocks/RAID of the storage and not actually trim any space)
The latter is what motivated the "-o fstrim" feature[1] that we developed
for Lustre as a lighter-weight alternative to "-o discard" to use fstrim
infrastructure on an ongoing basis to trim whole groups that have freed
space recently, but not track all of the freed extents individually.
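A toy illustration of the first point (illustrative arithmetic only, not
measured data): with a strict round-robin cursor over N groups, a given
group is revisited only after all N groups have been used, which is the
maximum possible reuse interval:

#include <stdio.h>

#define NGROUPS 8

int main(void)
{
	int last_used[NGROUPS];
	int cursor = 0;

	for (int g = 0; g < NGROUPS; g++)
		last_used[g] = -1;

	for (int alloc = 0; alloc < 3 * NGROUPS; alloc++) {
		int g = cursor;

		if (last_used[g] >= 0)
			printf("group %d reused after %d allocations\n",
			       g, alloc - last_used[g]);
		last_used[g] = alloc;
		cursor = (cursor + 1) % NGROUPS;	/* rotate */
	}
	return 0;
}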
Cheers, Andreas
[1] Patches to implement persistent TRIMMED flag, stats, and "fstrim":
https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel10.0/ext4-introduce-EXT4_BG_TRIMMED-to-optimize-fstrim.patch
https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel10.0/ext4-add-DISCARD-stats.patch
https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel9.4/ext4-add-fstrim-mount-option.patch
> Since it is not in line with the current allocator goals, it is
> implemented as an optional policy.
>
> Best regards,
> Mario
>
> PS: thank you for acknowledging that there are workloads and scenarios
> where this method is worthwhile :-).
>
> On 05. 02. 2026. 04:52, Baokun Li wrote:
>> On 2026-02-04 19:06, Mario Lohajner wrote:
>>> Hello Baokun Li,
>>>
>>> This response was originally intended for Andreas.
>>> I'm sending you the full copy to provide context for your query,
>>> rather than writing a separate response.
>>>
>>> Yes, the main motive for this allocator is flash wear leveling,
>>> but it is not strictly a wear leveling mechanism, and it is not named
>>> as such for a reason.
>>> Wear leveling may (or may not) exist at the device/hardware level.
>>> The goal of this policy is not to "fix" that.
>>>
>> As Ted mentioned in another thread, wear leveling is media-dependent.
>> Most drivers can handle wear leveling effectively enough just via the
>> discard command.
>>
>> If you are using UFS, F2FS might be a solid choice. However, for raw
>> NAND flash, UBIFS (via UBI) or JFFS2 would be more appropriate.
>>
>> A single global goal would cause severe contention in multi-CPU
>> scenarios, which is precisely why the stream allocation goal was split
>> into multiple ones.
>>
>> Furthermore, constantly overriding the inode goal leads to significant
>> file fragmentation, as it often misses opportunities for extent merging.
>>
>> If we truly want to implement ext4_mb_rotating_allocator, we should strip
>> out inode goal, stream allocation, and optimize_scan, rather than simply
>> cloning ext4_mb_regular_allocator and forcing a goal setting.
>>
>>
>> Cheers,
>> Baokun
>>
>>> This policy helps avoid allocation hotspots at mount start by
>>> distributing allocations sequentially across the entire mount,
>>> not just a file or allocation stream.
>>>
>>> At the block/group allocation level, the file system is fairly stochastic
>>> and timing-sensitive. Rather than providing raw benchmark data, I prefer
>>> to explain the design analytically:
>>> The vectored separation of the new allocator ensures that the performance
>>> of the regular allocator is maintained (literally unchanged).
>>> The overhead of the new rotating allocator is minimal and occurs outside
>>> of the "hot loop":
>>> the cursor is retrieved early at the start, updated upon successful
>>> allocation,
>>> and is negligible with respect to IO latency.
>>> Because allocations proceed sequentially, latency is comparable to
>>> or better than the regular allocator.
>>> Having separated allocators increases maintainability and independence
>>> with minimal (virtually no) overhead.
>>>
>>> This policy benefits workloads with frequent large or small allocations,
>>> while keeping file fragmentation and slack space minimal.
>>> It is a conscious trade-off: sacrificing locality in favor of reinforced
>>> sequentiality.
>>> Of course, this is not optimal for classic HDDs, but NVMe drives behave
>>> differently.
>>> For this reason, the policy is optional per mount, turned off by default,
>>> and can be toggled at mount time.
>>>
>>> Best regards,
>>> Mario
>>>
>>> On 04. 02. 2026. 07:29, Baokun Li wrote:
>>>> On 2026-02-04 11:31, Mario Lohajner wrote:
>>>>> Add support for the rotalloc allocation policy as a new mount
>>>>> option. Policy rotates the starting block group for new allocations.
>>>>>
>>>>> Changes:
>>>>> - fs/ext4/ext4.h
>>>>> rotalloc policy dedlared, extend sb with cursor, vector & lock
>>>>>
>>>>> - fs/ext4/mballoc.h
>>>>> expose allocator functions for vectoring in super.c
>>>>>
>>>>> - fs/ext4/super.c
>>>>> parse rotalloc mnt opt, init cursor, lock and allocator vector
>>>>>
>>>>> - fs/ext4/mballoc.c
>>>>> add rotalloc allocator, vectored allocator call in new_blocks
>>>>>
>>>>> The policy is selected via a mount option and does not change the
>>>>> on-disk format or default allocation behavior. It preserves existing
>>>>> allocation heuristics within a block group while distributing
>>>>> allocations across block groups in a deterministic sequential manner.
>>>>>
>>>>> The rotating allocator is implemented as a separate allocation path
>>>>> selected at mount time. This avoids conditional branches in the regular
>>>>> allocator and keeps allocation policies isolated.
>>>>> This also allows the rotating allocator to evolve independently in the
>>>>> future without increasing complexity in the regular allocator.
>>>>>
>>>>> The policy was tested using v6.18.6 stable locally with the new mount
>>>>> option "rotalloc" enabled, confirmed working as desribed!
>>>>>
>>>>> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
>>>>> ---
>>>>> fs/ext4/ext4.h | 8 +++
>>>>> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
>>>>> fs/ext4/mballoc.h | 3 +
>>>>> fs/ext4/super.c | 18 +++++-
>>>>> 4 files changed, 175 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>>> index 56112f201cac..cbbb7c05d7a2 100644
>>>>> --- a/fs/ext4/ext4.h
>>>>> +++ b/fs/ext4/ext4.h
>>>>> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
>>>>> unsigned int flags;
>>>>> };
>>>>> +/* expose rotalloc allocator argument pointer type */
>>>>> +struct ext4_allocation_context;
>>>>> +
>>>>> /*
>>>>> * Logical to physical block mapping, used by ext4_map_blocks()
>>>>> *
>>>>> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
>>>>> * Mount flags set via mount options or defaults
>>>>> */
>>>>> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
>>>>> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc
>>>>> policy/allocator */
>>>>> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with
>>>>> directory's group */
>>>>> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
>>>>> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on
>>>>> errors */
>>>>> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
>>>>> unsigned long s_mount_flags;
>>>>> unsigned int s_def_mount_opt;
>>>>> unsigned int s_def_mount_opt2;
>>>>> + /* Rotalloc cursor, lock & new_blocks allocator vector */
>>>>> + unsigned int s_rotalloc_cursor;
>>>>> + spinlock_t s_rotalloc_lock;
>>>>> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
>>>>> ext4_fsblk_t s_sb_block;
>>>>> atomic64_t s_resv_clusters;
>>>>> kuid_t s_resuid;
>>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>>> index 56d50fd3310b..74f79652c674 100644
>>>>> --- a/fs/ext4/mballoc.c
>>>>> +++ b/fs/ext4/mballoc.c
>>>>> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct
>>>>> ext4_allocation_context *ac,
>>>>> * stop the scan and use it immediately
>>>>> *
>>>>> * * If free extent found is smaller than goal, then keep retrying
>>>>> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
>>>>> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
>>>>> * that stop scanning and use whatever we have.
>>>>> *
>>>>> * * If free extent found is bigger than goal, then keep retrying
>>>>> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
>>>>> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
>>>>> * stopping the scan and using the extent.
>>>>> *
>>>>> *
>>>>> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct
>>>>> ext4_allocation_context *ac,
>>>>> return ret;
>>>>> }
>>>>> -static noinline_for_stack int
>>>>> +noinline_for_stack int
>>>>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>>>>> {
>>>>> ext4_group_t i;
>>>>> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct
>>>>> ext4_allocation_context *ac)
>>>>> * is greater than equal to the sbi_s_mb_order2_reqs
>>>>> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>>>> * We also support searching for power-of-two requests only for
>>>>> - * requests upto maximum buddy size we have constructed.
>>>>> + * requests up to maximum buddy size we have constructed.
>>>>> */
>>>>> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>>>> if (is_power_of_2(ac->ac_g_ex.fe_len))
>>>>> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct
>>>>> ext4_allocation_context *ac)
>>>>> return err;
>>>>> }
>>>>> +/* Rotating allocator (rotalloc mount option) */
>>>>> +noinline_for_stack int
>>>>> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
>>>>> +{
>>>>> + ext4_group_t i, goal;
>>>>> + int err = 0;
>>>>> + struct super_block *sb = ac->ac_sb;
>>>>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>>>>> + struct ext4_buddy e4b;
>>>>> +
>>>>> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>>>>> +
>>>>> + /* Set the goal from s_rotalloc_cursor */
>>>>> + spin_lock(&sbi->s_rotalloc_lock);
>>>>> + goal = sbi->s_rotalloc_cursor;
>>>>> + spin_unlock(&sbi->s_rotalloc_lock);
>>>>> + ac->ac_g_ex.fe_group = goal;
>>>>> +
>>>>> + /* first, try the goal */
>>>>> + err = ext4_mb_find_by_goal(ac, &e4b);
>>>>> + if (err || ac->ac_status == AC_STATUS_FOUND)
>>>>> + goto out;
>>>>> +
>>>>> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
>>>>> + goto out;
>>>>> +
>>>>> + /*
>>>>> + * ac->ac_2order is set only if the fe_len is a power of 2
>>>>> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
>>>>> + * so that we try exact allocation using buddy.
>>>>> + */
>>>>> + i = fls(ac->ac_g_ex.fe_len);
>>>>> + ac->ac_2order = 0;
>>>>> + /*
>>>>> + * We search using buddy data only if the order of the request
>>>>> + * is greater than equal to the sbi_s_mb_order2_reqs
>>>>> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>>>>> + * We also support searching for power-of-two requests only for
>>>>> + * requests up to maximum buddy size we have constructed.
>>>>> + */
>>>>> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>>>>> + if (is_power_of_2(ac->ac_g_ex.fe_len))
>>>>> + ac->ac_2order = array_index_nospec(i - 1,
>>>>> + MB_NUM_ORDERS(sb));
>>>>> + }
>>>>> +
>>>>> + /* if stream allocation is enabled, use global goal */
>>>>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
>>>>> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
>>>>> +
>>>>> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
>>>>> + ac->ac_g_ex.fe_start = -1;
>>>>> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
>>>> Rotating block allocation looks a lot like stream allocation—they both
>>>> pick up from where the last successful allocation left off.
>>>>
>>>> I noticed that the stream allocation's global goal is now split up.
>>>> Is there an advantage to keeping it as a single goal?
>>>> Alternatively, do you see any downsides to this split in your use case?
>>>>
>>>>
On Thu, Feb 05, 2026 at 01:23:18PM +0100, Mario Lohajner wrote:
> Let me briefly restate the intent, focusing on the fundamentals.
>
> Rotalloc is not wear leveling (and is intentionally not named as such).
> It is an allocation policy whose goal is to reduce allocation hotspots by
> enforcing mount-wide sequential allocation. Wear leveling, if any,
> remains a device/firmware concern and is explicitly out of scope.
> While WL motivated part of this work,

Yes, but *why* are you trying to reduce allocation hotspots? What
problem are you trying to solve? And actually, you are making
allocation hotspots *worse*, since with a global cursor there is, by
definition, a single super-hotspot. This will cause scalability issues
on a system with multiple CPUs trying to write in parallel.

> the main added value of this patch is allocator separation.
> The policy indirection (aka vectored allocator) allows allocation
> strategies that are orthogonal to the regular allocator to operate
> outside the hot path, preserving existing heuristics and improving
> maintainability.

Allocator separation is not necessarily an unalloyed good thing. By
having duplicated code, it means that if we need to make a change in
infrastructure code, we might now need to make it in multiple code
paths. It is also one more code path that we have to test and
maintain. So there is a real cost from the perspective of upstream
maintenance.

Also, because having a single global allocation point (your "cursor")
is going to absolutely *trash* performance, especially for high-speed
NVMe devices connected to high-count CPUs, it's not clear to me why
this is not a concern for rotalloc.

> The rotating allocator itself is a working prototype.
> It was written with minimal diff and clarity in mind to make the policy
> reviewable. Refinements and simplifications are expected and welcome.

OK, so this sounds like it's not ready for prime time....

> Regarding discard/trim: while discard prepares blocks for reuse and
> signals that a block is free, it does not implement wear leveling by
> itself. Rotalloc operates at a higher layer; by promoting sequentiality,
> it reduces block/group allocation hotspots regardless of underlying
> device behavior.
> Since it is not in line with the current allocator goals, it is
> implemented as an optional policy.

Again, what is the high-level goal of rotalloc? What specific
hardware and workload are you trying to optimize for? If you want to
impose a maintenance overhead on upstream, you need to justify why
that maintenance overhead is worth it. And that means you need to be
a bit more explicit about what specific real-world problem you are
trying to solve....

					- Ted
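For illustration only (this is not code from the patch; the names rot_cursor and rot_next_group are made up): a minimal userspace sketch of how a shared starting-group cursor could be advanced with a single atomic operation instead of a spinlock-protected read/write pair. Note that this only removes the lock; every writer still contends on one shared counter, so the "single super-hotspot" objection above still applies.

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_uint rot_cursor;          /* shared starting-group cursor */

        /* Pick a starting block group and advance the cursor in one step. */
        static unsigned int rot_next_group(unsigned int ngroups)
        {
                /* fetch_add avoids taking a lock on every allocation */
                return atomic_fetch_add_explicit(&rot_cursor, 1,
                                                 memory_order_relaxed) % ngroups;
        }

        int main(void)
        {
                for (int i = 0; i < 8; i++)
                        printf("start group: %u\n", rot_next_group(128));
                return 0;
        }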
On 06. 02. 2026. 02:42, Theodore Tso wrote:
> On Thu, Feb 05, 2026 at 01:23:18PM +0100, Mario Lohajner wrote:
>> Let me briefly restate the intent, focusing on the fundamentals.
>>
>> Rotalloc is not wear leveling (and is intentionally not named as such).
>> It is an allocation policy whose goal is to reduce allocation hotspots by
>> enforcing mount-wide sequential allocation. Wear leveling, if any,
>> remains a device/firmware concern and is explicitly out of scope.
>> While WL motivated part of this work,
>
> Yes, but *why* are you trying to reduce allocation hotspots? What
> problem are you trying to solve? And actually, you are making
> allocation hotspots *worse*, since with a global cursor there is, by
> definition, a single super-hotspot. This will cause scalability issues
> on a system with multiple CPUs trying to write in parallel.

Greetings Ted,

First off, apologies for the delayed reply - your emails somehow ended up
in my spam! I hope this doesn't happen again. Also, sorry for the lengthy
responses; I really care to make my points clear.

I'm not proposing that ext4 should implement or control wear leveling.
WL clearly does (or does not) exist below the FS layer and is opaque to
us (we have no way of knowing). What is observable in practice, however,
is persistent allocation locality near the beginning of the LBA space
under real workloads, and a corresponding concentration of wear in that
area; interestingly, it seems to be vendor-agnostic. = The force within
is very strong :-)

The elephant: my concern is a potential policy interaction. Filesystem
locality policies tend to concentrate hot metadata and early allocations.
During deallocation, we naturally discard/trim those blocks ASAP to make
them ready for write, thus optimizing for speed, while at the same time
signaling them as free. Meanwhile, an underlying WL policy (if present)
tries to consume free blocks opportunistically. If these two interact
poorly, the result can be a sustained bias toward low-LBA hot regions
(as observable in practice). The elephant is in the room and is called
"wear" / hotspots at the LBA start.

>> the main added value of this patch is allocator separation.
>> The policy indirection (aka vectored allocator) allows allocation
>> strategies that are orthogonal to the regular allocator to operate
>> outside the hot path, preserving existing heuristics and improving
>> maintainability.
>
> Allocator separation is not necessarily an unalloyed good thing.
> By having duplicated code, it means that if we need to make a change
> in infrastructure code, we might now need to make it in multiple code
> paths. It is also one more code path that we have to test and
> maintain. So there is a real cost from the perspective of upstream
> maintenance.

My goal was to keep the regular allocator intact and trivially clean.
Baokun noticed this well - I'm using all existing heuristics; the only
tweak I make is to 'fix the goal' (i.e., set where to start), which then
sequentially advances toward the region most likely to contain empty,
unused space, at which point allocations become nearly instantaneous.
Being orthogonal in principle, these two allocators/policies are meant
to live independently of each other.

Alternatively, we could drop the separation entirely and add a few
conditional branches to the regular allocator to the same effect, but
this introduces overhead, potential branch mispredictions, and all the
associated shenanigans (minor but not insignificant). Separation avoids
that, at the minimal cost of maintaining 20-ish extra lines of code.
(Memory we have; time is scarce.)

> Also, because having a single global allocation point (your "cursor")
> is going to absolutely *trash* performance, especially for high-speed
> NVMe devices connected to high-count CPUs, it's not clear to me why
> this is not a concern for rotalloc.
>
>> The rotating allocator itself is a working prototype.
>> It was written with minimal diff and clarity in mind to make the policy
>> reviewable. Refinements and simplifications are expected and welcome.
>
> OK, so this sounds like it's not ready for prime time....

I don't consider it "not ready for prime time." It is a rather simple
refinement of the existing allocator, producing clean, contiguous layouts
with sequential allocation across the LBA space, without an increase in
complexity and with equal or lower latency. Further refinements are
anticipated and welcome - not because the current approach is flawed, but
because this seems like an area where we can reasonably ask whether it
can be even better.

>> Regarding discard/trim: while discard prepares blocks for reuse and
>> signals that a block is free, it does not implement wear leveling by
>> itself. Rotalloc operates at a higher layer; by promoting sequentiality,
>> it reduces block/group allocation hotspots regardless of underlying
>> device behavior.
>> Since it is not in line with the current allocator goals, it is
>> implemented as an optional policy.
>
> Again, what is the high-level goal of rotalloc? What specific
> hardware and workload are you trying to optimize for? If you want to
> impose a maintenance overhead on upstream, you need to justify why
> that maintenance overhead is worth it. And that means you need to be
> a bit more explicit about what specific real-world problem you are
> trying to solve....
>
> 					- Ted

Again, we're not focusing solely on wear leveling here, but since we
can't influence the WL implementation itself, the only lever we have is
our own allocation policy. The question I'm trying to sanity-check is
whether we can avoid reinforcing this pattern, and instead aim for an
allocation strategy that helps minimize the issue - or even avoid it
entirely, if possible. Even though this pattern is clear in practice,
I'm not claiming it applies universally, only that it appears often
enough to be worth discussing at the policy level. For that reason, it
seems reasonable to treat this as an optional policy choice, disabled
by default.

Sincerely,
Mario
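As a rough illustration of the mount-time vectoring argued for above, here is a small userspace model (assumed names, not the kernel code): the policy is chosen once and stored as a function pointer, so the allocation call site carries no per-call branch on the mount option. Whether the indirect call is actually cheaper than a well-predicted branch is a separate question; the sketch only shows the structure.

        #include <stdbool.h>

        struct alloc_ctx;       /* stands in for struct ext4_allocation_context */

        static int regular_allocator(struct alloc_ctx *ac)  { (void)ac; return 0; }
        static int rotating_allocator(struct alloc_ctx *ac) { (void)ac; return 0; }

        struct sb_info {
                int (*new_blocks)(struct alloc_ctx *ac);   /* chosen at mount time */
        };

        static void mount_init(struct sb_info *sbi, bool rotalloc)
        {
                sbi->new_blocks = rotalloc ? rotating_allocator : regular_allocator;
        }

        static int allocate_blocks(struct sb_info *sbi, struct alloc_ctx *ac)
        {
                return sbi->new_blocks(ac);     /* no mount-option branch here */
        }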
On Fri, Feb 06, 2026 at 08:25:24PM +0100, Mario Lohajner wrote:
> What is observable in practice, however, is persistent allocation locality
> near the beginning of the LBA space under real workloads, and a
> corresponding concentration of wear in that area, interestingly it seems to
> be vendor-agnostic. = The force within is very strong :-)
This is simply not true. Data blocks are *not* confined to the
low-numbered LBA's in any kind of reasonable real-world situation. Why do
you think this is true, and what was the experiment that led you to
believe this?
Let me show you *my* experiment:
root@kvm-xfstests:~# /sbin/mkfs.ext4 -qF /dev/vdc 5g
root@kvm-xfstests:~# mount /dev/vdc /vdc
[ 171.091299] EXT4-fs (vdc): mounted filesystem 06dd464f-1c3a-4a2b-b3dd-e937c1e7624f r/w with ordered data mode. Quota mode: none.
root@kvm-xfstests:~# tar -C /vdc -xJf /vtmp/ext4-6.12.tar.xz
root@kvm-xfstests:~# ls -li /vdc
total 1080
31018 -rw-r--r-- 1 15806 15806 496 Dec 12 2024 COPYING
347 -rw-r--r-- 1 15806 15806 105095 Dec 12 2024 CREDITS
31240 drwxr-xr-x 75 15806 15806 4096 Dec 12 2024 Documentation
31034 -rw-r--r-- 1 15806 15806 2573 Dec 12 2024 Kbuild
31017 -rw-r--r-- 1 15806 15806 555 Dec 12 2024 Kconfig
30990 drwxr-xr-x 6 15806 15806 4096 Dec 12 2024 LICENSES
323 -rw-r--r-- 1 15806 15806 781906 Dec 1 21:34 MAINTAINERS
19735 -rw-r--r-- 1 15806 15806 68977 Dec 1 21:34 Makefile
14 -rw-r--r-- 1 15806 15806 726 Dec 12 2024 README
1392 drwxr-xr-x 23 15806 15806 4096 Dec 12 2024 arch
669 drwxr-xr-x 3 15806 15806 4096 Dec 1 21:34 block
131073 drwxr-xr-x 2 15806 15806 4096 Dec 12 2024 certs
31050 drwxr-xr-x 4 15806 15806 4096 Dec 1 21:34 crypto
143839 drwxr-xr-x 143 15806 15806 4096 Dec 12 2024 drivers
140662 drwxr-xr-x 81 15806 15806 4096 Dec 1 21:34 fs
134043 drwxr-xr-x 32 15806 15806 4096 Dec 12 2024 include
31035 drwxr-xr-x 2 15806 15806 4096 Dec 1 21:34 init
140577 drwxr-xr-x 2 15806 15806 4096 Dec 1 21:34 io_uring
140648 drwxr-xr-x 2 15806 15806 4096 Dec 1 21:34 ipc
771 drwxr-xr-x 22 15806 15806 4096 Dec 1 21:34 kernel
143244 drwxr-xr-x 20 15806 15806 12288 Dec 1 21:34 lib
11 drwx------ 2 root root 16384 Feb 6 16:34 lost+found
22149 drwxr-xr-x 6 15806 15806 4096 Dec 1 21:34 mm
19736 drwxr-xr-x 72 15806 15806 4096 Dec 12 2024 net
42649 drwxr-xr-x 7 15806 15806 4096 Dec 1 21:34 rust
349 drwxr-xr-x 42 15806 15806 4096 Dec 12 2024 samples
42062 drwxr-xr-x 19 15806 15806 12288 Dec 1 21:34 scripts
15 drwxr-xr-x 15 15806 15806 4096 Dec 1 21:34 security
131086 drwxr-xr-x 27 15806 15806 4096 Dec 12 2024 sound
22351 drwxr-xr-x 45 15806 15806 4096 Dec 12 2024 tools
31019 drwxr-xr-x 4 15806 15806 4096 Dec 12 2024 usr
324 drwxr-xr-x 4 15806 15806 4096 Dec 12 2024 virt
Note how different directories have different inode numbers, which are
in different block groups. This is how we naturally spread block
allocations across different block groups. This is *specifically* to
spread block allocations across the entire storage device. So for example:
root@kvm-xfstests:~# filefrag -v /vdc/arch/Kconfig
Filesystem type is: ef53
File size of /vdc/arch/Kconfig is 51709 (13 blocks of 4096 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 12: 67551.. 67563: 13: last,eof
/vdc/arch/Kconfig: 1 extent found
root@kvm-xfstests:~# filefrag -v /vdc/sound/Makefile
Filesystem type is: ef53
File size of /vdc/sound/Makefile is 562 (1 block of 4096 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 0: 574197.. 574197: 1: last,eof
/vdc/sound/Makefile: 1 extent found
See? They are not confined to the low LBA's. Quod Erat Demonstrandum.
By the way, spreading block allocations across LBA's was not done
because of a concern about flash storage. The ext2, ext3, and ext4
filesystems have had this support for over a quarter of a century,
because spreading the blocks across the file system avoids file
fragmentation. It's a technique that we took from BSD's Fast File
System, called the Orlov algorithm. For more information, see [1], or
the ext4 sources [2].
[1] https://en.wikipedia.org/wiki/Orlov_block_allocator
[2] https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git/tree/fs/ext4/ialloc.c#n398
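For readers following the references, a greatly simplified sketch of the Orlov-style spread for new top-level directories (illustrative only; the real logic referenced in [2] also weighs directory counts and a "debt" heuristic): prefer a block group with above-average free inodes and free blocks.

        #include <stddef.h>

        struct group_stats {
                unsigned long free_inodes;
                unsigned long free_blocks;
        };

        /*
         * Prefer a block group with above-average free inodes and free blocks.
         * The caller falls back to a linear scan if no group qualifies.
         */
        static int pick_dir_group(const struct group_stats *g, size_t ngroups,
                                  unsigned long total_free_inodes,
                                  unsigned long total_free_blocks)
        {
                unsigned long avg_inodes = total_free_inodes / ngroups;
                unsigned long avg_blocks = total_free_blocks / ngroups;

                for (size_t i = 0; i < ngroups; i++)
                        if (g[i].free_inodes > avg_inodes &&
                            g[i].free_blocks > avg_blocks)
                                return (int)i;
                return -1;
        }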
> My concern is a potential policy interaction: filesystem locality
> policies tend to concentrate hot metadata and early allocations. During
> deallocation, we naturally discard/trim those blocks ASAP to make them
> ready for write, thus optimizing for speed, while at the same time signaling
> them as free. Meanwhile, an underlying WL policy (if present) tries to
> consume free blocks opportunistically.
> If these two interact poorly, the result can be a sustained bias toward
> low-LBA hot regions (as observable in practice).
> The elephant is in the room and is called “wear” / hotspots at the LBA
> start.
First of all, most of the "sustained bias towards low-LBA regions" is
not because of where data blocks are located, but because of the
location of static metadata blocks: in particular, the superblock,
block group descriptors, and the allocation bitmaps. Having static
metadata is not unique to ext2/ext3/ext4. The FAT file system has the
File Allocation Table in low-numbered LBA's, which are constantly
updated whenever blocks are allocated. Even log-structured file
systems, such as btrfs, f2fs, and ZFS, have a superblock at a static
location which gets rewritten at every file system commit.
Secondly, *because* all file systems rewrite certain LBA's, and because
of how flash erase blocks work, pretty much all flash translation
layers of the past two decades are *designed* to deal with it. Because
of digital cameras and the FAT file system, pretty much all flash
storage does *not* have a static mapping between a particular LBA and a
specific set of flash cells. The fact that you keep asserting that
"hotspots at the LBA start" are a problem indicates to me that you
don't understand how SSD's work in real life.
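To make the remapping point concrete, here is a toy page-mapping FTL model (all names and sizes invented, garbage collection omitted): every overwrite of an LBA lands on a fresh physical page and the old page is merely marked invalid, so repeatedly rewriting one LBA does not repeatedly wear one set of cells.

        #include <stdint.h>
        #include <string.h>

        #define NR_LBAS         1024
        #define NR_PAGES        4096
        #define NO_PAGE         UINT32_MAX

        static uint32_t l2p[NR_LBAS];           /* logical block -> physical page */
        static uint8_t  page_valid[NR_PAGES];
        static uint32_t next_free_page;

        static void ftl_init(void)
        {
                memset(page_valid, 0, sizeof(page_valid));
                for (uint32_t i = 0; i < NR_LBAS; i++)
                        l2p[i] = NO_PAGE;
                next_free_page = 0;
        }

        /* An overwrite of the same LBA lands on a different physical page. */
        static uint32_t ftl_write(uint32_t lba)
        {
                uint32_t old = l2p[lba];

                if (old != NO_PAGE)
                        page_valid[old] = 0;    /* old copy becomes garbage for GC */

                l2p[lba] = next_free_page;      /* fresh page, wear spread by the FTL */
                page_valid[next_free_page] = 1;
                return next_free_page++;        /* wrap-around/GC deliberately omitted */
        }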
So I commend to you these two articles:
https://flashdba.com/2014/06/20/understanding-flash-blocks-pages-and-program-erases/
https://flashdba.com/2014/09/17/understanding-flash-the-flash-translation-layer/
These web pages date from 12 years ago, because SSD technology is, in
2026, very old technology in an industry where two years == infinity.
For a more academic perspective, there's a paper from the 2009 First
International Conference on Advances in System Simulation, published
by researchers from Pennsylvania State University:
https://www.cse.psu.edu/~buu1/papers/ps/flashsim.pdf
FlashSim is available as open source, and has since been used by many
other researchers to explore improvements in the Flash Translation
Layer. And even the most basic FTL algorithms mean that your proposed
RotAlloc is ***pointless***. If you think otherwise, you're going to
need to provide convincing evidence.
> Again, we’re not focusing solely on wear leveling here, but since we
> can’t influence the WL implementation itself, the only lever we have is
> our own allocation policy.
You claim that you're not focusing on wear leveling, but every single
justification for your changes references "wear / hotspotting". I'm
trying to tell you that it's not an issue. If you think it *could* be
an issue, *ever*, you need to provide *proof* --- at the very least,
proof that you understand things like how flash erase blocks work, how
flash translation layers work, and how the Orlov block allocation
algorithm works. Because with all due respect, it appears that you
are profoundly ignorant, and it's not clear why we should be
respecting your opinion and your arguments. If you think we should,
you really need to up your game.
Regards,
- Ted
On 2/7/26 06:31, Theodore Tso wrote:
> On Fri, Feb 06, 2026 at 08:25:24PM +0100, Mario Lohajner wrote:
>> What is observable in practice, however, is persistent allocation locality
>> near the beginning of the LBA space under real workloads, and a
>> corresponding concentration of wear in that area, interestingly it seems to
>> be vendor-agnostic. = The force within is very strong :-)
>
> This is simply not true. Data blocks are *not* confined to the
> low-numbered LBA's in any kind of reasonable real-world situation. Why do
> you think this is true, and what was the experiment that led you to
> believe this?
>
> Let me show you *my* experiment:
>
> [...]
>
> FlashSim is available as open source, and has since been used by many
> other researchers to explore improvements in the Flash Translation
> Layer. And even the most basic FTL algorithms mean that your proposed
> RotAlloc is ***pointless***. If you think otherwise, you're going to
> need to provide convincing evidence.

Hi Ted,

Let me try to clarify this in a way that avoids talking past each other.

I fully agree with the allocator theory, the Orlov algorithm, and with
your demonstration. I am not disputing *anything*, nor have I ever
intended to. The pattern I keep referring to as "observable in practice"
is about repeated free -> reallocate cycles, allocator restart points,
and reuse bias - i.e., which regions of the address space are revisited
most frequently over time.

>> Again, we're not focusing solely on wear leveling here, but since we
>> can't influence the WL implementation itself, the only lever we have is
>> our own allocation policy.
>
> You claim that you're not focusing on wear leveling, but every single
> justification for your changes references "wear / hotspotting". I'm
> trying to tell you that it's not an issue. If you think it *could* be
> an issue, *ever*, you need to provide *proof* --- at the very least,
> proof that you understand things like how flash erase blocks work, how
> flash translation layers work, and how the Orlov block allocation
> algorithm works. Because with all due respect, it appears that you
> are profoundly ignorant, and it's not clear why we should be
> respecting your opinion and your arguments. If you think we should,
> you really need to up your game.
>
> Regards,
>
> 					- Ted

Although I admitted being WL-inspired right from the start, I maintain
that *this is not* wear leveling - WL deals with reallocations,
translations, amplification history... This simply *is not* that.
Calling it "wear leveling" would be like an election promise - it might,
but probably won't, come true.

The question I'm raising is much narrower: whether allocator policy
choices can unintentionally reinforce reuse patterns under certain
workloads - and whether offering an *alternative policy* is reasonable
(I dare to say, in some cases more optimal).

I was consciously avoiding turning this into a "your stats vs. my stats"
and/or "your methods vs. my methods" discussion. However, to avoid
arguing from theory alone, I will follow up with a small set of
real-world examples:

https://github.com/mlohajner/elephant-in-the-room

These are snapshots from different systems, illustrating the point I'm
presenting here. Provided as-is, without annotations; while they do not
show the allocation bitmap explicitly, they are statistically correlated
with the most frequently used blocks/groups across the LBA space.

Given that another maintainer has already expressed support for making
this an *optional policy, disabled by default*, I believe this
discussion is less about allocator theory correctness and more about
whether accommodating real-world workload diversity is desirable.

Regards,
Mario

P.S. I'm so altruistic I dare say this out loud: at this point, my other
concern is this: if we reach common ground and make it optional, and it
truly helps more than it hurts, who will actually ever use it? :-)
(Assuming end users even know it exists, to adopt it in a way that feels
like a natural progression/improvement.)
On Sat, Feb 07, 2026 at 01:45:06PM +0100, Mario Lohajner wrote:
> The pattern I keep referring to as "observable in practice" is about
> repeated free -> reallocate cycles, allocator restart points, and reuse
> bias - i.e., which regions of the address space are revisited most
> frequently over time.

But you haven't proved that this *matters*. You need to justify **why**
we should care that portions of the address space are revisited more
frequently. Why is it worth the code complexity and maintenance
overhead? "Because" is not a sufficient answer.

> The question I'm raising is much narrower: whether allocator
> policy choices can unintentionally reinforce reuse patterns under
> certain workloads - and whether offering an *alternative policy* is
> reasonable (I dare to say, in some cases more optimal).

Optimal WHY? You have yet to show any reason, other than wear leveling,
why reusing portions of the LBA space is problematic, and why avoiding
said reuse might be worthwhile.

In fact, there is an argument to be made that an SSD-specific
allocation algorithm which aggressively tries to reuse recently deleted
blocks would result in better performance. Why? Because it is an
implicit discard --- overwriting the LBA tells the Flash Translation
Layer that the previous contents of the flash associated with the LBA
are no longer needed, without the overhead of sending an explicit
discard request. Discards are expensive for the FTL, and so under
heavy I/O pressure some FTL implementations will simply ignore the
discard request in favor of serving immediate I/O requests, even if
this results in more garbage collection overhead later.

However, we've never done this because it wasn't clear the complexity
was worth it --- and whenever you make changes to the block allocation
algorithm, it's important to make sure performance and file
fragmentation work well across a large number of workloads and a wide
variety of different flash storage devices, both when the file system
is freshly formatted and after the equivalent of years of file system
aging (that is, after long-term use). For more information, see
[1][2][3].

[1] https://www.cs.williams.edu/~jannen/teaching/s21/cs333/meetings/aging.html
[2] https://www.usenix.org/conference/hotstorage19/presentation/conway
[3] https://dl.acm.org/doi/10.1145/258612.258689

So an SSD-specific allocation policy which encourages and embraces
reuse of LBA's (and NOT avoiding reuse) has a lot more theoretical and
principled support. But despite that, the questions of "is this really
worth the extra complexity" and "can we make sure that it works well
across a wide variety of workloads and with both new and aged file
systems" haven't been answered satisfactorily yet. The way to answer
these questions would require running benchmarks and file system aging
tools, such as those described in [3], while creating prototype
changes. Hand-waving is enough for the creation of prototypes and
proof-of-concept patches. But it's not enough for something that we
would merge into the upstream kernel.

Cheers,

					- Ted
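As a sketch of the reuse-friendly idea mentioned above (hypothetical, not something in ext4 or in the rotalloc patch): keep a small LIFO of recently freed extents and offer the most recent suitable one as the allocation goal, so that overwrites act as implicit discards.

        #include <stdbool.h>

        struct freed_extent {
                unsigned long start;
                unsigned long len;
        };

        #define REUSE_DEPTH 16

        static struct freed_extent reuse_stack[REUSE_DEPTH];
        static int reuse_top;

        /* Remember a freed extent; silently drop it if the stack is full. */
        static void note_freed(unsigned long start, unsigned long len)
        {
                if (reuse_top < REUSE_DEPTH)
                        reuse_stack[reuse_top++] =
                                (struct freed_extent){ start, len };
        }

        /* Suggest the most recently freed extent that is big enough, if any. */
        static bool suggest_reuse_goal(unsigned long want, unsigned long *goal)
        {
                for (int i = reuse_top - 1; i >= 0; i--) {
                        if (reuse_stack[i].len >= want) {
                                *goal = reuse_stack[i].start;
                                return true;
                        }
                }
                return false;
        }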
On Feb 3, 2026, at 20:31, Mario Lohajner <mario_lohajner@rocketmail.com> wrote:
>
> Add support for the rotalloc allocation policy as a new mount
> option. Policy rotates the starting block group for new allocations.
>
> Changes:
> - fs/ext4/ext4.h
> rotalloc policy declared, extend sb with cursor, vector & lock
>
> - fs/ext4/mballoc.h
> expose allocator functions for vectoring in super.c
>
> - fs/ext4/super.c
> parse rotalloc mnt opt, init cursor, lock and allocator vector
>
> - fs/ext4/mballoc.c
> add rotalloc allocator, vectored allocator call in new_blocks
>
> The policy is selected via a mount option and does not change the
> on-disk format or default allocation behavior. It preserves existing
> allocation heuristics within a block group while distributing
> allocations across block groups in a deterministic sequential manner.
>
> The rotating allocator is implemented as a separate allocation path
> selected at mount time. This avoids conditional branches in the regular
> allocator and keeps allocation policies isolated.
> This also allows the rotating allocator to evolve independently in the
> future without increasing complexity in the regular allocator.
>
> The policy was tested using v6.18.6 stable locally with the new mount
> option "rotalloc" enabled, confirmed working as desribed!
Hi Mario,
can you please provide some background/reasoning behind this allocator?
I suspect there are good reasons/workloads that could benefit from it
(e.g. flash wear leveling), but that should be stated in the commit
message, and preferably with some benchmarks/measurements that show
some benefit from adding this feature.
Cheers, Andreas
>
> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
> ---
> fs/ext4/ext4.h | 8 +++
> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/ext4/mballoc.h | 3 +
> fs/ext4/super.c | 18 +++++-
> 4 files changed, 175 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56112f201cac..cbbb7c05d7a2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
> unsigned int flags;
> };
>
> +/* expose rotalloc allocator argument pointer type */
> +struct ext4_allocation_context;
> +
> /*
> * Logical to physical block mapping, used by ext4_map_blocks()
> *
> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
> * Mount flags set via mount options or defaults
> */
> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
> unsigned long s_mount_flags;
> unsigned int s_def_mount_opt;
> unsigned int s_def_mount_opt2;
> + /* Rotalloc cursor, lock & new_blocks allocator vector */
> + unsigned int s_rotalloc_cursor;
> + spinlock_t s_rotalloc_lock;
> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
> ext4_fsblk_t s_sb_block;
> atomic64_t s_resv_clusters;
> kuid_t s_resuid;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 56d50fd3310b..74f79652c674 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
> * stop the scan and use it immediately
> *
> * * If free extent found is smaller than goal, then keep retrying
> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
> * that stop scanning and use whatever we have.
> *
> * * If free extent found is bigger than goal, then keep retrying
> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
> * stopping the scan and using the extent.
> *
> *
> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
> return ret;
> }
>
> -static noinline_for_stack int
> +noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> ext4_group_t i;
> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> * is greater than equal to the sbi_s_mb_order2_reqs
> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> * We also support searching for power-of-two requests only for
> - * requests upto maximum buddy size we have constructed.
> + * requests up to maximum buddy size we have constructed.
> */
> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> if (is_power_of_2(ac->ac_g_ex.fe_len))
> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> return err;
> }
>
> +/* Rotating allocator (rotalloc mount option) */
> +noinline_for_stack int
> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
> +{
> + ext4_group_t i, goal;
> + int err = 0;
> + struct super_block *sb = ac->ac_sb;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_buddy e4b;
> +
> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> + /* Set the goal from s_rotalloc_cursor */
> + spin_lock(&sbi->s_rotalloc_lock);
> + goal = sbi->s_rotalloc_cursor;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + ac->ac_g_ex.fe_group = goal;
> +
> + /* first, try the goal */
> + err = ext4_mb_find_by_goal(ac, &e4b);
> + if (err || ac->ac_status == AC_STATUS_FOUND)
> + goto out;
> +
> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> + goto out;
> +
> + /*
> + * ac->ac_2order is set only if the fe_len is a power of 2
> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
> + * so that we try exact allocation using buddy.
> + */
> + i = fls(ac->ac_g_ex.fe_len);
> + ac->ac_2order = 0;
> + /*
> + * We search using buddy data only if the order of the request
> + * is greater than equal to the sbi_s_mb_order2_reqs
> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> + * We also support searching for power-of-two requests only for
> + * requests up to maximum buddy size we have constructed.
> + */
> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> + if (is_power_of_2(ac->ac_g_ex.fe_len))
> + ac->ac_2order = array_index_nospec(i - 1,
> + MB_NUM_ORDERS(sb));
> + }
> +
> + /* if stream allocation is enabled, use global goal */
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
> +
> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
> + ac->ac_g_ex.fe_start = -1;
> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
> + }
> +
> + /*
> + * Let's just scan groups to find more-less suitable blocks We
> + * start with CR_GOAL_LEN_FAST, unless it is power of 2
> + * aligned, in which case let's do that faster approach first.
> + */
> + ac->ac_criteria = CR_GOAL_LEN_FAST;
> + if (ac->ac_2order)
> + ac->ac_criteria = CR_POWER2_ALIGNED;
> +
> + ac->ac_e4b = &e4b;
> + ac->ac_prefetch_ios = 0;
> + ac->ac_first_err = 0;
> +
> + /* Be sure to start scanning with goal from s_rotalloc_cursor! */
> + ac->ac_g_ex.fe_group = goal;
> +repeat:
> + while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
> + err = ext4_mb_scan_groups(ac);
> + if (err)
> + goto out;
> +
> + if (ac->ac_status != AC_STATUS_CONTINUE)
> + break;
> + }
> +
> + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
> + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
> + /*
> + * We've been searching too long. Let's try to allocate
> + * the best chunk we've found so far
> + */
> + ext4_mb_try_best_found(ac, &e4b);
> + if (ac->ac_status != AC_STATUS_FOUND) {
> + int lost;
> +
> + /*
> + * Someone more lucky has already allocated it.
> + * The only thing we can do is just take first
> + * found block(s)
> + */
> + lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
> + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
> + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
> + ac->ac_b_ex.fe_len, lost);
> +
> + ac->ac_b_ex.fe_group = 0;
> + ac->ac_b_ex.fe_start = 0;
> + ac->ac_b_ex.fe_len = 0;
> + ac->ac_status = AC_STATUS_CONTINUE;
> + ac->ac_flags |= EXT4_MB_HINT_FIRST;
> + ac->ac_criteria = CR_ANY_FREE;
> + goto repeat;
> + }
> + }
> +
> + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
> + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
> + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
> + atomic_inc(&sbi->s_bal_stream_goals);
> + }
> +out:
> + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
> + err = ac->ac_first_err;
> +
> + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
> + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
> + ac->ac_flags, ac->ac_criteria, err);
> +
> + if (ac->ac_prefetch_nr)
> + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
> +
> + if (!err) {
> + /* Finally, if no errors, set the cursor to best group! */
> + goal = ac->ac_b_ex.fe_group;
> + spin_lock(&sbi->s_rotalloc_lock);
> + sbi->s_rotalloc_cursor = goal;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + }
> +
> + return err;
> +}
> +
> static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
> {
> struct super_block *sb = pde_data(file_inode(seq->file));
> @@ -6314,7 +6452,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> goto errout;
> repeat:
> /* allocate space in core */
> - *errp = ext4_mb_regular_allocator(ac);
> + /*
> + * Use vectored allocator instead of fixed
> + * ext4_mb_regular_allocator(ac) function
> + */
> + *errp = sbi->s_mb_new_blocks(ac);
> /*
> * pa allocated above is added to grp->bb_prealloc_list only
> * when we were able to allocate some block i.e. when
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 15a049f05d04..309190ce05ae 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -270,4 +270,7 @@ ext4_mballoc_query_range(
> ext4_mballoc_query_range_fn formatter,
> void *priv);
>
> +/* Expose rotating & regular allocators for vectoring */
> +int ext4_mb_rotating_allocator(struct ext4_allocation_context *ac);
> +int ext4_mb_regular_allocator(struct ext4_allocation_context *ac);
> #endif
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 87205660c5d0..f53501bbfb4b 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1673,7 +1673,7 @@ enum {
> Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
> Opt_inode_readahead_blks, Opt_journal_ioprio,
> Opt_dioread_nolock, Opt_dioread_lock,
> - Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_rotalloc,
> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
> Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
> @@ -1797,6 +1797,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
> fsparam_u32 ("init_itable", Opt_init_itable),
> fsparam_flag ("init_itable", Opt_init_itable),
> fsparam_flag ("noinit_itable", Opt_noinit_itable),
> + fsparam_flag ("rotalloc", Opt_rotalloc),
> #ifdef CONFIG_EXT4_DEBUG
> fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
> fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
> @@ -1878,6 +1879,7 @@ static const struct mount_opts {
> {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
> {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
> {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
> + {Opt_rotalloc, EXT4_MOUNT_ROTALLOC, MOPT_SET},
> {Opt_dax_type, 0, MOPT_EXT4_ONLY},
> {Opt_journal_dev, 0, MOPT_NO_EXT2},
> {Opt_journal_path, 0, MOPT_NO_EXT2},
> @@ -2264,6 +2266,9 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
> ctx->s_li_wait_mult = result.uint_32;
> ctx->spec |= EXT4_SPEC_s_li_wait_mult;
> return 0;
> + case Opt_rotalloc:
> + ctx_set_mount_opt(ctx, EXT4_MOUNT_ROTALLOC);
> + return 0;
> case Opt_max_dir_size_kb:
> ctx->s_max_dir_size_kb = result.uint_32;
> ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
> @@ -5512,6 +5517,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> }
> }
>
> + /*
> + * Initialize rotalloc cursor, lock and
> + * vector new_blocks to rotating^regular allocator
> + */
> + sbi->s_rotalloc_cursor = 0;
> + spin_lock_init(&sbi->s_rotalloc_lock);
> + if (test_opt(sb, ROTALLOC))
> + sbi->s_mb_new_blocks = ext4_mb_rotating_allocator;
> + else
> + sbi->s_mb_new_blocks = ext4_mb_regular_allocator;
> +
> /*
> * Get the # of file system overhead blocks from the
> * superblock if present.
> --
> 2.52.0
>
Cheers, Andreas
Hello Andreas,
Yes, the main motive for this allocator is flash wear leveling,
but it is not strictly a wear leveling mechanism, and it is not named
as such for a reason.
Wear leveling may (or may not) exist at the device/hardware level.
The goal of this policy is not to "fix" that.
This policy helps avoid allocation hotspots at mount start by
distributing allocations sequentially across the entire mount,
not just a file or allocation stream.
At the block/group allocation level, the file system is fairly stochastic
and timing-sensitive. Rather than providing raw benchmark data, I prefer
to explain the design analytically:
The vectored separation of the new allocator ensures that the performance
of the regular allocator is maintained (literally unchanged).
The overhead of the new rotating allocator is minimal and occurs outside
of the "hot loop":
the cursor is retrieved early at the start, updated upon successful
allocation,
and is negligible with respect to IO latency.
Because allocations proceed sequentially, latency is comparable to
or better than the regular allocator.
Having separated allocators increases maintainability and independence
with minimal (virtually no) overhead.
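A minimal userspace model of the cursor flow described above, mirroring the s_rotalloc_cursor handling in the patch (the scan itself is a trivial stand-in here, and all names are assumed): the cursor is read once before the scan and written back only after a successful allocation, so nothing is added inside the group-scan loop.

        #include <pthread.h>
        #include <stdbool.h>

        static pthread_mutex_t rot_lock = PTHREAD_MUTEX_INITIALIZER;
        static unsigned int rot_cursor;         /* models s_rotalloc_cursor */

        /* Trivial stand-in for the real group scan; always "finds" the goal. */
        static bool scan_groups_from(unsigned int start_group,
                                     unsigned int *found_group)
        {
                *found_group = start_group;
                return true;
        }

        bool rotalloc_allocate(unsigned int *found_group)
        {
                unsigned int goal;

                pthread_mutex_lock(&rot_lock);
                goal = rot_cursor;              /* read once, up front */
                pthread_mutex_unlock(&rot_lock);

                if (!scan_groups_from(goal, found_group))
                        return false;           /* cursor untouched on failure */

                pthread_mutex_lock(&rot_lock);
                rot_cursor = *found_group;      /* advance only on success */
                pthread_mutex_unlock(&rot_lock);
                return true;
        }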
This policy benefits workloads with frequent large or small allocations,
while keeping file fragmentation and slack space minimal.
It is a conscious trade-off: sacrificing locality in favor of reinforced
sequentiality.
Of course, this is not optimal for classic HDDs, but NVMe drives behave
differently.
For this reason, the policy is optional per mount, turned off by default,
and can be toggled at mount time.
Best regards,
Mario
On 04. 02. 2026. 04:53, Andreas Dilger wrote:
> On Feb 3, 2026, at 20:31, Mario Lohajner <mario_lohajner@rocketmail.com> wrote:
>> Add support for the rotalloc allocation policy as a new mount
>> option. Policy rotates the starting block group for new allocations.
>>
>> Changes:
>> - fs/ext4/ext4.h
>> rotalloc policy declared, extend sb with cursor, vector & lock
>>
>> - fs/ext4/mballoc.h
>> expose allocator functions for vectoring in super.c
>>
>> - fs/ext4/super.c
>> parse rotalloc mnt opt, init cursor, lock and allocator vector
>>
>> - fs/ext4/mballoc.c
>> add rotalloc allocator, vectored allocator call in new_blocks
>>
>> The policy is selected via a mount option and does not change the
>> on-disk format or default allocation behavior. It preserves existing
>> allocation heuristics within a block group while distributing
>> allocations across block groups in a deterministic sequential manner.
>>
>> The rotating allocator is implemented as a separate allocation path
>> selected at mount time. This avoids conditional branches in the regular
>> allocator and keeps allocation policies isolated.
>> This also allows the rotating allocator to evolve independently in the
>> future without increasing complexity in the regular allocator.
>>
>> The policy was tested using v6.18.6 stable locally with the new mount
>> option "rotalloc" enabled, confirmed working as desribed!
> Hi Mario,
> can you please provide some background/reasoning behind this allocator?
> I suspect there are good reasons/workloads that could benefit from it
> (e.g. flash wear leveling), but that should be stated in the commit
> message, and preferably with some benchmarks/measurements that show
> some benefit from adding this feature.
>
> Cheers, Andreas
>
>> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
>> ---
>> fs/ext4/ext4.h | 8 +++
>> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
>> fs/ext4/mballoc.h | 3 +
>> fs/ext4/super.c | 18 +++++-
>> 4 files changed, 175 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 56112f201cac..cbbb7c05d7a2 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
>> unsigned int flags;
>> };
>>
>> +/* expose rotalloc allocator argument pointer type */
>> +struct ext4_allocation_context;
>> +
>> /*
>> * Logical to physical block mapping, used by ext4_map_blocks()
>> *
>> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
>> * Mount flags set via mount options or defaults
>> */
>> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
>> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
>> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
>> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
>> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
>> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
>> unsigned long s_mount_flags;
>> unsigned int s_def_mount_opt;
>> unsigned int s_def_mount_opt2;
>> + /* Rotalloc cursor, lock & new_blocks allocator vector */
>> + unsigned int s_rotalloc_cursor;
>> + spinlock_t s_rotalloc_lock;
>> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
>> ext4_fsblk_t s_sb_block;
>> atomic64_t s_resv_clusters;
>> kuid_t s_resuid;
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 56d50fd3310b..74f79652c674 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
>> * stop the scan and use it immediately
>> *
>> * * If free extent found is smaller than goal, then keep retrying
>> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
>> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
>> * that stop scanning and use whatever we have.
>> *
>> * * If free extent found is bigger than goal, then keep retrying
>> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
>> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
>> * stopping the scan and using the extent.
>> *
>> *
>> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
>> return ret;
>> }
>>
>> -static noinline_for_stack int
>> +noinline_for_stack int
>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> {
>> ext4_group_t i;
>> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> * is greater than equal to the sbi_s_mb_order2_reqs
>> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>> * We also support searching for power-of-two requests only for
>> - * requests upto maximum buddy size we have constructed.
>> + * requests up to maximum buddy size we have constructed.
>> */
>> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>> if (is_power_of_2(ac->ac_g_ex.fe_len))
>> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> return err;
>> }
>>
>> +/* Rotating allocator (rotalloc mount option) */
>> +noinline_for_stack int
>> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
>> +{
>> + ext4_group_t i, goal;
>> + int err = 0;
>> + struct super_block *sb = ac->ac_sb;
>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>> + struct ext4_buddy e4b;
>> +
>> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>> +
>> + /* Set the goal from s_rotalloc_cursor */
>> + spin_lock(&sbi->s_rotalloc_lock);
>> + goal = sbi->s_rotalloc_cursor;
>> + spin_unlock(&sbi->s_rotalloc_lock);
>> + ac->ac_g_ex.fe_group = goal;
>> +
>> + /* first, try the goal */
>> + err = ext4_mb_find_by_goal(ac, &e4b);
>> + if (err || ac->ac_status == AC_STATUS_FOUND)
>> + goto out;
>> +
>> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
>> + goto out;
>> +
>> + /*
>> + * ac->ac_2order is set only if the fe_len is a power of 2
>> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
>> + * so that we try exact allocation using buddy.
>> + */
>> + i = fls(ac->ac_g_ex.fe_len);
>> + ac->ac_2order = 0;
>> + /*
>> + * We search using buddy data only if the order of the request
>> + * is greater than equal to the sbi_s_mb_order2_reqs
>> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
>> + * We also support searching for power-of-two requests only for
>> + * requests up to maximum buddy size we have constructed.
>> + */
>> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
>> + if (is_power_of_2(ac->ac_g_ex.fe_len))
>> + ac->ac_2order = array_index_nospec(i - 1,
>> + MB_NUM_ORDERS(sb));
>> + }
>> +
>> + /* if stream allocation is enabled, use global goal */
>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
>> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
>> +
>> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
>> + ac->ac_g_ex.fe_start = -1;
>> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
>> + }
>> +
>> + /*
>> + * Let's just scan groups to find more-less suitable blocks We
>> + * start with CR_GOAL_LEN_FAST, unless it is power of 2
>> + * aligned, in which case let's do that faster approach first.
>> + */
>> + ac->ac_criteria = CR_GOAL_LEN_FAST;
>> + if (ac->ac_2order)
>> + ac->ac_criteria = CR_POWER2_ALIGNED;
>> +
>> + ac->ac_e4b = &e4b;
>> + ac->ac_prefetch_ios = 0;
>> + ac->ac_first_err = 0;
>> +
>> + /* Be sure to start scanning with goal from s_rotalloc_cursor! */
>> + ac->ac_g_ex.fe_group = goal;
>> +repeat:
>> + while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
>> + err = ext4_mb_scan_groups(ac);
>> + if (err)
>> + goto out;
>> +
>> + if (ac->ac_status != AC_STATUS_CONTINUE)
>> + break;
>> + }
>> +
>> + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
>> + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
>> + /*
>> + * We've been searching too long. Let's try to allocate
>> + * the best chunk we've found so far
>> + */
>> + ext4_mb_try_best_found(ac, &e4b);
>> + if (ac->ac_status != AC_STATUS_FOUND) {
>> + int lost;
>> +
>> + /*
>> + * Someone more lucky has already allocated it.
>> + * The only thing we can do is just take first
>> + * found block(s)
>> + */
>> + lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
>> + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
>> + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
>> + ac->ac_b_ex.fe_len, lost);
>> +
>> + ac->ac_b_ex.fe_group = 0;
>> + ac->ac_b_ex.fe_start = 0;
>> + ac->ac_b_ex.fe_len = 0;
>> + ac->ac_status = AC_STATUS_CONTINUE;
>> + ac->ac_flags |= EXT4_MB_HINT_FIRST;
>> + ac->ac_criteria = CR_ANY_FREE;
>> + goto repeat;
>> + }
>> + }
>> +
>> + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
>> + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
>> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
>> + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
>> + atomic_inc(&sbi->s_bal_stream_goals);
>> + }
>> +out:
>> + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
>> + err = ac->ac_first_err;
>> +
>> + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
>> + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
>> + ac->ac_flags, ac->ac_criteria, err);
>> +
>> + if (ac->ac_prefetch_nr)
>> + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
>> +
>> + if (!err) {
>> + /* Finally, if no errors, set the cursor to the best group! */
>> + goal = ac->ac_b_ex.fe_group;
>> + spin_lock(&sbi->s_rotalloc_lock);
>> + sbi->s_rotalloc_cursor = goal;
>> + spin_unlock(&sbi->s_rotalloc_lock);
>> + }
>> +
>> + return err;
>> +}
>> +
>> static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
>> {
>> struct super_block *sb = pde_data(file_inode(seq->file));
>> @@ -6314,7 +6452,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
>> goto errout;
>> repeat:
>> /* allocate space in core */
>> - *errp = ext4_mb_regular_allocator(ac);
>> + /*
>> + * Use the vectored allocator instead of the fixed
>> + * ext4_mb_regular_allocator(ac) function
>> + */
>> + *errp = sbi->s_mb_new_blocks(ac);
>> /*
>> * pa allocated above is added to grp->bb_prealloc_list only
>> * when we were able to allocate some block i.e. when
>> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
>> index 15a049f05d04..309190ce05ae 100644
>> --- a/fs/ext4/mballoc.h
>> +++ b/fs/ext4/mballoc.h
>> @@ -270,4 +270,7 @@ ext4_mballoc_query_range(
>> ext4_mballoc_query_range_fn formatter,
>> void *priv);
>>
>> +/* Expose rotating & regular allocators for vectoring */
>> +int ext4_mb_rotating_allocator(struct ext4_allocation_context *ac);
>> +int ext4_mb_regular_allocator(struct ext4_allocation_context *ac);
>> #endif
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 87205660c5d0..f53501bbfb4b 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -1673,7 +1673,7 @@ enum {
>> Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
>> Opt_inode_readahead_blks, Opt_journal_ioprio,
>> Opt_dioread_nolock, Opt_dioread_lock,
>> - Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
>> + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_rotalloc,
>> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
>> Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
>> Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
>> @@ -1797,6 +1797,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
>> fsparam_u32 ("init_itable", Opt_init_itable),
>> fsparam_flag ("init_itable", Opt_init_itable),
>> fsparam_flag ("noinit_itable", Opt_noinit_itable),
>> + fsparam_flag ("rotalloc", Opt_rotalloc),
>> #ifdef CONFIG_EXT4_DEBUG
>> fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
>> fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
>> @@ -1878,6 +1879,7 @@ static const struct mount_opts {
>> {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
>> {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
>> {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
>> + {Opt_rotalloc, EXT4_MOUNT_ROTALLOC, MOPT_SET},
>> {Opt_dax_type, 0, MOPT_EXT4_ONLY},
>> {Opt_journal_dev, 0, MOPT_NO_EXT2},
>> {Opt_journal_path, 0, MOPT_NO_EXT2},
>> @@ -2264,6 +2266,9 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
>> ctx->s_li_wait_mult = result.uint_32;
>> ctx->spec |= EXT4_SPEC_s_li_wait_mult;
>> return 0;
>> + case Opt_rotalloc:
>> + ctx_set_mount_opt(ctx, EXT4_MOUNT_ROTALLOC);
>> + return 0;
>> case Opt_max_dir_size_kb:
>> ctx->s_max_dir_size_kb = result.uint_32;
>> ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
>> @@ -5512,6 +5517,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
>> }
>> }
>>
>> + /*
>> + * Initialize rotalloc cursor, lock and
>> + * vector new_blocks to the rotating or regular allocator
>> + */
>> + sbi->s_rotalloc_cursor = 0;
>> + spin_lock_init(&sbi->s_rotalloc_lock);
>> + if (test_opt(sb, ROTALLOC))
>> + sbi->s_mb_new_blocks = ext4_mb_rotating_allocator;
>> + else
>> + sbi->s_mb_new_blocks = ext4_mb_regular_allocator;
>> +
>> /*
>> * Get the # of file system overhead blocks from the
>> * superblock if present.
>> --
>> 2.52.0
>>
>
> Cheers, Andreas
>
On Wed, Feb 04, 2026 at 12:07:57PM +0100, Mario Lohajner wrote:
> Yes, the main motive for this allocator is flash wear leveling,
> but it is not strictly a wear leveling mechanism, and it is not named
> as such for a reason.

If the device needs such a flash wear leveling scheme, it's very likely
that it's not going to work very well for ext4, because there will be
*far* more writes to statically located metadata --- the superblock,
inode table, allocation bitmaps, which are scattered across the LBA
space --- which could potentially cause problems for such a flash
device.

In practice, even the simplest Flash Translation Layer implementations
do not require this, so I question whether devices that would need this
actually exist. Even the cheapest flash devices, for low-cost mobile
devices and digital cameras, have not needed this in the 30-plus years
that commercial flash storage has been around, and the
micro-controllers which implement the FTL have been getting more
sophisticated, not less.

Do you have a specific flash storage device where this would be
helpful? Or is this a hypothetical exercise?

> This policy helps avoid allocation hotspots at mount start by
> distributing allocations sequentially across the entire mount,
> not just a file or allocation stream.

Why are you worrying about allocation hotspots? What's the high-level
problem that you are trying to address, if it is not about wear
leveling?

> At the block/group allocation level, the file system is fairly stochastic
> and timing-sensitive. Rather than providing raw benchmark data, I prefer
> to explain the design analytically:

Whether you use raw benchmarks or thought experiments, you really need
to specify your assumptions about the nature of (a) the storage device
and (b) the workload.

For example, if the flash device has such a primitive, terrible flash
translation layer that the file system needs to handle wear leveling,
it's generally the cheapest, most trashy storage device that can be
imagined. In those cases, the bottleneck will likely be read/write
speed. So we probably don't need to worry about block allocator
performance while writing to this storage device, because the I/O
throughput and latency are probably comparable to the worst possible
USB thumb drive that you might find in the checkout line of a drug
store.

On the workload side, how many files do you expect the system to be
writing in parallel? For example, is the user going to be running
"make -j32" while building some software project? Probably not, because
why would you connect a really powerful AMD Threadripper CPU to the
cheapest possible trash flash device? That would be a system that is
very much out of balance. But if this is going to be a low-demand,
low-power use case, then you might be able to use an even simpler
allocator --- say, like what the FAT file system uses.

Speaking of FAT, depending on the quality of the storage device and the
benchmark results, perhaps another file system would be a better
choice. In addition to FAT, another file system to consider is f2fs,
which is a log-structured file system that avoids the static inode
table that might be a problem for a flash device that needs
file-system-aware wear leveling.

> Of course, this is not optimal for classic HDDs, but NVMe drives behave
> differently.

I'm not aware of *any* NVMe devices that would find this advantageous.
This is where some real benchmarks on real hardware, with a specific
workload that is used in real-world devices, would be really helpful.

Cheers,

					- Ted
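[Editor's note: to make the policy under discussion easier to picture, below is a
minimal, single-threaded user-space sketch of the rotating-cursor idea. NGROUPS,
GROUP_CAP, scan_from() and rotating_alloc() are illustrative stand-ins, not ext4
code; only the shape of the logic (read the shared cursor under a lock, scan
forward from it, write the group actually used back into the cursor) follows the
patch quoted earlier in the thread.]

/*
 * Toy model of a rotating allocation cursor.  Each allocation starts
 * scanning at the shared cursor; on success the cursor is moved to the
 * group that was actually used, so new allocations walk forward through
 * the groups as earlier ones fill up.
 */
#include <pthread.h>
#include <stdio.h>

#define NGROUPS		4	/* pretend block-group count */
#define GROUP_CAP	3	/* pretend free blocks per group */

static int free_in[NGROUPS] = { GROUP_CAP, GROUP_CAP, GROUP_CAP, GROUP_CAP };
static unsigned int cursor;	/* plays the role of s_rotalloc_cursor */
static pthread_mutex_t cursor_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the group scan: take one block from the first group with space. */
static int scan_from(unsigned int start)
{
	for (unsigned int i = 0; i < NGROUPS; i++) {
		unsigned int g = (start + i) % NGROUPS;

		if (free_in[g] > 0) {
			free_in[g]--;
			return (int)g;
		}
	}
	return -1;		/* no space anywhere */
}

static int rotating_alloc(void)
{
	unsigned int start;
	int got;

	pthread_mutex_lock(&cursor_lock);
	start = cursor;
	pthread_mutex_unlock(&cursor_lock);

	got = scan_from(start);
	if (got >= 0) {
		/* Next allocation starts scanning where this one landed. */
		pthread_mutex_lock(&cursor_lock);
		cursor = (unsigned int)got;
		pthread_mutex_unlock(&cursor_lock);
	}
	return got;
}

int main(void)
{
	for (int i = 0; i < NGROUPS * GROUP_CAP; i++)
		printf("allocation %2d -> group %d\n", i, rotating_alloc());
	return 0;
}

Running this prints group 0 three times, then group 1, and so on: the same
deterministic sequential spread across groups that the patch describes. In the
patch itself the cursor lives in struct ext4_sb_info, the scan is ext4's normal
group scan, and the cursor is only written back under s_rotalloc_lock at the end
of a successful call.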