Currently, ext4's implementation of fsmap converts the ranges to cluster
units if bigalloc is enabled and then converts to block units whenever
needed. However, this back and forth conversion has known to cause
several edge case issues since even with bigalloc the metadata is still
in block size unit.
Hence, convert the code to use block size instead of cluster size.
Cluster size is only used when dealing with mballoc bitmap. This takes
care of the existing issues with the code, example, for a mapping as
follows:
xfs_io -c fsmap -d (bs=4096, clustersize=65536)
0: 254:16 [0..7]: static fs metadata 8
1: 254:16 [8..15]: special 102:1 8
2: 254:16 [16..327]: special 102:2 312
3: 254:16 [328..351]: special 102:3 24
4: 254:16 [352..375]: special 102:4 24
...
xfs_io -c fsmap -d 6 16 (before this patch):
0: 254:16 [0..7]: static fs metadata 8
1: 254:16 [8..23]: unknown 16 <--- incorrect/ambiguous entry
xfs_io -c fsmap -d 6 16 (after this patch):
0: 254:16 [0..7]: static fs metadata 8
1: 254:16 [8..15]: special 102:1 8
2: 254:16 [16..327]: special 102:2 312
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
fs/ext4/fsmap.c | 42 +++++++++++++++++++++++-------------------
fs/ext4/mballoc.c | 25 +++++++++++++++++--------
2 files changed, 40 insertions(+), 27 deletions(-)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 22fc333244ef..02e5d501dda8 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -193,13 +193,11 @@ static int ext4_getfsmap_meta_helper(struct super_block *sb,
struct ext4_getfsmap_info *info = priv;
struct ext4_fsmap *p;
struct ext4_fsmap *tmp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t fsb, fs_start, fs_end;
int error;
- fs_start = fsb = (EXT4_C2B(sbi, start) +
- ext4_group_first_block_no(sb, agno));
- fs_end = fs_start + EXT4_C2B(sbi, len);
+ fs_start = fsb = start + ext4_group_first_block_no(sb, agno);
+ fs_end = fs_start + len;
/*
* Return relevant extents from the meta_list. We emit all extents that
@@ -248,13 +246,12 @@ static int ext4_getfsmap_datadev_helper(struct super_block *sb,
struct ext4_getfsmap_info *info = priv;
struct ext4_fsmap *p;
struct ext4_fsmap *tmp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t fsb;
ext4_fsblk_t fslen;
int error;
- fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno));
- fslen = EXT4_C2B(sbi, len);
+ fsb = start + ext4_group_first_block_no(sb, agno);
+ fslen = len;
/* If the retained free extent record is set... */
if (info->gfi_lastfree.fmr_owner) {
@@ -531,13 +528,13 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start_fsb;
- ext4_fsblk_t end_fsb;
+ ext4_fsblk_t end_fsb, orig_end_fsb;
ext4_fsblk_t bofs;
ext4_fsblk_t eofs;
ext4_group_t start_ag;
ext4_group_t end_ag;
- ext4_grpblk_t first_cluster;
- ext4_grpblk_t last_cluster;
+ ext4_grpblk_t first_grpblk;
+ ext4_grpblk_t last_grpblk;
struct ext4_fsmap irec;
int error = 0;
@@ -553,10 +550,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
return 0;
start_fsb = keys[0].fmr_physical;
end_fsb = keys[1].fmr_physical;
+ orig_end_fsb = end_fsb;
- /* Determine first and last group to examine based on start and end */
- ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster);
- ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster);
+ /*
+ * Determine first and last group to examine based on start and end.
+ * NOTE: do_div() should take ext4_fsblk_t instead of ext4_group_t as
+ * first argument else it will fail on 32bit archs.
+ */
+ first_grpblk = do_div(start_fsb, EXT4_BLOCKS_PER_GROUP(sb));
+ last_grpblk = do_div(end_fsb, EXT4_BLOCKS_PER_GROUP(sb));
+
+ start_ag = start_fsb;
+ end_ag = end_fsb;
/*
* Convert the fsmap low/high keys to bg based keys. Initialize
@@ -564,7 +569,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
* of the bg.
*/
info->gfi_low = keys[0];
- info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster);
+ info->gfi_low.fmr_physical = first_grpblk;
info->gfi_low.fmr_length = 0;
memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
@@ -584,8 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
*/
if (info->gfi_agno == end_ag) {
info->gfi_high = keys[1];
- info->gfi_high.fmr_physical = EXT4_C2B(sbi,
- last_cluster);
+ info->gfi_high.fmr_physical = last_grpblk;
info->gfi_high.fmr_length = 0;
}
@@ -600,8 +604,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
info->gfi_high.fmr_owner);
error = ext4_mballoc_query_range(sb, info->gfi_agno,
- EXT4_B2C(sbi, info->gfi_low.fmr_physical),
- EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ info->gfi_low.fmr_physical,
+ info->gfi_high.fmr_physical,
ext4_getfsmap_meta_helper,
ext4_getfsmap_datadev_helper, info);
if (error)
@@ -627,7 +631,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
* any allocated blocks at the end of the range.
*/
irec.fmr_device = 0;
- irec.fmr_physical = end_fsb + 1;
+ irec.fmr_physical = orig_end_fsb + 1;
irec.fmr_length = 0;
irec.fmr_owner = EXT4_FMR_OWN_FREE;
irec.fmr_flags = 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5898d92ba19f..cf78776940dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -7117,6 +7117,8 @@ ext4_mballoc_query_range(
ext4_grpblk_t start, next;
struct ext4_buddy e4b;
int error;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_grpblk_t start_c, end_c, next_c;
error = ext4_mb_load_buddy(sb, group, &e4b);
if (error)
@@ -7125,9 +7127,9 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = max(e4b.bd_info->bb_first_free, first);
- if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
- end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ start = max(EXT4_C2B(sbi, e4b.bd_info->bb_first_free), first);
+ if (end >= EXT4_BLOCKS_PER_GROUP(sb))
+ end = EXT4_BLOCKS_PER_GROUP(sb) - 1;
if (meta_formatter && start != first) {
if (start > end)
start = end;
@@ -7138,19 +7140,26 @@ ext4_mballoc_query_range(
goto out_unload;
ext4_lock_group(sb, group);
}
- while (start <= end) {
- start = mb_find_next_zero_bit(bitmap, end + 1, start);
- if (start > end)
+
+ start_c = EXT4_B2C(sbi, start);
+ end_c = EXT4_B2C(sbi, end);
+
+ while (start_c <= end_c) {
+ start_c = mb_find_next_zero_bit(bitmap, end_c + 1, start_c);
+ if (start_c > end_c)
break;
- next = mb_find_next_bit(bitmap, end + 1, start);
+ next_c = mb_find_next_bit(bitmap, end_c + 1, start_c);
ext4_unlock_group(sb, group);
+
+ start = EXT4_C2B(sbi, start_c);
+ next = EXT4_C2B(sbi, next_c);
error = formatter(sb, group, start, next - start, priv);
if (error)
goto out_unload;
ext4_lock_group(sb, group);
- start = next + 1;
+ start_c = next_c + 1;
}
ext4_unlock_group(sb, group);
--
2.49.0
On Fri, Sep 05, 2025 at 01:44:47PM +0530, Ojaswin Mujoo wrote: > Currently, ext4's implementation of fsmap converts the ranges to cluster > units if bigalloc is enabled and then converts to block units whenever > needed. However, this back and forth conversion has known to cause > several edge case issues since even with bigalloc the metadata is still > in block size unit.... This commit causes ext4/028 to fail with a 1k blocksize. The failure happens after just under 45 minutes; before this commit, ext4/028 would complete after a second. Do you have a preference regarding whether I just drop this commit, or drop the whole series? The previous patch looks fine to me and fixes a real problem, so my plan is to keep the 1/2 commit and drop this one. Cheers, - Ted
On Fri, Sep 26, 2025 at 08:35:36AM -0400, Theodore Ts'o wrote: > On Fri, Sep 05, 2025 at 01:44:47PM +0530, Ojaswin Mujoo wrote: > > Currently, ext4's implementation of fsmap converts the ranges to cluster > > units if bigalloc is enabled and then converts to block units whenever > > needed. However, this back and forth conversion has known to cause > > several edge case issues since even with bigalloc the metadata is still > > in block size unit.... > > This commit causes ext4/028 to fail with a 1k blocksize. The failure > happens after just under 45 minutes; before this commit, ext4/028 > would complete after a second. > > Do you have a preference regarding whether I just drop this commit, or > drop the whole series? The previous patch looks fine to me and fixes > a real problem, so my plan is to keep the 1/2 commit and drop this > one. Hey Ted, Thanks for pointing this out, I'll look into the failure. Sorry for the late reply I've been on vacation this week. I'll get to it as soon as possible. Regards, ojaswin > > Cheers, > > - Ted
© 2016 - 2026 Red Hat, Inc.