From nobody Mon Feb 9 13:35:21 2026 Received: from out30-131.freemail.mail.aliyun.com (out30-131.freemail.mail.aliyun.com [115.124.30.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 160E42236EF for ; Mon, 10 Mar 2025 09:55:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.131 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600520; cv=none; b=XYLe8bA8PWROTmJzHs75gaA53WeM6nRXn7F2wpZtcpFHFdVJXfDUv/scBc/C8vfNSoDZxqsxRmzpBSHlMIGDWV6XmwvvAp/msDLnjv9mOhHsNhkqoDlPs3BxiTcQVDDo84xlcfrCE9rRmFP2RU4aaWanIDPHC4iKqrJ3O8wPnO4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600520; c=relaxed/simple; bh=lGem3y17GOA2vjgE9Qcp1uMHhRBKPopkROLpNxh6ujI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gAHGWWa69eLtHMi3ciE/qnSAkVdHTn2atbsSnF7DxnBWJ8pvubwrwEcplo69XjsjpIsdfzeJ92Zq4/G1te7vD5ouxIwL5AMZaHXYQzGT06Tc6/pyMH82oUEyBXAXpna+sov5fn4PRxflXYDTtJ/oYs1/09dFBnSxyplCa+rtAaE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=Yt3N8ESk; arc=none smtp.client-ip=115.124.30.131 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="Yt3N8ESk" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600507; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=dPqt8cykj9yE5zj2nUPmPKdQRAButXykg1ZWEVJBkQM=; 
b=Yt3N8ESkc9Sj7RKJCX/lvQ89Kf2PrdAapduWxqHXIeUMusahYgLKxpg4OHWA2L3WbST9erJiEymH/RrO8cxs7vfcbbUDT96miigsu1R6ZE8REacDgo+vL37Br3qvMBE7qMBtrI+HOvV4WKqMlSTlGiZD2++hpDpTSrDCc5MxzZc= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3yP_1741600506 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:07 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 03/10] erofs: add 48-bit block addressing on-disk support Date: Mon, 10 Mar 2025 17:54:53 +0800 Message-ID: <20250310095459.2620647-4-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The current 32-bit block addressing limits EROFS to a 16TiB maximum volume size with 4KiB blocks. However, several new use cases now require larger capacity support: - Massive datasets for model training in order to boost random sampling performance for each epoch; - Object storage clients using EROFS direct passthrough. This extends core on-disk structures, such as inodes, device slots, and inode chunks, to support 48-bit block addressing. Additionally: - Expand the superblock root NID to an 8-byte `rootnid_8b` field to enable full out-of-place update incremental builds; - Introduce an `epoch` field in the superblock and add an `mtime` field to 32-byte compact inodes for basic timestamp support. 
Signed-off-by: Gao Xiang --- fs/erofs/data.c | 15 ++++---- fs/erofs/erofs_fs.h | 91 +++++++++++++++++++++------------------------ fs/erofs/inode.c | 6 +-- fs/erofs/internal.h | 6 +-- fs/erofs/super.c | 12 +++--- 5 files changed, 61 insertions(+), 69 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 2f45e39ce8c7..3c4a4eaffe8c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -95,7 +95,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_ma= p_blocks *map) =20 map->m_flags =3D EROFS_MAP_MAPPED; if (map->m_la < pos) { - map->m_pa =3D erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_pa =3D erofs_pos(sb, vi->startblk) + map->m_la; map->m_llen =3D pos - map->m_la; } else { map->m_pa =3D erofs_iloc(inode) + vi->inode_isize + @@ -124,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_= map_blocks *map) map->m_llen =3D min_t(erofs_off_t, 1UL << vi->chunkbits, round_up(inode->i_size - map->m_la, blksz)); if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { - startblk =3D le32_to_cpu(idx->blkaddr); + startblk =3D le32_to_cpu(idx->startblk_lo); if (startblk !=3D EROFS_NULL_ADDR) { map->m_deviceid =3D le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; @@ -168,7 +168,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_= map_dev *map) { struct erofs_dev_context *devs =3D EROFS_SB(sb)->devs; struct erofs_device_info *dif; - erofs_off_t startoff, length; + erofs_off_t startoff; int id; =20 erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); @@ -181,7 +181,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_= map_dev *map) return -ENODEV; } if (devs->flatdev) { - map->m_pa +=3D erofs_pos(sb, dif->mapped_blkaddr); + map->m_pa +=3D erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } @@ -190,13 +190,12 @@ int erofs_map_dev(struct super_block *sb, struct erof= s_map_dev *map) } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - if 
(!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; =20 - startoff =3D erofs_pos(sb, dif->mapped_blkaddr); - length =3D erofs_pos(sb, dif->blocks); + startoff =3D erofs_pos(sb, dif->uniaddr); if (map->m_pa >=3D startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -=3D startoff; erofs_fill_from_devinfo(map, sb, dif); break; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 199395ed1c1f..8330ca3b18d3 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -30,25 +30,19 @@ #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE | \ - EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + ((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1) =20 #define EROFS_SB_EXTSLOT_SIZE 16 =20 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. 
*/ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) =20 @@ -59,13 +53,14 @@ struct erofs_super_block { __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size =3D 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } rb; __le64 inos; /* total valid ino # (=3D=3D f_files - f_favail) = */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -84,7 +79,10 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved2[23]; + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __u8 reserved2[8]; }; =20 /* @@ -115,19 +113,18 @@ static inline bool erofs_inode_is_data_compressed(uns= igned int datamode) #define EROFS_I_VERSION_MASK 0x01 #define EROFS_I_DATALAYOUT_MASK 0x07 =20 
-#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 -#define EROFS_I_ALL_BIT 4 - -#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) =20 /* indicate chunk blkbits, thus 'chunksize =3D blocksize << chunk blkbits'= */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 =20 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) =20 /* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 @@ -140,45 +137,40 @@ struct erofs_inode_chunk_info { }; =20 union erofs_inode_i_u { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ struct erofs_inode_chunk_info c; }; =20 +union erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 
i_reserved; + __le32 i_mtime; union erofs_inode_i_u i_u; =20 __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; =20 /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; union erofs_inode_i_u i_u; =20 @@ -248,6 +240,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le1= 6 i_xattr_icount) if (!i_xattr_icount) return 0; =20 + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -266,11 +259,11 @@ static inline unsigned int erofs_xattr_entry_size(str= uct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) =20 -/* 8-byte inode chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; =20 /* dirent sorts in alphabet order, thus we can do binary search */ diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index c8ede541c239..e74c0c00aa26 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -108,7 +108,7 @@ static int erofs_read_inode(struct inode *inode) iu =3D dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nlink)); + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec); =20 inode->i_size =3D le32_to_cpu(dic->i_size); @@ 
-129,7 +129,7 @@ static int erofs_read_inode(struct inode *inode) case S_IFREG: case S_IFDIR: case S_IFLNK: - vi->raw_blkaddr =3D le32_to_cpu(iu.raw_blkaddr); + vi->startblk =3D le32_to_cpu(iu.startblk_lo); if(S_ISLNK(inode->i_mode)) { err =3D erofs_fill_symlink(inode, ptr, ofs); if (err) @@ -152,7 +152,7 @@ static int erofs_read_inode(struct inode *inode) } =20 if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_blocks =3D le32_to_cpu(iu.compressed_blocks) << + inode->i_blocks =3D le32_to_cpu(iu.blocks_lo) << (sb->s_blocksize_bits - 9); else inode->i_blocks =3D round_up(inode->i_size, sb->s_blocksize) >> 9; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b357cbbce764..58e401131c75 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,8 +47,8 @@ struct erofs_device_info { struct dax_device *dax_dev; u64 dax_part_off; =20 - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; =20 enum { @@ -252,7 +252,7 @@ struct erofs_inode { unsigned int *xattr_shared_xattrs; =20 union { - erofs_blk_t raw_blkaddr; + erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 19e52ffa34c5..a64f9765e95e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, str= uct super_block *sb, dif->file =3D file; } =20 - dif->blocks =3D le32_to_cpu(dis->blocks); - dif->mapped_blkaddr =3D le32_to_cpu(dis->mapped_blkaddr); + dif->blocks =3D le32_to_cpu(dis->blocks_lo); + dif->uniaddr =3D le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks +=3D dif->blocks; *pos +=3D EROFS_DEVT_SLOT_SIZE; return 0; @@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->dif0.blocks =3D le32_to_cpu(dsb->blocks); + sbi->dif0.blocks =3D le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr =3D le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR 
sbi->xattr_blkaddr =3D le32_to_cpu(dsb->xattr_blkaddr); @@ -308,12 +308,12 @@ static int erofs_read_superblock(struct super_block *= sb) sbi->xattr_filter_reserved =3D dsb->xattr_filter_reserved; #endif sbi->islotbits =3D ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid =3D le16_to_cpu(dsb->root_nid); + sbi->root_nid =3D le16_to_cpu(dsb->rb.rootnid_2b); sbi->packed_nid =3D le64_to_cpu(dsb->packed_nid); sbi->inos =3D le64_to_cpu(dsb->inos); =20 - sbi->build_time =3D le64_to_cpu(dsb->build_time); - sbi->build_time_nsec =3D le32_to_cpu(dsb->build_time_nsec); + sbi->build_time =3D le64_to_cpu(dsb->epoch); + sbi->build_time_nsec =3D le32_to_cpu(dsb->fixed_nsec); =20 super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); =20 --=20 2.43.5