From nobody Sun Feb 8 04:34:15 2026 Received: from out30-119.freemail.mail.aliyun.com (out30-119.freemail.mail.aliyun.com [115.124.30.119]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2DF2521D585 for ; Mon, 10 Mar 2025 09:55:14 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.119 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600517; cv=none; b=KVWlRUF03CsG4e9aRNcQSlaAcLhKB7OANIH7ZDHwgXngW7AFN1x9Vd3gVbq4jTSIuka8URrCMpwwpE3Nps+JSXCVQcsmNBwWMiyaDDCh8+MYBrRRnR/9WGr57QmFbfL/Si4wq+EwqsyQzD7A/6BLmC0C0IqafcE7vCFEDWK+m4I= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600517; c=relaxed/simple; bh=WKpqO9OcjV7tlcK2QY16WFcZDtgVwKHQ0/Hr3soYA5U=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gCorGnqbs2igzYEpRSyfi7H4Kb+0ZiNivgRQlXSmVWWc9pypKwCjeW6rSmEvzBivjKaihi0hnDB6ombXAtVpuZ6QUdXlWCb0KES/9T/6nJclf3bIPl8y9IAhBdrAYVlE60lfNwclyaJN2wEdXqCHEzMARHRX8nApiATHsaGmuIc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=uvFj1rIT; arc=none smtp.client-ip=115.124.30.119 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="uvFj1rIT" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600505; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=uroFFZkg+9TmfgPuBM9ZBqZ5Jlq2UIuIm35LWHZSJMM=; 
b=uvFj1rITMbGXpsaYdJi7hAytlyWqWGtk/EvABF2Ym521jbO4nD0gWKGdRv2BiT3N1rl2xOHI83aQwhdoLSwvrAX9FUenjizVv7caalmPBQVMBXJUXzhja4g1crBmeRv9UdDNGe65Q8E3CQ4Gta46DGc57uBaN3zMQRK0UpejCPc= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3x1_1741600504 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:05 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 01/10] erofs: get rid of erofs_map_blocks_flatmode() Date: Mon, 10 Mar 2025 17:54:51 +0800 Message-ID: <20250310095459.2620647-2-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" It's simple enough to be folded into erofs_map_blocks(). 
Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/data.c | 117 +++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 70 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1d2cb0fa1baf..2f45e39ce8c7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -70,58 +70,39 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct = super_block *sb, return erofs_bread(buf, offset, need_kmap); } =20 -static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map) -{ - struct erofs_inode *vi =3D EROFS_I(inode); - struct super_block *sb =3D inode->i_sb; - bool tailendpacking =3D (vi->datalayout =3D=3D EROFS_INODE_FLAT_INLINE); - erofs_blk_t lastblk =3D erofs_iblks(inode) - tailendpacking; - - map->m_flags =3D EROFS_MAP_MAPPED; /* no hole in flat inodes */ - if (map->m_la < erofs_pos(sb, lastblk)) { - map->m_pa =3D erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen =3D erofs_pos(sb, lastblk) - map->m_la; - } else { - DBG_BUGON(!tailendpacking); - map->m_pa =3D erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, map->m_la); - map->m_plen =3D inode->i_size - map->m_la; - - /* inline data should be located in the same meta block */ - if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - map->m_flags |=3D EROFS_MAP_META; - } - return 0; -} - int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_buf buf =3D __EROFS_BUF_INITIALIZER; struct super_block *sb =3D inode->i_sb; + unsigned int unit, blksz =3D sb->s_blocksize; struct erofs_inode *vi =3D EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct erofs_buf buf =3D __EROFS_BUF_INITIALIZER; - u64 chunknr; - unsigned int unit; + erofs_blk_t startblk; + bool tailpacking; erofs_off_t pos; - void *kaddr; + u64 chunknr; int err =3D 0; =20 
trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid =3D 0; - if (map->m_la >=3D inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags =3D 0; - map->m_plen =3D map->m_llen; + map->m_flags =3D 0; + if (map->m_la >=3D inode->i_size) goto out; - } =20 if (vi->datalayout !=3D EROFS_INODE_CHUNK_BASED) { - err =3D erofs_map_blocks_flatmode(inode, map); + tailpacking =3D (vi->datalayout =3D=3D EROFS_INODE_FLAT_INLINE); + pos =3D erofs_pos(sb, erofs_iblks(inode) - tailpacking); + + map->m_flags =3D EROFS_MAP_MAPPED; + if (map->m_la < pos) { + map->m_pa =3D erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_llen =3D pos - map->m_la; + } else { + map->m_pa =3D erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_llen =3D inode->i_size - map->m_la; + map->m_flags |=3D EROFS_MAP_META; + } goto out; } =20 @@ -134,45 +115,41 @@ int erofs_map_blocks(struct inode *inode, struct erof= s_map_blocks *map) pos =3D ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; =20 - kaddr =3D erofs_read_metabuf(&buf, sb, pos, true); - if (IS_ERR(kaddr)) { - err =3D PTR_ERR(kaddr); + idx =3D erofs_read_metabuf(&buf, sb, pos, true); + if (IS_ERR(idx)) { + err =3D PTR_ERR(idx); goto out; } map->m_la =3D chunknr << vi->chunkbits; - map->m_plen =3D min_t(erofs_off_t, 1UL << vi->chunkbits, - round_up(inode->i_size - map->m_la, sb->s_blocksize)); - - /* handle block map */ - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr =3D kaddr; - - if (le32_to_cpu(*blkaddr) =3D=3D EROFS_NULL_ADDR) { - map->m_flags =3D 0; - } else { - map->m_pa =3D erofs_pos(sb, le32_to_cpu(*blkaddr)); + map->m_llen =3D min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, blksz)); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { + startblk =3D le32_to_cpu(idx->blkaddr); + if (startblk !=3D EROFS_NULL_ADDR) { + map->m_deviceid =3D le16_to_cpu(idx->device_id) & + 
EROFS_SB(sb)->device_id_mask; + map->m_pa =3D erofs_pos(sb, startblk); + map->m_flags =3D EROFS_MAP_MAPPED; + } + } else { + startblk =3D le32_to_cpu(*(__le32 *)idx); + if (startblk !=3D EROFS_NULL_ADDR) { + map->m_pa =3D erofs_pos(sb, startblk); map->m_flags =3D EROFS_MAP_MAPPED; } - goto out_unlock; - } - /* parse chunk indexes */ - idx =3D kaddr; - switch (le32_to_cpu(idx->blkaddr)) { - case EROFS_NULL_ADDR: - map->m_flags =3D 0; - break; - default: - map->m_deviceid =3D le16_to_cpu(idx->device_id) & - EROFS_SB(sb)->device_id_mask; - map->m_pa =3D erofs_pos(sb, le32_to_cpu(idx->blkaddr)); - map->m_flags =3D EROFS_MAP_MAPPED; - break; } -out_unlock: erofs_put_metabuf(&buf); out: - if (!err) - map->m_llen =3D map->m_plen; + if (!err) { + map->m_plen =3D map->m_llen; + /* inline data should be located in the same meta block */ + if ((map->m_flags & EROFS_MAP_META) && + erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + } trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-110.freemail.mail.aliyun.com (out30-110.freemail.mail.aliyun.com [115.124.30.110]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 09A67225771 for ; Mon, 10 Mar 2025 09:55:14 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.110 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600519; cv=none; b=K7GVnvGBkcbny3fb9ZsK2E1sIOD8z12MD40MTGfTo957PjKF3mZo/A6c4Fi7WRp1BlYyQrauGLqs2kU5EVtF2l8hE1v4LpXonamfLBAn38Z8YsxoOizctbkgkH16k8C2cPnaVvm2V1cbB5uR0WlubvFTiR5yt+oW7gqv93EiR5Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600519; c=relaxed/simple; bh=2LZRQgJepG5MTqai5nqPWj80Uh21MZ5qVNIFCnF62SU=; 
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=qZ8u5bhOgQ5lfBYslRV5VOIphfbgqTRx+NJbMcjd09Bk6njcvrPJRQfI+mAPF/XOmYQM3pBbKoLWUjv9cW8ztmJ6JcHtS05AMjZfKZ/2loEjUxlu+920fBXk2LuraWhDXts8lHdlaQWUV9tIFjM2Bi9trJaCEJUaovk1dxNje90= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=TkpyQdT/; arc=none smtp.client-ip=115.124.30.110 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="TkpyQdT/" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600506; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=XkR7fIg1S/Lk9bWN6A3/iG9/WpYW4a9EMWzI9Z0Fxtk=; b=TkpyQdT/9mlpH1L58bPYAzaR8TEBplmCqGiuB10AZwatrz+JPZaGxQ7p0zHkgfAZlj/zliq/v/M3gCqGI+XChBkh92r9WrnjYWTtmHfAwjGVkJTiTLJcN7PdUSr7c23Qdy8Q4i8uUU98QMXdEYONLb3m996lzZqFw0WJEr04aPA= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3xk_1741600505 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:06 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 02/10] erofs: simplify erofs_{read,fill}_inode() Date: Mon, 10 Mar 2025 17:54:52 +0800 Message-ID: <20250310095459.2620647-3-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 
Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" - Switch to on-stack `copied` since it's just 64 bytes; - Get rid of `nblks` and derive `i_blocks` directly; - Use `inode_set_mtime()` instead of `inode_set_ctime()` to follow the ondisk naming; - Rearrange the code. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/inode.c | 92 ++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 4936bd43c438..c8ede541c239 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void= *kaddr, static int erofs_read_inode(struct inode *inode) { struct super_block *sb =3D inode->i_sb; + erofs_blk_t blkaddr =3D erofs_blknr(sb, erofs_iloc(inode)); + unsigned int ofs =3D erofs_blkoff(sb, erofs_iloc(inode)); + struct erofs_buf buf =3D __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi =3D EROFS_SB(sb); struct erofs_inode *vi =3D EROFS_I(inode); - const erofs_off_t inode_loc =3D erofs_iloc(inode); - erofs_blk_t blkaddr, nblks =3D 0; - void *kaddr; + struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - struct erofs_inode_extended *die, *copied =3D NULL; union erofs_inode_i_u iu; - struct erofs_buf buf =3D __EROFS_BUF_INITIALIZER; - unsigned int ifmt, ofs; + unsigned int ifmt; + void *ptr; int err =3D 0; =20 - blkaddr =3D erofs_blknr(sb, inode_loc); - ofs =3D erofs_blkoff(sb, inode_loc); - - kaddr =3D erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(kaddr)); - return PTR_ERR(kaddr); + ptr =3D erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); + if (IS_ERR(ptr)) { + err =3D PTR_ERR(ptr); + erofs_err(sb, "failed to get inode (nid: %llu) page, err %d", + vi->nid, err); + goto err_out; } =20 - dic =3D kaddr + ofs; + dic =3D ptr + ofs; ifmt 
=3D le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -76,23 +74,18 @@ static int erofs_read_inode(struct inode *inode) } else { const unsigned int gotten =3D sb->s_blocksize - ofs; =20 - copied =3D kmalloc(vi->inode_isize, GFP_KERNEL); - if (!copied) { - err =3D -ENOMEM; - goto err_out; - } - memcpy(copied, dic, gotten); - kaddr =3D erofs_read_metabuf(&buf, sb, + memcpy(&copied, dic, gotten); + ptr =3D erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), true); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", - vi->nid, PTR_ERR(kaddr)); - kfree(copied); - return PTR_ERR(kaddr); + if (IS_ERR(ptr)) { + err =3D PTR_ERR(ptr); + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d", + vi->nid, err); + goto err_out; } ofs =3D vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, ofs); - die =3D copied; + memcpy((u8 *)&copied + gotten, ptr, ofs); + die =3D &copied; } vi->xattr_isize =3D erofs_xattr_ibody_size(die->i_xattr_icount); =20 @@ -101,12 +94,10 @@ static int erofs_read_inode(struct inode *inode) i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* each extended inode has its own timestamp */ - inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + inode_set_mtime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); =20 inode->i_size =3D le64_to_cpu(die->i_size); - kfree(copied); break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize =3D sizeof(struct erofs_inode_compact); @@ -118,8 +109,7 @@ static int erofs_read_inode(struct inode *inode) i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ - inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); + inode_set_mtime(inode, sbi->build_time, 
sbi->build_time_nsec); =20 inode->i_size =3D le32_to_cpu(dic->i_size); break; @@ -141,7 +131,7 @@ static int erofs_read_inode(struct inode *inode) case S_IFLNK: vi->raw_blkaddr =3D le32_to_cpu(iu.raw_blkaddr); if(S_ISLNK(inode->i_mode)) { - err =3D erofs_fill_symlink(inode, kaddr, ofs); + err =3D erofs_fill_symlink(inode, ptr, ofs); if (err) goto err_out; } @@ -161,10 +151,13 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } =20 - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) { - nblks =3D le32_to_cpu(iu.compressed_blocks); - } else if (vi->datalayout =3D=3D EROFS_INODE_CHUNK_BASED) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_blocks =3D le32_to_cpu(iu.compressed_blocks) << + (sb->s_blocksize_bits - 9); + else + inode->i_blocks =3D round_up(inode->i_size, sb->s_blocksize) >> 9; + + if (vi->datalayout =3D=3D EROFS_INODE_CHUNK_BASED) { /* fill chunked inode summary info */ vi->chunkformat =3D le16_to_cpu(iu.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { @@ -176,22 +169,15 @@ static int erofs_read_inode(struct inode *inode) vi->chunkbits =3D sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode_set_mtime_to_ts(inode, - inode_set_atime_to_ts(inode, inode_get_ctime(inode))); + inode_set_atime_to_ts(inode, + inode_set_ctime_to_ts(inode, inode_get_mtime(inode))); =20 inode->i_flags &=3D ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && (vi->datalayout =3D=3D EROFS_INODE_FLAT_PLAIN || vi->datalayout =3D=3D EROFS_INODE_CHUNK_BASED)) inode->i_flags |=3D S_DAX; - - if (!nblks) - /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks =3D round_up(inode->i_size, sb->s_blocksize) >> 9; - else - inode->i_blocks =3D nblks << (sb->s_blocksize_bits - 9); err_out: - DBG_BUGON(err); erofs_put_metabuf(&buf); return err; } @@ -202,13 +188,10 @@ static int erofs_fill_inode(struct inode *inode) int err; =20 
trace_erofs_fill_inode(inode); - - /* read inode base data from disk */ err =3D erofs_read_inode(inode); if (err) return err; =20 - /* setup the new inode */ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op =3D &erofs_generic_iops; @@ -229,15 +212,10 @@ static int erofs_fill_inode(struct inode *inode) inode->i_op =3D &erofs_symlink_iops; inode_nohighmem(inode); break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + default: inode->i_op =3D &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); return 0; - default: - return -EFSCORRUPTED; } =20 mapping_set_large_folios(inode->i_mapping); --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-131.freemail.mail.aliyun.com (out30-131.freemail.mail.aliyun.com [115.124.30.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 160E42236EF for ; Mon, 10 Mar 2025 09:55:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.131 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600520; cv=none; b=XYLe8bA8PWROTmJzHs75gaA53WeM6nRXn7F2wpZtcpFHFdVJXfDUv/scBc/C8vfNSoDZxqsxRmzpBSHlMIGDWV6XmwvvAp/msDLnjv9mOhHsNhkqoDlPs3BxiTcQVDDo84xlcfrCE9rRmFP2RU4aaWanIDPHC4iKqrJ3O8wPnO4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600520; c=relaxed/simple; bh=lGem3y17GOA2vjgE9Qcp1uMHhRBKPopkROLpNxh6ujI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gAHGWWa69eLtHMi3ciE/qnSAkVdHTn2atbsSnF7DxnBWJ8pvubwrwEcplo69XjsjpIsdfzeJ92Zq4/G1te7vD5ouxIwL5AMZaHXYQzGT06Tc6/pyMH82oUEyBXAXpna+sov5fn4PRxflXYDTtJ/oYs1/09dFBnSxyplCa+rtAaE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com 
header.i=@linux.alibaba.com header.b=Yt3N8ESk; arc=none smtp.client-ip=115.124.30.131 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="Yt3N8ESk" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600507; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=dPqt8cykj9yE5zj2nUPmPKdQRAButXykg1ZWEVJBkQM=; b=Yt3N8ESkc9Sj7RKJCX/lvQ89Kf2PrdAapduWxqHXIeUMusahYgLKxpg4OHWA2L3WbST9erJiEymH/RrO8cxs7vfcbbUDT96miigsu1R6ZE8REacDgo+vL37Br3qvMBE7qMBtrI+HOvV4WKqMlSTlGiZD2++hpDpTSrDCc5MxzZc= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3yP_1741600506 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:07 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 03/10] erofs: add 48-bit block addressing on-disk support Date: Mon, 10 Mar 2025 17:54:53 +0800 Message-ID: <20250310095459.2620647-4-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The current 32-bit block addressing limits EROFS to a 16TiB maximum volume size with 4KiB blocks. However, several new use cases now require larger capacity support: - Massive datasets for model training in order to boost random sampling performance for each epoch; - Object storage clients using EROFS direct passthrough. 
This extends core on-disk structures to support 48-bit block addressing, such as inodes, device slots, and inode chunks. Additionally: - Expand superblock root NID to 8-byte `rootnid_8b` to enable full out-of-place update incremental builds; - Introduce `epoch` field in the superblock as well as add `mtime` field to 32-byte compact inodes for basic timestamp support. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/data.c | 15 ++++---- fs/erofs/erofs_fs.h | 91 +++++++++++++++++++++------------------------ fs/erofs/inode.c | 6 +-- fs/erofs/internal.h | 6 +-- fs/erofs/super.c | 12 +++--- 5 files changed, 61 insertions(+), 69 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 2f45e39ce8c7..3c4a4eaffe8c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -95,7 +95,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_ma= p_blocks *map) =20 map->m_flags =3D EROFS_MAP_MAPPED; if (map->m_la < pos) { - map->m_pa =3D erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_pa =3D erofs_pos(sb, vi->startblk) + map->m_la; map->m_llen =3D pos - map->m_la; } else { map->m_pa =3D erofs_iloc(inode) + vi->inode_isize + @@ -124,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_= map_blocks *map) map->m_llen =3D min_t(erofs_off_t, 1UL << vi->chunkbits, round_up(inode->i_size - map->m_la, blksz)); if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { - startblk =3D le32_to_cpu(idx->blkaddr); + startblk =3D le32_to_cpu(idx->startblk_lo); if (startblk !=3D EROFS_NULL_ADDR) { map->m_deviceid =3D le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; @@ -168,7 +168,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_= map_dev *map) { struct erofs_dev_context *devs =3D EROFS_SB(sb)->devs; struct erofs_device_info *dif; - erofs_off_t startoff, length; + erofs_off_t startoff; int id; =20 erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); @@ -181,7 +181,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_= map_dev *map) return 
-ENODEV; } if (devs->flatdev) { - map->m_pa +=3D erofs_pos(sb, dif->mapped_blkaddr); + map->m_pa +=3D erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } @@ -190,13 +190,12 @@ int erofs_map_dev(struct super_block *sb, struct erof= s_map_dev *map) } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - if (!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; =20 - startoff =3D erofs_pos(sb, dif->mapped_blkaddr); - length =3D erofs_pos(sb, dif->blocks); + startoff =3D erofs_pos(sb, dif->uniaddr); if (map->m_pa >=3D startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -=3D startoff; erofs_fill_from_devinfo(map, sb, dif); break; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 199395ed1c1f..8330ca3b18d3 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -30,25 +30,19 @@ #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE | \ - EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + ((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1) =20 #define EROFS_SB_EXTSLOT_SIZE 16 =20 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. 
*/ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) =20 @@ -59,13 +53,14 @@ struct erofs_super_block { __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size =3D 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } rb; __le64 inos; /* total valid ino # (=3D=3D f_files - f_favail) = */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -84,7 +79,10 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved2[23]; + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __u8 reserved2[8]; }; =20 /* @@ -115,19 +113,18 @@ static inline bool erofs_inode_is_data_compressed(uns= igned int datamode) #define EROFS_I_VERSION_MASK 0x01 #define EROFS_I_DATALAYOUT_MASK 0x07 =20 
-#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 -#define EROFS_I_ALL_BIT 4 - -#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) =20 /* indicate chunk blkbits, thus 'chunksize =3D blocksize << chunk blkbits'= */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 =20 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) =20 /* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 @@ -140,45 +137,40 @@ struct erofs_inode_chunk_info { }; =20 union erofs_inode_i_u { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ struct erofs_inode_chunk_info c; }; =20 +union erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 
i_reserved; + __le32 i_mtime; union erofs_inode_i_u i_u; =20 __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; =20 /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; union erofs_inode_i_u i_u; =20 @@ -248,6 +240,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le1= 6 i_xattr_icount) if (!i_xattr_icount) return 0; =20 + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -266,11 +259,11 @@ static inline unsigned int erofs_xattr_entry_size(str= uct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) =20 -/* 8-byte inode chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; =20 /* dirent sorts in alphabet order, thus we can do binary search */ diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index c8ede541c239..e74c0c00aa26 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -108,7 +108,7 @@ static int erofs_read_inode(struct inode *inode) iu =3D dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nlink)); + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec); =20 inode->i_size =3D le32_to_cpu(dic->i_size); @@ 
-129,7 +129,7 @@ static int erofs_read_inode(struct inode *inode) case S_IFREG: case S_IFDIR: case S_IFLNK: - vi->raw_blkaddr =3D le32_to_cpu(iu.raw_blkaddr); + vi->startblk =3D le32_to_cpu(iu.startblk_lo); if(S_ISLNK(inode->i_mode)) { err =3D erofs_fill_symlink(inode, ptr, ofs); if (err) @@ -152,7 +152,7 @@ static int erofs_read_inode(struct inode *inode) } =20 if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_blocks =3D le32_to_cpu(iu.compressed_blocks) << + inode->i_blocks =3D le32_to_cpu(iu.blocks_lo) << (sb->s_blocksize_bits - 9); else inode->i_blocks =3D round_up(inode->i_size, sb->s_blocksize) >> 9; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b357cbbce764..58e401131c75 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,8 +47,8 @@ struct erofs_device_info { struct dax_device *dax_dev; u64 dax_part_off; =20 - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; =20 enum { @@ -252,7 +252,7 @@ struct erofs_inode { unsigned int *xattr_shared_xattrs; =20 union { - erofs_blk_t raw_blkaddr; + erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 19e52ffa34c5..a64f9765e95e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, str= uct super_block *sb, dif->file =3D file; } =20 - dif->blocks =3D le32_to_cpu(dis->blocks); - dif->mapped_blkaddr =3D le32_to_cpu(dis->mapped_blkaddr); + dif->blocks =3D le32_to_cpu(dis->blocks_lo); + dif->uniaddr =3D le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks +=3D dif->blocks; *pos +=3D EROFS_DEVT_SLOT_SIZE; return 0; @@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->dif0.blocks =3D le32_to_cpu(dsb->blocks); + sbi->dif0.blocks =3D le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr =3D le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR 
sbi->xattr_blkaddr =3D le32_to_cpu(dsb->xattr_blkaddr); @@ -308,12 +308,12 @@ static int erofs_read_superblock(struct super_block *= sb) sbi->xattr_filter_reserved =3D dsb->xattr_filter_reserved; #endif sbi->islotbits =3D ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid =3D le16_to_cpu(dsb->root_nid); + sbi->root_nid =3D le16_to_cpu(dsb->rb.rootnid_2b); sbi->packed_nid =3D le64_to_cpu(dsb->packed_nid); sbi->inos =3D le64_to_cpu(dsb->inos); =20 - sbi->build_time =3D le64_to_cpu(dsb->build_time); - sbi->build_time_nsec =3D le32_to_cpu(dsb->build_time_nsec); + sbi->build_time =3D le64_to_cpu(dsb->epoch); + sbi->build_time_nsec =3D le32_to_cpu(dsb->fixed_nsec); =20 super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); =20 --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-133.freemail.mail.aliyun.com (out30-133.freemail.mail.aliyun.com [115.124.30.133]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3A6FE224B01 for ; Mon, 10 Mar 2025 09:55:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.133 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600521; cv=none; b=JP+k5j8/YTuE2NtrV6XcfW5udle1b7uzLVkuIwe5UrXUcJHNG1uWaQbcHwciJ5Y+JcRBISxKx1XaDmeJGWYeBtKIp6FkupB3lJl7cPalvM1g94H37GSkqZhh5dmc+1Vztbau65rlxhabC4UT/mro14vou8XmcfC0GUjLQQPCKjQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600521; c=relaxed/simple; bh=WR7Z61D0WTPYhdMw5VkMJf2ghCbQAVtcdibKXxAZfDc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=des5Ss7CPVM1HfFzXBe1jKpHhptQgUeWxqO773vJFxKvaPDLQe0i5OAncDu9dly6/hsfdP5XPizaCp0jO582WxINPtxzXvlHw68EN/GU5bCfOVzZKlHNOpTJNMIAEoB4gGIs8Eg9QF5gSPS7cw2ZreidMNlZ8Cw+XXxmXKTFsTM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; 
spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=clxrUSww; arc=none smtp.client-ip=115.124.30.133 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="clxrUSww" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600508; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=hLCfhtAgsnLF75lrVLJI+S/yAoxnVal1krGL9Ofsli4=; b=clxrUSwwOJpn7DlNpcSjGNGVw7w8YYHmyAtqxYyhW3nEkvrnXoZpQbUw19Zn7G3Lzt66yU7EkTosSURWA7xJl54FtZMBFPvZcyEDr7R0lev6IdsVyipQBXsSFRszGkAMP/wnv+s/FFaoaN8HgNhVpVXbexYWbx2aTGTVlyRf4jM= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3yt_1741600507 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:08 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 04/10] erofs: implement 48-bit block addressing for unencoded inodes Date: Mon, 10 Mar 2025 17:54:54 +0800 Message-ID: <20250310095459.2620647-5-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" It adapts the on-disk changes from the previous commit. It also supports EROFS_NULL_ADDR (all 1's) for EROFS_INODE_FLAT_PLAIN inodes to indicate 0-filled inodes, as it's common for composefs use cases. As a result, EROFS_INODE_CHUNK_BASED is no longer needed. 
Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/data.c | 13 +++++++++---- fs/erofs/dir.c | 2 +- fs/erofs/inode.c | 34 +++++++++++++++++++++++++--------- fs/erofs/internal.h | 12 ++++++------ fs/erofs/super.c | 15 ++++++++++----- fs/erofs/sysfs.c | 2 ++ include/trace/events/erofs.h | 2 +- 7 files changed, 54 insertions(+), 26 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3c4a4eaffe8c..2409d2ab0c28 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -77,7 +77,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_ma= p_blocks *map) unsigned int unit, blksz =3D sb->s_blocksize; struct erofs_inode *vi =3D EROFS_I(inode); struct erofs_inode_chunk_index *idx; - erofs_blk_t startblk; + erofs_blk_t startblk, addrmask; bool tailpacking; erofs_off_t pos; u64 chunknr; @@ -91,6 +91,8 @@ int erofs_map_blocks(struct inode *inode, struct erofs_ma= p_blocks *map) =20 if (vi->datalayout !=3D EROFS_INODE_CHUNK_BASED) { tailpacking =3D (vi->datalayout =3D=3D EROFS_INODE_FLAT_INLINE); + if (!tailpacking && vi->startblk =3D=3D EROFS_NULL_ADDR) + goto out; pos =3D erofs_pos(sb, erofs_iblks(inode) - tailpacking); =20 map->m_flags =3D EROFS_MAP_MAPPED; @@ -124,8 +126,11 @@ int erofs_map_blocks(struct inode *inode, struct erofs= _map_blocks *map) map->m_llen =3D min_t(erofs_off_t, 1UL << vi->chunkbits, round_up(inode->i_size - map->m_la, blksz)); if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { - startblk =3D le32_to_cpu(idx->startblk_lo); - if (startblk !=3D EROFS_NULL_ADDR) { + addrmask =3D (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? 
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1; + startblk =3D (((u64)le16_to_cpu(idx->startblk_hi) << 32) | + le32_to_cpu(idx->startblk_lo)) & addrmask; + if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { map->m_deviceid =3D le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; map->m_pa =3D erofs_pos(sb, startblk); @@ -133,7 +138,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_= map_blocks *map) } } else { startblk =3D le32_to_cpu(*(__le32 *)idx); - if (startblk !=3D EROFS_NULL_ADDR) { + if (startblk !=3D (u32)EROFS_NULL_ADDR) { map->m_pa =3D erofs_pos(sb, startblk); map->m_flags =3D EROFS_MAP_MAPPED; } diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 1d3bb8746ab1..fa3c2d380cc9 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -60,7 +60,7 @@ static int erofs_readdir(struct file *f, struct dir_conte= xt *ctx) =20 de =3D erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { - erofs_err(sb, "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err =3D PTR_ERR(de); break; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e74c0c00aa26..20d58228dfc9 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -31,10 +31,10 @@ static int erofs_read_inode(struct inode *inode) unsigned int ofs =3D erofs_blkoff(sb, erofs_iloc(inode)); struct erofs_buf buf =3D __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi =3D EROFS_SB(sb); + erofs_blk_t addrmask =3D BIT_ULL(48) - 1; struct erofs_inode *vi =3D EROFS_I(inode); struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - union erofs_inode_i_u iu; unsigned int ifmt; void *ptr; int err =3D 0; @@ -71,6 +71,8 @@ static int erofs_read_inode(struct inode *inode) if (ofs + vi->inode_isize <=3D sb->s_blocksize) { ofs +=3D vi->inode_isize; die =3D (struct erofs_inode_extended *)dic; + copied.i_u =3D die->i_u; + copied.i_nb =3D die->i_nb; } else { const unsigned int gotten =3D sb->s_blocksize - 
ofs; =20 @@ -90,7 +92,6 @@ static int erofs_read_inode(struct inode *inode) vi->xattr_isize =3D erofs_xattr_ibody_size(die->i_xattr_icount); =20 inode->i_mode =3D le16_to_cpu(die->i_mode); - iu =3D die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); @@ -105,11 +106,21 @@ static int erofs_read_inode(struct inode *inode) vi->xattr_isize =3D erofs_xattr_ibody_size(dic->i_xattr_icount); =20 inode->i_mode =3D le16_to_cpu(dic->i_mode); - iu =3D dic->i_u; + copied.i_u =3D dic->i_u; + copied.i_nb =3D dic->i_nb; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); - inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec); + if (!S_ISDIR(inode->i_mode) && + ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) { + set_nlink(inode, 1); + copied.i_nb =3D dic->i_nb; + } else { + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); + copied.i_nb.startblk_hi =3D 0; + addrmask =3D BIT_ULL(32) - 1; + } + inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime), + sbi->fixed_nsec); =20 inode->i_size =3D le32_to_cpu(dic->i_size); break; @@ -129,7 +140,12 @@ static int erofs_read_inode(struct inode *inode) case S_IFREG: case S_IFDIR: case S_IFLNK: - vi->startblk =3D le32_to_cpu(iu.startblk_lo); + vi->startblk =3D le32_to_cpu(copied.i_u.startblk_lo) | + ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); + if (vi->datalayout =3D=3D EROFS_INODE_FLAT_PLAIN && + !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask)) + vi->startblk =3D EROFS_NULL_ADDR; + if(S_ISLNK(inode->i_mode)) { err =3D erofs_fill_symlink(inode, ptr, ofs); if (err) @@ -138,7 +154,7 @@ static int erofs_read_inode(struct inode *inode) break; case S_IFCHR: case S_IFBLK: - inode->i_rdev =3D new_decode_dev(le32_to_cpu(iu.rdev)); + inode->i_rdev =3D new_decode_dev(le32_to_cpu(copied.i_u.rdev)); break; case S_IFIFO: case S_IFSOCK: @@ -152,14 +168,14 @@ static int 
erofs_read_inode(struct inode *inode) } =20 if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_blocks =3D le32_to_cpu(iu.blocks_lo) << + inode->i_blocks =3D le32_to_cpu(copied.i_u.blocks_lo) << (sb->s_blocksize_bits - 9); else inode->i_blocks =3D round_up(inode->i_size, sb->s_blocksize) >> 9; =20 if (vi->datalayout =3D=3D EROFS_INODE_CHUNK_BASED) { /* fill chunked inode summary info */ - vi->chunkformat =3D le16_to_cpu(iu.c.format); + vi->chunkformat =3D le16_to_cpu(copied.i_u.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 58e401131c75..07515a6f2534 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -37,8 +37,7 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb,= const char *fmt, ...); =20 typedef u64 erofs_nid_t; typedef u64 erofs_off_t; -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; +typedef u64 erofs_blk_t; =20 struct erofs_device_info { char *path; @@ -143,8 +142,8 @@ struct erofs_sb_info { unsigned char blkszbits; /* filesystem block size in bit shift */ =20 u32 sb_size; /* total superblock size */ - u32 build_time_nsec; - u64 build_time; + u32 fixed_nsec; + s64 epoch; =20 /* what we really care is nid, rather than ino.. 
*/ erofs_nid_t root_nid; @@ -205,8 +204,8 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page =3D NULL }) =20 -#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_b= its)) -#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bit= s)) +#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_bl= kbits) =20 @@ -226,6 +225,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZT= AILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) =20 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index a64f9765e95e..18445dc8597d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -268,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } =20 - sbi->blkszbits =3D dsb->blkszbits; + sbi->blkszbits =3D dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; @@ -308,13 +308,18 @@ static int erofs_read_superblock(struct super_block *= sb) sbi->xattr_filter_reserved =3D dsb->xattr_filter_reserved; #endif sbi->islotbits =3D ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid =3D le16_to_cpu(dsb->rb.rootnid_2b); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid =3D le64_to_cpu(dsb->rootnid_8b); + sbi->dif0.blocks =3D (sbi->dif0.blocks << 32) | + le16_to_cpu(dsb->rb.blocks_hi); + } else { + sbi->root_nid =3D le16_to_cpu(dsb->rb.rootnid_2b); + 
} sbi->packed_nid =3D le64_to_cpu(dsb->packed_nid); sbi->inos =3D le64_to_cpu(dsb->inos); =20 - sbi->build_time =3D le64_to_cpu(dsb->epoch); - sbi->build_time_nsec =3D le32_to_cpu(dsb->fixed_nsec); - + sbi->epoch =3D (s64)le64_to_cpu(dsb->epoch); + sbi->fixed_nsec =3D le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); =20 /* parse on-disk compression configurations */ diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 19d586273b70..dad4e6c6c155 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -81,6 +81,7 @@ EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); +EROFS_ATTR_FEATURE(48bit); =20 static struct attribute *erofs_feat_attrs[] =3D { ATTR_LIST(zero_padding), @@ -93,6 +94,7 @@ static struct attribute *erofs_feat_attrs[] =3D { ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), + ATTR_LIST(48bit), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 57df3843e650..c69c7b1e41d1 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -75,7 +75,7 @@ TRACE_EVENT(erofs_fill_inode, __entry->ofs =3D erofs_blkoff(inode->i_sb, erofs_iloc(inode)); ), =20 - TP_printk("dev =3D (%d,%d), nid =3D %llu, blkaddr %u ofs %u", + TP_printk("dev =3D (%d,%d), nid =3D %llu, blkaddr %llu ofs %u", show_dev_nid(__entry), __entry->blkaddr, __entry->ofs) ); --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-119.freemail.mail.aliyun.com (out30-119.freemail.mail.aliyun.com [115.124.30.119]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 093FF225779 for ; Mon, 10 Mar 2025 09:55:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.119 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1741600521; cv=none; b=HU1KWRRh3yywUZhWTv01Rb4B5rq8Z4/1l7yZFzz9ze86tmubFthhaaFmDYCUCp7JtmsIX49Rq9mVxQqko+bX39zZX9qgIfTSFPJQ6bfWp94PHySXta5B4eUR/uoWinTNKgGrOQHx0rjHJqTOmkGocOxHltAB3CuCyy9ncX1QGec= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600521; c=relaxed/simple; bh=73+Jv7sZsXwu/uS0m4ZO/BsjxwhbO5akcYKKAQvI3UM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=EhjZb+R53vmRW6B3t4jG3JQDs27XkTCAhL7vJF/ubJQhJoJ1ZQBBV/nb0MaYKfHtwX0JpJELgHmJk7+mkF3TH2jZNfwcJVfyy3XkGEeDB0AV+Ijk97zY+4JBU5sh95FjHOCRd7Uq9EYlhmSTQbDG08q9ZerE2eQdhZHDbOnHV7Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=TddCjJi+; arc=none smtp.client-ip=115.124.30.119 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="TddCjJi+" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600509; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=inQ7L5nN0a/HpCYRCvZiBbdUFWIni4CRkXK2nXN8wCY=; b=TddCjJi+gZCv4yRktRir9mMKCbAQAzBtLVcGWOw/YSHHX4hsFTxjNIrW0eKPhamMOy8S5lnvul2SivggokVcIaYQ208jN2h3QD6iMOtO2VpHal7TKRgyCeQDFqyylyTNYKZHeCks+0mp7dIo+1R70nahgQ+OOUoGLxhAeK3UUWM= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3zQ_1741600508 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:09 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 05/10] erofs: support dot-omitted directories Date: Mon, 10 Mar 2025 17:54:55 +0800 Message-ID: 
<20250310095459.2620647-6-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" There's no need to record "." dirents in the directory data (while they could be used for sanity checks, they aren't very useful.) Omitting "." dirents also improves directory data deduplication. Use a per-inode (instead of per-sb) flag to indicate if the "." dirent is omitted or not, ensuring compatibility with incremental builds. It also reuses EROFS_I_NLINK_1_BIT, as it has very limited use cases for directories with `nlink =3D 1`. Emit the "." entry as the last virtual dirent in the directory because it is _much_ less frequently used than the ".." dirent. It also keeps `f_pos` meaningful, as it strictly follows the directory data when it's less than i_size. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/dir.c | 5 +++++ fs/erofs/erofs_fs.h | 1 + fs/erofs/inode.c | 4 +++- fs/erofs/internal.h | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index fa3c2d380cc9..2fae209d0274 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_cont= ext *ctx) ofs =3D 0; } erofs_put_metabuf(&buf); + if (EROFS_I(dir)->dot_omitted && ctx->pos =3D=3D dir->i_size) { + if (!dir_emit_dot(f, ctx)) + return 0; + ++ctx->pos; + } return err < 0 ? 
err : 0; } =20 diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 8330ca3b18d3..791124b3f57c 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -116,6 +116,7 @@ static inline bool erofs_inode_is_data_compressed(unsig= ned int datamode) #define EROFS_I_VERSION_BIT 0 #define EROFS_I_DATALAYOUT_BIT 1 #define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */ #define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) =20 /* indicate chunk blkbits, thus 'chunksize =3D blocksize << chunk blkbits'= */ diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 20d58228dfc9..3a5bb73a9397 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -137,8 +137,10 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } switch (inode->i_mode & S_IFMT) { - case S_IFREG: case S_IFDIR: + vi->dot_omitted =3D (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; + fallthrough; + case S_IFREG: case S_IFLNK: vi->startblk =3D le32_to_cpu(copied.i_u.startblk_lo) | ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 07515a6f2534..91d0b400459c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -245,6 +245,7 @@ struct erofs_inode { =20 unsigned char datalayout; unsigned char inode_isize; + bool dot_omitted; unsigned int xattr_isize; =20 unsigned int xattr_name_filter; --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-132.freemail.mail.aliyun.com (out30-132.freemail.mail.aliyun.com [115.124.30.132]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0F35A2253FD for ; Mon, 10 Mar 2025 09:55:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.132 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600521; cv=none; 
b=jMVHzptdOcOqsBUql1y9UwFy7sSiuPUHlqcIk6iryf8xzG8s6WylXxD4qYzde8I1Nqr1f9KlLg9rs48yP8sTq/0B+6JVHayoxF46TgZOLYELDqKHfvaeDrZS49uaxu1oaMUg8khzgRx/OrOolq1OjAU3c+piFCEqKPO1uy7adwE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600521; c=relaxed/simple; bh=/BWPX7Uz85GMrx3mCFBiA2EA62gbqPSIIPgArZwjZT8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=DW7GK4lmCc0e7BEFnft2BwN4u1q6TScdgO//KEVDHwlCmZUrynYAvvBniXaWMHjReQeq7wzbPeNbnM/lmggxR04oB0RUAR4zaKfWRM1hzaFqUi7GbMGCZPVusxdbTvqzoQftbmJG8EgYTl+KC4YQo2i1oWisk93eylpkRZX2I4o= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=qyPTt7sk; arc=none smtp.client-ip=115.124.30.132 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="qyPTt7sk" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600510; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=v3+UibsorVtK/yynVTxuLEiXsORirVMrgQEdb1NF2q4=; b=qyPTt7skhm0Mr7eyCyj9PYsO/uB192xbAauibE70MviexXDAcHpyHc7xB9bv8GYDA8rh2QHKoyMm8+0+zHi4p8jFJiJY0jQfhXg8A7yTM0K5C45RYKxVgCFVhcPTN0AGVtPGKixol0An19Zi6YuBJ0k+B7mLufOCMrve6hZ9lPE= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F3zq_1741600509 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:10 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 06/10] erofs: initialize decompression early Date: Mon, 10 Mar 2025 17:54:56 +0800 Message-ID: 
<20250310095459.2620647-7-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" - Rename erofs_init_managed_cache() to z_erofs_init_super(); - Move the initialization of managed_pslots into z_erofs_init_super() too; - Move z_erofs_init_super() and packed inode preparation upwards, before the root inode initialization. Therefore, the root directory can also be compressible. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/internal.h | 4 ++-- fs/erofs/super.c | 26 ++++++++++---------------- fs/erofs/zdata.c | 4 ++-- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 91d0b400459c..b35742cf9431 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -436,6 +436,7 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *= map, @@ -445,7 +446,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *d= sb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} @@ -454,7 +454,7 @@ static inline int erofs_init_shrinker(void) { return 0;= } static inline void erofs_exit_shrinker(void) {} static inline int 
z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { retur= n 0; } +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ =20 #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 18445dc8597d..0156ee7217c9 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -636,9 +636,16 @@ static int erofs_fc_fill_super(struct super_block *sb,= struct fs_context *fc) else sb->s_flags &=3D ~SB_POSIXACL; =20 -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err =3D z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode =3D erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode =3D inode; + } =20 inode =3D erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -650,24 +657,11 @@ static int erofs_fc_fill_super(struct super_block *sb= , struct fs_context *fc) iput(inode); return -EINVAL; } - sb->s_root =3D d_make_root(inode); if (!sb->s_root) return -ENOMEM; =20 erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode =3D erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err =3D PTR_ERR(sbi->packed_inode); - sbi->packed_inode =3D NULL; - return err; - } - } - err =3D erofs_init_managed_cache(sb); - if (err) - return err; - err =3D erofs_xattr_prefixes_init(sb); if (err) return err; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5e4b65070b86..bc6d6842c5c2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -644,18 +644,18 @@ static const struct address_space_operations z_erofs_= cache_aops =3D { .invalidate_folio =3D z_erofs_cache_invalidate_folio, }; =20 -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { struct inode *const 
inode =3D new_inode(sb); =20 if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size =3D OFFSET_MAX; inode->i_mapping->a_ops =3D &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache =3D inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } =20 --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-101.freemail.mail.aliyun.com (out30-101.freemail.mail.aliyun.com [115.124.30.101]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2CBE62253FB for ; Mon, 10 Mar 2025 09:55:19 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.101 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600523; cv=none; b=l1DV2LAOfomL4qn2zMnChQZDdqezmA1NSGWilNn/1PLDDV91zDrXA2ibeGldYIjIhOlQckSC8LnUif9dp791JSW6S6qvpJFDSEvXl2UrLRXPEO+XUAHQTDEY1D8db20kbv8IOOW7NuX5f2r3ObJipT2uAjxlLLs8bljEel9l5WM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600523; c=relaxed/simple; bh=8gFjaeVFkrQ1eCyaLBMY5MolyjnpzHJsmq7x3L68STs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=KhHTWLPKJSht2SNq7/wE85Yv3gLQorGB4x2JbQUDm24g0o1GXG3Rvr453o6C8ukrW/usk0Gw9XIfOtnHc23VkH672mFE+Eo07b4nxWua7eg3UV5pqgT4kc/5n2RAhyfWwlDfL2ZllMU9t587UhKXnzIlYPpQ1MlqyKbM2fGGdSw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=G9wo558R; arc=none smtp.client-ip=115.124.30.101 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; 
dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="G9wo558R" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600511; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=Br4pjoIxe+WJd9+NNsPcX3mCBJZE87gP9k5YlaGbaj0=; b=G9wo558RE7dh+RJTregB4HOD1DDeZP0sXWg165piOENk84trVl/9Bu5Iddjh8XwgpZf5QaI6gjZ/06xGIF+hF6baIvQU9OqM7568LH8xCWmmPQhSgHjCJ8AJU0QtfdKpW9ZX3GioJ7eFYj3pQiUNE+b6oOtTpdufjNSoFzjyLZI= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F4-I_1741600510 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:10 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 07/10] erofs: add encoded extent on-disk definition Date: Mon, 10 Mar 2025 17:54:57 +0800 Message-ID: <20250310095459.2620647-8-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Previously, EROFS provided both (non-)compact compressed indexes to keep necessary hints for each logical block, enabling O(1) random indexing. This approach was originally designed for small compression units (e.g., 4KiB), where compressed data is strictly block-aligned via fixed-sized output compression. However, EROFS now supports big pclusters up to 1MiB and many users use large configurations to minimize image sizes. For such configurations, the total number of extents decreases significantly (e.g., only 1,024 extents for a 1GiB file using 1MiB pclusters), then runtime metadata overhead becomes negligible compared to data I/O and decoding costs. 
Additionally, some popular compression algorithms (mainly Zstd) still lack native fixed-sized output compression support (although it's planned by their authors). Instead of just waiting for compressor improvements, let's adopt byte-oriented extents, allowing these compressors to retain their current methods. For example, it speeds up Zstd compression a lot: Processor: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz * 96 Dataset: enwik9 Build time Size Type Command Line 3m52.339s 266653696 FO -C524288 -zzstd,22 3m48.549s 266174464 FO -E48bit -C524288 -zzstd,22 0m12.821s 272134144 FI -E48bit -C1048576 --max-extent-bytes=3D1048576 -= zzstd,22 0m14.528s 248987648 FO -C1048576 -zlzma,9 0m14.605s 248504320 FO -E48bit -C1048576 -zlzma,9 Encoded extents are structured as an array of `struct z_erofs_extent`, sorted by logical address in ascending order: __le32 plen // encoded length, algorithm id and flags __le32 pstart_lo // physical offset LSB __le32 pstart_hi // physical offset MSB __le32 lstart_lo // logical offset LSB __le32 lstart_hi // logical offset MSB .. Note that prefixed reduced records can be used to minimize metadata for specific cases (e.g. when lstart fits in 32 bits, records shrink from 32 to 16 bytes). If the logical lengths of all encoded extents are the same, 4-byte (plen) and 8-byte (plen, pstart_lo) records can be used. Otherwise, 16-byte (plen .. lstart_lo) or 32-byte full records have to be used instead. If 16-byte or 32-byte records are used, the total number of extents is kept in `struct z_erofs_map_header`, and binary search can be applied to them. Note that `eytzinger order` is not considered because sequential data access is important. If 4-byte records are used, the 8-byte starting physical offset is stored between `struct z_erofs_map_header` and the `plen` array. In addition, 64-bit physical offsets can be applied with the new encoded extent format to match full 48-bit block addressing. Remove redundant comments around `struct z_erofs_lcluster_index` too. 
Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/erofs_fs.h | 99 +++++++++++++++++++++------------------------ fs/erofs/internal.h | 2 +- fs/erofs/zmap.c | 24 +++++------ 3 files changed, 58 insertions(+), 67 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 791124b3f57c..6d461be790bd 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -331,21 +331,20 @@ struct z_erofs_zstd_cfgs { #define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE =20 /* - * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) - * e.g. for 4k logical cluster size, 4B if compacted 2B is of= f; - * (4B) + 2B + (4B) if compacted 2B is on. - * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) - * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) - * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) - * bit 4 : interlaced plain pcluster (0 - off; 1 - on) - * bit 5 : fragment pcluster (0 - off; 1 - on) + * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes: + * 4B (disabled) vs 4B+2B+4B (enabled) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */ +#define Z_EROFS_ADVISE_EXTENTS 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 #define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +/* Indicate the record size for each extent if extent metadata is used */ +#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1 +#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3 =20 #define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { @@ -357,45 +356,24 @@ struct z_erofs_map_header { /* indicates the encoded size of tailpacking data */ __le16 h_idata_size; }; + __le32 h_extents_lo; /* extent count LSB */ }; __le16 h_advise; - /* - * bit 0-3 : algorithm type of head 1 (logical cluster type 01); - * bit 4-7 : algorithm type of head 2 (logical cluster type 11). 
- */ - __u8 h_algorithmtype; - /* - * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-6 : reserved; - * bit 7 : move the whole file into packed inode or not. - */ - __u8 h_clusterbits; + union { + struct { + /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */ + __u8 h_algorithmtype; + /* + * bit 0-3 : logical cluster bits - blkszbits + * bit 4-6 : reserved + * bit 7 : pack the whole file into packed inode + */ + __u8 h_clusterbits; + }; + __le16 h_extents_hi; /* extent count MSB */ + }; }; =20 -/* - * On-disk logical cluster type: - * 0 - literal (uncompressed) lcluster - * 1,3 - compressed lcluster (for HEAD lclusters) - * 2 - compressed lcluster (for NONHEAD lclusters) - * - * In detail, - * 0 - literal (uncompressed) lcluster, - * di_advise =3D 0 - * di_clusterofs =3D the literal data offset of the lcluster - * di_blkaddr =3D the blkaddr of the literal pcluster - * - * 1,3 - compressed lcluster (for HEAD lclusters) - * di_advise =3D 1 or 3 - * di_clusterofs =3D the decompressed data offset of the lcluster - * di_blkaddr =3D the blkaddr of the compressed pcluster - * - * 2 - compressed lcluster (for NONHEAD lclusters) - * di_advise =3D 2 - * di_clusterofs =3D - * the decompressed data offset in its own HEAD lcluster - * di_u.delta[0] =3D distance to this HEAD lcluster - * di_u.delta[1] =3D distance to the next HEAD lcluster - */ enum { Z_EROFS_LCLUSTER_TYPE_PLAIN =3D 0, Z_EROFS_LCLUSTER_TYPE_HEAD1 =3D 1, @@ -409,11 +387,7 @@ enum { /* (noncompact only, HEAD) This pcluster refers to partial decompressed da= ta */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) =20 -/* - * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store = the - * compressed block count of a compressed extent (in logical clusters, aka. - * block count of a pcluster). 
- */ +/* Set on 1st non-head lcluster to store compressed block counti (in block= s) */ #define Z_EROFS_LI_D0_CBLKCNT (1 << 11) =20 struct z_erofs_lcluster_index { @@ -422,19 +396,36 @@ struct z_erofs_lcluster_index { __le16 di_clusterofs; =20 union { - /* for the HEAD lclusters */ - __le32 blkaddr; + __le32 blkaddr; /* for the HEAD lclusters */ /* - * for the NONHEAD lclusters * [0] - distance to its HEAD lcluster * [1] - distance to the next HEAD lcluster */ - __le16 delta[2]; + __le16 delta[2]; /* for the NONHEAD lclusters */ } di_u; }; =20 -#define Z_EROFS_FULL_INDEX_ALIGN(end) \ - (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) +#define Z_EROFS_MAP_HEADER_END(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header)) +#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8) + +#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27) +#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28 +#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1) +struct z_erofs_extent { + __le32 plen; /* encoded length */ + __le32 pstart_lo; /* physical offset */ + __le32 pstart_hi; /* physical offset MSB */ + __le32 lstart_lo; /* logical offset */ + __le32 lstart_hi; /* logical offset MSB (>=3D 4GiB inodes) */ + __u8 reserved[12]; /* for future use */ +}; + +static inline int z_erofs_extent_recsize(unsigned int advise) +{ + return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) & + Z_EROFS_ADVISE_EXTRECSZ_MASK); +} =20 /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b35742cf9431..f26191fe148b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -262,7 +262,7 @@ struct erofs_inode { struct { unsigned short z_advise; unsigned char z_algorithmtype[2]; - unsigned char z_logical_clusterbits; + unsigned char z_lclusterbits; unsigned long z_tailextent_headlcn; erofs_off_t z_fragmentoff; unsigned short z_idata_size; diff --git 
a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 87f933268ac7..25d3fa8e70d8 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -25,7 +25,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_mapr= ecorder *m, { struct inode *const inode =3D m->inode; struct erofs_inode *const vi =3D EROFS_I(inode); - const erofs_off_t pos =3D Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + const erofs_off_t pos =3D Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; @@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_mapr= ecorder *m, advise =3D le16_to_cpu(di->di_advise); m->type =3D advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type =3D=3D Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - m->clusterofs =3D 1 << vi->z_logical_clusterbits; + m->clusterofs =3D 1 << vi->z_lclusterbits; m->delta[0] =3D le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | @@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_mapr= ecorder *m, } else { m->partialref =3D !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs =3D le16_to_cpu(di->di_clusterofs); - if (m->clusterofs >=3D 1 << vi->z_logical_clusterbits) { + if (m->clusterofs >=3D 1 << vi->z_lclusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs= _maprecorder *m, { struct inode *const inode =3D m->inode; struct erofs_inode *const vi =3D EROFS_I(inode); - const erofs_off_t ebase =3D sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int lclusterbits =3D vi->z_logical_clusterbits; + const erofs_off_t ebase =3D Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize); + const unsigned int lclusterbits =3D vi->z_lclusterbits; const unsigned int totalidx =3D erofs_iblks(inode); 
unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; @@ -255,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_mapre= corder *m, { struct super_block *sb =3D m->inode->i_sb; struct erofs_inode *const vi =3D EROFS_I(m->inode); - const unsigned int lclusterbits =3D vi->z_logical_clusterbits; + const unsigned int lclusterbits =3D vi->z_lclusterbits; =20 while (m->lcn >=3D lookback_distance) { unsigned long lcn =3D m->lcn - lookback_distance; @@ -304,7 +304,7 @@ static int z_erofs_get_extent_compressedlen(struct z_er= ofs_maprecorder *m, if ((m->headtype =3D=3D Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || ((m->headtype =3D=3D Z_EROFS_LCLUSTER_TYPE_PLAIN || m->headtype =3D=3D Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || - (lcn << vi->z_logical_clusterbits) >=3D inode->i_size) + (lcn << vi->z_lclusterbits) >=3D inode->i_size) m->compressedblks =3D 1; =20 if (m->compressedblks) @@ -354,7 +354,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_= erofs_maprecorder *m) struct inode *inode =3D m->inode; struct erofs_inode *vi =3D EROFS_I(inode); struct erofs_map_blocks *map =3D m->map; - unsigned int lclusterbits =3D vi->z_logical_clusterbits; + unsigned int lclusterbits =3D vi->z_lclusterbits; u64 lcn =3D m->lcn, headlcn =3D map->m_la >> lclusterbits; int err; =20 @@ -398,16 +398,16 @@ static int z_erofs_do_map_blocks(struct inode *inode, struct super_block *sb =3D inode->i_sb; bool fragment =3D vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; bool ztailpacking =3D vi->z_idata_size; + unsigned int lclusterbits =3D vi->z_lclusterbits; struct z_erofs_maprecorder m =3D { .inode =3D inode, .map =3D map, }; int err =3D 0; - unsigned int lclusterbits, endoff, afmt; + unsigned int endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; =20 - lclusterbits =3D vi->z_logical_clusterbits; ofs =3D flags & EROFS_GET_BLOCKS_FINDTAIL ? 
inode->i_size - 1 : map->m_la; initial_lcn =3D ofs >> lclusterbits; endoff =3D ofs & ((1 << lclusterbits) - 1); @@ -569,6 +569,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto done; } vi->z_advise =3D le16_to_cpu(h->h_advise); + vi->z_lclusterbits =3D sb->s_blocksize_bits + (h->h_clusterbits & 15); vi->z_algorithmtype[0] =3D h->h_algorithmtype & 15; vi->z_algorithmtype[1] =3D h->h_algorithmtype >> 4; if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) @@ -585,7 +586,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } =20 - vi->z_logical_clusterbits =3D sb->s_blocksize_bits + (h->h_clusterbits & = 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-97.freemail.mail.aliyun.com (out30-97.freemail.mail.aliyun.com [115.124.30.97]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E0429225408 for ; Mon, 10 Mar 2025 09:55:19 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.97 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600522; cv=none; b=nY9tC2Wmk7OxA2YE6hhkgsAoH+AhND6hNAG+behzxfT4cMMIbhALFcr2IyRnNX8VCO4R+7r31DFQCB7qEKFpxv+0YoLWsTUjoXWIhhHV6gbNLLHZRyzPILQ5RrzDRKdYUFpmSLdrLejIBdl5XBbN3Ezpg6nfvU1h/fehVVEPpdE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600522; c=relaxed/simple; bh=4O0BS8duQ8hRgCRKbS/GapkPLCFWUBWhUjVnncjM+6k=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=RYYl3RpiKlIX8veDOna424Sd9RHuycjvHJLktENfHRrmPJMaeKFx00XQ531awkvNOeNAbcv7zWeYBg6PapGw7VOZncr/adcemZt4WtNN6X1n+PfHnch0Y7+w6460yOIgB9mTETQg+HCF7vcNmqZjhUmdj2+CTGby7w3osMt/Urc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none 
dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=YT0MHLRA; arc=none smtp.client-ip=115.124.30.97 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="YT0MHLRA" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600512; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=8HUn6nXby9y1Idtt+UidKIGNy4BBNN0fCgY5e/PE3Jk=; b=YT0MHLRAlgH2Gg4R2V/w6QNBTgcAd6MBGjwn0w0rUxi85HF5FZ5YYPxxdQB8GLxss/5zNRfpm1VZVTV7W6b8mEgldZ15CAgYUOY2ErKm64wxVDjbrCIwaj2MYFcrAqhXav1CXWNt3tMsFBjqKODo7nLe1+P7ngaSo6ArOxvAASI= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F4-p_1741600511 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:11 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 08/10] erofs: implement encoded extent metadata Date: Mon, 10 Mar 2025 17:54:58 +0800 Message-ID: <20250310095459.2620647-9-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Implement the extent metadata parsing described in the previous commit. 
For 16-byte and 32-byte extent records, currently it is just a trivial binary search without considering the last access footprint, but it can be optimized for better sequential performance later. Tail fragments are supported, but the ztailpacking feature is not supported yet, for simplicity. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/internal.h | 5 +- fs/erofs/zmap.c | 142 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 135 insertions(+), 12 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f26191fe148b..4ac188d5d894 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -263,7 +263,10 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_lclusterbits; - unsigned long z_tailextent_headlcn; + union { + u64 z_tailextent_headlcn; + u64 z_extents; + }; erofs_off_t z_fragmentoff; unsigned short z_idata_size; }; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 25d3fa8e70d8..8de50df05dfe 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -391,7 +391,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_= erofs_maprecorder *m) return 0; } =20 -static int z_erofs_do_map_blocks(struct inode *inode, +static int z_erofs_map_blocks_fo(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *vi =3D EROFS_I(inode); @@ -409,6 +409,14 @@ static int z_erofs_do_map_blocks(struct inode *inode, unsigned long long ofs, end; =20 ofs =3D flags & EROFS_GET_BLOCKS_FINDTAIL ? 
inode->i_size - 1 : map->m_la; + if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && + !vi->z_tailextent_headlcn) { + map->m_la =3D 0; + map->m_llen =3D inode->i_size; + map->m_flags =3D EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + return 0; + } initial_lcn =3D ofs >> lclusterbits; endoff =3D ofs & ((1 << lclusterbits) - 1); =20 @@ -526,6 +534,115 @@ static int z_erofs_do_map_blocks(struct inode *inode, return err; } =20 +static int z_erofs_map_blocks_ext(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + struct erofs_inode *vi =3D EROFS_I(inode); + struct super_block *sb =3D inode->i_sb; + bool interlaced =3D vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER; + unsigned int recsz =3D z_erofs_extent_recsize(vi->z_advise); + erofs_off_t pos =3D round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize), recsz); + erofs_off_t lend =3D inode->i_size; + erofs_off_t l, r, mid, pa, la, lstart; + struct z_erofs_extent *ext; + unsigned int fmt; + bool last; + + map->m_flags =3D 0; + if (recsz <=3D offsetof(struct z_erofs_extent, pstart_hi)) { + if (recsz <=3D offsetof(struct z_erofs_extent, pstart_lo)) { + ext =3D erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + pa =3D le64_to_cpu(*(__le64 *)ext); + pos +=3D sizeof(__le64); + lstart =3D 0; + } else { + lstart =3D map->m_la >> vi->z_lclusterbits; + pa =3D EROFS_NULL_ADDR; + } + + for (; lstart <=3D map->m_la; lstart +=3D 1 << vi->z_lclusterbits) { + ext =3D erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + map->m_plen =3D le32_to_cpu(ext->plen); + if (pa !=3D EROFS_NULL_ADDR) { + map->m_pa =3D pa; + pa +=3D map->m_plen & Z_EROFS_EXTENT_PLEN_MASK; + } else { + map->m_pa =3D le32_to_cpu(ext->pstart_lo); + } + pos +=3D recsz; + } + last =3D (lstart >=3D round_up(lend, 1 << vi->z_lclusterbits)); + lend =3D min(lstart, lend); + lstart -=3D 1 << vi->z_lclusterbits; + } else { + 
lstart =3D lend; + for (l =3D 0, r =3D vi->z_extents; l < r; ) { + mid =3D l + (r - l) / 2; + ext =3D erofs_read_metabuf(&map->buf, sb, + pos + mid * recsz, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + + la =3D le32_to_cpu(ext->lstart_lo); + pa =3D le32_to_cpu(ext->pstart_lo) | + (u64)le32_to_cpu(ext->pstart_hi) << 32; + if (recsz > offsetof(struct z_erofs_extent, lstart_hi)) + la |=3D (u64)le32_to_cpu(ext->lstart_hi) << 32; + + if (la > map->m_la) { + r =3D mid; + lend =3D la; + } else { + l =3D mid + 1; + if (map->m_la =3D=3D la) + r =3D min(l + 1, r); + lstart =3D la; + map->m_plen =3D le32_to_cpu(ext->plen); + map->m_pa =3D pa; + } + } + last =3D (l >=3D vi->z_extents); + } + + if (lstart < lend) { + map->m_la =3D lstart; + if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + map->m_flags |=3D EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + vi->z_fragmentoff =3D map->m_plen; + if (recsz >=3D offsetof(struct z_erofs_extent, pstart_lo)) + vi->z_fragmentoff |=3D map->m_pa << 32; + } else if (map->m_plen) { + map->m_flags |=3D EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; + fmt =3D map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; + if (fmt) + map->m_algorithmformat =3D fmt - 1; + else if (interlaced && !erofs_blkoff(sb, map->m_pa)) + map->m_algorithmformat =3D + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat =3D + Z_EROFS_COMPRESSION_SHIFTED; + if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL) + map->m_flags |=3D EROFS_MAP_PARTIAL_REF; + map->m_plen &=3D Z_EROFS_EXTENT_PLEN_MASK; + } + } + map->m_llen =3D lend - map->m_la; + if (!last && map->m_llen < sb->s_blocksize) { + erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", + map->m_llen, map->m_la, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return 0; +} + static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi =3D EROFS_I(inode); @@ -570,6 +687,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } vi->z_advise 
=3D le16_to_cpu(h->h_advise); vi->z_lclusterbits =3D sb->s_blocksize_bits + (h->h_clusterbits & 15); + if (vi->datalayout =3D=3D EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) { + vi->z_extents =3D le32_to_cpu(h->h_extents_lo) | + ((u64)le16_to_cpu(h->h_extents_hi) << 32); + goto done; + } + vi->z_algorithmtype[0] =3D h->h_algorithmtype & 15; vi->z_algorithmtype[1] =3D h->h_algorithmtype >> 4; if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) @@ -609,7 +733,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) .buf =3D __EROFS_BUF_INITIALIZER }; =20 - err =3D z_erofs_do_map_blocks(inode, &map, + err =3D z_erofs_map_blocks_fo(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) @@ -640,15 +764,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, stru= ct erofs_map_blocks *map, } else { err =3D z_erofs_fill_inode_lazy(inode); if (!err) { - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la =3D 0; - map->m_llen =3D inode->i_size; - map->m_flags =3D EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; - } else { - err =3D z_erofs_do_map_blocks(inode, map, flags); - } + if (vi->datalayout =3D=3D EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) + err =3D z_erofs_map_blocks_ext(inode, map, flags); + else + err =3D z_erofs_map_blocks_fo(inode, map, flags); } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-130.freemail.mail.aliyun.com (out30-130.freemail.mail.aliyun.com [115.124.30.130]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DED10225A40 for ; Mon, 10 Mar 2025 09:55:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.130 ARC-Seal: i=1; 
a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600525; cv=none; b=Hquxokteff91d7XlhcrFRBWQbDqny/lPtrYSVgzBIoUn4MvBqfUFcmHoBLMZJJKooY5eSBi+RLyU6Uys1wi1LXXZC5mB0Du+2BWjbIL1v6zaUCDfGnJBDGLTRSeGnQCiG0MiCG+ph/BRBSQbnuBsdQyqaxm8jFlO2OsDIYaGReo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600525; c=relaxed/simple; bh=ww55T+hoX36OF565Dg/tA+6CIzmhFxu/2/WLLc2sscI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=F3H5GyLsesRYWlWLE5zBNZOcy/NatW7ubwA2ZR6og8zEjbAly8H2p7M57H6tOm0O/lSf0Bv3obhCdVVFi+c6fXpKarxWV+nym4KSghko2FrG7yK9WM+HOhcJrYc9m+heGKs+HPucnNUUWBDQXp1mFyFRoIcqov5NUVymIUuBb9U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=SEJ27EFI; arc=none smtp.client-ip=115.124.30.130 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="SEJ27EFI" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600513; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=fTu+r29pJ2St4N5TdLAuOppyVCL2SeeAabz9j0fnfo8=; b=SEJ27EFIC1krInlV/hMnG/MTpAn+kF4UcVk5Qr8DkN4oRUkDXdW0VK1IFngyn32JPDyxX0xw7CAex85wJBHWAId8xi3fls5DJpPr0sPxR8d1PMnRXkhgfmDxITUG3VBYoTGfOPuGOGbDJOE3FQfbQDNeRKCZTZYKirOLMQNHz3o= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR1F4.H_1741600511 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:55:12 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 09/10] erofs: support unaligned encoded data 
Date: Mon, 10 Mar 2025 17:54:59 +0800 Message-ID: <20250310095459.2620647-10-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" We're almost there. It's straight-forward to adapt the current decompression subsystem to support unaligned encoded (compressed) data. Note that unaligned data is not encouraged because of worse I/O and caching efficiency unless the corresponding compressor doesn't support fixed-sized output compression natively like Zstd. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 92 ++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 48 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 50e350b10f89..bf62e2836b60 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -313,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decom= press_req *rq, rq->outputsize -=3D cur; } =20 - for (; rq->outputsize; rq->pageofs_in =3D 0, cur +=3D PAGE_SIZE, ni++) { + for (; rq->outputsize; rq->pageofs_in =3D 0, cur +=3D insz, ni++) { insz =3D min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -=3D insz; if (!rq->in[ni]) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc6d6842c5c2..0671184d9cf1 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -44,8 +44,8 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ struct z_erofs_pcluster *next; =20 - /* I: start block address of this pcluster */ - erofs_off_t index; + /* I: start physical position of this pcluster */ + erofs_off_t pos; =20 /* L: the maximum decompression size of this round */ 
unsigned int length; @@ -73,6 +73,9 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; =20 + /* I: whether compressed data is in-lined or not */ + bool from_meta; + /* L: whether partial decompression or not */ bool partial; =20 @@ -102,14 +105,9 @@ struct z_erofs_decompressqueue { bool eio, sync; }; =20 -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->index; -} - static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *= pcl) { - return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; + return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; } =20 static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio= *fo) @@ -133,7 +131,7 @@ struct z_erofs_pcluster_slab { =20 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly =3D { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), - _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; =20 struct z_erofs_bvec_iter { @@ -267,7 +265,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(= unsigned int size) pcl =3D kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclustersize =3D size; return pcl; } return ERR_PTR(-EINVAL); @@ -516,6 +513,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend = *fe) struct z_erofs_pcluster *pcl =3D fe->pcl; unsigned int pclusterpages =3D z_erofs_pclusterpages(pcl); bool shouldalloc =3D z_erofs_should_alloc_cache(fe); + pgoff_t poff =3D pcl->pos >> PAGE_SHIFT; bool may_bypass =3D true; /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp =3D (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | @@ -532,7 +530,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend = *fe) if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; =20 - folio =3D filemap_get_folio(mc, pcl->index + i); + folio =3D filemap_get_folio(mc, poff + i); if (IS_ERR(folio)) { 
may_bypass =3D false; if (!shouldalloc) @@ -575,7 +573,7 @@ static int erofs_try_to_free_all_cached_folios(struct e= rofs_sb_info *sbi, struct folio *folio; int i; =20 - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); /* Each cached folio contains one page unless bs > ps is supported */ for (i =3D 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page) { @@ -607,7 +605,7 @@ static bool z_erofs_cache_release_folio(struct folio *f= olio, gfp_t gfp) ret =3D false; spin_lock(&pcl->lockref.lock); if (pcl->lockref.count <=3D 0) { - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) =3D=3D folio) { bvec->page =3D NULL; @@ -667,16 +665,20 @@ static int z_erofs_attach_page(struct z_erofs_fronten= d *fe, int ret; =20 if (exclusive) { - /* give priority for inplaceio to use file pages first */ - spin_lock(&pcl->lockref.lock); - while (fe->icur > 0) { - if (pcl->compressed_bvecs[--fe->icur].page) - continue; - pcl->compressed_bvecs[fe->icur] =3D *bvec; + /* Inplace I/O is limited to one page for uncompressed data */ + if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || + fe->icur <=3D 1) { + /* Try to prioritize inplace I/O here */ + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] =3D *bvec; + spin_unlock(&pcl->lockref.lock); + return 0; + } spin_unlock(&pcl->lockref.lock); - return 0; } - spin_unlock(&pcl->lockref.lock); =20 /* otherwise, check if it can be used as a bvpage */ if (fe->mode >=3D Z_EROFS_PCLUSTER_FOLLOWED && @@ -711,27 +713,26 @@ static int z_erofs_register_pcluster(struct z_erofs_f= rontend *fe) struct erofs_map_blocks *map =3D &fe->map; struct super_block *sb =3D fe->inode->i_sb; struct erofs_sb_info *sbi =3D EROFS_SB(sb); - bool ztailpacking =3D map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl, *pre; + unsigned int 
pageofs_in; int err; =20 - if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - /* no available pcluster, let's allocate one */ - pcl =3D z_erofs_alloc_pcluster(map->m_plen); + pageofs_in =3D erofs_blkoff(sb, map->m_pa); + pcl =3D z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); =20 lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat =3D map->m_algorithmformat; + pcl->pclustersize =3D map->m_plen; + pcl->pageofs_in =3D pageofs_in; pcl->length =3D 0; pcl->partial =3D true; pcl->next =3D fe->head; + pcl->pos =3D map->m_pa; + pcl->pageofs_in =3D pageofs_in; pcl->pageofs_out =3D map->m_la & ~PAGE_MASK; + pcl->from_meta =3D map->m_flags & EROFS_MAP_META; fe->mode =3D Z_EROFS_PCLUSTER_FOLLOWED; =20 /* @@ -741,13 +742,10 @@ static int z_erofs_register_pcluster(struct z_erofs_f= rontend *fe) mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); =20 - if (ztailpacking) { - pcl->index =3D 0; /* which indicates ztailpacking */ - } else { - pcl->index =3D erofs_blknr(sb, map->m_pa); + if (!pcl->from_meta) { while (1) { xa_lock(&sbi->managed_pslots); - pre =3D __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + pre =3D __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl, GFP_KERNEL); if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { xa_unlock(&sbi->managed_pslots); @@ -779,7 +777,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_fronte= nd *fe) { struct erofs_map_blocks *map =3D &fe->map; struct super_block *sb =3D fe->inode->i_sb; - erofs_blk_t blknr =3D erofs_blknr(sb, map->m_pa); struct z_erofs_pcluster *pcl =3D NULL; int ret; =20 @@ -790,9 +787,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_fronte= nd *fe) if (!(map->m_flags & EROFS_MAP_META)) { while (1) { rcu_read_lock(); - pcl =3D xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + pcl =3D xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); 
if (!pcl || z_erofs_get_pcluster(pcl)) { - DBG_BUGON(pcl && blknr !=3D pcl->index); + DBG_BUGON(pcl && map->m_pa !=3D pcl->pos); rcu_read_unlock(); break; } @@ -826,7 +823,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_fronte= nd *fe) =20 z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - if (!z_erofs_is_inline_pcluster(fe->pcl)) { + if (!fe->pcl->from_meta) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { @@ -871,7 +868,7 @@ static bool __erofs_try_to_release_pcluster(struct erof= s_sb_info *sbi, * It's impossible to fail after the pcluster is freezed, but in order * to avoid some race conditions, add a DBG_BUGON to observe this. */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) !=3D pcl); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) !=3D pcl); =20 lockref_mark_dead(&pcl->lockref); return true; @@ -1221,7 +1218,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_back= end *be, bool *overlapped) } be->compressed_pages[i] =3D page; =20 - if (z_erofs_is_inline_pcluster(pcl) || + if (pcl->from_meta || erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err =3D -EIO; @@ -1299,7 +1296,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs= _backend *be, int err) }, be->pagepool); =20 /* must handle all compressed pages before actual file pages */ - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { page =3D pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); @@ -1359,7 +1356,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs= _backend *be, int err) WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); =20 - if (z_erofs_is_inline_pcluster(pcl)) + if (pcl->from_meta) z_erofs_free_pcluster(pcl); else z_erofs_put_pcluster(sbi, pcl, try_free); @@ -1540,7 +1537,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, folio =3D page_folio(page); 
out_tocache: if (!tocache || bs !=3D PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { + filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private =3D (void *)Z_EROFS_SHORTLIVED_PAGE; return; @@ -1657,19 +1654,20 @@ static void z_erofs_submit_queue(struct z_erofs_fro= ntend *f, =20 pcl =3D next; next =3D READ_ONCE(pcl->next); - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } =20 /* no device id here, thus it will always succeed */ mdev =3D (struct erofs_map_dev) { - .m_pa =3D erofs_pos(sb, pcl->index), + .m_pa =3D round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); =20 cur =3D mdev.m_pa; - end =3D cur + pcl->pclustersize; + end =3D round_up(cur + pcl->pageofs_in + pcl->pclustersize, + sb->s_blocksize); do { bvec.bv_page =3D NULL; if (bio && (cur !=3D last_pa || --=20 2.43.5 From nobody Sun Feb 8 04:34:15 2026 Received: from out30-112.freemail.mail.aliyun.com (out30-112.freemail.mail.aliyun.com [115.124.30.112]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F057F225765 for ; Mon, 10 Mar 2025 09:56:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.112 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600598; cv=none; b=L0ixHy/wk0RxjalWgUGsYC99F1misn+8EZoS0IjZ8P6b/dyTb/Lbrgsv8PiwGP8t/QxBvREGOWyianClnqMyioYTzCNL1IkoyG4AANfppXrdZ6DTO4HS0kO5gb+ZmCLuU0gugOLefO4199x7TN6wLZR7MaRvIewX5tslqXVtVZI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741600598; c=relaxed/simple; bh=rPint/XhVsP8Z1f3hHjLyymK3IepZ6F3UIF5xCrRxKs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=j4VEWRmpvCC59p5pg2ru0K9gFd7x4CDsYs9EHwJEAfsBFqz09VBOpYn9+CWoCnAMETpbgUPwMrYKf8hQJRhZIOk3S8MDwFhsrO8r7wqrDcs5ecHCs9H+dBjS++q43F5+dgZoKwhPMHUkCMmYLNAZvblVbdy8TSsYtxyrmU3de9U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=jMjkmFFl; arc=none smtp.client-ip=115.124.30.112 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="jMjkmFFl" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1741600588; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=n44x2GIBHkV592zsiETUNG9/RVsDGqeAc5pa7H6OG30=; b=jMjkmFFlJMlOOIXZOQerOmrqPvozI0C/77Mr93ATza7N6S3AFtxsLIpYv3NLOsvji7gKe2L7w262dWZOS9qsDY+8JcjYGK7qKdISZ5b0MZt7lhDiLvx0EwdfR2fXbImgBFRuIUG1sjJNzv86pdn31B70EmzMOvL/yfXiG3GeVlM= Received: from x31i01179.sqa.na131.tbsite.net(mailfrom:hsiangkao@linux.alibaba.com fp:SMTPD_---0WR0rjXe_1741600587 cluster:ay36) by smtp.aliyun-inc.com; Mon, 10 Mar 2025 17:56:27 +0800 From: Gao Xiang To: linux-erofs@lists.ozlabs.org Cc: LKML , Gao Xiang Subject: [PATCH 10/10] erofs: enable 48-bit layout support Date: Mon, 10 Mar 2025 17:56:25 +0800 Message-ID: <20250310095625.2623817-1-hsiangkao@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> References: <20250310095459.2620647-1-hsiangkao@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Both 
48-bit block addressing and encoded extents are implemented, let's enable them formally. Signed-off-by: Gao Xiang Acked-by: Chao Yu --- fs/erofs/Kconfig | 14 +++++++------- fs/erofs/erofs_fs.h | 2 +- fs/erofs/super.c | 2 ++ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6ea60661fa55..331e49cd1b8d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -13,12 +13,12 @@ config EROFS_FS smartphones with Android OS, LiveCDs and high-density hosts with numerous containers; =20 - It also provides fixed-sized output compression support in order to - improve storage density as well as keep relatively higher compression - ratios and implements in-place decompression to reuse the file page - for compressed data temporarily with proper strategies, which is - quite useful to ensure guaranteed end-to-end runtime decompression - performance under extremely memory pressure without extra cost. + It also provides transparent compression and deduplication support to + improve storage density and maintain relatively high compression + ratios, and it implements in-place decompression to temporarily reuse + page cache for compressed data using proper strategies, which is + quite useful for ensuring guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. =20 See the documentation at <file:Documentation/filesystems/erofs.rst> and the web pages at <https://erofs.docs.kernel.org> for more details. @@ -97,7 +97,7 @@ config EROFS_FS_ZIP select LZ4_DECOMPRESS default y help - Enable fixed-sized output compression for EROFS. + Enable transparent compression support for EROFS file systems. =20 If you don't want to enable compression feature, say N. 
=20 diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 6d461be790bd..9581e9bf8192 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -32,7 +32,7 @@ #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 #define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - ((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1) + ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1) =20 #define EROFS_SB_EXTSLOT_SIZE 16 =20 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 0156ee7217c9..a8fc75fd1c74 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -330,6 +330,8 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret =3D erofs_scan_devices(sb, dsb); =20 + if (erofs_sb_has_48bit(sbi)) + erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your o= wn risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use= . Use at your own risk!"); out: --=20 2.43.5