[PATCH] erofs: add direct I/O support for compressed data
Direct I/O is particularly useful in memory-sensitive scenarios, as it
provides more predictable performance by avoiding unnecessary page cache
overhead. For example, when accessing large files such as AI model files
that are typically read only once, buffered I/O introduces redundant page
cache usage and extra page copies, leading to unstable performance and
increased CPU load due to memory reclaim. Direct I/O avoids these costs.
The table below shows that direct I/O performs up to 54.6% better than
buffered I/O in the low-memory scenario. The results were obtained using
the fio benchmark with 8 threads, each thread reading a 2.5GB file, on
ARM64 Android devices running the 6.6 kernel with an 8-core CPU and 12GB
of memory.
+--------------------------------------------------------------------------+
| fio benchmark       | buffered I/O (MiB/s) | direct I/O (MiB/s) | diff   |
|---------------------+----------------------+--------------------+--------|
| normal scenario     | 2629.8               | 3648.7             | +38.7% |
|---------------------+----------------------+--------------------+--------|
| low memory scenario | 2350.0               | 3633.9             | +54.6% |
+--------------------------------------------------------------------------+
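(The diff column is relative to buffered I/O; for the low-memory row,
(3633.9 - 2350.0) / 2350.0 ≈ +54.6%.)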
This patch does not support the following two cases, which fall back to
buffered I/O:
(1) large folios, which will be supported in a follow-up patch.
(2) folios with private data attached, as the private data is required by
this direct I/O implementation.
Signed-off-by: Chunhai Guo <guochunhai@vivo.com>
---
fs/erofs/data.c | 22 ++-
fs/erofs/fileio.c | 2 +-
fs/erofs/inode.c | 2 +-
fs/erofs/internal.h | 15 +-
fs/erofs/zdata.c | 332 +++++++++++++++++++++++++++++++++++++++-----
5 files changed, 322 insertions(+), 51 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3b1ba571c728..3762e7efc94b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -224,20 +224,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return 0;
}
-/*
- * bit 30: I/O error occurred on this folio
- * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
- * bit 0 - 29: remaining parts to complete this folio
- */
-#define EROFS_ONLINEFOLIO_EIO 30
-#define EROFS_ONLINEFOLIO_DIRTY 29
-
-void erofs_onlinefolio_init(struct folio *folio)
+void erofs_onlinefolio_init(struct folio *folio, bool dio)
{
union {
atomic_t o;
void *v;
- } u = { .o = ATOMIC_INIT(1) };
+ } u = { .o = ATOMIC_INIT(dio ? BIT(EROFS_ONLINEFOLIO_DIO) + 1 : 1) };
folio->private = u.v; /* valid only if file-backed folio is locked */
}
@@ -247,7 +239,7 @@ void erofs_onlinefolio_split(struct folio *folio)
atomic_inc((atomic_t *)&folio->private);
}
-void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
+bool erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
{
int orig, v;
@@ -258,12 +250,14 @@ void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
- if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
- return;
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIO) - 1))
+ return false;
folio->private = 0;
if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
flush_dcache_folio(folio);
- folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
+ if (!(v & BIT(EROFS_ONLINEFOLIO_DIO)))
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
+ return true;
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index b7b3432a9882..aeecb861faa1 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -98,7 +98,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
loff_t pos = folio_pos(folio), ofs;
int err = 0;
- erofs_onlinefolio_init(folio);
+ erofs_onlinefolio_init(folio, false);
while (cur < end) {
if (!in_range(pos + cur, map->m_la, map->m_llen)) {
map->m_la = pos + cur;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 9a2f59721522..9248143e26df 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -214,7 +214,7 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFREG:
inode->i_op = &erofs_generic_iops;
if (erofs_inode_is_data_compressed(vi->datalayout))
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &z_erofs_file_fops;
else
inode->i_fop = &erofs_file_fops;
break;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9319c66e86c3..f194ae889a73 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -407,6 +407,7 @@ extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
+extern const struct file_operations z_erofs_file_fops;
/* flags for erofs_fscache_register_cookie() */
#define EROFS_REG_COOKIE_SHARE 0x0001
@@ -425,9 +426,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
-void erofs_onlinefolio_init(struct folio *folio);
+void erofs_onlinefolio_init(struct folio *folio, bool dio);
void erofs_onlinefolio_split(struct folio *folio);
-void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
+bool erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -467,6 +468,16 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
}
void erofs_release_pages(struct page **pagepool);
+/*
+ * bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
+ * bit 28: folio is read via direct I/O
+ * bit 0 - 27: remaining parts to complete this folio
+ */
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
+#define EROFS_ONLINEFOLIO_DIO 28
+
#ifdef CONFIG_EROFS_FS_ZIP
#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
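As a standalone illustration (plain C, not kernel code) of the packed
layout above — the low 28 bits count outstanding parts and the folio
completes when they reach zero, while the flag bits survive for the
finisher to inspect:

#include <stdbool.h>
#include <stdint.h>

/* stand-ins for the EROFS_ONLINEFOLIO_* defines above */
#define ONLINEFOLIO_EIO		30
#define ONLINEFOLIO_DIRTY	29
#define ONLINEFOLIO_DIO		28

/* mirrors erofs_onlinefolio_init(): one initial part, plus the DIO flag */
static uint32_t onlinefolio_init(bool dio)
{
	return (dio ? UINT32_C(1) << ONLINEFOLIO_DIO : 0) + 1;
}

/* mirrors the completion test in erofs_onlinefolio_end(): done once the
 * low 28 bits (remaining parts) are all zero */
static bool onlinefolio_complete(uint32_t v)
{
	return !(v & ((UINT32_C(1) << ONLINEFOLIO_DIO) - 1));
}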
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 625b8ae8f67f..e27b17606ad8 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -27,6 +27,20 @@ struct name { \
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
+#define Z_EROFS_ONSTACK_PAGES 32
+
+struct dio_erofs {
+ bool is_pinned; /* T if we have pins on the pages */
+ bool should_dirty; /* if pages should be dirtied */
+ int eio; /* IO error */
+ atomic_t ref; /* refcount for AIO completion of pcl */
+ struct task_struct *waiter; /* waiting task (NULL if none) */
+ struct kiocb *iocb; /* kiocb */
+ loff_t pos; /* current file position we are operating on */
+ loff_t size; /* IO size */
+ struct page *pages[Z_EROFS_ONSTACK_PAGES]; /* page buffer */
+};
+
/*
* Structure fields follow one of the following exclusion rules.
*
@@ -39,6 +53,7 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
*/
struct z_erofs_pcluster {
struct mutex lock;
+ struct mutex dio_lock;
struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
@@ -82,6 +97,9 @@ struct z_erofs_pcluster {
/* L: whether extra buffer allocations are best-effort */
bool besteffort;
+ /* L: store direct I/O-related information */
+ struct dio_erofs *dio;
+
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
};
@@ -112,8 +130,11 @@ static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
return fo->mapping == MNGD_MAPPING(sbi);
}
-#define Z_EROFS_ONSTACK_PAGES 32
-
+static inline void z_erofs_dio_size_add(struct dio_erofs *dio, loff_t len)
+{
+ if (dio)
+ dio->size += len;
+}
/*
* since pclustersize is variable for big pcluster feature, introduce slab
* pools implementation for different pcluster sizes.
@@ -506,16 +527,22 @@ struct z_erofs_frontend {
/* a pointer used to pick up inplace I/O pages */
unsigned int icur;
+
+ struct dio_erofs *dio;
};
#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
.inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho, \
+ .dio = NULL }
static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
{
unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
+ if (fe->dio)
+ return false;
+
if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
return false;
@@ -736,6 +763,24 @@ static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
return true;
}
+static void z_erofs_pcl_unlock(struct z_erofs_pcluster *pcl, int err)
+{
+ struct dio_erofs *dio = pcl->dio;
+
+ mutex_unlock(&pcl->lock);
+ if (dio) {
+ dio->eio = dio->eio ?: err;
+ if (atomic_dec_and_test(&dio->ref)) {
+ struct task_struct *waiter = dio->waiter;
+
+ WRITE_ONCE(dio->waiter, NULL);
+ wake_up_process(waiter);
+ }
+ pcl->dio = NULL;
+ mutex_unlock(&pcl->dio_lock);
+ }
+}
+
static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
@@ -766,7 +811,13 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
+ mutex_init(&pcl->dio_lock);
mutex_init(&pcl->lock);
+ if (fe->dio) {
+ DBG_BUGON(!mutex_trylock(&pcl->dio_lock));
+ pcl->dio = fe->dio;
+ atomic_inc(&fe->dio->ref);
+ }
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (!pcl->from_meta) {
@@ -795,7 +846,7 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
return 0;
err_out:
- mutex_unlock(&pcl->lock);
+ z_erofs_pcl_unlock(pcl, (err == -EEXIST) ? 0 : err);
z_erofs_free_pcluster(pcl);
return err;
}
@@ -835,12 +886,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
ret = z_erofs_register_pcluster(fe);
}
+ pcl = fe->pcl;
if (ret == -EEXIST) {
- mutex_lock(&fe->pcl->lock);
+ if (fe->dio) {
+ if (!mutex_is_locked(&pcl->dio_lock) ||
+ (mutex_get_owner(&pcl->dio_lock) !=
+ (unsigned long)current)) {
+ mutex_lock(&pcl->dio_lock);
+ DBG_BUGON(pcl->dio);
+ pcl->dio = fe->dio;
+ atomic_inc(&fe->dio->ref);
+ }
+ }
+ mutex_lock(&pcl->lock);
/* check if this pcluster hasn't been linked into any chain. */
- if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
+ if (!cmpxchg(&pcl->next, NULL, fe->head)) {
/* .. so it can be attached to our submission chain */
- fe->head = fe->pcl;
+ fe->head = pcl;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
} else { /* otherwise, it belongs to an inflight chain */
fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
@@ -849,9 +911,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
return ret;
}
- z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
- Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- if (!fe->pcl->from_meta) {
+ z_erofs_bvec_iter_begin(&fe->biter, &pcl->bvset,
+ Z_EROFS_INLINE_BVECS, pcl->vcnt);
+ if (!pcl->from_meta) {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
@@ -866,12 +928,12 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
return ret;
}
folio_get(page_folio(map->buf.page));
- WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
- fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
+ WRITE_ONCE(pcl->compressed_bvecs[0].page, map->buf.page);
+ pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* file-backed inplace I/O pages are traversed in reverse order */
- fe->icur = z_erofs_pclusterpages(fe->pcl);
+ fe->icur = z_erofs_pclusterpages(pcl);
return 0;
}
@@ -1005,19 +1067,52 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
return 0;
}
+static bool erofs_is_dio_folio(struct folio *folio)
+{
+ return atomic_read((atomic_t *)&folio->private) &
+ BIT(EROFS_ONLINEFOLIO_DIO);
+}
+
+static bool z_erofs_page_is_invalidated(struct page *page)
+{
+ return !page_folio(page)->mapping &&
+ !z_erofs_is_shortlived_page(page) &&
+ !erofs_is_dio_folio(page_folio(page));
+}
+
+static void z_erofs_onlinefolio_end(struct folio *folio, int err, bool dirty,
+ struct dio_erofs *dio)
+{
+ bool ret, is_dfolio = erofs_is_dio_folio(folio);
+
+ DBG_BUGON(is_dfolio && !dio);
+ ret = erofs_onlinefolio_end(folio, err, dirty);
+ if (!ret || !dio || !is_dfolio)
+ return;
+
+ if (dio->should_dirty && !folio_test_dirty(folio)) {
+ DBG_BUGON(folio_test_locked(folio));
+ folio_lock(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ }
+ if (dio->is_pinned)
+ unpin_user_folio(folio, 1);
+}
+
static int z_erofs_scan_folio(struct z_erofs_frontend *f,
struct folio *folio, bool ra)
{
struct inode *const inode = f->inode;
struct erofs_map_blocks *const map = &f->map;
- const loff_t offset = folio_pos(folio);
+ const loff_t offset = f->dio ? f->dio->pos : folio_pos(folio);
const unsigned int bs = i_blocksize(inode);
unsigned int end = folio_size(folio), split = 0, cur, pgs;
bool tight, excl;
int err = 0;
tight = (bs == PAGE_SIZE);
- erofs_onlinefolio_init(folio);
+ erofs_onlinefolio_init(folio, f->dio);
do {
if (offset + end - 1 < map->m_la ||
offset + end - 1 >= map->m_la + map->m_llen) {
@@ -1036,15 +1131,18 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, end);
+ z_erofs_dio_size_add(f->dio, end - cur);
tight = false;
} else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
erofs_off_t fpos = offset + cur - map->m_la;
+ u64 len = min(map->m_llen - fpos, end - cur);
err = z_erofs_read_fragment(inode->i_sb, folio, cur,
- cur + min(map->m_llen - fpos, end - cur),
+ cur + len,
EROFS_I(inode)->z_fragmentoff + fpos);
if (err)
break;
+ z_erofs_dio_size_add(f->dio, len);
tight = false;
} else {
if (!f->pcl) {
@@ -1094,7 +1192,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- erofs_onlinefolio_end(folio, err, false);
+ z_erofs_onlinefolio_end(folio, err, false, f->dio);
return err;
}
@@ -1113,11 +1211,6 @@ static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
return false;
}
-static bool z_erofs_page_is_invalidated(struct page *page)
-{
- return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
-}
-
struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
@@ -1152,6 +1245,16 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
page = be->decompressed_pages + (poff >> PAGE_SHIFT);
if (!*page) {
*page = bvec->page;
+ if (be->pcl->dio &&
+ erofs_is_dio_folio(page_folio(bvec->page))) {
+ unsigned int end, cur;
+
+ end = min_t(unsigned int,
+ be->pcl->length - bvec->offset,
+ bvec->end);
+ cur = bvec->offset < 0 ? -bvec->offset : 0;
+ z_erofs_dio_size_add(be->pcl->dio, end - cur);
+ }
return;
}
} else {
@@ -1197,9 +1300,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
memcpy(dst + cur, src + scur, len);
kunmap_local(src);
cur += len;
+ if (!err && be->pcl->dio &&
+ erofs_is_dio_folio(page_folio(bvi->bvec.page)))
+ z_erofs_dio_size_add(be->pcl->dio, len);
}
kunmap_local(dst);
- erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
+ z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true,
+ be->pcl->dio);
list_del(p);
kfree(bvi);
}
@@ -1251,7 +1358,8 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
if (pcl->from_meta ||
erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
- if (!PageUptodate(page))
+ if (!PageUptodate(page) &&
+ !erofs_is_dio_folio(page_folio(page)))
err = -EIO;
continue;
}
@@ -1357,7 +1465,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- erofs_onlinefolio_end(page_folio(page), err, true);
+ z_erofs_onlinefolio_end(page_folio(page), err, true,
+ pcl->dio);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
@@ -1383,8 +1492,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
/* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, NULL);
- mutex_unlock(&pcl->lock);
-
+ z_erofs_pcl_unlock(pcl, err);
if (pcl->from_meta)
z_erofs_free_pcluster(pcl);
else
@@ -1520,7 +1628,8 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* File-backed folios for inplace I/Os are all locked steady,
* therefore it is impossible for `mapping` to be NULL.
*/
- if (mapping && mapping != mc) {
+ if ((mapping && mapping != mc) ||
+ (!folio_test_private(folio) && erofs_is_dio_folio(folio))) {
if (zbv.offset < 0)
bvec->bv_offset = round_up(-zbv.offset, bs);
bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
@@ -1641,16 +1750,17 @@ static void z_erofs_endio(struct bio *bio)
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
- DBG_BUGON(folio_test_uptodate(folio));
+ DBG_BUGON(!erofs_is_dio_folio(folio) &&
+ folio_test_uptodate(folio));
DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
continue;
- if (!err)
+ if (err == BLK_STS_OK)
folio_mark_uptodate(folio);
folio_unlock(folio);
}
- if (err)
+ if (err != BLK_STS_OK)
q->eio = true;
z_erofs_decompress_kickoff(q, -1);
if (bio->bi_bdev)
@@ -1672,6 +1782,7 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f,
struct bio *bio = NULL;
unsigned long pflags;
int memstall = 0;
+ struct dio_erofs *dio = f->dio;
/* No need to read from device for pclusters in the bypass queue. */
q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
@@ -1748,6 +1859,13 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f,
else
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
+ if (dio) {
+ bio->bi_write_hint =
+ f->inode->i_write_hint;
+ bio->bi_ioprio = dio->iocb->ki_ioprio;
+ if (dio->is_pinned)
+ bio_set_flag(bio, BIO_PAGE_PINNED);
+ }
bio->bi_end_io = z_erofs_endio;
bio->bi_iter.bi_sector =
(mdev.m_dif->fsoff + cur) >> 9;
@@ -1796,7 +1914,7 @@ static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
- bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
+ bool force_fg = !!f->dio || z_erofs_is_sync_decompress(sbi, rapages);
int err;
if (f->head == Z_EROFS_PCLUSTER_TAIL)
@@ -1830,6 +1948,8 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
if (backmost) {
if (rac)
end = headoffset + readahead_length(rac) - 1;
+ else if (f->dio)
+ end = f->dio->pos - 1;
else
end = headoffset + PAGE_SIZE - 1;
map->m_la = end;
@@ -1843,7 +1963,8 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
readahead_expand(rac, headoffset, cur - headoffset);
return;
- }
+ } else if (f->dio)
+ return;
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
@@ -1930,4 +2051,149 @@ static void z_erofs_readahead(struct readahead_control *rac)
const struct address_space_operations z_erofs_aops = {
.read_folio = z_erofs_read_folio,
.readahead = z_erofs_readahead,
+ .direct_IO = noop_direct_IO,
+};
+
+static ssize_t z_erofs_dio_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, iocb->ki_pos);
+ ssize_t err, off0;
+ loff_t offset = iocb->ki_pos;
+ unsigned int i = 0, total_pages, nr_pages = 0;
+ struct folio *head = NULL, *folio;
+ struct dio_erofs dio;
+ struct page **pages;
+ loff_t i_size;
+ struct iov_iter iter_saved = *iter;
+ int tmp_cnt = 0;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ i_size = i_size_read(inode);
+ if (offset >= i_size)
+ return 0;
+
+ memset(&dio, 0, offsetof(struct dio_erofs, pages));
+ atomic_set(&dio.ref, 1);
+ dio.should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
+ dio.iocb = iocb;
+ dio.pos = ALIGN(min(iocb->ki_pos + (loff_t)iov_iter_count(iter),
+ i_size), PAGE_SIZE);
+ dio.is_pinned = iov_iter_extract_will_pin(iter);
+ dio.waiter = current;
+ f.dio = &dio;
+ iter_saved = *iter;
+ inode_dio_begin(inode);
+ pages = dio.pages;
+ total_pages = DIV_ROUND_UP(dio.pos - iocb->ki_pos, PAGE_SIZE);
+ for (; total_pages > 0; total_pages -= nr_pages) {
+ err = iov_iter_extract_pages(iter, &pages, LONG_MAX,
+ min(ARRAY_SIZE(dio.pages), total_pages), 0,
+ &off0);
+ if (err <= 0) {
+ err = -EFAULT;
+ goto fail_dio;
+ }
+ DBG_BUGON(off0);
+ iov_iter_revert(iter, err & ~PAGE_MASK);
+ nr_pages = DIV_ROUND_UP(err, PAGE_SIZE);
+ tmp_cnt += nr_pages;
+ for (i = 0; i < nr_pages; i++) {
+ folio = page_folio(pages[i]);
+ if (folio_test_large(folio) ||
+ folio_test_private(folio)) {
+ err = -EFAULT;
+ goto fail_dio;
+ }
+ folio->private = head;
+ head = folio;
+ }
+ }
+
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ while (head) {
+ folio = head;
+ head = folio_get_private(folio);
+ dio.pos -= folio_size(folio);
+ err = z_erofs_scan_folio(&f, folio, false);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ z_erofs_pcluster_end(&f);
+
+ err = z_erofs_runqueue(&f, 0);
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&f.pagepool);
+
+ if (!atomic_dec_and_test(&dio.ref)) {
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio.waiter))
+ break;
+
+ blk_io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ err = err ?: dio.eio;
+ if (likely(!err)) {
+ err = dio.size;
+ if (offset + dio.size > i_size) /* check for short read */
+ err = i_size - offset;
+ iocb->ki_pos += err;
+ }
+ inode_dio_end(inode);
+ return err;
+
+fail_dio:
+ if (dio.is_pinned) {
+ while (head) {
+ folio = head;
+ head = folio_get_private(folio);
+ unpin_user_page(folio_page(folio, 0));
+ }
+ for (; i < nr_pages; i++)
+ unpin_user_page(dio.pages[i]);
+ }
+ *iter = iter_saved;
+ return err;
+}
+
+static bool erofs_should_use_dio(struct inode *inode, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+
+ if (!(iocb->ki_flags & IOCB_DIRECT))
+ return false;
+
+ if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter),
+ i_blocksize(inode)))
+ return false;
+
+ return true;
+}
+
+static ssize_t z_erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t ret;
+
+ if (erofs_should_use_dio(file_inode(iocb->ki_filp), iocb, iter)) {
+ ret = z_erofs_dio_read_iter(iocb, iter);
+ if (ret != -EFAULT)
+ return ret;
+ }
+
+ /* fallback to buffered I/O */
+ return filemap_read(iocb, iter, 0);
+}
+
+const struct file_operations z_erofs_file_fops = {
+ .llseek = generic_file_llseek,
+ .read_iter = z_erofs_file_read_iter,
+ .mmap = generic_file_readonly_mmap,
+ .splice_read = filemap_splice_read,
};
--
2.34.1
Hi Chunhai,

kernel test robot noticed the following build errors:

[auto build test ERROR on xiang-erofs/dev-test]
[also build test ERROR on xiang-erofs/dev xiang-erofs/fixes linus/master v6.17-rc7 next-20250922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Chunhai-Guo/erofs-add-direct-I-O-support-for-compressed-data/20250922-204843
base:   https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link:    https://lore.kernel.org/r/20250922124304.489419-1-guochunhai%40vivo.com
patch subject: [PATCH] erofs: add direct I/O support for compressed data
config: i386-buildonly-randconfig-003-20250923 (https://download.01.org/0day-ci/archive/20250923/202509231034.jXPbkvNB-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250923/202509231034.jXPbkvNB-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509231034.jXPbkvNB-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "z_erofs_file_fops" [fs/erofs/erofs.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Chunhai,

kernel test robot noticed the following build warnings:

[auto build test WARNING on xiang-erofs/dev-test]
[also build test WARNING on xiang-erofs/dev xiang-erofs/fixes linus/master v6.17-rc7 next-20250922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Chunhai-Guo/erofs-add-direct-I-O-support-for-compressed-data/20250922-204843
base:   https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link:    https://lore.kernel.org/r/20250922124304.489419-1-guochunhai%40vivo.com
patch subject: [PATCH] erofs: add direct I/O support for compressed data
config: loongarch-randconfig-r072-20250923 (https://download.01.org/0day-ci/archive/20250923/202509231206.6HNck2h0-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project cafc064fc7a96b3979a023ddae1da2b499d6c954)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250923/202509231206.6HNck2h0-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509231206.6HNck2h0-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> fs/erofs/zdata.c:2069:6: warning: variable 'tmp_cnt' set but not used [-Wunused-but-set-variable]
    2069 |         int tmp_cnt = 0;
         |             ^
   1 warning generated.

vim +/tmp_cnt +2069 fs/erofs/zdata.c

  2056	
  2057	static ssize_t z_erofs_dio_read_iter(struct kiocb *iocb, struct iov_iter *iter)
  2058	{
  2059		struct inode *inode = file_inode(iocb->ki_filp);
  2060		Z_EROFS_DEFINE_FRONTEND(f, inode, iocb->ki_pos);
  2061		ssize_t err, off0;
  2062		loff_t offset = iocb->ki_pos;
  2063		unsigned int i = 0, total_pages, nr_pages = 0;
  2064		struct folio *head = NULL, *folio;
  2065		struct dio_erofs dio;
  2066		struct page **pages;
  2067		loff_t i_size;
  2068		struct iov_iter iter_saved = *iter;
> 2069		int tmp_cnt = 0;
  2070	
[...]

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Chunhai,

On 2025/9/22 20:43, Chunhai Guo wrote:
> Direct I/O is particularly useful in memory-sensitive scenarios, as it
> provides more predictable performance by avoiding unnecessary page cache
> overhead. For example, when accessing large files such as AI model files
> that are typically read only once, buffered I/O introduces redundant page
> cache usage and extra page copies, leading to unstable performance and
> increased CPU load due to memory reclaim. Direct I/O avoids these costs.
>
> The table below shows that direct I/O performs up to 54.6% better than
> buffered I/O in the low-memory scenario. The results were obtained using
> the fio benchmark with 8 threads, each thread reading a 2.5GB file, on
> ARM64 Android devices running the 6.6 kernel with an 8-core CPU and 12GB
> of memory.
>
> +--------------------------------------------------------------------------+
> | fio benchmark       | buffered I/O (MiB/s) | direct I/O (MiB/s) | diff   |
> |---------------------+----------------------+--------------------+--------|
> | normal scenario     | 2629.8               | 3648.7             | +38.7% |
> |---------------------+----------------------+--------------------+--------|
> | low memory scenario | 2350.0               | 3633.9             | +54.6% |
> +--------------------------------------------------------------------------+

Thanks for your patch!

Yes, avoiding the page cache by using direct I/O for read-once data
(e.g. distributing huge LLM models) actually makes sense on my side,
and your test result is impressive.

I will look into your implementation later, since it's too late for
v6.18. Let's address this feature for the v6.19 cycle.

Thanks,
Gao Xiang

>
> This patch does not support the following two cases, which fall back to
> buffered I/O:
> (1) large folios, which will be supported in a follow-up patch.
> (2) folios with private data attached, as the private data is required by
> this direct I/O implementation.
>
> Signed-off-by: Chunhai Guo <guochunhai@vivo.com>