[PATCH v2 06/11] ntfsplus: add iomap and address space operations

Namjae Jeon posted 11 patches 2 months, 1 week ago
Add the iomap callbacks and address space operations for ntfsplus.

Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
---
 fs/ntfsplus/aops.c       | 617 ++++++++++++++++++++++++++++++++++
 fs/ntfsplus/ntfs_iomap.c | 700 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 1317 insertions(+)
 create mode 100644 fs/ntfsplus/aops.c
 create mode 100644 fs/ntfsplus/ntfs_iomap.c

diff --git a/fs/ntfsplus/aops.c b/fs/ntfsplus/aops.c
new file mode 100644
index 000000000000..9a1b3b80a146
--- /dev/null
+++ b/fs/ntfsplus/aops.c
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * NTFS kernel address space operations and page cache handling.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ * Copyright (c) 2025 LG Electronics Co., Ltd.
+ */
+
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "ntfs.h"
+#include "misc.h"
+#include "ntfs_iomap.h"
+
+static s64 ntfs_convert_page_index_into_lcn(struct ntfs_volume *vol, struct ntfs_inode *ni,
+		unsigned long page_index)
+{
+	sector_t iblock;
+	s64 vcn;
+	s64 lcn;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
+
+	iblock = (s64)page_index << (PAGE_SHIFT - blocksize_bits);
+	vcn = (s64)iblock << blocksize_bits >> vol->cluster_size_bits;
+
+	down_read(&ni->runlist.lock);
+	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, false);
+	up_read(&ni->runlist.lock);
+
+	return lcn;
+}
+
+struct bio *ntfs_setup_bio(struct ntfs_volume *vol, blk_opf_t opf, s64 lcn,
+		unsigned int pg_ofs)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(vol->sb->s_bdev, 1, opf, GFP_NOIO);
+	if (!bio)
+		return NULL;
+	bio->bi_iter.bi_sector = ((lcn << vol->cluster_size_bits) + pg_ofs) >>
+		vol->sb->s_blocksize_bits;
+
+	return bio;
+}
+
+/**
+ * ntfs_read_folio - fill a @folio of a @file with data from the device
+ * @file:	open file to which the folio @folio belongs or NULL
+ * @folio:	page cache folio to fill with data
+ *
+ * For non-resident attributes, ntfs_read_folio() fills the @folio of the
+ * open file @file by calling iomap_read_folio() with the ntfs read iomap
+ * operations, which in turn map and read in the blocks backing the folio
+ * asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
+ * data from the mft record (which at this stage is most likely in memory) and
+ * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
+ * even if the mft record is not cached at this point in time, we need to wait
+ * for it to be read in before we can do the copy.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_read_folio(struct file *file, struct folio *folio)
+{
+	loff_t i_size;
+	struct inode *vi;
+	struct ntfs_inode *ni;
+
+	vi = folio->mapping->host;
+	i_size = i_size_read(vi);
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
+			PAGE_SHIFT)) {
+		folio_zero_segment(folio, 0, PAGE_SIZE);
+		ntfs_debug("Read outside i_size - truncated?");
+		folio_mark_uptodate(folio);
+		folio_unlock(folio);
+		return 0;
+	}
+	/*
+	 * This can potentially happen because we clear the folio uptodate
+	 * flag in ntfs_write_mft_block() for MstProtected() attributes.
+	 */
+	if (folio_test_uptodate(folio)) {
+		folio_unlock(folio);
+		return 0;
+	}
+	ni = NTFS_I(vi);
+
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If attribute is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			folio_unlock(folio);
+			return -EACCES;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni))
+			return ntfs_read_compressed_block(folio);
+	}
+
+	return iomap_read_folio(folio, &ntfs_read_iomap_ops);
+}
+
+static int ntfs_write_mft_block(struct ntfs_inode *ni, struct folio *folio,
+		struct writeback_control *wbc)
+{
+	struct inode *vi = VFS_I(ni);
+	struct ntfs_volume *vol = ni->vol;
+	u8 *kaddr;
+	struct ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
+	int nr_locked_nis = 0, err = 0, mft_ofs, prev_mft_ofs;
+	struct bio *bio = NULL;
+	unsigned long mft_no;
+	struct ntfs_inode *tni;
+	s64 lcn;
+	s64 vcn = (s64)folio->index << PAGE_SHIFT >> vol->cluster_size_bits;
+	s64 end_vcn = ni->allocated_size >> vol->cluster_size_bits;
+	unsigned int folio_sz;
+	struct runlist_element *rl;
+
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, folio index 0x%lx.",
+			vi->i_ino, ni->type, folio->index);
+
+	lcn = ntfs_convert_page_index_into_lcn(vol, ni, folio->index);
+	if (lcn <= LCN_HOLE) {
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		folio_end_writeback(folio);
+		return -EIO;
+	}
+
+	/* Map folio so we can access its contents. */
+	kaddr = kmap_local_folio(folio, 0);
+	/* Clear the page uptodate flag whilst the mst fixups are applied. */
+	folio_clear_uptodate(folio);
+
+	for (mft_ofs = 0; mft_ofs < PAGE_SIZE && vcn < end_vcn;
+	     mft_ofs += vol->mft_record_size) {
+		/* Get the mft record number. */
+		mft_no = (((s64)folio->index << PAGE_SHIFT) + mft_ofs) >>
+			vol->mft_record_size_bits;
+		vcn = mft_no << vol->mft_record_size_bits >> vol->cluster_size_bits;
+		/* Check whether to write this mft record. */
+		tni = NULL;
+		if (ntfs_may_write_mft_record(vol, mft_no,
+					(struct mft_record *)(kaddr + mft_ofs), &tni)) {
+			unsigned int mft_record_off = 0;
+			s64 vcn_off = vcn;
+
+			/*
+			 * The record should be written.  If a locked ntfs
+			 * inode was returned, add it to the array of locked
+			 * ntfs inodes.
+			 */
+			if (tni)
+				locked_nis[nr_locked_nis++] = tni;
+
+			if (bio && (mft_ofs != prev_mft_ofs + vol->mft_record_size)) {
+flush_bio:
+				flush_dcache_folio(folio);
+				submit_bio_wait(bio);
+				bio_put(bio);
+				bio = NULL;
+			}
+
+			if (vol->cluster_size < folio_size(folio)) {
+				down_write(&ni->runlist.lock);
+				rl = ntfs_attr_vcn_to_rl(ni, vcn_off, &lcn);
+				up_write(&ni->runlist.lock);
+				if (IS_ERR(rl) || lcn < 0) {
+					err = -EIO;
+					goto unm_done;
+				}
+
+				if (bio &&
+				   (bio_end_sector(bio) >> (vol->cluster_size_bits - 9)) !=
+				    lcn) {
+					flush_dcache_folio(folio);
+					submit_bio_wait(bio);
+					bio_put(bio);
+					bio = NULL;
+				}
+			}
+
+			if (!bio) {
+				unsigned int off;
+
+				off = ((mft_no << vol->mft_record_size_bits) +
+				       mft_record_off) & vol->cluster_size_mask;
+
+				bio = ntfs_setup_bio(vol, REQ_OP_WRITE, lcn, off);
+				if (!bio) {
+					err = -ENOMEM;
+					goto unm_done;
+				}
+			}
+
+			if (vol->cluster_size == NTFS_BLOCK_SIZE &&
+			    (mft_record_off ||
+			     rl->length - (vcn_off - rl->vcn) == 1 ||
+			     mft_ofs + NTFS_BLOCK_SIZE >= PAGE_SIZE))
+				folio_sz = NTFS_BLOCK_SIZE;
+			else
+				folio_sz = vol->mft_record_size;
+			if (!bio_add_folio(bio, folio, folio_sz,
+					   mft_ofs + mft_record_off)) {
+				err = -EIO;
+				bio_put(bio);
+				goto unm_done;
+			}
+			mft_record_off += folio_sz;
+
+			if (mft_record_off != vol->mft_record_size) {
+				vcn_off++;
+				goto flush_bio;
+			}
+			prev_mft_ofs = mft_ofs;
+
+			if (mft_no < vol->mftmirr_size)
+				ntfs_sync_mft_mirror(vol, mft_no,
+						(struct mft_record *)(kaddr + mft_ofs));
+		}
+
+	}
+
+	if (bio) {
+		flush_dcache_folio(folio);
+		submit_bio_wait(bio);
+		bio_put(bio);
+	}
+	flush_dcache_folio(folio);
+unm_done:
+	folio_mark_uptodate(folio);
+	kunmap_local(kaddr);
+
+	folio_start_writeback(folio);
+	folio_unlock(folio);
+	folio_end_writeback(folio);
+
+	/* Unlock any locked inodes. */
+	while (nr_locked_nis-- > 0) {
+		struct ntfs_inode *base_tni;
+
+		tni = locked_nis[nr_locked_nis];
+		mutex_unlock(&tni->mrec_lock);
+
+		/* Get the base inode. */
+		mutex_lock(&tni->extent_lock);
+		if (tni->nr_extents >= 0)
+			base_tni = tni;
+		else
+			base_tni = tni->ext.base_ntfs_ino;
+		mutex_unlock(&tni->extent_lock);
+		ntfs_debug("Unlocking %s inode 0x%lx.",
+				tni == base_tni ? "base" : "extent",
+				tni->mft_no);
+		atomic_dec(&tni->count);
+		iput(VFS_I(base_tni));
+	}
+
+	if (unlikely(err && err != -ENOMEM))
+		NVolSetErrors(vol);
+	if (likely(!err))
+		ntfs_debug("Done.");
+	return err;
+}
+
+/**
+ * ntfs_bmap - map logical file block to physical device block
+ * @mapping:	address space mapping to which the block to be mapped belongs
+ * @block:	logical block to map to its physical device block
+ *
+ * For regular, non-resident files (i.e. not compressed and not encrypted), map
+ * the logical @block belonging to the file described by the address space
+ * mapping @mapping to its physical device block.
+ *
+ * The size of the block is equal to the @s_blocksize field of the super block
+ * of the mounted file system which is guaranteed to be smaller than or equal
+ * to the cluster size thus the block is guaranteed to fit entirely inside the
+ * cluster which means we do not need to care how many contiguous bytes are
+ * available after the beginning of the block.
+ *
+ * Return the physical device block if the mapping succeeded or 0 if the block
+ * is sparse or there was an error.
+ *
+ * Note: This is a problem if someone tries to run bmap() on $Boot system file
+ * as that really is in block zero but there is nothing we can do.  bmap() is
+ * just broken in that respect (just like it cannot distinguish sparse from
+ * not available or error).
+ */
+static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
+{
+	s64 ofs, size;
+	loff_t i_size;
+	s64 lcn;
+	unsigned long blocksize, flags;
+	struct ntfs_inode *ni = NTFS_I(mapping->host);
+	struct ntfs_volume *vol = ni->vol;
+	unsigned int delta;
+	unsigned char blocksize_bits, cluster_size_shift;
+
+	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
+			ni->mft_no, (unsigned long long)block);
+	if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
+		ntfs_error(vol->sb, "BMAP does not make sense for %s attributes, returning 0.",
+				(ni->type != AT_DATA) ? "non-data" :
+				(!NInoNonResident(ni) ? "resident" :
+				"encrypted"));
+		return 0;
+	}
+	/* None of these can happen. */
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+	ofs = (s64)block << blocksize_bits;
+	read_lock_irqsave(&ni->size_lock, flags);
+	size = ni->initialized_size;
+	i_size = i_size_read(VFS_I(ni));
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * If the offset is outside the initialized size or the block straddles
+	 * the initialized size then pretend it is a hole unless the
+	 * initialized size equals the file size.
+	 */
+	if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
+		goto hole;
+	cluster_size_shift = vol->cluster_size_bits;
+	down_read(&ni->runlist.lock);
+	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
+	up_read(&ni->runlist.lock);
+	if (unlikely(lcn < LCN_HOLE)) {
+		/*
+		 * Step down to an integer to avoid gcc doing a long long
+	 * comparison in the switch when we know @lcn is between
+		 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
+		 *
+		 * Otherwise older gcc (at least on some architectures) will
+		 * try to use __cmpdi2() which is of course not available in
+		 * the kernel.
+		 */
+		switch ((int)lcn) {
+		case LCN_ENOENT:
+			/*
+			 * If the offset is out of bounds then pretend it is a
+			 * hole.
+			 */
+			goto hole;
+		case LCN_ENOMEM:
+			ntfs_error(vol->sb,
+				"Not enough memory to complete mapping for inode 0x%lx. Returning 0.",
+				ni->mft_no);
+			break;
+		default:
+			ntfs_error(vol->sb,
+				"Failed to complete mapping for inode 0x%lx.  Run chkdsk. Returning 0.",
+				ni->mft_no);
+			break;
+		}
+		return 0;
+	}
+	if (lcn < 0) {
+		/* It is a hole. */
+hole:
+		ntfs_debug("Done (returning hole).");
+		return 0;
+	}
+	/*
+	 * The block is really allocated and fulfils all our criteria.
+	 * Convert the cluster to units of block size and return the result.
+	 */
+	delta = ofs & vol->cluster_size_mask;
+	if (unlikely(sizeof(block) < sizeof(lcn))) {
+		block = lcn = ((lcn << cluster_size_shift) + delta) >>
+				blocksize_bits;
+		/* If the block number was truncated return 0. */
+		if (unlikely(block != lcn)) {
+			ntfs_error(vol->sb,
+				"Physical block 0x%llx is too large to be returned, returning 0.",
+				(long long)lcn);
+			return 0;
+		}
+	} else
+		block = ((lcn << cluster_size_shift) + delta) >>
+				blocksize_bits;
+	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
+	return block;
+}
+
+static void ntfs_readahead(struct readahead_control *rac)
+{
+	struct address_space *mapping = rac->mapping;
+	struct inode *inode = mapping->host;
+	struct ntfs_inode *ni = NTFS_I(inode);
+
+	if (!NInoNonResident(ni) || NInoCompressed(ni)) {
+		/* No readahead for resident and compressed. */
+		return;
+	}
+
+	if (NInoMstProtected(ni) &&
+	    (ni->mft_no == FILE_MFT || ni->mft_no == FILE_MFTMirr))
+		return;
+
+	iomap_readahead(rac, &ntfs_read_iomap_ops);
+}
+
+static int ntfs_mft_writepage(struct folio *folio, struct writeback_control *wbc)
+{
+	struct address_space *mapping = folio->mapping;
+	struct inode *vi = mapping->host;
+	struct ntfs_inode *ni = NTFS_I(vi);
+	loff_t i_size;
+	int ret;
+
+	i_size = i_size_read(vi);
+
+	/* We have to zero every time due to mmap-at-end-of-file. */
+	if (folio->index >= (i_size >> PAGE_SHIFT)) {
+		/* The page straddles i_size. */
+		unsigned int ofs = i_size & ~PAGE_MASK;
+
+		folio_zero_segment(folio, ofs, PAGE_SIZE);
+	}
+
+	ret = ntfs_write_mft_block(ni, folio, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
+static int ntfs_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct ntfs_inode *ni = NTFS_I(inode);
+	struct iomap_writepage_ctx wpc = {
+		.inode		= mapping->host,
+		.wbc		= wbc,
+		.ops		= &ntfs_writeback_ops,
+	};
+
+	if (NVolShutdown(ni->vol))
+		return -EIO;
+
+	if (!NInoNonResident(ni))
+		return 0;
+
+	if (NInoMstProtected(ni) && ni->mft_no == FILE_MFT) {
+		struct folio *folio = NULL;
+		int error;
+
+		while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+			error = ntfs_mft_writepage(folio, wbc);
+		return error;
+	}
+
+	/* If file is encrypted, deny access, just like NT4. */
+	if (NInoEncrypted(ni)) {
+		ntfs_debug("Denying write access to encrypted file.");
+		return -EACCES;
+	}
+
+	return iomap_writepages(&wpc);
+}
+
+static int ntfs_swap_activate(struct swap_info_struct *sis,
+		struct file *swap_file, sector_t *span)
+{
+	return iomap_swapfile_activate(sis, swap_file, span,
+			&ntfs_read_iomap_ops);
+}
+
+/**
+ * ntfs_normal_aops - address space operations for normal inodes and attributes
+ *
+ * Note these are not used for compressed or mst protected inodes and
+ * attributes.
+ */
+const struct address_space_operations ntfs_normal_aops = {
+	.read_folio		= ntfs_read_folio,
+	.readahead		= ntfs_readahead,
+	.writepages		= ntfs_writepages,
+	.direct_IO		= noop_direct_IO,
+	.dirty_folio		= iomap_dirty_folio,
+	.bmap			= ntfs_bmap,
+	.migrate_folio		= filemap_migrate_folio,
+	.is_partially_uptodate	= iomap_is_partially_uptodate,
+	.error_remove_folio	= generic_error_remove_folio,
+	.release_folio		= iomap_release_folio,
+	.invalidate_folio	= iomap_invalidate_folio,
+	.swap_activate          = ntfs_swap_activate,
+};
+
+/**
+ * ntfs_compressed_aops - address space operations for compressed inodes
+ */
+const struct address_space_operations ntfs_compressed_aops = {
+	.read_folio		= ntfs_read_folio,
+	.direct_IO		= noop_direct_IO,
+	.writepages		= ntfs_writepages,
+	.dirty_folio		= iomap_dirty_folio,
+	.migrate_folio		= filemap_migrate_folio,
+	.is_partially_uptodate	= iomap_is_partially_uptodate,
+	.error_remove_folio	= generic_error_remove_folio,
+	.release_folio		= iomap_release_folio,
+	.invalidate_folio	= iomap_invalidate_folio,
+};
+
+/**
+ * ntfs_mst_aops - general address space operations for mst protected inodes
+ *		   and attributes
+ */
+const struct address_space_operations ntfs_mst_aops = {
+	.read_folio		= ntfs_read_folio,	/* Fill page with data. */
+	.readahead		= ntfs_readahead,
+	.writepages		= ntfs_writepages,	/* Write dirty page to disk. */
+	.dirty_folio		= iomap_dirty_folio,
+	.migrate_folio		= filemap_migrate_folio,
+	.is_partially_uptodate	= iomap_is_partially_uptodate,
+	.error_remove_folio	= generic_error_remove_folio,
+	.release_folio		= iomap_release_folio,
+	.invalidate_folio	= iomap_invalidate_folio,
+};
+
+void mark_ntfs_record_dirty(struct folio *folio)
+{
+	iomap_dirty_folio(folio->mapping, folio);
+}
+
+int ntfs_dev_read(struct super_block *sb, void *buf, loff_t start, loff_t size)
+{
+	pgoff_t idx, idx_end;
+	loff_t offset, end = start + size;
+	u32 from, to, buf_off = 0;
+	struct folio *folio;
+	char *kaddr;
+
+	idx = start >> PAGE_SHIFT;
+	idx_end = end >> PAGE_SHIFT;
+	from = start & ~PAGE_MASK;
+
+	if (idx == idx_end)
+		idx_end++;
+
+	for (; idx < idx_end; idx++, from = 0) {
+		folio = ntfs_read_mapping_folio(sb->s_bdev->bd_mapping, idx);
+		if (IS_ERR(folio)) {
+			ntfs_error(sb, "Unable to read %ld page", idx);
+			return PTR_ERR(folio);
+		}
+
+		kaddr = kmap_local_folio(folio, 0);
+		offset = (loff_t)idx << PAGE_SHIFT;
+		to = min_t(u32, end - offset, PAGE_SIZE);
+
+		memcpy(buf + buf_off, kaddr + from, to);
+		buf_off += to;
+		kunmap_local(kaddr);
+		folio_put(folio);
+	}
+
+	return 0;
+}
+
+int ntfs_dev_write(struct super_block *sb, void *buf, loff_t start,
+			loff_t size, bool wait)
+{
+	pgoff_t idx, idx_end;
+	loff_t offset, end = start + size;
+	u32 from, to, buf_off = 0;
+	struct folio *folio;
+	char *kaddr;
+
+	idx = start >> PAGE_SHIFT;
+	idx_end = end >> PAGE_SHIFT;
+	from = start & ~PAGE_MASK;
+
+	if (idx == idx_end)
+		idx_end++;
+
+	for (; idx < idx_end; idx++, from = 0) {
+		folio = ntfs_read_mapping_folio(sb->s_bdev->bd_mapping, idx);
+		if (IS_ERR(folio)) {
+			ntfs_error(sb, "Unable to read %ld page", idx);
+			return PTR_ERR(folio);
+		}
+
+		kaddr = kmap_local_folio(folio, 0);
+		offset = (loff_t)idx << PAGE_SHIFT;
+		to = min_t(u32, end - offset, PAGE_SIZE);
+
+		memcpy(kaddr + from, buf + buf_off, to);
+		buf_off += to;
+		kunmap_local(kaddr);
+		folio_mark_uptodate(folio);
+		folio_mark_dirty(folio);
+		if (wait)
+			folio_wait_stable(folio);
+		folio_put(folio);
+	}
+
+	return 0;
+}
diff --git a/fs/ntfsplus/ntfs_iomap.c b/fs/ntfsplus/ntfs_iomap.c
new file mode 100644
index 000000000000..c9fd999820f4
--- /dev/null
+++ b/fs/ntfsplus/ntfs_iomap.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * iomap callback functions
+ *
+ * Copyright (c) 2025 LG Electronics Co., Ltd.
+ */
+
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "ntfs.h"
+#include "misc.h"
+#include "ntfs_iomap.h"
+
+static void ntfs_iomap_put_folio(struct inode *inode, loff_t pos,
+		unsigned int len, struct folio *folio)
+{
+	struct ntfs_inode *ni = NTFS_I(inode);
+	unsigned long sector_size = 1UL << inode->i_blkbits;
+	loff_t start_down, end_up, init;
+
+	if (!NInoNonResident(ni))
+		goto out;
+
+	start_down = round_down(pos, sector_size);
+	end_up = (pos + len - 1) | (sector_size - 1);
+	init = ni->initialized_size;
+
+	if (init >= start_down && init <= end_up) {
+		if (init < pos) {
+			loff_t offset = offset_in_folio(folio, pos + len);
+
+			if (offset == 0)
+				offset = folio_size(folio);
+			folio_zero_segments(folio,
+					    offset_in_folio(folio, init),
+					    offset_in_folio(folio, pos),
+					    offset,
+					    folio_size(folio));
+
+		} else  {
+			loff_t offset = max_t(loff_t, pos + len, init);
+
+			offset = offset_in_folio(folio, offset);
+			if (offset == 0)
+				offset = folio_size(folio);
+			folio_zero_segment(folio,
+					   offset,
+					   folio_size(folio));
+		}
+	} else if (init <= pos) {
+		loff_t offset = 0, offset2 = offset_in_folio(folio, pos + len);
+
+		if ((init >> folio_shift(folio)) == (pos >> folio_shift(folio)))
+			offset = offset_in_folio(folio, init);
+		if (offset2 == 0)
+			offset2 = folio_size(folio);
+		folio_zero_segments(folio,
+				    offset,
+				    offset_in_folio(folio, pos),
+				    offset2,
+				    folio_size(folio));
+	}
+
+out:
+	folio_unlock(folio);
+	folio_put(folio);
+}
+
+const struct iomap_write_ops ntfs_iomap_folio_ops = {
+	.put_folio = ntfs_iomap_put_folio,
+};
+
+static int ntfs_read_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	struct ntfs_inode *base_ni, *ni = NTFS_I(inode);
+	struct ntfs_attr_search_ctx *ctx;
+	loff_t i_size;
+	u32 attr_len;
+	int err = 0;
+	char *kattr;
+	struct page *ipage;
+
+	if (NInoNonResident(ni)) {
+		s64 vcn;
+		s64 lcn;
+		struct runlist_element *rl;
+		struct ntfs_volume *vol = ni->vol;
+		loff_t vcn_ofs;
+		loff_t rl_length;
+
+		vcn = offset >> vol->cluster_size_bits;
+		vcn_ofs = offset & vol->cluster_size_mask;
+
+		down_write(&ni->runlist.lock);
+		rl = ntfs_attr_vcn_to_rl(ni, vcn, &lcn);
+		if (IS_ERR(rl)) {
+			up_write(&ni->runlist.lock);
+			return PTR_ERR(rl);
+		}
+
+		if (flags & IOMAP_REPORT) {
+			if (lcn < LCN_HOLE) {
+				up_write(&ni->runlist.lock);
+				return -ENOENT;
+			}
+		} else if (lcn < LCN_ENOENT) {
+			up_write(&ni->runlist.lock);
+			return -EINVAL;
+		}
+
+		iomap->bdev = inode->i_sb->s_bdev;
+		iomap->offset = offset;
+
+		if (lcn <= LCN_DELALLOC) {
+			if (lcn == LCN_DELALLOC)
+				iomap->type = IOMAP_DELALLOC;
+			else
+				iomap->type = IOMAP_HOLE;
+			iomap->addr = IOMAP_NULL_ADDR;
+		} else {
+			if (!(flags & IOMAP_ZERO) && offset >= ni->initialized_size)
+				iomap->type = IOMAP_UNWRITTEN;
+			else
+				iomap->type = IOMAP_MAPPED;
+			iomap->addr = (lcn << vol->cluster_size_bits) + vcn_ofs;
+		}
+
+		rl_length = (rl->length - (vcn - rl->vcn)) << ni->vol->cluster_size_bits;
+
+		if (rl_length == 0 && rl->lcn > LCN_DELALLOC) {
+			ntfs_error(inode->i_sb,
+				   "runlist(vcn : %lld, length : %lld, lcn : %lld) is corrupted\n",
+				   rl->vcn, rl->length, rl->lcn);
+			up_write(&ni->runlist.lock);
+			return -EIO;
+		}
+
+		if (rl_length && length > rl_length - vcn_ofs)
+			iomap->length = rl_length - vcn_ofs;
+		else
+			iomap->length = length;
+		up_write(&ni->runlist.lock);
+
+		if (!(flags & IOMAP_ZERO) &&
+		    iomap->type == IOMAP_MAPPED &&
+		    iomap->offset < ni->initialized_size &&
+		    iomap->offset + iomap->length > ni->initialized_size) {
+			iomap->length = round_up(ni->initialized_size, 1 << inode->i_blkbits) -
+				iomap->offset;
+		}
+		iomap->flags |= IOMAP_F_MERGED;
+		return 0;
+	}
+
+	if (NInoAttr(ni))
+		base_ni = ni->ext.base_ntfs_ino;
+	else
+		base_ni = ni;
+
+	ctx = ntfs_attr_get_search_ctx(base_ni, NULL);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err))
+		goto out;
+
+	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+	if (unlikely(attr_len > ni->initialized_size))
+		attr_len = ni->initialized_size;
+	i_size = i_size_read(inode);
+
+	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate. */
+		attr_len = i_size;
+	}
+
+	if (offset >= attr_len) {
+		if (flags & IOMAP_REPORT)
+			err = -ENOENT;
+		else
+			err = -EFAULT;
+		goto out;
+	}
+
+	kattr = (u8 *)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset);
+
+	ipage = alloc_page(__GFP_NOWARN | __GFP_IO | __GFP_ZERO);
+	if (!ipage) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(page_address(ipage), kattr, attr_len);
+	iomap->type = IOMAP_INLINE;
+	iomap->inline_data = page_address(ipage);
+	iomap->offset = 0;
+	iomap->length = min_t(loff_t, attr_len, PAGE_SIZE);
+	iomap->private = ipage;
+
+out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	return err;
+}
+
+static int ntfs_read_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+		ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_INLINE) {
+		struct page *ipage = iomap->private;
+
+		put_page(ipage);
+	}
+	return written;
+}
+
+const struct iomap_ops ntfs_read_iomap_ops = {
+	.iomap_begin = ntfs_read_iomap_begin,
+	.iomap_end = ntfs_read_iomap_end,
+};
+
+static int ntfs_buffered_zeroed_clusters(struct inode *vi, s64 vcn)
+{
+	struct ntfs_inode *ni = NTFS_I(vi);
+	struct ntfs_volume *vol = ni->vol;
+	struct address_space *mapping = vi->i_mapping;
+	struct folio *folio;
+	pgoff_t idx, idx_end;
+	u32 from, to;
+
+	idx = (vcn << vol->cluster_size_bits) >> PAGE_SHIFT;
+	idx_end = ((vcn + 1) << vol->cluster_size_bits) >> PAGE_SHIFT;
+	from = (vcn << vol->cluster_size_bits) & ~PAGE_MASK;
+	if (idx == idx_end)
+		idx_end++;
+
+	to = min_t(u32, vol->cluster_size, PAGE_SIZE);
+	for (; idx < idx_end; idx++, from = 0) {
+		if (to != PAGE_SIZE) {
+			folio = ntfs_read_mapping_folio(mapping, idx);
+			if (IS_ERR(folio))
+				return PTR_ERR(folio);
+			folio_lock(folio);
+		} else {
+			folio = __filemap_get_folio(mapping, idx,
+					FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping));
+			if (IS_ERR(folio))
+				return PTR_ERR(folio);
+		}
+
+		if (folio_test_uptodate(folio) ||
+		    iomap_is_partially_uptodate(folio, from, to))
+			goto next_folio;
+
+		folio_zero_segment(folio, from, from + to);
+		folio_mark_uptodate(folio);
+
+next_folio:
+		iomap_dirty_folio(mapping, folio);
+		folio_unlock(folio);
+		folio_put(folio);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+int ntfs_zeroed_clusters(struct inode *vi, s64 lcn, s64 num)
+{
+	struct ntfs_inode *ni = NTFS_I(vi);
+	struct ntfs_volume *vol = ni->vol;
+	u32 to;
+	struct bio *bio = NULL;
+	s64 err = 0, zero_len = num << vol->cluster_size_bits;
+	s64 loc = lcn << vol->cluster_size_bits, curr = 0;
+
+	while (zero_len > 0) {
+setup_bio:
+		if (!bio) {
+			bio = bio_alloc(vol->sb->s_bdev,
+					bio_max_segs(DIV_ROUND_UP(zero_len, PAGE_SIZE)),
+					REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_NOIO);
+			if (!bio)
+				return -ENOMEM;
+			bio->bi_iter.bi_sector = (loc + curr) >> vol->sb->s_blocksize_bits;
+		}
+
+		to = min_t(u32, zero_len, PAGE_SIZE);
+		if (!bio_add_page(bio, ZERO_PAGE(0), to, 0)) {
+			err = submit_bio_wait(bio);
+			bio_put(bio);
+			bio = NULL;
+			if (err)
+				break;
+			goto setup_bio;
+		}
+		zero_len -= to;
+		curr += to;
+	}
+
+	if (bio) {
+		err = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+
+	return err;
+}
+
+static int __ntfs_write_iomap_begin(struct inode *inode, loff_t offset,
+				    loff_t length, unsigned int flags,
+				    struct iomap *iomap, bool da, bool mapped)
+{
+	struct ntfs_inode *ni = NTFS_I(inode);
+	struct ntfs_volume *vol = ni->vol;
+	struct attr_record *a;
+	struct ntfs_attr_search_ctx *ctx;
+	u32 attr_len;
+	int err = 0;
+	char *kattr;
+	struct page *ipage;
+
+	if (NVolShutdown(vol))
+		return -EIO;
+
+	mutex_lock(&ni->mrec_lock);
+	if (NInoNonResident(ni)) {
+		s64 vcn;
+		loff_t vcn_ofs;
+		loff_t rl_length;
+		s64 max_clu_count =
+			round_up(length, vol->cluster_size) >> vol->cluster_size_bits;
+
+		vcn = offset >> vol->cluster_size_bits;
+		vcn_ofs = offset & vol->cluster_size_mask;
+
+		if (da) {
+			bool balloc = false;
+			s64 start_lcn, lcn_count;
+			bool update_mp;
+
+			update_mp = (flags & IOMAP_DIRECT) || mapped ||
+				NInoAttr(ni) || ni->mft_no < FILE_first_user;
+			down_write(&ni->runlist.lock);
+			err = ntfs_attr_map_cluster(ni, vcn, &start_lcn, &lcn_count,
+					max_clu_count, &balloc, update_mp,
+					!(flags & IOMAP_DIRECT) && !mapped);
+			up_write(&ni->runlist.lock);
+			mutex_unlock(&ni->mrec_lock);
+			if (err) {
+				ni->i_dealloc_clusters = 0;
+				return err;
+			}
+
+			iomap->bdev = inode->i_sb->s_bdev;
+			iomap->offset = offset;
+
+			rl_length = lcn_count << ni->vol->cluster_size_bits;
+			if (length > rl_length - vcn_ofs)
+				iomap->length = rl_length - vcn_ofs;
+			else
+				iomap->length = length;
+
+			if (start_lcn == LCN_HOLE)
+				iomap->type = IOMAP_HOLE;
+			else
+				iomap->type = IOMAP_MAPPED;
+			if (balloc == true)
+				iomap->flags = IOMAP_F_NEW;
+
+			iomap->addr = (start_lcn << vol->cluster_size_bits) + vcn_ofs;
+
+			if (balloc == true) {
+				if (flags & IOMAP_DIRECT || mapped == true) {
+					loff_t end = offset + length;
+
+					if (vcn_ofs || ((vol->cluster_size > iomap->length) &&
+							end < ni->initialized_size))
+						err = ntfs_zeroed_clusters(inode,
+								start_lcn, 1);
+					if (!err && lcn_count > 1 &&
+					    (iomap->length & vol->cluster_size_mask &&
+					     end < ni->initialized_size))
+						err = ntfs_zeroed_clusters(inode,
+								start_lcn + (lcn_count - 1), 1);
+				} else {
+					if (lcn_count > ni->i_dealloc_clusters)
+						ni->i_dealloc_clusters = 0;
+					else
+						ni->i_dealloc_clusters -= lcn_count;
+				}
+				if (err < 0)
+					return err;
+			}
+
+			if (mapped && iomap->offset + iomap->length >
+			    ni->initialized_size) {
+				err = ntfs_attr_set_initialized_size(ni, iomap->offset +
+								     iomap->length);
+				if (err)
+					return err;
+			}
+		} else {
+			struct runlist_element *rl, *rlc;
+			s64 lcn;
+			bool is_retry = false;
+
+			down_read(&ni->runlist.lock);
+			rl = ni->runlist.rl;
+			if (!rl) {
+				up_read(&ni->runlist.lock);
+				err = ntfs_map_runlist(ni, vcn);
+				if (err) {
+					mutex_unlock(&ni->mrec_lock);
+					return -ENOENT;
+				}
+				down_read(&ni->runlist.lock);
+				rl = ni->runlist.rl;
+			}
+			up_read(&ni->runlist.lock);
+
+			down_write(&ni->runlist.lock);
+remap_rl:
+			/* Seek to element containing target vcn. */
+			while (rl->length && rl[1].vcn <= vcn)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+
+			if (lcn <= LCN_RL_NOT_MAPPED && is_retry == false) {
+				is_retry = true;
+				if (!ntfs_map_runlist_nolock(ni, vcn, NULL)) {
+					rl = ni->runlist.rl;
+					goto remap_rl;
+				}
+			}
+
+			max_clu_count = min(max_clu_count, rl->length - (vcn - rl->vcn));
+			if (max_clu_count == 0) {
+				ntfs_error(inode->i_sb,
+					   "runlist(vcn : %lld, length : %lld) is corrupted\n",
+					   rl->vcn, rl->length);
+				up_write(&ni->runlist.lock);
+				mutex_unlock(&ni->mrec_lock);
+				return -EIO;
+			}
+
+			iomap->bdev = inode->i_sb->s_bdev;
+			iomap->offset = offset;
+
+			if (lcn <= LCN_DELALLOC) {
+				if (lcn < LCN_DELALLOC) {
+					max_clu_count =
+						ntfs_available_clusters_count(vol, max_clu_count);
+					if (max_clu_count < 0) {
+						err = max_clu_count;
+						up_write(&ni->runlist.lock);
+						mutex_unlock(&ni->mrec_lock);
+						return err;
+					}
+				}
+
+				iomap->type = IOMAP_DELALLOC;
+				iomap->addr = IOMAP_NULL_ADDR;
+
+				if (lcn <= LCN_HOLE) {
+					size_t new_rl_count;
+
+					rlc = ntfs_malloc_nofs(sizeof(struct runlist_element) * 2);
+					if (!rlc) {
+						up_write(&ni->runlist.lock);
+						mutex_unlock(&ni->mrec_lock);
+						return -ENOMEM;
+					}
+
+					rlc->vcn = vcn;
+					rlc->lcn = LCN_DELALLOC;
+					rlc->length = max_clu_count;
+
+					rlc[1].vcn = vcn + max_clu_count;
+					rlc[1].lcn = LCN_RL_NOT_MAPPED;
+					rlc[1].length = 0;
+
+					rl = ntfs_runlists_merge(&ni->runlist, rlc, 0,
+							&new_rl_count);
+					if (IS_ERR(rl)) {
+						ntfs_error(vol->sb, "Failed to merge runlists");
+						up_write(&ni->runlist.lock);
+						mutex_unlock(&ni->mrec_lock);
+						ntfs_free(rlc);
+						return PTR_ERR(rl);
+					}
+
+					ni->runlist.rl = rl;
+					ni->runlist.count = new_rl_count;
+					ni->i_dealloc_clusters += max_clu_count;
+				}
+				up_write(&ni->runlist.lock);
+				mutex_unlock(&ni->mrec_lock);
+
+				if (lcn < LCN_DELALLOC)
+					ntfs_hold_dirty_clusters(vol, max_clu_count);
+
+				rl_length = max_clu_count << ni->vol->cluster_size_bits;
+				if (length > rl_length - vcn_ofs)
+					iomap->length = rl_length - vcn_ofs;
+				else
+					iomap->length = length;
+
+				iomap->flags = IOMAP_F_NEW;
+				if (lcn <= LCN_HOLE) {
+					loff_t end = offset + length;
+
+					if (vcn_ofs || ((vol->cluster_size > iomap->length) &&
+							end < ni->initialized_size))
+						err = ntfs_buffered_zeroed_clusters(inode, vcn);
+					if (!err && max_clu_count > 1 &&
+					    (iomap->length & vol->cluster_size_mask &&
+					     end < ni->initialized_size))
+						err = ntfs_buffered_zeroed_clusters(inode,
+								vcn + (max_clu_count - 1));
+					if (err) {
+						ntfs_release_dirty_clusters(vol, max_clu_count);
+						return err;
+					}
+				}
+			} else {
+				up_write(&ni->runlist.lock);
+				mutex_unlock(&ni->mrec_lock);
+
+				iomap->type = IOMAP_MAPPED;
+				iomap->addr = (lcn << vol->cluster_size_bits) + vcn_ofs;
+
+				rl_length = max_clu_count << ni->vol->cluster_size_bits;
+				if (length > rl_length - vcn_ofs)
+					iomap->length = rl_length - vcn_ofs;
+				else
+					iomap->length = length;
+			}
+		}
+
+		return 0;
+	}
+
+	ctx = ntfs_attr_get_search_ctx(ni, NULL);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (err) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto out;
+	}
+
+	a = ctx->attr;
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	kattr = (u8 *)a + le16_to_cpu(a->data.resident.value_offset);
+
+	ipage = alloc_page(__GFP_NOWARN | __GFP_IO | __GFP_ZERO);
+	if (!ipage) {
+		err = -ENOMEM;
+		goto out;
+	}
+	memcpy(page_address(ipage), kattr, attr_len);
+
+	iomap->type = IOMAP_INLINE;
+	iomap->inline_data = page_address(ipage);
+	iomap->offset = 0;
+	/* iomap requires there is only one INLINE_DATA extent */
+	iomap->length = attr_len;
+	iomap->private = ipage;
+
+out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	mutex_unlock(&ni->mrec_lock);
+
+	return err;
+}
+
+static int ntfs_write_iomap_begin(struct inode *inode, loff_t offset,
+				  loff_t length, unsigned int flags,
+				  struct iomap *iomap, struct iomap *srcmap)
+{
+	return __ntfs_write_iomap_begin(inode, offset, length, flags, iomap,
+			false, false);
+}
+
+static int ntfs_write_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+		ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_INLINE) {
+		struct page *ipage = iomap->private;
+		struct ntfs_inode *ni = NTFS_I(inode);
+		struct ntfs_attr_search_ctx *ctx;
+		u32 attr_len;
+		int err;
+		char *kattr;
+
+		mutex_lock(&ni->mrec_lock);
+		ctx = ntfs_attr_get_search_ctx(ni, NULL);
+		if (!ctx) {
+			written = -ENOMEM;
+			mutex_unlock(&ni->mrec_lock);
+			goto out;
+		}
+
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (err) {
+			if (err == -ENOENT)
+				err = -EIO;
+			written = err;
+			goto err_out;
+		}
+
+		/* The total length of the attribute value. */
+		attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+		if (pos >= attr_len || pos + written > attr_len)
+			goto err_out;
+
+		kattr = (u8 *)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset);
+		memcpy(kattr + pos, iomap_inline_data(iomap, pos), written);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+err_out:
+		ntfs_attr_put_search_ctx(ctx);
+		put_page(ipage);
+		mutex_unlock(&ni->mrec_lock);
+	}
+
+out:
+	return written;
+}
+
+const struct iomap_ops ntfs_write_iomap_ops = {
+	.iomap_begin		= ntfs_write_iomap_begin,
+	.iomap_end		= ntfs_write_iomap_end,
+};
+
+static int ntfs_page_mkwrite_iomap_begin(struct inode *inode, loff_t offset,
+				  loff_t length, unsigned int flags,
+				  struct iomap *iomap, struct iomap *srcmap)
+{
+	return __ntfs_write_iomap_begin(inode, offset, length, flags, iomap,
+			true, true);
+}
+
+const struct iomap_ops ntfs_page_mkwrite_iomap_ops = {
+	.iomap_begin		= ntfs_page_mkwrite_iomap_begin,
+	.iomap_end		= ntfs_write_iomap_end,
+};
+
+static int ntfs_dio_iomap_begin(struct inode *inode, loff_t offset,
+				  loff_t length, unsigned int flags,
+				  struct iomap *iomap, struct iomap *srcmap)
+{
+	return __ntfs_write_iomap_begin(inode, offset, length, flags, iomap,
+			true, false);
+}
+
+const struct iomap_ops ntfs_dio_iomap_ops = {
+	.iomap_begin		= ntfs_dio_iomap_begin,
+	.iomap_end		= ntfs_write_iomap_end,
+};
+
+static ssize_t ntfs_writeback_range(struct iomap_writepage_ctx *wpc,
+		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
+{
+	if (offset < wpc->iomap.offset ||
+	    offset >= wpc->iomap.offset + wpc->iomap.length) {
+		int error;
+
+		error = __ntfs_write_iomap_begin(wpc->inode, offset,
+				NTFS_I(wpc->inode)->allocated_size - offset,
+				IOMAP_WRITE, &wpc->iomap, true, false);
+		if (error)
+			return error;
+	}
+
+	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+}
+
+const struct iomap_writeback_ops ntfs_writeback_ops = {
+	.writeback_range	= ntfs_writeback_range,
+	.writeback_submit	= iomap_ioend_writeback_submit,
+};
-- 
2.25.1
Re: [PATCH v2 06/11] ntfsplus: add iomap and address space operations
Posted by Christoph Hellwig 2 months, 1 week ago
> +#include "ntfs_iomap.h"
> +
> +static s64 ntfs_convert_page_index_into_lcn(struct ntfs_volume *vol, struct ntfs_inode *ni,
> +		unsigned long page_index)
> +{
> +	sector_t iblock;
> +	s64 vcn;
> +	s64 lcn;
> +	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
> +
> +	iblock = (s64)page_index << (PAGE_SHIFT - blocksize_bits);
> +	vcn = (s64)iblock << blocksize_bits >> vol->cluster_size_bits;

I've seen this calculation in quite a few places; should there be a
generic helper for it?

> +struct bio *ntfs_setup_bio(struct ntfs_volume *vol, blk_opf_t opf, s64 lcn,
> +		unsigned int pg_ofs)
> +{
> +	struct bio *bio;
> +
> +	bio = bio_alloc(vol->sb->s_bdev, 1, opf, GFP_NOIO);
> +	if (!bio)
> +		return NULL;

bio_alloc never returns NULL if it can sleep.

> +	bio->bi_iter.bi_sector = ((lcn << vol->cluster_size_bits) + pg_ofs) >>
> +		vol->sb->s_blocksize_bits;

With a helper to calculate the sector, the ntfs_setup_bio helper
becomes somewhat questionable.

> +static int ntfs_read_folio(struct file *file, struct folio *folio)
> +{
> +	loff_t i_size;
> +	struct inode *vi;
> +	struct ntfs_inode *ni;
> +
> +	vi = folio->mapping->host;
> +	i_size = i_size_read(vi);
> +	/* Is the page fully outside i_size? (truncate in progress) */
> +	if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
> +			PAGE_SHIFT)) {
> +		folio_zero_segment(folio, 0, PAGE_SIZE);
> +		ntfs_debug("Read outside i_size - truncated?");
> +		folio_mark_uptodate(folio);
> +		folio_unlock(folio);
> +		return 0;
> +	}

iomap should be taking care of this; why do you need the extra
handling?

> +	/*
> +	 * This can potentially happen because we clear PageUptodate() during
> +	 * ntfs_writepage() of MstProtected() attributes.
> +	 */
> +	if (folio_test_uptodate(folio)) {
> +		folio_unlock(folio);
> +		return 0;
> +	}

Clearing the folio uptodate flag sounds fairly dangerous; why is that
done?

> +static int ntfs_write_mft_block(struct ntfs_inode *ni, struct folio *folio,
> +		struct writeback_control *wbc)

Just a very high-level comment here with no immediate action needed:
Is there a really good reason to use the page cache for metadata?
Our experience with XFS is that a dedicated buffer cache is not only
much easier to use, but also allows for much better caching.

> +static void ntfs_readahead(struct readahead_control *rac)
> +{
> +	struct address_space *mapping = rac->mapping;
> +	struct inode *inode = mapping->host;
> +	struct ntfs_inode *ni = NTFS_I(inode);
> +
> +	if (!NInoNonResident(ni) || NInoCompressed(ni)) {
> +		/* No readahead for resident and compressed. */
> +		return;
> +	}
> +
> +	if (NInoMstProtected(ni) &&
> +	    (ni->mft_no == FILE_MFT || ni->mft_no == FILE_MFTMirr))
> +		return;

Can you comment on why readahead is skipped here?

> +/**
> + * ntfs_compressed_aops - address space operations for compressed inodes
> + */
> +const struct address_space_operations ntfs_compressed_aops = {

From the code in other patches it looks like ntfs never switches between
compressed and non-compressed for live inodes?  In that case the
separate aops should be fine, as switching between them at runtime
would involve races.  Is the compression policy per-directory?

> +		kaddr = kmap_local_folio(folio, 0);
> +		offset = (loff_t)idx << PAGE_SHIFT;
> +		to = min_t(u32, end - offset, PAGE_SIZE);
> +
> +		memcpy(buf + buf_off, kaddr + from, to);
> +		buf_off += to;
> +		kunmap_local(kaddr);
> +		folio_put(folio);
> +	}

Would this be a candidate for memcpy_from_folio?

> +		kaddr = kmap_local_folio(folio, 0);
> +		offset = (loff_t)idx << PAGE_SHIFT;
> +		to = min_t(u32, end - offset, PAGE_SIZE);
> +
> +		memcpy(kaddr + from, buf + buf_off, to);
> +		buf_off += to;
> +		kunmap_local(kaddr);
> +		folio_mark_uptodate(folio);
> +		folio_mark_dirty(folio);

And memcpy_to_folio?

> +++ b/fs/ntfsplus/ntfs_iomap.c

Any reason for the ntfs_ prefix here?

> +static void ntfs_iomap_put_folio(struct inode *inode, loff_t pos,
> +		unsigned int len, struct folio *folio)
> +{

This seems to basically be entirely about extra zeroing.  Can you
explain why this is needed in a comment?

> +static int ntfs_read_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> +		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> +{
> +	struct ntfs_inode *base_ni, *ni = NTFS_I(inode);
> +	struct ntfs_attr_search_ctx *ctx;
> +	loff_t i_size;
> +	u32 attr_len;
> +	int err = 0;
> +	char *kattr;
> +	struct page *ipage;
> +
> +	if (NInoNonResident(ni)) {

Can you split the resident and non-resident cases into separate
helpers to keep this easier to follow?

> +	ipage = alloc_page(__GFP_NOWARN | __GFP_IO | __GFP_ZERO);
> +	if (!ipage) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy(page_address(ipage), kattr, attr_len);

Is there a reason for this being a page allocation vs a kmalloc
sized to the inline data?

> +static int ntfs_buffered_zeroed_clusters(struct inode *vi, s64 vcn)

I think this should be ntfs_buffered_zero_clusters as it
performs the action?

Also curious why this can't use the existing iomap zeroing helper?

> +int ntfs_zeroed_clusters(struct inode *vi, s64 lcn, s64 num)

ntfs_zero_clusters

Again curious why we need special zeroing code in the file system.

> +	if (NInoNonResident(ni)) {

Another case for splitting the resident/non-resident code instead
of having a giant conditional block that just returns.
Re: [PATCH v2 06/11] ntfsplus: add iomap and address space operations
Posted by Namjae Jeon 2 months, 1 week ago
On Mon, Dec 1, 2025 at 4:35 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> > +#include "ntfs_iomap.h"
> > +
> > +static s64 ntfs_convert_page_index_into_lcn(struct ntfs_volume *vol, struct ntfs_inode *ni,
> > +             unsigned long page_index)
> > +{
> > +     sector_t iblock;
> > +     s64 vcn;
> > +     s64 lcn;
> > +     unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
> > +
> > +     iblock = (s64)page_index << (PAGE_SHIFT - blocksize_bits);
> > +     vcn = (s64)iblock << blocksize_bits >> vol->cluster_size_bits;
>
> I've seen this calculation in quite a few places; should there be a
> generic helper for it?
Okay. I will add it.
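Something along these lines, perhaps (untested sketch; the helper name
is made up, and the intermediate block-size shift cancels out):

	static inline s64 ntfs_page_index_to_vcn(const struct ntfs_volume *vol,
			unsigned long page_index)
	{
		return (s64)page_index << PAGE_SHIFT >> vol->cluster_size_bits;
	}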
>
> > +struct bio *ntfs_setup_bio(struct ntfs_volume *vol, blk_opf_t opf, s64 lcn,
> > +             unsigned int pg_ofs)
> > +{
> > +     struct bio *bio;
> > +
> > +     bio = bio_alloc(vol->sb->s_bdev, 1, opf, GFP_NOIO);
> > +     if (!bio)
> > +             return NULL;
>
> bio_alloc never returns NULL if it can sleep.
Okay.
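With GFP_NOIO the allocation can sleep, so this simplifies to:

	bio = bio_alloc(vol->sb->s_bdev, 1, opf, GFP_NOIO);
	/* bio_alloc() with a sleeping gfp mask cannot return NULL. */
	bio->bi_iter.bi_sector = ((lcn << vol->cluster_size_bits) + pg_ofs) >>
			vol->sb->s_blocksize_bits;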
>
> > +     bio->bi_iter.bi_sector = ((lcn << vol->cluster_size_bits) + pg_ofs) >>
> > +             vol->sb->s_blocksize_bits;
>
> With a helper to calculate the sector, the ntfs_setup_bio helper
> becomes somewhat questionable.
Okay, I will check it.
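Maybe something like this (untested; the name is provisional), after
which the callers can call bio_alloc() directly and ntfs_setup_bio()
can be dropped:

	static inline sector_t ntfs_lcn_to_sector(const struct ntfs_volume *vol,
			s64 lcn, unsigned int ofs)
	{
		return ((lcn << vol->cluster_size_bits) + ofs) >>
				vol->sb->s_blocksize_bits;
	}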
>
> > +static int ntfs_read_folio(struct file *file, struct folio *folio)
> > +{
> > +     loff_t i_size;
> > +     struct inode *vi;
> > +     struct ntfs_inode *ni;
> > +
> > +     vi = folio->mapping->host;
> > +     i_size = i_size_read(vi);
> > +     /* Is the page fully outside i_size? (truncate in progress) */
> > +     if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
> > +                     PAGE_SHIFT)) {
> > +             folio_zero_segment(folio, 0, PAGE_SIZE);
> > +             ntfs_debug("Read outside i_size - truncated?");
> > +             folio_mark_uptodate(folio);
> > +             folio_unlock(folio);
> > +             return 0;
> > +     }
>
> iomap should be taking care of this; why do you need the extra
> handling?
This is a leftover from old ntfs, so I will remove it.
>
> > +     /*
> > +      * This can potentially happen because we clear PageUptodate() during
> > +      * ntfs_writepage() of MstProtected() attributes.
> > +      */
> > +     if (folio_test_uptodate(folio)) {
> > +             folio_unlock(folio);
> > +             return 0;
> > +     }
>
> Clearing the folio uptodate flag sounds fairly dangerous; why is that
> done?
This is a leftover from old ntfs; I will check it.
>
> > +static int ntfs_write_mft_block(struct ntfs_inode *ni, struct folio *folio,
> > +             struct writeback_control *wbc)
>
> Just a very high-level comment here with no immediate action needed:
> Is there a really good reason to use the page cache for metadata?
> Our experience with XFS is that a dedicated buffer cache is not only
> much easier to use, but also allows for much better caching.
No special reason; it was to use existing infrastructure instead of a
new, complex implementation. NTFS metadata is treated as a file, and
handling it via the folio(page) API allows the driver to easily gain
performance benefits, such as readahead.
>
> > +static void ntfs_readahead(struct readahead_control *rac)
> > +{
> > +     struct address_space *mapping = rac->mapping;
> > +     struct inode *inode = mapping->host;
> > +     struct ntfs_inode *ni = NTFS_I(inode);
> > +
> > +     if (!NInoNonResident(ni) || NInoCompressed(ni)) {
> > +             /* No readahead for resident and compressed. */
> > +             return;
> > +     }
> > +
> > +     if (NInoMstProtected(ni) &&
> > +         (ni->mft_no == FILE_MFT || ni->mft_no == FILE_MFTMirr))
> > +             return;
>
> Can you comment on why readahead is skipped here?
Okay, I will add it.
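A first draft, based on my reading of the write path (the MFT
writeback code clears the folio uptodate flag while the MST fixups
are applied):

	/*
	 * Skip readahead for $MFT and $MFTMirr: their folios have the
	 * uptodate flag cleared while MST fixups are applied during
	 * writeback, so speculative reads could race with that.
	 */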
>
> > +/**
> > + * ntfs_compressed_aops - address space operations for compressed inodes
> > + */
> > +const struct address_space_operations ntfs_compressed_aops = {
>
> > From the code in other patches it looks like ntfs never switches between
> compressed and non-compressed for live inodes?  In that case the
> separate aops should be fine, as switching between them at runtime
> would involve races.  Is the compression policy per-directory?
Non-compressed files can actually be switched to compressed files and
vice versa via setxattr at runtime. I will check the race handling
around aop switching again. And the compression policy is per-file,
not per-directory.
>
> > +             kaddr = kmap_local_folio(folio, 0);
> > +             offset = (loff_t)idx << PAGE_SHIFT;
> > +             to = min_t(u32, end - offset, PAGE_SIZE);
> > +
> > +             memcpy(buf + buf_off, kaddr + from, to);
> > +             buf_off += to;
> > +             kunmap_local(kaddr);
> > +             folio_put(folio);
> > +     }
>
> Would this be a candidate for memcpy_from_folio?
Right, I will change it.
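The kmap_local_folio()/memcpy()/kunmap_local() sequence would then
become (sketch, keeping the existing variables):

	offset = (loff_t)idx << PAGE_SHIFT;
	to = min_t(u32, end - offset, PAGE_SIZE);
	memcpy_from_folio(buf + buf_off, folio, from, to);
	buf_off += to;
	folio_put(folio);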
>
> > +             kaddr = kmap_local_folio(folio, 0);
> > +             offset = (loff_t)idx << PAGE_SHIFT;
> > +             to = min_t(u32, end - offset, PAGE_SIZE);
> > +
> > +             memcpy(kaddr + from, buf + buf_off, to);
> > +             buf_off += to;
> > +             kunmap_local(kaddr);
> > +             folio_mark_uptodate(folio);
> > +             folio_mark_dirty(folio);
>
> And memcpy_to_folio?
Okay, I will change it.
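Sketch, where 'from' is the offset within the folio:

	memcpy_to_folio(folio, from, buf + buf_off, to);
	buf_off += to;
	folio_mark_uptodate(folio);
	folio_mark_dirty(folio);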
>
> > +++ b/fs/ntfsplus/ntfs_iomap.c
>
> Any reason for the ntfs_ prefix here?
No reason; I will change it to iomap.c.
>
> > +static void ntfs_iomap_put_folio(struct inode *inode, loff_t pos,
> > +             unsigned int len, struct folio *folio)
> > +{
>
> This seems to basically be entirely about extra zeroing.  Can you
> explain why this is needed in a comment?
Okay, I will add a comment for this.
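Roughly along these lines, I think:

	/*
	 * iomap only zeroes the parts of the folio that the write
	 * itself does not touch.  ntfs additionally tracks an
	 * initialized size below i_size, so the range between
	 * initialized_size and the written range would otherwise
	 * expose stale page cache contents once the folio is marked
	 * uptodate; zero it here before the folio is unlocked.
	 */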
>
> > +static int ntfs_read_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > +             unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> > +{
> > +     struct ntfs_inode *base_ni, *ni = NTFS_I(inode);
> > +     struct ntfs_attr_search_ctx *ctx;
> > +     loff_t i_size;
> > +     u32 attr_len;
> > +     int err = 0;
> > +     char *kattr;
> > +     struct page *ipage;
> > +
> > +     if (NInoNonResident(ni)) {
>
> Can you split the resident and non-resident cases into separate
> helpers to keep this easier to follow?
Okay. I will.
>
> > +     ipage = alloc_page(__GFP_NOWARN | __GFP_IO | __GFP_ZERO);
> > +     if (!ipage) {
> > +             err = -ENOMEM;
> > +             goto out;
> > +     }
> > +
> > +     memcpy(page_address(ipage), kattr, attr_len);
>
> Is there a reason for this being a page allocation vs a kmalloc
> sized to the inline data?
No reason; I will change it to a kmalloc allocation sized to the inline data.
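i.e. roughly:

	iomap->inline_data = kmalloc(attr_len, GFP_NOFS);
	if (!iomap->inline_data) {
		err = -ENOMEM;
		goto out;
	}
	memcpy(iomap->inline_data, kattr, attr_len);
	iomap->private = iomap->inline_data;	/* kfree()d in ->iomap_end */

with the put_page() in the ->iomap_end callbacks replaced by kfree().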
>
> > +static int ntfs_buffered_zeroed_clusters(struct inode *vi, s64 vcn)
>
> I think this should be ntfs_buffered_zero_clusters as it
> performs the action?
Okay. I will change it.
>
> Also curious why this can't use the existing iomap zeroing helper?
I will check it.
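If it works out, ntfs_buffered_zeroed_clusters() would collapse to
something like this (untested; the exact iomap_zero_range() argument
list depends on the target tree):

	loff_t pos = vcn << vol->cluster_size_bits;
	bool did_zero;

	err = iomap_zero_range(vi, pos, vol->cluster_size, &did_zero,
			&ntfs_write_iomap_ops, &ntfs_iomap_folio_ops, NULL);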
>
> > +int ntfs_zeroed_clusters(struct inode *vi, s64 lcn, s64 num)
>
> ntfs_zero_clusters
Okay.
>
> Again curious why we need special zeroing code in the file system.
To prevent reading garbage data after a new cluster allocation, we
must zero out the cluster. The cluster size can be up to 2MB; I will
check if that's possible through iomap.
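For the on-disk side, blkdev_issue_zeroout() seems to already do what
ntfs_zeroed_clusters() open-codes, and it can offload to
REQ_OP_WRITE_ZEROES when the device supports it, e.g.:

	err = blkdev_issue_zeroout(vol->sb->s_bdev,
			(lcn << vol->cluster_size_bits) >> SECTOR_SHIFT,
			(num << vol->cluster_size_bits) >> SECTOR_SHIFT,
			GFP_NOFS, 0);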
>
> > +     if (NInoNonResident(ni)) {
>
> Another case for splitting the resident/non-resident code instead
> of having a giant conditional block that just returns.
Okay. Thanks for your review!
>
Re: [PATCH v2 06/11] ntfsplus: add iomap and address space operations
Posted by Christoph Hellwig 2 months, 1 week ago
On Tue, Dec 02, 2025 at 09:47:17AM +0900, Namjae Jeon wrote:
> No special reason; it was to use existing infrastructure instead of a
> new, complex implementation. NTFS metadata is treated as a file, and
> handling it via the folio(page) API allows the driver to easily gain
> performance benefits, such as readahead.

On the one hand it does; on the other hand, at least in our experience,
the user-data algorithms for things like readahead and cache eviction
policies worked pretty poorly for metadata in XFS.
Of course I don't actually know if the same applies to ntfs.

> > From the code in other patches it looks like ntfs never switches between
> > compressed and non-compressed for live inodes?  In that case the
> > separate aops should be fine, as switching between them at runtime
> > would involve races.  Is the compression policy per-directory?
> Non-compressed files can actually be switched to compressed files and
> vice versa via setxattr at runtime. I will check the race handling
> around aop switching again. And the compression policy is per-file,
> not per-directory.

In that case you probably want to use the same set of address space
(and other operations) and do runtime switching inside the method.

> >
> > Again curious why we need special zeroing code in the file system.
> To prevent reading garbage data after a new cluster allocation, we
> must zero out the cluster. The cluster size can be up to 2MB; I will
> check if that's possible through iomap.

Ouch, that's a lot of zeroing.  But yeah, now that you mention it
XFS actually has the same issue with large RT extents, although we
create them as unwritten extents, i.e. disk allocations that always
return zeroes.  I guess ntfs doesn't have that.  For DAX access
there actually is zeroing in the allocator, which is probably
similar to what is done here, just always using the iomap-based
code (check for xfs_zero_range and callers).
Re: [PATCH v2 06/11] ntfsplus: add iomap and address space operations
Posted by Namjae Jeon 2 months, 1 week ago
On Tue, Dec 2, 2025 at 2:45 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Tue, Dec 02, 2025 at 09:47:17AM +0900, Namjae Jeon wrote:
> > No special reason; it was to use existing infrastructure instead of a
> > new, complex implementation. NTFS metadata is treated as a file, and
> > handling it via the folio(page) API allows the driver to easily gain
> > performance benefits, such as readahead.
>
> On the one hand it does; on the other hand, at least in our experience,
> the user-data algorithms for things like readahead and cache eviction
> policies worked pretty poorly for metadata in XFS.
> Of course I don't actually know if the same applies to ntfs.
We have observed performance improvements from readahead for NTFS
metadata since we are able to identify the contiguous cluster ranges
of metadata files.
>
> > > From the code in other patches it looks like ntfs never switches between
> > > compressed and non-compressed for live inodes?  In that case the
> > > separate aops should be fine, as switching between them at runtime
> > > would involve races.  Is the compression policy per-directory?
> > Non-compressed files can actually be switched to compressed files and
> > vice versa via setxattr at runtime. I will check the race handling
> > around aop switching again. And the compression policy is per-file,
> > not per-directory.
>
> In that case you probably want to use the same set of address space
> (and other operations) and do runtime switching inside the method.
Right, I will change it.
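e.g. keeping a single aops and branching inside the methods, so the
aops pointer never has to change on a live inode (sketch based on the
current ntfs_read_folio()):

	static int ntfs_read_folio(struct file *file, struct folio *folio)
	{
		struct ntfs_inode *ni = NTFS_I(folio->mapping->host);

		if (ni->type != AT_INDEX_ALLOCATION &&
		    NInoNonResident(ni) && NInoCompressed(ni))
			return ntfs_read_compressed_block(folio);
		return iomap_read_folio(folio, &ntfs_read_iomap_ops);
	}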
>
> > >
> > > Again curious why we need special zeroing code in the file system.
> > To prevent reading garbage data after a new cluster allocation, we
> > must zero out the cluster. The cluster size can be up to 2MB; I will
> > check if that's possible through iomap.
>
> Ouch, that's a lot of zeroing.  But yeah, now that you mention it
> XFS actually has the same issue with large RT extents, although we
> create them as unwritten extents, i.e. disk allocations that always
> return zeroes.  I guess ntfs doesn't have that.  For DAX access
> there actually is zeroing in the allocator, which is probably
> similar to what is done here, just always using the iomap-based
> code (check for xfs_zero_range and callers).
Right, ntfs does not have a direct equivalent to the unwritten extent mechanism.
I will check xfs codes. Thank you very much for the detailed review!