[v1] ext4 extent split/convert refactor and kunit tests

[PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 1 month ago

Currently, zeroout is used as a fallback in case we fail to
split/convert extents in the "traditional" modify-the-extent-tree way.
This is essential to mitigate failures in critical paths like extent
splitting during endio. However, the logic is very messy and not easy to
follow. Further, the fragile use of various flags has made it prone to
errors.

Refactor zeroout out logic by moving it up to ext4_split_extents().
Further, zeroout correctly based on the type of conversion we want, ie:
- unwritten to written: Zeroout everything around the mapped range.
- unwritten to unwritten: Zeroout everything
- written to unwritten: Zeroout only the mapped range.

Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
 fs/ext4/extents.c | 287 +++++++++++++++++++++++++++++++---------------
 1 file changed, 195 insertions(+), 92 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 460a70e6dae0..8082e1d93bbf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,14 +44,6 @@
 #define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
 #define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
 
-/* first half contains valid data */
-#define EXT4_EXT_DATA_ENTIRE_VALID1	0x8   /* has entirely valid data */
-#define EXT4_EXT_DATA_PARTIAL_VALID1	0x10  /* has partially valid data */
-#define EXT4_EXT_DATA_VALID1		(EXT4_EXT_DATA_ENTIRE_VALID1 | \
-					 EXT4_EXT_DATA_PARTIAL_VALID1)
-
-#define EXT4_EXT_DATA_VALID2	0x20 /* second half contains valid data */
-
 static __le32 ext4_extent_block_csum(struct inode *inode,
 				     struct ext4_extent_header *eh)
 {
@@ -3194,7 +3186,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  *  a> the extent are splitted into two extent.
  *  b> split is not needed, and just mark the extent.
  *
- * Return an extent path pointer on success, or an error pointer on failure.
+ * Return an extent path pointer on success, or an error pointer on failure. On
+ * failure, the extent is restored to original state.
  */
 static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 						  struct inode *inode,
@@ -3204,14 +3197,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 {
 	ext4_fsblk_t newblock;
 	ext4_lblk_t ee_block;
-	struct ext4_extent *ex, newex, orig_ex, zero_ex;
+	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex2 = NULL;
 	unsigned int ee_len, depth;
-	int err = 0;
-
-	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) == EXT4_EXT_DATA_VALID1);
-	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) &&
-	       (split_flag & EXT4_EXT_DATA_VALID2));
+	int err = 0, insert_err = 0;
 
 	/* Do not cache extents that are in the process of being modified. */
 	flags |= EXT4_EX_NOCACHE;
@@ -3277,11 +3266,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 
 	path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (!IS_ERR(path))
-		goto out;
+		return path;
 
-	err = PTR_ERR(path);
-	if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
-		goto out_path;
+	insert_err = PTR_ERR(path);
+	err = 0;
 
 	/*
 	 * Get a new path to try to zeroout or fix the extent length.
@@ -3297,53 +3285,13 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 				 split, PTR_ERR(path));
 		goto out_path;
 	}
-	depth = ext_depth(inode);
-	ex = path[depth].p_ext;
-
-	if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
-		if (split_flag & EXT4_EXT_DATA_VALID1)
-			memcpy(&zero_ex, ex2, sizeof(zero_ex));
-		else if (split_flag & EXT4_EXT_DATA_VALID2)
-			memcpy(&zero_ex, ex, sizeof(zero_ex));
-		else
-			memcpy(&zero_ex, &orig_ex, sizeof(zero_ex));
-		ext4_ext_mark_initialized(&zero_ex);
 
-		err = ext4_ext_zeroout(inode, &zero_ex);
-		if (err)
-			goto fix_extent_len;
-
-		/*
-		 * The first half contains partially valid data, the splitting
-		 * of this extent has not been completed, fix extent length
-		 * and ext4_split_extent() split will the first half again.
-		 */
-		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
-			/*
-			 * Drop extent cache to prevent stale unwritten
-			 * extents remaining after zeroing out.
-			 */
-			ext4_es_remove_extent(inode,
-					le32_to_cpu(zero_ex.ee_block),
-					ext4_ext_get_actual_len(&zero_ex));
-			goto fix_extent_len;
-		}
-
-		/* update the extent length and mark as initialized */
-		ex->ee_len = cpu_to_le16(ee_len);
-		ext4_ext_try_to_merge(handle, inode, path, ex);
-		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-		if (!err)
-			/* update extent status tree */
-			ext4_zeroout_es(inode, &zero_ex);
-		/*
-		 * If we failed at this point, we don't know in which
-		 * state the extent tree exactly is so don't try to fix
-		 * length of the original extent as it may do even more
-		 * damage.
-		 */
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
 		goto out;
-	}
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
 
 fix_extent_len:
 	ex->ee_len = orig_ex.ee_len;
@@ -3353,9 +3301,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 	 */
 	ext4_ext_dirty(handle, inode, path + path->p_depth);
 out:
-	if (err) {
+	if (err || insert_err) {
 		ext4_free_ext_path(path);
-		path = ERR_PTR(err);
+		path = err ? ERR_PTR(err) : ERR_PTR(insert_err);
 	}
 out_path:
 	if (IS_ERR(path))
@@ -3365,6 +3313,115 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 	return path;
 }
 
+static struct ext4_ext_path *
+ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
+			  struct ext4_ext_path *path,
+			  struct ext4_map_blocks *map, int flags)
+{
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	ext4_lblk_t ee_block;
+	uint64_t lblk, pblk, len;
+	int is_unwrit;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	is_unwrit = ext4_ext_is_unwritten(ex);
+
+	if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		/*
+		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
+		 * map to be initialized. Zeroout everything except the map
+		 * range.
+		 */
+
+		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
+		loff_t ex_end = (loff_t) ee_block + ee_len;
+
+		if (!is_unwrit)
+			/* Shouldn't happen. Just exit */
+			return ERR_PTR(-EINVAL);
+
+		/* zeroout left */
+		if (map->m_lblk > ee_block) {
+			lblk = ee_block;
+			len = map->m_lblk - ee_block;
+			pblk = ext4_ext_pblock(ex);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				/* ZEROOUT failed, just return original error */
+				return ERR_PTR(err);
+		}
+
+		/* zeroout right */
+		if (map->m_lblk + map->m_len < ee_block + ee_len) {
+			lblk = map_end;
+			len = ex_end - map_end;
+			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				/* ZEROOUT failed, just return original error */
+				return ERR_PTR(err);
+		}
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		/*
+		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
+		 * range specified by map to be marked unwritten.
+		 * Zeroout the map range leaving rest as it is.
+		 */
+
+		if (is_unwrit)
+			/* Shouldn't happen. Just exit */
+			return ERR_PTR(-EINVAL);
+
+		lblk = map->m_lblk;
+		len = map->m_len;
+		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
+		err = ext4_issue_zeroout(inode, lblk, pblk, len);
+		if (err)
+			/* ZEROOUT failed, just return original error */
+			return ERR_PTR(err);
+	} else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
+		/*
+		 * EXT4_GET_BLOCKS_UNWRIT_EXT: Today, this flag
+		 * implicitly implies that callers when wanting an
+		 * unwritten to unwritten split. So zeroout the whole
+		 * extent.
+		 *
+		 * TODO: The implicit meaning of the flag is not ideal
+		 * and eventually we should aim for a more well defined
+		 * behavior
+		 */
+
+		if (!is_unwrit)
+			/* Shouldn't happen. Just exit */
+			return ERR_PTR(-EINVAL);
+
+		lblk = ee_block;
+		len = ee_len;
+		pblk = ext4_ext_pblock(ex);
+		err = ext4_issue_zeroout(inode, lblk, pblk, len);
+		if (err)
+			/* ZEROOUT failed, just return original error */
+			return ERR_PTR(err);
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return ERR_PTR(err);
+
+	ext4_ext_mark_initialized(ex);
+
+	ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (err)
+		return ERR_PTR(err);
+
+	return 0;
+}
+
 /*
  * ext4_split_extent() splits an extent and mark extent which is covered
  * by @map as split_flags indicates
@@ -3383,11 +3440,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 					       int split_flag, int flags,
 					       unsigned int *allocated)
 {
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, orig_ee_block;
 	struct ext4_extent *ex;
-	unsigned int ee_len, depth;
-	int unwritten;
-	int split_flag1, flags1;
+	unsigned int ee_len, orig_ee_len, depth;
+	int unwritten, orig_unwritten;
+	int split_flag1 = 0, flags1 = 0;
+	int err = 0, orig_err;
 
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
@@ -3395,23 +3453,29 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 	ee_len = ext4_ext_get_actual_len(ex);
 	unwritten = ext4_ext_is_unwritten(ex);
 
+	orig_ee_block = ee_block;
+	orig_ee_len = ee_len;
+	orig_unwritten = unwritten;
+
 	/* Do not cache extents that are in the process of being modified. */
 	flags |= EXT4_EX_NOCACHE;
 
 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
-		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
 		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 		if (unwritten)
 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
 				       EXT4_EXT_MARK_UNWRIT2;
-		if (split_flag & EXT4_EXT_DATA_VALID2)
-			split_flag1 |= map->m_lblk > ee_block ?
-				       EXT4_EXT_DATA_PARTIAL_VALID1 :
-				       EXT4_EXT_DATA_ENTIRE_VALID1;
 		path = ext4_split_extent_at(handle, inode, path,
 				map->m_lblk + map->m_len, split_flag1, flags1);
-		if (IS_ERR(path))
-			return path;
+
+		if (IS_ERR(path)) {
+			orig_err = PTR_ERR(path);
+			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
+			    orig_err != -ENOMEM)
+				return path;
+
+			goto try_zeroout;
+		}
 		/*
 		 * Update path is required because previous ext4_split_extent_at
 		 * may result in split of original leaf or extent zeroout.
@@ -3427,22 +3491,68 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 			ext4_free_ext_path(path);
 			return ERR_PTR(-EFSCORRUPTED);
 		}
-		unwritten = ext4_ext_is_unwritten(ex);
 	}
 
 	if (map->m_lblk >= ee_block) {
-		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		split_flag1 = 0;
 		if (unwritten) {
 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
-			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
-						     EXT4_EXT_MARK_UNWRIT2);
+			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
 		}
-		path = ext4_split_extent_at(handle, inode, path,
-				map->m_lblk, split_flag1, flags);
+		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
+					    split_flag1, flags);
+
+		if (IS_ERR(path)) {
+			orig_err = PTR_ERR(path);
+			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
+			    orig_err != -ENOMEM)
+				return path;
+
+			goto try_zeroout;
+		}
+	}
+
+	if (!err)
+		goto out;
+
+try_zeroout:
+	/*
+	 * There was an error in splitting the extent, just zeroout and convert
+	 * to initialize as a last resort
+	 */
+	if (split_flag & EXT4_EXT_MAY_ZEROOUT) {
+		path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
 		if (IS_ERR(path))
 			return path;
+
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		ee_block = le32_to_cpu(ex->ee_block);
+		ee_len = ext4_ext_get_actual_len(ex);
+		unwritten = ext4_ext_is_unwritten(ex);
+
+		/*
+		 * The extent to zeroout should have been unchanged
+		 * but its not, just return error to caller
+		 */
+		if (WARN_ON(ee_block != orig_ee_block ||
+			    ee_len != orig_ee_len ||
+			    unwritten != orig_unwritten))
+			return ERR_PTR(orig_err);
+
+		/*
+		 * Something went wrong in zeroout, just return the
+		 * original error
+		 */
+		if (ext4_split_extent_zeroout(handle, inode, path, map, flags))
+			return ERR_PTR(orig_err);
 	}
 
+	/* There's an error and we can't zeroout, just return the err */
+	return ERR_PTR(orig_err);
+
+out:
+
 	if (allocated) {
 		if (map->m_lblk + map->m_len > ee_block + ee_len)
 			*allocated = ee_len - (map->m_lblk - ee_block);
@@ -3486,7 +3596,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth, map_len = map->m_len;
 	int err = 0;
-	int split_flag = EXT4_EXT_DATA_VALID2;
+	int split_flag = 0;
 	unsigned int max_zeroout = 0;
 
 	ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3760,11 +3870,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 
-	/* Convert to unwritten */
-	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
-		split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1;
-	/* Split the existing unwritten extent */
-	} else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
+	if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
 			    EXT4_GET_BLOCKS_CONVERT)) {
 		/*
 		 * It is safe to convert extent to initialized via explicit
@@ -3773,9 +3879,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
 		split_flag |= ee_block + ee_len <= eof_block ?
 			      EXT4_EXT_MAY_ZEROOUT : 0;
 		split_flag |= EXT4_EXT_MARK_UNWRIT2;
-		/* Convert to initialized */
-		if (flags & EXT4_GET_BLOCKS_CONVERT)
-			split_flag |= EXT4_EXT_DATA_VALID2;
 	}
 	flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags,
-- 
2.51.0

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Zhang Yi 1 month ago

On 1/4/2026 8:19 PM, Ojaswin Mujoo wrote:
> Currently, zeroout is used as a fallback in case we fail to
> split/convert extents in the "traditional" modify-the-extent-tree way.
> This is essential to mitigate failures in critical paths like extent
> splitting during endio. However, the logic is very messy and not easy to
> follow. Further, the fragile use of various flags has made it prone to
> errors.
> 
> Refactor zeroout out logic by moving it up to ext4_split_extents().
> Further, zeroout correctly based on the type of conversion we want, ie:
> - unwritten to written: Zeroout everything around the mapped range.
> - unwritten to unwritten: Zeroout everything
> - written to unwritten: Zeroout only the mapped range.
> 
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Hi, Ojaswin!

The refactor overall looks good to me. After this series, the split
logic becomes more straightforward and clear. :)

I have some comments below.

> ---
>  fs/ext4/extents.c | 287 +++++++++++++++++++++++++++++++---------------
>  1 file changed, 195 insertions(+), 92 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 460a70e6dae0..8082e1d93bbf 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c

[...]

> @@ -3365,6 +3313,115 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>  	return path;
>  }
>  
> +static struct ext4_ext_path *
> +ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> +			  struct ext4_ext_path *path,
> +			  struct ext4_map_blocks *map, int flags)
> +{
> +	struct ext4_extent *ex;
> +	unsigned int ee_len, depth;
> +	ext4_lblk_t ee_block;
> +	uint64_t lblk, pblk, len;
> +	int is_unwrit;
> +	int err = 0;
> +
> +	depth = ext_depth(inode);
> +	ex = path[depth].p_ext;
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = ext4_ext_get_actual_len(ex);
> +	is_unwrit = ext4_ext_is_unwritten(ex);
> +
> +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
> +		/*
> +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> +		 * map to be initialized. Zeroout everything except the map
> +		 * range.
> +		 */
> +
> +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> +		loff_t ex_end = (loff_t) ee_block + ee_len;
> +
> +		if (!is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return ERR_PTR(-EINVAL);

For cases that are should not happen, I'd suggest adding a WARN_ON_ONCE or
a message to facilitate future problem identification. Same as below.

> +
> +		/* zeroout left */
> +		if (map->m_lblk > ee_block) {
> +			lblk = ee_block;
> +			len = map->m_lblk - ee_block;
> +			pblk = ext4_ext_pblock(ex);
> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +			if (err)
> +				/* ZEROOUT failed, just return original error */
> +				return ERR_PTR(err);
> +		}
> +
> +		/* zeroout right */
> +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> +			lblk = map_end;
> +			len = ex_end - map_end;
> +			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +			if (err)
> +				/* ZEROOUT failed, just return original error */
> +				return ERR_PTR(err);
> +		}
> +	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
> +		/*
> +		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
> +		 * range specified by map to be marked unwritten.
> +		 * Zeroout the map range leaving rest as it is.
> +		 */
> +
> +		if (is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return ERR_PTR(-EINVAL);
> +
> +		lblk = map->m_lblk;
> +		len = map->m_len;
> +		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +		if (err)
> +			/* ZEROOUT failed, just return original error */
> +			return ERR_PTR(err);
> +	} else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
> +		/*
> +		 * EXT4_GET_BLOCKS_UNWRIT_EXT: Today, this flag
> +		 * implicitly implies that callers when wanting an
> +		 * unwritten to unwritten split. So zeroout the whole
> +		 * extent.
> +		 *
> +		 * TODO: The implicit meaning of the flag is not ideal
> +		 * and eventually we should aim for a more well defined
> +		 * behavior
> +		 */
> +

I don't think we need this branch anymore. After applying my patch "ext4:
don't split extent before submitting I/O", we will no longer encounter
situations where doing an unwritten to unwritten split. It means that at
all call sites of ext4_split_extent(), only EXT4_GET_BLOCKS_CONVERT or
EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flags are passed. What do you think?

Thanks,
Yi.

> +		if (!is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return ERR_PTR(-EINVAL);
> +
> +		lblk = ee_block;
> +		len = ee_len;
> +		pblk = ext4_ext_pblock(ex);
> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +		if (err)
> +			/* ZEROOUT failed, just return original error */
> +			return ERR_PTR(err);
> +	}
> +
> +	err = ext4_ext_get_access(handle, inode, path + depth);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ext4_ext_mark_initialized(ex);
> +
> +	ext4_ext_dirty(handle, inode, path + path->p_depth);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	return 0;
> +}
> +
>  /*
>   * ext4_split_extent() splits an extent and mark extent which is covered
>   * by @map as split_flags indicates

[...]

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 1 month ago

On Thu, Jan 08, 2026 at 07:58:21PM +0800, Zhang Yi wrote:
> On 1/4/2026 8:19 PM, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> > 
> > Refactor zeroout out logic by moving it up to ext4_split_extents().
> > Further, zeroout correctly based on the type of conversion we want, ie:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - unwritten to unwritten: Zeroout everything
> > - written to unwritten: Zeroout only the mapped range.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> Hi, Ojaswin!
> 
> The refactor overall looks good to me. After this series, the split
> logic becomes more straightforward and clear. :)
> 
> I have some comments below.
> 
> > ---
> >  fs/ext4/extents.c | 287 +++++++++++++++++++++++++++++++---------------
> >  1 file changed, 195 insertions(+), 92 deletions(-)
> > 
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 460a70e6dae0..8082e1d93bbf 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> 
> [...]
> 
> > @@ -3365,6 +3313,115 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
> >  	return path;
> >  }
> >  
> > +static struct ext4_ext_path *
> > +ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> > +			  struct ext4_ext_path *path,
> > +			  struct ext4_map_blocks *map, int flags)
> > +{
> > +	struct ext4_extent *ex;
> > +	unsigned int ee_len, depth;
> > +	ext4_lblk_t ee_block;
> > +	uint64_t lblk, pblk, len;
> > +	int is_unwrit;
> > +	int err = 0;
> > +
> > +	depth = ext_depth(inode);
> > +	ex = path[depth].p_ext;
> > +	ee_block = le32_to_cpu(ex->ee_block);
> > +	ee_len = ext4_ext_get_actual_len(ex);
> > +	is_unwrit = ext4_ext_is_unwritten(ex);
> > +
> > +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
> > +		/*
> > +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> > +		 * map to be initialized. Zeroout everything except the map
> > +		 * range.
> > +		 */
> > +
> > +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> > +		loff_t ex_end = (loff_t) ee_block + ee_len;
> > +
> > +		if (!is_unwrit)
> > +			/* Shouldn't happen. Just exit */
> > +			return ERR_PTR(-EINVAL);
> 
> For cases that are should not happen, I'd suggest adding a WARN_ON_ONCE or
> a message to facilitate future problem identification. Same as below.

Hi Yi,

Thanks for the review! Sure I can do that in v2.
> 
> > +
> > +		/* zeroout left */
> > +		if (map->m_lblk > ee_block) {
> > +			lblk = ee_block;
> > +			len = map->m_lblk - ee_block;
> > +			pblk = ext4_ext_pblock(ex);
> > +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +			if (err)
> > +				/* ZEROOUT failed, just return original error */
> > +				return ERR_PTR(err);
> > +		}
> > +
> > +		/* zeroout right */
> > +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> > +			lblk = map_end;
> > +			len = ex_end - map_end;
> > +			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
> > +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +			if (err)
> > +				/* ZEROOUT failed, just return original error */
> > +				return ERR_PTR(err);
> > +		}
> > +	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
> > +		/*
> > +		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
> > +		 * range specified by map to be marked unwritten.
> > +		 * Zeroout the map range leaving rest as it is.
> > +		 */
> > +
> > +		if (is_unwrit)
> > +			/* Shouldn't happen. Just exit */
> > +			return ERR_PTR(-EINVAL);
> > +
> > +		lblk = map->m_lblk;
> > +		len = map->m_len;
> > +		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
> > +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +		if (err)
> > +			/* ZEROOUT failed, just return original error */
> > +			return ERR_PTR(err);
> > +	} else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
> > +		/*
> > +		 * EXT4_GET_BLOCKS_UNWRIT_EXT: Today, this flag
> > +		 * implicitly implies that callers when wanting an
> > +		 * unwritten to unwritten split. So zeroout the whole
> > +		 * extent.
> > +		 *
> > +		 * TODO: The implicit meaning of the flag is not ideal
> > +		 * and eventually we should aim for a more well defined
> > +		 * behavior
> > +		 */
> > +
> 
> I don't think we need this branch anymore. After applying my patch "ext4:
> don't split extent before submitting I/O", we will no longer encounter
> situations where doing an unwritten to unwritten split. It means that at
> all call sites of ext4_split_extent(), only EXT4_GET_BLOCKS_CONVERT or
> EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flags are passed. What do you think?

Yes, I did notice that as well after rebasing on your changes. 

So the next patch enforces the behavior that if no flag is passed to
ext4_split_extent() -> ext4_split_extent_zeroout() then we assume a
split without conversion. As you mentioned, there is no remaining caller
that does this but I thought of handling it here so that in future if we
ever need to use unwrit to unwrit splits we handle it correctly.

Incase you still feel this makes it confusing or is uneccessary I can
remove the else part altoghether and add a WARN_ON.

Thanks,
ojaswin

> 
> Thanks,
> Yi.
> 
> > +		if (!is_unwrit)
> > +			/* Shouldn't happen. Just exit */
> > +			return ERR_PTR(-EINVAL);
> > +
> > +		lblk = ee_block;
> > +		len = ee_len;
> > +		pblk = ext4_ext_pblock(ex);
> > +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +		if (err)
> > +			/* ZEROOUT failed, just return original error */
> > +			return ERR_PTR(err);
> > +	}
> > +
> > +	err = ext4_ext_get_access(handle, inode, path + depth);
> > +	if (err)
> > +		return ERR_PTR(err);
> > +
> > +	ext4_ext_mark_initialized(ex);
> > +
> > +	ext4_ext_dirty(handle, inode, path + path->p_depth);
> > +	if (err)
> > +		return ERR_PTR(err);
> > +
> > +	return 0;
> > +}
> > +
> >  /*
> >   * ext4_split_extent() splits an extent and mark extent which is covered
> >   * by @map as split_flags indicates
> 
> [...]
>

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Zhang Yi 1 month ago

On 1/8/2026 8:42 PM, Ojaswin Mujoo wrote:
> On Thu, Jan 08, 2026 at 07:58:21PM +0800, Zhang Yi wrote:
>> On 1/4/2026 8:19 PM, Ojaswin Mujoo wrote:
>>> Currently, zeroout is used as a fallback in case we fail to
>>> split/convert extents in the "traditional" modify-the-extent-tree way.
>>> This is essential to mitigate failures in critical paths like extent
>>> splitting during endio. However, the logic is very messy and not easy to
>>> follow. Further, the fragile use of various flags has made it prone to
>>> errors.
>>>
>>> Refactor zeroout out logic by moving it up to ext4_split_extents().
>>> Further, zeroout correctly based on the type of conversion we want, ie:
>>> - unwritten to written: Zeroout everything around the mapped range.
>>> - unwritten to unwritten: Zeroout everything
>>> - written to unwritten: Zeroout only the mapped range.
>>>
>>> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>>
>> Hi, Ojaswin!
>>
>> The refactor overall looks good to me. After this series, the split
>> logic becomes more straightforward and clear. :)
>>
>> I have some comments below.
>>
>>> ---
>>>  fs/ext4/extents.c | 287 +++++++++++++++++++++++++++++++---------------
>>>  1 file changed, 195 insertions(+), 92 deletions(-)
>>>
>>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>>> index 460a70e6dae0..8082e1d93bbf 100644
>>> --- a/fs/ext4/extents.c
>>> +++ b/fs/ext4/extents.c
>>
>> [...]
>>
>>> @@ -3365,6 +3313,115 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>>>  	return path;
>>>  }
>>>  
>>> +static struct ext4_ext_path *
>>> +ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
>>> +			  struct ext4_ext_path *path,
>>> +			  struct ext4_map_blocks *map, int flags)
>>> +{
>>> +	struct ext4_extent *ex;
>>> +	unsigned int ee_len, depth;
>>> +	ext4_lblk_t ee_block;
>>> +	uint64_t lblk, pblk, len;
>>> +	int is_unwrit;
>>> +	int err = 0;
>>> +
>>> +	depth = ext_depth(inode);
>>> +	ex = path[depth].p_ext;
>>> +	ee_block = le32_to_cpu(ex->ee_block);
>>> +	ee_len = ext4_ext_get_actual_len(ex);
>>> +	is_unwrit = ext4_ext_is_unwritten(ex);
>>> +
>>> +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
>>> +		/*
>>> +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
>>> +		 * map to be initialized. Zeroout everything except the map
>>> +		 * range.
>>> +		 */
>>> +
>>> +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
>>> +		loff_t ex_end = (loff_t) ee_block + ee_len;
>>> +
>>> +		if (!is_unwrit)
>>> +			/* Shouldn't happen. Just exit */
>>> +			return ERR_PTR(-EINVAL);
>>
>> For cases that are should not happen, I'd suggest adding a WARN_ON_ONCE or
>> a message to facilitate future problem identification. Same as below.
> 
> Hi Yi,
> 
> Thanks for the review! Sure I can do that in v2.
>>
>>> +
>>> +		/* zeroout left */
>>> +		if (map->m_lblk > ee_block) {
>>> +			lblk = ee_block;
>>> +			len = map->m_lblk - ee_block;
>>> +			pblk = ext4_ext_pblock(ex);
>>> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
>>> +			if (err)
>>> +				/* ZEROOUT failed, just return original error */
>>> +				return ERR_PTR(err);
>>> +		}
>>> +
>>> +		/* zeroout right */
>>> +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
>>> +			lblk = map_end;
>>> +			len = ex_end - map_end;
>>> +			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
>>> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
>>> +			if (err)
>>> +				/* ZEROOUT failed, just return original error */
>>> +				return ERR_PTR(err);
>>> +		}
>>> +	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
>>> +		/*
>>> +		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
>>> +		 * range specified by map to be marked unwritten.
>>> +		 * Zeroout the map range leaving rest as it is.
>>> +		 */
>>> +
>>> +		if (is_unwrit)
>>> +			/* Shouldn't happen. Just exit */
>>> +			return ERR_PTR(-EINVAL);
>>> +
>>> +		lblk = map->m_lblk;
>>> +		len = map->m_len;
>>> +		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
>>> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
>>> +		if (err)
>>> +			/* ZEROOUT failed, just return original error */
>>> +			return ERR_PTR(err);
>>> +	} else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
>>> +		/*
>>> +		 * EXT4_GET_BLOCKS_UNWRIT_EXT: Today, this flag
>>> +		 * implicitly implies that callers when wanting an
>>> +		 * unwritten to unwritten split. So zeroout the whole
>>> +		 * extent.
>>> +		 *
>>> +		 * TODO: The implicit meaning of the flag is not ideal
>>> +		 * and eventually we should aim for a more well defined
>>> +		 * behavior
>>> +		 */
>>> +
>>
>> I don't think we need this branch anymore. After applying my patch "ext4:
>> don't split extent before submitting I/O", we will no longer encounter
>> situations where doing an unwritten to unwritten split. It means that at
>> all call sites of ext4_split_extent(), only EXT4_GET_BLOCKS_CONVERT or
>> EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flags are passed. What do you think?
> 
> Yes, I did notice that as well after rebasing on your changes. 
> 
> So the next patch enforces the behavior that if no flag is passed to
> ext4_split_extent() -> ext4_split_extent_zeroout() then we assume a
> split without conversion. As you mentioned, there is no remaining caller
> that does this but I thought of handling it here so that in future if we
> ever need to use unwrit to unwrit splits we handle it correctly.
> 
> Incase you still feel this makes it confusing or is uneccessary I can
> remove the else part altoghether and add a WARN_ON.
> 

Yes, my personal suggestion is to add this part of the logic only when it
is really needed. :)

Cheers,
Yi.

> Thanks,
> ojaswin
> 
>>
>> Thanks,
>> Yi.
>>
>>> +		if (!is_unwrit)
>>> +			/* Shouldn't happen. Just exit */
>>> +			return ERR_PTR(-EINVAL);
>>> +
>>> +		lblk = ee_block;
>>> +		len = ee_len;
>>> +		pblk = ext4_ext_pblock(ex);
>>> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
>>> +		if (err)
>>> +			/* ZEROOUT failed, just return original error */
>>> +			return ERR_PTR(err);
>>> +	}
>>> +
>>> +	err = ext4_ext_get_access(handle, inode, path + depth);
>>> +	if (err)
>>> +		return ERR_PTR(err);
>>> +
>>> +	ext4_ext_mark_initialized(ex);
>>> +
>>> +	ext4_ext_dirty(handle, inode, path + path->p_depth);
>>> +	if (err)
>>> +		return ERR_PTR(err);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * ext4_split_extent() splits an extent and mark extent which is covered
>>>   * by @map as split_flags indicates
>>
>> [...]
>>

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 1 month ago

On Thu, Jan 08, 2026 at 08:54:04PM +0800, Zhang Yi wrote:
> On 1/8/2026 8:42 PM, Ojaswin Mujoo wrote:
> > On Thu, Jan 08, 2026 at 07:58:21PM +0800, Zhang Yi wrote:
> >> On 1/4/2026 8:19 PM, Ojaswin Mujoo wrote:
> >>> Currently, zeroout is used as a fallback in case we fail to
> >>> split/convert extents in the "traditional" modify-the-extent-tree way.
> >>> This is essential to mitigate failures in critical paths like extent
> >>> splitting during endio. However, the logic is very messy and not easy to
> >>> follow. Further, the fragile use of various flags has made it prone to
> >>> errors.
> >>>
> >>> Refactor zeroout out logic by moving it up to ext4_split_extents().
> >>> Further, zeroout correctly based on the type of conversion we want, ie:
> >>> - unwritten to written: Zeroout everything around the mapped range.
> >>> - unwritten to unwritten: Zeroout everything
> >>> - written to unwritten: Zeroout only the mapped range.
> >>>
> >>> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> >>
> >> Hi, Ojaswin!
> >>
> >> The refactor overall looks good to me. After this series, the split
> >> logic becomes more straightforward and clear. :)
> >>
> >> I have some comments below.
> >>
> >>> ---
> >>>  fs/ext4/extents.c | 287 +++++++++++++++++++++++++++++++---------------
> >>>  1 file changed, 195 insertions(+), 92 deletions(-)
> >>>
> >>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> >>> index 460a70e6dae0..8082e1d93bbf 100644
> >>> --- a/fs/ext4/extents.c
> >>> +++ b/fs/ext4/extents.c
> >>
> >> [...]
> >>
> >>> @@ -3365,6 +3313,115 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
> >>>  	return path;
> >>>  }
> >>>  
> >>> +static struct ext4_ext_path *
> >>> +ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> >>> +			  struct ext4_ext_path *path,
> >>> +			  struct ext4_map_blocks *map, int flags)
> >>> +{
> >>> +	struct ext4_extent *ex;
> >>> +	unsigned int ee_len, depth;
> >>> +	ext4_lblk_t ee_block;
> >>> +	uint64_t lblk, pblk, len;
> >>> +	int is_unwrit;
> >>> +	int err = 0;
> >>> +
> >>> +	depth = ext_depth(inode);
> >>> +	ex = path[depth].p_ext;
> >>> +	ee_block = le32_to_cpu(ex->ee_block);
> >>> +	ee_len = ext4_ext_get_actual_len(ex);
> >>> +	is_unwrit = ext4_ext_is_unwritten(ex);
> >>> +
> >>> +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
> >>> +		/*
> >>> +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> >>> +		 * map to be initialized. Zeroout everything except the map
> >>> +		 * range.
> >>> +		 */
> >>> +
> >>> +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> >>> +		loff_t ex_end = (loff_t) ee_block + ee_len;
> >>> +
> >>> +		if (!is_unwrit)
> >>> +			/* Shouldn't happen. Just exit */
> >>> +			return ERR_PTR(-EINVAL);
> >>
> >> For cases that are should not happen, I'd suggest adding a WARN_ON_ONCE or
> >> a message to facilitate future problem identification. Same as below.
> > 
> > Hi Yi,
> > 
> > Thanks for the review! Sure I can do that in v2.
> >>
> >>> +
> >>> +		/* zeroout left */
> >>> +		if (map->m_lblk > ee_block) {
> >>> +			lblk = ee_block;
> >>> +			len = map->m_lblk - ee_block;
> >>> +			pblk = ext4_ext_pblock(ex);
> >>> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> >>> +			if (err)
> >>> +				/* ZEROOUT failed, just return original error */
> >>> +				return ERR_PTR(err);
> >>> +		}
> >>> +
> >>> +		/* zeroout right */
> >>> +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> >>> +			lblk = map_end;
> >>> +			len = ex_end - map_end;
> >>> +			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
> >>> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> >>> +			if (err)
> >>> +				/* ZEROOUT failed, just return original error */
> >>> +				return ERR_PTR(err);
> >>> +		}
> >>> +	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
> >>> +		/*
> >>> +		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
> >>> +		 * range specified by map to be marked unwritten.
> >>> +		 * Zeroout the map range leaving rest as it is.
> >>> +		 */
> >>> +
> >>> +		if (is_unwrit)
> >>> +			/* Shouldn't happen. Just exit */
> >>> +			return ERR_PTR(-EINVAL);
> >>> +
> >>> +		lblk = map->m_lblk;
> >>> +		len = map->m_len;
> >>> +		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
> >>> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> >>> +		if (err)
> >>> +			/* ZEROOUT failed, just return original error */
> >>> +			return ERR_PTR(err);
> >>> +	} else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
> >>> +		/*
> >>> +		 * EXT4_GET_BLOCKS_UNWRIT_EXT: Today, this flag
> >>> +		 * implicitly implies that callers when wanting an
> >>> +		 * unwritten to unwritten split. So zeroout the whole
> >>> +		 * extent.
> >>> +		 *
> >>> +		 * TODO: The implicit meaning of the flag is not ideal
> >>> +		 * and eventually we should aim for a more well defined
> >>> +		 * behavior
> >>> +		 */
> >>> +
> >>
> >> I don't think we need this branch anymore. After applying my patch "ext4:
> >> don't split extent before submitting I/O", we will no longer encounter
> >> situations where doing an unwritten to unwritten split. It means that at
> >> all call sites of ext4_split_extent(), only EXT4_GET_BLOCKS_CONVERT or
> >> EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flags are passed. What do you think?
> > 
> > Yes, I did notice that as well after rebasing on your changes. 
> > 
> > So the next patch enforces the behavior that if no flag is passed to
> > ext4_split_extent() -> ext4_split_extent_zeroout() then we assume a
> > split without conversion. As you mentioned, there is no remaining caller
> > that does this but I thought of handling it here so that in future if we
> > ever need to use unwrit to unwrit splits we handle it correctly.
> > 
> > Incase you still feel this makes it confusing or is uneccessary I can
> > remove the else part altoghether and add a WARN_ON.
> > 
> 
> Yes, my personal suggestion is to add this part of the logic only when it
> is really needed. :)

Okay sounds good, will take this out in v2.

Regards,
ojaswin

> 
> Cheers,
> Yi.
> 
> > Thanks,
> > ojaswin
> > 
> >>
> >> Thanks,
> >> Yi.
> >>
> >>> +		if (!is_unwrit)
> >>> +			/* Shouldn't happen. Just exit */
> >>> +			return ERR_PTR(-EINVAL);
> >>> +
> >>> +		lblk = ee_block;
> >>> +		len = ee_len;
> >>> +		pblk = ext4_ext_pblock(ex);
> >>> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> >>> +		if (err)
> >>> +			/* ZEROOUT failed, just return original error */
> >>> +			return ERR_PTR(err);
> >>> +	}
> >>> +
> >>> +	err = ext4_ext_get_access(handle, inode, path + depth);
> >>> +	if (err)
> >>> +		return ERR_PTR(err);
> >>> +
> >>> +	ext4_ext_mark_initialized(ex);
> >>> +
> >>> +	ext4_ext_dirty(handle, inode, path + path->p_depth);
> >>> +	if (err)
> >>> +		return ERR_PTR(err);
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>>  /*
> >>>   * ext4_split_extent() splits an extent and mark extent which is covered
> >>>   * by @map as split_flags indicates
> >>
> >> [...]
> >>
>

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Jan Kara 1 month ago

On Sun 04-01-26 17:49:18, Ojaswin Mujoo wrote:
> Currently, zeroout is used as a fallback in case we fail to
> split/convert extents in the "traditional" modify-the-extent-tree way.
> This is essential to mitigate failures in critical paths like extent
> splitting during endio. However, the logic is very messy and not easy to
> follow. Further, the fragile use of various flags has made it prone to
> errors.
> 
> Refactor zeroout out logic by moving it up to ext4_split_extents().
> Further, zeroout correctly based on the type of conversion we want, ie:
> - unwritten to written: Zeroout everything around the mapped range.
> - unwritten to unwritten: Zeroout everything
> - written to unwritten: Zeroout only the mapped range.
> 
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

...

> @@ -3383,11 +3440,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  					       int split_flag, int flags,
>  					       unsigned int *allocated)
>  {
> -	ext4_lblk_t ee_block;
> +	ext4_lblk_t ee_block, orig_ee_block;
>  	struct ext4_extent *ex;
> -	unsigned int ee_len, depth;
> -	int unwritten;
> -	int split_flag1, flags1;
> +	unsigned int ee_len, orig_ee_len, depth;
> +	int unwritten, orig_unwritten;
> +	int split_flag1 = 0, flags1 = 0;
> +	int err = 0, orig_err;

Cannot orig_err be used uninitialized in this function? At least it isn't
obvious to me some of the branches setting it is always taken.

> @@ -3395,23 +3453,29 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  	ee_len = ext4_ext_get_actual_len(ex);
>  	unwritten = ext4_ext_is_unwritten(ex);
>  
> +	orig_ee_block = ee_block;
> +	orig_ee_len = ee_len;
> +	orig_unwritten = unwritten;
> +
>  	/* Do not cache extents that are in the process of being modified. */
>  	flags |= EXT4_EX_NOCACHE;
>  
>  	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
>  		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
>  		if (unwritten)
>  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
>  				       EXT4_EXT_MARK_UNWRIT2;
> -		if (split_flag & EXT4_EXT_DATA_VALID2)
> -			split_flag1 |= map->m_lblk > ee_block ?
> -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> -				       EXT4_EXT_DATA_ENTIRE_VALID1;
>  		path = ext4_split_extent_at(handle, inode, path,
>  				map->m_lblk + map->m_len, split_flag1, flags1);
> -		if (IS_ERR(path))
> -			return path;
> +
> +		if (IS_ERR(path)) {
> +			orig_err = PTR_ERR(path);
> +			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
> +			    orig_err != -ENOMEM)
> +				return path;
> +
> +			goto try_zeroout;
> +		}
>  		/*
>  		 * Update path is required because previous ext4_split_extent_at
>  		 * may result in split of original leaf or extent zeroout.
> @@ -3427,22 +3491,68 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  			ext4_free_ext_path(path);
>  			return ERR_PTR(-EFSCORRUPTED);
>  		}
> -		unwritten = ext4_ext_is_unwritten(ex);
>  	}
>  
>  	if (map->m_lblk >= ee_block) {
> -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> +		split_flag1 = 0;
>  		if (unwritten) {
>  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> -						     EXT4_EXT_MARK_UNWRIT2);
> +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
>  		}
> -		path = ext4_split_extent_at(handle, inode, path,
> -				map->m_lblk, split_flag1, flags);
> +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> +					    split_flag1, flags);
> +
> +		if (IS_ERR(path)) {
> +			orig_err = PTR_ERR(path);
> +			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
> +			    orig_err != -ENOMEM)
> +				return path;
> +
> +			goto try_zeroout;
> +		}
> +	}
> +
> +	if (!err)

Nothing touches 'err' in this function...

> +		goto out;
> +
> +try_zeroout:
> +	/*
> +	 * There was an error in splitting the extent, just zeroout and convert
> +	 * to initialize as a last resort
> +	 */
> +	if (split_flag & EXT4_EXT_MAY_ZEROOUT) {
> +		path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
>  		if (IS_ERR(path))
>  			return path;
> +
> +		depth = ext_depth(inode);
> +		ex = path[depth].p_ext;
> +		ee_block = le32_to_cpu(ex->ee_block);
> +		ee_len = ext4_ext_get_actual_len(ex);
> +		unwritten = ext4_ext_is_unwritten(ex);
> +
> +		/*
> +		 * The extent to zeroout should have been unchanged
> +		 * but its not, just return error to caller
> +		 */
> +		if (WARN_ON(ee_block != orig_ee_block ||
> +			    ee_len != orig_ee_len ||
> +			    unwritten != orig_unwritten))
> +			return ERR_PTR(orig_err);
> +
> +		/*
> +		 * Something went wrong in zeroout, just return the
> +		 * original error
> +		 */
> +		if (ext4_split_extent_zeroout(handle, inode, path, map, flags))
> +			return ERR_PTR(orig_err);
>  	}

Also nothing seems to zero out orig_err in case zero out above succeeded.
What am I missing?

								Honza

>  
> +	/* There's an error and we can't zeroout, just return the err */
> +	return ERR_PTR(orig_err);
> +
> +out:
> +
>  	if (allocated) {
>  		if (map->m_lblk + map->m_len > ee_block + ee_len)
>  			*allocated = ee_len - (map->m_lblk - ee_block);
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH 5/7] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 1 month ago

On Tue, Jan 06, 2026 at 04:31:23PM +0100, Jan Kara wrote:
> On Sun 04-01-26 17:49:18, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> > 
> > Refactor zeroout out logic by moving it up to ext4_split_extents().
> > Further, zeroout correctly based on the type of conversion we want, ie:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - unwritten to unwritten: Zeroout everything
> > - written to unwritten: Zeroout only the mapped range.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> ...
> 
> > @@ -3383,11 +3440,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  					       int split_flag, int flags,
> >  					       unsigned int *allocated)
> >  {
> > -	ext4_lblk_t ee_block;
> > +	ext4_lblk_t ee_block, orig_ee_block;
> >  	struct ext4_extent *ex;
> > -	unsigned int ee_len, depth;
> > -	int unwritten;
> > -	int split_flag1, flags1;
> > +	unsigned int ee_len, orig_ee_len, depth;
> > +	int unwritten, orig_unwritten;
> > +	int split_flag1 = 0, flags1 = 0;
> > +	int err = 0, orig_err;
> 
> Cannot orig_err be used uninitialized in this function? At least it isn't
> obvious to me some of the branches setting it is always taken.

Hi Jan, thanks for the reviews. Yes orig_err is always initialized
before it is used (initialized on error and used in zeroout path which
is only taked on error), but I agree that we can just init it to 0.

> 
> > @@ -3395,23 +3453,29 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  	ee_len = ext4_ext_get_actual_len(ex);
> >  	unwritten = ext4_ext_is_unwritten(ex);
> >  
> > +	orig_ee_block = ee_block;
> > +	orig_ee_len = ee_len;
> > +	orig_unwritten = unwritten;
> > +
> >  	/* Do not cache extents that are in the process of being modified. */
> >  	flags |= EXT4_EX_NOCACHE;
> >  
> >  	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> > -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
> >  		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
> >  		if (unwritten)
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
> >  				       EXT4_EXT_MARK_UNWRIT2;
> > -		if (split_flag & EXT4_EXT_DATA_VALID2)
> > -			split_flag1 |= map->m_lblk > ee_block ?
> > -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> > -				       EXT4_EXT_DATA_ENTIRE_VALID1;
> >  		path = ext4_split_extent_at(handle, inode, path,
> >  				map->m_lblk + map->m_len, split_flag1, flags1);
> > -		if (IS_ERR(path))
> > -			return path;
> > +
> > +		if (IS_ERR(path)) {
> > +			orig_err = PTR_ERR(path);
> > +			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
> > +			    orig_err != -ENOMEM)
> > +				return path;
> > +
> > +			goto try_zeroout;
> > +		}
> >  		/*
> >  		 * Update path is required because previous ext4_split_extent_at
> >  		 * may result in split of original leaf or extent zeroout.
> > @@ -3427,22 +3491,68 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  			ext4_free_ext_path(path);
> >  			return ERR_PTR(-EFSCORRUPTED);
> >  		}
> > -		unwritten = ext4_ext_is_unwritten(ex);
> >  	}
> >  
> >  	if (map->m_lblk >= ee_block) {
> > -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> > +		split_flag1 = 0;
> >  		if (unwritten) {
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> > -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> > -						     EXT4_EXT_MARK_UNWRIT2);
> > +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
> >  		}
> > -		path = ext4_split_extent_at(handle, inode, path,
> > -				map->m_lblk, split_flag1, flags);
> > +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> > +					    split_flag1, flags);
> > +
> > +		if (IS_ERR(path)) {
> > +			orig_err = PTR_ERR(path);
> > +			if (orig_err != -ENOSPC && orig_err != -EDQUOT &&
> > +			    orig_err != -ENOMEM)
> > +				return path;
> > +
> > +			goto try_zeroout;
> > +		}
> > +	}
> > +
> > +	if (!err)
> 
> Nothing touches 'err' in this function...

Yes :), I'll remove this.

> 
> > +		goto out;
> > +
> > +try_zeroout:
> > +	/*
> > +	 * There was an error in splitting the extent, just zeroout and convert
> > +	 * to initialize as a last resort
> > +	 */
> > +	if (split_flag & EXT4_EXT_MAY_ZEROOUT) {
> > +		path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> >  		if (IS_ERR(path))
> >  			return path;
> > +
> > +		depth = ext_depth(inode);
> > +		ex = path[depth].p_ext;
> > +		ee_block = le32_to_cpu(ex->ee_block);
> > +		ee_len = ext4_ext_get_actual_len(ex);
> > +		unwritten = ext4_ext_is_unwritten(ex);
> > +
> > +		/*
> > +		 * The extent to zeroout should have been unchanged
> > +		 * but its not, just return error to caller
> > +		 */
> > +		if (WARN_ON(ee_block != orig_ee_block ||
> > +			    ee_len != orig_ee_len ||
> > +			    unwritten != orig_unwritten))
> > +			return ERR_PTR(orig_err);
> > +
> > +		/*
> > +		 * Something went wrong in zeroout, just return the
> > +		 * original error
> > +		 */
> > +		if (ext4_split_extent_zeroout(handle, inode, path, map, flags))
> > +			return ERR_PTR(orig_err);
> >  	}
> 
> Also nothing seems to zero out orig_err in case zero out above succeeded.
> What am I missing?

So if zeroout here succeeds we just goto out and return path, we never
use orig_err.  Not the best practice and admittedly, I seem to have
complicated the error handling a bit. I will streamline it in v2. 

Thanks for pointing this out.
Ojaswin.
> 
> 								Honza
> 
> >  
> > +	/* There's an error and we can't zeroout, just return the err */
> > +	return ERR_PTR(orig_err);
> > +
> > +out:
> > +
> >  	if (allocated) {
> >  		if (map->m_lblk + map->m_len > ee_block + ee_len)
> >  			*allocated = ee_len - (map->m_lblk - ee_block);
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

[PATCH 1/7] ext4: kunit tests for extent splitting and conversion
[PATCH 2/7] ext4: kunit tests for higher level extent manipulation functions
[PATCH 3/7] ext4: propagate flags to convert_initialized_extent()
[PATCH 4/7] ext4: propagate flags to ext4_convert_unwritten_extents_endio()
[PATCH 5/7] ext4: Refactor zeroout path and handle all cases
[PATCH 6/7] ext4: Refactor split and convert extents
[PATCH 7/7] ext4: Allow zeroout when doing written to unwritten split