[v2] ext4 extent split/convert refactor and kunit tests

[PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 3 weeks, 3 days ago

Currently, zeroout is used as a fallback in case we fail to
split/convert extents in the "traditional" modify-the-extent-tree way.
This is essential to mitigate failures in critical paths like extent
splitting during endio. However, the logic is very messy and not easy to
follow. Further, the fragile use of various flags has made it prone to
errors.

Refactor zeroout out logic by moving it up to ext4_split_extents().
Further, zeroout correctly based on the type of conversion we want, ie:
- unwritten to written: Zeroout everything around the mapped range.
- written to unwritten: Zeroout only the mapped range.

Also, ext4_ext_convert_to_initialized() now passes
EXT4_GET_BLOCKS_CONVERT to make the intention clear.

Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
 fs/ext4/extents.c | 286 ++++++++++++++++++++++++++++++----------------
 1 file changed, 188 insertions(+), 98 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54f45b40fe73..70d85f007dc7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,14 +44,6 @@
 #define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
 #define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
 
-/* first half contains valid data */
-#define EXT4_EXT_DATA_ENTIRE_VALID1	0x8   /* has entirely valid data */
-#define EXT4_EXT_DATA_PARTIAL_VALID1	0x10  /* has partially valid data */
-#define EXT4_EXT_DATA_VALID1		(EXT4_EXT_DATA_ENTIRE_VALID1 | \
-					 EXT4_EXT_DATA_PARTIAL_VALID1)
-
-#define EXT4_EXT_DATA_VALID2	0x20 /* second half contains valid data */
-
 static __le32 ext4_extent_block_csum(struct inode *inode,
 				     struct ext4_extent_header *eh)
 {
@@ -3193,7 +3185,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  *  a> the extent are splitted into two extent.
  *  b> split is not needed, and just mark the extent.
  *
- * Return an extent path pointer on success, or an error pointer on failure.
+ * Return an extent path pointer on success, or an error pointer on failure. On
+ * failure, the extent is restored to original state.
  */
 static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 						  struct inode *inode,
@@ -3203,14 +3196,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 {
 	ext4_fsblk_t newblock;
 	ext4_lblk_t ee_block;
-	struct ext4_extent *ex, newex, orig_ex, zero_ex;
+	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex2 = NULL;
 	unsigned int ee_len, depth;
-	int err = 0;
-
-	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) == EXT4_EXT_DATA_VALID1);
-	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) &&
-	       (split_flag & EXT4_EXT_DATA_VALID2));
+	int err = 0, insert_err = 0;
 
 	/* Do not cache extents that are in the process of being modified. */
 	flags |= EXT4_EX_NOCACHE;
@@ -3276,11 +3265,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 
 	path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (!IS_ERR(path))
-		goto out;
+		return path;
 
-	err = PTR_ERR(path);
-	if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
-		goto out_path;
+	insert_err = PTR_ERR(path);
+	err = 0;
 
 	/*
 	 * Get a new path to try to zeroout or fix the extent length.
@@ -3296,72 +3284,130 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 				 split, PTR_ERR(path));
 		goto out_path;
 	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
 
-	if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
-		if (split_flag & EXT4_EXT_DATA_VALID1)
-			memcpy(&zero_ex, ex2, sizeof(zero_ex));
-		else if (split_flag & EXT4_EXT_DATA_VALID2)
-			memcpy(&zero_ex, ex, sizeof(zero_ex));
-		else
-			memcpy(&zero_ex, &orig_ex, sizeof(zero_ex));
-		ext4_ext_mark_initialized(&zero_ex);
+fix_extent_len:
+	ex->ee_len = orig_ex.ee_len;
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+	if (err || insert_err) {
+		ext4_free_ext_path(path);
+		path = err ? ERR_PTR(err) : ERR_PTR(insert_err);
+	}
+out_path:
+	if (IS_ERR(path))
+		/* Remove all remaining potentially stale extents. */
+		ext4_es_remove_extent(inode, ee_block, ee_len);
+	ext4_ext_show_leaf(inode, path);
+	return path;
+}
 
-		err = ext4_ext_zeroout(inode, &zero_ex);
-		if (err)
-			goto fix_extent_len;
+static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
+				     struct ext4_ext_path *path,
+				     struct ext4_map_blocks *map, int flags)
+{
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	ext4_lblk_t ee_block;
+	uint64_t lblk, pblk, len;
+	int is_unwrit;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	is_unwrit = ext4_ext_is_unwritten(ex);
 
+	if (flags & EXT4_GET_BLOCKS_CONVERT) {
 		/*
-		 * The first half contains partially valid data, the splitting
-		 * of this extent has not been completed, fix extent length
-		 * and ext4_split_extent() split will the first half again.
+		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
+		 * map to be initialized. Zeroout everything except the map
+		 * range.
 		 */
-		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
-			/*
-			 * Drop extent cache to prevent stale unwritten
-			 * extents remaining after zeroing out.
-			 */
-			ext4_es_remove_extent(inode,
-					le32_to_cpu(zero_ex.ee_block),
-					ext4_ext_get_actual_len(&zero_ex));
-			goto fix_extent_len;
+
+		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
+		loff_t ex_end = (loff_t) ee_block + ee_len;
+
+		if (!is_unwrit)
+			/* Shouldn't happen. Just exit */
+			return -EINVAL;
+
+		/* zeroout left */
+		if (map->m_lblk > ee_block) {
+			lblk = ee_block;
+			len = map->m_lblk - ee_block;
+			pblk = ext4_ext_pblock(ex);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				/* ZEROOUT failed, just return original error */
+				return err;
 		}
 
-		/* update the extent length and mark as initialized */
-		ex->ee_len = cpu_to_le16(ee_len);
-		ext4_ext_try_to_merge(handle, inode, path, ex);
-		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-		if (!err)
-			/* update extent status tree */
-			ext4_zeroout_es(inode, &zero_ex);
+		/* zeroout right */
+		if (map->m_lblk + map->m_len < ee_block + ee_len) {
+			lblk = map_end;
+			len = ex_end - map_end;
+			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				/* ZEROOUT failed, just return original error */
+				return err;
+		}
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
 		/*
-		 * If we failed at this point, we don't know in which
-		 * state the extent tree exactly is so don't try to fix
-		 * length of the original extent as it may do even more
-		 * damage.
+		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
+		 * range specified by map to be marked unwritten.
+		 * Zeroout the map range leaving rest as it is.
 		 */
-		goto out;
+
+		if (is_unwrit)
+			/* Shouldn't happen. Just exit */
+			return -EINVAL;
+
+		lblk = map->m_lblk;
+		len = map->m_len;
+		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
+		err = ext4_issue_zeroout(inode, lblk, pblk, len);
+		if (err)
+			/* ZEROOUT failed, just return original error */
+			return err;
+	} else {
+		/*
+		 * We no longer perform unwritten to unwritten splits in IO paths.
+		 * Hence this should not happen.
+		 */
+		WARN_ON_ONCE(true);
+		return -EINVAL;
 	}
 
-fix_extent_len:
-	ex->ee_len = orig_ex.ee_len;
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+
+	ext4_ext_mark_initialized(ex);
+
+	ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (err)
+		return err;
+
 	/*
-	 * Ignore ext4_ext_dirty return value since we are already in error path
-	 * and err is a non-zero error code.
+	 * The whole extent is initialized and stable now so it can be added to
+	 * es cache
 	 */
-	ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
-	if (err) {
-		ext4_free_ext_path(path);
-		path = ERR_PTR(err);
-	}
-out_path:
-	if (IS_ERR(path))
-		/* Remove all remaining potentially stale extents. */
-		ext4_es_remove_extent(inode, ee_block, ee_len);
-	ext4_ext_show_leaf(inode, path);
-	return path;
+	if (!(flags & EXT4_EX_NOCACHE))
+		ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block),
+				      ext4_ext_get_actual_len(ex),
+				      ext4_ext_pblock(ex),
+				      EXTENT_STATUS_WRITTEN, false);
+
+	return 0;
 }
 
 /*
@@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 					       int split_flag, int flags,
 					       unsigned int *allocated)
 {
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, orig_ee_block;
 	struct ext4_extent *ex;
-	unsigned int ee_len, depth;
-	int unwritten;
-	int split_flag1, flags1;
+	unsigned int ee_len, orig_ee_len, depth;
+	int unwritten, orig_unwritten;
+	int split_flag1 = 0, flags1 = 0;
+	int  orig_err = 0;
+	int orig_flags = flags;
 
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
@@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 	ee_len = ext4_ext_get_actual_len(ex);
 	unwritten = ext4_ext_is_unwritten(ex);
 
+	orig_ee_block = ee_block;
+	orig_ee_len = ee_len;
+	orig_unwritten = unwritten;
+
 	/* Do not cache extents that are in the process of being modified. */
 	flags |= EXT4_EX_NOCACHE;
 
 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
-		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
 		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 		if (unwritten)
 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
 				       EXT4_EXT_MARK_UNWRIT2;
-		if (split_flag & EXT4_EXT_DATA_VALID2)
-			split_flag1 |= map->m_lblk > ee_block ?
-				       EXT4_EXT_DATA_PARTIAL_VALID1 :
-				       EXT4_EXT_DATA_ENTIRE_VALID1;
 		path = ext4_split_extent_at(handle, inode, path,
 				map->m_lblk + map->m_len, split_flag1, flags1);
 		if (IS_ERR(path))
-			return path;
+			goto try_zeroout;
+
 		/*
 		 * Update path is required because previous ext4_split_extent_at
 		 * may result in split of original leaf or extent zeroout.
 		 */
 		path = ext4_find_extent(inode, map->m_lblk, path, flags);
 		if (IS_ERR(path))
-			return path;
+			goto try_zeroout;
+
 		depth = ext_depth(inode);
 		ex = path[depth].p_ext;
 		if (!ex) {
@@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 			ext4_free_ext_path(path);
 			return ERR_PTR(-EFSCORRUPTED);
 		}
-		unwritten = ext4_ext_is_unwritten(ex);
 	}
 
 	if (map->m_lblk >= ee_block) {
-		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		split_flag1 = 0;
 		if (unwritten) {
 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
-			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
-						     EXT4_EXT_MARK_UNWRIT2);
+			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
 		}
-		path = ext4_split_extent_at(handle, inode, path,
-				map->m_lblk, split_flag1, flags);
+		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
+					    split_flag1, flags);
 		if (IS_ERR(path))
-			return path;
+			goto try_zeroout;
 	}
 
+	goto success;
+
+try_zeroout:
+	/*
+	 * There was an error in splitting the extent. So instead, just zeroout
+	 * unwritten portions and convert it to initialize as a last resort. If
+	 * there is any failure here we just return the original error
+	 */
+
+	orig_err = PTR_ERR(path);
+	if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
+		goto out_orig_err;
+
+	if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
+		/* There's an error and we can't zeroout, just return the
+		 * original err
+		 */
+		goto out_orig_err;
+
+	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
+	if (IS_ERR(path))
+		goto out_orig_err;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	unwritten = ext4_ext_is_unwritten(ex);
+
+	if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
+		    unwritten != orig_unwritten))
+		/*
+		 * The extent to zeroout should have been unchanged
+		 * but its not.
+		 */
+		goto out_free_path;
+
+	if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
+		/*
+		 * Something went wrong in zeroout
+		 */
+		goto out_free_path;
+
+success:
 	if (allocated) {
 		if (map->m_lblk + map->m_len > ee_block + ee_len)
 			*allocated = ee_len - (map->m_lblk - ee_block);
@@ -3450,6 +3541,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 	}
 	ext4_ext_show_leaf(inode, path);
 	return path;
+
+out_free_path:
+	ext4_free_ext_path(path);
+out_orig_err:
+	return ERR_PTR(orig_err);
+
 }
 
 /*
@@ -3485,7 +3582,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth, map_len = map->m_len;
 	int err = 0;
-	int split_flag = EXT4_EXT_DATA_VALID2;
+	int split_flag = 0;
 	unsigned int max_zeroout = 0;
 
 	ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3695,7 +3792,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
 
 fallback:
 	path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
-				 flags, NULL);
+				 flags | EXT4_GET_BLOCKS_CONVERT, NULL);
 	if (IS_ERR(path))
 		return path;
 out:
@@ -3759,11 +3856,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 
-	/* Convert to unwritten */
-	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
-		split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1;
-	/* Split the existing unwritten extent */
-	} else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
+	if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
 			    EXT4_GET_BLOCKS_CONVERT)) {
 		/*
 		 * It is safe to convert extent to initialized via explicit
@@ -3772,9 +3865,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
 		split_flag |= ee_block + ee_len <= eof_block ?
 			      EXT4_EXT_MAY_ZEROOUT : 0;
 		split_flag |= EXT4_EXT_MARK_UNWRIT2;
-		/* Convert to initialized */
-		if (flags & EXT4_GET_BLOCKS_CONVERT)
-			split_flag |= EXT4_EXT_DATA_VALID2;
 	}
 	flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags,
-- 
2.52.0

Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

Posted by Zhang Yi 3 weeks ago

On 1/14/2026 10:57 PM, Ojaswin Mujoo wrote:
> Currently, zeroout is used as a fallback in case we fail to
> split/convert extents in the "traditional" modify-the-extent-tree way.
> This is essential to mitigate failures in critical paths like extent
> splitting during endio. However, the logic is very messy and not easy to
> follow. Further, the fragile use of various flags has made it prone to
> errors.
> 
> Refactor zeroout out logic by moving it up to ext4_split_extents().
> Further, zeroout correctly based on the type of conversion we want, ie:
> - unwritten to written: Zeroout everything around the mapped range.
> - written to unwritten: Zeroout only the mapped range.
> 
> Also, ext4_ext_convert_to_initialized() now passes
> EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> 
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Overall looks good to me besides one comment below. Feel free to add 
after fixing it:

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>

> ---
>   fs/ext4/extents.c | 286 ++++++++++++++++++++++++++++++----------------
>   1 file changed, 188 insertions(+), 98 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 54f45b40fe73..70d85f007dc7 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -44,14 +44,6 @@
>   #define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
>   #define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
>   
> -/* first half contains valid data */
> -#define EXT4_EXT_DATA_ENTIRE_VALID1	0x8   /* has entirely valid data */
> -#define EXT4_EXT_DATA_PARTIAL_VALID1	0x10  /* has partially valid data */
> -#define EXT4_EXT_DATA_VALID1		(EXT4_EXT_DATA_ENTIRE_VALID1 | \
> -					 EXT4_EXT_DATA_PARTIAL_VALID1)
> -
> -#define EXT4_EXT_DATA_VALID2	0x20 /* second half contains valid data */
> -
>   static __le32 ext4_extent_block_csum(struct inode *inode,
>   				     struct ext4_extent_header *eh)
>   {
> @@ -3193,7 +3185,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
>    *  a> the extent are splitted into two extent.
>    *  b> split is not needed, and just mark the extent.
>    *
> - * Return an extent path pointer on success, or an error pointer on failure.
> + * Return an extent path pointer on success, or an error pointer on failure. On
> + * failure, the extent is restored to original state.
>    */
>   static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>   						  struct inode *inode,
> @@ -3203,14 +3196,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>   {
>   	ext4_fsblk_t newblock;
>   	ext4_lblk_t ee_block;
> -	struct ext4_extent *ex, newex, orig_ex, zero_ex;
> +	struct ext4_extent *ex, newex, orig_ex;
>   	struct ext4_extent *ex2 = NULL;
>   	unsigned int ee_len, depth;
> -	int err = 0;
> -
> -	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) == EXT4_EXT_DATA_VALID1);
> -	BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) &&
> -	       (split_flag & EXT4_EXT_DATA_VALID2));
> +	int err = 0, insert_err = 0;
>   
>   	/* Do not cache extents that are in the process of being modified. */
>   	flags |= EXT4_EX_NOCACHE;
> @@ -3276,11 +3265,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>   
>   	path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
>   	if (!IS_ERR(path))
> -		goto out;
> +		return path;
>   
> -	err = PTR_ERR(path);
> -	if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
> -		goto out_path;
> +	insert_err = PTR_ERR(path);
> +	err = 0;
>   
>   	/*
>   	 * Get a new path to try to zeroout or fix the extent length.
> @@ -3296,72 +3284,130 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
>   				 split, PTR_ERR(path));
>   		goto out_path;
>   	}
> +
> +	err = ext4_ext_get_access(handle, inode, path + depth);
> +	if (err)
> +		goto out;
> +
>   	depth = ext_depth(inode);
>   	ex = path[depth].p_ext;
>   
> -	if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
> -		if (split_flag & EXT4_EXT_DATA_VALID1)
> -			memcpy(&zero_ex, ex2, sizeof(zero_ex));
> -		else if (split_flag & EXT4_EXT_DATA_VALID2)
> -			memcpy(&zero_ex, ex, sizeof(zero_ex));
> -		else
> -			memcpy(&zero_ex, &orig_ex, sizeof(zero_ex));
> -		ext4_ext_mark_initialized(&zero_ex);
> +fix_extent_len:
> +	ex->ee_len = orig_ex.ee_len;
> +	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> +out:
> +	if (err || insert_err) {
> +		ext4_free_ext_path(path);
> +		path = err ? ERR_PTR(err) : ERR_PTR(insert_err);
> +	}
> +out_path:
> +	if (IS_ERR(path))
> +		/* Remove all remaining potentially stale extents. */
> +		ext4_es_remove_extent(inode, ee_block, ee_len);
> +	ext4_ext_show_leaf(inode, path);
> +	return path;
> +}
>   
> -		err = ext4_ext_zeroout(inode, &zero_ex);
> -		if (err)
> -			goto fix_extent_len;
> +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> +				     struct ext4_ext_path *path,
> +				     struct ext4_map_blocks *map, int flags)
> +{
> +	struct ext4_extent *ex;
> +	unsigned int ee_len, depth;
> +	ext4_lblk_t ee_block;
> +	uint64_t lblk, pblk, len;
> +	int is_unwrit;
> +	int err = 0;
> +
> +	depth = ext_depth(inode);
> +	ex = path[depth].p_ext;
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = ext4_ext_get_actual_len(ex);
> +	is_unwrit = ext4_ext_is_unwritten(ex);
>   
> +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
>   		/*
> -		 * The first half contains partially valid data, the splitting
> -		 * of this extent has not been completed, fix extent length
> -		 * and ext4_split_extent() split will the first half again.
> +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> +		 * map to be initialized. Zeroout everything except the map
> +		 * range.
>   		 */
> -		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
> -			/*
> -			 * Drop extent cache to prevent stale unwritten
> -			 * extents remaining after zeroing out.
> -			 */
> -			ext4_es_remove_extent(inode,
> -					le32_to_cpu(zero_ex.ee_block),
> -					ext4_ext_get_actual_len(&zero_ex));
> -			goto fix_extent_len;
> +
> +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> +		loff_t ex_end = (loff_t) ee_block + ee_len;
> +
> +		if (!is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return -EINVAL;
> +
> +		/* zeroout left */
> +		if (map->m_lblk > ee_block) {
> +			lblk = ee_block;
> +			len = map->m_lblk - ee_block;
> +			pblk = ext4_ext_pblock(ex);
> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +			if (err)
> +				/* ZEROOUT failed, just return original error */
> +				return err;
>   		}
>   
> -		/* update the extent length and mark as initialized */
> -		ex->ee_len = cpu_to_le16(ee_len);
> -		ext4_ext_try_to_merge(handle, inode, path, ex);
> -		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> -		if (!err)
> -			/* update extent status tree */
> -			ext4_zeroout_es(inode, &zero_ex);
> +		/* zeroout right */
> +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> +			lblk = map_end;
> +			len = ex_end - map_end;
> +			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +			if (err)
> +				/* ZEROOUT failed, just return original error */
> +				return err;
> +		}
> +	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
>   		/*
> -		 * If we failed at this point, we don't know in which
> -		 * state the extent tree exactly is so don't try to fix
> -		 * length of the original extent as it may do even more
> -		 * damage.
> +		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
> +		 * range specified by map to be marked unwritten.
> +		 * Zeroout the map range leaving rest as it is.
>   		 */
> -		goto out;
> +
> +		if (is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return -EINVAL;
> +
> +		lblk = map->m_lblk;
> +		len = map->m_len;
> +		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
> +		err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +		if (err)
> +			/* ZEROOUT failed, just return original error */
> +			return err;
> +	} else {
> +		/*
> +		 * We no longer perform unwritten to unwritten splits in IO paths.
> +		 * Hence this should not happen.
> +		 */
> +		WARN_ON_ONCE(true);
> +		return -EINVAL;
>   	}
>   
> -fix_extent_len:
> -	ex->ee_len = orig_ex.ee_len;
> +	err = ext4_ext_get_access(handle, inode, path + depth);
> +	if (err)
> +		return err;
> +
> +	ext4_ext_mark_initialized(ex);
> +
> +	ext4_ext_dirty(handle, inode, path + path->p_depth);
> +	if (err)
> +		return err;
> +
>   	/*
> -	 * Ignore ext4_ext_dirty return value since we are already in error path
> -	 * and err is a non-zero error code.
> +	 * The whole extent is initialized and stable now so it can be added to
> +	 * es cache
>   	 */
> -	ext4_ext_dirty(handle, inode, path + path->p_depth);
> -out:
> -	if (err) {
> -		ext4_free_ext_path(path);
> -		path = ERR_PTR(err);
> -	}
> -out_path:
> -	if (IS_ERR(path))
> -		/* Remove all remaining potentially stale extents. */
> -		ext4_es_remove_extent(inode, ee_block, ee_len);
> -	ext4_ext_show_leaf(inode, path);
> -	return path;
> +	if (!(flags & EXT4_EX_NOCACHE))
> +		ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block),
> +				      ext4_ext_get_actual_len(ex),
> +				      ext4_ext_pblock(ex),
> +				      EXTENT_STATUS_WRITTEN, false);
> +
> +	return 0;
>   }
>   
>   /*
> @@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>   					       int split_flag, int flags,
>   					       unsigned int *allocated)
>   {
> -	ext4_lblk_t ee_block;
> +	ext4_lblk_t ee_block, orig_ee_block;
>   	struct ext4_extent *ex;
> -	unsigned int ee_len, depth;
> -	int unwritten;
> -	int split_flag1, flags1;
> +	unsigned int ee_len, orig_ee_len, depth;
> +	int unwritten, orig_unwritten;
> +	int split_flag1 = 0, flags1 = 0;
> +	int  orig_err = 0;
> +	int orig_flags = flags;
>   
>   	depth = ext_depth(inode);
>   	ex = path[depth].p_ext;
> @@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>   	ee_len = ext4_ext_get_actual_len(ex);
>   	unwritten = ext4_ext_is_unwritten(ex);
>   
> +	orig_ee_block = ee_block;
> +	orig_ee_len = ee_len;
> +	orig_unwritten = unwritten;
> +
>   	/* Do not cache extents that are in the process of being modified. */
>   	flags |= EXT4_EX_NOCACHE;
>   
>   	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
>   		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
>   		if (unwritten)
>   			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
>   				       EXT4_EXT_MARK_UNWRIT2;
> -		if (split_flag & EXT4_EXT_DATA_VALID2)
> -			split_flag1 |= map->m_lblk > ee_block ?
> -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> -				       EXT4_EXT_DATA_ENTIRE_VALID1;
>   		path = ext4_split_extent_at(handle, inode, path,
>   				map->m_lblk + map->m_len, split_flag1, flags1);
>   		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
> +
>   		/*
>   		 * Update path is required because previous ext4_split_extent_at
>   		 * may result in split of original leaf or extent zeroout.
>   		 */
>   		path = ext4_find_extent(inode, map->m_lblk, path, flags);
>   		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
> +
>   		depth = ext_depth(inode);
>   		ex = path[depth].p_ext;
>   		if (!ex) {
> @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>   			ext4_free_ext_path(path);
>   			return ERR_PTR(-EFSCORRUPTED);
>   		}
> -		unwritten = ext4_ext_is_unwritten(ex);

We need to update the 'orig_ee_len' parameter here, otherwise
it would trigger WARN_ON(ee_len != orig_ee_len) below.

Thanks,
Yi.

>   	}
>   
>   	if (map->m_lblk >= ee_block) {
> -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> +		split_flag1 = 0;
>   		if (unwritten) {
>   			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> -						     EXT4_EXT_MARK_UNWRIT2);
> +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
>   		}
> -		path = ext4_split_extent_at(handle, inode, path,
> -				map->m_lblk, split_flag1, flags);
> +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> +					    split_flag1, flags);
>   		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
>   	}
>   
> +	goto success;
> +
> +try_zeroout:
> +	/*
> +	 * There was an error in splitting the extent. So instead, just zeroout
> +	 * unwritten portions and convert it to initialize as a last resort. If
> +	 * there is any failure here we just return the original error
> +	 */
> +
> +	orig_err = PTR_ERR(path);
> +	if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
> +		goto out_orig_err;
> +
> +	if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
> +		/* There's an error and we can't zeroout, just return the
> +		 * original err
> +		 */
> +		goto out_orig_err;
> +
> +	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> +	if (IS_ERR(path))
> +		goto out_orig_err;
> +
> +	depth = ext_depth(inode);
> +	ex = path[depth].p_ext;
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = ext4_ext_get_actual_len(ex);
> +	unwritten = ext4_ext_is_unwritten(ex);
> +
> +	if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
> +		    unwritten != orig_unwritten))
> +		/*
> +		 * The extent to zeroout should have been unchanged
> +		 * but its not.
> +		 */
> +		goto out_free_path;
> +
> +	if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
> +		/*
> +		 * Something went wrong in zeroout
> +		 */
> +		goto out_free_path;
> +
> +success:
>   	if (allocated) {
>   		if (map->m_lblk + map->m_len > ee_block + ee_len)
>   			*allocated = ee_len - (map->m_lblk - ee_block);
> @@ -3450,6 +3541,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>   	}
>   	ext4_ext_show_leaf(inode, path);
>   	return path;
> +
> +out_free_path:
> +	ext4_free_ext_path(path);
> +out_orig_err:
> +	return ERR_PTR(orig_err);
> +
>   }
>   
>   /*
> @@ -3485,7 +3582,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
>   	ext4_lblk_t ee_block, eof_block;
>   	unsigned int ee_len, depth, map_len = map->m_len;
>   	int err = 0;
> -	int split_flag = EXT4_EXT_DATA_VALID2;
> +	int split_flag = 0;
>   	unsigned int max_zeroout = 0;
>   
>   	ext_debug(inode, "logical block %llu, max_blocks %u\n",
> @@ -3695,7 +3792,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
>   
>   fallback:
>   	path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
> -				 flags, NULL);
> +				 flags | EXT4_GET_BLOCKS_CONVERT, NULL);
>   	if (IS_ERR(path))
>   		return path;
>   out:
> @@ -3759,11 +3856,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
>   	ee_block = le32_to_cpu(ex->ee_block);
>   	ee_len = ext4_ext_get_actual_len(ex);
>   
> -	/* Convert to unwritten */
> -	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
> -		split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1;
> -	/* Split the existing unwritten extent */
> -	} else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
> +	if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
>   			    EXT4_GET_BLOCKS_CONVERT)) {
>   		/*
>   		 * It is safe to convert extent to initialized via explicit
> @@ -3772,9 +3865,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
>   		split_flag |= ee_block + ee_len <= eof_block ?
>   			      EXT4_EXT_MAY_ZEROOUT : 0;
>   		split_flag |= EXT4_EXT_MARK_UNWRIT2;
> -		/* Convert to initialized */
> -		if (flags & EXT4_GET_BLOCKS_CONVERT)
> -			split_flag |= EXT4_EXT_DATA_VALID2;
>   	}
>   	flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
>   	return ext4_split_extent(handle, inode, path, map, split_flag, flags,

Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 2 weeks, 5 days ago

On Sat, Jan 17, 2026 at 04:00:30PM +0800, Zhang Yi wrote:
> On 1/14/2026 10:57 PM, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> > 
> > Refactor zeroout out logic by moving it up to ext4_split_extents().
> > Further, zeroout correctly based on the type of conversion we want, ie:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - written to unwritten: Zeroout only the mapped range.
> > 
> > Also, ext4_ext_convert_to_initialized() now passes
> > EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> Overall looks good to me besides one comment below. Feel free to add after
> fixing it:
> 
> Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
> 
> > ---
> >   fs/ext4/extents.c | 286 ++++++++++++++++++++++++++++++----------------
> >   1 file changed, 188 insertions(+), 98 deletions(-)
> > 
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 54f45b40fe73..70d85f007dc7 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -44,14 +44,6 @@
> >   #define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
> >   #define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

<...>

> > -			split_flag1 |= map->m_lblk > ee_block ?
> > -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> > -				       EXT4_EXT_DATA_ENTIRE_VALID1;
> >   		path = ext4_split_extent_at(handle, inode, path,
> >   				map->m_lblk + map->m_len, split_flag1, flags1);
> >   		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >   		/*
> >   		 * Update path is required because previous ext4_split_extent_at
> >   		 * may result in split of original leaf or extent zeroout.
> >   		 */
> >   		path = ext4_find_extent(inode, map->m_lblk, path, flags);
> >   		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >   		depth = ext_depth(inode);
> >   		ex = path[depth].p_ext;
> >   		if (!ex) {
> > @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >   			ext4_free_ext_path(path);
> >   			return ERR_PTR(-EFSCORRUPTED);
> >   		}
> > -		unwritten = ext4_ext_is_unwritten(ex);
> 
> We need to update the 'orig_ee_len' parameter here, otherwise
> it would trigger WARN_ON(ee_len != orig_ee_len) below.
> 
> Thanks,
> Yi.

Hi Yi, thanks for the reviews! 

Yes thats a nice catch, I'll change it in v3.

Regards,
ojaswin

> 
> >   	}
> >   	if (map->m_lblk >= ee_block) {
> > -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> > +		split_flag1 = 0;
> >   		if (unwritten) {
> >   			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> > -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> > -						     EXT4_EXT_MARK_UNWRIT2);
> > +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
> >   		}

<...>

Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

Posted by Jan Kara 3 weeks, 2 days ago

On Wed 14-01-26 20:27:50, Ojaswin Mujoo wrote:
> Currently, zeroout is used as a fallback in case we fail to
> split/convert extents in the "traditional" modify-the-extent-tree way.
> This is essential to mitigate failures in critical paths like extent
> splitting during endio. However, the logic is very messy and not easy to
> follow. Further, the fragile use of various flags has made it prone to
> errors.
> 
> Refactor zeroout out logic by moving it up to ext4_split_extents().
> Further, zeroout correctly based on the type of conversion we want, ie:
> - unwritten to written: Zeroout everything around the mapped range.
> - written to unwritten: Zeroout only the mapped range.
> 
> Also, ext4_ext_convert_to_initialized() now passes
> EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> 
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Overall looks nice. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

A few nits below:

> +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> +				     struct ext4_ext_path *path,
> +				     struct ext4_map_blocks *map, int flags)
> +{
> +	struct ext4_extent *ex;
> +	unsigned int ee_len, depth;
> +	ext4_lblk_t ee_block;
> +	uint64_t lblk, pblk, len;
> +	int is_unwrit;
> +	int err = 0;
> +
> +	depth = ext_depth(inode);
> +	ex = path[depth].p_ext;
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = ext4_ext_get_actual_len(ex);
> +	is_unwrit = ext4_ext_is_unwritten(ex);
>  
> +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
>  		/*
> -		 * The first half contains partially valid data, the splitting
> -		 * of this extent has not been completed, fix extent length
> -		 * and ext4_split_extent() split will the first half again.
> +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> +		 * map to be initialized. Zeroout everything except the map
> +		 * range.
>  		 */
> -		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
> -			/*
> -			 * Drop extent cache to prevent stale unwritten
> -			 * extents remaining after zeroing out.
> -			 */
> -			ext4_es_remove_extent(inode,
> -					le32_to_cpu(zero_ex.ee_block),
> -					ext4_ext_get_actual_len(&zero_ex));
> -			goto fix_extent_len;
> +
> +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> +		loff_t ex_end = (loff_t) ee_block + ee_len;
> +
> +		if (!is_unwrit)
> +			/* Shouldn't happen. Just exit */
> +			return -EINVAL;
> +
> +		/* zeroout left */
> +		if (map->m_lblk > ee_block) {
> +			lblk = ee_block;
> +			len = map->m_lblk - ee_block;
> +			pblk = ext4_ext_pblock(ex);
> +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> +			if (err)
> +				/* ZEROOUT failed, just return original error */
> +				return err;
>  		}
>  
> -		/* update the extent length and mark as initialized */
> -		ex->ee_len = cpu_to_le16(ee_len);
> -		ext4_ext_try_to_merge(handle, inode, path, ex);
> -		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> -		if (!err)
> -			/* update extent status tree */
> -			ext4_zeroout_es(inode, &zero_ex);
> +		/* zeroout right */
> +		if (map->m_lblk + map->m_len < ee_block + ee_len) {

Use map_end and ex_end in the above condition when we have it?

...
> @@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  					       int split_flag, int flags,
>  					       unsigned int *allocated)
>  {
> -	ext4_lblk_t ee_block;
> +	ext4_lblk_t ee_block, orig_ee_block;
>  	struct ext4_extent *ex;
> -	unsigned int ee_len, depth;
> -	int unwritten;
> -	int split_flag1, flags1;
> +	unsigned int ee_len, orig_ee_len, depth;
> +	int unwritten, orig_unwritten;
> +	int split_flag1 = 0, flags1 = 0;
> +	int  orig_err = 0;
	   ^^ extra space

> +	int orig_flags = flags;
>  
>  	depth = ext_depth(inode);
>  	ex = path[depth].p_ext;
> @@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  	ee_len = ext4_ext_get_actual_len(ex);
>  	unwritten = ext4_ext_is_unwritten(ex);
>  
> +	orig_ee_block = ee_block;
> +	orig_ee_len = ee_len;
> +	orig_unwritten = unwritten;
> +
>  	/* Do not cache extents that are in the process of being modified. */
>  	flags |= EXT4_EX_NOCACHE;
>  
>  	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
>  		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
>  		if (unwritten)
>  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
>  				       EXT4_EXT_MARK_UNWRIT2;
> -		if (split_flag & EXT4_EXT_DATA_VALID2)
> -			split_flag1 |= map->m_lblk > ee_block ?
> -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> -				       EXT4_EXT_DATA_ENTIRE_VALID1;
>  		path = ext4_split_extent_at(handle, inode, path,
>  				map->m_lblk + map->m_len, split_flag1, flags1);
>  		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
> +
>  		/*
>  		 * Update path is required because previous ext4_split_extent_at
>  		 * may result in split of original leaf or extent zeroout.
>  		 */
>  		path = ext4_find_extent(inode, map->m_lblk, path, flags);
>  		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
> +
>  		depth = ext_depth(inode);
>  		ex = path[depth].p_ext;
>  		if (!ex) {
> @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
>  			ext4_free_ext_path(path);
>  			return ERR_PTR(-EFSCORRUPTED);
>  		}
> -		unwritten = ext4_ext_is_unwritten(ex);
>  	}
>  
>  	if (map->m_lblk >= ee_block) {
> -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> +		split_flag1 = 0;
>  		if (unwritten) {
>  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> -						     EXT4_EXT_MARK_UNWRIT2);
> +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
>  		}
> -		path = ext4_split_extent_at(handle, inode, path,
> -				map->m_lblk, split_flag1, flags);
> +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> +					    split_flag1, flags);
>  		if (IS_ERR(path))
> -			return path;
> +			goto try_zeroout;
>  	}
>  
> +	goto success;
> +
> +try_zeroout:
> +	/*
> +	 * There was an error in splitting the extent. So instead, just zeroout
> +	 * unwritten portions and convert it to initialize as a last resort. If
> +	 * there is any failure here we just return the original error
> +	 */
> +
> +	orig_err = PTR_ERR(path);
> +	if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
> +		goto out_orig_err;
> +
> +	if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
> +		/* There's an error and we can't zeroout, just return the
> +		 * original err
> +		 */

I'd put this before if and just write:

	/* We can't zeroout? Just return the original error */

so that the comment fits on a single line :)

> +		goto out_orig_err;
> +
> +	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> +	if (IS_ERR(path))
> +		goto out_orig_err;
> +
> +	depth = ext_depth(inode);
> +	ex = path[depth].p_ext;
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = ext4_ext_get_actual_len(ex);
> +	unwritten = ext4_ext_is_unwritten(ex);
> +
> +	if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
> +		    unwritten != orig_unwritten))
> +		/*
> +		 * The extent to zeroout should have been unchanged
> +		 * but its not.
> +		 */
> +		goto out_free_path;
> +
> +	if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
> +		/*
> +		 * Something went wrong in zeroout
> +		 */

I think this comment isn't really useful...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

Posted by Ojaswin Mujoo 3 weeks, 1 day ago

On Thu, Jan 15, 2026 at 01:01:14PM +0100, Jan Kara wrote:
> On Wed 14-01-26 20:27:50, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> > 
> > Refactor zeroout out logic by moving it up to ext4_split_extents().
> > Further, zeroout correctly based on the type of conversion we want, ie:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - written to unwritten: Zeroout only the mapped range.
> > 
> > Also, ext4_ext_convert_to_initialized() now passes
> > EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> Overall looks nice. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

Thanks for the review Jan, I'll make the changes you suggested in v3.

Regards,
ojaswin
> 
> A few nits below:
> 
> > +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> > +				     struct ext4_ext_path *path,
> > +				     struct ext4_map_blocks *map, int flags)
> > +{
> > +	struct ext4_extent *ex;
> > +	unsigned int ee_len, depth;
> > +	ext4_lblk_t ee_block;
> > +	uint64_t lblk, pblk, len;
> > +	int is_unwrit;
> > +	int err = 0;
> > +
> > +	depth = ext_depth(inode);
> > +	ex = path[depth].p_ext;
> > +	ee_block = le32_to_cpu(ex->ee_block);
> > +	ee_len = ext4_ext_get_actual_len(ex);
> > +	is_unwrit = ext4_ext_is_unwritten(ex);
> >  
> > +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
> >  		/*
> > -		 * The first half contains partially valid data, the splitting
> > -		 * of this extent has not been completed, fix extent length
> > -		 * and ext4_split_extent() split will the first half again.
> > +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> > +		 * map to be initialized. Zeroout everything except the map
> > +		 * range.
> >  		 */
> > -		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
> > -			/*
> > -			 * Drop extent cache to prevent stale unwritten
> > -			 * extents remaining after zeroing out.
> > -			 */
> > -			ext4_es_remove_extent(inode,
> > -					le32_to_cpu(zero_ex.ee_block),
> > -					ext4_ext_get_actual_len(&zero_ex));
> > -			goto fix_extent_len;
> > +
> > +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> > +		loff_t ex_end = (loff_t) ee_block + ee_len;
> > +
> > +		if (!is_unwrit)
> > +			/* Shouldn't happen. Just exit */
> > +			return -EINVAL;
> > +
> > +		/* zeroout left */
> > +		if (map->m_lblk > ee_block) {
> > +			lblk = ee_block;
> > +			len = map->m_lblk - ee_block;
> > +			pblk = ext4_ext_pblock(ex);
> > +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +			if (err)
> > +				/* ZEROOUT failed, just return original error */
> > +				return err;
> >  		}
> >  
> > -		/* update the extent length and mark as initialized */
> > -		ex->ee_len = cpu_to_le16(ee_len);
> > -		ext4_ext_try_to_merge(handle, inode, path, ex);
> > -		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> > -		if (!err)
> > -			/* update extent status tree */
> > -			ext4_zeroout_es(inode, &zero_ex);
> > +		/* zeroout right */
> > +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> 
> Use map_end and ex_end in the above condition when we have it?
> 
> ...
> > @@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  					       int split_flag, int flags,
> >  					       unsigned int *allocated)
> >  {
> > -	ext4_lblk_t ee_block;
> > +	ext4_lblk_t ee_block, orig_ee_block;
> >  	struct ext4_extent *ex;
> > -	unsigned int ee_len, depth;
> > -	int unwritten;
> > -	int split_flag1, flags1;
> > +	unsigned int ee_len, orig_ee_len, depth;
> > +	int unwritten, orig_unwritten;
> > +	int split_flag1 = 0, flags1 = 0;
> > +	int  orig_err = 0;
> 	   ^^ extra space
> 
> > +	int orig_flags = flags;
> >  
> >  	depth = ext_depth(inode);
> >  	ex = path[depth].p_ext;
> > @@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  	ee_len = ext4_ext_get_actual_len(ex);
> >  	unwritten = ext4_ext_is_unwritten(ex);
> >  
> > +	orig_ee_block = ee_block;
> > +	orig_ee_len = ee_len;
> > +	orig_unwritten = unwritten;
> > +
> >  	/* Do not cache extents that are in the process of being modified. */
> >  	flags |= EXT4_EX_NOCACHE;
> >  
> >  	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> > -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
> >  		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
> >  		if (unwritten)
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
> >  				       EXT4_EXT_MARK_UNWRIT2;
> > -		if (split_flag & EXT4_EXT_DATA_VALID2)
> > -			split_flag1 |= map->m_lblk > ee_block ?
> > -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> > -				       EXT4_EXT_DATA_ENTIRE_VALID1;
> >  		path = ext4_split_extent_at(handle, inode, path,
> >  				map->m_lblk + map->m_len, split_flag1, flags1);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >  		/*
> >  		 * Update path is required because previous ext4_split_extent_at
> >  		 * may result in split of original leaf or extent zeroout.
> >  		 */
> >  		path = ext4_find_extent(inode, map->m_lblk, path, flags);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >  		depth = ext_depth(inode);
> >  		ex = path[depth].p_ext;
> >  		if (!ex) {
> > @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  			ext4_free_ext_path(path);
> >  			return ERR_PTR(-EFSCORRUPTED);
> >  		}
> > -		unwritten = ext4_ext_is_unwritten(ex);
> >  	}
> >  
> >  	if (map->m_lblk >= ee_block) {
> > -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> > +		split_flag1 = 0;
> >  		if (unwritten) {
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> > -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> > -						     EXT4_EXT_MARK_UNWRIT2);
> > +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
> >  		}
> > -		path = ext4_split_extent_at(handle, inode, path,
> > -				map->m_lblk, split_flag1, flags);
> > +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> > +					    split_flag1, flags);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> >  	}
> >  
> > +	goto success;
> > +
> > +try_zeroout:
> > +	/*
> > +	 * There was an error in splitting the extent. So instead, just zeroout
> > +	 * unwritten portions and convert it to initialize as a last resort. If
> > +	 * there is any failure here we just return the original error
> > +	 */
> > +
> > +	orig_err = PTR_ERR(path);
> > +	if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
> > +		goto out_orig_err;
> > +
> > +	if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
> > +		/* There's an error and we can't zeroout, just return the
> > +		 * original err
> > +		 */
> 
> I'd put this before if and just write:
> 
> 	/* We can't zeroout? Just return the original error */
> 
> so that the comment fits on a single line :)
> 
> > +		goto out_orig_err;
> > +
> > +	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> > +	if (IS_ERR(path))
> > +		goto out_orig_err;
> > +
> > +	depth = ext_depth(inode);
> > +	ex = path[depth].p_ext;
> > +	ee_block = le32_to_cpu(ex->ee_block);
> > +	ee_len = ext4_ext_get_actual_len(ex);
> > +	unwritten = ext4_ext_is_unwritten(ex);
> > +
> > +	if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
> > +		    unwritten != orig_unwritten))
> > +		/*
> > +		 * The extent to zeroout should have been unchanged
> > +		 * but its not.
> > +		 */
> > +		goto out_free_path;
> > +
> > +	if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
> > +		/*
> > +		 * Something went wrong in zeroout
> > +		 */
> 
> I think this comment isn't really useful...
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR