Address the feedback from "mm/page-writeback: raise wb_thresh to prevent
write blocking with strictlimit" (39ac99852fca98ca44d52716d792dfaf24981f53).
The wb_thresh bumping logic is scattered across wb_position_ratio,
__wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency,
consolidate all wb_thresh bumping logic into __wb_calc_thresh.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com>
---
mm/page-writeback.c | 53 ++++++++++++++-------------------------------
1 file changed, 16 insertions(+), 37 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d213ead95675..8b13bcb42de3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -936,26 +936,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > wb_max_thresh)
- wb_thresh = wb_max_thresh;
/*
- * With strictlimit flag, the wb_thresh is treated as
- * a hard limit in balance_dirty_pages() and wb_position_ratio().
- * It's possible that wb_thresh is close to zero, not because
- * the device is slow, but because it has been inactive.
- * To prevent occasional writes from being blocked, we raise wb_thresh.
+ * It's very possible that wb_thresh is close to 0 not because the
+ * device is slow, but that it has remained inactive for long time.
+ * Honour such devices a reasonable good (hopefully IO efficient)
+ * threshold, so that the occasional writes won't be blocked and active
+ * writes can rampup the threshold quickly.
*/
- if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
- u64 wb_scale_thresh = 0;
-
- if (limit > dtc->dirty)
- wb_scale_thresh = (limit - dtc->dirty) / 100;
- wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ if (thresh > dtc->dirty) {
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
+ else
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
}
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
return wb_thresh;
}
@@ -963,6 +962,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ domain_dirty_avail(&gdtc, true);
return __wb_calc_thresh(&gdtc, thresh);
}
@@ -1139,12 +1139,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
- if (dtc->wb_dirty < 8) {
- dtc->pos_ratio = min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
- return;
- }
-
if (dtc->wb_dirty >= wb_thresh)
return;
@@ -1215,14 +1209,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
*/
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
- /*
- * It's very possible that wb_thresh is close to 0 not because the
- * device is slow, but that it has remained inactive for long time.
- * Honour such devices a reasonable good (hopefully IO efficient)
- * threshold, so that the occasional writes won't be blocked and active
- * writes can rampup the threshold quickly.
- */
- wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
@@ -1478,17 +1464,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if wb_dirty is low because
- * it's possible that wb_thresh is close to zero due to inactivity
- * of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
- if (dtc->wb_dirty < 8)
- setpoint = dtc->wb_dirty + 1;
- else
- setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
--
2.20.1
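
For reference in the discussion that follows, the net effect of the diff above can be summarized by the simplified sketch below. This is an illustration only, not the kernel code: the min_ratio contribution, the BDI_RATIO_SCALE fixed-point scaling and all locking are omitted, and the helper names (bump_wb_thresh, max_ul) are invented for the sketch.

/*
 * Simplified illustration of the wb_thresh bumping consolidated into
 * __wb_calc_thresh() by this patch.  The structure of the max()/cap steps
 * mirrors the diff above; everything else is left out.
 */
#include <stdbool.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

static unsigned long bump_wb_thresh(unsigned long wb_thresh,
				    unsigned long thresh,       /* dtc->thresh */
				    unsigned long dirty,        /* dtc->dirty  */
				    unsigned long wb_max_ratio, /* percent     */
				    bool strictlimit)
{
	unsigned long wb_max_thresh;

	/*
	 * A long-idle device can end up with wb_thresh close to 0.  Grant it
	 * a share of the remaining headroom so occasional writes are not
	 * blocked: 1/100 of (thresh - dirty) under strictlimit, 1/8 otherwise.
	 */
	if (thresh > dirty)
		wb_thresh = max_ul(wb_thresh,
				   (thresh - dirty) / (strictlimit ? 100 : 8));

	/* The bump is still capped by this writeback's max_ratio share. */
	wb_max_thresh = thresh * wb_max_ratio / 100;
	if (wb_thresh > wb_max_thresh)
		wb_thresh = wb_max_thresh;

	return wb_thresh;
}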
From: Joshua Watt <jpewhacker@gmail.com>

This patch strangely breaks NFS 4 clients for me. The behavior is that a
client will start getting an I/O error, which in turn is caused by the client
getting a NFS3ERR_BADSESSION when attempting to write data to the server. I
bisected the kernel from the latest master
(9029dc666353504ea7c1ebfdf09bc1aab40f6147) to this commit (log below). Also,
when I revert this commit on master the bug disappears.

The server is running kernel 5.4.161, and the client that exhibits the
behavior is running in qemux86, and has mounted the server with the options
rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,soft,proto=tcp,port=52049,timeo=600,retrans=2,sec=null,clientaddr=172.16.6.90,local_lock=none,addr=172.16.6.0

The program that I wrote to reproduce this is pretty simple; it does a file
lock over NFS, then writes data to the file once per second. After about 32
seconds, it receives the I/O error, and this reproduced every time. I can
provide the sample program if necessary.

I also captured the NFS traffic both in the passing case and the failure
case, and can provide them if useful. I did look at the two dumps and I'm not
exactly sure what the difference is, other than with this patch the client
tries to write every 30 seconds (and fails), whereas without it the client
attempts to write back every 5 seconds. I have no idea why this patch would
cause this problem. Any help is appreciated. Thank you.

git bisect start
# status: waiting for both good and bad commits
# good: [4fe89d07dcc2804c8b562f6c7896a45643d34b2f] Linux 6.0
git bisect good 4fe89d07dcc2804c8b562f6c7896a45643d34b2f
# status: waiting for bad commit, 1 good commit known
# bad: [e5f0a698b34ed76002dc5cff3804a61c80233a7a] Linux 6.17
git bisect bad e5f0a698b34ed76002dc5cff3804a61c80233a7a
# good: [5f20e6ab1f65aaaaae248e6946d5cb6d039e7de8] Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
git bisect good 5f20e6ab1f65aaaaae248e6946d5cb6d039e7de8
# good: [28eb75e178d389d325f1666e422bc13bbbb9804c] Merge tag 'drm-next-2024-11-21' of https://gitlab.freedesktop.org/drm/kernel
git bisect good 28eb75e178d389d325f1666e422bc13bbbb9804c
# bad: [1afba39f9305fe4061a4e70baa6ebab9d41459da] Merge drm/drm-next into drm-misc-next
git bisect bad 1afba39f9305fe4061a4e70baa6ebab9d41459da
# bad: [350130afc22bd083ea18e17452dd3979c88b08ff] Merge tag 'ubifs-for-linus-6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs
git bisect bad 350130afc22bd083ea18e17452dd3979c88b08ff
# good: [cf33d96f50903214226b379b3f10d1f262dae018] Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
git bisect good cf33d96f50903214226b379b3f10d1f262dae018
# good: [c9c0543b52d8cfe3a3b15d1e39ab9dbc91be6df4] Merge tag 'platform-drivers-x86-v6.14-1' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86
git bisect good c9c0543b52d8cfe3a3b15d1e39ab9dbc91be6df4
# good: [40648d246fa4307ef11d185933cb0d79fc9ff46c] Merge tag 'trace-tools-v6.14' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
git bisect good 40648d246fa4307ef11d185933cb0d79fc9ff46c
# bad: [125ca745467d4f87ae58e671a4a5714e024d2908] Merge tag 'staging-6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging
git bisect bad 125ca745467d4f87ae58e671a4a5714e024d2908
# bad: [d1366e74342e75555af2648a2964deb2d5c92200] mm/compaction: fix UBSAN shift-out-of-bounds warning
git bisect bad d1366e74342e75555af2648a2964deb2d5c92200
# good: [553e77529fb61e5520b839a0ce412a46cba996e0] mm: pgtable: introduce generic pagetable_dtor_free()
git bisect good 553e77529fb61e5520b839a0ce412a46cba996e0
# good: [65a1cf15802c7a571299df507b1b72ba89ef1da8] mm/zsmalloc: convert migrate_zspage() to use zpdesc
git bisect good 65a1cf15802c7a571299df507b1b72ba89ef1da8
# good: [136c5b40e0ad84f4b4a38584089cd565b97f799c] selftests/mm: use selftests framework to print test result
git bisect good 136c5b40e0ad84f4b4a38584089cd565b97f799c
# good: [fb7d3bc4149395c1ae99029c852eab6c28fc3c88] mm/filemap: drop streaming/uncached pages when writeback completes
git bisect good fb7d3bc4149395c1ae99029c852eab6c28fc3c88
# good: [7882d8fc8fe0c2b2a01f09e56edf82df6b3013fd] selftests/mm/mkdirty: fix memory leak in test_uffdio_copy()
git bisect good 7882d8fc8fe0c2b2a01f09e56edf82df6b3013fd
# bad: [6aeb991c54b281710591ce388158bc1739afc227] mm/page-writeback: consolidate wb_thresh bumping logic into __wb_calc_thresh
git bisect bad 6aeb991c54b281710591ce388158bc1739afc227
# good: [f752e677f85993c812fe9de7b4427f3f18408a11] mm: separate move/undo parts from migrate_pages_batch()
git bisect good f752e677f85993c812fe9de7b4427f3f18408a11
# good: [686fa9537d78d2f1bea42bf3891828510202be14] mm/page_alloc: remove the incorrect and misleading comment
git bisect good 686fa9537d78d2f1bea42bf3891828510202be14
# first bad commit: [6aeb991c54b281710591ce388158bc1739afc227] mm/page-writeback: consolidate wb_thresh bumping logic into __wb_calc_thresh
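
The sample program is offered but not included in the thread; the following is only a guess at what such a reproducer might look like, a minimal sketch matching the description above (file lock over NFS, one write per second). The path and iteration count are assumptions, not taken from the report.

/* Hypothetical reproducer sketch: lock a file on an NFS mount, then write
 * to it once per second.  Path and loop length are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/nfs/testfile";	/* assumed NFS mount */
	int fd = open(path, O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,	/* l_start = l_len = 0: whole file */
	};
	if (fcntl(fd, F_SETLKW, &fl) < 0) {
		perror("fcntl(F_SETLKW)");
		return 1;
	}

	for (int i = 0; i < 120; i++) {		/* failure reported after ~32s */
		char buf[64];
		int len = snprintf(buf, sizeof(buf), "tick %d\n", i);
		if (write(fd, buf, len) < 0) {	/* the EIO shows up here */
			perror("write");
			return 1;
		}
		sleep(1);
	}

	close(fd);
	return 0;
}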
Hello!

On Tue 07-10-25 10:17:11, Joshua Watt wrote:
> From: Joshua Watt <jpewhacker@gmail.com>
>
> This patch strangely breaks NFS 4 clients for me. The behavior is that a
> client will start getting an I/O error which in turn is caused by the client
> getting a NFS3ERR_BADSESSION when attempting to write data to the server. I
> bisected the kernel from the latest master
> (9029dc666353504ea7c1ebfdf09bc1aab40f6147) to this commit (log below). Also,
> when I revert this commit on master the bug disappears.
>
> The server is running kernel 5.4.161, and the client that exhibits the
> behavior is running in qemux86, and has mounted the server with the options
> rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,soft,proto=tcp,port=52049,timeo=600,retrans=2,sec=null,clientaddr=172.16.6.90,local_lock=none,addr=172.16.6.0
>
> The program that I wrote to reproduce this is pretty simple; it does a file
> lock over NFS, then writes data to the file once per second. After about 32
> seconds, it receives the I/O error, and this reproduced every time. I can
> provide the sample program if necessary.

This is indeed rather curious.

> I also captured the NFS traffic both in the passing case and the failure case,
> and can provide them if useful.
>
> I did look at the two dumps and I'm not exactly sure what the difference is,
> other than with this patch the client tries to write every 30 seconds (and
> fails), where as without it attempts to write back every 5 seconds. I have no
> idea why this patch would cause this problem.

So the change in writeback behavior is not surprising. The commit does
modify the logic computing dirty limits in some corner cases and your
description matches the fact that previously the computed limits were lower
so we've started writeback after 5s (dirty_writeback_interval) while with
the patch we didn't cross the threshold and thus started writeback only
once the dirty data was old enough, which is 30s (dirty_expire_interval).

But that's all, you should be able to observe exactly the same writeback
behavior if you write less even without this patch. So I suspect that the
different writeback behavior is just triggering some bug in the NFS (either
on the client or the server side). The NFS3ERR_BADSESSION error you're
getting back sounds like something times out somewhere, falls out of cache
and reports this error (which doesn't happen if we writeback after 5s
instead of 30s). NFS guys maybe have better idea what's going on here.

You could possibly workaround this problem (and verify my theory) by tuning
/proc/sys/vm/dirty_expire_centisecs to a lower value (say 500). This will
make inode writeback start earlier and thus should effectively mask the
problem again.

Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
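
The suggested workaround is a single sysctl write; a trivial sketch of doing it programmatically, equivalent to writing 500 into /proc/sys/vm/dirty_expire_centisecs (the value 500 is taken from the suggestion above; the file path is the standard procfs sysctl location):

/* Minimal sketch: lower vm.dirty_expire_centisecs to 500 to make inode
 * writeback start earlier, as suggested above.  Requires root. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/dirty_expire_centisecs", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "500\n");	/* centiseconds, i.e. 5 seconds */
	fclose(f);
	return 0;
}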
On Wed, Oct 8, 2025 at 5:14 AM Jan Kara <jack@suse.cz> wrote: > > Hello! > > On Tue 07-10-25 10:17:11, Joshua Watt wrote: > > From: Joshua Watt <jpewhacker@gmail.com> > > > > This patch strangely breaks NFS 4 clients for me. The behavior is that a > > client will start getting an I/O error which in turn is caused by the client > > getting a NFS3ERR_BADSESSION when attempting to write data to the server. I > > bisected the kernel from the latest master > > (9029dc666353504ea7c1ebfdf09bc1aab40f6147) to this commit (log below). Also, > > when I revert this commit on master the bug disappears. > > > > The server is running kernel 5.4.161, and the client that exhibits the > > behavior is running in qemux86, and has mounted the server with the options > > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,soft,proto=tcp,port=52049,timeo=600,retrans=2,sec=null,clientaddr=172.16.6.90,local_lock=none,addr=172.16.6.0 > > > > The program that I wrote to reproduce this is pretty simple; it does a file > > lock over NFS, then writes data to the file once per second. After about 32 > > seconds, it receives the I/O error, and this reproduced every time. I can > > provide the sample program if necessary. > > This is indeed rather curious. > > > I also captured the NFS traffic both in the passing case and the failure case, > > and can provide them if useful. > > > > I did look at the two dumps and I'm not exactly sure what the difference is, > > other than with this patch the client tries to write every 30 seconds (and > > fails), where as without it attempts to write back every 5 seconds. I have no > > idea why this patch would cause this problem. > > So the change in writeback behavior is not surprising. The commit does > modify the logic computing dirty limits in some corner cases and your > description matches the fact that previously the computed limits were lower > so we've started writeback after 5s (dirty_writeback_interval) while with > the patch we didn't cross the threshold and thus started writeback only > once the dirty data was old enough, which is 30s (dirty_expire_interval). > > But that's all, you should be able to observe exactly the same writeback > behavior if you write less even without this patch. So I suspect that the > different writeback behavior is just triggering some bug in the NFS (either > on the client or the server side). The NFS3ERR_BADSESSION error you're > getting back sounds like something times out somewhere, falls out of cache > and reports this error (which doesn't happen if we writeback after 5s > instead of 30s). NFS guys maybe have better idea what's going on here. > > You could possibly workaround this problem (and verify my theory) by tuning > /proc/sys/vm/dirty_expire_centisecs to a lower value (say 500). This will > make inode writeback start earlier and thus should effectively mask the > problem again. Changing /proc/sys/vm/dirty_expire_centisecs did indeed prevent the issue from occurring. As an experiment, I tried to see what the lowest value I could use that worked, and it was also 500. Even setting it to 600 would cause it to error out eventually. This would indicate to me a server problem (which is unfortunate because that's much harder for me to debug), but perhaps the NFS folks could weigh in. > > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR
On Wed, Oct 8, 2025 at 8:49 AM Joshua Watt <jpewhacker@gmail.com> wrote: > > On Wed, Oct 8, 2025 at 5:14 AM Jan Kara <jack@suse.cz> wrote: > > > > Hello! > > > > On Tue 07-10-25 10:17:11, Joshua Watt wrote: > > > From: Joshua Watt <jpewhacker@gmail.com> > > > > > > This patch strangely breaks NFS 4 clients for me. The behavior is that a > > > client will start getting an I/O error which in turn is caused by the client > > > getting a NFS3ERR_BADSESSION when attempting to write data to the server. I > > > bisected the kernel from the latest master > > > (9029dc666353504ea7c1ebfdf09bc1aab40f6147) to this commit (log below). Also, > > > when I revert this commit on master the bug disappears. > > > > > > The server is running kernel 5.4.161, and the client that exhibits the > > > behavior is running in qemux86, and has mounted the server with the options > > > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,soft,proto=tcp,port=52049,timeo=600,retrans=2,sec=null,clientaddr=172.16.6.90,local_lock=none,addr=172.16.6.0 > > > > > > The program that I wrote to reproduce this is pretty simple; it does a file > > > lock over NFS, then writes data to the file once per second. After about 32 > > > seconds, it receives the I/O error, and this reproduced every time. I can > > > provide the sample program if necessary. > > > > This is indeed rather curious. > > > > > I also captured the NFS traffic both in the passing case and the failure case, > > > and can provide them if useful. > > > > > > I did look at the two dumps and I'm not exactly sure what the difference is, > > > other than with this patch the client tries to write every 30 seconds (and > > > fails), where as without it attempts to write back every 5 seconds. I have no > > > idea why this patch would cause this problem. > > > > So the change in writeback behavior is not surprising. The commit does > > modify the logic computing dirty limits in some corner cases and your > > description matches the fact that previously the computed limits were lower > > so we've started writeback after 5s (dirty_writeback_interval) while with > > the patch we didn't cross the threshold and thus started writeback only > > once the dirty data was old enough, which is 30s (dirty_expire_interval). > > > > But that's all, you should be able to observe exactly the same writeback > > behavior if you write less even without this patch. So I suspect that the > > different writeback behavior is just triggering some bug in the NFS (either > > on the client or the server side). The NFS3ERR_BADSESSION error you're > > getting back sounds like something times out somewhere, falls out of cache > > and reports this error (which doesn't happen if we writeback after 5s > > instead of 30s). NFS guys maybe have better idea what's going on here. > > > > You could possibly workaround this problem (and verify my theory) by tuning > > /proc/sys/vm/dirty_expire_centisecs to a lower value (say 500). This will > > make inode writeback start earlier and thus should effectively mask the > > problem again. > > Changing /proc/sys/vm/dirty_expire_centisecs did indeed prevent the > issue from occurring. As an experiment, I tried to see what the lowest > value I could use that worked, and it was also 500. Even setting it to > 600 would cause it to error out eventually. This would indicate to me > a server problem (which is unfortunate because that's much harder for > me to debug), but perhaps the NFS folks could weigh in. I figured out the problem. 
There was a bug in the NFS client where it would not send state renewals within the first 5 minutes after booting; prior to this change, that was masked in my test case because the 5 second dirty writeback interval would keep the connection alive without needing the state renewals (and my test always did a reboot). I've submitted a patch to fix the NFS client to the mailing list [1]. Sorry for the noise, and thanks for your help. [1]: https://lore.kernel.org/linux-nfs/20251008230935.738405-1-JPEWhacker@gmail.com/T/#u > > > > > Honza > > -- > > Jan Kara <jack@suse.com> > > SUSE Labs, CR
On Wed 08-10-25 17:14:31, Joshua Watt wrote: > On Wed, Oct 8, 2025 at 8:49 AM Joshua Watt <jpewhacker@gmail.com> wrote: > > On Wed, Oct 8, 2025 at 5:14 AM Jan Kara <jack@suse.cz> wrote: > > > Hello! > > > > > > On Tue 07-10-25 10:17:11, Joshua Watt wrote: > > > > From: Joshua Watt <jpewhacker@gmail.com> > > > > > > > > This patch strangely breaks NFS 4 clients for me. The behavior is that a > > > > client will start getting an I/O error which in turn is caused by the client > > > > getting a NFS3ERR_BADSESSION when attempting to write data to the server. I > > > > bisected the kernel from the latest master > > > > (9029dc666353504ea7c1ebfdf09bc1aab40f6147) to this commit (log below). Also, > > > > when I revert this commit on master the bug disappears. > > > > > > > > The server is running kernel 5.4.161, and the client that exhibits the > > > > behavior is running in qemux86, and has mounted the server with the options > > > > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,soft,proto=tcp,port=52049,timeo=600,retrans=2,sec=null,clientaddr=172.16.6.90,local_lock=none,addr=172.16.6.0 > > > > > > > > The program that I wrote to reproduce this is pretty simple; it does a file > > > > lock over NFS, then writes data to the file once per second. After about 32 > > > > seconds, it receives the I/O error, and this reproduced every time. I can > > > > provide the sample program if necessary. > > > > > > This is indeed rather curious. > > > > > > > I also captured the NFS traffic both in the passing case and the failure case, > > > > and can provide them if useful. > > > > > > > > I did look at the two dumps and I'm not exactly sure what the difference is, > > > > other than with this patch the client tries to write every 30 seconds (and > > > > fails), where as without it attempts to write back every 5 seconds. I have no > > > > idea why this patch would cause this problem. > > > > > > So the change in writeback behavior is not surprising. The commit does > > > modify the logic computing dirty limits in some corner cases and your > > > description matches the fact that previously the computed limits were lower > > > so we've started writeback after 5s (dirty_writeback_interval) while with > > > the patch we didn't cross the threshold and thus started writeback only > > > once the dirty data was old enough, which is 30s (dirty_expire_interval). > > > > > > But that's all, you should be able to observe exactly the same writeback > > > behavior if you write less even without this patch. So I suspect that the > > > different writeback behavior is just triggering some bug in the NFS (either > > > on the client or the server side). The NFS3ERR_BADSESSION error you're > > > getting back sounds like something times out somewhere, falls out of cache > > > and reports this error (which doesn't happen if we writeback after 5s > > > instead of 30s). NFS guys maybe have better idea what's going on here. > > > > > > You could possibly workaround this problem (and verify my theory) by tuning > > > /proc/sys/vm/dirty_expire_centisecs to a lower value (say 500). This will > > > make inode writeback start earlier and thus should effectively mask the > > > problem again. > > > > Changing /proc/sys/vm/dirty_expire_centisecs did indeed prevent the > > issue from occurring. As an experiment, I tried to see what the lowest > > value I could use that worked, and it was also 500. Even setting it to > > 600 would cause it to error out eventually. 
This would indicate to me > > a server problem (which is unfortunate because that's much harder for > > me to debug), but perhaps the NFS folks could weigh in. > > I figured out the problem. There was a bug in the NFS client where it > would not send state renewals within the first 5 minutes after > booting; prior to this change, that was masked in my test case because > the 5 second dirty writeback interval would keep the connection alive > without needing the state renewals (and my test always did a reboot). > I've submitted a patch to fix the NFS client to the mailing list [1]. Cool that you've nailed it down :). > Sorry for the noise, and thanks for your help. You're welcome. Honza -- Jan Kara <jack@suse.com> SUSE Labs, CR
Hi,

On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote:
> Address the feedback from "mm/page-writeback: raise wb_thresh to prevent
> write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53).
> The wb_thresh bumping logic is scattered across wb_position_ratio,
> __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency,
> consolidate all wb_thresh bumping logic into __wb_calc_thresh.
>
> Reviewed-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com>

This patch triggers a boot failure with one of my 'sheb' boot tests.
It is seen when trying to boot from flash (mtd). The log says

...
Starting network: 8139cp 0000:00:02.0 eth0: link down
udhcpc: started, v1.33.0
EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2
udhcpc: sending discover
udhcpc: sending discover
udhcpc: sending discover
EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2
udhcpc: no lease, failing
FAIL
/etc/init.d/S55runtest: line 29: awk: Permission denied
Found console ttySC1
/etc/init.d/S55runtest: line 45: awk: Permission denied
can't run '/sbin/getty': No such device or address

and boot stalls from there. There is no obvious kernel log message
associated with the problem. Reverting this patch fixes the problem.

Bisect log is attached for reference.

Guenter

---
# bad: [37136bf5c3a6f6b686d74f41837a6406bec6b7bc] Add linux-next specific files for 20250113
# good: [9d89551994a430b50c4fffcb1e617a057fa76e20] Linux 6.13-rc6
git bisect start 'HEAD~1' 'v6.13-rc6'
# bad: [25dcaaf9b3bdaa117b8eb722ebde76ec9ed30038] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
git bisect bad 25dcaaf9b3bdaa117b8eb722ebde76ec9ed30038
# bad: [435091688c6715e614213d84b1426dfb86fb1c11] Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git
git bisect bad 435091688c6715e614213d84b1426dfb86fb1c11
# bad: [51bdf4c7090c8ab260e3a7e7ddaa5d9a7303f541] Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git
git bisect bad 51bdf4c7090c8ab260e3a7e7ddaa5d9a7303f541
# bad: [972ccc84da30e3fbab197c91caa10fbb04e487c8] foo
git bisect bad 972ccc84da30e3fbab197c91caa10fbb04e487c8
# bad: [243dd93e678dd4638b1005a13b085c3c5439447c] mm, swap: simplify percpu cluster updating
git bisect bad 243dd93e678dd4638b1005a13b085c3c5439447c
# bad: [09a3762697e810e311af23d10d79587da440e9dd] mm/hugetlb: support FOLL_FORCE|FOLL_WRITE
git bisect bad 09a3762697e810e311af23d10d79587da440e9dd
# bad: [6976848d304ef960e5ac5032611cfd8a9b1b6b01] docs: tmpfs: update the large folios policy for tmpfs and shmem
git bisect bad 6976848d304ef960e5ac5032611cfd8a9b1b6b01
# good: [1d3d61aef8467ce44ab3a06e6a0e3fcd930590a7] mailmap, docs: update email to carlos.bilbao@kernel.org
git bisect good 1d3d61aef8467ce44ab3a06e6a0e3fcd930590a7
# good: [4d9d1429f6deb91c66591074bbd8ca6aa4cba4dc] mm/page_alloc: move set_page_refcounted() to end of __alloc_pages()
git bisect good 4d9d1429f6deb91c66591074bbd8ca6aa4cba4dc
# good: [c1bc8fd460ebce85d7a768d8226861438e28bd53] mm: pgtable: make ptep_clear() non-atomic
git bisect good c1bc8fd460ebce85d7a768d8226861438e28bd53
# bad: [f8db55561f1b5c70ba2dd260206f295ffee9d1c9] kasan: make kasan_record_aux_stack_noalloc() the default behaviour
git bisect bad f8db55561f1b5c70ba2dd260206f295ffee9d1c9
# bad: [a82412684eaeda1ef8201472107de6a40843beec] mm: change type of cma_area_count to unsigned int
git bisect bad a82412684eaeda1ef8201472107de6a40843beec
# bad: [fcd31b7c35949323434d50416f896bc881a5c397] mm/page-writeback: consolidate wb_thresh bumping logic into __wb_calc_thresh
git bisect bad fcd31b7c35949323434d50416f896bc881a5c397
# first bad commit: [fcd31b7c35949323434d50416f896bc881a5c397] mm/page-writeback: consolidate wb_thresh bumping logic into __wb_calc_thresh
On Mon 13-01-25 15:05:25, Guenter Roeck wrote: > Hi, > > On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: > > Address the feedback from "mm/page-writeback: raise wb_thresh to prevent > > write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). > > The wb_thresh bumping logic is scattered across wb_position_ratio, > > __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, > > consolidate all wb_thresh bumping logic into __wb_calc_thresh. > > > > Reviewed-by: Jan Kara <jack@suse.cz> > > Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> > > This patch triggers a boot failure with one of my 'sheb' boot tests. > It is seen when trying to boot from flash (mtd). The log says > > ... > Starting network: 8139cp 0000:00:02.0 eth0: link down > udhcpc: started, v1.33.0 > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > udhcpc: sending discover > udhcpc: sending discover > udhcpc: sending discover > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 Thanks for report! Uh, I have to say I'm very confused by this. It is clear than when ext2 detects the directory corruption (we fail checking directory inode 363 which is likely /etc/init.d/), the boot fails in interesting ways. What is unclear is how the commit can possibly cause ext2 directory corruption. If you didn't verify reverting the commit fixes the issue, I'd be suspecting bad bisection but that obviously isn't the case :-) Ext2 is storing directory data in the page cache so at least it uses the subsystem which the patch impacts but how writeback throttling can cause ext2 directory corruption is beyond me. BTW, do you recreate the root filesystem before each boot? How exactly? Honza > udhcpc: no lease, failing > FAIL > /etc/init.d/S55runtest: line 29: awk: Permission denied > Found console ttySC1 > /etc/init.d/S55runtest: line 45: awk: Permission denied > can't run '/sbin/getty': No such device or address > > and boot stalls from there. There is no obvious kernel log message > associated with the problem. Reverting this patch fixes the problem. > > Bisect log is attached for reference. > > Guenter > -- Jan Kara <jack@suse.com> SUSE Labs, CR
On 1/14/25 05:19, Jan Kara wrote: > On Mon 13-01-25 15:05:25, Guenter Roeck wrote: >> Hi, >> >> On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: >>> Address the feedback from "mm/page-writeback: raise wb_thresh to prevent >>> write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). >>> The wb_thresh bumping logic is scattered across wb_position_ratio, >>> __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, >>> consolidate all wb_thresh bumping logic into __wb_calc_thresh. >>> >>> Reviewed-by: Jan Kara <jack@suse.cz> >>> Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> >> >> This patch triggers a boot failure with one of my 'sheb' boot tests. >> It is seen when trying to boot from flash (mtd). The log says >> >> ... >> Starting network: 8139cp 0000:00:02.0 eth0: link down >> udhcpc: started, v1.33.0 >> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 >> udhcpc: sending discover >> udhcpc: sending discover >> udhcpc: sending discover >> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > > Thanks for report! Uh, I have to say I'm very confused by this. It is clear > than when ext2 detects the directory corruption (we fail checking directory > inode 363 which is likely /etc/init.d/), the boot fails in interesting > ways. What is unclear is how the commit can possibly cause ext2 directory > corruption. If you didn't verify reverting the commit fixes the issue, I'd > be suspecting bad bisection but that obviously isn't the case :-) > > Ext2 is storing directory data in the page cache so at least it uses the > subsystem which the patch impacts but how writeback throttling can cause > ext2 directory corruption is beyond me. BTW, do you recreate the root > filesystem before each boot? How exactly? > I use pre-built root file systems. For sheb, they are at https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb I don't think this is related to ext2 itself. Booting an ext2 image from ata/ide drive works. Guenter
On Tue 14-01-25 07:01:08, Guenter Roeck wrote: > On 1/14/25 05:19, Jan Kara wrote: > > On Mon 13-01-25 15:05:25, Guenter Roeck wrote: > > > On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: > > > > Address the feedback from "mm/page-writeback: raise wb_thresh to prevent > > > > write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). > > > > The wb_thresh bumping logic is scattered across wb_position_ratio, > > > > __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, > > > > consolidate all wb_thresh bumping logic into __wb_calc_thresh. > > > > > > > > Reviewed-by: Jan Kara <jack@suse.cz> > > > > Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> > > > > > > This patch triggers a boot failure with one of my 'sheb' boot tests. > > > It is seen when trying to boot from flash (mtd). The log says > > > > > > ... > > > Starting network: 8139cp 0000:00:02.0 eth0: link down > > > udhcpc: started, v1.33.0 > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > > > udhcpc: sending discover > > > udhcpc: sending discover > > > udhcpc: sending discover > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > > > > Thanks for report! Uh, I have to say I'm very confused by this. It is clear > > than when ext2 detects the directory corruption (we fail checking directory > > inode 363 which is likely /etc/init.d/), the boot fails in interesting > > ways. What is unclear is how the commit can possibly cause ext2 directory > > corruption. If you didn't verify reverting the commit fixes the issue, I'd > > be suspecting bad bisection but that obviously isn't the case :-) > > > > Ext2 is storing directory data in the page cache so at least it uses the > > subsystem which the patch impacts but how writeback throttling can cause > > ext2 directory corruption is beyond me. BTW, do you recreate the root > > filesystem before each boot? How exactly? > > I use pre-built root file systems. For sheb, they are at > https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb Thanks. So the problematic directory is /usr/share/udhcpc/ where we read apparently bogus metadata at the beginning of that directory. > I don't think this is related to ext2 itself. Booting an ext2 image from > ata/ide drive works. Interesting this is specific to mtd. I'll read the patch carefully again if something rings a bell. Honza -- Jan Kara <jack@suse.com> SUSE Labs, CR
On 1/15/25 08:07, Jan Kara wrote: > On Tue 14-01-25 07:01:08, Guenter Roeck wrote: >> On 1/14/25 05:19, Jan Kara wrote: >>> On Mon 13-01-25 15:05:25, Guenter Roeck wrote: >>>> On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: >>>>> Address the feedback from "mm/page-writeback: raise wb_thresh to prevent >>>>> write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). >>>>> The wb_thresh bumping logic is scattered across wb_position_ratio, >>>>> __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, >>>>> consolidate all wb_thresh bumping logic into __wb_calc_thresh. >>>>> >>>>> Reviewed-by: Jan Kara <jack@suse.cz> >>>>> Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> >>>> >>>> This patch triggers a boot failure with one of my 'sheb' boot tests. >>>> It is seen when trying to boot from flash (mtd). The log says >>>> >>>> ... >>>> Starting network: 8139cp 0000:00:02.0 eth0: link down >>>> udhcpc: started, v1.33.0 >>>> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 >>>> udhcpc: sending discover >>>> udhcpc: sending discover >>>> udhcpc: sending discover >>>> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 >>> >>> Thanks for report! Uh, I have to say I'm very confused by this. It is clear >>> than when ext2 detects the directory corruption (we fail checking directory >>> inode 363 which is likely /etc/init.d/), the boot fails in interesting >>> ways. What is unclear is how the commit can possibly cause ext2 directory >>> corruption. If you didn't verify reverting the commit fixes the issue, I'd >>> be suspecting bad bisection but that obviously isn't the case :-) >>> >>> Ext2 is storing directory data in the page cache so at least it uses the >>> subsystem which the patch impacts but how writeback throttling can cause >>> ext2 directory corruption is beyond me. BTW, do you recreate the root >>> filesystem before each boot? How exactly? >> >> I use pre-built root file systems. For sheb, they are at >> https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb > > Thanks. So the problematic directory is /usr/share/udhcpc/ where we > read apparently bogus metadata at the beginning of that directory. > >> I don't think this is related to ext2 itself. Booting an ext2 image from >> ata/ide drive works. > > Interesting this is specific to mtd. I'll read the patch carefully again if > something rings a bell. > Interesting. Is there some endianness issue, by any chance ? I only see the problem with sheb (big endian), not with sh (little endian). I'd suspect that it is an emulation bug, but it is odd that the problem did not show up before. Guenter
On Wed 15-01-25 08:41:43, Guenter Roeck wrote:
> On 1/15/25 08:07, Jan Kara wrote:
> > On Tue 14-01-25 07:01:08, Guenter Roeck wrote:
> > > On 1/14/25 05:19, Jan Kara wrote:
> > > > On Mon 13-01-25 15:05:25, Guenter Roeck wrote:
> > > > > On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote:
> > > > > > Address the feedback from "mm/page-writeback: raise wb_thresh to prevent
> > > > > > write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53).
> > > > > > The wb_thresh bumping logic is scattered across wb_position_ratio,
> > > > > > __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency,
> > > > > > consolidate all wb_thresh bumping logic into __wb_calc_thresh.
> > > > > >
> > > > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > > > Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com>
> > > > >
> > > > > This patch triggers a boot failure with one of my 'sheb' boot tests.
> > > > > It is seen when trying to boot from flash (mtd). The log says
> > > > >
> > > > > ...
> > > > > Starting network: 8139cp 0000:00:02.0 eth0: link down
> > > > > udhcpc: started, v1.33.0
> > > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2
> > > > > udhcpc: sending discover
> > > > > udhcpc: sending discover
> > > > > udhcpc: sending discover
> > > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2
> > > >
> > > > Thanks for report! Uh, I have to say I'm very confused by this. It is clear
> > > > than when ext2 detects the directory corruption (we fail checking directory
> > > > inode 363 which is likely /etc/init.d/), the boot fails in interesting
> > > > ways. What is unclear is how the commit can possibly cause ext2 directory
> > > > corruption. If you didn't verify reverting the commit fixes the issue, I'd
> > > > be suspecting bad bisection but that obviously isn't the case :-)
> > > >
> > > > Ext2 is storing directory data in the page cache so at least it uses the
> > > > subsystem which the patch impacts but how writeback throttling can cause
> > > > ext2 directory corruption is beyond me. BTW, do you recreate the root
> > > > filesystem before each boot? How exactly?
> > >
> > > I use pre-built root file systems. For sheb, they are at
> > > https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb
> >
> > Thanks. So the problematic directory is /usr/share/udhcpc/ where we
> > read apparently bogus metadata at the beginning of that directory.
> >
> > > I don't think this is related to ext2 itself. Booting an ext2 image from
> > > ata/ide drive works.
> >
> > Interesting this is specific to mtd. I'll read the patch carefully again if
> > something rings a bell.
> >
>
> Interesting. Is there some endianness issue, by any chance ? I only see the problem
> with sheb (big endian), not with sh (little endian). I'd suspect that it is an
> emulation bug, but it is odd that the problem did not show up before.
So far I don't have a good explanation. Let me write down here the facts,
maybe it will trigger the aha effect.
1) Ext2 stores the metadata in little endian ordering. We observe the
problem with the first directory entry in the folio. Both entry->rec_len
(16-bit) and entry->inode (32-bit) appear to be seen in the wrong endianness
2) The function that fails is ext2_check_folio(). We kmap_local() the folio
in ext2_get_folio(), then in ext2_check_folio() we do:
ext2_dirent *p;
p = (ext2_dirent *)(kaddr + 0);
rec_len = ext2_rec_len_from_disk(p->rec_len);
^^^ value 3072 == 0x0c00 seen here instead of correct 0x000c
this value is invalid so we go to:
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
"offset=%llu, inode=%lu, rec_len=%d, name_len=%d",
dir->i_ino, error, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
Here rec_len is printed so we see the wrong value. Also
le32_to_cpu(p->inode) is printed which also shows number with swapped byte
ordering (the message contains inode number 27393 == 0x00006b01 but the
proper inode number is 363 == 0x0000016b). This actually reveals more about
the problem because only the two bytes were swapped in the inode number
although we always treat it as a 32-bit entity. So this would indeed point
more at some architectural issue rather than a problem in the filesystem /
MM.
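
The two corrupted values quoted above are consistent with the bytes inside each 16-bit halfword of the on-disk little-endian data being swapped. A small sketch, not from the thread (the helper name is invented), reproducing just the arithmetic:

/* Demonstrates that swapping the bytes within each 16-bit halfword of the
 * correct on-disk values yields exactly the corrupted numbers seen in the
 * ext2_error() message above. */
#include <stdint.h>
#include <stdio.h>

/* Swap bytes inside each 16-bit half of a 32-bit value. */
static uint32_t swap_halfwords_bytewise(uint32_t v)
{
	return ((v & 0x00ff00ffu) << 8) | ((v & 0xff00ff00u) >> 8);
}

int main(void)
{
	uint32_t inode = 363;	/* 0x0000016b, the correct inode number  */
	uint16_t rec_len = 12;	/* 0x000c, the correct directory rec_len */

	/* Prints 27393 (0x00006b01) and 3072 (0x0c00), as in the log. */
	printf("inode:   %u -> %u\n", inode, swap_halfwords_bytewise(inode));
	printf("rec_len: %u -> %u\n", rec_len,
	       (uint16_t)swap_halfwords_bytewise(rec_len));
	return 0;
}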
Note that to get at this point in the boot we must have correctly
byteswapped many other directory entries in the filesystem. So the problem
must be triggered by some parallel activity happening in the system or
something like that.
3) The problem appears only with MTD storage, not with IDE/SATA on the same
system + filesystem image. It is unclear how the storage influences the
reproduction, other than that it influences the timing of events in the
system.
4) The problem reliably happens with "mm/page-writeback: Consolidate wb_thresh
bumping logic into __wb_calc_thresh", not without it. All this patch does
is that it possibly changes a limit at which processes dirtying pages in
the page cache get throttled. Thus there are fairly limited opportunities
for how it can cause damage (I've checked for possible UAF issues or memory
corruption but I don't really see any such possibility there, it is just
crunching numbers from the mm counters and takes decision based on the
result). This change doesn't have a direct effect on the ext2 directory code.
The only thing it does is that it possibly changes the code alignment of ext2
code if it gets linked afterwards into the vmlinux image (provided ext2 is
built in). Another
possibility is again that it changes timing of events in the system due to
differences in throttling of processes dirtying page cache.
So at this point I don't have a better explanation than to blame the HW. What
really tipped my conviction in this direction is the 16-bit byteswap in a
32-bit entity. Hence I guess I'll ask Andrew to put Jim's patch back into the
tree if you don't object.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Thu, Jan 16, 2025 at 03:56:57PM +0100, Jan Kara wrote: ... > > > > Interesting. Is there some endianness issue, by any chance ? I only see the problem > > with sheb (big endian), not with sh (little endian). I'd suspect that it is an > > emulation bug, but it is odd that the problem did not show up before. > > So far I don't have a good explanation. Let me write down here the facts, > maybe it will trigger the aha effect. > > 1) Ext2 stores the metadata in little endian ordering. We observe the > problem with the first directory entry in the folio. Both entry->rec_len > (16-bit) and entry->inode (32-bit) appear to be seen in wrong endianity > > 2) The function that fails is ext2_check_folio(). We kmap_local() the folio > in ext2_get_folio(), then in ext2_check_folio() we do: > > ext2_dirent *p; > > p = (ext2_dirent *)(kaddr + 0); > rec_len = ext2_rec_len_from_disk(p->rec_len); > ^^^ value 3072 == 0x0c00 seen here instead of correct 0x000c > this value is invalid so we go to: > ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " > "offset=%llu, inode=%lu, rec_len=%d, name_len=%d", > dir->i_ino, error, folio_pos(folio) + offs, > (unsigned long) le32_to_cpu(p->inode), > rec_len, p->name_len); > > Here rec_len is printed so we see the wrong value. Also > le32_to_cpu(p->inode) is printed which also shows number with swapped byte > ordering (the message contains inode number 27393 == 0x00006b01 but the > proper inode number is 363 == 0x0000016b). This actually releals more about > the problem because only the two bytes were swapped in the inode number > although we always treat it as 32-bit entity. So this would indeed point > more at some architectural issue rather than a problem in the filesystem / > MM. > > Note that to get at this point in the boot we must have correctly > byteswapped many other directory entries in the filesystem. So the problem > must be triggered by some parallel activity happening in the system or > something like that. > > 3) The problem appears only with MTD storage, not with IDE/SATA on the same > system + filesystem image. It it unclear how the storage influences the > reproduction, rather than that it influences timing of events in the > system. > > 4) The problem reliably happens with "mm/page-writeback: Consolidate wb_thresh > bumping logic into __wb_calc_thresh", not without it. All this patch does > is that it possibly changes a limit at which processes dirtying pages in > the page cache get throttled. Thus there are fairly limited opportunities > for how it can cause damage (I've checked for possible UAF issues or memory > corruption but I don't really see any such possibility there, it is just > crunching numbers from the mm counters and takes decision based on the > result). This change doesn't have direct on the directory ext2 code. The only > thing it does is that it possibly changes code alignment of ext2 code if it > gets linked afterwards into vmlinux image (provided ext2 is built in). Another > possibility is again that it changes timing of events in the system due to > differences in throttling of processes dirtying page cache. > > So at this point I don't have a better explanation than blame the HW. What > really tipped my conviction in this direction is the 16-bit byteswap in a > 32-bit entity. Hence I guess I'll ask Andrew to put Jim's patch back into > tree if you don't object. I agree that this is most likely an emulation problem (it would not be the first one), so please go ahead. Thanks, Guenter
On Wed 15-01-25 17:07:36, Jan Kara wrote: > On Tue 14-01-25 07:01:08, Guenter Roeck wrote: > > On 1/14/25 05:19, Jan Kara wrote: > > > On Mon 13-01-25 15:05:25, Guenter Roeck wrote: > > > > On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: > > > > > Address the feedback from "mm/page-writeback: raise wb_thresh to prevent > > > > > write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). > > > > > The wb_thresh bumping logic is scattered across wb_position_ratio, > > > > > __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, > > > > > consolidate all wb_thresh bumping logic into __wb_calc_thresh. > > > > > > > > > > Reviewed-by: Jan Kara <jack@suse.cz> > > > > > Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> > > > > > > > > This patch triggers a boot failure with one of my 'sheb' boot tests. > > > > It is seen when trying to boot from flash (mtd). The log says > > > > > > > > ... > > > > Starting network: 8139cp 0000:00:02.0 eth0: link down > > > > udhcpc: started, v1.33.0 > > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > > > > udhcpc: sending discover > > > > udhcpc: sending discover > > > > udhcpc: sending discover > > > > EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 > > > > > > Thanks for report! Uh, I have to say I'm very confused by this. It is clear > > > than when ext2 detects the directory corruption (we fail checking directory > > > inode 363 which is likely /etc/init.d/), the boot fails in interesting > > > ways. What is unclear is how the commit can possibly cause ext2 directory > > > corruption. If you didn't verify reverting the commit fixes the issue, I'd > > > be suspecting bad bisection but that obviously isn't the case :-) > > > > > > Ext2 is storing directory data in the page cache so at least it uses the > > > subsystem which the patch impacts but how writeback throttling can cause > > > ext2 directory corruption is beyond me. BTW, do you recreate the root > > > filesystem before each boot? How exactly? > > > > I use pre-built root file systems. For sheb, they are at > > https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb > > Thanks. So the problematic directory is /usr/share/udhcpc/ where we > read apparently bogus metadata at the beginning of that directory. Ah, the metadata isn't bogus. But the entries in the directory are apparently byte-swapped (little vs big endian). Is the machine actually little or big endian? Honza -- Jan Kara <jack@suse.com> SUSE Labs, CR
On 1/15/25 08:28, Jan Kara wrote: > On Wed 15-01-25 17:07:36, Jan Kara wrote: >> On Tue 14-01-25 07:01:08, Guenter Roeck wrote: >>> On 1/14/25 05:19, Jan Kara wrote: >>>> On Mon 13-01-25 15:05:25, Guenter Roeck wrote: >>>>> On Thu, Nov 21, 2024 at 06:05:39PM +0800, Jim Zhao wrote: >>>>>> Address the feedback from "mm/page-writeback: raise wb_thresh to prevent >>>>>> write blocking with strictlimit"(39ac99852fca98ca44d52716d792dfaf24981f53). >>>>>> The wb_thresh bumping logic is scattered across wb_position_ratio, >>>>>> __wb_calc_thresh, and wb_update_dirty_ratelimit. For consistency, >>>>>> consolidate all wb_thresh bumping logic into __wb_calc_thresh. >>>>>> >>>>>> Reviewed-by: Jan Kara <jack@suse.cz> >>>>>> Signed-off-by: Jim Zhao <jimzhao.ai@gmail.com> >>>>> >>>>> This patch triggers a boot failure with one of my 'sheb' boot tests. >>>>> It is seen when trying to boot from flash (mtd). The log says >>>>> >>>>> ... >>>>> Starting network: 8139cp 0000:00:02.0 eth0: link down >>>>> udhcpc: started, v1.33.0 >>>>> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 >>>>> udhcpc: sending discover >>>>> udhcpc: sending discover >>>>> udhcpc: sending discover >>>>> EXT2-fs (mtdblock3): error: ext2_check_folio: bad entry in directory #363: : directory entry across blocks - offset=0, inode=27393, rec_len=3072, name_len=2 >>>> >>>> Thanks for report! Uh, I have to say I'm very confused by this. It is clear >>>> than when ext2 detects the directory corruption (we fail checking directory >>>> inode 363 which is likely /etc/init.d/), the boot fails in interesting >>>> ways. What is unclear is how the commit can possibly cause ext2 directory >>>> corruption. If you didn't verify reverting the commit fixes the issue, I'd >>>> be suspecting bad bisection but that obviously isn't the case :-) >>>> >>>> Ext2 is storing directory data in the page cache so at least it uses the >>>> subsystem which the patch impacts but how writeback throttling can cause >>>> ext2 directory corruption is beyond me. BTW, do you recreate the root >>>> filesystem before each boot? How exactly? >>> >>> I use pre-built root file systems. For sheb, they are at >>> https://github.com/groeck/linux-build-test/tree/master/rootfs/sheb >> >> Thanks. So the problematic directory is /usr/share/udhcpc/ where we >> read apparently bogus metadata at the beginning of that directory. > > Ah, the metadata isn't bogus. But the entries in the directory are > apparently byte-swapped (little vs big endian). Is the machine actually > little or big endian? > sheb is big endian. I only see the problem there, not with the little endian emulation. But that uses a different root file system. As I just mentioned in the other reply, it might well be an emulation bug, but it is odd that your patch would be required to expose that. Guenter